mm, vmscan: move LRU lists to node
This moves the LRU lists from the zone to the node, along with related
data such as counters, tracing, congestion tracking and writeback
tracking.

Unfortunately, due to the reclaim and compaction retry logic, it is
necessary to account for the number of LRU pages at both the zone and
the node level.  Most reclaim logic is based on the node counters, but
the retry logic uses the zone counters, which do not distinguish
between inactive and active sizes.  It would be possible to leave the
LRU counters on a per-zone basis, but summing them for ordinary reclaim
decisions would be a heavier calculation across multiple cache lines,
and those decisions are made much more frequently than the retry
checks.
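
For the retry side, the cheap per-zone check ends up looking roughly
like the sketch below; the vmscan.c hunk is not part of this excerpt,
so the details are reconstructed from the new counter names and should
be treated as illustrative:

    /*
     * Illustrative: the compaction/reclaim retry logic only needs "how
     * much is on the LRU in this zone", not the active/inactive split,
     * so one anon and one file counter per zone is enough.
     */
    unsigned long zone_reclaimable_pages(struct zone *zone)
    {
            unsigned long nr;

            nr = zone_page_state_snapshot(zone, NR_ZONE_LRU_FILE);
            if (get_nr_swap_pages() > 0)
                    nr += zone_page_state_snapshot(zone, NR_ZONE_LRU_ANON);

            return nr;
    }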

Other than the LRU counters, this is mostly a mechanical patch, but
note that it introduces a number of anomalies.  For example, the scans
are per-zone but use per-node counters.  We also mark a node as
congested when any of its zones is congested.  This causes weird
problems that are fixed by later patches in the series, but keeping
this patch mechanical makes it easier to review.

In the event that there is excessive overhead on 32-bit systems due to
the nodes being on LRU then there are two potential solutions

1. Long-term isolation of highmem pages when reclaim is lowmem

   When pages are skipped, they are immediately added back onto the LRU
   list. If lowmem reclaim persists for long periods of time, the same
   highmem pages get continually scanned. The idea would be that lowmem
   reclaim keeps those pages on a separate list until a reclaim for
   highmem pages arrives and splices the highmem pages back onto the LRU.
   It could potentially be implemented in a similar fashion to the
   UNEVICTABLE list (a rough sketch follows this list).

   That would reduce the skip rate, with the potential corner case that
   highmem pages have to be scanned and reclaimed to free lowmem slab
   pages.

2. Linear scan lowmem pages if the initial LRU shrink fails

   This would break LRU ordering, but under memory pressure it may be
   preferable and faster than repeatedly skipping LRU pages.
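
A purely hypothetical sketch of option 1; nothing below exists in this
patch, and the list and helper names are invented for illustration:

    /*
     * Hypothetical: park highmem pages skipped by lowmem reclaim on a
     * side list instead of putting them straight back on the LRU.
     * 'skipped_highmem' would live in the lruvec; it does not exist today.
     */
    static void park_skipped_highmem(struct page *page,
                                     struct list_head *skipped_highmem)
    {
            /* caller holds the LRU lock and has already isolated the page */
            list_move(&page->lru, skipped_highmem);
    }

    static void splice_skipped_highmem(struct lruvec *lruvec, enum lru_list lru,
                                       struct list_head *skipped_highmem)
    {
            /* a highmem reclaim arrived: make the parked pages scannable again */
            list_splice_init(skipped_highmem, &lruvec->lists[lru]);
    }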

Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Mel Gorman <[email protected]>
Acked-by: Johannes Weiner <[email protected]>
Acked-by: Vlastimil Babka <[email protected]>
Cc: Hillf Danton <[email protected]>
Cc: Joonsoo Kim <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Minchan Kim <[email protected]>
Cc: Rik van Riel <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
gormanm authored and torvalds committed Jul 28, 2016
1 parent a52633d commit 599d0c9
Showing 29 changed files with 386 additions and 300 deletions.
8 changes: 4 additions & 4 deletions arch/tile/mm/pgtable.c
@@ -45,10 +45,10 @@ void show_mem(unsigned int filter)
 	struct zone *zone;
 
 	pr_err("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu free:%lu\n slab:%lu mapped:%lu pagetables:%lu bounce:%lu pagecache:%lu swap:%lu\n",
-	       (global_page_state(NR_ACTIVE_ANON) +
-		global_page_state(NR_ACTIVE_FILE)),
-	       (global_page_state(NR_INACTIVE_ANON) +
-		global_page_state(NR_INACTIVE_FILE)),
+	       (global_node_page_state(NR_ACTIVE_ANON) +
+		global_node_page_state(NR_ACTIVE_FILE)),
+	       (global_node_page_state(NR_INACTIVE_ANON) +
+		global_node_page_state(NR_INACTIVE_FILE)),
 	       global_page_state(NR_FILE_DIRTY),
 	       global_page_state(NR_WRITEBACK),
 	       global_page_state(NR_UNSTABLE_NFS),

19 changes: 10 additions & 9 deletions drivers/base/node.c
@@ -56,6 +56,7 @@ static ssize_t node_read_meminfo(struct device *dev,
 {
 	int n;
 	int nid = dev->id;
+	struct pglist_data *pgdat = NODE_DATA(nid);
 	struct sysinfo i;
 
 	si_meminfo_node(&i, nid);
@@ -74,15 +75,15 @@ static ssize_t node_read_meminfo(struct device *dev,
 	       nid, K(i.totalram),
 	       nid, K(i.freeram),
 	       nid, K(i.totalram - i.freeram),
-	       nid, K(sum_zone_node_page_state(nid, NR_ACTIVE_ANON) +
-		      sum_zone_node_page_state(nid, NR_ACTIVE_FILE)),
-	       nid, K(sum_zone_node_page_state(nid, NR_INACTIVE_ANON) +
-		      sum_zone_node_page_state(nid, NR_INACTIVE_FILE)),
-	       nid, K(sum_zone_node_page_state(nid, NR_ACTIVE_ANON)),
-	       nid, K(sum_zone_node_page_state(nid, NR_INACTIVE_ANON)),
-	       nid, K(sum_zone_node_page_state(nid, NR_ACTIVE_FILE)),
-	       nid, K(sum_zone_node_page_state(nid, NR_INACTIVE_FILE)),
-	       nid, K(sum_zone_node_page_state(nid, NR_UNEVICTABLE)),
+	       nid, K(node_page_state(pgdat, NR_ACTIVE_ANON) +
+		      node_page_state(pgdat, NR_ACTIVE_FILE)),
+	       nid, K(node_page_state(pgdat, NR_INACTIVE_ANON) +
+		      node_page_state(pgdat, NR_INACTIVE_FILE)),
+	       nid, K(node_page_state(pgdat, NR_ACTIVE_ANON)),
+	       nid, K(node_page_state(pgdat, NR_INACTIVE_ANON)),
+	       nid, K(node_page_state(pgdat, NR_ACTIVE_FILE)),
+	       nid, K(node_page_state(pgdat, NR_INACTIVE_FILE)),
+	       nid, K(node_page_state(pgdat, NR_UNEVICTABLE)),
 	       nid, K(sum_zone_node_page_state(nid, NR_MLOCK)));
 
 #ifdef CONFIG_HIGHMEM

8 changes: 4 additions & 4 deletions drivers/staging/android/lowmemorykiller.c
@@ -72,10 +72,10 @@ static unsigned long lowmem_deathpending_timeout;
 static unsigned long lowmem_count(struct shrinker *s,
 				  struct shrink_control *sc)
 {
-	return global_page_state(NR_ACTIVE_ANON) +
-		global_page_state(NR_ACTIVE_FILE) +
-		global_page_state(NR_INACTIVE_ANON) +
-		global_page_state(NR_INACTIVE_FILE);
+	return global_node_page_state(NR_ACTIVE_ANON) +
+		global_node_page_state(NR_ACTIVE_FILE) +
+		global_node_page_state(NR_INACTIVE_ANON) +
+		global_node_page_state(NR_INACTIVE_FILE);
 }
 
 static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)

2 changes: 1 addition & 1 deletion include/linux/backing-dev.h
@@ -197,7 +197,7 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
 }
 
 long congestion_wait(int sync, long timeout);
-long wait_iff_congested(struct zone *zone, int sync, long timeout);
+long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout);
 int pdflush_proc_obsolete(struct ctl_table *table, int write,
 			void __user *buffer, size_t *lenp, loff_t *ppos);

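The new signature ripples out to the reclaim-side callers (their hunks
are not part of this excerpt); the caller-side change is roughly:

    /* was: wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); */
    wait_iff_congested(zone->zone_pgdat, BLK_RW_ASYNC, HZ/10);
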
18 changes: 9 additions & 9 deletions include/linux/memcontrol.h
@@ -339,7 +339,7 @@ static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
 	struct lruvec *lruvec;
 
 	if (mem_cgroup_disabled()) {
-		lruvec = &zone->lruvec;
+		lruvec = zone_lruvec(zone);
 		goto out;
 	}
 
@@ -348,15 +348,15 @@ static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
 out:
 	/*
 	 * Since a node can be onlined after the mem_cgroup was created,
-	 * we have to be prepared to initialize lruvec->zone here;
+	 * we have to be prepared to initialize lruvec->pgdat here;
 	 * and if offlined then reonlined, we need to reinitialize it.
 	 */
-	if (unlikely(lruvec->zone != zone))
-		lruvec->zone = zone;
+	if (unlikely(lruvec->pgdat != zone->zone_pgdat))
+		lruvec->pgdat = zone->zone_pgdat;
 	return lruvec;
 }
 
-struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
+struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
 
 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
@@ -437,7 +437,7 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
 
 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
-		int nr_pages);
+		enum zone_type zid, int nr_pages);
 
 unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 					   int nid, unsigned int lru_mask);
@@ -612,13 +612,13 @@ static inline void mem_cgroup_migrate(struct page *old, struct page *new)
 static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
 						    struct mem_cgroup *memcg)
 {
-	return &zone->lruvec;
+	return zone_lruvec(zone);
 }
 
 static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
-						    struct zone *zone)
+						    struct pglist_data *pgdat)
 {
-	return &zone->lruvec;
+	return &pgdat->lruvec;
 }
 
 static inline bool mm_match_cgroup(struct mm_struct *mm,

21 changes: 14 additions & 7 deletions include/linux/mm_inline.h
@@ -23,33 +23,40 @@ static inline int page_is_file_cache(struct page *page)
 }
 
 static __always_inline void __update_lru_size(struct lruvec *lruvec,
-				enum lru_list lru, int nr_pages)
+				enum lru_list lru, enum zone_type zid,
+				int nr_pages)
 {
-	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
+	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+	__mod_node_page_state(pgdat, NR_LRU_BASE + lru, nr_pages);
+	__mod_zone_page_state(&pgdat->node_zones[zid],
+				NR_ZONE_LRU_BASE + !!is_file_lru(lru),
+				nr_pages);
 }
 
 static __always_inline void update_lru_size(struct lruvec *lruvec,
-				enum lru_list lru, int nr_pages)
+				enum lru_list lru, enum zone_type zid,
+				int nr_pages)
 {
 #ifdef CONFIG_MEMCG
-	mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
+	mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
 #else
-	__update_lru_size(lruvec, lru, nr_pages);
+	__update_lru_size(lruvec, lru, zid, nr_pages);
 #endif
 }
 
 static __always_inline void add_page_to_lru_list(struct page *page,
 				struct lruvec *lruvec, enum lru_list lru)
 {
-	update_lru_size(lruvec, lru, hpage_nr_pages(page));
+	update_lru_size(lruvec, lru, page_zonenum(page), hpage_nr_pages(page));
 	list_add(&page->lru, &lruvec->lists[lru]);
 }
 
 static __always_inline void del_page_from_lru_list(struct page *page,
 				struct lruvec *lruvec, enum lru_list lru)
 {
 	list_del(&page->lru);
-	update_lru_size(lruvec, lru, -hpage_nr_pages(page));
+	update_lru_size(lruvec, lru, page_zonenum(page), -hpage_nr_pages(page));
 }
 
 /**

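How callers see the change: moving a page between LRU lists still takes
only the lruvec and the list, with the zone id derived from the page
itself, and one call now updates both the node-wide LRU counter and the
coarse per-zone counter. A usage sketch (caller context assumed, with
zone_lru_lock held):

    /* e.g. activating a file page; page_zonenum(page) is resolved
     * inside the helpers, so callers do not pass a zone id.
     */
    del_page_from_lru_list(page, lruvec, LRU_INACTIVE_FILE);
    SetPageActive(page);
    add_page_to_lru_list(page, lruvec, LRU_ACTIVE_FILE);
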
68 changes: 41 additions & 27 deletions include/linux/mmzone.h
@@ -111,12 +111,9 @@ enum zone_stat_item {
 	/* First 128 byte cacheline (assuming 64 bit words) */
 	NR_FREE_PAGES,
 	NR_ALLOC_BATCH,
-	NR_LRU_BASE,
-	NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
-	NR_ACTIVE_ANON,		/*  "     "     "   "       "         */
-	NR_INACTIVE_FILE,	/*  "     "     "   "       "         */
-	NR_ACTIVE_FILE,		/*  "     "     "   "       "         */
-	NR_UNEVICTABLE,		/*  "     "     "   "       "         */
+	NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */
+	NR_ZONE_LRU_ANON = NR_ZONE_LRU_BASE,
+	NR_ZONE_LRU_FILE,
 	NR_MLOCK,		/* mlock()ed pages found and moved off LRU */
 	NR_ANON_PAGES,		/* Mapped anonymous pages */
 	NR_FILE_MAPPED,		/* pagecache pages mapped into pagetables.
@@ -134,12 +131,9 @@ enum zone_stat_item {
 	NR_VMSCAN_WRITE,
 	NR_VMSCAN_IMMEDIATE,	/* Prioritise for reclaim when writeback ends */
 	NR_WRITEBACK_TEMP,	/* Writeback using temporary buffers */
-	NR_ISOLATED_ANON,	/* Temporary isolated pages from anon lru */
-	NR_ISOLATED_FILE,	/* Temporary isolated pages from file lru */
 	NR_SHMEM,		/* shmem pages (included tmpfs/GEM pages) */
 	NR_DIRTIED,		/* page dirtyings since bootup */
 	NR_WRITTEN,		/* page writings since bootup */
-	NR_PAGES_SCANNED,	/* pages scanned since last reclaim */
 #if IS_ENABLED(CONFIG_ZSMALLOC)
 	NR_ZSPAGES,		/* allocated in zsmalloc */
 #endif
@@ -161,6 +155,15 @@ enum zone_stat_item {
 	NR_VM_ZONE_STAT_ITEMS };
 
 enum node_stat_item {
+	NR_LRU_BASE,
+	NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
+	NR_ACTIVE_ANON,		/*  "     "     "   "       "         */
+	NR_INACTIVE_FILE,	/*  "     "     "   "       "         */
+	NR_ACTIVE_FILE,		/*  "     "     "   "       "         */
+	NR_UNEVICTABLE,		/*  "     "     "   "       "         */
+	NR_ISOLATED_ANON,	/* Temporary isolated pages from anon lru */
+	NR_ISOLATED_FILE,	/* Temporary isolated pages from file lru */
+	NR_PAGES_SCANNED,	/* pages scanned since last reclaim */
 	NR_VM_NODE_STAT_ITEMS
 };
@@ -219,7 +222,7 @@ struct lruvec {
 	/* Evictions & activations on the inactive file list */
 	atomic_long_t		inactive_age;
 #ifdef CONFIG_MEMCG
-	struct zone		*zone;
+	struct pglist_data	*pgdat;
 #endif
 };
@@ -357,13 +360,6 @@ struct zone {
 #ifdef CONFIG_NUMA
 	int node;
 #endif
-
-	/*
-	 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
-	 * this zone's LRU.  Maintained by the pageout code.
-	 */
-	unsigned int inactive_ratio;
-
 	struct pglist_data	*zone_pgdat;
 	struct per_cpu_pageset __percpu *pageset;
@@ -495,9 +491,6 @@ struct zone {
 
 	/* Write-intensive fields used by page reclaim */
 
-	/* Fields commonly accessed by the page reclaim scanner */
-	struct lruvec		lruvec;
-
 	/*
 	 * When free pages are below this point, additional steps are taken
 	 * when reading the number of free pages to avoid per-cpu counter
@@ -537,17 +530,20 @@ struct zone {
 
 enum zone_flags {
 	ZONE_RECLAIM_LOCKED,	/* prevents concurrent reclaim */
-	ZONE_CONGESTED,		/* zone has many dirty pages backed by
+	ZONE_FAIR_DEPLETED,	/* fair zone policy batch depleted */
+};
+
+enum pgdat_flags {
+	PGDAT_CONGESTED,	/* pgdat has many dirty pages backed by
 				 * a congested BDI
 				 */
-	ZONE_DIRTY,		/* reclaim scanning has recently found
+	PGDAT_DIRTY,		/* reclaim scanning has recently found
 				 * many dirty file pages at the tail
 				 * of the LRU.
 				 */
-	ZONE_WRITEBACK,		/* reclaim scanning has recently found
+	PGDAT_WRITEBACK,	/* reclaim scanning has recently found
 				 * many pages under writeback
 				 */
-	ZONE_FAIR_DEPLETED,	/* fair zone policy batch depleted */
 };
 
 static inline unsigned long zone_end_pfn(const struct zone *zone)
@@ -707,6 +703,19 @@ typedef struct pglist_data {
 	unsigned long split_queue_len;
 #endif
 
+	/* Fields commonly accessed by the page reclaim scanner */
+	struct lruvec		lruvec;
+
+	/*
+	 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
+	 * this node's LRU.  Maintained by the pageout code.
+	 */
+	unsigned int inactive_ratio;
+
+	unsigned long		flags;
+
+	ZONE_PADDING(_pad2_)
+
 	/* Per-node vmstats */
 	struct per_cpu_nodestat __percpu *per_cpu_nodestats;
 	atomic_long_t		vm_stat[NR_VM_NODE_STAT_ITEMS];
@@ -728,6 +737,11 @@ static inline spinlock_t *zone_lru_lock(struct zone *zone)
 	return &zone->zone_pgdat->lru_lock;
 }
 
+static inline struct lruvec *zone_lruvec(struct zone *zone)
+{
+	return &zone->zone_pgdat->lruvec;
+}
+
 static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
 {
 	return pgdat->node_start_pfn + pgdat->node_spanned_pages;
@@ -779,12 +793,12 @@ extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
 
 extern void lruvec_init(struct lruvec *lruvec);
 
-static inline struct zone *lruvec_zone(struct lruvec *lruvec)
+static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
 {
 #ifdef CONFIG_MEMCG
-	return lruvec->zone;
+	return lruvec->pgdat;
 #else
-	return container_of(lruvec, struct zone, lruvec);
+	return container_of(lruvec, struct pglist_data, lruvec);
 #endif
 }

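A small sketch of the resulting relationship (assuming a valid struct
zone *zone): all zones of a node now share the node's single lruvec,
and the owning node can be recovered from it:

    struct lruvec *lruvec = zone_lruvec(zone);
    struct pglist_data *pgdat = lruvec_pgdat(lruvec);

    /* every zone of a node maps to the same per-node lruvec */
    VM_BUG_ON(pgdat != zone->zone_pgdat);
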
1 change: 1 addition & 0 deletions include/linux/swap.h
@@ -317,6 +317,7 @@ extern void lru_cache_add_active_or_unevictable(struct page *page,
 
 /* linux/mm/vmscan.c */
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
+extern unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat);
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask, nodemask_t *mask);
 extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);

10 changes: 5 additions & 5 deletions include/linux/vm_event_item.h
@@ -26,11 +26,11 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		PGFREE, PGACTIVATE, PGDEACTIVATE,
 		PGFAULT, PGMAJFAULT,
 		PGLAZYFREED,
-		FOR_ALL_ZONES(PGREFILL),
-		FOR_ALL_ZONES(PGSTEAL_KSWAPD),
-		FOR_ALL_ZONES(PGSTEAL_DIRECT),
-		FOR_ALL_ZONES(PGSCAN_KSWAPD),
-		FOR_ALL_ZONES(PGSCAN_DIRECT),
+		PGREFILL,
+		PGSTEAL_KSWAPD,
+		PGSTEAL_DIRECT,
+		PGSCAN_KSWAPD,
+		PGSCAN_DIRECT,
 		PGSCAN_DIRECT_THROTTLE,
 #ifdef CONFIG_NUMA
 		PGSCAN_ZONE_RECLAIM_FAILED,

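With FOR_ALL_ZONES() gone, these events are no longer expanded into one
counter per zone and callers record a single node-wide event. A sketch
of the caller-side simplification in reclaim (context assumed):

    /* was: count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); */
    count_vm_events(PGSCAN_KSWAPD, nr_scanned);
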
17 changes: 17 additions & 0 deletions include/linux/vmstat.h
@@ -178,6 +178,23 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
 	return x;
 }
 
+static inline unsigned long node_page_state_snapshot(pg_data_t *pgdat,
+					enum node_stat_item item)
+{
+	long x = atomic_long_read(&pgdat->vm_stat[item]);
+
+#ifdef CONFIG_SMP
+	int cpu;
+	for_each_online_cpu(cpu)
+		x += per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->vm_node_stat_diff[item];
+
+	if (x < 0)
+		x = 0;
+#endif
+	return x;
+}
+
+
 #ifdef CONFIG_NUMA
 extern unsigned long sum_zone_node_page_state(int node,
 				enum zone_stat_item item);

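Like its zone counterpart above it, the snapshot folds the per-cpu
deltas into the node counter so that throttling decisions do not act on
stale values, at the cost of touching every online cpu. A minimal usage
sketch (caller assumed):

    /* accurate but expensive read, for decisions such as too_many_isolated() */
    unsigned long nr_isolated = node_page_state_snapshot(pgdat, NR_ISOLATED_FILE);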