Skip to content

Commit

Permalink
mm/demotion: demote pages according to allocation fallback order
Browse files Browse the repository at this point in the history
Currently, a higher tier node can only be demoted to selected nodes on the
next lower tier as defined by the demotion path.  This strict demotion
order does not work in all use cases (e.g.  some use cases may want to
allow cross-socket demotion to another node in the same demotion tier as a
fallback when the preferred demotion node is out of space).  This demotion
order is also inconsistent with the page allocation fallback order when
all the nodes in a higher tier are out of space: The page allocation can
fall back to any node from any lower tier, whereas the demotion order
doesn't allow that currently.

This patch adds support to get all the allowed demotion targets for a
memory tier.  demote_page_list() function is now modified to utilize this
allowed node mask as the fallback allocation mask.

Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Jagdish Gediya <[email protected]>
Signed-off-by: Aneesh Kumar K.V <[email protected]>
Reviewed-by: "Huang, Ying" <[email protected]>
Acked-by: Wei Xu <[email protected]>
Cc: Alistair Popple <[email protected]>
Cc: Bharata B Rao <[email protected]>
Cc: Dan Williams <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Davidlohr Bueso <[email protected]>
Cc: Hesham Almatary <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Jonathan Cameron <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Tim Chen <[email protected]>
Cc: Yang Shi <[email protected]>
Cc: SeongJae Park <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
  • Loading branch information
Jagdish Gediya authored and akpm00 committed Sep 27, 2022
1 parent b26ac6f commit 3200802
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 18 deletions.
12 changes: 12 additions & 0 deletions include/linux/memory-tiers.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <linux/types.h>
#include <linux/nodemask.h>
#include <linux/kref.h>
#include <linux/mmzone.h>
/*
* Each tier covers an abstract distance chunk size of 128
*/
Expand Down Expand Up @@ -38,11 +39,17 @@ void init_node_memory_type(int node, struct memory_dev_type *default_type);
void clear_node_memory_type(int node, struct memory_dev_type *memtype);
#ifdef CONFIG_MIGRATION
int next_demotion_node(int node);
void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
#else
static inline int next_demotion_node(int node)
{
return NUMA_NO_NODE;
}

/*
 * Stub for the #else side of CONFIG_MIGRATION: always reports an empty
 * target mask through @targets; @pgdat is not consulted.
 */
static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
{
*targets = NODE_MASK_NONE;
}
#endif

#else
Expand Down Expand Up @@ -75,5 +82,10 @@ static inline int next_demotion_node(int node)
{
return NUMA_NO_NODE;
}

/*
 * Stub for the #else branch that is closed by "#endif / * CONFIG_NUMA * /"
 * just below: always reports an empty target mask through @targets;
 * @pgdat is not consulted.
 */
static inline void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
{
*targets = NODE_MASK_NONE;
}
#endif /* CONFIG_NUMA */
#endif /* _LINUX_MEMORY_TIERS_H */
51 changes: 48 additions & 3 deletions mm/memory-tiers.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
#include <linux/sysfs.h>
#include <linux/kobject.h>
#include <linux/memory.h>
#include <linux/mmzone.h>
#include <linux/memory-tiers.h>

#include "internal.h"
Expand All @@ -20,6 +19,8 @@ struct memory_tier {
* adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
*/
int adistance_start;
/* All the nodes that are part of all the lower memory tiers. */
nodemask_t lower_tier_mask;
};

struct demotion_nodes {
Expand Down Expand Up @@ -161,6 +162,24 @@ static struct memory_tier *__node_get_memory_tier(int node)
}

#ifdef CONFIG_MIGRATION
/**
 * node_get_allowed_targets() - Fetch the demotion fallback mask of a node
 * @pgdat: Node data whose memory tier is looked up
 * @targets: Output mask; receives the tier's lower_tier_mask, or
 *           NODE_MASK_NONE when @pgdat has no memory tier assigned
 */
void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
{
	struct memory_tier *tier;

	/*
	 * Updates of pg_data_t.memtier include a synchronize_rcu(), so
	 * inside the RCU read-side section we observe either NULL or a
	 * valid memtier in NODE_DATA.
	 */
	rcu_read_lock();
	tier = rcu_dereference(pgdat->memtier);
	*targets = tier ? tier->lower_tier_mask : NODE_MASK_NONE;
	rcu_read_unlock();
}

/**
* next_demotion_node() - Get the next node in the demotion path
* @node: The starting node to lookup the next node
Expand Down Expand Up @@ -208,10 +227,19 @@ int next_demotion_node(int node)

static void disable_all_demotion_targets(void)
{
struct memory_tier *memtier;
int node;

for_each_node_state(node, N_MEMORY)
for_each_node_state(node, N_MEMORY) {
node_demotion[node].preferred = NODE_MASK_NONE;
/*
* We are holding memory_tier_lock, so it is safe
* to access pgdat->memtier.
*/
memtier = __node_get_memory_tier(node);
if (memtier)
memtier->lower_tier_mask = NODE_MASK_NONE;
}
/*
* Ensure that the "disable" is visible across the system.
* Readers will see either a combination of before+disable
Expand Down Expand Up @@ -243,7 +271,7 @@ static void establish_demotion_targets(void)
struct demotion_nodes *nd;
int target = NUMA_NO_NODE, node;
int distance, best_distance;
nodemask_t tier_nodes;
nodemask_t tier_nodes, lower_tier;

lockdep_assert_held_once(&memory_tier_lock);

Expand Down Expand Up @@ -291,6 +319,23 @@ static void establish_demotion_targets(void)
}
} while (1);
}
/*
* Now build the lower_tier mask for each node, collecting the node mask
* from all memory tiers below it. This allows us to fall back demotion
* page allocation to a set of nodes that is close to the preferred node
* selected above.
*/
lower_tier = node_states[N_MEMORY];
list_for_each_entry(memtier, &memory_tiers, list) {
/*
* Keep removing the current tier from the lower_tier nodes.
* This removes all nodes in the current and higher
* memory tiers from the lower_tier mask.
*/
tier_nodes = get_memtier_nodemask(memtier);
nodes_andnot(lower_tier, lower_tier, tier_nodes);
memtier->lower_tier_mask = lower_tier;
}
}

#else
Expand Down
58 changes: 43 additions & 15 deletions mm/vmscan.c
Original file line number Diff line number Diff line change
Expand Up @@ -1533,21 +1533,34 @@ static void folio_check_dirty_writeback(struct folio *folio,
mapping->a_ops->is_dirty_writeback(folio, dirty, writeback);
}

static struct page *alloc_demote_page(struct page *page, unsigned long node)
static struct page *alloc_demote_page(struct page *page, unsigned long private)
{
struct migration_target_control mtc = {
/*
* Allocate from 'node', or fail quickly and quietly.
* When this happens, 'page' will likely just be discarded
* instead of migrated.
*/
.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
__GFP_THISNODE | __GFP_NOWARN |
__GFP_NOMEMALLOC | GFP_NOWAIT,
.nid = node
};
struct page *target_page;
nodemask_t *allowed_mask;
struct migration_target_control *mtc;

mtc = (struct migration_target_control *)private;

allowed_mask = mtc->nmask;
/*
* Make sure we allocate from the target node first, also trying to
* demote or reclaim pages from the target node via kswapd if we are
* low on free memory on the target node. If we don't do this and
* we have free memory on a slower (lower) memtier, we would start
* allocating pages from slower (lower) memory tiers without even forcing
* a demotion of cold pages from the target memtier. This can result
* in the kernel placing hot pages in slower (lower) memory tiers.
*/
mtc->nmask = NULL;
mtc->gfp_mask |= __GFP_THISNODE;
target_page = alloc_migration_target(page, (unsigned long)mtc);
if (target_page)
return target_page;

return alloc_migration_target(page, (unsigned long)&mtc);
mtc->gfp_mask &= ~__GFP_THISNODE;
mtc->nmask = allowed_mask;

return alloc_migration_target(page, (unsigned long)mtc);
}

/*
Expand All @@ -1560,17 +1573,32 @@ static unsigned int demote_page_list(struct list_head *demote_pages,
{
int target_nid = next_demotion_node(pgdat->node_id);
unsigned int nr_succeeded;
nodemask_t allowed_mask;

struct migration_target_control mtc = {
/*
* Allocate from 'node', or fail quickly and quietly.
* When this happens, 'page' will likely just be discarded
* instead of migrated.
*/
.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
__GFP_NOMEMALLOC | GFP_NOWAIT,
.nid = target_nid,
.nmask = &allowed_mask
};

if (list_empty(demote_pages))
return 0;

if (target_nid == NUMA_NO_NODE)
return 0;

node_get_allowed_targets(pgdat, &allowed_mask);

/* Demotion ignores all cpuset and mempolicy settings */
migrate_pages(demote_pages, alloc_demote_page, NULL,
target_nid, MIGRATE_ASYNC, MR_DEMOTION,
&nr_succeeded);
(unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
&nr_succeeded);

if (current_is_kswapd())
__count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);
Expand Down

0 comments on commit 3200802

Please sign in to comment.