Skip to content

Commit

Permalink
mm/demotion: build demotion targets based on explicit memory tiers
Browse files Browse the repository at this point in the history
This patch switch the demotion target building logic to use memory tiers
instead of NUMA distance.  All N_MEMORY NUMA nodes will be placed in the
default memory tier and additional memory tiers will be added by drivers
like dax kmem.

This patch builds the demotion target for a NUMA node by looking at all
memory tiers below the tier to which the NUMA node belongs.  The closest
node in the immediately following memory tier is used as a demotion
target.

Since we are now only building demotion target for N_MEMORY NUMA nodes the
CPU hotplug calls are removed in this patch.

Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Aneesh Kumar K.V <[email protected]>
Reviewed-by: "Huang, Ying" <[email protected]>
Acked-by: Wei Xu <[email protected]>
Cc: Alistair Popple <[email protected]>
Cc: Bharata B Rao <[email protected]>
Cc: Dan Williams <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Davidlohr Bueso <[email protected]>
Cc: Hesham Almatary <[email protected]>
Cc: Jagdish Gediya <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Jonathan Cameron <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Tim Chen <[email protected]>
Cc: Yang Shi <[email protected]>
Cc: SeongJae Park <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
  • Loading branch information
kvaneesh authored and akpm00 committed Sep 27, 2022
1 parent 7b88bda commit 6c542ab
Show file tree
Hide file tree
Showing 5 changed files with 239 additions and 423 deletions.
13 changes: 13 additions & 0 deletions include/linux/memory-tiers.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,14 @@ struct memory_dev_type *alloc_memory_type(int adistance);
void destroy_memory_type(struct memory_dev_type *memtype);
void init_node_memory_type(int node, struct memory_dev_type *default_type);
void clear_node_memory_type(int node, struct memory_dev_type *memtype);
#ifdef CONFIG_MIGRATION
int next_demotion_node(int node);
#else
static inline int next_demotion_node(int node)
{
return NUMA_NO_NODE;
}
#endif

#else

Expand All @@ -63,5 +71,10 @@ static inline void clear_node_memory_type(int node, struct memory_dev_type *memt
{

}

static inline int next_demotion_node(int node)
{
return NUMA_NO_NODE;
}
#endif /* CONFIG_NUMA */
#endif /* _LINUX_MEMORY_TIERS_H */
13 changes: 0 additions & 13 deletions include/linux/migrate.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,19 +100,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,

#endif /* CONFIG_MIGRATION */

#if defined(CONFIG_MIGRATION) && defined(CONFIG_NUMA)
extern void set_migration_target_nodes(void);
extern void migrate_on_reclaim_init(void);
extern int next_demotion_node(int node);
#else
static inline void set_migration_target_nodes(void) {}
static inline void migrate_on_reclaim_init(void) {}
static inline int next_demotion_node(int node)
{
return NUMA_NO_NODE;
}
#endif

#ifdef CONFIG_COMPACTION
bool PageMovable(struct page *page);
void __SetPageMovable(struct page *page, const struct movable_operations *ops);
Expand Down
238 changes: 226 additions & 12 deletions mm/memory-tiers.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#include <linux/memory.h>
#include <linux/memory-tiers.h>

#include "internal.h"

struct memory_tier {
/* hierarchy of memory tiers */
struct list_head list;
Expand All @@ -19,6 +21,10 @@ struct memory_tier {
int adistance_start;
};

struct demotion_nodes {
nodemask_t preferred;
};

struct node_memory_type_map {
struct memory_dev_type *memtype;
int map_count;
Expand All @@ -28,6 +34,66 @@ static DEFINE_MUTEX(memory_tier_lock);
static LIST_HEAD(memory_tiers);
static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
static struct memory_dev_type *default_dram_type;
#ifdef CONFIG_MIGRATION
/*
* node_demotion[] examples:
*
* Example 1:
*
* Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes.
*
* node distances:
* node 0 1 2 3
* 0 10 20 30 40
* 1 20 10 40 30
* 2 30 40 10 40
* 3 40 30 40 10
*
* memory_tiers0 = 0-1
* memory_tiers1 = 2-3
*
* node_demotion[0].preferred = 2
* node_demotion[1].preferred = 3
* node_demotion[2].preferred = <empty>
* node_demotion[3].preferred = <empty>
*
* Example 2:
*
* Node 0 & 1 are CPU + DRAM nodes, node 2 is memory-only DRAM node.
*
* node distances:
* node 0 1 2
* 0 10 20 30
* 1 20 10 30
* 2 30 30 10
*
* memory_tiers0 = 0-2
*
* node_demotion[0].preferred = <empty>
* node_demotion[1].preferred = <empty>
* node_demotion[2].preferred = <empty>
*
* Example 3:
*
* Node 0 is CPU + DRAM nodes, Node 1 is HBM node, node 2 is PMEM node.
*
* node distances:
* node 0 1 2
* 0 10 20 30
* 1 20 10 40
* 2 30 40 10
*
* memory_tiers0 = 1
* memory_tiers1 = 0
* memory_tiers2 = 2
*
* node_demotion[0].preferred = 2
* node_demotion[1].preferred = 0
* node_demotion[2].preferred = <empty>
*
*/
static struct demotion_nodes *node_demotion __read_mostly;
#endif /* CONFIG_MIGRATION */

static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
{
Expand Down Expand Up @@ -73,6 +139,154 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty
return new_memtier;
}

static struct memory_tier *__node_get_memory_tier(int node)
{
struct memory_dev_type *memtype;

memtype = node_memory_types[node];
if (memtype && node_isset(node, memtype->nodes))
return memtype->memtier;
return NULL;
}

#ifdef CONFIG_MIGRATION
/**
* next_demotion_node() - Get the next node in the demotion path
* @node: The starting node to lookup the next node
*
* Return: node id for next memory node in the demotion path hierarchy
* from @node; NUMA_NO_NODE if @node is terminal. This does not keep
* @node online or guarantee that it *continues* to be the next demotion
* target.
*/
int next_demotion_node(int node)
{
struct demotion_nodes *nd;
int target;

if (!node_demotion)
return NUMA_NO_NODE;

nd = &node_demotion[node];

/*
* node_demotion[] is updated without excluding this
* function from running.
*
* Make sure to use RCU over entire code blocks if
* node_demotion[] reads need to be consistent.
*/
rcu_read_lock();
/*
* If there are multiple target nodes, just select one
* target node randomly.
*
* In addition, we can also use round-robin to select
* target node, but we should introduce another variable
* for node_demotion[] to record last selected target node,
* that may cause cache ping-pong due to the changing of
* last target node. Or introducing per-cpu data to avoid
* caching issue, which seems more complicated. So selecting
* target node randomly seems better until now.
*/
target = node_random(&nd->preferred);
rcu_read_unlock();

return target;
}

static void disable_all_demotion_targets(void)
{
int node;

for_each_node_state(node, N_MEMORY)
node_demotion[node].preferred = NODE_MASK_NONE;
/*
* Ensure that the "disable" is visible across the system.
* Readers will see either a combination of before+disable
* state or disable+after. They will never see before and
* after state together.
*/
synchronize_rcu();
}

static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
{
nodemask_t nodes = NODE_MASK_NONE;
struct memory_dev_type *memtype;

list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
nodes_or(nodes, nodes, memtype->nodes);

return nodes;
}

/*
* Find an automatic demotion target for all memory
* nodes. Failing here is OK. It might just indicate
* being at the end of a chain.
*/
static void establish_demotion_targets(void)
{
struct memory_tier *memtier;
struct demotion_nodes *nd;
int target = NUMA_NO_NODE, node;
int distance, best_distance;
nodemask_t tier_nodes;

lockdep_assert_held_once(&memory_tier_lock);

if (!node_demotion || !IS_ENABLED(CONFIG_MIGRATION))
return;

disable_all_demotion_targets();

for_each_node_state(node, N_MEMORY) {
best_distance = -1;
nd = &node_demotion[node];

memtier = __node_get_memory_tier(node);
if (!memtier || list_is_last(&memtier->list, &memory_tiers))
continue;
/*
* Get the lower memtier to find the demotion node list.
*/
memtier = list_next_entry(memtier, list);
tier_nodes = get_memtier_nodemask(memtier);
/*
* find_next_best_node, use 'used' nodemask as a skip list.
* Add all memory nodes except the selected memory tier
* nodelist to skip list so that we find the best node from the
* memtier nodelist.
*/
nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);

/*
* Find all the nodes in the memory tier node list of same best distance.
* add them to the preferred mask. We randomly select between nodes
* in the preferred mask when allocating pages during demotion.
*/
do {
target = find_next_best_node(node, &tier_nodes);
if (target == NUMA_NO_NODE)
break;

distance = node_distance(node, target);
if (distance == best_distance || best_distance == -1) {
best_distance = distance;
node_set(target, nd->preferred);
} else {
break;
}
} while (1);
}
}

#else
static inline void disable_all_demotion_targets(void) {}
static inline void establish_demotion_targets(void) {}
#endif /* CONFIG_MIGRATION */

static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
{
if (!node_memory_types[node].memtype)
Expand Down Expand Up @@ -109,16 +323,6 @@ static struct memory_tier *set_node_memory_tier(int node)
return memtier;
}

static struct memory_tier *__node_get_memory_tier(int node)
{
struct memory_dev_type *memtype;

memtype = node_memory_types[node];
if (memtype && node_isset(node, memtype->nodes))
return memtype->memtier;
return NULL;
}

static void destroy_memory_tier(struct memory_tier *memtier)
{
list_del(&memtier->list);
Expand Down Expand Up @@ -207,6 +411,7 @@ EXPORT_SYMBOL_GPL(clear_node_memory_type);
static int __meminit memtier_hotplug_callback(struct notifier_block *self,
unsigned long action, void *_arg)
{
struct memory_tier *memtier;
struct memory_notify *arg = _arg;

/*
Expand All @@ -219,12 +424,15 @@ static int __meminit memtier_hotplug_callback(struct notifier_block *self,
switch (action) {
case MEM_OFFLINE:
mutex_lock(&memory_tier_lock);
clear_node_memory_tier(arg->status_change_nid);
if (clear_node_memory_tier(arg->status_change_nid))
establish_demotion_targets();
mutex_unlock(&memory_tier_lock);
break;
case MEM_ONLINE:
mutex_lock(&memory_tier_lock);
set_node_memory_tier(arg->status_change_nid);
memtier = set_node_memory_tier(arg->status_change_nid);
if (!IS_ERR(memtier))
establish_demotion_targets();
mutex_unlock(&memory_tier_lock);
break;
}
Expand All @@ -237,6 +445,11 @@ static int __init memory_tier_init(void)
int node;
struct memory_tier *memtier;

#ifdef CONFIG_MIGRATION
node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
GFP_KERNEL);
WARN_ON(!node_demotion);
#endif
mutex_lock(&memory_tier_lock);
/*
* For now we can have 4 faster memory tiers with smaller adistance
Expand All @@ -259,6 +472,7 @@ static int __init memory_tier_init(void)
*/
break;
}
establish_demotion_targets();
mutex_unlock(&memory_tier_lock);

hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRIO);
Expand Down
Loading

0 comments on commit 6c542ab

Please sign in to comment.