Skip to content

Commit

Permalink
mm: memcg/slab: use a single set of kmem_caches for all accounted all…
Browse files Browse the repository at this point in the history
…ocations

This is a fairly big but mostly red patch, which makes all accounted slab
allocations use a single set of kmem_caches instead of creating a separate
set for each memory cgroup.

Because the number of non-root kmem_caches is now capped by the number of
root kmem_caches, there is no need to shrink or destroy them prematurely.
They can be perfectly destroyed together with their root counterparts.
This allows us to dramatically simplify the management of non-root
kmem_caches and delete a ton of code.

This patch performs the following changes:
1) introduces memcg_params.memcg_cache pointer to represent the
   kmem_cache which will be used for all non-root allocations
2) reuses the existing memcg kmem_cache creation mechanism
   to create memcg kmem_cache on the first allocation attempt
3) memcg kmem_caches are named <kmemcache_name>-memcg,
   e.g. dentry-memcg
4) simplifies memcg_kmem_get_cache() to just return memcg kmem_cache
   or schedule its creation and return the root cache
5) removes almost all non-root kmem_cache management code
   (separate refcounter, reparenting, shrinking, etc)
6) makes slab debugfs display the root_mem_cgroup css id and never
   show :dead and :deact flags in the memcg_slabinfo attribute.

Following patches in the series will simplify the kmem_cache creation.

Signed-off-by: Roman Gushchin <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Reviewed-by: Vlastimil Babka <[email protected]>
Reviewed-by: Shakeel Butt <[email protected]>
Cc: Christoph Lameter <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Tejun Heo <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Linus Torvalds <[email protected]>
  • Loading branch information
rgushchin authored and torvalds committed Aug 7, 2020
1 parent 0f876e4 commit 9855609
Show file tree
Hide file tree
Showing 7 changed files with 134 additions and 698 deletions.
5 changes: 1 addition & 4 deletions include/linux/memcontrol.h
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,6 @@ struct mem_cgroup {
/* Index in the kmem_cache->memcg_params.memcg_caches array */
int kmemcg_id;
enum memcg_kmem_state kmem_state;
struct list_head kmem_caches;
struct obj_cgroup __rcu *objcg;
struct list_head objcg_list; /* list of inherited objcgs */
#endif
Expand Down Expand Up @@ -1404,9 +1403,7 @@ static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
}
#endif

struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep,
struct obj_cgroup **objcgp);
void memcg_kmem_put_cache(struct kmem_cache *cachep);
struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep);

#ifdef CONFIG_MEMCG_KMEM
int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
Expand Down
5 changes: 1 addition & 4 deletions include/linux/slab.h
Original file line number Diff line number Diff line change
Expand Up @@ -155,8 +155,7 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name,
void kmem_cache_destroy(struct kmem_cache *);
int kmem_cache_shrink(struct kmem_cache *);

void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *);
void memcg_deactivate_kmem_caches(struct mem_cgroup *, struct mem_cgroup *);
void memcg_create_kmem_cache(struct kmem_cache *cachep);

/*
* Please use this macro to create slab caches. Simply specify the
Expand Down Expand Up @@ -580,8 +579,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
return __kmalloc_node(size, flags, node);
}

int memcg_update_all_caches(int num_memcgs);

/**
* kmalloc_array - allocate memory for an array.
* @n: number of elements.
Expand Down
163 changes: 32 additions & 131 deletions mm/memcontrol.c
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
}

/*
* This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
* This will be used as a shrinker list's index.
* The main reason for not using cgroup id for this:
* this works better in sparse environments, where we have a lot of memcgs,
* but only a few kmem-limited. Or also, if we have, for instance, 200
Expand Down Expand Up @@ -569,20 +569,16 @@ ino_t page_cgroup_ino(struct page *page)
unsigned long ino = 0;

rcu_read_lock();
if (PageSlab(page) && !PageTail(page)) {
memcg = memcg_from_slab_page(page);
} else {
memcg = page->mem_cgroup;
memcg = page->mem_cgroup;

/*
* The lowest bit set means that memcg isn't a valid
* memcg pointer, but an obj_cgroups pointer.
* In this case the page is shared and doesn't belong
* to any specific memory cgroup.
*/
if ((unsigned long) memcg & 0x1UL)
memcg = NULL;
}
/*
* The lowest bit set means that memcg isn't a valid
* memcg pointer, but a obj_cgroups pointer.
* In this case the page is shared and doesn't belong
* to any specific memory cgroup.
*/
if ((unsigned long) memcg & 0x1UL)
memcg = NULL;

while (memcg && !(memcg->css.flags & CSS_ONLINE))
memcg = parent_mem_cgroup(memcg);
Expand Down Expand Up @@ -2822,12 +2818,18 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
page = virt_to_head_page(p);

/*
* Slab pages don't have page->mem_cgroup set because corresponding
* kmem caches can be reparented during the lifetime. That's why
* memcg_from_slab_page() should be used instead.
* Slab objects are accounted individually, not per-page.
* Memcg membership data for each individual object is saved in
* the page->obj_cgroups.
*/
if (PageSlab(page))
return memcg_from_slab_page(page);
if (page_has_obj_cgroups(page)) {
struct obj_cgroup *objcg;
unsigned int off;

off = obj_to_index(page->slab_cache, page, p);
objcg = page_obj_cgroups(page)[off];
return obj_cgroup_memcg(objcg);
}

/* All other pages use page->mem_cgroup */
return page->mem_cgroup;
Expand Down Expand Up @@ -2882,9 +2884,7 @@ static int memcg_alloc_cache_id(void)
else if (size > MEMCG_CACHES_MAX_SIZE)
size = MEMCG_CACHES_MAX_SIZE;

err = memcg_update_all_caches(size);
if (!err)
err = memcg_update_all_list_lrus(size);
err = memcg_update_all_list_lrus(size);
if (!err)
memcg_nr_cache_ids = size;

Expand All @@ -2903,7 +2903,6 @@ static void memcg_free_cache_id(int id)
}

struct memcg_kmem_cache_create_work {
struct mem_cgroup *memcg;
struct kmem_cache *cachep;
struct work_struct work;
};
Expand All @@ -2912,136 +2911,51 @@ static void memcg_kmem_cache_create_func(struct work_struct *w)
{
struct memcg_kmem_cache_create_work *cw =
container_of(w, struct memcg_kmem_cache_create_work, work);
struct mem_cgroup *memcg = cw->memcg;
struct kmem_cache *cachep = cw->cachep;

memcg_create_kmem_cache(memcg, cachep);
memcg_create_kmem_cache(cachep);

css_put(&memcg->css);
kfree(cw);
}

/*
* Enqueue the creation of a per-memcg kmem_cache.
*/
static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
struct kmem_cache *cachep)
static void memcg_schedule_kmem_cache_create(struct kmem_cache *cachep)
{
struct memcg_kmem_cache_create_work *cw;

if (!css_tryget_online(&memcg->css))
return;

cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
if (!cw) {
css_put(&memcg->css);
if (!cw)
return;
}

cw->memcg = memcg;
cw->cachep = cachep;
INIT_WORK(&cw->work, memcg_kmem_cache_create_func);

queue_work(memcg_kmem_cache_wq, &cw->work);
}

/**
* memcg_kmem_get_cache: select the correct per-memcg cache for allocation
* memcg_kmem_get_cache: select memcg or root cache for allocation
* @cachep: the original global kmem cache
*
* Return the kmem_cache we're supposed to use for a slab allocation.
* We try to use the current memcg's version of the cache.
*
* If the cache does not exist yet, if we are the first user of it, we
* create it asynchronously in a workqueue and let the current allocation
* go through with the original cache.
*
* This function takes a reference to the cache it returns to assure it
* won't get destroyed while we are working with it. Once the caller is
* done with it, memcg_kmem_put_cache() must be called to release the
* reference.
*/
struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep,
struct obj_cgroup **objcgp)
struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
{
struct mem_cgroup *memcg;
struct kmem_cache *memcg_cachep;
struct memcg_cache_array *arr;
int kmemcg_id;

VM_BUG_ON(!is_root_cache(cachep));

if (memcg_kmem_bypass())
memcg_cachep = READ_ONCE(cachep->memcg_params.memcg_cache);
if (unlikely(!memcg_cachep)) {
memcg_schedule_kmem_cache_create(cachep);
return cachep;

rcu_read_lock();

if (unlikely(current->active_memcg))
memcg = current->active_memcg;
else
memcg = mem_cgroup_from_task(current);

if (!memcg || memcg == root_mem_cgroup)
goto out_unlock;

kmemcg_id = READ_ONCE(memcg->kmemcg_id);
if (kmemcg_id < 0)
goto out_unlock;

arr = rcu_dereference(cachep->memcg_params.memcg_caches);

/*
* Make sure we will access the up-to-date value. The code updating
* memcg_caches issues a write barrier to match the data dependency
* barrier inside READ_ONCE() (see memcg_create_kmem_cache()).
*/
memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);

/*
* If we are in a safe context (can wait, and not in interrupt
* context), we could be predictable and return right away.
* This would guarantee that the allocation being performed
* already belongs in the new cache.
*
* However, there are some clashes that can arrive from locking.
* For instance, because we acquire the slab_mutex while doing
* memcg_create_kmem_cache, this means no further allocation
* could happen with the slab_mutex held. So it's better to
* defer everything.
*
* If the memcg is dying or memcg_cache is about to be released,
* don't bother creating new kmem_caches. Because memcg_cachep
* is ZEROed as the first step of kmem offlining, we don't need
* percpu_ref_tryget_live() here. css_tryget_online() check in
* memcg_schedule_kmem_cache_create() will prevent us from
* creation of a new kmem_cache.
*/
if (unlikely(!memcg_cachep))
memcg_schedule_kmem_cache_create(memcg, cachep);
else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt)) {
struct obj_cgroup *objcg = rcu_dereference(memcg->objcg);

if (!objcg || !obj_cgroup_tryget(objcg)) {
percpu_ref_put(&memcg_cachep->memcg_params.refcnt);
goto out_unlock;
}

*objcgp = objcg;
cachep = memcg_cachep;
}
out_unlock:
rcu_read_unlock();
return cachep;
}

/**
* memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
* @cachep: the cache returned by memcg_kmem_get_cache
*/
void memcg_kmem_put_cache(struct kmem_cache *cachep)
{
if (!is_root_cache(cachep))
percpu_ref_put(&cachep->memcg_params.refcnt);
return memcg_cachep;
}

/**
Expand Down Expand Up @@ -3731,7 +3645,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
*/
memcg->kmemcg_id = memcg_id;
memcg->kmem_state = KMEM_ONLINE;
INIT_LIST_HEAD(&memcg->kmem_caches);

return 0;
}
Expand All @@ -3744,22 +3657,13 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)

if (memcg->kmem_state != KMEM_ONLINE)
return;
/*
* Clear the online state before clearing memcg_caches array
* entries. The slab_mutex in memcg_deactivate_kmem_caches()
* guarantees that no cache will be created for this cgroup
* after we are done (see memcg_create_kmem_cache()).
*/

memcg->kmem_state = KMEM_ALLOCATED;

parent = parent_mem_cgroup(memcg);
if (!parent)
parent = root_mem_cgroup;

/*
* Deactivate and reparent kmem_caches and objcgs.
*/
memcg_deactivate_kmem_caches(memcg, parent);
memcg_reparent_objcgs(memcg, parent);

kmemcg_id = memcg->kmemcg_id;
Expand Down Expand Up @@ -5384,9 +5288,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)

/* The following stuff does not apply to the root */
if (!parent) {
#ifdef CONFIG_MEMCG_KMEM
INIT_LIST_HEAD(&memcg->kmem_caches);
#endif
root_mem_cgroup = memcg;
return &memcg->css;
}
Expand Down
16 changes: 3 additions & 13 deletions mm/slab.c
Original file line number Diff line number Diff line change
Expand Up @@ -1249,7 +1249,7 @@ void __init kmem_cache_init(void)
nr_node_ids * sizeof(struct kmem_cache_node *),
SLAB_HWCACHE_ALIGN, 0, 0);
list_add(&kmem_cache->list, &slab_caches);
memcg_link_cache(kmem_cache, NULL);
memcg_link_cache(kmem_cache);
slab_state = PARTIAL;

/*
Expand Down Expand Up @@ -2253,17 +2253,6 @@ int __kmem_cache_shrink(struct kmem_cache *cachep)
return (ret ? 1 : 0);
}

#ifdef CONFIG_MEMCG
void __kmemcg_cache_deactivate(struct kmem_cache *cachep)
{
__kmem_cache_shrink(cachep);
}

void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
{
}
#endif

int __kmem_cache_shutdown(struct kmem_cache *cachep)
{
return __kmem_cache_shrink(cachep);
Expand Down Expand Up @@ -3872,7 +3861,8 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
return ret;

lockdep_assert_held(&slab_mutex);
for_each_memcg_cache(c, cachep) {
c = memcg_cache(cachep);
if (c) {
/* return value determined by the root cache only */
__do_tune_cpucache(c, limit, batchcount, shared, gfp);
}
Expand Down
Loading

0 comments on commit 9855609

Please sign in to comment.