devmap/cpumap: Use flush list instead of bitmap
The socket map uses a linked list instead of a bitmap to keep track of
which entries to flush. Do the same for devmap and cpumap, as this means we
don't have to care about the map index when enqueueing things into the
map (and so we can cache the map lookup).

Signed-off-by: Toke Høiland-Jørgensen <[email protected]>
Acked-by: Jonathan Lemon <[email protected]>
Acked-by: Andrii Nakryiko <[email protected]>
Signed-off-by: Daniel Borkmann <[email protected]>
tohojo authored and borkmann committed Jun 28, 2019
1 parent c8af5cd commit d5df283
Showing 3 changed files with 95 additions and 119 deletions.
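
The pattern the commit message refers to can be illustrated outside the kernel. The sketch below is a minimal, self-contained user-space approximation, not the kernel code: names such as struct bulk_queue, list_add_front() and flush_all() are illustrative stand-ins, while the real implementation in the diff uses struct xdp_bulk_queue, a per-CPU struct list_head and list_for_each_entry_safe().

/* Minimal user-space sketch of the flush-list pattern (illustrative names,
 * not the kernel's): each bulk queue carries its own list node, the first
 * frame enqueued links the queue onto a per-CPU flush list, and flushing
 * simply walks that list instead of scanning a bitmap across every map index.
 */
#include <stddef.h>
#include <stdio.h>

#define BULK_SIZE 8

struct list_node { struct list_node *prev, *next; };

struct bulk_queue {
	void *q[BULK_SIZE];
	unsigned int count;
	struct list_node flush_node;	/* prev == NULL: not on the flush list */
};

static struct list_node flush_list = { &flush_list, &flush_list };

static void list_add_front(struct list_node *n, struct list_node *head)
{
	n->next = head->next;
	n->prev = head;
	head->next->prev = n;
	head->next = n;
}

static void bq_flush(struct bulk_queue *bq)
{
	printf("flushing %u frame(s)\n", bq->count);
	bq->count = 0;
	if (bq->flush_node.prev) {	/* unlink and clear prev, as __list_del_clearprev() does */
		bq->flush_node.prev->next = bq->flush_node.next;
		bq->flush_node.next->prev = bq->flush_node.prev;
		bq->flush_node.prev = NULL;
	}
}

static void bq_enqueue(struct bulk_queue *bq, void *frame)
{
	if (bq->count == BULK_SIZE)
		bq_flush(bq);
	bq->q[bq->count++] = frame;
	if (!bq->flush_node.prev)	/* first frame since the last flush */
		list_add_front(&bq->flush_node, &flush_list);
}

static void flush_all(void)
{
	struct list_node *n = flush_list.next, *next;

	while (n != &flush_list) {
		next = n->next;
		bq_flush((struct bulk_queue *)((char *)n -
			 offsetof(struct bulk_queue, flush_node)));
		n = next;
	}
}

int main(void)
{
	struct bulk_queue a = { .count = 0 }, b = { .count = 0 };
	int frame;

	bq_enqueue(&a, &frame);
	bq_enqueue(&b, &frame);
	flush_all();	/* visits only the queues that actually received frames */
	return 0;
}

The point to take away is that enqueue never needs the entry's index in the map: the queue itself carries everything the flush path needs, which is what allows callers to cache the map lookup.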
105 changes: 48 additions & 57 deletions kernel/bpf/cpumap.c
@@ -32,14 +32,19 @@

/* General idea: XDP packets getting XDP redirected to another CPU,
* will maximum be stored/queued for one driver ->poll() call. It is
* guaranteed that setting flush bit and flush operation happen on
* guaranteed that queueing the frame and the flush operation happen on
* same CPU. Thus, cpu_map_flush operation can deduce via this_cpu_ptr()
* which queue in bpf_cpu_map_entry contains packets.
*/

#define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */
struct bpf_cpu_map_entry;
struct bpf_cpu_map;

struct xdp_bulk_queue {
void *q[CPU_MAP_BULK_SIZE];
struct list_head flush_node;
struct bpf_cpu_map_entry *obj;
unsigned int count;
};

@@ -52,6 +57,8 @@ struct bpf_cpu_map_entry {
/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
struct xdp_bulk_queue __percpu *bulkq;

struct bpf_cpu_map *cmap;

/* Queue with potential multi-producers, and single-consumer kthread */
struct ptr_ring *queue;
struct task_struct *kthread;
@@ -65,23 +72,17 @@ struct bpf_cpu_map {
struct bpf_map map;
/* Below members specific for map type */
struct bpf_cpu_map_entry **cpu_map;
unsigned long __percpu *flush_needed;
struct list_head __percpu *flush_list;
};

static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
struct xdp_bulk_queue *bq, bool in_napi_ctx);

static u64 cpu_map_bitmap_size(const union bpf_attr *attr)
{
return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
}
static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx);

static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
struct bpf_cpu_map *cmap;
int err = -ENOMEM;
int ret, cpu;
u64 cost;
int ret;

if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
@@ -105,7 +106,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)

/* make sure page count doesn't overflow */
cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
cost += sizeof(struct list_head) * num_possible_cpus();

/* Notice returns -EPERM if map size is larger than memlock limit */
ret = bpf_map_charge_init(&cmap->map.memory, cost);
@@ -114,12 +115,13 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
goto free_cmap;
}

/* A per cpu bitfield with a bit per possible CPU in map */
cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
__alignof__(unsigned long));
if (!cmap->flush_needed)
cmap->flush_list = alloc_percpu(struct list_head);
if (!cmap->flush_list)
goto free_charge;

for_each_possible_cpu(cpu)
INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu));

/* Alloc array for possible remote "destination" CPUs */
cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
sizeof(struct bpf_cpu_map_entry *),
@@ -129,7 +131,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)

return &cmap->map;
free_percpu:
free_percpu(cmap->flush_needed);
free_percpu(cmap->flush_list);
free_charge:
bpf_map_charge_finish(&cmap->map.memory);
free_cmap:
@@ -334,7 +336,8 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
{
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
struct bpf_cpu_map_entry *rcpu;
int numa, err;
struct xdp_bulk_queue *bq;
int numa, err, i;

/* Have map->numa_node, but choose node of redirect target CPU */
numa = cpu_to_node(cpu);
@@ -349,6 +352,11 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
if (!rcpu->bulkq)
goto free_rcu;

for_each_possible_cpu(i) {
bq = per_cpu_ptr(rcpu->bulkq, i);
bq->obj = rcpu;
}

/* Alloc queue */
rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
if (!rcpu->queue)
@@ -405,7 +413,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu)
struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);

/* No concurrent bq_enqueue can run at this point */
bq_flush_to_queue(rcpu, bq, false);
bq_flush_to_queue(bq, false);
}
free_percpu(rcpu->bulkq);
/* Cannot kthread_stop() here, last put free rcpu resources */
@@ -488,6 +496,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
if (!rcpu)
return -ENOMEM;
rcpu->cmap = cmap;
}
rcu_read_lock();
__cpu_map_entry_replace(cmap, key_cpu, rcpu);
@@ -514,14 +523,14 @@ static void cpu_map_free(struct bpf_map *map)
synchronize_rcu();

/* To ensure all pending flush operations have completed wait for flush
* bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
* Because the above synchronize_rcu() ensures the map is disconnected
* from the program we can assume no new bits will be set.
* list to be empty on _all_ cpus. Because the above synchronize_rcu()
* ensures the map is disconnected from the program we can assume no new
* items will be added to the list.
*/
for_each_online_cpu(cpu) {
unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu);
struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu);

while (!bitmap_empty(bitmap, cmap->map.max_entries))
while (!list_empty(flush_list))
cond_resched();
}

@@ -538,7 +547,7 @@ static void cpu_map_free(struct bpf_map *map)
/* bq flush and cleanup happen after RCU grace period */
__cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
}
free_percpu(cmap->flush_needed);
free_percpu(cmap->flush_list);
bpf_map_area_free(cmap->cpu_map);
kfree(cmap);
}
@@ -590,9 +599,9 @@ const struct bpf_map_ops cpu_map_ops = {
.map_check_btf = map_check_no_btf,
};

static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
struct xdp_bulk_queue *bq, bool in_napi_ctx)
static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx)
{
struct bpf_cpu_map_entry *rcpu = bq->obj;
unsigned int processed = 0, drops = 0;
const int to_cpu = rcpu->cpu;
struct ptr_ring *q;
@@ -621,6 +630,8 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
bq->count = 0;
spin_unlock(&q->producer_lock);

__list_del_clearprev(&bq->flush_node);

/* Feedback loop via tracepoints */
trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
return 0;
@@ -631,10 +642,11 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
*/
static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list);
struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);

if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
bq_flush_to_queue(rcpu, bq, true);
bq_flush_to_queue(bq, true);

/* Notice, xdp_buff/page MUST be queued here, long enough for
* driver code invoking us to finish, due to driver
@@ -646,6 +658,10 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
* operation, when completing napi->poll call.
*/
bq->q[bq->count++] = xdpf;

if (!bq->flush_node.prev)
list_add(&bq->flush_node, flush_list);

return 0;
}

@@ -665,41 +681,16 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
return 0;
}

void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);

__set_bit(bit, bitmap);
}

void __cpu_map_flush(struct bpf_map *map)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
u32 bit;

/* The napi->poll softirq makes sure __cpu_map_insert_ctx()
* and __cpu_map_flush() happen on same CPU. Thus, the percpu
* bitmap indicate which percpu bulkq have packets.
*/
for_each_set_bit(bit, bitmap, map->max_entries) {
struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]);
struct xdp_bulk_queue *bq;

/* This is possible if entry is removed by user space
* between xdp redirect and flush op.
*/
if (unlikely(!rcpu))
continue;

__clear_bit(bit, bitmap);
struct list_head *flush_list = this_cpu_ptr(cmap->flush_list);
struct xdp_bulk_queue *bq, *tmp;

/* Flush all frames in bulkq to real queue */
bq = this_cpu_ptr(rcpu->bulkq);
bq_flush_to_queue(rcpu, bq, true);
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
bq_flush_to_queue(bq, true);

/* If already running, costs spin_lock_irqsave + smp_mb */
wake_up_process(rcpu->kthread);
wake_up_process(bq->obj->kthread);
}
}
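
Two details of the cpumap conversion are easy to miss. bq_enqueue() uses flush_node.prev == NULL as the "not yet on this CPU's flush list" marker, which is why bq_flush_to_queue() unlinks the queue with __list_del_clearprev() rather than a plain list_del(). The memory accounting also shrinks as a side effect: the per-CPU cost is now a fixed sizeof(struct list_head) instead of a bitmap sized by max_entries. As a rough illustration (numbers not from the patch itself), a 4096-entry map on a 64-bit machine previously charged 512 bytes of bitmap per possible CPU, versus 16 bytes for a list head now.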