Re: [RFC PATCH bpf-next 1/3] bpf: Introduce BPF_F_CPU flag for percpu_array map

On Tue, Jun 24, 2025 at 9:54 AM Leon Hwang <leon.hwang@xxxxxxxxx> wrote:
>
> This patch introduces support for the BPF_F_CPU flag in percpu_array maps
> to allow updating or looking up values for specific CPUs or for all CPUs
> with a single value.
>
> This enhancement enables:
>
> * Efficient update of all CPUs using a single value when cpu == 0xFFFFFFFF.
> * Targeted update or lookup for a specific CPU otherwise.
>
> The flag is passed via:
>
> * map_flags in bpf_percpu_array_update() along with the cpu field.
> * elem_flags in generic_map_update_batch() along with the cpu field.
>
> Signed-off-by: Leon Hwang <leon.hwang@xxxxxxxxx>
> ---
>  include/linux/bpf.h            |  5 +--
>  include/uapi/linux/bpf.h       |  6 ++++
>  kernel/bpf/arraymap.c          | 46 ++++++++++++++++++++++++----
>  kernel/bpf/syscall.c           | 56 ++++++++++++++++++++++------------
>  tools/include/uapi/linux/bpf.h |  6 ++++
>  5 files changed, 92 insertions(+), 27 deletions(-)
>

[...]

> #define BPF_ALL_CPU    0xFFFFFFFF

At the very least we have to make it an enum, IMO. But I'm in general
unsure whether we need it at all... and in any case, should it be named
"BPF_ALL_CPUS" (plural)?

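I.e., if we do keep it, something along these lines in the UAPI header
(just a sketch, exact name and placement TBD; similar in spirit to how
BPF_F_CURRENT_CPU is exposed):

enum {
    BPF_ALL_CPUS = 0xffffffff, /* "update/lookup all CPUs" sentinel */
};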

> -int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
> +int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value,
> +                         u64 flags, u32 cpu)
>  {
>         struct bpf_array *array = container_of(map, struct bpf_array, map);
>         u32 index = *(u32 *)key;
>         void __percpu *pptr;
> -       int cpu, off = 0;
> +       int off = 0;
>         u32 size;
>
>         if (unlikely(index >= array->map.max_entries))
>                 return -ENOENT;
>
> +       if (unlikely(flags > BPF_F_CPU))
> +               /* unknown flags */
> +               return -EINVAL;
> +
>         /* per_cpu areas are zero-filled and bpf programs can only
>          * access 'value_size' of them, so copying rounded areas
>          * will not leak any kernel data
>          */
>         size = array->elem_size;
> +
> +       if (flags & BPF_F_CPU) {
> +               if (cpu >= num_possible_cpus())
> +                       return -E2BIG;
> +
> +               rcu_read_lock();
> +               pptr = array->pptrs[index & array->index_mask];
> +               copy_map_value_long(map, value, per_cpu_ptr(pptr, cpu));
> +               check_and_init_map_value(map, value);
> +               rcu_read_unlock();
> +               return 0;
> +       }
> +

nit: it seems a bit cleaner to me to not duplicate
rcu_read_{lock,unlock} and pptr fetching

I'd probably add a single `if ((flags & BPF_F_CPU) && cpu >=
num_possible_cpus())` check up front, and then within the RCU region:

if (flags & BPF_F_CPU) {
    copy_map_value_long(...);
    check_and_init_map_value(...);
} else {
    for_each_possible_cpu(cpu) {
       copy_map_value_long(...);
       check_and_init_map_value(...);
    }
}


This, to me, shows more explicitly that the locking/data fetching isn't
different, and that it's only about a single CPU vs all CPUs.

(Oh, and then move `int off` inside the else branch as well.)
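
To make it concrete, the shape I have in mind is roughly this (untested
sketch; I'm assuming the else branch stays identical to the current
all-CPUs copy path):

if ((flags & BPF_F_CPU) && cpu >= num_possible_cpus())
    return -E2BIG;

size = array->elem_size;

rcu_read_lock();
pptr = array->pptrs[index & array->index_mask];
if (flags & BPF_F_CPU) {
    /* copy out just the one requested CPU */
    copy_map_value_long(map, value, per_cpu_ptr(pptr, cpu));
    check_and_init_map_value(map, value);
} else {
    int off = 0;

    for_each_possible_cpu(cpu) {
        copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
        check_and_init_map_value(map, value + off);
        off += size;
    }
}
rcu_read_unlock();
return 0;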


>         rcu_read_lock();
>         pptr = array->pptrs[index & array->index_mask];
>         for_each_possible_cpu(cpu) {
> @@ -382,15 +400,16 @@ static long array_map_update_elem(struct bpf_map *map, void *key, void *value,
>  }
>
>  int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
> -                           u64 map_flags)
> +                           u64 map_flags, u32 cpu)
>  {
>         struct bpf_array *array = container_of(map, struct bpf_array, map);
>         u32 index = *(u32 *)key;
>         void __percpu *pptr;
> -       int cpu, off = 0;
> +       bool reuse_value;
> +       int off = 0;
>         u32 size;
>
> -       if (unlikely(map_flags > BPF_EXIST))
> +       if (unlikely(map_flags > BPF_F_CPU))
>                 /* unknown flags */
>                 return -EINVAL;
>
> @@ -409,10 +428,25 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
>          * so no kernel data leaks possible
>          */
>         size = array->elem_size;
> +
> +       if ((map_flags & BPF_F_CPU) && cpu != BPF_ALL_CPU) {
> +               if (cpu >= num_possible_cpus())
> +                       return -E2BIG;
> +
> +               rcu_read_lock();
> +               pptr = array->pptrs[index & array->index_mask];
> +               copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value);
> +               bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu));
> +               rcu_read_unlock();
> +               return 0;
> +       }
> +
> +       reuse_value = (map_flags & BPF_F_CPU) && cpu == BPF_ALL_CPU;
>         rcu_read_lock();
>         pptr = array->pptrs[index & array->index_mask];
>         for_each_possible_cpu(cpu) {
> -               copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off);
> +               copy_map_value_long(map, per_cpu_ptr(pptr, cpu),
> +                                   reuse_value ? value : value + off);
>                 bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu));
>                 off += size;


Ditto here, I'd not touch the RCU locking or bpf_obj_free_fields. The
difference would be a single CPU vs all CPUs, and then for the all-CPUs
case with BPF_F_CPU we just don't update off, getting the desired
behavior without the extra reuse_value variable?
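
Something roughly like this (again, untested sketch; the name of the
all-CPUs constant is pending the discussion above):

if ((map_flags & BPF_F_CPU) && cpu != BPF_ALL_CPUS &&
    cpu >= num_possible_cpus())
    return -E2BIG;

size = array->elem_size;

rcu_read_lock();
pptr = array->pptrs[index & array->index_mask];
if ((map_flags & BPF_F_CPU) && cpu != BPF_ALL_CPUS) {
    /* update just the one requested CPU */
    copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value);
    bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu));
} else {
    int off = 0;

    for_each_possible_cpu(cpu) {
        copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off);
        bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu));
        /* with BPF_F_CPU the same value is reused for every CPU */
        if (!(map_flags & BPF_F_CPU))
            off += size;
    }
}
rcu_read_unlock();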

[...]

> @@ -1941,19 +1941,27 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
>  {
>         void __user *values = u64_to_user_ptr(attr->batch.values);
>         void __user *keys = u64_to_user_ptr(attr->batch.keys);
> +       u64 elem_flags = attr->batch.elem_flags;
>         u32 value_size, cp, max_count;
>         void *key, *value;
>         int err = 0;
>
> -       if (attr->batch.elem_flags & ~BPF_F_LOCK)
> +       if (elem_flags & ~(BPF_F_LOCK | BPF_F_CPU))
>                 return -EINVAL;
>
> -       if ((attr->batch.elem_flags & BPF_F_LOCK) &&
> +       if ((elem_flags & BPF_F_LOCK) &&
>             !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
>                 return -EINVAL;
>         }
>
> -       value_size = bpf_map_value_size(map);
> +       if (elem_flags & BPF_F_CPU) {
> +               if (map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY)
> +                       return -EINVAL;
> +
> +               value_size = round_up(map->value_size, 8);
> +       } else {
> +               value_size = bpf_map_value_size(map);
> +       }

Why not roll this into the bpf_map_value_size() helper? It's internal,
so it should be fine.
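
E.g., roughly (sketch; assuming we're OK with the internal helper
taking the flags, and keeping the map type validation in
generic_map_update_batch()):

static u32 bpf_map_value_size(const struct bpf_map *map, u64 flags)
{
    if (flags & BPF_F_CPU)
        /* one (possibly reused) value, not num_possible_cpus() copies */
        return round_up(map->value_size, 8);

    if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
        map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
        map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
        map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
        return round_up(map->value_size, 8) * num_possible_cpus();
    else if (IS_FD_MAP(map))
        return sizeof(u32);
    else
        return map->value_size;
}

and then the caller just does value_size = bpf_map_value_size(map,
elem_flags).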

pw-bot: cr

>
>         max_count = attr->batch.count;
>         if (!max_count)
> @@ -1980,7 +1988,8 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
>                         break;
>
>                 err = bpf_map_update_value(map, map_file, key, value,
> -                                          attr->batch.elem_flags);
> +                                          attr->batch.elem_flags,
> +                                          attr->batch.cpu);

So I think we discussed cpu as a separate field vs embedding it into the
flags field, right? I don't remember what I argued for, but looking at
this patch, it seems like it would be more convenient to have cpu come
as part of flags, no? And I don't mean on the UAPI side; there a
separate cpu field makes the most sense, I think. But internally I'd
roll it into flags as ((cpu << 32) | flags), instead of dragging it
around everywhere. It feels unclean to have a "cpu" argument to the
generic bpf_map_copy_value()...
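
I.e., internally something like (sketch only, reusing the names from
this patch):

/* generic_map_update_batch(): fold cpu into the upper 32 bits of the
 * internal flags instead of adding a new parameter everywhere
 */
err = bpf_map_update_value(map, map_file, key, value,
                           (u64)attr->batch.cpu << 32 | elem_flags);

/* bpf_percpu_array_update(): unpack only where it's actually needed */
u32 cpu = map_flags >> 32;
u32 flags = (u32)map_flags;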

(And looking at how much code we add just to pass that extra cpu
argument through the libbpf API, maybe combining cpu and flags is
actually the way to go?..)

WDYT?


[...]




