Re: [PATCH bpf-next v3 3/6] bpf: Introduce BPF_F_CPU flag for percpu_hash and lru_percpu_hash maps

On Thu, Aug 21, 2025 at 9:08 AM Leon Hwang <leon.hwang@xxxxxxxxx> wrote:
>
> Introduce BPF_F_ALL_CPUS flag support for percpu_hash and lru_percpu_hash
> maps to allow updating values for all CPUs with a single value.
>
> Introduce BPF_F_CPU flag support for percpu_hash and lru_percpu_hash
> maps to allow updating the value for a specified CPU.
>
> This enhancement enables:
>
> * Efficient updates across all CPUs with a single value when
>   BPF_F_ALL_CPUS is set for the update_elem and update_batch APIs.
> * Targeted updates or lookups for a specified CPU when BPF_F_CPU is set.
>
> The BPF_F_CPU flag is passed via:
>
> * map_flags of the lookup_elem and update_elem APIs, along with the
>   embedded cpu field.
> * elem_flags of the lookup_batch and update_batch APIs, along with the
>   embedded cpu field.
>
> Signed-off-by: Leon Hwang <leon.hwang@xxxxxxxxx>
> ---
>  include/linux/bpf.h   |  54 +++++++++++++++++++-
>  kernel/bpf/arraymap.c |  29 ++++-------
>  kernel/bpf/hashtab.c  | 111 +++++++++++++++++++++++++++++-------------
>  kernel/bpf/syscall.c  |  30 +++---------
>  4 files changed, 147 insertions(+), 77 deletions(-)
>

[...]

> @@ -397,22 +395,14 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
>                             u64 map_flags)
>  {
>         struct bpf_array *array = container_of(map, struct bpf_array, map);
> -       const u64 cpu_flags = BPF_F_CPU | BPF_F_ALL_CPUS;
>         u32 index = *(u32 *)key;
>         void __percpu *pptr;
> +       int off = 0, err;
>         u32 size, cpu;
> -       int off = 0;
> -
> -       if (unlikely((u32)map_flags > BPF_F_ALL_CPUS))
> -               /* unknown flags */
> -               return -EINVAL;
> -       if (unlikely((map_flags & cpu_flags) == cpu_flags))
> -               return -EINVAL;
>
> -       cpu = map_flags >> 32;
> -       if (unlikely((map_flags & BPF_F_CPU) && cpu >= num_possible_cpus()))
> -               /* invalid cpu */
> -               return -ERANGE;
> +       err = bpf_map_check_cpu_flags(map_flags, true);
> +       if (unlikely(err))
> +               return err;

Again, this is unnecessary churn. Why not add this function in the
previous patch, where you add the cpu flags?
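FWIW, judging by the checks this hunk removes, I'd expect the
consolidated helper to look roughly like this (untested sketch; I'm
guessing the bool argument gates whether BPF_F_ALL_CPUS is accepted,
since the lookup path below passes false):

static int bpf_map_check_cpu_flags(u64 map_flags, bool allow_all_cpus)
{
	u32 cpu;

	/* reject unknown flag bits */
	if (unlikely((u32)map_flags > BPF_F_ALL_CPUS))
		return -EINVAL;
	/* BPF_F_ALL_CPUS is only meaningful for updates */
	if (unlikely(!allow_all_cpus && (map_flags & BPF_F_ALL_CPUS)))
		return -EINVAL;
	/* BPF_F_CPU and BPF_F_ALL_CPUS are mutually exclusive */
	if (unlikely((map_flags & (BPF_F_CPU | BPF_F_ALL_CPUS)) ==
		     (BPF_F_CPU | BPF_F_ALL_CPUS)))
		return -EINVAL;

	/* the target cpu lives in the upper 32 bits of map_flags */
	cpu = map_flags >> 32;
	if (unlikely((map_flags & BPF_F_CPU) && cpu >= num_possible_cpus()))
		return -ERANGE;

	return 0;
}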


>
>         if (unlikely(index >= array->map.max_entries))
>                 /* all elements were pre-allocated, cannot insert a new one */
> @@ -432,6 +422,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
>         rcu_read_lock();
>         pptr = array->pptrs[index & array->index_mask];
>         if (map_flags & BPF_F_CPU) {
> +               cpu = map_flags >> 32;
>                 copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value);
>                 bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu));
>         } else {
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index 71f9931ac64cd..34a35cdade425 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -937,24 +937,39 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
>  }
>
>  static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
> -                           void *value, bool onallcpus)
> +                           void *value, bool onallcpus, u64 map_flags)
>  {
> +       int cpu = map_flags & BPF_F_CPU ? map_flags >> 32 : 0;
> +       int current_cpu = raw_smp_processor_id();
> +
>         if (!onallcpus) {
>                 /* copy true value_size bytes */
> -               copy_map_value(&htab->map, this_cpu_ptr(pptr), value);
> +               copy_map_value(&htab->map, (map_flags & BPF_F_CPU) && cpu != current_cpu ?
> +                              per_cpu_ptr(pptr, cpu) : this_cpu_ptr(pptr), value);

Is there any benefit to this cpu == current_cpu special casing?
Wouldn't per_cpu_ptr() do the right thing even when cpu == current_cpu?
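IOW, I'd expect the !onallcpus branch to collapse to something like
this (untested):

	if (!onallcpus) {
		/* per_cpu_ptr(pptr, raw_smp_processor_id()) resolves to
		 * the same address as this_cpu_ptr(pptr), so a single
		 * per_cpu_ptr() call covers both cases
		 */
		int cpu = map_flags & BPF_F_CPU ? map_flags >> 32 :
			  raw_smp_processor_id();

		/* copy true value_size bytes */
		copy_map_value(&htab->map, per_cpu_ptr(pptr, cpu), value);
	}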

>         } else {
>                 u32 size = round_up(htab->map.value_size, 8);
> -               int off = 0, cpu;
> +               int off = 0;
> +
> +               if (map_flags & BPF_F_CPU) {
> +                       copy_map_value_long(&htab->map, cpu != current_cpu ?
> +                                           per_cpu_ptr(pptr, cpu) : this_cpu_ptr(pptr), value);
> +                       return;
> +               }

[...]

> @@ -1806,10 +1834,17 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
>                         void __percpu *pptr;
>
>                         pptr = htab_elem_get_ptr(l, map->key_size);
> -                       for_each_possible_cpu(cpu) {
> -                               copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu));
> -                               check_and_init_map_value(&htab->map, dst_val + off);
> -                               off += size;
> +                       if (!do_delete && (elem_map_flags & BPF_F_CPU)) {

If do_delete is true, we can't have BPF_F_CPU set, right? We checked
that above, so why all these complications?
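IOW, since BPF_F_CPU here already implies !do_delete, this could
presumably be just:

	if (elem_map_flags & BPF_F_CPU) {
		cpu = elem_map_flags >> 32;
		copy_map_value_long(&htab->map, dst_val, per_cpu_ptr(pptr, cpu));
		check_and_init_map_value(&htab->map, dst_val);
	} else {
		...
	}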

> +                               cpu = elem_map_flags >> 32;
> +                               copy_map_value_long(&htab->map, dst_val, per_cpu_ptr(pptr, cpu));
> +                               check_and_init_map_value(&htab->map, dst_val);
> +                       } else {
> +                               for_each_possible_cpu(cpu) {
> +                                       copy_map_value_long(&htab->map, dst_val + off,
> +                                                           per_cpu_ptr(pptr, cpu));
> +                                       check_and_init_map_value(&htab->map, dst_val + off);
> +                                       off += size;
> +                               }
>                         }
>                 } else {
>                         value = htab_elem_value(l, key_size);
> @@ -2365,14 +2400,18 @@ static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *k
>         return NULL;
>  }
>
> -int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
> +int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value, u64 map_flags)
>  {
> +       int ret, cpu, off = 0;
>         struct htab_elem *l;
>         void __percpu *pptr;
> -       int ret = -ENOENT;
> -       int cpu, off = 0;
>         u32 size;
>
> +       ret = bpf_map_check_cpu_flags(map_flags, false);
> +       if (unlikely(ret))
> +               return ret;
> +       ret = -ENOENT;
> +
>         /* per_cpu areas are zero-filled and bpf programs can only
>          * access 'value_size' of them, so copying rounded areas
>          * will not leak any kernel data
> @@ -2386,10 +2425,16 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
>          * eviction heuristics when user space does a map walk.
>          */
>         pptr = htab_elem_get_ptr(l, map->key_size);
> -       for_each_possible_cpu(cpu) {
> -               copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
> -               check_and_init_map_value(map, value + off);
> -               off += size;
> +       if (map_flags & BPF_F_CPU) {
> +               cpu = map_flags >> 32;
> +               copy_map_value_long(map, value, per_cpu_ptr(pptr, cpu));
> +               check_and_init_map_value(map, value);
> +       } else {
> +               for_each_possible_cpu(cpu) {
> +                       copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
> +                       check_and_init_map_value(map, value + off);
> +                       off += size;
> +               }
>         }

It feels like this whole logic of copying a per-cpu value to/from user
space should be generic across all per-cpu maps once we have that `void
__percpu *` pointer, no? See if you can extract it as a reusable helper
(but, you know, without all the per-map-type special casing, though it
doesn't seem like you should need it; I might be missing details, of
course). One for bpf_percpu_copy_to_user() and another for
bpf_percpu_copy_from_user(), which would take all these cpu flags into
account?
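Something like this rough sketch for the to-user direction (untested;
the name and signature are just suggestions, and it assumes the caller
has already validated the flags and rounded up size):

static void bpf_percpu_copy_to_user(struct bpf_map *map, void __percpu *pptr,
				    void *value, u32 size, u64 map_flags)
{
	int cpu, off = 0;

	if (map_flags & BPF_F_CPU) {
		/* copy out only the requested cpu's value */
		cpu = map_flags >> 32;
		copy_map_value_long(map, value, per_cpu_ptr(pptr, cpu));
		check_and_init_map_value(map, value);
	} else {
		/* copy out one rounded-up value per possible cpu */
		for_each_possible_cpu(cpu) {
			copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
			check_and_init_map_value(map, value + off);
			off += size;
		}
	}
}

Then bpf_percpu_hash_copy(), bpf_percpu_array_copy(), and the batch
lookup path could all just call it.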


[...]