On Sat Aug 23, 2025 at 6:14 AM +08, Andrii Nakryiko wrote:
> On Thu, Aug 21, 2025 at 9:08 AM Leon Hwang <leon.hwang@xxxxxxxxx> wrote:
>>
>> Introduce BPF_F_ALL_CPUS flag support for percpu_hash and lru_percpu_hash
>> maps to allow updating values for all CPUs with a single value.
>>
>> Introduce BPF_F_CPU flag support for percpu_hash and lru_percpu_hash
>> maps to allow updating value for specified CPU.
>>
>> This enhancement enables:
>>
>> * Efficient update values across all CPUs with a single value when
>>   BPF_F_ALL_CPUS is set for update_elem and update_batch APIs.
>> * Targeted update or lookup for a specified CPU when BPF_F_CPU is set.
>>
>> The BPF_F_CPU flag is passed via:
>>
>> * map_flags of lookup_elem and update_elem APIs along with embedded cpu
>>   field.
>> * elem_flags of lookup_batch and update_batch APIs along with embedded
>>   cpu field.
>>
>> Signed-off-by: Leon Hwang <leon.hwang@xxxxxxxxx>
>> ---
>>  include/linux/bpf.h   |  54 +++++++++++++++++++-
>>  kernel/bpf/arraymap.c |  29 ++++-------
>>  kernel/bpf/hashtab.c  | 111 +++++++++++++++++++++++++++++-------------
>>  kernel/bpf/syscall.c  |  30 +++---------
>>  4 files changed, 147 insertions(+), 77 deletions(-)
>>
>
> [...]
>
>> @@ -397,22 +395,14 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
>>                             u64 map_flags)
>>  {
>>         struct bpf_array *array = container_of(map, struct bpf_array, map);
>> -       const u64 cpu_flags = BPF_F_CPU | BPF_F_ALL_CPUS;
>>         u32 index = *(u32 *)key;
>>         void __percpu *pptr;
>> +       int off = 0, err;
>>         u32 size, cpu;
>> -       int off = 0;
>> -
>> -       if (unlikely((u32)map_flags > BPF_F_ALL_CPUS))
>> -               /* unknown flags */
>> -               return -EINVAL;
>> -       if (unlikely((map_flags & cpu_flags) == cpu_flags))
>> -               return -EINVAL;
>>
>> -       cpu = map_flags >> 32;
>> -       if (unlikely((map_flags & BPF_F_CPU) && cpu >= num_possible_cpus()))
>> -               /* invalid cpu */
>> -               return -ERANGE;
>> +       err = bpf_map_check_cpu_flags(map_flags, true);
>> +       if (unlikely(err))
>> +               return err;
>
> again, unnecessary churn, why not add this function in previous patch
> when you add cpu flags ?
>

Ack.

>
>>
>>         if (unlikely(index >= array->map.max_entries))
>>                 /* all elements were pre-allocated, cannot insert a new one */
>> @@ -432,6 +422,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
>>         rcu_read_lock();
>>         pptr = array->pptrs[index & array->index_mask];
>>         if (map_flags & BPF_F_CPU) {
>> +               cpu = map_flags >> 32;
>>                 copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value);
>>                 bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu));
>>         } else {
>> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
>> index 71f9931ac64cd..34a35cdade425 100644
>> --- a/kernel/bpf/hashtab.c
>> +++ b/kernel/bpf/hashtab.c
>> @@ -937,24 +937,39 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
>>  }
>>
>>  static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
>> -                           void *value, bool onallcpus)
>> +                           void *value, bool onallcpus, u64 map_flags)
>>  {
>> +       int cpu = map_flags & BPF_F_CPU ? map_flags >> 32 : 0;
>> +       int current_cpu = raw_smp_processor_id();
>> +
>>         if (!onallcpus) {
>>                 /* copy true value_size bytes */
>> -               copy_map_value(&htab->map, this_cpu_ptr(pptr), value);
>> +               copy_map_value(&htab->map, (map_flags & BPF_F_CPU) && cpu != current_cpu ?
>> +                              per_cpu_ptr(pptr, cpu) : this_cpu_ptr(pptr), value);
>
> is there any benefit to this cpu == current_cpu special casing?
> Wouldn't per_cpu_ptr() do the right thing even if cpu == current_cpu?
>

per_cpu_ptr() seems to do the same thing as this_cpu_ptr(). However, after
digging into both of them, I think this_cpu_ptr() is more efficient than
per_cpu_ptr(), e.g.:

DEFINE_PER_CPU(int, my_percpu_var) = 0;

int percpu_read(int cpu);
int thiscpu_read(void);
int percpu_currcpu_read(int cpu);

int percpu_read(int cpu)
{
	return *(int *)(void *)per_cpu_ptr(&my_percpu_var, cpu);
}
EXPORT_SYMBOL(percpu_read);

int thiscpu_read(void)
{
	return *(int *)(void *)this_cpu_ptr(&my_percpu_var);
}
EXPORT_SYMBOL(thiscpu_read);

int percpu_currcpu_read(int cpu)
{
	if (cpu == raw_smp_processor_id())
		return *(int *)(void *)this_cpu_ptr(&my_percpu_var);
	else
		return *(int *)(void *)per_cpu_ptr(&my_percpu_var, cpu);
}
EXPORT_SYMBOL(percpu_currcpu_read);

Source code: link [0].

After disassembling these functions, the core insns of this_cpu_ptr() are:

0xffffffffc15c5076 <+6/0x6>:    48 c7 c0 04 60 03 00       movq    $0x36004, %rax
0xffffffffc15c507d <+13/0xd>:   65 48 03 05 7b 49 a5 3e    addq    %gs:0x3ea5497b(%rip), %rax
0xffffffffc15c5085 <+21/0x15>:  8b 00                      movl    (%rax), %eax

The core insns of per_cpu_ptr() are:

0xffffffffc15c501b <+11/0xb>:   49 c7 c4 04 60 03 00       movq    $0x36004, %r12
0xffffffffc15c5022 <+18/0x12>:  53                         pushq   %rbx
0xffffffffc15c5023 <+19/0x13>:  48 63 df                   movslq  %edi, %rbx
0xffffffffc15c5026 <+22/0x16>:  48 81 fb 00 20 00 00       cmpq    $0x2000, %rbx
0xffffffffc15c502d <+29/0x1d>:  73 19                      jae     0xffffffffc15c5048 ; percpu_read+0x38
0xffffffffc15c502f <+31/0x1f>:  48 8b 04 dd 80 fc 24 a8    movq    -0x57db0380(, %rbx, 8), %rax
0xffffffffc15c5037 <+39/0x27>:  5b                         popq    %rbx
0xffffffffc15c5038 <+40/0x28>:  42 8b 04 20                movl    (%rax, %r12), %eax
0xffffffffc15c503c <+44/0x2c>:  41 5c                      popq    %r12
0xffffffffc15c503e <+46/0x2e>:  5d                         popq    %rbp
0xffffffffc15c503f <+47/0x2f>:  31 f6                      xorl    %esi, %esi
0xffffffffc15c5041 <+49/0x31>:  31 ff                      xorl    %edi, %edi
0xffffffffc15c5043 <+51/0x33>:  c3                         retq
0xffffffffc15c5044 <+52/0x34>:  cc                         int3
0xffffffffc15c5045 <+53/0x35>:  cc                         int3
0xffffffffc15c5046 <+54/0x36>:  cc                         int3
0xffffffffc15c5047 <+55/0x37>:  cc                         int3
0xffffffffc15c5048 <+56/0x38>:  48 89 de                   movq    %rbx, %rsi
0xffffffffc15c504b <+59/0x3b>:  48 c7 c7 a0 70 5c c1       movq    $18446744072658645152, %rdi
0xffffffffc15c5052 <+66/0x42>:  e8 d9 87 ac e5             callq   0xffffffffa708d830 ; __ubsan_handle_out_of_bounds+0x0 lib/ubsan.c:336
0xffffffffc15c5057 <+71/0x47>:  eb d6                      jmp     0xffffffffc15c502f ; percpu_read+0x1f

Disasm log: link [1].

As we can see, per_cpu_ptr() calls __ubsan_handle_out_of_bounds() to check
the index range of the percpu offset array, and the callq insn is much
slower than the addq insn. Therefore, the cpu == current_cpu check plus
this_cpu_ptr() is much better than a plain per_cpu_ptr().

Links:
[0] https://github.com/Asphaltt/kernel-module-fun/blob/master/percpu-ptr.c
[1] https://github.com/Asphaltt/kernel-module-fun/blob/master/percpu_ptr.md
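FWIW, if we keep the special casing, it could be factored into a tiny local
helper so the callers stay readable. A minimal, untested sketch (the helper
name is made up):

/*
 * Untested sketch: pick the destination slot for a single-CPU update.
 * Prefer this_cpu_ptr() when the target is the current CPU, as it avoids
 * indexing the percpu offset array (and the UBSAN bounds check shown in
 * the disassembly above); otherwise fall back to per_cpu_ptr().
 */
static void *pcpu_value_ptr(void __percpu *pptr, u64 map_flags)
{
	int cpu = map_flags >> 32;

	if (!(map_flags & BPF_F_CPU) || cpu == raw_smp_processor_id())
		return this_cpu_ptr(pptr);
	return per_cpu_ptr(pptr, cpu);
}

With that, the single-CPU case in pcpu_copy_value() would reduce to
copy_map_value(&htab->map, pcpu_value_ptr(pptr, map_flags), value).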
>>         } else {
>>                 u32 size = round_up(htab->map.value_size, 8);
>> -               int off = 0, cpu;
>> +               int off = 0;
>> +
>> +               if (map_flags & BPF_F_CPU) {
>> +                       copy_map_value_long(&htab->map, cpu != current_cpu ?
>> +                                           per_cpu_ptr(pptr, cpu) : this_cpu_ptr(pptr), value);
>> +                       return;
>> +               }
>
> [...]
>
>> @@ -1806,10 +1834,17 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
>>                         void __percpu *pptr;
>>
>>                         pptr = htab_elem_get_ptr(l, map->key_size);
>> -                       for_each_possible_cpu(cpu) {
>> -                               copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu));
>> -                               check_and_init_map_value(&htab->map, dst_val + off);
>> -                               off += size;
>> +                       if (!do_delete && (elem_map_flags & BPF_F_CPU)) {
>
> if do_delete is true we can't have BPF_F_CPU set, right? We checked
> that above, so why all these complications?
>

Ack. The !do_delete check is unnecessary. I'll drop it in the next revision.

>> +                               cpu = elem_map_flags >> 32;
>> +                               copy_map_value_long(&htab->map, dst_val, per_cpu_ptr(pptr, cpu));
>> +                               check_and_init_map_value(&htab->map, dst_val);
>> +                       } else {
>> +                               for_each_possible_cpu(cpu) {
>> +                                       copy_map_value_long(&htab->map, dst_val + off,
>> +                                                           per_cpu_ptr(pptr, cpu));
>> +                                       check_and_init_map_value(&htab->map, dst_val + off);
>> +                                       off += size;
>> +                               }
>>                         }
>>                 } else {
>>                         value = htab_elem_value(l, key_size);
>> @@ -2365,14 +2400,18 @@ static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *k
>>         return NULL;
>>  }
>>
>> -int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
>> +int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value, u64 map_flags)
>>  {
>> +       int ret, cpu, off = 0;
>>         struct htab_elem *l;
>>         void __percpu *pptr;
>> -       int ret = -ENOENT;
>> -       int cpu, off = 0;
>>         u32 size;
>>
>> +       ret = bpf_map_check_cpu_flags(map_flags, false);
>> +       if (unlikely(ret))
>> +               return ret;
>> +       ret = -ENOENT;
>> +
>>         /* per_cpu areas are zero-filled and bpf programs can only
>>          * access 'value_size' of them, so copying rounded areas
>>          * will not leak any kernel data
>> @@ -2386,10 +2425,16 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
>>          * eviction heuristics when user space does a map walk.
>>          */
>>         pptr = htab_elem_get_ptr(l, map->key_size);
>> -       for_each_possible_cpu(cpu) {
>> -               copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
>> -               check_and_init_map_value(map, value + off);
>> -               off += size;
>> +       if (map_flags & BPF_F_CPU) {
>> +               cpu = map_flags >> 32;
>> +               copy_map_value_long(map, value, per_cpu_ptr(pptr, cpu));
>> +               check_and_init_map_value(map, value);
>> +       } else {
>> +               for_each_possible_cpu(cpu) {
>> +                       copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
>> +                       check_and_init_map_value(map, value + off);
>> +                       off += size;
>> +               }
>>         }
>
> it feels like this whole logic of copying per-cpu value to/from user
> should be generic between all per-cpu maps, once we get that `void
> __percpu *` pointer, no? See if you can extract it as reusable helper
> (but, you know, without all the per-map type special casing, though it
> doesn't seem like you should need it, though I might be missing
> details, of course). One for bpf_percpu_copy_to_user() and another for
> bpf_percpu_copy_from_user(), which would take into account all these
> cpu flags?
>

Good idea. Let's introduce these two helper functions in the next revision;
a rough sketch of what I have in mind is appended at the end of this mail.

Thanks,
Leon

[...]
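As referenced above, here is a rough, untested sketch of the two helpers.
The names follow Andrii's suggestion; the exact signatures and where they
should live are just a first guess:

/*
 * Rough sketch (untested): copy percpu value(s) out to a user-sized buffer.
 * With BPF_F_CPU only the selected CPU's value is copied into @value;
 * otherwise one value per possible CPU is copied, @size bytes apart.
 */
static inline void bpf_percpu_copy_to_user(struct bpf_map *map, void __percpu *pptr,
					   void *value, u32 size, u64 flags)
{
	int cpu, off = 0;

	if (flags & BPF_F_CPU) {
		cpu = flags >> 32;
		copy_map_value_long(map, value, per_cpu_ptr(pptr, cpu));
		check_and_init_map_value(map, value);
	} else {
		for_each_possible_cpu(cpu) {
			copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
			check_and_init_map_value(map, value + off);
			off += size;
		}
	}
}

/*
 * Rough sketch (untested): copy a value coming from user space into percpu
 * storage. With BPF_F_CPU only the selected CPU is written; with
 * BPF_F_ALL_CPUS the same source value is reused for every CPU; otherwise
 * @value holds one value per possible CPU. Callers that overwrite live
 * values may still need bpf_obj_free_fields() on the old contents, as
 * bpf_percpu_array_update() does today.
 */
static inline void bpf_percpu_copy_from_user(struct bpf_map *map, void __percpu *pptr,
					     void *value, u32 size, u64 flags)
{
	int cpu, off = 0;

	if (flags & BPF_F_CPU) {
		cpu = flags >> 32;
		copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value);
	} else {
		for_each_possible_cpu(cpu) {
			copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off);
			if (!(flags & BPF_F_ALL_CPUS))
				off += size;
		}
	}
}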