On Wed, 6 Aug 2025 at 16:46, Mykyta Yatsenko <mykyta.yatsenko5@xxxxxxxxx> wrote:
>
> From: Mykyta Yatsenko <yatsenko@xxxxxxxx>
>
> Implementation of the bpf_task_work_schedule kfuncs.
>
> Main components:
>  * struct bpf_task_work_context – Metadata and state management per task
>    work.
>  * enum bpf_task_work_state – A state machine to serialize work
>    scheduling and execution.
>  * bpf_task_work_schedule() – The central helper that initiates
>    scheduling.
>  * bpf_task_work_callback() – Invoked when the actual task_work runs.
>  * bpf_task_work_irq() – An intermediate step (runs in softirq context)
>    to enqueue task work.
>  * bpf_task_work_cancel_and_free() – Cleanup for deleted BPF map entries.
>
> Flow of task work scheduling:
> 1) bpf_task_work_schedule_*() is called from BPF code.
> 2) Transition state from STANDBY to PENDING.
> 3) irq_work_queue() schedules bpf_task_work_irq().
> 4) Transition state from PENDING to SCHEDULING.
> 5) bpf_task_work_irq() attempts task_work_add(). If successful, state
>    transitions to SCHEDULED.
> 6) Task work calls bpf_task_work_callback(), which transitions state to
>    RUNNING.
> 7) BPF callback is executed.
> 8) Context is cleaned up, refcounts released, state set back to
>    STANDBY.
>
> Map value deletion
> If a map value that contains bpf_task_work_context is deleted, the BPF
> map implementation calls bpf_task_work_cancel_and_free().
> Deletion is handled by atomically setting the state to FREED and either
> releasing references or letting the scheduling code do that, depending
> on the last state before the deletion:
>  * SCHEDULING: release references in bpf_task_work_cancel_and_free(),
>    expect bpf_task_work_irq() to cancel the task work.
>  * SCHEDULED: release references and try to cancel the task work in
>    bpf_task_work_cancel_and_free().
>  * other states: one of bpf_task_work_irq(), bpf_task_work_schedule() or
>    bpf_task_work_callback() should clean up upon detecting the state
>    switching to FREED.
>
> The state transitions are controlled with atomic_cmpxchg, ensuring:
>  * Only one thread can successfully enqueue work.
>  * Proper handling of concurrent deletes (BPF_TW_FREED).
>  * Safe rollback if task_work_add() fails.

In general I am not sure why we need so many acquire/release pairs. Why
not use test_and_set_bit etc.? Or simply cmpxchg? What ordering of stores
are we depending on that merits acquire/release ordering? We should
probably document this explicitly.
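Something along the lines of the sketch below (helper name is made up) is
what I mean: funnel every transition through one place, use a plain
atomic_cmpxchg(), which already implies a full barrier on success, and
document there which stores, if any, each transition is meant to order.

/*
 * Sketch only: a single helper for all state transitions, so the
 * required ordering (if any) is documented in one place. Plain
 * atomic_cmpxchg() is a full barrier on success; switch to a weaker
 * variant only where we can argue it is sufficient.
 */
static bool bpf_tw_trans(struct bpf_task_work_context *ctx,
                         enum bpf_task_work_state old,
                         enum bpf_task_work_state new)
{
        return atomic_cmpxchg(&ctx->state, old, new) == old;
}

        /* e.g. in bpf_task_work_schedule(): */
        if (!bpf_tw_trans(ctx, BPF_TW_STANDBY, BPF_TW_PENDING))
                return -EBUSY;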
> Signed-off-by: Mykyta Yatsenko <yatsenko@xxxxxxxx>
> ---
>  kernel/bpf/helpers.c | 188 ++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 186 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index 516286f67f0d..4c8b1c9be7aa 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -25,6 +25,8 @@
>  #include <linux/kasan.h>
>  #include <linux/bpf_verifier.h>
>  #include <linux/uaccess.h>
> +#include <linux/task_work.h>
> +#include <linux/irq_work.h>
>
>  #include "../../lib/kstrtox.h"
>
> @@ -3702,6 +3704,160 @@ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign)
>
>  typedef void (*bpf_task_work_callback_t)(struct bpf_map *, void *, void *);
>
> +enum bpf_task_work_state {
> +        /* bpf_task_work is ready to be used */
> +        BPF_TW_STANDBY = 0,
> +        /* bpf_task_work is getting scheduled into irq_work */
> +        BPF_TW_PENDING,
> +        /* bpf_task_work is in irq_work and getting scheduled into task_work */
> +        BPF_TW_SCHEDULING,
> +        /* bpf_task_work is scheduled into task_work successfully */
> +        BPF_TW_SCHEDULED,
> +        /* callback is running */
> +        BPF_TW_RUNNING,
> +        /* BPF map value storing this bpf_task_work is deleted */
> +        BPF_TW_FREED,
> +};
> +
> +struct bpf_task_work_context {
> +        /* map that contains this structure in a value */
> +        struct bpf_map *map;
> +        /* bpf_task_work_state value, representing the state */
> +        atomic_t state;
> +        /* bpf_prog that schedules task work */
> +        struct bpf_prog *prog;
> +        /* task for which callback is scheduled */
> +        struct task_struct *task;
> +        /* notification mode for task work scheduling */
> +        enum task_work_notify_mode mode;
> +        /* callback to call from task work */
> +        bpf_task_work_callback_t callback_fn;
> +        struct callback_head work;
> +        struct irq_work irq_work;
> +} __aligned(8);

I will echo Alexei's comments about the layout. We cannot inline all of
this in the map value. Allocating it from an init function or in some
control function is probably the only way.
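To make that concrete, a rough sketch (the names and allocation details
are made up, and it glosses over the fact that the schedule kfuncs may be
called from NMI context where GFP_ATOMIC allocation is not possible):
keep only a small handle in the map value and allocate the context out of
line on first use, similar in spirit to how bpf_timer/bpf_wq keep their
state behind a pointer in the map value.

struct bpf_task_work {
        /* Only this lives in the map value; the context is out of line. */
        struct bpf_task_work_context __rcu *ctx;
} __aligned(8);

static struct bpf_task_work_context *
bpf_task_work_ctx_alloc(struct bpf_task_work *tw, struct bpf_map *map)
{
        struct bpf_task_work_context *ctx;

        ctx = bpf_map_kmalloc_node(map, sizeof(*ctx),
                                   GFP_ATOMIC | __GFP_NOWARN | __GFP_ZERO,
                                   NUMA_NO_NODE);
        if (!ctx)
                return NULL;
        atomic_set(&ctx->state, BPF_TW_STANDBY);
        /* Publishing would need a cmpxchg on tw->ctx to handle racing users. */
        rcu_assign_pointer(tw->ctx, ctx);
        return ctx;
}

cancel_and_free() could then xchg the pointer out of the map value and
free the context after a grace period, which would also sidestep the
map-value reuse question I raise below.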
> +
> +static bool task_work_match(struct callback_head *head, void *data)
> +{
> +        struct bpf_task_work_context *ctx = container_of(head, struct bpf_task_work_context, work);
> +
> +        return ctx == data;
> +}
> +
> +static void bpf_reset_task_work_context(struct bpf_task_work_context *ctx)
> +{
> +        bpf_prog_put(ctx->prog);
> +        bpf_task_release(ctx->task);
> +        rcu_assign_pointer(ctx->map, NULL);
> +}
> +
> +static void bpf_task_work_callback(struct callback_head *cb)
> +{
> +        enum bpf_task_work_state state;
> +        struct bpf_task_work_context *ctx;
> +        struct bpf_map *map;
> +        u32 idx;
> +        void *key;
> +        void *value;
> +
> +        rcu_read_lock_trace();
> +        ctx = container_of(cb, struct bpf_task_work_context, work);
> +
> +        state = atomic_cmpxchg_acquire(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_RUNNING);
> +        if (state == BPF_TW_SCHEDULED)
> +                state = atomic_cmpxchg_acquire(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_RUNNING);
> +        if (state == BPF_TW_FREED)
> +                goto out;

I am not commenting on this, since I expect it to change per the later
comments.

> +
> +        map = rcu_dereference(ctx->map);
> +        if (!map)
> +                goto out;
> +
> +        value = (void *)ctx - map->record->task_work_off;
> +        key = (void *)map_key_from_value(map, value, &idx);
> +
> +        migrate_disable();
> +        ctx->callback_fn(map, key, value);
> +        migrate_enable();
> +
> +        /* State is running or freed, either way reset. */
> +        bpf_reset_task_work_context(ctx);
> +        atomic_cmpxchg_release(&ctx->state, BPF_TW_RUNNING, BPF_TW_STANDBY);
> +out:
> +        rcu_read_unlock_trace();
> +}
> +
> +static void bpf_task_work_irq(struct irq_work *irq_work)
> +{
> +        struct bpf_task_work_context *ctx;
> +        enum bpf_task_work_state state;
> +        int err;
> +
> +        ctx = container_of(irq_work, struct bpf_task_work_context, irq_work);
> +
> +        rcu_read_lock_trace();

What's the idea behind rcu_read_lock_trace() here? Let's add a comment.

> +        state = atomic_cmpxchg_release(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING);
> +        if (state == BPF_TW_FREED) {
> +                bpf_reset_task_work_context(ctx);
> +                goto out;
> +        }
> +
> +        err = task_work_add(ctx->task, &ctx->work, ctx->mode);

This is racy: a SCHEDULING->FREED state claim from cancel_and_free will
release ctx->task.

> +        if (err) {
> +                state = atomic_cmpxchg_acquire(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_PENDING);

The races here look fine, since we don't act on FREED (for this block at
least) and cancel_and_free doesn't act on seeing PENDING, so there is
interlocking.

> +                if (state == BPF_TW_SCHEDULING) {
> +                        bpf_reset_task_work_context(ctx);
> +                        atomic_cmpxchg_release(&ctx->state, BPF_TW_PENDING, BPF_TW_STANDBY);
> +                }
> +                goto out;
> +        }
> +        state = atomic_cmpxchg_release(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED);
> +        if (state == BPF_TW_FREED)
> +                task_work_cancel_match(ctx->task, task_work_match, ctx);

It looks like there is a similar race condition here. If BPF_TW_SCHEDULING
is set, cancel_and_free may run and attempt bpf_task_release() from
bpf_reset_task_work_context(), while we access ctx->task here directly
after seeing BPF_TW_FREED.
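For both ctx->task races, one possible shape of a fix (an untested
sketch; it uses plain cmpxchg per my earlier comment and leaves the
rcu_read_lock_trace() question aside) is to pin the task with a local
reference while this path still owns the PENDING state, and only ever use
that local pointer afterwards:

static void bpf_task_work_irq(struct irq_work *irq_work)
{
        struct bpf_task_work_context *ctx =
                container_of(irq_work, struct bpf_task_work_context, irq_work);
        struct task_struct *task;
        enum bpf_task_work_state state;
        int err;

        /*
         * While the state is PENDING (or FREED reached from PENDING),
         * cancel_and_free() has not dropped the task reference, so
         * pinning the task here is safe.
         */
        task = ctx->task;
        get_task_struct(task);

        state = atomic_cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING);
        if (state == BPF_TW_FREED) {
                bpf_reset_task_work_context(ctx);
                goto out;
        }

        err = task_work_add(task, &ctx->work, ctx->mode);
        if (err) {
                /* Same rollback as in the patch, but using the pinned task only. */
                if (atomic_cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_PENDING) == BPF_TW_SCHEDULING) {
                        bpf_reset_task_work_context(ctx);
                        atomic_cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_STANDBY);
                }
                goto out;
        }

        if (atomic_cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED) == BPF_TW_FREED)
                task_work_cancel_match(task, task_work_match, ctx);
out:
        put_task_struct(task);
}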
> +out:
> +        rcu_read_unlock_trace();
> +}
> +
> +static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work_context *ctx,
> +                                  struct bpf_map *map, bpf_task_work_callback_t callback_fn,
> +                                  struct bpf_prog_aux *aux, enum task_work_notify_mode mode)
> +{
> +        struct bpf_prog *prog;
> +
> +        BTF_TYPE_EMIT(struct bpf_task_work);
> +
> +        prog = bpf_prog_inc_not_zero(aux->prog);
> +        if (IS_ERR(prog))
> +                return -EPERM;
> +
> +        if (!atomic64_read(&map->usercnt)) {
> +                bpf_prog_put(prog);
> +                return -EPERM;
> +        }
> +        task = bpf_task_acquire(task);
> +        if (!task) {
> +                bpf_prog_put(prog);
> +                return -EPERM;
> +        }
> +
> +        if (atomic_cmpxchg_acquire(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING) != BPF_TW_STANDBY) {

If we are reusing map values, wouldn't a freed state stay perpetually
freed? I.e. after the first delete of array elements etc. it becomes
useless. Every array map update would invoke a cancel_and_free. Who
resets it?

> +                bpf_task_release(task);
> +                bpf_prog_put(prog);
> +                return -EBUSY;
> +        }
> +
> +        ctx->task = task;
> +        ctx->callback_fn = callback_fn;
> +        ctx->prog = prog;
> +        ctx->mode = mode;
> +        init_irq_work(&ctx->irq_work, bpf_task_work_irq);
> +        init_task_work(&ctx->work, bpf_task_work_callback);
> +        rcu_assign_pointer(ctx->map, map);
> +
> +        irq_work_queue(&ctx->irq_work);
> +
> +        return 0;
> +}
> +
>  /**
>   * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode
>   * @task: Task struct for which callback should be scheduled
> @@ -3718,7 +3874,8 @@ __bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task,
>                                                bpf_task_work_callback_t callback,
>                                                void *aux__prog)
>  {
> -        return 0;
> +        return bpf_task_work_schedule(task, (struct bpf_task_work_context *)tw, map__map,
> +                                      callback, aux__prog, TWA_SIGNAL);
>  }
>
>  /**
> @@ -3738,13 +3895,38 @@ __bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task,
>                                                bpf_task_work_callback_t callback,
>                                                void *aux__prog)
>  {
> -        return 0;
> +        enum task_work_notify_mode mode;
> +
> +        mode = task == current && in_nmi() ? TWA_NMI_CURRENT : TWA_RESUME;
> +        return bpf_task_work_schedule(task, (struct bpf_task_work_context *)tw, map__map,
> +                                      callback, aux__prog, mode);
>  }
>
>  __bpf_kfunc_end_defs();
>
>  void bpf_task_work_cancel_and_free(void *val)
>  {
> +        struct bpf_task_work_context *ctx = val;
> +        enum bpf_task_work_state state;
> +
> +        state = atomic_xchg(&ctx->state, BPF_TW_FREED);
> +        switch (state) {
> +        case BPF_TW_SCHEDULED:
> +                task_work_cancel_match(ctx->task, task_work_match, ctx);
> +                fallthrough;
> +        /* Scheduling codepath is trying to schedule task work, reset context here. */
> +        case BPF_TW_SCHEDULING:
> +                bpf_reset_task_work_context(ctx);
> +                break;
> +        /* work is not initialized, mark as freed and exit */
> +        case BPF_TW_STANDBY:
> +        /* The context is in interim state, scheduling logic should cleanup. */
> +        case BPF_TW_PENDING:
> +        /* Callback is already running, it should reset context upon finishing. */
> +        case BPF_TW_RUNNING:
> +        default:
> +                break;
> +        }
>  }
>
>  BTF_KFUNCS_START(generic_btf_ids)
> @@ -3770,6 +3952,8 @@ BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
>  BTF_ID_FLAGS(func, bpf_rbtree_root, KF_RET_NULL)
>  BTF_ID_FLAGS(func, bpf_rbtree_left, KF_RET_NULL)
>  BTF_ID_FLAGS(func, bpf_rbtree_right, KF_RET_NULL)
> +BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS)
> +BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS)
>
>  #ifdef CONFIG_CGROUPS
>  BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
> --
> 2.50.1
>
>
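Also, for my own understanding, BPF-side usage would look roughly like
the sketch below. This is hypothetical: the map layout, the program
section, the extern declaration and passing NULL for aux__prog are my
assumptions, not something this patch defines. A selftest along these
lines would help settle the lifecycle questions above.

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

struct elem {
        struct bpf_task_work tw;
};

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(max_entries, 1);
        __type(key, int);
        __type(value, struct elem);
} tw_map SEC(".maps");

extern int bpf_task_work_schedule_signal(struct task_struct *task,
                                         struct bpf_task_work *tw,
                                         void *map__map,
                                         void (*callback)(struct bpf_map *map,
                                                          void *key, void *value),
                                         void *aux__prog) __ksym;

/* Runs later in the target task's context via task_work. */
static void tw_callback(struct bpf_map *map, void *key, void *value)
{
}

SEC("tp_btf/sched_switch")
int schedule_tw(void *ctx)
{
        struct task_struct *task = bpf_get_current_task_btf();
        struct elem *e;
        int key = 0;

        e = bpf_map_lookup_elem(&tw_map, &key);
        if (!e)
                return 0;
        /* Assuming aux__prog is filled in by the verifier and passed as NULL. */
        bpf_task_work_schedule_signal(task, &e->tw, &tw_map, tw_callback, NULL);
        return 0;
}

char _license[] SEC("license") = "GPL";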