Re: [PATCH bpf-next v2 3/4] bpf: task work scheduling kfuncs

Kumar Kartikeya Dwivedi <memxor@xxxxxxxxx> · Fri, 29 Aug 2025 00:29:10 +0200

On Wed, 27 Aug 2025 at 23:03, Andrii Nakryiko <andrii.nakryiko@xxxxxxxxx> wrote:
>
> On Fri, Aug 15, 2025 at 12:22 PM Mykyta Yatsenko
> <mykyta.yatsenko5@xxxxxxxxx> wrote:
> >
> > From: Mykyta Yatsenko <yatsenko@xxxxxxxx>
> >
> > Implementation of the bpf_task_work_schedule kfuncs.
> >
> > Main components:
> >  * struct bpf_task_work_context – Metadata and state management per task
> > work.
> >  * enum bpf_task_work_state – A state machine to serialize work
> >  scheduling and execution.
> >  * bpf_task_work_schedule() – The central helper that initiates
> > scheduling.
> >  * bpf_task_work_acquire() - Attempts to take ownership of the context,
> >  pointed by passed struct bpf_task_work, allocates new context if none
> >  exists yet.
> >  * bpf_task_work_callback() – Invoked when the actual task_work runs.
> >  * bpf_task_work_irq() – An intermediate step (runs in softirq context)
> > to enqueue task work.
> >  * bpf_task_work_cancel_and_free() – Cleanup for deleted BPF map entries.
> >
> > Flow of successful task work scheduling
> >  1) bpf_task_work_schedule_* is called from BPF code.
> >  2) Transition state from STANDBY to PENDING, marks context is owned by
> >  this task work scheduler
> >  3) irq_work_queue() schedules bpf_task_work_irq().
> >  4) Transition state from PENDING to SCHEDULING.
> >  4) bpf_task_work_irq() attempts task_work_add(). If successful, state
> >  transitions to SCHEDULED.
> >  5) Task work calls bpf_task_work_callback(), which transition state to
> >  RUNNING.
> >  6) BPF callback is executed
> >  7) Context is cleaned up, refcounts released, context state set back to
> >  STANDBY.
> >
> > bpf_task_work_context handling
> > The context pointer is stored in bpf_task_work ctx field (u64) but
> > treated as an __rcu pointer via casts.
> > bpf_task_work_acquire() publishes new bpf_task_work_context by cmpxchg
> > with RCU initializer.
> > Read under the RCU lock only in bpf_task_work_acquire() when ownership
> > is contended.
> > Upon deleting map value, bpf_task_work_cancel_and_free() is detaching
> > context pointer from struct bpf_task_work and releases resources
> > if scheduler does not own the context or can be canceled (state ==
> > STANDBY or state == SCHEDULED and callback canceled). If task work
> > scheduler owns the context, its state is set to FREED and scheduler is
> > expected to cleanup on the next state transition.
> >
> > Signed-off-by: Mykyta Yatsenko <yatsenko@xxxxxxxx>
> > ---
> >  kernel/bpf/helpers.c | 270 +++++++++++++++++++++++++++++++++++++++++--
> >  1 file changed, 260 insertions(+), 10 deletions(-)
> >
> > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> > index d2f88a9bc47b..346ae8fd3ada 100644
> > --- a/kernel/bpf/helpers.c
> > +++ b/kernel/bpf/helpers.c
> > @@ -25,6 +25,8 @@
> >  #include <linux/kasan.h>
> >  #include <linux/bpf_verifier.h>
> >  #include <linux/uaccess.h>
> > +#include <linux/task_work.h>
> > +#include <linux/irq_work.h>
> >
> >  #include "../../lib/kstrtox.h"
> >
> > @@ -3701,6 +3703,226 @@ __bpf_kfunc int bpf_strstr(const char *s1__ign, const char *s2__ign)
> >
> >  typedef void (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value);
> >
> > +enum bpf_task_work_state {
> > +       /* bpf_task_work is ready to be used */
> > +       BPF_TW_STANDBY = 0,
> > +       /* irq work scheduling in progress */
> > +       BPF_TW_PENDING,
> > +       /* task work scheduling in progress */
> > +       BPF_TW_SCHEDULING,
> > +       /* task work is scheduled successfully */
> > +       BPF_TW_SCHEDULED,
> > +       /* callback is running */
> > +       BPF_TW_RUNNING,
> > +       /* associated BPF map value is deleted */
> > +       BPF_TW_FREED,
> > +};
> > +
> > +struct bpf_task_work_context {
> > +       /* the map and map value associated with this context */
> > +       struct bpf_map *map;
> > +       void *map_val;
> > +       /* bpf_prog that schedules task work */
> > +       struct bpf_prog *prog;
> > +       /* task for which callback is scheduled */
> > +       struct task_struct *task;
> > +       enum task_work_notify_mode mode;
> > +       enum bpf_task_work_state state;
> > +       bpf_task_work_callback_t callback_fn;
> > +       struct callback_head work;
> > +       struct irq_work irq_work;
> > +       struct rcu_head rcu;
> > +} __aligned(8);
> > +
> > +static struct bpf_task_work_context *bpf_task_work_context_alloc(void)
> > +{
> > +       struct bpf_task_work_context *ctx;
> > +
> > +       ctx = bpf_mem_alloc(&bpf_global_ma, sizeof(struct bpf_task_work_context));
> > +       if (ctx)
> > +               memset(ctx, 0, sizeof(*ctx));
> > +       return ctx;
> > +}
> > +
> > +static void bpf_task_work_context_free(struct rcu_head *rcu)
> > +{
> > +       struct bpf_task_work_context *ctx = container_of(rcu, struct bpf_task_work_context, rcu);
> > +       /* bpf_mem_free expects migration to be disabled */
> > +       migrate_disable();
> > +       bpf_mem_free(&bpf_global_ma, ctx);
> > +       migrate_enable();
> > +}
> > +
> > +static bool task_work_match(struct callback_head *head, void *data)
> > +{
> > +       struct bpf_task_work_context *ctx = container_of(head, struct bpf_task_work_context, work);
> > +
> > +       return ctx == data;
> > +}
> > +
> > +static void bpf_task_work_context_reset(struct bpf_task_work_context *ctx)
> > +{
> > +       bpf_prog_put(ctx->prog);
> > +       bpf_task_release(ctx->task);
> > +}
> > +
> > +static void bpf_task_work_callback(struct callback_head *cb)
> > +{
> > +       enum bpf_task_work_state state;
> > +       struct bpf_task_work_context *ctx;
> > +       u32 idx;
> > +       void *key;
> > +
> > +       ctx = container_of(cb, struct bpf_task_work_context, work);
> > +
> > +       /*
> > +        * Read lock is needed to protect map key and value access below, it has to be done before
> > +        * the state transition
> > +        */
> > +       rcu_read_lock_trace();
> > +       /*
> > +        * This callback may start running before bpf_task_work_irq() switched the state to
> > +        * SCHEDULED so handle both transition variants SCHEDULING|SCHEDULED -> RUNNING.
> > +        */
> > +       state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_RUNNING);
> > +       if (state == BPF_TW_SCHEDULED)
> > +               state = cmpxchg(&ctx->state, BPF_TW_SCHEDULED, BPF_TW_RUNNING);
> > +       if (state == BPF_TW_FREED) {
> > +               rcu_read_unlock_trace();
> > +               bpf_task_work_context_reset(ctx);
> > +               call_rcu_tasks_trace(&ctx->rcu, bpf_task_work_context_free);
> > +               return;
> > +       }
> > +
> > +       key = (void *)map_key_from_value(ctx->map, ctx->map_val, &idx);
> > +       migrate_disable();
> > +       ctx->callback_fn(ctx->map, key, ctx->map_val);
> > +       migrate_enable();
> > +       rcu_read_unlock_trace();
> > +       /* State is running or freed, either way reset. */
> > +       bpf_task_work_context_reset(ctx);
> > +       state = cmpxchg(&ctx->state, BPF_TW_RUNNING, BPF_TW_STANDBY);
> > +       if (state == BPF_TW_FREED)
> > +               call_rcu_tasks_trace(&ctx->rcu, bpf_task_work_context_free);
> > +}
> > +
> > +static void bpf_task_work_irq(struct irq_work *irq_work)
> > +{
> > +       struct bpf_task_work_context *ctx;
> > +       enum bpf_task_work_state state;
> > +       int err;
> > +
> > +       ctx = container_of(irq_work, struct bpf_task_work_context, irq_work);
> > +
> > +       state = cmpxchg(&ctx->state, BPF_TW_PENDING, BPF_TW_SCHEDULING);
> > +       if (state == BPF_TW_FREED)
> > +               goto free_context;
> > +
> > +       err = task_work_add(ctx->task, &ctx->work, ctx->mode);
> > +       if (err) {
> > +               bpf_task_work_context_reset(ctx);
> > +               state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_STANDBY);
> > +               if (state == BPF_TW_FREED)
> > +                       call_rcu_tasks_trace(&ctx->rcu, bpf_task_work_context_free);
> > +               return;
> > +       }
> > +       state = cmpxchg(&ctx->state, BPF_TW_SCHEDULING, BPF_TW_SCHEDULED);
> > +       if (state == BPF_TW_FREED && task_work_cancel_match(ctx->task, task_work_match, ctx))
> > +               goto free_context; /* successful cancellation, release and free ctx */
> > +       return;
> > +
> > +free_context:
> > +       bpf_task_work_context_reset(ctx);
> > +       call_rcu_tasks_trace(&ctx->rcu, bpf_task_work_context_free);
> > +}
> > +
> > +static struct bpf_task_work_context *bpf_task_work_context_acquire(struct bpf_task_work *tw,
> > +                                                                  struct bpf_map *map)
> > +{
> > +       struct bpf_task_work_context *ctx, *old_ctx;
> > +       enum bpf_task_work_state state;
> > +       struct bpf_task_work_context __force __rcu **ppc =
> > +               (struct bpf_task_work_context __force __rcu **)&tw->ctx;
>
> we discussed this with Mykyta earlier offline, but I'll not it here:
> this looks pretty ugly, we should avoid these casts. We probably need
> struct bpf_task_work with opaque contents for user-visible API, and
> then have bpf_task_work_kern or something like that, where ctx will be
> just a struct bpf_task_work_ctx pointer
>
> > +
> > +       /* ctx pointer is RCU protected */
> > +       rcu_read_lock_trace();
> > +       ctx = rcu_dereference(*ppc);
>
> and here it should be enough to do just READ_ONCE(tw->ctx), either way
> rcu_dereference is wrong here because it checks for just classic
> rcu_read_lock() to be "held", while we use RCU Tasks Trace flavor for
> protection of the context struct

Couldn't you use rcu_dereference_check(), with rcu_read_lock_trace_held().

>
> > +       if (!ctx) {
> > +               ctx = bpf_task_work_context_alloc();
> > +               if (!ctx) {
> > +                       rcu_read_unlock_trace();
> > +                       return ERR_PTR(-ENOMEM);
> > +               }
> > +               old_ctx = unrcu_pointer(cmpxchg(ppc, NULL, RCU_INITIALIZER(ctx)));
>
> similarly here, just `old_ctx = cmpxchg(tw->ctx, NULL, ctx);` seems
> much more preferable to me
>
> > +               /*
> > +                * If ctx is set by another CPU, release allocated memory.
> > +                * Do not fail, though, attempt stealing the work
> > +                */
> > +               if (old_ctx) {
> > +                       bpf_mem_free(&bpf_global_ma, ctx);
> > +                       ctx = old_ctx;
> > +               }
> > +       }
> > +       state = cmpxchg(&ctx->state, BPF_TW_STANDBY, BPF_TW_PENDING);
> > +       /*
> > +        * We can unlock RCU, because task work scheduler (this codepath)
> > +        * now owns the ctx or returning an error
> > +        */
> > +       rcu_read_unlock_trace();
> > +       if (state != BPF_TW_STANDBY)
> > +               return ERR_PTR(-EBUSY);
> > +       return ctx;
> > +}
> > +
> > +static int bpf_task_work_schedule(struct task_struct *task, struct bpf_task_work *tw,
> > +                                 struct bpf_map *map, bpf_task_work_callback_t callback_fn,
> > +                                 struct bpf_prog_aux *aux, enum task_work_notify_mode mode)
> > +{
> > +       struct bpf_prog *prog;
> > +       struct bpf_task_work_context *ctx = NULL;
> > +       int err;
> > +
> > +       BTF_TYPE_EMIT(struct bpf_task_work);
> > +
> > +       prog = bpf_prog_inc_not_zero(aux->prog);
> > +       if (IS_ERR(prog))
> > +               return -EBADF;
> > +
> > +       if (!atomic64_read(&map->usercnt)) {
> > +               err = -EBADF;
> > +               goto release_prog;
> > +       }
>
> Kumar pointed out that race between map_release_uref() and this check.
> It looks like a simple fix would be to perform
> bpf_task_work_context_acquire() first, and only then check for
> map->usercnt, and if it dropped to zero, just clean up and return
> -EPERM?

I believe that should work. As long as we don't create new objects to
be freed once usercnt has already dropped (and parallel racing free
already done).

>
> > +       task = bpf_task_acquire(task);
> > +       if (!task) {
> > +               err = -EPERM;
> > +               goto release_prog;
> > +       }
> > +       ctx = bpf_task_work_context_acquire(tw, map);
> > +       if (IS_ERR(ctx)) {
> > +               err = PTR_ERR(ctx);
> > +               goto release_all;
> > +       }
> > +
> > +       ctx->task = task;
> > +       ctx->callback_fn = callback_fn;
> > +       ctx->prog = prog;
> > +       ctx->mode = mode;
> > +       ctx->map = map;
> > +       ctx->map_val = (void *)tw - map->record->task_work_off;
> > +       init_irq_work(&ctx->irq_work, bpf_task_work_irq);
> > +       init_task_work(&ctx->work, bpf_task_work_callback);
> > +
> > +       irq_work_queue(&ctx->irq_work);
> > +
> > +       return 0;
> > +
> > +release_all:
> > +       bpf_task_release(task);
> > +release_prog:
> > +       bpf_prog_put(prog);
> > +       return err;
> > +}
> > +
> >  /**
> >   * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode
> >   * @task: Task struct for which callback should be scheduled
> > @@ -3711,13 +3933,11 @@ typedef void (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *v
> >   *
> >   * Return: 0 if task work has been scheduled successfully, negative error code otherwise
> >   */
> > -__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task,
> > -                                             struct bpf_task_work *tw,
> > +__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw,
> >                                               struct bpf_map *map__map,
> > -                                             bpf_task_work_callback_t callback,
> > -                                             void *aux__prog)
> > +                                             bpf_task_work_callback_t callback, void *aux__prog)
>
> this should have been folded into patch 1?
>
> >  {
> > -       return 0;
> > +       return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL);
> >  }
> >
> >  /**
> > @@ -3731,19 +3951,47 @@ __bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task,
> >   *
> >   * Return: 0 if task work has been scheduled successfully, negative error code otherwise
> >   */
> > -__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task,
> > -                                             struct bpf_task_work *tw,
> > +__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw,
> >                                               struct bpf_map *map__map,
> > -                                             bpf_task_work_callback_t callback,
> > -                                             void *aux__prog)
> > +                                             bpf_task_work_callback_t callback, void *aux__prog)
> >  {
>
> same, rebasing/refactoring artifacts
>
> > -       return 0;
> > +       enum task_work_notify_mode mode;
> > +
> > +       mode = task == current && in_nmi() ? TWA_NMI_CURRENT : TWA_RESUME;
> > +       return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, mode);
> >  }
> >
> >  __bpf_kfunc_end_defs();
> >
> >  void bpf_task_work_cancel_and_free(void *val)
> >  {
> > +       struct bpf_task_work *tw = val;
> > +       struct bpf_task_work_context *ctx;
> > +       enum bpf_task_work_state state;
> > +
> > +       /* No need do rcu_read_lock as no other codepath can reset this pointer */
>
> typo: to do
>
> > +       ctx = unrcu_pointer(xchg((struct bpf_task_work_context __force __rcu **)&tw->ctx, NULL));
> > +       if (!ctx)
> > +               return;
> > +       state = xchg(&ctx->state, BPF_TW_FREED);
> > +
> > +       switch (state) {
> > +       case BPF_TW_SCHEDULED:
> > +               /* If we can't cancel task work, rely on task work callback to free the context */
> > +               if (!task_work_cancel_match(ctx->task, task_work_match, ctx))
> > +                       break;
> > +               bpf_task_work_context_reset(ctx);
>
> As Kumar pointed out earlier, this can be called from NMI, so we can't
> just cancel here, we need to schedule irq_work to do the cancellation.
>
> Good thing is that if we were in SCHEDULED state, then we can simply
> reuse irq_work struct from task_work_ctx (please contract "context"
> into "ctx", btw), and there is no change to the state machine, it's
> just a slightly delayed clean up (and we'll remain in terminal FREED
> state anyways).
>
> Bad news: that extra irq_work step mean we have to be careful about
> ctx lifetime, because if task_work callback couldn't be cancelled, we
> might have a situation where memory is freed by task_work callback
> itself (when it fails to transition to RUNNING) while we wait for
> irq_work callback to be run, and then we get freed memory dereference.
>
> So I'm thinking that besides RCU protection we should also have a
> simple refcounting protection for bpf_task_work_ctx itself. I don't
> think that's a problem performance wise, and the nice thing is that
> there will be less direct `call_rcu_tasks_trace(&ctx->rcu,
> bpf_task_work_context_free);` sprinkled around. Instead we'll have
> just bpf_task_work_put(ctx), which might call RCU-delayed freeing.
>
> We might want to roll bpf_task_work_context_reset logic into it,
> conditionally (but we need to make sure that we reset prog and task
> pointers to NULL during a non-freeing context reset).

This also sounds workable. I'll take a look in v5.

>
> > +               fallthrough;
> > +       case BPF_TW_STANDBY:
> > +               call_rcu_tasks_trace(&ctx->rcu, bpf_task_work_context_free);
> > +               break;
> > +       /* In all below cases scheduling logic should detect context state change and cleanup */
> > +       case BPF_TW_SCHEDULING:
> > +       case BPF_TW_PENDING:
> > +       case BPF_TW_RUNNING:
> > +       default:
> > +               break;
> > +       }
> >  }
> >
> >  BTF_KFUNCS_START(generic_btf_ids)
> > @@ -3769,6 +4017,8 @@ BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
> >  BTF_ID_FLAGS(func, bpf_rbtree_root, KF_RET_NULL)
> >  BTF_ID_FLAGS(func, bpf_rbtree_left, KF_RET_NULL)
> >  BTF_ID_FLAGS(func, bpf_rbtree_right, KF_RET_NULL)
> > +BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS)
> > +BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS)
> >
> >  #ifdef CONFIG_CGROUPS
> >  BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
> > --
> > 2.50.1
> >