Re: [PATCH bpf-next v5 2/5] bpf: Implement mprog API on top of existing cgroup progs

Andrii Nakryiko <andrii.nakryiko@xxxxxxxxx> · Mon, 9 Jun 2025 16:35:43 -0700

On Fri, Jun 6, 2025 at 9:31 AM Yonghong Song <yonghong.song@xxxxxxxxx> wrote:
>
> Current cgroup prog ordering is appending at attachment time. This is not
> ideal. In some cases, users want specific ordering at a particular cgroup
> level. To address this, the existing mprog API seems an ideal solution with
> supporting BPF_F_BEFORE and BPF_F_AFTER flags.
>
> But there are a few obstacles to directly use kernel mprog interface.
> Currently cgroup bpf progs already support prog attach/detach/replace
> and link-based attach/detach/replace. For example, in struct
> bpf_prog_array_item, the cgroup_storage field needs to be together
> with bpf prog. But the mprog API struct bpf_mprog_fp only has bpf_prog
> as the member, which makes it difficult to use kernel mprog interface.
>
> In another case, the current cgroup prog detach tries to use the
> same flag as in attach. This is different from mprog kernel interface
> which uses flags passed from user space.
>
> So to avoid modifying existing behavior, I made the following changes to
> support mprog API for cgroup progs:
>  - The support is for prog list at cgroup level. Cross-level prog list
>    (a.k.a. effective prog list) is not supported.
>  - Previously, BPF_F_PREORDER is supported only for prog attach, now
>    BPF_F_PREORDER is also supported by link-based attach.
>  - For attach, BPF_F_BEFORE/BPF_F_AFTER/BPF_F_ID/BPF_F_LINK is supported
>    similar to kernel mprog but with different implementation.
>  - For detach and replace, use the existing implementation.
>  - For attach, detach and replace, the revision for a particular prog
>    list, associated with a particular attach type, will be updated
>    by increasing count by 1.
>
> Signed-off-by: Yonghong Song <yonghong.song@xxxxxxxxx>
> ---
>  include/uapi/linux/bpf.h       |   7 ++
>  kernel/bpf/cgroup.c            | 188 +++++++++++++++++++++++++++++----
>  kernel/bpf/syscall.c           |  44 +++++---
>  tools/include/uapi/linux/bpf.h |   7 ++
>  4 files changed, 209 insertions(+), 37 deletions(-)
>

[...]

> +static struct bpf_link *bpf_get_anchor_link(u32 flags, u32 id_or_fd)
> +{
> +       struct bpf_link *link = ERR_PTR(-EINVAL);
> +
> +       if (flags & BPF_F_ID)
> +               link = bpf_link_by_id(id_or_fd);
> +       else if (id_or_fd)
> +               link = bpf_link_get_from_fd(id_or_fd);
> +       if (IS_ERR(link))
> +               return ERR_PTR(PTR_ERR(link));

this can be just `return link;` (same below for prog)

simplified while applying

> +
> +       return link;
> +}
> +

[...]

> +       if (is_link) {
> +               anchor_link = bpf_get_anchor_link(flags, id_or_fd);
> +               if (IS_ERR(anchor_link))
> +                       return ERR_PTR(PTR_ERR(anchor_link));
> +       } else if (is_id || id_or_fd) {

this can be just `else {` with no conditions, no? Basically, if
BPF_F_LINK -- fetch link, otherwise assume program. Or am I missing
something? I didn't touch this part, but maybe we can simplify this a
bit in the follow up?

> +               anchor_prog = bpf_get_anchor_prog(flags, id_or_fd);
> +               if (IS_ERR(anchor_prog))
> +                       return ERR_PTR(PTR_ERR(anchor_prog));
> +       }
> +

[...]

> @@ -4244,20 +4266,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
>         case BPF_PROG_TYPE_FLOW_DISSECTOR:
>                 ret = netns_bpf_prog_attach(attr, prog);
>                 break;
> -       case BPF_PROG_TYPE_CGROUP_DEVICE:
> -       case BPF_PROG_TYPE_CGROUP_SKB:
> -       case BPF_PROG_TYPE_CGROUP_SOCK:
> -       case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
> -       case BPF_PROG_TYPE_CGROUP_SOCKOPT:
> -       case BPF_PROG_TYPE_CGROUP_SYSCTL:
> -       case BPF_PROG_TYPE_SOCK_OPS:
> -       case BPF_PROG_TYPE_LSM:
> -               if (ptype == BPF_PROG_TYPE_LSM &&
> -                   prog->expected_attach_type != BPF_LSM_CGROUP)
> -                       ret = -EINVAL;
> -               else
> -                       ret = cgroup_bpf_prog_attach(attr, ptype, prog);
> -               break;
>         case BPF_PROG_TYPE_SCHED_CLS:
>                 if (attr->attach_type == BPF_TCX_INGRESS ||
>                     attr->attach_type == BPF_TCX_EGRESS)
> @@ -4266,7 +4274,10 @@ static int bpf_prog_attach(const union bpf_attr *attr)
>                         ret = netkit_prog_attach(attr, prog);
>                 break;
>         default:
> -               ret = -EINVAL;
> +               if (!is_cgroup_prog_type(ptype, prog->expected_attach_type, true))
> +                       ret = -EINVAL;
> +               else
> +                       ret = cgroup_bpf_prog_attach(attr, ptype, prog);

I tried to get used to this is_cgroup_prog_type() check inside
switch's default, but it feels too surprising and ugly. I moved it to
before the switch to be more explicit. I hope you don't mind.

I ended up with this diff on top of your patches:

diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index c3ac5661da27..ffbafbef5010 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -666,9 +666,6 @@ static struct bpf_link *bpf_get_anchor_link(u32
flags, u32 id_or_fd)
                link = bpf_link_by_id(id_or_fd);
        else if (id_or_fd)
                link = bpf_link_get_from_fd(id_or_fd);
-       if (IS_ERR(link))
-               return ERR_PTR(PTR_ERR(link));
-
        return link;
 }

@@ -680,9 +677,6 @@ static struct bpf_prog *bpf_get_anchor_prog(u32
flags, u32 id_or_fd)
                prog = bpf_prog_by_id(id_or_fd);
        else if (id_or_fd)
                prog = bpf_prog_get(id_or_fd);
-       if (IS_ERR(prog))
-               return prog;
-
        return prog;
 }

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2035093eeeb3..97ad57ffc404 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4255,6 +4255,11 @@ static int bpf_prog_attach(const union bpf_attr *attr)
                return -EINVAL;
        }

+       if (is_cgroup_prog_type(ptype, prog->expected_attach_type, true)) {
+               ret = cgroup_bpf_prog_attach(attr, ptype, prog);
+               goto out;
+       }
+
        switch (ptype) {
        case BPF_PROG_TYPE_SK_SKB:
        case BPF_PROG_TYPE_SK_MSG:
@@ -4274,12 +4279,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
                        ret = netkit_prog_attach(attr, prog);
                break;
        default:
-               if (!is_cgroup_prog_type(ptype,
prog->expected_attach_type, true))
-                       ret = -EINVAL;
-               else
-                       ret = cgroup_bpf_prog_attach(attr, ptype, prog);
+               ret = -EINVAL;
        }
-
+out:
        if (ret)
                bpf_prog_put(prog);
        return ret;