On Fri, Jun 6, 2025 at 9:31 AM Yonghong Song <yonghong.song@xxxxxxxxx> wrote: > > Current cgroup prog ordering is appending at attachment time. This is not > ideal. In some cases, users want specific ordering at a particular cgroup > level. To address this, the existing mprog API seems an ideal solution with > supporting BPF_F_BEFORE and BPF_F_AFTER flags. > > But there are a few obstacles to directly use kernel mprog interface. > Currently cgroup bpf progs already support prog attach/detach/replace > and link-based attach/detach/replace. For example, in struct > bpf_prog_array_item, the cgroup_storage field needs to be together > with bpf prog. But the mprog API struct bpf_mprog_fp only has bpf_prog > as the member, which makes it difficult to use kernel mprog interface. > > In another case, the current cgroup prog detach tries to use the > same flag as in attach. This is different from mprog kernel interface > which uses flags passed from user space. > > So to avoid modifying existing behavior, I made the following changes to > support mprog API for cgroup progs: > - The support is for prog list at cgroup level. Cross-level prog list > (a.k.a. effective prog list) is not supported. > - Previously, BPF_F_PREORDER is supported only for prog attach, now > BPF_F_PREORDER is also supported by link-based attach. > - For attach, BPF_F_BEFORE/BPF_F_AFTER/BPF_F_ID/BPF_F_LINK is supported > similar to kernel mprog but with different implementation. > - For detach and replace, use the existing implementation. > - For attach, detach and replace, the revision for a particular prog > list, associated with a particular attach type, will be updated > by increasing count by 1. > > Signed-off-by: Yonghong Song <yonghong.song@xxxxxxxxx> > --- > include/uapi/linux/bpf.h | 7 ++ > kernel/bpf/cgroup.c | 188 +++++++++++++++++++++++++++++---- > kernel/bpf/syscall.c | 44 +++++--- > tools/include/uapi/linux/bpf.h | 7 ++ > 4 files changed, 209 insertions(+), 37 deletions(-) > [...] > +static struct bpf_link *bpf_get_anchor_link(u32 flags, u32 id_or_fd) > +{ > + struct bpf_link *link = ERR_PTR(-EINVAL); > + > + if (flags & BPF_F_ID) > + link = bpf_link_by_id(id_or_fd); > + else if (id_or_fd) > + link = bpf_link_get_from_fd(id_or_fd); > + if (IS_ERR(link)) > + return ERR_PTR(PTR_ERR(link)); this can be just `return link;` (same below for prog) simplified while applying > + > + return link; > +} > + [...] > + if (is_link) { > + anchor_link = bpf_get_anchor_link(flags, id_or_fd); > + if (IS_ERR(anchor_link)) > + return ERR_PTR(PTR_ERR(anchor_link)); > + } else if (is_id || id_or_fd) { this can be just `else {` with no conditions, no? Basically, if BPF_F_LINK -- fetch link, otherwise assume program. Or am I missing something? I didn't touch this part, but maybe we can simplify this a bit in the follow up? > + anchor_prog = bpf_get_anchor_prog(flags, id_or_fd); > + if (IS_ERR(anchor_prog)) > + return ERR_PTR(PTR_ERR(anchor_prog)); > + } > + [...] > @@ -4244,20 +4266,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) > case BPF_PROG_TYPE_FLOW_DISSECTOR: > ret = netns_bpf_prog_attach(attr, prog); > break; > - case BPF_PROG_TYPE_CGROUP_DEVICE: > - case BPF_PROG_TYPE_CGROUP_SKB: > - case BPF_PROG_TYPE_CGROUP_SOCK: > - case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: > - case BPF_PROG_TYPE_CGROUP_SOCKOPT: > - case BPF_PROG_TYPE_CGROUP_SYSCTL: > - case BPF_PROG_TYPE_SOCK_OPS: > - case BPF_PROG_TYPE_LSM: > - if (ptype == BPF_PROG_TYPE_LSM && > - prog->expected_attach_type != BPF_LSM_CGROUP) > - ret = -EINVAL; > - else > - ret = cgroup_bpf_prog_attach(attr, ptype, prog); > - break; > case BPF_PROG_TYPE_SCHED_CLS: > if (attr->attach_type == BPF_TCX_INGRESS || > attr->attach_type == BPF_TCX_EGRESS) > @@ -4266,7 +4274,10 @@ static int bpf_prog_attach(const union bpf_attr *attr) > ret = netkit_prog_attach(attr, prog); > break; > default: > - ret = -EINVAL; > + if (!is_cgroup_prog_type(ptype, prog->expected_attach_type, true)) > + ret = -EINVAL; > + else > + ret = cgroup_bpf_prog_attach(attr, ptype, prog); I tried to get used to this is_cgroup_prog_type() check inside switch's default, but it feels too surprising and ugly. I moved it to before the switch to be more explicit. I hope you don't mind. I ended up with this diff on top of your patches: diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index c3ac5661da27..ffbafbef5010 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -666,9 +666,6 @@ static struct bpf_link *bpf_get_anchor_link(u32 flags, u32 id_or_fd) link = bpf_link_by_id(id_or_fd); else if (id_or_fd) link = bpf_link_get_from_fd(id_or_fd); - if (IS_ERR(link)) - return ERR_PTR(PTR_ERR(link)); - return link; } @@ -680,9 +677,6 @@ static struct bpf_prog *bpf_get_anchor_prog(u32 flags, u32 id_or_fd) prog = bpf_prog_by_id(id_or_fd); else if (id_or_fd) prog = bpf_prog_get(id_or_fd); - if (IS_ERR(prog)) - return prog; - return prog; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2035093eeeb3..97ad57ffc404 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4255,6 +4255,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) return -EINVAL; } + if (is_cgroup_prog_type(ptype, prog->expected_attach_type, true)) { + ret = cgroup_bpf_prog_attach(attr, ptype, prog); + goto out; + } + switch (ptype) { case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: @@ -4274,12 +4279,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) ret = netkit_prog_attach(attr, prog); break; default: - if (!is_cgroup_prog_type(ptype, prog->expected_attach_type, true)) - ret = -EINVAL; - else - ret = cgroup_bpf_prog_attach(attr, ptype, prog); + ret = -EINVAL; } - +out: if (ret) bpf_prog_put(prog); return ret;