Re: [PATCH bpf-next v1 3/4] bpf: add bpf_icmp_send_unreach cgroup_skb kfunc

Alexei Starovoitov <alexei.starovoitov@xxxxxxxxx> · Thu, 10 Jul 2025 09:07:59 -0700

On Thu, Jul 10, 2025 at 3:26 AM Mahe Tardy <mahe.tardy@xxxxxxxxx> wrote:
>
> This is needed in the context of Tetragon to provide improved feedback
> (in contrast to just dropping packets) to east-west traffic when blocked
> by policies using cgroup_skb programs.
>
> This reuse concepts from netfilter reject target codepath with the
> differences that:
> * Packets are cloned since the BPF user can still return SK_PASS from
>   the cgroup_skb progs and the current skb need to stay untouched
>   (cgroup_skb hooks only allow read-only skb payload).
> * Since cgroup_skb programs are called late in the stack, checksums do
>   not need to be computed or verified, and IPv4 fragmentation does not
>   need to be checked (ip_local_deliver should take care of that
>   earlier).
>
> Signed-off-by: Mahe Tardy <mahe.tardy@xxxxxxxxx>
> ---
>  net/core/filter.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 60 insertions(+), 1 deletion(-)
>
> diff --git a/net/core/filter.c b/net/core/filter.c
> index ab456bf1056e..9215f79e7690 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -85,6 +85,8 @@
>  #include <linux/un.h>
>  #include <net/xdp_sock_drv.h>
>  #include <net/inet_dscp.h>
> +#include <linux/icmp.h>
> +#include <net/icmp.h>
>
>  #include "dev.h"
>
> @@ -12140,6 +12142,53 @@ __bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops,
>         return 0;
>  }
>
> +__bpf_kfunc int bpf_icmp_send_unreach(struct __sk_buff *__skb, int code)
> +{
> +       struct sk_buff *skb = (struct sk_buff *)__skb;
> +       struct sk_buff *nskb;
> +
> +       switch (skb->protocol) {
> +       case htons(ETH_P_IP):
> +               if (code < 0 || code > NR_ICMP_UNREACH)
> +                       return -EINVAL;
> +
> +               nskb = skb_clone(skb, GFP_ATOMIC);
> +               if (!nskb)
> +                       return -ENOMEM;
> +
> +               if (ip_route_reply_fetch_dst(nskb) < 0) {
> +                       kfree_skb(nskb);
> +                       return -EHOSTUNREACH;
> +               }
> +
> +               icmp_send(nskb, ICMP_DEST_UNREACH, code, 0);
> +               kfree_skb(nskb);
> +               break;
> +#if IS_ENABLED(CONFIG_IPV6)
> +       case htons(ETH_P_IPV6):
> +               if (code < 0 || code > ICMPV6_REJECT_ROUTE)
> +                       return -EINVAL;
> +
> +               nskb = skb_clone(skb, GFP_ATOMIC);
> +               if (!nskb)
> +                       return -ENOMEM;
> +
> +               if (ip6_route_reply_fetch_dst(nskb) < 0) {
> +                       kfree_skb(nskb);
> +                       return -EHOSTUNREACH;
> +               }
> +
> +               icmpv6_send(nskb, ICMPV6_DEST_UNREACH, code, 0);
> +               kfree_skb(nskb);
> +               break;
> +#endif
> +       default:
> +               return -EPROTONOSUPPORT;
> +       }
> +
> +       return 0;
> +}
> +
>  __bpf_kfunc_end_defs();
>
>  int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags,
> @@ -12177,6 +12226,10 @@ BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops)
>  BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS)
>  BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops)
>
> +BTF_KFUNCS_START(bpf_kfunc_check_set_icmp_send_unreach)
> +BTF_ID_FLAGS(func, bpf_icmp_send_unreach, KF_TRUSTED_ARGS)
> +BTF_KFUNCS_END(bpf_kfunc_check_set_icmp_send_unreach)
> +
>  static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
>         .owner = THIS_MODULE,
>         .set = &bpf_kfunc_check_set_skb,
> @@ -12202,6 +12255,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_sock_ops = {
>         .set = &bpf_kfunc_check_set_sock_ops,
>  };
>
> +static const struct btf_kfunc_id_set bpf_kfunc_set_icmp_send_unreach = {
> +       .owner = THIS_MODULE,
> +       .set = &bpf_kfunc_check_set_icmp_send_unreach,
> +};
> +
>  static int __init bpf_kfunc_init(void)
>  {
>         int ret;
> @@ -12221,7 +12279,8 @@ static int __init bpf_kfunc_init(void)
>         ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
>                                                &bpf_kfunc_set_sock_addr);
>         ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk);
> -       return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCK_OPS, &bpf_kfunc_set_sock_ops);
> +       ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCK_OPS, &bpf_kfunc_set_sock_ops);
> +       return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &bpf_kfunc_set_icmp_send_unreach);

Does it have to be restricted to BPF_PROG_TYPE_CGROUP_SKB ?
Can it be a part of bpf_kfunc_set_skb[] and used more generally ?

If restriction is necessary then I guess we can live with extra
bpf_kfunc_set_icmp_send_unreach, though it's odd to create a set
just for one kfunc.
Either way don't change the last 'return ...' line in this file.
Add 'ret = ret ?: register...' instead to reduce churn.

Also cc netdev and netfilter maintainers in v2.

--
pw-bot: cr