When running iperf through a set of XDP programs, we were able to crash machines with NICs using the mlx5_core driver. We were able to confirm that other NICs/drivers did not exhibit the same problem, and suspect this could be a memory management issue in the driver code. Specifically, we found a WARNING at include/net/page_pool/helpers.h:277 in mlx5e_page_release_fragmented.isra.0. We are able to demonstrate this issue in production using hardware, but cannot easily bisect because we don’t have a simple reproducer. I wanted to share stack traces in order to help us further debug and understand if anyone else has run into this issue. We are currently working on getting more crash dumps and doing further analysis.

The test setup looks like the following:

┌─────┐
│mlx5 │
│NIC  │
└──┬──┘
   │xdp ebpf program (does encap and XDP_TX)
   │
   ▼
┌──────────────────────┐
│xdp.frags             │
│                      │
└──┬───────────────────┘
   │tailcall
   │BPF_REDIRECT_MAP (using CPUMAP bpf type)
   ▼
┌──────────────────────┐
│xdp.frags/cpumap      │
│                      │
└──┬───────────────────┘
   │BPF_REDIRECT to veth (*potential trigger for issue)
   │
   ▼
┌──────┐
│veth  │
│      │
└──┬───┘
   │
   │
   ▼

Here an mlx5 NIC has an xdp.frags program attached which tailcalls via BPF_REDIRECT_MAP into an xdp.frags/cpumap program. For our reproducer we can choose a random valid CPU to reproduce the issue. Once that packet reaches the xdp.frags/cpumap program, we then do another BPF_REDIRECT to a veth device, which has an XDP program which redirects to an XSKMAP. It wasn’t until we added the additional BPF_REDIRECT to the veth device that we noticed this issue.

When running with 6.12.30 to 6.12.32 kernels, we are able to see the following WARNING and KASAN use-after-free reports, followed by a page fault which crashes the machine. We have not been able to test earlier or later kernels. I’ve tried to map symbols to lines of code for clarity.
------------[ cut here ]------------ WARNING: CPU: 157 PID: 0 at include/net/page_pool/helpers.h:277 mlx5e_page_release_fragmented.isra.0+0xf7/0x150 [mlx5_core] mlx5e_page_release_fragmented.isra.0 (include/net/page_pool/helpers.h:277 (discriminator 1) include/net/page_pool/helpers.h:292 (discriminator 1) drivers/net/ethernet/mellanox/mlx5/core/en_rx.c:301 (discriminator 1)) mlx5_core ================================================================== Modules linked in: BUG: KASAN: use-after-free in veth_xdp_rcv.constprop.0+0x9a6/0xc40 [veth] mptcp_diag Read of size 2 at addr ffff88b8c9eee008 by task napi/iconduit-g/681556 CPU: 34 UID: 0 PID: 681556 Comm: napi/iconduit-g Kdump: loaded Tainted: G W O 6.12.30-cloudflare-kasan-2025.5.26 #1 Tainted: [W]=WARN, [O]=OOT_MODULE Hardware name: Lenovo HR355M-V3-G12/HR355M_V3_HPM, BIOS HR355M_V3.G.031 02/17/2025 Call Trace: <TASK> dump_stack_lvl (lib/dump_stack.c:122) print_report (mm/kasan/report.c:378 mm/kasan/report.c:488) ? __pfx__raw_spin_lock_irqsave (kernel/locking/spinlock.c:161) ? veth_xdp_rcv.constprop.0 (include/net/xdp.h:323 drivers/net/veth.c:924) veth kasan_report (mm/kasan/report.c:220 mm/kasan/report.c:603) ? veth_xdp_rcv.constprop.0 (include/net/xdp.h:323 drivers/net/veth.c:924) veth veth_xdp_rcv.constprop.0 (include/net/xdp.h:323 drivers/net/veth.c:924) veth ? napi_threaded_poll_loop (net/core/dev.c:6377 net/core/dev.c:6363 net/core/dev.c:6967) ? __pfx_veth_xdp_rcv.constprop.0 (drivers/net/veth.c:899) veth ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:182) veth_poll (drivers/net/veth.c:981) veth ? update_load_avg (kernel/sched/fair.c:4531 kernel/sched/fair.c:4868) ? __pfx_veth_poll (drivers/net/veth.c:969) veth ? __pfx___perf_event_task_sched_out (kernel/events/core.c:3765) ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:182) ? 
finish_task_switch.isra.0 (arch/x86/include/asm/irqflags.h:42 arch/x86/include/asm/irqflags.h:119 kernel/sched/sched.h:1527 kernel/sched/core.c:5086 kernel/sched/core.c:5204) ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:182) ? __switch_to (arch/x86/include/asm/bitops.h:55 include/asm-generic/bitops/instrumented-atomic.h:29 include/linux/thread_info.h:89 include/linux/sched.h:1978 arch/x86/include/asm/fpu/sched.h:68 arch/x86/kernel/process_64.c:674) ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:182) ? __schedule (kernel/sched/core.c:6592) ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:182) ? __pfx_migrate_enable (kernel/sched/core.c:2338) ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:182) ? napi_pp_put_page (arch/x86/include/asm/atomic64_64.h:79 (discriminator 5) include/linux/atomic/atomic-arch-fallback.h:2913 (discriminator 5) include/linux/atomic/atomic-long.h:331 (discriminator 5) include/linux/atomic/atomic-instrumented.h:3446 (discriminator 5) include/net/page_pool/helpers.h:276 (discriminator 5) include/net/page_pool/helpers.h:308 (discriminator 5) include/net/page_pool/helpers.h:320 (discriminator 5) include/net/page_pool/helpers.h:353 (discriminator 5) net/core/skbuff.c:1040 (discriminator 5)) __napi_poll (net/core/dev.c:6837) bpf_trampoline_6442548359+0x79/0x123 ? __cond_resched (arch/x86/include/asm/preempt.h:84 (discriminator 13) kernel/sched/core.c:6891 (discriminator 13) kernel/sched/core.c:7234 (discriminator 13)) __napi_poll (net/core/dev.c:6824) napi_threaded_poll_loop (include/linux/netpoll.h:90 net/core/dev.c:6958) ? __pfx_napi_threaded_poll_loop (net/core/dev.c:6941) ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:182) ? sysvec_call_function_single (arch/x86/include/asm/hardirq.h:78 (discriminator 2) arch/x86/kernel/smp.c:266 (discriminator 2)) ? 
napi_threaded_poll (arch/x86/include/asm/bitops.h:206 arch/x86/include/asm/bitops.h:238 include/asm-generic/bitops/instrumented-non-atomic.h:142 net/core/dev.c:6926 net/core/dev.c:6983) napi_threaded_poll (net/core/dev.c:6984) ? __pfx_napi_threaded_poll (net/core/dev.c:6980) kthread (kernel/kthread.c:389) ? recalc_sigpending (arch/x86/include/asm/bitops.h:75 include/asm-generic/bitops/instrumented-atomic.h:42 include/linux/thread_info.h:94 kernel/signal.c:178) ? __pfx_kthread (kernel/kthread.c:342) ret_from_fork (arch/x86/kernel/process.c:152) ? __pfx_kthread (kernel/kthread.c:342) ret_from_fork_asm (arch/x86/entry/entry_64.S:257) </TASK> xsk_diag The buggy address belongs to the physical page: page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x38c9eee flags: 0x1effff800000000(node=7|zone=2|lastcpupid=0x1ffff) raw_diag raw: 01effff800000000 ffffea00e3075c48 ffffea00e3211648 0000000000000000 raw: 0000000000000000 0000000000000001 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected unix_diag Memory state around the buggy address: ffff88b8c9eedf00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff88b8c9eedf80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff >ffff88b8c9eee000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ af_packet_diag ffff88b8c9eee080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff88b8c9eee100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ================================================================== netlink_diag Disabling lock debugging due to kernel taint nfnetlink_queue xt_TPROXY ================================================================== BUG: KASAN: use-after-free in veth_xdp_rcv.constprop.0 (include/net/xdp.h:182 include/net/xdp.h:325 drivers/net/veth.c:924) veth nf_tproxy_ipv6 Read of size 4 at addr ffff88b8c9eee024 by task napi/iconduit-g/681556 CPU: 34 UID: 0 PID: 681556 Comm: napi/iconduit-g Kdump: loaded Tainted: G B W O 6.12.30-cloudflare-kasan-2025.5.26 #1 Tainted: 
[B]=BAD_PAGE, [W]=WARN, [O]=OOT_MODULE Hardware name: Lenovo HR355M-V3-G12/HR355M_V3_HPM, BIOS HR355M_V3.G.031 02/17/2025 Call Trace: <TASK> dump_stack_lvl (lib/dump_stack.c:122) print_report (mm/kasan/report.c:378 mm/kasan/report.c:488) ? __pfx__raw_spin_lock_irqsave (kernel/locking/spinlock.c:161) ? add_taint (include/linux/debug_locks.h:16 (discriminator 4) kernel/panic.c:602 (discriminator 4)) ? veth_xdp_rcv.constprop.0 (include/net/xdp.h:182 include/net/xdp.h:325 drivers/net/veth.c:924) veth kasan_report (mm/kasan/report.c:220 mm/kasan/report.c:603) ? veth_xdp_rcv.constprop.0 (include/net/xdp.h:182 include/net/xdp.h:325 drivers/net/veth.c:924) veth veth_xdp_rcv.constprop.0 (include/net/xdp.h:182 include/net/xdp.h:325 drivers/net/veth.c:924) veth ? napi_threaded_poll_loop (net/core/dev.c:6377 net/core/dev.c:6363 net/core/dev.c:6967) ? __pfx_veth_xdp_rcv.constprop.0 (drivers/net/veth.c:899) veth ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:182) veth_poll (drivers/net/veth.c:981) veth ? update_load_avg (kernel/sched/fair.c:4531 kernel/sched/fair.c:4868) ? __pfx_veth_poll (drivers/net/veth.c:969) veth ? __pfx___perf_event_task_sched_out (kernel/events/core.c:3765) ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:182) ? finish_task_switch.isra.0 (arch/x86/include/asm/irqflags.h:42 arch/x86/include/asm/irqflags.h:119 kernel/sched/sched.h:1527 kernel/sched/core.c:5086 kernel/sched/core.c:5204) ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:182) ? __switch_to (arch/x86/include/asm/bitops.h:55 include/asm-generic/bitops/instrumented-atomic.h:29 include/linux/thread_info.h:89 include/linux/sched.h:1978 arch/x86/include/asm/fpu/sched.h:68 arch/x86/kernel/process_64.c:674) ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:182) ? __schedule (kernel/sched/core.c:6592) ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:182) ? __pfx_migrate_enable (kernel/sched/core.c:2338) ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:182) ? 
napi_pp_put_page (arch/x86/include/asm/atomic64_64.h:79 (discriminator 5) include/linux/atomic/atomic-arch-fallback.h:2913 (discriminator 5) include/linux/atomic/atomic-long.h:331 (discriminator 5) include/linux/atomic/atomic-instrumented.h:3446 (discriminator 5) include/net/page_pool/helpers.h:276 (discriminator 5) include/net/page_pool/helpers.h:308 (discriminator 5) include/net/page_pool/helpers.h:320 (discriminator 5) include/net/page_pool/helpers.h:353 (discriminator 5) net/core/skbuff.c:1040 (discriminator 5)) __napi_poll (net/core/dev.c:6837) bpf_trampoline_6442548359+0x79/0x123 ? __cond_resched (arch/x86/include/asm/preempt.h:84 (discriminator 13) kernel/sched/core.c:6891 (discriminator 13) kernel/sched/core.c:7234 (discriminator 13)) __napi_poll (net/core/dev.c:6824) napi_threaded_poll_loop (include/linux/netpoll.h:90 net/core/dev.c:6958) ? __pfx_napi_threaded_poll_loop (net/core/dev.c:6941) ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:182) ? sysvec_call_function_single (arch/x86/include/asm/hardirq.h:78 (discriminator 2) arch/x86/kernel/smp.c:266 (discriminator 2)) ? napi_threaded_poll (arch/x86/include/asm/bitops.h:206 arch/x86/include/asm/bitops.h:238 include/asm-generic/bitops/instrumented-non-atomic.h:142 net/core/dev.c:6926 net/core/dev.c:6983) napi_threaded_poll (net/core/dev.c:6984) ? __pfx_napi_threaded_poll (net/core/dev.c:6980) kthread (kernel/kthread.c:389) ? recalc_sigpending (arch/x86/include/asm/bitops.h:75 include/asm-generic/bitops/instrumented-atomic.h:42 include/linux/thread_info.h:94 kernel/signal.c:178) ? __pfx_kthread (kernel/kthread.c:342) ret_from_fork (arch/x86/kernel/process.c:152) ? 
__pfx_kthread (kernel/kthread.c:342) ret_from_fork_asm (arch/x86/entry/entry_64.S:257) </TASK> nf_tproxy_ipv4 The buggy address belongs to the physical page: page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x38c9eee flags: 0x1effff800000000(node=7|zone=2|lastcpupid=0x1ffff) xt_socket raw: 01effff800000000 ffffea00e3075c48 ffffea00e3211648 0000000000000000 raw: 0000000000000000 0000000000000001 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected nf_socket_ipv4 Memory state around the buggy address: ffff88b8c9eedf00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff nf_socket_ipv6 ffff88b8c9eedf80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff >ffff88b8c9eee000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ xt_NFQUEUE ffff88b8c9eee080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff88b8c9eee100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ================================================================== ================================================================== overlay BUG: KASAN: use-after-free in veth_xdp_rcv_one+0xb0c/0xce0 [veth] Read of size 8 at addr ffff88b8c9eee000 by task napi/iconduit-g/681556 esp4 CPU: 34 UID: 0 PID: 681556 Comm: napi/iconduit-g Kdump: loaded Tainted: G B W O 6.12.30-cloudflare-kasan-2025.5.26 #1 Tainted: [B]=BAD_PAGE, [W]=WARN, [O]=OOT_MODULE Hardware name: Lenovo HR355M-V3-G12/HR355M_V3_HPM, BIOS HR355M_V3.G.031 02/17/2025 Call Trace: <TASK> dump_stack_lvl (lib/dump_stack.c:122) print_report (mm/kasan/report.c:378 mm/kasan/report.c:488) ? __pfx__raw_spin_lock_irqsave (kernel/locking/spinlock.c:161) ? __pfx__raw_spin_lock (kernel/locking/spinlock.c:153) ? veth_xdp_rcv_one (include/net/xdp.h:254 drivers/net/veth.c:650) veth kasan_report (mm/kasan/report.c:220 mm/kasan/report.c:603) ? veth_xdp_rcv_one (include/net/xdp.h:254 drivers/net/veth.c:650) veth veth_xdp_rcv_one (include/net/xdp.h:254 drivers/net/veth.c:650) veth ? 
veth_xdp_rcv.constprop.0 (include/net/xdp.h:182 include/net/xdp.h:325 drivers/net/veth.c:924) veth ? __pfx_veth_xdp_rcv_one (drivers/net/veth.c:639) veth ? _raw_spin_unlock_irqrestore (include/linux/spinlock_api_smp.h:152 (discriminator 2) kernel/locking/spinlock.c:194 (discriminator 2)) ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:182) ? add_taint (arch/x86/include/asm/bitops.h:60 include/asm-generic/bitops/instrumented-atomic.h:29 kernel/panic.c:605) ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:182) ? end_report.part.0 (mm/kasan/report.c:242) ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:182) ? veth_xdp_rcv.constprop.0 (include/net/xdp.h:182 include/net/xdp.h:325 drivers/net/veth.c:924) veth veth_xdp_rcv.constprop.0 (drivers/net/veth.c:926) veth ? napi_threaded_poll_loop (net/core/dev.c:6377 net/core/dev.c:6363 net/core/dev.c:6967) ? __pfx_veth_xdp_rcv.constprop.0 (drivers/net/veth.c:899) veth ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:182) veth_poll (drivers/net/veth.c:981) veth ? update_load_avg (kernel/sched/fair.c:4531 kernel/sched/fair.c:4868) ? __pfx_veth_poll (drivers/net/veth.c:969) veth ? __pfx___perf_event_task_sched_out (kernel/events/core.c:3765) ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:182) ? finish_task_switch.isra.0 (arch/x86/include/asm/irqflags.h:42 arch/x86/include/asm/irqflags.h:119 kernel/sched/sched.h:1527 kernel/sched/core.c:5086 kernel/sched/core.c:5204) ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:182) ? __switch_to (arch/x86/include/asm/bitops.h:55 include/asm-generic/bitops/instrumented-atomic.h:29 include/linux/thread_info.h:89 include/linux/sched.h:1978 arch/x86/include/asm/fpu/sched.h:68 arch/x86/kernel/process_64.c:674) ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:182) ? __schedule (kernel/sched/core.c:6592) ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:182) ? __pfx_migrate_enable (kernel/sched/core.c:2338) ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:182) ? 
napi_pp_put_page (arch/x86/include/asm/atomic64_64.h:79 (discriminator 5) include/linux/atomic/atomic-arch-fallback.h:2913 (discriminator 5) include/linux/atomic/atomic-long.h:331 (discriminator 5) include/linux/atomic/atomic-instrumented.h:3446 (discriminator 5) include/net/page_pool/helpers.h:276 (discriminator 5) include/net/page_pool/helpers.h:308 (discriminator 5) include/net/page_pool/helpers.h:320 (discriminator 5) include/net/page_pool/helpers.h:353 (discriminator 5) net/core/skbuff.c:1040 (discriminator 5)) __napi_poll (net/core/dev.c:6837) bpf_trampoline_6442548359+0x79/0x123 ? __cond_resched (arch/x86/include/asm/preempt.h:84 (discriminator 13) kernel/sched/core.c:6891 (discriminator 13) kernel/sched/core.c:7234 (discriminator 13)) __napi_poll (net/core/dev.c:6824) napi_threaded_poll_loop (include/linux/netpoll.h:90 net/core/dev.c:6958) ? __pfx_napi_threaded_poll_loop (net/core/dev.c:6941) ? srso_alias_return_thunk (arch/x86/lib/retpoline.S:182) ? sysvec_call_function_single (arch/x86/include/asm/hardirq.h:78 (discriminator 2) arch/x86/kernel/smp.c:266 (discriminator 2)) ? napi_threaded_poll (arch/x86/include/asm/bitops.h:206 arch/x86/include/asm/bitops.h:238 include/asm-generic/bitops/instrumented-non-atomic.h:142 net/core/dev.c:6926 net/core/dev.c:6983) napi_threaded_poll (net/core/dev.c:6984) ? __pfx_napi_threaded_poll (net/core/dev.c:6980) kthread (kernel/kthread.c:389) ? recalc_sigpending (arch/x86/include/asm/bitops.h:75 include/asm-generic/bitops/instrumented-atomic.h:42 include/linux/thread_info.h:94 kernel/signal.c:178) ? __pfx_kthread (kernel/kthread.c:342) ret_from_fork (arch/x86/kernel/process.c:152) ? 
__pfx_kthread (kernel/kthread.c:342) ret_from_fork_asm (arch/x86/entry/entry_64.S:257) </TASK> xt_hashlimit The buggy address belongs to the physical page: page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x38c9eee flags: 0x1effff800000000(node=7|zone=2|lastcpupid=0x1ffff) ip_set_hash_netport raw: 01effff800000000 ffffea00e3075c48 ffffea00e3211648 0000000000000000 raw: 0000000000000000 0000000000000001 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected xt_length Memory state around the buggy address: ffff88b8c9eedf00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff88b8c9eedf80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff >ffff88b8c9eee000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff nft_compat ^ ffff88b8c9eee080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff88b8c9eee100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ================================================================== nf_conntrack_netlink ================================================================== xfrm_interface BUG: KASAN: use-after-free in veth_xdp_rcv_one+0x995/0xce0 [veth] Read of size 2 at addr ffff88b8c9eee00a by task napi/iconduit-g/681556 xfrm6_tunnel CPU: 34 UID: 0 PID: 681556 Comm: napi/iconduit-g Kdump: loaded Tainted: G B W O 6.12.30-cloudflare-kasan-2025.5.26 #1 Tainted: [B]=BAD_PAGE, [W]=WARN, [O]=OOT_MODULE Hardware name: Lenovo HR355M-V3-G12/HR355M_V3_HPM, BIOS HR355M_V3.G.031 02/17/2025 Call Trace: <TASK> dump_stack_lvl+0x4b/0x70 print_report+0x14d/0x4cf ? __pfx__raw_spin_lock_irqsave+0x10/0x10 ? veth_xdp_rcv_one+0x995/0xce0 [veth] kasan_report+0xb6/0x140