[PATCH 3/4] bpf: runtime part of fast-path termination approach

Update the softlockup detection logic to detect stalls caused by
BPF programs. When a softlockup is detected, bpf_die is queued on
a workqueue. With this implementation, the termination handler is
only triggered when CONFIG_SOFTLOCKUP_DETECTOR is enabled.

Inside bpf_die, we use text_poke to stub out the program's
helper/kfunc call sites. The current implementation handles
termination of long-running bpf_loop iterators in both the inlined
and non-inlined cases.

A limitation of this implementation is that the termination handler
needs at least one CPU on which to run.

Signed-off-by: Raj Sahu <rjsu26@xxxxxxxxx>
Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@xxxxxxxxx>
---
 arch/x86/net/bpf_jit_comp.c | 132 ++++++++++++++++++++++++++++++++++++
 include/linux/bpf.h         |   2 +
 include/linux/filter.h      |   6 ++
 kernel/bpf/core.c           |  35 +++++++++-
 kernel/watchdog.c           |   8 +++
 5 files changed, 182 insertions(+), 1 deletion(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 107a44729675..4de9a8cdc465 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -2606,6 +2606,10 @@ st:			if (is_imm8(insn->off))
 				if (arena_vm_start)
 					pop_r12(&prog);
 			}
+			/* Emit a 5-byte NOP patch site for non-inlined bpf_loop callbacks */
+			if (bpf_is_subprog(bpf_prog) && bpf_prog->aux->is_bpf_loop_cb_non_inline)
+				emit_nops(&prog, X86_PATCH_SIZE);
 			EMIT1(0xC9);         /* leave */
 			emit_return(&prog, image + addrs[i - 1] + (prog - temp));
 			break;
@@ -3849,6 +3855,132 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp
 #endif
 }
 
+void in_place_patch_bpf_prog(struct bpf_prog *prog)
+{
+	struct call_aux_states *call_states;
+	unsigned long new_target;
+	unsigned char *addr;
+	u8 ret_jmp_size = 1;
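+	/*
+	 * With return thunks enabled the epilogue ends in a 5-byte
+	 * "jmp __x86_return_thunk" rather than a 1-byte ret.
+	 */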
+	if (cpu_wants_rethunk())
+		ret_jmp_size = 5;
+	call_states = prog->term_states->patch_call_sites->call_states;
+	for (int i = 0; i < prog->term_states->patch_call_sites->call_sites_cnt; i++) {
+		char new_insn[5];
+
+		new_target = (unsigned long) bpf_termination_null_func;
+		if (call_states[i].is_bpf_loop_cb_inline)
+			new_target = (unsigned long) bpf_loop_term_callback;
+
+		addr = (unsigned char *)prog->bpf_func + call_states[i].jit_call_idx;
+
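+		/*
+		 * Encode a near CALL (opcode 0xE8 + rel32); the displacement
+		 * is relative to the first byte after the 5-byte instruction.
+		 */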
+		unsigned long new_rel = (unsigned long)(new_target - (unsigned long)(addr + 5));
+		new_insn[0] = 0xE8;
+		new_insn[1] = (new_rel >> 0) & 0xFF;
+		new_insn[2] = (new_rel >> 8) & 0xFF;
+		new_insn[3] = (new_rel >> 16) & 0xFF;
+		new_insn[4] = (new_rel >> 24) & 0xFF;
+
+		smp_text_poke_batch_add(addr, new_insn, 5 /* call instruction len */, NULL);
+	}
+
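+	/*
+	 * Non-inlined bpf_loop callbacks carry a 5-byte NOP before their
+	 * epilogue; overwrite it with "mov eax, 1" so the callback returns
+	 * nonzero and bpf_loop stops iterating.
+	 */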
+	if (prog->aux->is_bpf_loop_cb_non_inline) {
+		char new_insn[5] = { 0xB8, 0x01, 0x00, 0x00, 0x00 }; /* mov eax, 1 */
+		char old_insn[5] = { 0x0F, 0x1F, 0x44, 0x00, 0x00 }; /* 5-byte NOP */
+
+		smp_text_poke_batch_add(prog->bpf_func + prog->jited_len -
+				(1 + ret_jmp_size) /* leave, ret/jmp */ - 5 /* NOP size */,
+				new_insn, 5, old_insn);
+	}
+
+	/* flush all text poke calls */
+	smp_text_poke_batch_finish();
+}
+
+void bpf_die(struct bpf_prog *prog)
+{
+	u8 ret_jmp_size = 1;
+
+	if (cpu_wants_rethunk())
+		ret_jmp_size = 5;
+
+	/*
+	 * Replace the 5-byte NOP in the prologue with a jmp to the final
+	 * ret; the rel32 is relative to the end of the patched instruction.
+	 */
+	unsigned long jmp_offset = prog->jited_len - (4 /* endbr is 4 bytes */
+					+ 5 /* the NOP/jmp itself is 5 bytes */
+					+ ret_jmp_size /* 5-byte jmp return_thunk or 1-byte ret */);
+
+	char new_insn[5];
+	new_insn[0] = 0xE9;
+	new_insn[1] = (jmp_offset >> 0) & 0xFF;
+	new_insn[2] = (jmp_offset >> 8) & 0xFF;
+	new_insn[3] = (jmp_offset >> 16) & 0xFF;
+	new_insn[4] = (jmp_offset >> 24) & 0xFF;
+
+	smp_text_poke_batch_add(prog->bpf_func + 4, new_insn, 5, NULL);
+
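+	/*
+	 * Multi-function programs: patch every JITed subprog; otherwise
+	 * patch the main program image.
+	 */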
+	if (prog->aux->func_cnt) {
+		for (int i = 0; i < prog->aux->func_cnt; i++) {
+			in_place_patch_bpf_prog(prog->aux->func[i]);
+		}
+	} else {
+		in_place_patch_bpf_prog(prog);
+	}
+}
+
+void bpf_prog_termination_deferred(struct work_struct *work)
+{
+	struct bpf_term_aux_states *term_states = container_of(work, struct bpf_term_aux_states,
+						 work);
+	struct bpf_prog *prog = term_states->prog;
+
+	bpf_die(prog);
+}
+
+static struct workqueue_struct *bpf_termination_wq;
+
+void bpf_softlockup(u32 dur_s)
+{
+	unsigned long addr;
+	struct unwind_state state;
+	struct bpf_prog *prog;
+
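+	/*
+	 * Walk the stalled task's stack; the first non-subprog BPF text
+	 * address identifies the program to terminate.
+	 */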
+	for (unwind_start(&state, current, NULL, NULL); !unwind_done(&state);
+	     unwind_next_frame(&state)) {
+		addr = unwind_get_return_address(&state);
+		if (!addr)
+			break;
+
+		if (!is_bpf_text_address(addr))
+			continue;
+
+		rcu_read_lock();
+		prog = bpf_prog_ksym_find(addr);
+		rcu_read_unlock();
+		if (bpf_is_subprog(prog))
+			continue;
+
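+		/*
+		 * Only the first CPU to observe the stall queues the
+		 * termination work.
+		 */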
+		if (atomic_cmpxchg(&prog->term_states->bpf_die_in_progress, 0, 1))
+			break;
+
+		bpf_termination_wq = alloc_workqueue("bpf_termination_wq", WQ_UNBOUND, 1);
+		if (!bpf_termination_wq) {
+			pr_err("Failed to alloc workqueue for bpf termination.\n");
+			break;
+		}
+
+		queue_work(bpf_termination_wq, &prog->term_states->work);
+
+		/* Currently, nested programs are not terminated together.
+		 * Removing this break would cause BPF trampolines to be
+		 * identified by is_bpf_text_address(), resulting in a NULL
+		 * pointer deref on the next iteration.
+		 */
+		break;
+	}
+}
+
 void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
 			       struct bpf_prog *new, struct bpf_prog *old)
 {
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index caaee33744fc..03fce8f2c466 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1600,6 +1601,7 @@ struct bpf_term_patch_call_sites {
 struct bpf_term_aux_states {
 	struct bpf_prog *prog;
 	struct work_struct work;
+	atomic_t bpf_die_in_progress;
 	struct bpf_term_patch_call_sites *patch_call_sites;
 };
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 9092d8ea95c8..4f0f8fe478bf 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1123,6 +1123,8 @@ int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len);
 bool sk_filter_charge(struct sock *sk, struct sk_filter *fp);
 void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
 
+void *bpf_termination_null_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+int bpf_loop_term_callback(u64 reg_loop_cnt, u64 *reg_loop_ctx);
 u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 #define __bpf_call_base_args \
 	((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \
@@ -1257,6 +1259,10 @@ bpf_jit_binary_pack_hdr(const struct bpf_prog *fp);
 
 void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns);
 void bpf_prog_pack_free(void *ptr, u32 size);
+void bpf_softlockup(u32 dur_s);
+void bpf_prog_termination_deferred(struct work_struct *work);
+void bpf_die(struct bpf_prog *prog);
+void in_place_patch_bpf_prog(struct bpf_prog *prog);
 
 static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
 {
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 93442ab2acde..7b0552d15be3 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -41,6 +41,7 @@
 #include <linux/execmem.h>
 
 #include <asm/barrier.h>
+#include <asm/unwind.h>
 #include <linux/unaligned.h>
 
 /* Registers */
@@ -95,6 +96,37 @@ enum page_size_enum {
 	__PAGE_SIZE = PAGE_SIZE
 };
 
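+/* Stub patched over a dying program's helper/kfunc call sites; returns NULL. */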
+void *bpf_termination_null_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	return NULL;
+}
+
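+/* Patched in for inlined bpf_loop callbacks; returning 1 stops the iteration. */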
+int bpf_loop_term_callback(u64 reg_loop_cnt, u64 *reg_loop_ctx)
+{
+	return 1;
+}
+
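+/* Weak no-op fallbacks for architectures without fast-path termination support. */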
+void __weak in_place_patch_bpf_prog(struct bpf_prog *prog)
+{
+}
+
+void __weak bpf_die(struct bpf_prog *prog)
+{
+}
+
+void __weak bpf_prog_termination_deferred(struct work_struct *work)
+{
+}
+
+void __weak bpf_softlockup(u32 dur_s)
+{
+}
+
 struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
 {
 	gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
@@ -134,11 +166,12 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
 	fp->jit_requested = ebpf_jit_enabled();
 	fp->blinding_requested = bpf_jit_blinding_enabled(fp);
 	fp->term_states = term_states;
+	atomic_set(&fp->term_states->bpf_die_in_progress, 0);
 	fp->term_states->patch_call_sites = patch_call_sites;
 	fp->term_states->patch_call_sites->call_sites_cnt = 0;
 	fp->term_states->patch_call_sites->call_states = NULL;
 	fp->term_states->prog = fp;
+	INIT_WORK(&fp->term_states->work, bpf_prog_termination_deferred);
 
 #ifdef CONFIG_CGROUP_BPF
 	aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID;
 #endif
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 80b56c002c7f..59c91c18ca0e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -25,6 +25,7 @@
 #include <linux/stop_machine.h>
 #include <linux/sysctl.h>
 #include <linux/tick.h>
+#include <linux/filter.h>
 
 #include <linux/sched/clock.h>
 #include <linux/sched/debug.h>
@@ -700,6 +701,13 @@ static int is_softlockup(unsigned long touch_ts,
 		if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4))
 			scx_softlockup(now - touch_ts);
 
+		/*
+		 * Long-running BPF programs can stall CPUs, so trigger
+		 * fast-path termination to terminate such programs.
+		 */
+		if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4))
+			bpf_softlockup(now - touch_ts);
+
 		/* Warn about unreasonable delays. */
 		if (time_after(now, period_ts + get_softlockup_thresh()))
 			return now - touch_ts;
-- 
2.43.0
