From: sidchintamaneni <sidchintamaneni@xxxxxx>

Introduce an IPI-based runtime mechanism to terminate a BPF program.
When a BPF program is interrupted by an IPI, its registers are passed
to bpf_die(). Inside bpf_die() we switch the RIP and walk the stack to
replace the return addresses of the BPF prog/subprogs with the
corresponding addresses in the patched program.

bpf_die() also handles the non-inlined bpf_loop() scenario; this could
later be extended to other unrestricted iterators.

Signed-off-by: Raj <rjsu26@xxxxxxxxx>
Signed-off-by: Siddharth <sidchintamaneni@xxxxxxxxx>
---
 arch/x86/kernel/smp.c  |   4 +-
 include/linux/filter.h |  16 +++++
 include/linux/smp.h    |   2 +-
 kernel/bpf/syscall.c   | 159 +++++++++++++++++++++++++++++++++++++++++
 kernel/smp.c           |  22 ++++--
 5 files changed, 193 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 18266cc3d98c..aca5a97be19f 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -259,7 +259,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_call_function)
 	apic_eoi();
 	trace_call_function_entry(CALL_FUNCTION_VECTOR);
 	inc_irq_stat(irq_call_count);
-	generic_smp_call_function_interrupt();
+	generic_smp_call_function_interrupt(regs);
 	trace_call_function_exit(CALL_FUNCTION_VECTOR);
 }
 
@@ -268,7 +268,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_call_function_single)
 	apic_eoi();
 	trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
 	inc_irq_stat(irq_call_count);
-	generic_smp_call_function_single_interrupt();
+	generic_smp_call_function_single_interrupt(regs);
 	trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR);
 }
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index f5cf4d35d83e..cb75f62a1357 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -689,10 +689,21 @@ extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log,
 				     const struct bpf_reg_state *reg,
 				     int off, int size);
 
+void bpf_die(void *data);
 typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx,
 					  const struct bpf_insn *insnsi,
 					  unsigned int (*bpf_func)(const void *,
 								   const struct bpf_insn *));
+static inline void update_term_per_cpu_flag(const struct bpf_prog *prog, u8 cpu_flag)
+{
+	unsigned long flags;
+	u32 cpu_id = raw_smp_processor_id();
+
+	spin_lock_irqsave(&prog->termination_states->per_cpu_state[cpu_id].lock, flags);
+	prog->termination_states->per_cpu_state[cpu_id].cpu_flag = cpu_flag;
+	spin_unlock_irqrestore(&prog->termination_states->per_cpu_state[cpu_id].lock,
+			       flags);
+}
 
 static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
 					  const void *ctx,
@@ -701,12 +712,15 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
 	u32 ret;
 
 	cant_migrate();
+
 	if (static_branch_unlikely(&bpf_stats_enabled_key)) {
 		struct bpf_prog_stats *stats;
 		u64 duration, start = sched_clock();
 		unsigned long flags;
 
+		update_term_per_cpu_flag(prog, 1);
 		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
+		update_term_per_cpu_flag(prog, 0);
 
 		duration = sched_clock() - start;
 		stats = this_cpu_ptr(prog->stats);
@@ -715,7 +729,9 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
 		u64_stats_add(&stats->nsecs, duration);
 		u64_stats_update_end_irqrestore(&stats->syncp, flags);
 	} else {
+		update_term_per_cpu_flag(prog, 1);
 		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
+		update_term_per_cpu_flag(prog, 0);
 	}
 	return ret;
 }
diff --git a/include/linux/smp.h b/include/linux/smp.h
index f1aa0952e8c3..a0d8b3263a15 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -173,7 +173,7 @@ void wake_up_all_idle_cpus(void);
  * Generic and arch helpers
  */
 void __init call_function_init(void);
-void generic_smp_call_function_single_interrupt(void);
+void generic_smp_call_function_single_interrupt(struct pt_regs *regs);
 #define generic_smp_call_function_interrupt \
 	generic_smp_call_function_single_interrupt
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index fb54c5e948ff..c5911b67eb15 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -6008,6 +6008,162 @@ static int token_create(union bpf_attr *attr)
 	return bpf_token_create(attr);
 }
 
+static bool per_cpu_flag_is_true(struct termination_aux_states *term_states, int cpu_id)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&term_states->per_cpu_state[cpu_id].lock, flags);
+	if (term_states->per_cpu_state[cpu_id].cpu_flag == 1) {
+		spin_unlock_irqrestore(&term_states->per_cpu_state[cpu_id].lock, flags);
+		return true;
+	}
+	spin_unlock_irqrestore(&term_states->per_cpu_state[cpu_id].lock, flags);
+	return false;
+}
+
+static int is_bpf_address(struct bpf_prog *prog, unsigned long addr)
+{
+	unsigned long bpf_func_addr = (unsigned long)prog->bpf_func;
+
+	if ((addr > bpf_func_addr) &&
+	    (addr < bpf_func_addr + prog->jited_len))
+		return 1;
+
+	for (int subprog = 1; subprog < prog->aux->func_cnt; subprog++) {
+		struct bpf_prog *bpf_subprog = prog->aux->func[subprog];
+		unsigned long bpf_subprog_func_addr =
+			(unsigned long)bpf_subprog->bpf_func;
+
+		if ((addr > bpf_subprog_func_addr) &&
+		    (addr < bpf_subprog_func_addr + bpf_subprog->jited_len))
+			return 1;
+	}
+
+	return 0;
+}
+
+static unsigned long find_offset_in_patch_prog(struct bpf_prog *patch_prog,
+					       struct bpf_prog *prog, unsigned long addr)
+{
+	unsigned long bpf_func_addr = (unsigned long)prog->bpf_func;
+
+	if ((addr > bpf_func_addr) &&
+	    (addr < bpf_func_addr + prog->jited_len)) {
+		unsigned long offset = addr - bpf_func_addr;
+
+		return (unsigned long)patch_prog->bpf_func + offset;
+	}
+
+	for (int subprog = 1; subprog < prog->aux->func_cnt; subprog++) {
+		struct bpf_prog *bpf_subprog = prog->aux->func[subprog];
+		unsigned long bpf_subprog_func_addr =
+			(unsigned long)bpf_subprog->bpf_func;
+
+		if ((addr > bpf_subprog_func_addr) &&
+		    (addr < bpf_subprog_func_addr + bpf_subprog->jited_len)) {
+			unsigned long offset = addr - bpf_subprog_func_addr;
+
+			return (unsigned long)patch_prog->aux->func[subprog]->bpf_func + offset;
+		}
+	}
+
+	return -EINVAL;
+}
+
+void bpf_die(void *data)
+{
+	struct unwind_state state;
+	struct bpf_prog *prog, *patch_prog;
+	struct pt_regs *regs;
+	char str[KSYM_SYMBOL_LEN];
+	unsigned long addr, new_addr, bpf_loop_addr, bpf_loop_term_addr;
+	int cpu_id = raw_smp_processor_id();
+
+	prog = (struct bpf_prog *)data;
+	patch_prog = prog->termination_states->patch_prog;
+
+	if (!per_cpu_flag_is_true(prog->termination_states, cpu_id))
+		return;
+
+	regs = &prog->termination_states->pre_execution_state[cpu_id];
+	bpf_loop_addr = (unsigned long)bpf_loop_proto.func;
+	bpf_loop_term_addr = (unsigned long)bpf_loop_termination_proto.func;
+
+	unwind_start(&state, current, regs, NULL);
+	addr = unwind_get_return_address(&state);
+
+	/* The BPF program's RIP is still in BPF program context when the
+	 * termination signal raises the IPI.
+	 */
+	if (is_bpf_address(prog, addr)) {
+		new_addr = find_offset_in_patch_prog(patch_prog, prog, addr);
+		if (IS_ERR_VALUE(new_addr))
+			return;
+		regs->ip = new_addr;
+	}
+
+	unsigned long stack_addr = regs->sp;
+
+	while (addr) {
+		if (is_bpf_address(prog, addr)) {
+			while (*(unsigned long *)stack_addr != addr) {
+				stack_addr += 1;
+			}
+			new_addr = find_offset_in_patch_prog(patch_prog, prog, addr);
+			if (IS_ERR_VALUE(new_addr))
+				return;
+			*(unsigned long *)stack_addr = new_addr;
+		} else {
+			/* Handles terminating a non-inlined bpf_loop. Could be
+			 * made modular and later extended to other iterators.
+			 */
+			const char *name = kallsyms_lookup(addr, NULL, NULL, NULL, str);
+
+			if (name) {
+				unsigned long lookup_addr = kallsyms_lookup_name(name);
+
+				if (lookup_addr && lookup_addr == bpf_loop_addr) {
+					while (*(unsigned long *)stack_addr != addr) {
+						stack_addr += 1;
+					}
+					*(unsigned long *)stack_addr = bpf_loop_term_addr;
+				}
+			}
+		}
+		unwind_next_frame(&state);
+		addr = unwind_get_return_address(&state);
+	}
+
+	atomic64_dec(&prog->aux->refcnt);
+}
+
+static int bpf_prog_terminate(union bpf_attr *attr)
+{
+	struct bpf_prog *prog;
+	struct termination_aux_states *term_states;
+	int cpu_id;
+
+	prog = bpf_prog_by_id(attr->prog_id);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	term_states = prog->termination_states;
+	if (!term_states)
+		return -ENOTSUPP;
+
+	cpu_id = attr->prog_terminate.term_cpu_id;
+	if (cpu_id < 0 || cpu_id >= NR_CPUS)
+		return -EINVAL;
+
+	if (!per_cpu_flag_is_true(term_states, cpu_id))
+		return -EFAULT;
+
+	smp_call_function_single(cpu_id, bpf_die, (void *)prog, 1);
+
+	return 0;
+}
+
 static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
 {
 	union bpf_attr attr;
@@ -6144,6 +6300,9 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
 	case BPF_TOKEN_CREATE:
 		err = token_create(&attr);
 		break;
+	case BPF_PROG_TERMINATE:
+		err = bpf_prog_terminate(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/kernel/smp.c b/kernel/smp.c
index 974f3a3962e8..f4dcc493b63f 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -26,6 +26,7 @@
 #include <linux/sched/debug.h>
 #include <linux/jump_label.h>
 #include <linux/string_choices.h>
+#include <linux/filter.h>
 #include <trace/events/ipi.h>
 
 #define CREATE_TRACE_POINTS
@@ -49,7 +50,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
 
 static DEFINE_PER_CPU(atomic_t, trigger_backtrace) = ATOMIC_INIT(1);
 
-static void __flush_smp_call_function_queue(bool warn_cpu_offline);
+static void __flush_smp_call_function_queue(struct pt_regs *regs, bool warn_cpu_offline);
 
 int smpcfd_prepare_cpu(unsigned int cpu)
 {
@@ -94,7 +95,7 @@ int smpcfd_dying_cpu(unsigned int cpu)
 	 * ensure that the outgoing CPU doesn't go offline with work
 	 * still pending.
 	 */
-	__flush_smp_call_function_queue(false);
+	__flush_smp_call_function_queue(NULL, false);
 	irq_work_run();
 	return 0;
 }
@@ -452,14 +453,15 @@ static int generic_exec_single(int cpu, call_single_data_t *csd)
  * Invoked by arch to handle an IPI for call function single.
  * Must be called with interrupts disabled.
  */
-void generic_smp_call_function_single_interrupt(void)
+void generic_smp_call_function_single_interrupt(struct pt_regs *regs)
 {
-	__flush_smp_call_function_queue(true);
+	__flush_smp_call_function_queue(regs, true);
 }
 
 /**
  * __flush_smp_call_function_queue - Flush pending smp-call-function callbacks
  *
+ * @regs: register state of the CPU when the IPI interrupted it
  * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
  *		      offline CPU. Skip this check if set to 'false'.
  *
@@ -471,7 +473,7 @@ void generic_smp_call_function_single_interrupt(void)
  * Loop through the call_single_queue and run all the queued callbacks.
  * Must be called with interrupts disabled.
  */
-static void __flush_smp_call_function_queue(bool warn_cpu_offline)
+static void __flush_smp_call_function_queue(struct pt_regs *regs, bool warn_cpu_offline)
 {
 	call_single_data_t *csd, *csd_next;
 	struct llist_node *entry, *prev;
@@ -536,6 +538,12 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline)
 					entry = &csd_next->node.llist;
 				}
 
+				if (regs && func == bpf_die) {
+					int cpu_id = raw_smp_processor_id();
+					struct bpf_prog *prog = (struct bpf_prog *)info;
+
+					prog->termination_states->pre_execution_state[cpu_id] = *regs;
+				}
 				csd_lock_record(csd);
 				csd_do_func(func, info, csd);
 				csd_unlock(csd);
@@ -567,8 +575,8 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline)
 			void *info = csd->info;
 
 			csd_lock_record(csd);
-			csd_unlock(csd);
 			csd_do_func(func, info, csd);
+			csd_unlock(csd);
 			csd_lock_record(NULL);
 		} else if (type == CSD_TYPE_IRQ_WORK) {
 			irq_work_single(csd);
@@ -612,7 +620,7 @@ void flush_smp_call_function_queue(void)
 	local_irq_save(flags);
 	/* Get the already pending soft interrupts for RT enabled kernels */
 	was_pending = local_softirq_pending();
-	__flush_smp_call_function_queue(true);
+	__flush_smp_call_function_queue(NULL, true);
 
 	if (local_softirq_pending())
 		do_softirq_post_smp_call_flush(was_pending);
-- 
2.43.0
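
A minimal user-space sketch of how the new command could be exercised follows. It assumes the uapi additions from elsewhere in this series (the BPF_PROG_TERMINATE command and a prog_terminate member of union bpf_attr carrying term_cpu_id); that layout is not part of this patch, so the field names below only mirror how bpf_prog_terminate() reads the attribute and should be treated as assumptions.

/* Illustrative sketch only: BPF_PROG_TERMINATE and attr.prog_terminate
 * are assumed to come from the uapi patch elsewhere in this series.
 */
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int bpf_prog_terminate_on_cpu(uint32_t prog_id, uint32_t cpu)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_id = prog_id;				/* program to terminate */
	attr.prog_terminate.term_cpu_id = cpu;		/* CPU the program is running on */

	/* The kernel IPIs @cpu and bpf_die() redirects the interrupted
	 * program into its patched counterpart; the call fails with EFAULT
	 * if the program is not currently executing on that CPU.
	 */
	return syscall(__NR_bpf, BPF_PROG_TERMINATE, &attr, sizeof(attr));
}

On success the targeted CPU leaves the BPF program through the patched image instead of completing the original program.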