From: sidchintamaneni <sidchintamaneni@xxxxxx>

Introduce an IPI-based runtime mechanism to terminate a BPF program.
When a BPF program is interrupted by an IPI, its registers are passed
to bpf_die(). Inside bpf_die() we switch the RIP and walk the stack to
replace the return addresses of the BPF prog/subprogs with the
corresponding addresses in the patched program.

bpf_die() also handles the non-inlined bpf_loop() scenario; this could
later be extended to other unrestricted iterators.

Signed-off-by: Raj <rjsu26@xxxxxxxxx>
Signed-off-by: Siddharth <sidchintamaneni@xxxxxxxxx>
---
 arch/x86/kernel/smp.c  |   4 +-
 include/linux/filter.h |  16 +++++
 include/linux/smp.h    |   2 +-
 kernel/bpf/syscall.c   | 159 +++++++++++++++++++++++++++++++++++++++++
 kernel/smp.c           |  22 ++++--
 5 files changed, 193 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 18266cc3d98c..aca5a97be19f 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -259,7 +259,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_call_function)
 	apic_eoi();
 	trace_call_function_entry(CALL_FUNCTION_VECTOR);
 	inc_irq_stat(irq_call_count);
-	generic_smp_call_function_interrupt();
+	generic_smp_call_function_interrupt(regs);
 	trace_call_function_exit(CALL_FUNCTION_VECTOR);
 }
 
@@ -268,7 +268,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_call_function_single)
 	apic_eoi();
 	trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
 	inc_irq_stat(irq_call_count);
-	generic_smp_call_function_single_interrupt();
+	generic_smp_call_function_single_interrupt(regs);
 	trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR);
 }
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index f5cf4d35d83e..cb75f62a1357 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -689,10 +689,21 @@ extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log,
 				     const struct bpf_reg_state *reg,
 				     int off, int size);
 
+void bpf_die(void *data);
 typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx,
 					  const struct bpf_insn *insnsi,
 					  unsigned int (*bpf_func)(const void *,
 								   const struct bpf_insn *));
+static inline void update_term_per_cpu_flag(const struct bpf_prog *prog, u8 cpu_flag)
+{
+	unsigned long flags;
+	u32 cpu_id = raw_smp_processor_id();
+
+	spin_lock_irqsave(&prog->termination_states->per_cpu_state[cpu_id].lock, flags);
+	prog->termination_states->per_cpu_state[cpu_id].cpu_flag = cpu_flag;
+	spin_unlock_irqrestore(&prog->termination_states->per_cpu_state[cpu_id].lock,
+			       flags);
+}
 
 static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
 					  const void *ctx,
@@ -701,12 +712,15 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
 	u32 ret;
 
 	cant_migrate();
+
 	if (static_branch_unlikely(&bpf_stats_enabled_key)) {
 		struct bpf_prog_stats *stats;
 		u64 duration, start = sched_clock();
 		unsigned long flags;
 
+		update_term_per_cpu_flag(prog, 1);
 		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
+		update_term_per_cpu_flag(prog, 0);
 
 		duration = sched_clock() - start;
 		stats = this_cpu_ptr(prog->stats);
@@ -715,7 +729,9 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
 		u64_stats_add(&stats->nsecs, duration);
 		u64_stats_update_end_irqrestore(&stats->syncp, flags);
 	} else {
+		update_term_per_cpu_flag(prog, 1);
 		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
+		update_term_per_cpu_flag(prog, 0);
 	}
 	return ret;
 }
diff --git a/include/linux/smp.h b/include/linux/smp.h
index f1aa0952e8c3..a0d8b3263a15 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -173,7 +173,7 @@ void wake_up_all_idle_cpus(void);
  * Generic and arch helpers
  */
 void __init call_function_init(void);
-void generic_smp_call_function_single_interrupt(void);
+void generic_smp_call_function_single_interrupt(struct pt_regs *regs);
 #define generic_smp_call_function_interrupt \
 	generic_smp_call_function_single_interrupt
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index fb54c5e948ff..c5911b67eb15 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -6008,6 +6008,162 @@ static int token_create(union bpf_attr *attr)
 	return bpf_token_create(attr);
 }
 
+static bool per_cpu_flag_is_true(struct termination_aux_states *term_states, int cpu_id)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&term_states->per_cpu_state[cpu_id].lock, flags);
+	if (term_states->per_cpu_state[cpu_id].cpu_flag == 1) {
+		spin_unlock_irqrestore(&term_states->per_cpu_state[cpu_id].lock, flags);
+		return true;
+	}
+	spin_unlock_irqrestore(&term_states->per_cpu_state[cpu_id].lock, flags);
+	return false;
+}
+
+static int is_bpf_address(struct bpf_prog *prog, unsigned long addr)
+{
+	unsigned long bpf_func_addr = (unsigned long)prog->bpf_func;
+
+	if ((addr > bpf_func_addr) &&
+	    (addr < bpf_func_addr + prog->jited_len))
+		return 1;
+
+	for (int subprog = 1; subprog < prog->aux->func_cnt; subprog++) {
+		struct bpf_prog *bpf_subprog = prog->aux->func[subprog];
+		unsigned long bpf_subprog_func_addr =
+			(unsigned long)bpf_subprog->bpf_func;
+
+		if ((addr > bpf_subprog_func_addr) &&
+		    (addr < bpf_subprog_func_addr + bpf_subprog->jited_len))
+			return 1;
+	}
+
+	return 0;
+}
+
+static unsigned long find_offset_in_patch_prog(struct bpf_prog *patch_prog,
+					       struct bpf_prog *prog, unsigned long addr)
+{
+	unsigned long bpf_func_addr = (unsigned long)prog->bpf_func;
+
+	if ((addr > bpf_func_addr) &&
+	    (addr < bpf_func_addr + prog->jited_len)) {
+		unsigned long offset = addr - bpf_func_addr;
+
+		return (unsigned long)patch_prog->bpf_func + offset;
+	}
+
+	for (int subprog = 1; subprog < prog->aux->func_cnt; subprog++) {
+		struct bpf_prog *bpf_subprog = prog->aux->func[subprog];
+		unsigned long bpf_subprog_func_addr =
+			(unsigned long)bpf_subprog->bpf_func;
+
+		if ((addr > bpf_subprog_func_addr) &&
+		    (addr < bpf_subprog_func_addr + bpf_subprog->jited_len)) {
+			unsigned long offset = addr - bpf_subprog_func_addr;
+
+			return (unsigned long)patch_prog->aux->func[subprog]->bpf_func + offset;
+		}
+	}
+
+	return -EINVAL;
+}
+
+void bpf_die(void *data)
+{
+	struct unwind_state state;
+	struct bpf_prog *prog, *patch_prog;
+	struct pt_regs *regs;
+	char str[KSYM_SYMBOL_LEN];
+	unsigned long addr, new_addr, bpf_loop_addr, bpf_loop_term_addr;
+	int cpu_id = raw_smp_processor_id();
+
+	prog = (struct bpf_prog *)data;
+	patch_prog = prog->termination_states->patch_prog;
+
+	if (!per_cpu_flag_is_true(prog->termination_states, cpu_id))
+		return;
+
+	regs = &prog->termination_states->pre_execution_state[cpu_id];
+	bpf_loop_addr = (unsigned long)bpf_loop_proto.func;
+	bpf_loop_term_addr = (unsigned long)bpf_loop_termination_proto.func;
+
+	unwind_start(&state, current, regs, NULL);
+	addr = unwind_get_return_address(&state);
+
+	/* The BPF program's RIP is still in BPF program context when the
+	 * termination signal raises the IPI.
+	 */
+	if (is_bpf_address(prog, addr)) {
+		new_addr = find_offset_in_patch_prog(patch_prog, prog, addr);
+		if (IS_ERR_VALUE(new_addr))
+			return;
+		regs->ip = new_addr;
+	}
+
+	unsigned long stack_addr = regs->sp;
+
+	while (addr) {
+		if (is_bpf_address(prog, addr)) {
+			while (*(unsigned long *)stack_addr != addr) {
+				stack_addr += 1;
+			}
+			new_addr = find_offset_in_patch_prog(patch_prog, prog, addr);
+			if (IS_ERR_VALUE(new_addr))
+				return;
+			*(unsigned long *)stack_addr = new_addr;
+		} else {
+			/* Handles terminating a non-inlined bpf_loop. Could be
+			 * made modular and later extended to other iterators.
+			 */
+			const char *name = kallsyms_lookup(addr, NULL, NULL, NULL, str);
+
+			if (name) {
+				unsigned long lookup_addr = kallsyms_lookup_name(name);
+
+				if (lookup_addr && lookup_addr == bpf_loop_addr) {
+					while (*(unsigned long *)stack_addr != addr) {
+						stack_addr += 1;
+					}
+					*(unsigned long *)stack_addr = bpf_loop_term_addr;
+				}
+			}
+		}
+		unwind_next_frame(&state);
+		addr = unwind_get_return_address(&state);
+	}
+
+	atomic64_dec(&prog->aux->refcnt);
+}
+
+static int bpf_prog_terminate(union bpf_attr *attr)
+{
+	struct bpf_prog *prog;
+	struct termination_aux_states *term_states;
+	int cpu_id;
+
+	prog = bpf_prog_by_id(attr->prog_id);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	term_states = prog->termination_states;
+	if (!term_states)
+		return -ENOTSUPP;
+
+	cpu_id = attr->prog_terminate.term_cpu_id;
+	if (cpu_id < 0 || cpu_id >= NR_CPUS)
+		return -EINVAL;
+
+	if (!per_cpu_flag_is_true(term_states, cpu_id))
+		return -EFAULT;
+
+	smp_call_function_single(cpu_id, bpf_die, (void *)prog, 1);
+
+	return 0;
+}
+
 static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
 {
 	union bpf_attr attr;
@@ -6144,6 +6300,9 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
 	case BPF_TOKEN_CREATE:
 		err = token_create(&attr);
 		break;
+	case BPF_PROG_TERMINATE:
+		err = bpf_prog_terminate(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/kernel/smp.c b/kernel/smp.c
index 974f3a3962e8..f4dcc493b63f 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -26,6 +26,7 @@
 #include <linux/sched/debug.h>
 #include <linux/jump_label.h>
 #include <linux/string_choices.h>
+#include <linux/filter.h>
 #include <trace/events/ipi.h>
 
 #define CREATE_TRACE_POINTS
@@ -49,7 +50,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
 
 static DEFINE_PER_CPU(atomic_t, trigger_backtrace) = ATOMIC_INIT(1);
 
-static void __flush_smp_call_function_queue(bool warn_cpu_offline);
+static void __flush_smp_call_function_queue(struct pt_regs *regs, bool warn_cpu_offline);
 
 int smpcfd_prepare_cpu(unsigned int cpu)
 {
@@ -94,7 +95,7 @@ int smpcfd_dying_cpu(unsigned int cpu)
 	 * ensure that the outgoing CPU doesn't go offline with work
 	 * still pending.
 	 */
-	__flush_smp_call_function_queue(false);
+	__flush_smp_call_function_queue(NULL, false);
 	irq_work_run();
 	return 0;
 }
@@ -452,14 +453,15 @@ static int generic_exec_single(int cpu, call_single_data_t *csd)
  * Invoked by arch to handle an IPI for call function single.
  * Must be called with interrupts disabled.
  */
-void generic_smp_call_function_single_interrupt(void)
+void generic_smp_call_function_single_interrupt(struct pt_regs *regs)
 {
-	__flush_smp_call_function_queue(true);
+	__flush_smp_call_function_queue(regs, true);
 }
 
 /**
  * __flush_smp_call_function_queue - Flush pending smp-call-function callbacks
  *
+ * @regs: register state of the CPU when the IPI interrupted it
  * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
  *		      offline CPU. Skip this check if set to 'false'.
  *
@@ -471,7 +473,7 @@ void generic_smp_call_function_single_interrupt(void)
  * Loop through the call_single_queue and run all the queued callbacks.
  * Must be called with interrupts disabled.
  */
-static void __flush_smp_call_function_queue(bool warn_cpu_offline)
+static void __flush_smp_call_function_queue(struct pt_regs *regs, bool warn_cpu_offline)
 {
 	call_single_data_t *csd, *csd_next;
 	struct llist_node *entry, *prev;
@@ -536,6 +538,12 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline)
 					entry = &csd_next->node.llist;
 				}
 
+				if (regs && func == bpf_die) {
+					int cpu_id = raw_smp_processor_id();
+					struct bpf_prog *prog = (struct bpf_prog *)info;
+
+					prog->termination_states->pre_execution_state[cpu_id] = *regs;
+				}
 				csd_lock_record(csd);
 				csd_do_func(func, info, csd);
 				csd_unlock(csd);
@@ -567,8 +575,8 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline)
 			void *info = csd->info;
 
 			csd_lock_record(csd);
-			csd_unlock(csd);
 			csd_do_func(func, info, csd);
+			csd_unlock(csd);
 			csd_lock_record(NULL);
 		} else if (type == CSD_TYPE_IRQ_WORK) {
 			irq_work_single(csd);
@@ -612,7 +620,7 @@ void flush_smp_call_function_queue(void)
 	local_irq_save(flags);
 	/* Get the already pending soft interrupts for RT enabled kernels */
 	was_pending = local_softirq_pending();
-	__flush_smp_call_function_queue(true);
+	__flush_smp_call_function_queue(NULL, true);
 
 	if (local_softirq_pending())
 		do_softirq_post_smp_call_flush(was_pending);
-- 
2.43.0
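
A minimal user-space sketch of how the new command could be exercised follows. It assumes the uapi additions from elsewhere in this series (the BPF_PROG_TERMINATE command and a prog_terminate member of union bpf_attr carrying term_cpu_id); that layout is not part of this patch, so the field names below only mirror how bpf_prog_terminate() reads the attribute and should be treated as assumptions.

/* Illustrative sketch only: BPF_PROG_TERMINATE and attr.prog_terminate
 * are assumed to come from the uapi patch elsewhere in this series.
 */
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int bpf_prog_terminate_on_cpu(uint32_t prog_id, uint32_t cpu)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_id = prog_id;				/* program to terminate */
	attr.prog_terminate.term_cpu_id = cpu;		/* CPU the program is running on */

	/* The kernel IPIs @cpu and bpf_die() redirects the interrupted
	 * program into its patched counterpart; the call fails with EFAULT
	 * if the program is not currently executing on that CPU.
	 */
	return syscall(__NR_bpf, BPF_PROG_TERMINATE, &attr, sizeof(attr));
}

On success the targeted CPU leaves the BPF program through the patched image instead of completing the original program.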