Raj Sahu <rjsu26@xxxxxxxxx> writes:

[...]

> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
> index 380e9a7cac75..e5dceebfb302 100644
> --- a/kernel/bpf/bpf_iter.c
> +++ b/kernel/bpf/bpf_iter.c
> @@ -6,6 +6,7 @@
>  #include <linux/filter.h>
>  #include <linux/bpf.h>
>  #include <linux/rcupdate_trace.h>
> +#include <asm/unwind.h>
>  
>  struct bpf_iter_target_info {
>  	struct list_head list;
> @@ -775,6 +776,70 @@ const struct bpf_func_proto bpf_loop_proto = {
>  	.arg4_type	= ARG_ANYTHING,
>  };
>  
> +BPF_CALL_4(bpf_loop_termination, u32, nr_loops, void *, callback_fn,
> +	   void *, callback_ctx, u64, flags)
> +{
> +	/* Since a patched BPF program for termination will want
> +	 * to finish as fast as possible,
> +	 * we simply don't run any loop in here.
> +	 */
> +
> +	/* Restoring the callee-saved registers and exit.
> +	 * Hacky/ err prone way of restoring the registers.
> +	 * We are figuring out a way to have arch independent
> +	 * way to do this.
> +	 */
> +
> +	asm volatile(
> +		"pop %rbx\n\t"
> +		"pop %rbp\n\t"
> +		"pop %r12\n\t"
> +		"pop %r13\n\t"
> +	);

Why do anything special here?
I'd expect a bpf_loop() version simplified to a single return to work.
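E.g. something along these lines (untested sketch; the corresponding
bpf_func_proto could presumably stay identical to bpf_loop_proto with
only .func swapped):

BPF_CALL_4(bpf_loop_termination, u32, nr_loops, void *, callback_fn,
	   void *, callback_ctx, u64, flags)
{
	/* Termination path: never enter the loop body, report zero
	 * completed iterations and let the regular function epilogue
	 * restore the callee-saved registers.
	 */
	return 0;
}
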
> +
> +	return 0;
> +}

[...]

> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c

[...]

> +
> +static bool find_in_skiplist(int func_id) {

Nit: "skip list" is a name of an (unrelated) data structure,
     maybe disambiguate the name here?

> +	return is_verifier_inlined_function(func_id) ||
> +		is_debug_function(func_id) ||
> +		is_resource_release_function(func_id);
> +}
> +
> +static int get_replacement_helper(int func_id, enum bpf_return_type ret_type) {
> +
> +	switch (func_id) {
> +	case BPF_FUNC_loop:
> +		return BPF_FUNC_loop_termination;
> +	case BPF_FUNC_for_each_map_elem:
> +	case BPF_FUNC_user_ringbuf_drain:
> +		return -ENOTSUPP;
> +	}
> +
> +	switch (ret_type) {
> +	case RET_VOID:
> +		return BPF_FUNC_dummy_void;
> +	case RET_INTEGER:
> +		return BPF_FUNC_dummy_int;
> +	case RET_PTR_TO_MAP_VALUE_OR_NULL:
> +		return BPF_FUNC_dummy_ptr_to_map;
> +	case RET_PTR_TO_SOCKET_OR_NULL:
> +	case RET_PTR_TO_TCP_SOCK_OR_NULL:
> +	case RET_PTR_TO_SOCK_COMMON_OR_NULL:
> +	case RET_PTR_TO_RINGBUF_MEM_OR_NULL:
> +	case RET_PTR_TO_DYNPTR_MEM_OR_NULL:
> +	case RET_PTR_TO_BTF_ID_OR_NULL:
> +	case RET_PTR_TO_BTF_ID_TRUSTED:
> +	case RET_PTR_TO_MAP_VALUE:
> +	case RET_PTR_TO_SOCKET:
> +	case RET_PTR_TO_TCP_SOCK:
> +	case RET_PTR_TO_SOCK_COMMON:
> +	case RET_PTR_TO_MEM:
> +	case RET_PTR_TO_MEM_OR_BTF_ID:
> +	case RET_PTR_TO_BTF_ID:

I'm curious what's the plan for RET_PTR_TO_BTF_ID?
verifier.c:check_ptr_to_btf_access() has the following comment:

	/* By default any pointer obtained from walking a trusted pointer is no
	 * longer trusted, unless the field being accessed has explicitly been
	 * marked as inheriting its parent's state of trust (either full or RCU).
	 * For example:
	 * 'cgroups' pointer is untrusted if task->cgroups dereference
	 * happened in a sleepable program outside of bpf_rcu_read_lock()
	 * section. In a non-sleepable program it's trusted while in RCU CS (aka MEM_RCU).
	 * Note bpf_rcu_read_unlock() converts MEM_RCU pointers to PTR_UNTRUSTED.
	 *
	 * A regular RCU-protected pointer with __rcu tag can also be deemed
	 * trusted if we are in an RCU CS. Such pointer can be NULL.
	 */

> +	default:
> +		return -ENOTSUPP;
> +	}
> +}
> +
> +static void patch_generator(struct bpf_prog *prog)
> +{
> +	struct call_insn_aux{
> +		int insn_idx;
> +		int replacement_helper;
> +	};
> +
> +	struct call_insn_aux *call_indices;
> +	int num_calls=0;
> +	call_indices = vmalloc(sizeof(call_indices) * prog->len);

clang-tidy spotted a warning here, the expression should be
`sizeof(*call_indices)`.

> +
> +	/* Find all call insns */
> +	for(int insn_idx =0 ;insn_idx < prog->len; insn_idx++)
> +	{
> +		struct bpf_insn *insn = &prog->insnsi[insn_idx] ;
> +		u8 class = BPF_CLASS(insn->code);
> +		if (class == BPF_JMP || class == BPF_JMP32) {
> +			if (BPF_OP(insn->code) == BPF_CALL){
> +				if (insn->src_reg == BPF_PSEUDO_CALL) {
> +					continue;
> +				}
> +				if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL){ /*kfunc */
> +					// TODO Need to use btf for getting proto
> +					// If release function --> skip
> +					// If acquire function --> find return type and add to list
> +				}
> +				else {
> +					int func_id = insn->imm;
> +					const struct bpf_func_proto *fn = NULL;
> +					int new_helper_id = -1;
> +
> +					if (find_in_skiplist(func_id)) {
> +						continue;
> +					}
> +
> +					fn = bpf_verifier_ops[prog->type]->get_func_proto(func_id, prog);

Nit: please reuse verifier.c:get_helper_proto() utility function here.

> +					if (!fn && !fn->func) {
> +						continue;
> +					}
> +
> +					new_helper_id = get_replacement_helper(func_id, fn->ret_type);
> +					if (new_helper_id < 0) {
> +						continue;

Nit: propagate error?

> +					}
> +
> +					call_indices[num_calls].insn_idx = insn_idx;
> +					call_indices[num_calls].replacement_helper= new_helper_id;
> +					num_calls++;
> +				}
> +			}
> +		}
> +	}
> +
> +	/* Patch all call insns */
> +	for(int k =0; k < num_calls; k++){
> +		prog->insnsi[call_indices[k].insn_idx].imm = call_indices[k].replacement_helper;

Nit: each instruction is visited only once, so it appears that patching
     can be done in-place w/o need for call_indices array.
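     E.g. the two passes could collapse into something like this
     (rough, untested sketch; kfunc handling elided the same way as in
     your version, reusing find_in_skiplist()/get_replacement_helper()
     from the patch):

	for (int insn_idx = 0; insn_idx < prog->len; insn_idx++) {
		struct bpf_insn *insn = &prog->insnsi[insn_idx];
		const struct bpf_func_proto *fn;
		int new_helper_id;

		/* helper calls are BPF_JMP|BPF_CALL with src_reg == 0 */
		if (BPF_CLASS(insn->code) != BPF_JMP ||
		    BPF_OP(insn->code) != BPF_CALL ||
		    insn->src_reg != 0)
			continue;
		if (find_in_skiplist(insn->imm))
			continue;
		fn = bpf_verifier_ops[prog->type]->get_func_proto(insn->imm, prog);
		if (!fn || !fn->func)	/* note: '||', the '&&' above looks off */
			continue;
		new_helper_id = get_replacement_helper(insn->imm, fn->ret_type);
		if (new_helper_id < 0)
			continue;
		insn->imm = new_helper_id;	/* patch in place */
	}
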
> +	}
> +}
> +
> +static bool create_termination_prog(struct bpf_prog *prog,
> +				union bpf_attr *attr,
> +				bpfptr_t uattr,
> +				u32 uattr_size)
> +{
> +	if (prog->len < 10)
> +		return false;
> +
> +	int err;
> +	struct bpf_prog *patch_prog;
> +	patch_prog = bpf_prog_alloc_no_stats(bpf_prog_size(prog->len), 0);
> +	if (!patch_prog) {
> +		return false;
> +	}
> +
> +	patch_prog->termination_states->is_termination_prog = true;
> +
> +	err = clone_bpf_prog(patch_prog, prog);
> +	if (err)
> +		goto free_termination_prog;
> +
> +	patch_generator(patch_prog);
> +
> +	err = bpf_check(&patch_prog, attr, uattr, uattr_size);

There might be issues with program verification if bpf_check is called
for a patched program. For example, for a full implementation you'd
need to handle kfuncs: replacing loops like:

    while (bpf_iter_next()) {}

with loops like:

    while (dummy_return_null()) {}

won't work, as the verifier would assume that the loop is infinite.
In general, check_helper_call, check_kfunc_call and is_state_visited
have logic for specific helpers/kfuncs that modifies verifier state
based not only on the helper return value type.

What do you plan to do with functions that unconditionally take
resources, e.g. bpf_spin_lock()?
- From verification point of view:
  this function is RET_VOID and is not in find_in_skiplist(), so
  patch_generator() would replace its call with a dummy.
  However, a corresponding bpf_spin_unlock() would remain and thus
  bpf_check() will exit with error.
  So, you would need some special version of bpf_check, that collects
  all resources needed for program translation (e.g. maps), but does
  not perform semantic checks.
  Or patch_generator() has to be called for a program that is already
  verified.
- From termination point of view:
  this function cannot be replaced with a dummy w/o removing the
  corresponding unlock. Do you plan to leave these as-is?
  (There is a small example at the end of this mail to illustrate what
  I mean.)

> +	if (err) {
> +		goto free_termination_prog;
> +	}
> +
> +	patch_prog = bpf_prog_select_runtime(patch_prog, &err);
> +	if (err) {
> +		goto free_termination_prog;
> +	}
> +
> +	prog->termination_states->patch_prog = patch_prog;
> +	return true;
> +
> +free_termination_prog:
> +	free_percpu(patch_prog->stats);
> +	free_percpu(patch_prog->active);
> +	kfree(patch_prog->aux);
> +	return false;

In case of a load failure of the main program bpf_prog_load() does much
more cleanup.

[...]

> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 54c6953a8b84..57b4fd1f6a72 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c

[...]

> @@ -22502,7 +22506,14 @@ static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
>  	 */
>  	insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt);
>  	insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx);
> -	insn_buf[cnt++] = BPF_CALL_REL(0);
> +
> +	if (termination_states && termination_states->is_termination_prog) {
> +		/* In a termination BPF prog, we want to exit - set R0 = 1 */
> +		insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_0, 1);
> +	} else {
> +		insn_buf[cnt++] = BPF_CALL_REL(0);
> +	}
> +

Q: Do you need 1:1 instruction pointer correspondence between original
   and patched programs?
   Since you patch inlined bpf_loop instead of disallowing inlining in
   the patched program I assume you do.
   In this case, there is no guarantee that the instructions above have
   the same size after jit.

>  	/* increment loop counter */
>  	insn_buf[cnt++] = BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1);
>  	/* jump to loop header if callback returned 0 */

[...]
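To illustrate the bpf_spin_lock() point above, a made-up selftest-style
program of the following shape is what I have in mind (all names are
invented for the example):

/* sketch; assumes vmlinux.h + libbpf helpers */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct elem {
	struct bpf_spin_lock lock;
	int cnt;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, int);
	__type(value, struct elem);
} counters SEC(".maps");

SEC("tc")
int count_packets(struct __sk_buff *skb)
{
	int key = 0;
	struct elem *e = bpf_map_lookup_elem(&counters, &key);

	if (!e)
		return 0;
	bpf_spin_lock(&e->lock);	/* RET_VOID -> replaced with a dummy */
	e->cnt++;
	bpf_spin_unlock(&e->lock);	/* release function -> kept by the skip list */
	return 0;
}

char _license[] SEC("license") = "GPL";

Here patch_generator() would turn the bpf_spin_lock() call into a
RET_VOID dummy while the skip list keeps bpf_spin_unlock(), so I'd
expect bpf_check() to reject the patched program because of the
unpaired unlock.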