Raj Sahu <rjsu26@xxxxxxxxx> writes:

[...]

> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
> index 380e9a7cac75..e5dceebfb302 100644
> --- a/kernel/bpf/bpf_iter.c
> +++ b/kernel/bpf/bpf_iter.c
> @@ -6,6 +6,7 @@
>  #include <linux/filter.h>
>  #include <linux/bpf.h>
>  #include <linux/rcupdate_trace.h>
> +#include <asm/unwind.h>
>  
>  struct bpf_iter_target_info {
>  	struct list_head list;
> @@ -775,6 +776,70 @@ const struct bpf_func_proto bpf_loop_proto = {
>  	.arg4_type	= ARG_ANYTHING,
>  };
>  
> +BPF_CALL_4(bpf_loop_termination, u32, nr_loops, void *, callback_fn,
> +	   void *, callback_ctx, u64, flags)
> +{
> +	/* Since a patched BPF program for termination will want
> +	 * to finish as fast as possible,
> +	 * we simply don't run any loop in here.
> +	 */
> +
> +	/* Restoring the callee-saved registers and exit.
> +	 * Hacky/ err prone way of restoring the registers.
> +	 * We are figuring out a way to have arch independent
> +	 * way to do this.
> +	 */
> +
> +	asm volatile(
> +		"pop %rbx\n\t"
> +		"pop %rbp\n\t"
> +		"pop %r12\n\t"
> +		"pop %r13\n\t"
> +	);

Why do anything special here?
I'd expect a bpf_loop() version simplified to a single return to work.
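E.g. something along these lines (untested sketch; the corresponding
bpf_func_proto could presumably stay identical to bpf_loop_proto with
only .func swapped):

BPF_CALL_4(bpf_loop_termination, u32, nr_loops, void *, callback_fn,
	   void *, callback_ctx, u64, flags)
{
	/* Termination path: never enter the loop body, report zero
	 * completed iterations and let the regular function epilogue
	 * restore the callee-saved registers.
	 */
	return 0;
}
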
> +
> +	return 0;
> +}

[...]

> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c

[...]

> +
> +static bool find_in_skiplist(int func_id) {

Nit: "skip list" is a name of an (unrelated) data structure,
     maybe disambiguate the name here?

> +	return is_verifier_inlined_function(func_id) ||
> +		is_debug_function(func_id) ||
> +		is_resource_release_function(func_id);
> +}
> +
> +static int get_replacement_helper(int func_id, enum bpf_return_type ret_type) {
> +
> +	switch (func_id) {
> +	case BPF_FUNC_loop:
> +		return BPF_FUNC_loop_termination;
> +	case BPF_FUNC_for_each_map_elem:
> +	case BPF_FUNC_user_ringbuf_drain:
> +		return -ENOTSUPP;
> +	}
> +
> +	switch (ret_type) {
> +	case RET_VOID:
> +		return BPF_FUNC_dummy_void;
> +	case RET_INTEGER:
> +		return BPF_FUNC_dummy_int;
> +	case RET_PTR_TO_MAP_VALUE_OR_NULL:
> +		return BPF_FUNC_dummy_ptr_to_map;
> +	case RET_PTR_TO_SOCKET_OR_NULL:
> +	case RET_PTR_TO_TCP_SOCK_OR_NULL:
> +	case RET_PTR_TO_SOCK_COMMON_OR_NULL:
> +	case RET_PTR_TO_RINGBUF_MEM_OR_NULL:
> +	case RET_PTR_TO_DYNPTR_MEM_OR_NULL:
> +	case RET_PTR_TO_BTF_ID_OR_NULL:
> +	case RET_PTR_TO_BTF_ID_TRUSTED:
> +	case RET_PTR_TO_MAP_VALUE:
> +	case RET_PTR_TO_SOCKET:
> +	case RET_PTR_TO_TCP_SOCK:
> +	case RET_PTR_TO_SOCK_COMMON:
> +	case RET_PTR_TO_MEM:
> +	case RET_PTR_TO_MEM_OR_BTF_ID:
> +	case RET_PTR_TO_BTF_ID:

I'm curious what's the plan for RET_PTR_TO_BTF_ID?
verifier.c:check_ptr_to_btf_access() has the following comment:

	/* By default any pointer obtained from walking a trusted pointer is no
	 * longer trusted, unless the field being accessed has explicitly been
	 * marked as inheriting its parent's state of trust (either full or RCU).
	 * For example:
	 * 'cgroups' pointer is untrusted if task->cgroups dereference
	 * happened in a sleepable program outside of bpf_rcu_read_lock()
	 * section. In a non-sleepable program it's trusted while in RCU CS (aka MEM_RCU).
	 * Note bpf_rcu_read_unlock() converts MEM_RCU pointers to PTR_UNTRUSTED.
	 *
	 * A regular RCU-protected pointer with __rcu tag can also be deemed
	 * trusted if we are in an RCU CS. Such pointer can be NULL.
	 */

> +	default:
> +		return -ENOTSUPP;
> +	}
> +}
> +
> +static void patch_generator(struct bpf_prog *prog)
> +{
> +	struct call_insn_aux{
> +		int insn_idx;
> +		int replacement_helper;
> +	};
> +
> +	struct call_insn_aux *call_indices;
> +	int num_calls=0;
> +	call_indices = vmalloc(sizeof(call_indices) * prog->len);

clang-tidy spotted a warning here, the expression should be
`sizeof(*call_indices)`.

> +
> +	/* Find all call insns */
> +	for(int insn_idx =0 ;insn_idx < prog->len; insn_idx++)
> +	{
> +		struct bpf_insn *insn = &prog->insnsi[insn_idx] ;
> +		u8 class = BPF_CLASS(insn->code);
> +		if (class == BPF_JMP || class == BPF_JMP32) {
> +			if (BPF_OP(insn->code) == BPF_CALL){
> +				if (insn->src_reg == BPF_PSEUDO_CALL) {
> +					continue;
> +				}
> +				if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL){ /*kfunc */
> +					// TODO Need to use btf for getting proto
> +					// If release function --> skip
> +					// If acquire function --> find return type and add to list
> +				}
> +				else {
> +					int func_id = insn->imm;
> +					const struct bpf_func_proto *fn = NULL;
> +					int new_helper_id = -1;
> +
> +					if (find_in_skiplist(func_id)) {
> +						continue;
> +					}
> +
> +					fn = bpf_verifier_ops[prog->type]->get_func_proto(func_id, prog);

Nit: please reuse verifier.c:get_helper_proto() utility function here.

> +					if (!fn && !fn->func) {
> +						continue;
> +					}
> +
> +					new_helper_id = get_replacement_helper(func_id, fn->ret_type);
> +					if (new_helper_id < 0) {
> +						continue;

Nit: propagate error?

> +					}
> +
> +					call_indices[num_calls].insn_idx = insn_idx;
> +					call_indices[num_calls].replacement_helper= new_helper_id;
> +					num_calls++;
> +				}
> +			}
> +		}
> +	}
> +
> +	/* Patch all call insns */
> +	for(int k =0; k < num_calls; k++){
> +		prog->insnsi[call_indices[k].insn_idx].imm = call_indices[k].replacement_helper;

Nit: each instruction is visited only once, so it appears that patching
     can be done in-place w/o need for call_indices array.
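     E.g. the two passes could collapse into something like this
     (rough, untested sketch; kfunc handling elided the same way as in
     your version, reusing find_in_skiplist()/get_replacement_helper()
     from the patch):

	for (int insn_idx = 0; insn_idx < prog->len; insn_idx++) {
		struct bpf_insn *insn = &prog->insnsi[insn_idx];
		const struct bpf_func_proto *fn;
		int new_helper_id;

		/* helper calls are BPF_JMP|BPF_CALL with src_reg == 0 */
		if (BPF_CLASS(insn->code) != BPF_JMP ||
		    BPF_OP(insn->code) != BPF_CALL ||
		    insn->src_reg != 0)
			continue;
		if (find_in_skiplist(insn->imm))
			continue;
		fn = bpf_verifier_ops[prog->type]->get_func_proto(insn->imm, prog);
		if (!fn || !fn->func)	/* note: '||', the '&&' above looks off */
			continue;
		new_helper_id = get_replacement_helper(insn->imm, fn->ret_type);
		if (new_helper_id < 0)
			continue;
		insn->imm = new_helper_id;	/* patch in place */
	}
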
> +	}
> +}
> +
> +static bool create_termination_prog(struct bpf_prog *prog,
> +				union bpf_attr *attr,
> +				bpfptr_t uattr,
> +				u32 uattr_size)
> +{
> +	if (prog->len < 10)
> +		return false;
> +
> +	int err;
> +	struct bpf_prog *patch_prog;
> +	patch_prog = bpf_prog_alloc_no_stats(bpf_prog_size(prog->len), 0);
> +	if (!patch_prog) {
> +		return false;
> +	}
> +
> +	patch_prog->termination_states->is_termination_prog = true;
> +
> +	err = clone_bpf_prog(patch_prog, prog);
> +	if (err)
> +		goto free_termination_prog;
> +
> +	patch_generator(patch_prog);
> +
> +	err = bpf_check(&patch_prog, attr, uattr, uattr_size);

There might be issues with program verification if bpf_check is called
for a patched program. For example, for a full implementation you'd
need to handle kfuncs: replacing loops like:

    while (bpf_iter_next()) {}

with loops like:

    while (dummy_return_null()) {}

won't work, as the verifier would assume that the loop is infinite.
In general, check_helper_call, check_kfunc_call and is_state_visited
have logic for specific helpers/kfuncs that modifies verifier state
based not only on the helper return value type.

What do you plan to do with functions that unconditionally take
resources, e.g. bpf_spin_lock()?
- From verification point of view:
  this function is RET_VOID and is not in find_in_skiplist(), so
  patch_generator() would replace its call with a dummy.
  However, a corresponding bpf_spin_unlock() would remain and thus
  bpf_check() will exit with error.
  So, you would need some special version of bpf_check, that collects
  all resources needed for program translation (e.g. maps), but does
  not perform semantic checks.
  Or patch_generator() has to be called for a program that is already
  verified.
- From termination point of view:
  this function cannot be replaced with a dummy w/o removing the
  corresponding unlock. Do you plan to leave these as-is?
  (There is a small example at the end of this mail to illustrate what
  I mean.)

> +	if (err) {
> +		goto free_termination_prog;
> +	}
> +
> +	patch_prog = bpf_prog_select_runtime(patch_prog, &err);
> +	if (err) {
> +		goto free_termination_prog;
> +	}
> +
> +	prog->termination_states->patch_prog = patch_prog;
> +	return true;
> +
> +free_termination_prog:
> +	free_percpu(patch_prog->stats);
> +	free_percpu(patch_prog->active);
> +	kfree(patch_prog->aux);
> +	return false;

In case of a load failure of the main program bpf_prog_load() does much
more cleanup.

[...]

> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 54c6953a8b84..57b4fd1f6a72 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c

[...]

> @@ -22502,7 +22506,14 @@ static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
>  	 */
>  	insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt);
>  	insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx);
> -	insn_buf[cnt++] = BPF_CALL_REL(0);
> +
> +	if (termination_states && termination_states->is_termination_prog) {
> +		/* In a termination BPF prog, we want to exit - set R0 = 1 */
> +		insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_0, 1);
> +	} else {
> +		insn_buf[cnt++] = BPF_CALL_REL(0);
> +	}
> +

Q: Do you need 1:1 instruction pointer correspondence between original
   and patched programs?
   Since you patch inlined bpf_loop instead of disallowing inlining in
   the patched program I assume you do.
   In this case, there is no guarantee that the instructions above have
   the same size after jit.

>  	/* increment loop counter */
>  	insn_buf[cnt++] = BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1);
>  	/* jump to loop header if callback returned 0 */

[...]
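To illustrate the bpf_spin_lock() point above, a made-up selftest-style
program of the following shape is what I have in mind (all names are
invented for the example):

/* sketch; assumes vmlinux.h + libbpf helpers */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct elem {
	struct bpf_spin_lock lock;
	int cnt;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, int);
	__type(value, struct elem);
} counters SEC(".maps");

SEC("tc")
int count_packets(struct __sk_buff *skb)
{
	int key = 0;
	struct elem *e = bpf_map_lookup_elem(&counters, &key);

	if (!e)
		return 0;
	bpf_spin_lock(&e->lock);	/* RET_VOID -> replaced with a dummy */
	e->cnt++;
	bpf_spin_unlock(&e->lock);	/* release function -> kept by the skip list */
	return 0;
}

char _license[] SEC("license") = "GPL";

Here patch_generator() would turn the bpf_spin_lock() call into a
RET_VOID dummy while the skip list keeps bpf_spin_unlock(), so I'd
expect bpf_check() to reject the patched program because of the
unpaired unlock.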