On Fri, May 2, 2025 at 12:06 PM Mykyta Yatsenko <mykyta.yatsenko5@xxxxxxxxx> wrote:
>
> From: Mykyta Yatsenko <yatsenko@xxxxxxxx>
>
> This patch introduces a new set of kfuncs for working with dynptrs in
> BPF programs, enabling reading variable-length user or kernel data
> into dynptr directly. To enable memory-safety, verifier allows only
> constant-sized reads via existing bpf_probe_read_{user|kernel} etc.
> kfuncs, dynptr-based kfuncs allow dynamically-sized reads without memory
> safety shortcomings.
>
> The following kfuncs are introduced:
> * `bpf_probe_read_kernel_dynptr()`: probes kernel-space data into a dynptr
> * `bpf_probe_read_user_dynptr()`: probes user-space data into a dynptr
> * `bpf_probe_read_kernel_str_dynptr()`: probes kernel-space string into
>   a dynptr
> * `bpf_probe_read_user_str_dynptr()`: probes user-space string into a
>   dynptr
> * `bpf_copy_from_user_dynptr()`: sleepable, copies user-space data into
>   a dynptr for the current task
> * `bpf_copy_from_user_str_dynptr()`: sleepable, copies user-space string
>   into a dynptr for the current task
> * `bpf_copy_from_user_task_dynptr()`: sleepable, copies user-space data
>   of the task into a dynptr
> * `bpf_copy_from_user_task_str_dynptr()`: sleepable, copies user-space
>   string of the task into a dynptr
>
> The implementation is built on two generic functions:
> * __bpf_dynptr_copy
> * __bpf_dynptr_copy_str
> These functions take function pointers as arguments, enabling the
> copying of data from various sources, including both kernel and user
> space. Notably, these indirect calls are typically inlined.
>
> Reviewed-by: Andrii Nakryiko <andrii@xxxxxxxxxx>
> Signed-off-by: Mykyta Yatsenko <yatsenko@xxxxxxxx>
> ---
>  kernel/bpf/helpers.c     |   8 ++
>  kernel/trace/bpf_trace.c | 199 +++++++++++++++++++++++++++++++++++++++
>  2 files changed, 207 insertions(+)
>
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index 2aad7c57425b..7d72d3e87324 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -3294,6 +3294,14 @@ BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLE
>  BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
>  BTF_ID_FLAGS(func, bpf_local_irq_save)
>  BTF_ID_FLAGS(func, bpf_local_irq_restore)
> +BTF_ID_FLAGS(func, bpf_probe_read_user_dynptr)
> +BTF_ID_FLAGS(func, bpf_probe_read_kernel_dynptr)
> +BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr)
> +BTF_ID_FLAGS(func, bpf_probe_read_kernel_str_dynptr)
> +BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE)
> +BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE)
> +BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE)
> +BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE)

They need to have KF_TRUSTED_ARGS, otherwise legacy ptr_to_btf_id can be
passed in.

>  BTF_KFUNCS_END(common_btf_ids)
>
>  static const struct btf_kfunc_id_set common_kfunc_set = {
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 52c432a44aeb..52926d572006 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -3499,6 +3499,147 @@ static int __init bpf_kprobe_multi_kfuncs_init(void)
>
>  late_initcall(bpf_kprobe_multi_kfuncs_init);
>
> +typedef int (*copy_fn_t)(void *dst, const void *src, u32 size, struct task_struct *tsk);
> +
> +static __always_inline int __bpf_dynptr_copy_str(struct bpf_dynptr *dptr, u32 doff, u32 size,

why always_inline?
patch 1 already adds overhead in non-LTO build, since small helper
bpf_dynptr_check_off_len() will not be inlined.
This __always_inline looks odd and probably wrong from optimizations pov.
All other __always_inline look wrong too.

> +                                                 const void __user *unsafe_src,
> +                                                 copy_fn_t str_copy_fn,
> +                                                 struct task_struct *tsk)
> +{
> +        struct bpf_dynptr_kern *dst;
> +        u32 chunk_sz, off;
> +        void *dst_slice;
> +        int cnt, err;
> +        char buf[256];
> +
> +        dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size);
> +        if (likely(dst_slice))
> +                return str_copy_fn(dst_slice, unsafe_src, size, tsk);
> +
> +        dst = (struct bpf_dynptr_kern *)dptr;
> +        if (bpf_dynptr_check_off_len(dst, doff, size))
> +                return -E2BIG;
> +
> +        for (off = 0; off < size; off += chunk_sz - 1) {
> +                chunk_sz = min_t(u32, sizeof(buf), size - off);
> +                /* Expect str_copy_fn to return count of copied bytes, including
> +                 * zero terminator. Next iteration increment off by chunk_sz - 1 to
> +                 * overwrite NUL.
> +                 */
> +                cnt = str_copy_fn(buf, unsafe_src + off, chunk_sz, tsk);
> +                if (cnt < 0)
> +                        return cnt;
> +                err = __bpf_dynptr_write(dst, doff + off, buf, cnt, 0);
> +                if (err)
> +                        return err;
> +                if (cnt < chunk_sz || chunk_sz == 1) /* we are done */
> +                        return off + cnt;
> +        }
> +        return off;
> +}
> +
> +static __always_inline int __bpf_dynptr_copy(const struct bpf_dynptr *dptr, u32 doff,
> +                                             u32 size, const void __user *unsafe_src,
> +                                             copy_fn_t copy_fn, struct task_struct *tsk)
> +{
> +        struct bpf_dynptr_kern *dst;
> +        void *dst_slice;
> +        char buf[256];
> +        u32 off, chunk_sz;
> +        int err;
> +
> +        dst_slice = bpf_dynptr_slice_rdwr(dptr, doff, NULL, size);
> +        if (likely(dst_slice))
> +                return copy_fn(dst_slice, unsafe_src, size, tsk);
> +
> +        dst = (struct bpf_dynptr_kern *)dptr;
> +        if (bpf_dynptr_check_off_len(dst, doff, size))
> +                return -E2BIG;
> +
> +        for (off = 0; off < size; off += chunk_sz) {
> +                chunk_sz = min_t(u32, sizeof(buf), size - off);
> +                err = copy_fn(buf, unsafe_src + off, chunk_sz, tsk);
> +                if (err)
> +                        return err;
> +                err = __bpf_dynptr_write(dst, doff + off, buf, chunk_sz, 0);
> +                if (err)
> +                        return err;
> +        }
> +        return 0;
> +}
> +
> +static __always_inline int copy_user_data_nofault(void *dst, const void __user *unsafe_src,
> +                                                  u32 size, struct task_struct *tsk)
> +{
> +        if (WARN_ON_ONCE(tsk))
> +                return -EFAULT;
> +
> +        return copy_from_user_nofault(dst, unsafe_src, size);
> +}
> +
> +static __always_inline int copy_user_data_sleepable(void *dst, const void __user *unsafe_src,
> +                                                    u32 size, struct task_struct *tsk)
> +{
> +        int ret;
> +
> +        if (!tsk) /* Read from the current task */
> +                return copy_from_user(dst, unsafe_src, size);
> +
> +        ret = access_process_vm(tsk, (unsigned long)unsafe_src, dst, size, 0);
> +        if (ret != size)
> +                return -EFAULT;
> +        return 0;
> +}
> +
> +static __always_inline int copy_kernel_data_nofault(void *dst, const void *unsafe_src,
> +                                                    u32 size, struct task_struct *tsk)
> +{
> +        if (WARN_ON_ONCE(tsk))
> +                return -EFAULT;

Why ?
Let's not do defensive programming.
The caller clearly passes NULL. Just drop this line.

> +
> +        return copy_from_kernel_nofault(dst, unsafe_src, size);
> +}
> +
> +static __always_inline int copy_user_str_nofault(void *dst, const void __user *unsafe_src,
> +                                                 u32 size, struct task_struct *tsk)
> +{
> +        if (WARN_ON_ONCE(tsk))
> +                return -EFAULT;

same here and further down.

pw-bot: cr
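
To illustrate the KF_TRUSTED_ARGS point above, the registration would
presumably look roughly like this (untested sketch only; exact flag
combination is up to you in the respin):

  BTF_ID_FLAGS(func, bpf_probe_read_user_dynptr, KF_TRUSTED_ARGS)
  BTF_ID_FLAGS(func, bpf_probe_read_kernel_dynptr, KF_TRUSTED_ARGS)
  BTF_ID_FLAGS(func, bpf_probe_read_user_str_dynptr, KF_TRUSTED_ARGS)
  BTF_ID_FLAGS(func, bpf_probe_read_kernel_str_dynptr, KF_TRUSTED_ARGS)
  BTF_ID_FLAGS(func, bpf_copy_from_user_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
  BTF_ID_FLAGS(func, bpf_copy_from_user_str_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
  BTF_ID_FLAGS(func, bpf_copy_from_user_task_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
  BTF_ID_FLAGS(func, bpf_copy_from_user_task_str_dynptr, KF_SLEEPABLE | KF_TRUSTED_ARGS)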
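
And on the defensive checks, what I mean is simply something like this
(sketch only, also dropping the __always_inline discussed above; the
nofault callers never pass a task):

  static int copy_kernel_data_nofault(void *dst, const void *unsafe_src,
                                      u32 size, struct task_struct *tsk)
  {
          /* tsk is always NULL for this callback, so no need to check it */
          return copy_from_kernel_nofault(dst, unsafe_src, size);
  }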