Implement the BPF global trampoline "bpf_global_caller" for x86_64. Thanks to Alexei's advice, most of the global trampoline is implemented in C instead of asm.

The entry of the trampoline is a "__naked" function, which saves the regs to an array on the stack and calls bpf_global_caller_run(), passing the address of that array and the address of the saved rip. bpf_global_caller_run() looks up the metadata by the function ip. For the origin call case, kfunc_md_enter() is called to protect the metadata, similar to __bpf_tramp_enter(). Then all the BPF progs are run, just like a BPF trampoline does. Without an origin call, bpf_global_caller_run() returns 0 and the entry restores the regs and returns; in the origin call case, it returns 1 and the entry makes RSP skip the rip before returning.

In the FENTRY case, the global trampoline is ~10% slower than the BPF trampoline. The global trampoline is optimized by inlining some function calls, such as __bpf_prog_enter_recur and __bpf_prog_exit_recur; however, bpf_global_caller still needs more conditions, branches and memory reads. In the FEXIT and MODIFY_RETURN cases, the performance of the global trampoline is the same as (or even better than) the BPF trampoline, which makes sense, as the calls to __bpf_tramp_enter and __bpf_tramp_exit are also inlined in bpf_global_caller.

More optimization of bpf_global_caller is still possible. For example, more bpf_global_caller_xx_run() variants can be defined so that the "if (prog->sleepable)" and "if (do_origin_call)" conditions become fixed at compile time. That can be done in a following series. After such optimization, I expect the performance of FENTRY_MULTI to be close or equal to FENTRY, and the FEXIT/MODIFY_RETURN cases to be even better.
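For review convenience, here is a minimal userspace analogue of the calling contract described above. It is illustration only and not part of the patch: the names (fake_prog, caller_run, run_progs, origin_add) are made up, the MODIFY_RETURN stage is omitted, and the real entry does the register save/restore and the RSP adjustment in asm, which a C sketch cannot show.

  #include <stdio.h>

  /* stand-in for the attached BPF progs of one attach type */
  struct fake_prog {
  	const char *name;
  	struct fake_prog *next;
  };

  /* the traced ("origin") function */
  static long origin_add(long a, long b)
  {
  	return a + b;
  }

  /* analogue of run_tramp_prog(): walk the prog list */
  static void run_progs(struct fake_prog *p, long *args)
  {
  	for (; p; p = p->next)
  		printf("running %s on (%ld, %ld)\n", p->name, args[0], args[1]);
  }

  /* analogue of bpf_global_caller_run(): return 0 when the entry should
   * restore the regs and fall through to the traced function, 1 when the
   * origin was already called here and the saved rip must be skipped.
   */
  static int caller_run(long *args, long *retval, struct fake_prog *fentry,
  		      struct fake_prog *fexit, int do_origin)
  {
  	run_progs(fentry, args);
  	if (!do_origin)
  		return 0;
  	*retval = origin_add(args[0], args[1]);
  	run_progs(fexit, args);
  	return 1;
  }

  int main(void)
  {
  	struct fake_prog fexit = { "fexit_prog", NULL };
  	struct fake_prog fentry = { "fentry_prog", NULL };
  	long args[2] = { 2, 40 }, ret = 0;

  	if (caller_run(args, &ret, &fentry, &fexit, 1))
  		printf("origin already called, ret = %ld\n", ret);
  	else
  		printf("fentry only: fall through to the origin\n");
  	return 0;
  }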
Signed-off-by: Menglong Dong <dongml2@xxxxxxxxxxxxxxx>
---
v2:
- rewrite the global trampoline with C instead of asm
---
 arch/x86/Kconfig            |   4 +
 arch/x86/net/bpf_jit_comp.c | 268 ++++++++++++++++++++++++++++++++++++
 include/linux/bpf_tramp.h   |  72 ++++++++++
 kernel/bpf/trampoline.c     |  23 +---
 4 files changed, 346 insertions(+), 21 deletions(-)
 create mode 100644 include/linux/bpf_tramp.h

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 71019b3b54ea..96962c61419a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -155,6 +155,7 @@ config X86
 	select ARCH_WANTS_THP_SWAP		if X86_64
 	select ARCH_HAS_PARANOID_L1D_FLUSH
 	select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
+	select ARCH_HAS_BPF_GLOBAL_CALLER	if X86_64
 	select BUILDTIME_TABLE_SORT
 	select CLKEVT_I8253
 	select CLOCKSOURCE_WATCHDOG
@@ -432,6 +433,9 @@ config PGTABLE_LEVELS
 	default 3 if X86_PAE
 	default 2
 
+config ARCH_HAS_BPF_GLOBAL_CALLER
+	bool
+
 menu "Processor type and features"
 
 config SMP
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 15672cb926fc..8d2fc436a748 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -11,6 +11,8 @@
 #include <linux/bpf.h>
 #include <linux/memory.h>
 #include <linux/sort.h>
+#include <linux/bpf_tramp.h>
+#include <linux/kfunc_md.h>
 #include <asm/extable.h>
 #include <asm/ftrace.h>
 #include <asm/set_memory.h>
@@ -3413,6 +3415,272 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
 	return ret;
 }
+#define FUNC_ARGS_0	((2 - 1) * 8)
+#define FUNC_ARGS_1	((2 + 0) * 8)
+#define FUNC_ARGS_2	((2 + 1) * 8)
+#define FUNC_ARGS_3	((2 + 2) * 8)
+#define FUNC_ARGS_4	((2 + 3) * 8)
+#define FUNC_ARGS_5	((2 + 4) * 8)
+#define FUNC_ARGS_6	((2 + 5) * 8)
+
+#define SAVE_ARGS_0
+#define SAVE_ARGS_1						\
+	"movq %rdi, " __stringify(FUNC_ARGS_1) "(%rsp)\n"
+#define SAVE_ARGS_2 SAVE_ARGS_1					\
+	"movq %rsi, " __stringify(FUNC_ARGS_2) "(%rsp)\n"
+#define SAVE_ARGS_3 SAVE_ARGS_2					\
+	"movq %rdx, " __stringify(FUNC_ARGS_3) "(%rsp)\n"
+#define SAVE_ARGS_4 SAVE_ARGS_3					\
+	"movq %rcx, " __stringify(FUNC_ARGS_4) "(%rsp)\n"
+#define SAVE_ARGS_5 SAVE_ARGS_4					\
+	"movq %r8, " __stringify(FUNC_ARGS_5) "(%rsp)\n"
+#define SAVE_ARGS_6 SAVE_ARGS_5					\
+	"movq %r9, " __stringify(FUNC_ARGS_6) "(%rsp)\n"	\
+
+#define RESTORE_ARGS_0
+#define RESTORE_ARGS_1						\
+	"movq " __stringify(FUNC_ARGS_1) "(%rsp), %rdi\n"
+#define RESTORE_ARGS_2 RESTORE_ARGS_1				\
+	"movq " __stringify(FUNC_ARGS_2) "(%rsp), %rsi\n"
+#define RESTORE_ARGS_3 RESTORE_ARGS_2				\
+	"movq " __stringify(FUNC_ARGS_3) "(%rsp), %rdx\n"
+#define RESTORE_ARGS_4 RESTORE_ARGS_3				\
+	"movq " __stringify(FUNC_ARGS_4) "(%rsp), %rcx\n"
+#define RESTORE_ARGS_5 RESTORE_ARGS_4				\
+	"movq " __stringify(FUNC_ARGS_5) "(%rsp), %r8\n"
+#define RESTORE_ARGS_6 RESTORE_ARGS_5				\
+	"movq " __stringify(FUNC_ARGS_6) "(%rsp), %r9\n"
+
+#define RESTORE_ORIGIN_0
+#define RESTORE_ORIGIN_1					\
+	"movq " __stringify(FUNC_ARGS_1 - FUNC_ARGS_1) "(%[args]), %%rdi\n"
+#define RESTORE_ORIGIN_2 RESTORE_ORIGIN_1			\
+	"movq " __stringify(FUNC_ARGS_2 - FUNC_ARGS_1) "(%[args]), %%rsi\n"
+#define RESTORE_ORIGIN_3 RESTORE_ORIGIN_2			\
+	"movq " __stringify(FUNC_ARGS_3 - FUNC_ARGS_1) "(%[args]), %%rdx\n"
+#define RESTORE_ORIGIN_4 RESTORE_ORIGIN_3			\
+	"movq " __stringify(FUNC_ARGS_4 - FUNC_ARGS_1) "(%[args]), %%rcx\n"
+#define RESTORE_ORIGIN_5 RESTORE_ORIGIN_4			\
+	"movq " __stringify(FUNC_ARGS_5 - FUNC_ARGS_1) "(%[args]), %%r8\n"
+#define RESTORE_ORIGIN_6 RESTORE_ORIGIN_5			\
+	"movq " __stringify(FUNC_ARGS_6 - FUNC_ARGS_1) "(%[args]), %%r9\n"
+
+static __always_inline void
+do_origin_call(unsigned long *args, unsigned long *ip, int nr_args)
+{
+	/* Following code will be optimized by the compiler, as nr_args
+	 * is a const, and there will be no condition here.
+	 */
+	if (nr_args == 0) {
+		asm volatile(
+			RESTORE_ORIGIN_0 CALL_NOSPEC "\n"
+			"movq %%rax, %0\n"
+			: "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+			: [args]"r"(args), [thunk_target]"r"(*ip)
+			:
+		);
+	} else if (nr_args == 1) {
+		asm volatile(
+			RESTORE_ORIGIN_1 CALL_NOSPEC "\n"
+			"movq %%rax, %0\n"
+			: "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+			: [args]"r"(args), [thunk_target]"r"(*ip)
+			: "rdi"
+		);
+	} else if (nr_args == 2) {
+		asm volatile(
+			RESTORE_ORIGIN_2 CALL_NOSPEC "\n"
+			"movq %%rax, %0\n"
+			: "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+			: [args]"r"(args), [thunk_target]"r"(*ip)
+			: "rdi", "rsi"
+		);
+	} else if (nr_args == 3) {
+		asm volatile(
+			RESTORE_ORIGIN_3 CALL_NOSPEC "\n"
+			"movq %%rax, %0\n"
+			: "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+			: [args]"r"(args), [thunk_target]"r"(*ip)
+			: "rdi", "rsi", "rdx"
+		);
+	} else if (nr_args == 4) {
+		asm volatile(
+			RESTORE_ORIGIN_4 CALL_NOSPEC "\n"
+			"movq %%rax, %0\n"
+			: "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+			: [args]"r"(args), [thunk_target]"r"(*ip)
+			: "rdi", "rsi", "rdx", "rcx"
+		);
+	} else if (nr_args == 5) {
+		asm volatile(
+			RESTORE_ORIGIN_5 CALL_NOSPEC "\n"
+			"movq %%rax, %0\n"
+			: "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+			: [args]"r"(args), [thunk_target]"r"(*ip)
+			: "rdi", "rsi", "rdx", "rcx", "r8"
+		);
+	} else if (nr_args == 6) {
+		asm volatile(
+			RESTORE_ORIGIN_6 CALL_NOSPEC "\n"
+			"movq %%rax, %0\n"
+			: "=m"(args[nr_args]), ASM_CALL_CONSTRAINT
+			: [args]"r"(args), [thunk_target]"r"(*ip)
+			: "rdi", "rsi", "rdx", "rcx", "r8", "r9"
+		);
+	}
+}
+
+static __always_inline notrace void
+run_tramp_prog(struct kfunc_md_tramp_prog *tramp_prog,
+	       struct bpf_tramp_run_ctx *run_ctx, unsigned long *args)
+{
+	struct bpf_prog *prog;
+	u64 start_time;
+
+	while (tramp_prog) {
+		prog = tramp_prog->prog;
+		run_ctx->bpf_cookie = tramp_prog->cookie;
+		start_time = bpf_gtramp_enter(prog, run_ctx);
+
+		if (likely(start_time)) {
+			asm volatile(
+				CALL_NOSPEC "\n"
+				: : [thunk_target]"r"(prog->bpf_func), [args]"D"(args)
+			);
+		}
+
+		bpf_gtramp_exit(prog, start_time, run_ctx);
+		tramp_prog = tramp_prog->next;
+	}
+}
+
+static __always_inline notrace int
+bpf_global_caller_run(unsigned long *args, unsigned long *ip, int nr_args)
+{
+	unsigned long origin_ip = (*ip) & 0xfffffffffffffff0; // Align to 16 bytes
+	struct kfunc_md_tramp_prog *tramp_prog;
+	struct bpf_tramp_run_ctx run_ctx;
+	struct kfunc_md *md;
+	bool do_orgin;
+
+	rcu_read_lock();
+	md = kfunc_md_get_rcu(origin_ip);
+	do_orgin = md->bpf_origin_call;
+	if (do_orgin)
+		kfunc_md_enter(md);
+	rcu_read_unlock();
+
+	/* save the origin function ip for bpf_get_func_ip() */
+	*(args - 2) = origin_ip;
+	*(args - 1) = nr_args;
+
+	run_tramp_prog(md->bpf_progs[BPF_TRAMP_FENTRY], &run_ctx, args);
+
+	/* no fexit and modify_return, return directly */
+	if (!do_orgin)
+		return 0;
+
+	/* modify return case */
+	tramp_prog = md->bpf_progs[BPF_TRAMP_MODIFY_RETURN];
+	/* initialize return value */
+	args[nr_args] = 0;
+	while (tramp_prog) {
+		struct bpf_prog *prog;
+		u64 start_time, ret;
+
+		prog = tramp_prog->prog;
+		run_ctx.bpf_cookie = tramp_prog->cookie;
+		start_time = bpf_gtramp_enter(prog, &run_ctx);
+
+		if (likely(start_time)) {
+			asm volatile(
+				CALL_NOSPEC "\n"
+				: "=a"(ret), ASM_CALL_CONSTRAINT
+				: [thunk_target]"r"(prog->bpf_func),
+				  [args]"D"(args)
+			);
+			args[nr_args] = ret;
+		} else {
+			ret = 0;
+		}
+
+		bpf_gtramp_exit(prog, start_time, &run_ctx);
+		if (ret)
+			goto do_fexit;
+		tramp_prog = tramp_prog->next;
+	}
+
+	/* restore the function arguments and call the origin function */
+	do_origin_call(args, ip, nr_args);
+do_fexit:
+	run_tramp_prog(md->bpf_progs[BPF_TRAMP_FEXIT], &run_ctx, args);
+	kfunc_md_exit(md);
+	return 1;
+}
+
+/* Layout of the stack frame:
+ * rip		----> 8 bytes
+ * return value	----> 8 bytes
+ * args		----> 8 * 6 bytes
+ * arg count	----> 8 bytes
+ * origin ip	----> 8 bytes
+ */
+#define stack_size	__stringify(8 + 8 + 6 * 8 + 8)
+
+#define CALLER_DEFINE(name, nr_args)					\
+static __always_used __no_stack_protector notrace int			\
+name##_run(unsigned long *args, unsigned long *ip)			\
+{									\
+	return bpf_global_caller_run(args, ip, nr_args);		\
+}									\
+static __naked void name(void)						\
+{									\
+	asm volatile(							\
+	    "subq $" stack_size ", %rsp\n"				\
+	    SAVE_ARGS_##nr_args						\
+	);								\
+									\
+	asm volatile(							\
+	    "leaq " __stringify(FUNC_ARGS_1) "(%rsp), %rdi\n"		\
+	    "leaq " stack_size "(%rsp), %rsi\n"				\
+	    "call " #name "_run\n"					\
+	    "test %rax, %rax\n"						\
+	    "jne 1f\n"							\
+	);								\
+									\
+	asm volatile(							\
+	    RESTORE_ARGS_##nr_args					\
+	    "addq $" stack_size ", %rsp\n"				\
+	    ASM_RET							\
+	);								\
+									\
+	asm volatile(							\
+	    "1:\n"							\
+	    "movq " __stringify(FUNC_ARGS_##nr_args + 8)		\
+	    "(%rsp), %rax\n"						\
+	    "addq $(" stack_size " + 8), %rsp\n"			\
+	    ASM_RET);							\
+}									\
+STACK_FRAME_NON_STANDARD(name)
+
+CALLER_DEFINE(bpf_global_caller_0, 0);
+CALLER_DEFINE(bpf_global_caller_1, 1);
+CALLER_DEFINE(bpf_global_caller_2, 2);
+CALLER_DEFINE(bpf_global_caller_3, 3);
+CALLER_DEFINE(bpf_global_caller_4, 4);
+CALLER_DEFINE(bpf_global_caller_5, 5);
+CALLER_DEFINE(bpf_global_caller_6, 6);
+
+void *bpf_gloabl_caller_array[MAX_BPF_FUNC_ARGS + 1] = {
+	bpf_global_caller_0,
+	bpf_global_caller_1,
+	bpf_global_caller_2,
+	bpf_global_caller_3,
+	bpf_global_caller_4,
+	bpf_global_caller_5,
+	bpf_global_caller_6,
+};
+
 static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs,
 			       u8 *image, u8 *buf)
 {
 	u8 *jg_reloc, *prog = *pprog;
diff --git a/include/linux/bpf_tramp.h b/include/linux/bpf_tramp.h
new file mode 100644
index 000000000000..32447fcfc017
--- /dev/null
+++ b/include/linux/bpf_tramp.h
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __LINUX_BPF_TRAMP_H__
+#define __LINUX_BPF_TRAMP_H__
+#ifdef CONFIG_BPF_JIT
+#include <linux/filter.h>
+
+#ifdef CONFIG_ARCH_HAS_BPF_GLOBAL_CALLER
+extern void *bpf_gloabl_caller_array[MAX_BPF_FUNC_ARGS + 1];
+#endif
+
+void notrace __update_prog_stats(struct bpf_prog *prog, u64 start);
+
+#define NO_START_TIME 1
+static __always_inline u64 notrace bpf_prog_start_time(void)
+{
+	u64 start = NO_START_TIME;
+
+	if (static_branch_unlikely(&bpf_stats_enabled_key)) {
+		start = sched_clock();
+		if (unlikely(!start))
+			start = NO_START_TIME;
+	}
+	return start;
+}
+
+static __always_inline void notrace update_prog_stats(struct bpf_prog *prog,
+						       u64 start)
+{
+	if (static_branch_unlikely(&bpf_stats_enabled_key))
+		__update_prog_stats(prog, start);
+}
+
+static __always_inline u64 notrace
+bpf_gtramp_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
+	__acquires(RCU)
+{
+	if (unlikely(prog->sleepable)) {
+		rcu_read_lock_trace();
+		might_fault();
+	} else {
+		rcu_read_lock();
+	}
+	migrate_disable();
+
+	run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+	if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
+		bpf_prog_inc_misses_counter(prog);
+		if (prog->aux->recursion_detected)
+			prog->aux->recursion_detected(prog);
+		return 0;
+	}
+	return bpf_prog_start_time();
+}
+
+static __always_inline void notrace
+bpf_gtramp_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx)
+	__releases(RCU)
+{
+	bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+	update_prog_stats(prog, start);
+	this_cpu_dec(*(prog->active));
+	migrate_enable();
+	if (unlikely(prog->sleepable))
+		rcu_read_unlock_trace();
+	else
+		rcu_read_unlock();
+}
+
+#endif
+#endif
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index b1e358c16eeb..fa90c225c93b 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -13,6 +13,7 @@
 #include <linux/bpf_verifier.h>
 #include <linux/bpf_lsm.h>
 #include <linux/delay.h>
+#include <linux/bpf_tramp.h>
 
 /* dummy _ops. The verifier will operate on target program's ops. */
 const struct bpf_verifier_ops bpf_extension_verifier_ops = {
@@ -868,19 +869,6 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
 	mutex_unlock(&trampoline_mutex);
 }
 
-#define NO_START_TIME 1
-static __always_inline u64 notrace bpf_prog_start_time(void)
-{
-	u64 start = NO_START_TIME;
-
-	if (static_branch_unlikely(&bpf_stats_enabled_key)) {
-		start = sched_clock();
-		if (unlikely(!start))
-			start = NO_START_TIME;
-	}
-	return start;
-}
-
 /* The logic is similar to bpf_prog_run(), but with an explicit
  * rcu_read_lock() and migrate_disable() which are required
  * for the trampoline. The macro is split into
@@ -911,7 +899,7 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram
 	return bpf_prog_start_time();
 }
 
-static void notrace __update_prog_stats(struct bpf_prog *prog, u64 start)
+void notrace __update_prog_stats(struct bpf_prog *prog, u64 start)
 {
 	struct bpf_prog_stats *stats;
 	unsigned long flags;
@@ -932,13 +920,6 @@ static void notrace __update_prog_stats(struct bpf_prog *prog, u64 start)
 	u64_stats_update_end_irqrestore(&stats->syncp, flags);
 }
 
-static __always_inline void notrace update_prog_stats(struct bpf_prog *prog,
-						       u64 start)
-{
-	if (static_branch_unlikely(&bpf_stats_enabled_key))
-		__update_prog_stats(prog, start);
-}
-
 static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start,
 					  struct bpf_tramp_run_ctx *run_ctx)
 	__releases(RCU)
-- 
2.39.5