To debug the issue I've created a patch that monitors preempt_count_{sub,add} (except those used by printk to avoid infinite recursion). diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5b25d278409b..37992b164282 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2219,6 +2219,8 @@ bpf_prog_run_array(const struct bpf_prog_array *array, struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; u32 ret = 1; + if (!strcmp(get_current()->comm, "test_progs")) + pr_err("%s: ctx = %px preempt_count = %u\n", __func__, ctx, preempt_count()); RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held"); diff --git a/include/linux/filter.h b/include/linux/filter.h index f5cf4d35d83e..ac9ddd1b9bac 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -699,6 +699,8 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog, bpf_dispatcher_fn dfunc) { u32 ret; + if (!strcmp(get_current()->comm, "test_progs")) + pr_err("%s: preempt_count = %u\n", __func__, preempt_count()); cant_migrate(); if (static_branch_unlikely(&bpf_stats_enabled_key)) { diff --git a/include/linux/preempt.h b/include/linux/preempt.h index b0af8d4ef6e6..e1d5d1e00860 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -194,12 +194,17 @@ static __always_inline unsigned char interrupt_context_level(void) #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE) extern void preempt_count_add(int val); extern void preempt_count_sub(int val); +extern void preempt_count_add_printk(int val); +extern void preempt_count_sub_printk(int val); #define preempt_count_dec_and_test() \ ({ preempt_count_sub(1); should_resched(0); }) +#define preempt_count_dec_and_test_printk() \ + ({ preempt_count_sub_printk(1); should_resched(0); }) #else #define preempt_count_add(val) __preempt_count_add(val) #define preempt_count_sub(val) __preempt_count_sub(val) #define preempt_count_dec_and_test() __preempt_count_dec_and_test() +#define preempt_count_dec_and_test_printk() __preempt_count_dec_and_test_printk() #endif #define __preempt_count_inc() __preempt_count_add(1) @@ -208,8 +213,20 @@ extern void preempt_count_sub(int val); #define preempt_count_inc() preempt_count_add(1) #define preempt_count_dec() preempt_count_sub(1) +#define preempt_count_inc_printk() preempt_count_add_printk(1) +#define preempt_count_dec_printk() preempt_count_sub_printk(1) + #ifdef CONFIG_PREEMPT_COUNT +/* We can't monitor preempt_count changes in printk or we go into + * an infinte recursion loop! + * */ +#define preempt_disable_printk() \ +do { \ + preempt_count_inc_printk(); \ + barrier(); \ +} while (0) + #define preempt_disable() \ do { \ preempt_count_inc(); \ @@ -227,6 +244,13 @@ do { \ #define preemptible() (preempt_count() == 0 && !irqs_disabled()) #ifdef CONFIG_PREEMPTION +#define preempt_enable_printk() \ +do { \ + barrier(); \ + if (unlikely(preempt_count_dec_and_test_printk())) \ + __preempt_schedule(); \ +} while (0) + #define preempt_enable() \ do { \ barrier(); \ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index c1113b74e1e2..555c01c7f983 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -25,6 +25,8 @@ #include <linux/kasan.h> #include <linux/bpf_verifier.h> +#include <asm/preempt.h> + #include "../../lib/kstrtox.h" /* If kernel subsystem is allowing eBPF programs to call this function, @@ -2596,6 +2598,9 @@ const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { __bpf_kfunc struct cgroup * bpf_task_get_cgroup1(struct task_struct *task, int hierarchy_id) { + if (!strcmp(get_current()->comm, "test_progs")) + pr_err("%s: calling task_get_cgroup1 preempt_count = 0x%x\n", __func__, preempt_count()); + struct cgroup *cgrp = task_get_cgroup1(task, hierarchy_id); if (IS_ERR(cgrp)) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dd5304c6ac3c..137478a0b8f6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3339,6 +3339,8 @@ int bpf_link_settle(struct bpf_link_primer *primer) primer->link->id = primer->id; spin_unlock_bh(&link_idr_lock); /* make bpf_link fetchable by FD */ + if (!strcmp(get_current()->comm, "test_progs")) + pr_err("%s calling fd_install() preemt_count = %u\n", __func__, preempt_count()); fd_install(primer->fd, primer->file); /* pass through installed FD */ return primer->fd; @@ -5798,6 +5800,8 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) { union bpf_attr attr; int err; + if (!strcmp(get_current()->comm, "test_progs")) + pr_err("%s: preempt_count = %u\n", __func__, preempt_count()); err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); if (err) @@ -5986,6 +5990,8 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) { struct bpf_prog * __maybe_unused prog; struct bpf_tramp_run_ctx __maybe_unused run_ctx; + if (!strcmp(get_current()->comm, "test_progs")) + pr_err("%s: preempt_count = %u\n", __func__, preempt_count()); switch (cmd) { #ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index fa24c032ed6f..001632155bfe 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1295,6 +1295,8 @@ struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id) struct cgroup *cgrp = ERR_PTR(-ENOENT); struct cgroup_root *root; unsigned long flags; + if (!strcmp(get_current()->comm, "test_progs")) + pr_err("%s: preempt_count = 0x%x\n", __func__, preempt_count()); rcu_read_lock(); for_each_root(root) { diff --git a/kernel/printk/nbcon.c b/kernel/printk/nbcon.c index fd12efcc4aed..cce689bc6280 100644 --- a/kernel/printk/nbcon.c +++ b/kernel/printk/nbcon.c @@ -1634,7 +1634,7 @@ void nbcon_cpu_emergency_enter(void) { unsigned int *cpu_emergency_nesting; - preempt_disable(); + preempt_disable_printk(); cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting(); (*cpu_emergency_nesting)++; @@ -1654,7 +1654,7 @@ void nbcon_cpu_emergency_exit(void) if (!WARN_ON_ONCE(*cpu_emergency_nesting == 0)) (*cpu_emergency_nesting)--; - preempt_enable(); + preempt_enable_printk(); } /** diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1eea80d0648e..d070220a817b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2439,7 +2439,7 @@ asmlinkage int vprintk_emit(int facility, int level, * this context can return as soon as possible. Hopefully * another printk() caller will take over the printing. */ - preempt_disable(); + preempt_disable_printk(); /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers. With the @@ -2448,7 +2448,7 @@ asmlinkage int vprintk_emit(int facility, int level, */ if (console_trylock_spinning()) console_unlock(); - preempt_enable(); + preempt_enable_printk(); } if (ft.legacy_offload) @@ -4536,7 +4536,7 @@ static void __wake_up_klogd(int val) if (!printk_percpu_data_ready()) return; - preempt_disable(); + preempt_disable_printk(); /* * Guarantee any new records can be seen by tasks preparing to wait * before this context checks if the wait queue is empty. @@ -4553,7 +4553,7 @@ static void __wake_up_klogd(int val) this_cpu_or(printk_pending, val); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); } - preempt_enable(); + preempt_enable_printk(); } /** diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dce50fa57471..41426f8f806f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5857,7 +5857,9 @@ static inline void preempt_latency_start(int val) } } -void preempt_count_add(int val) +/* non printing version of preempt_count_add for use in + * printk to avoid infinite recursion */ +void preempt_count_add_printk(int val) { #ifdef CONFIG_DEBUG_PREEMPT /* @@ -5876,6 +5878,34 @@ void preempt_count_add(int val) #endif preempt_latency_start(val); } +EXPORT_SYMBOL(preempt_count_add_printk); +NOKPROBE_SYMBOL(preempt_count_add_printk); + +void preempt_count_add(int val) +{ +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Underflow? + */ + if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) + return; +#endif + if (!strcmp(get_current()->comm, "test_progs")) { + pr_err("%s %d: preempt_count = 0x%x", __func__, __LINE__, preempt_count()); + __preempt_count_add(val); + pr_err("%s %d: preempt_count = 0x%x", __func__, __LINE__, preempt_count()); + } else { + __preempt_count_add(val); + } +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Spinlock count overflowing soon? + */ + DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= + PREEMPT_MASK - 10); +#endif + preempt_latency_start(val); +} EXPORT_SYMBOL(preempt_count_add); NOKPROBE_SYMBOL(preempt_count_add); @@ -5889,7 +5919,7 @@ static inline void preempt_latency_stop(int val) trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); } -void preempt_count_sub(int val) +void preempt_count_sub_printk(int val) { #ifdef CONFIG_DEBUG_PREEMPT /* @@ -5908,6 +5938,34 @@ void preempt_count_sub(int val) preempt_latency_stop(val); __preempt_count_sub(val); } +EXPORT_SYMBOL(preempt_count_sub_printk); +NOKPROBE_SYMBOL(preempt_count_sub_printk); + +void preempt_count_sub(int val) +{ +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Underflow? + */ + if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) + return; + /* + * Is the spinlock portion underflowing? + */ + if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && + !(preempt_count() & PREEMPT_MASK))) + return; +#endif + + preempt_latency_stop(val); + if (!strcmp(get_current()->comm, "test_progs")) { + pr_err("%s %d: preempt_count = 0x%x", __func__, __LINE__, preempt_count()); + __preempt_count_sub(val); + pr_err("%s %d: preempt_count = 0x%x", __func__, __LINE__, preempt_count()); + } else { + __preempt_count_sub(val); + } +} EXPORT_SYMBOL(preempt_count_sub); NOKPROBE_SYMBOL(preempt_count_sub); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b0eb721fcfb5..762f8b429af3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -35,6 +35,9 @@ #include "trace_probe.h" #include "trace.h" +#include <asm/preempt.h> +#include <linux/sched.h> + #define CREATE_TRACE_POINTS #include "bpf_trace.h" @@ -110,6 +113,8 @@ static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx); unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { unsigned int ret; + if (!strcmp(get_current()->comm, "test_progs")) + pr_err("%s: call = %px ctx = %px preempt_count = %u\n", __func__, call, ctx, preempt_count()); cant_sleep(); @@ -766,6 +771,8 @@ const struct bpf_func_proto bpf_get_current_task_proto = { BPF_CALL_0(bpf_get_current_task_btf) { + if (!strcmp(get_current()->comm, "test_progs")) + pr_err("%s: preempt_count = 0x%x", __func__, preempt_count()); return (unsigned long) current; } @@ -2244,6 +2251,8 @@ void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args) struct bpf_prog *prog = link->link.prog; struct bpf_run_ctx *old_run_ctx; struct bpf_trace_run_ctx run_ctx; + if (!strcmp(get_current()->comm, "test_progs")) + pr_err("%s: preempt_count = %u\n", __func__, preempt_count()); cant_sleep(); if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c index a0391f9da2d4..0d179a1192ca 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_success.c +++ b/tools/testing/selftests/bpf/progs/dynptr_success.c @@ -7,6 +7,7 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> #include "bpf_misc.h" +#include "bpf_kfuncs.h" #include "errno.h" char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c index e6c248e3ae54..e9e918cdf31f 100644 --- a/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c +++ b/tools/testing/selftests/bpf/test_kmods/bpf_testmod.c @@ -385,7 +385,7 @@ int bpf_testmod_fentry_ok; noinline ssize_t bpf_testmod_test_read(struct file *file, struct kobject *kobj, - struct bin_attribute *bin_attr, + const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) { struct bpf_testmod_test_read_ctx ctx = { @@ -465,7 +465,7 @@ ALLOW_ERROR_INJECTION(bpf_testmod_test_read, ERRNO); noinline ssize_t bpf_testmod_test_write(struct file *file, struct kobject *kobj, - struct bin_attribute *bin_attr, + const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) { struct bpf_testmod_test_write_ctx ctx = { @@ -567,7 +567,7 @@ static void testmod_unregister_uprobe(void) static ssize_t bpf_testmod_uprobe_write(struct file *file, struct kobject *kobj, - struct bin_attribute *bin_attr, + const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t len) { unsigned long offset = 0; This patch seems to create so much output that the orginal error message and backtrace often get lost, so I needed several runs to get a meaningful message when running $ ./test_progs -a cgrp_local_storage/cgrp1_tp_btf Here's the crucial part of the message which show an unexplained increase in preempt_count: [ 99.467001] [ T2857] bpf_link_settle calling fd_install() preemt_count = 0 The next two message are from fd install calling rcu_read_lock_sched() (I know that from an earlier version of the debug patch which also monitored fd_install()): [ 99.467003] [ T2857] preempt_count_add 5894: preempt_count = 0x0 [ 99.467004] [ T2857] preempt_count_add 5896: preempt_count = 0x1 The next two message are from fd install calling rcu_read_unlock_sched(): [ 99.467006] [ T2857] preempt_count_sub 5962: preempt_count = 0x1 [ 99.467007] [ T2857] preempt_count_sub 5964: preempt_count = 0x0 Here we are after fd_install which exited with preempt_count = 0, but when entering __bpf_trace_run we have preempt_count = 1 anyway without a call to preempt_count_add(). So who or what is increasing preempt_count here? [ 99.467014] [ T2857] __bpf_trace_run: preempt_count = 1 [ 99.467015] [ T2857] __bpf_prog_run: preempt_count = 1 [ 99.467017] [ T2857] ____bpf_get_current_task_btf: preempt_count = 0x1 [ 99.467018] [ T2857] bpf_task_get_cgroup1: calling task_get_cgroup1 preempt_count = 0x1 [ 99.467019] [ T2857] task_get_cgroup1: preempt_count = 0x1 [ 99.467022] [ T2857] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 [ 99.467024] [ T2857] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 2857, name: test_progs [ 99.467025] [ T2857] preempt_count: 1, expected: 0 [ 99.467027] [ T2857] RCU nest depth: 2, expected: 2 [ 99.467028] [ T2857] 4 locks held by test_progs/2857: [ 99.467029] [ T2857] #0: ffffffffa6932de0 (rcu_read_lock_trace){....}-{0:0}, at: syscall_trace_enter+0x18e/0x260 [ 99.467040] [ T2857] #1: ffffffffa6933880 (rcu_read_lock){....}-{1:3}, at: bpf_trace_run2+0xa6/0x2a0 [ 99.467048] [ T2857] #2: ffffffffa6933880 (rcu_read_lock){....}-{1:3}, at: task_get_cgroup1+0x4c/0x360 [ 99.467054] [ T2857] #3: ffffffffa6956878 (css_set_lock){+.+.}-{3:3}, at: task_get_cgroup1+0x109/0x360 The "Preemption disabled at:" message is not useful here because it just comes from printk() (which enables preemption again). In the usual case (no monitoring) we get fd_install as the "Preemption disable at:" message, but as can be seen above fd_install exits with preempt_count = 0. [ 99.467060] [ T2857] Preemption disabled at: [ 99.467060] [ T2857] [<ffffffffa503a823>] __wake_up_klogd.part.0+0x13/0xa0 [ 99.467066] [ T2857] CPU: 13 UID: 0 PID: 2857 Comm: test_progs Tainted: G O 6.15.0-rc7-next-20250523-cgroupbpfbug-00024-gc1d60c8159da #66 PREEMPT_{RT,(full)} [ 99.467070] [ T2857] Tainted: [O]=OOT_MODULE [ 99.467071] [ T2857] Hardware name: Micro-Star International Co., Ltd. Alpha 15 B5EEK/MS-158L, BIOS E158LAMS.10F 11/11/2024 [ 99.467072] [ T2857] Call Trace: [ 99.467075] [ T2857] <TASK> [ 99.467077] [ T2857] dump_stack_lvl+0x6d/0xb0 [ 99.467083] [ T2857] __might_resched.cold+0xfa/0x135 [ 99.467089] [ T2857] rt_spin_lock+0x5f/0x190 [ 99.467092] [ T2857] ? task_get_cgroup1+0x109/0x360 [ 99.467098] [ T2857] task_get_cgroup1+0x109/0x360 [ 99.467102] [ T2857] bpf_task_get_cgroup1+0x3d/0x50 [ 99.467109] [ T2857] bpf_prog_8d22669ef1ee8049_on_enter+0x62/0x1d4 [ 99.467115] [ T2857] bpf_trace_run2+0x108/0x2a0 [ 99.467119] [ T2857] ? srso_alias_return_thunk+0x5/0xfbef5 [ 99.467126] [ T2857] __bpf_trace_sys_enter+0x37/0x60 [ 99.467131] [ T2857] syscall_trace_enter+0x1c7/0x260 [ 99.467136] [ T2857] do_syscall_64+0x395/0xfa0 [ 99.467139] [ T2857] ? srso_alias_return_thunk+0x5/0xfbef5 [ 99.467142] [ T2857] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 99.467142] [ T2857] preempt_count_add 5894: preempt_count = 0x1 [ 99.467142] [ T2857] preempt_count_add 5896: preempt_count = 0x2 [ 99.467142] [ T2857] preempt_count_sub 5962: preempt_count = 0x2 [ 99.467142] [ T2857] preempt_count_sub 5964: preempt_count = 0x1 [ 99.467142] [ T2857] RIP: 0033:0x7f23eb72c779 [ 99.467142] [ T2857] preempt_count_add 5894: preempt_count = 0x1 [ 99.467142] [ T2857] preempt_count_add 5896: preempt_count = 0x2 [ 99.467142] [ T2857] preempt_count_sub 5962: preempt_count = 0x2 [ 99.467142] [ T2857] preempt_count_sub 5964: preempt_count = 0x1 [ 99.467142] [ T2857] Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 4f 86 0d 00 f7 d8 64 89 01 48 [ 99.467142] [ T2857] RSP: 002b:00007ffd8b5d7f38 EFLAGS: 00000202 ORIG_RAX: 0000000000000141 [ 99.467142] [ T2857] RAX: ffffffffffffffda RBX: 00007ffd8b5d86c8 RCX: 00007f23eb72c779 [ 99.467142] [ T2857] RDX: 0000000000000040 RSI: 00007ffd8b5d7fb0 RDI: 000000000000001c [ 99.467142] [ T2857] RBP: 00007ffd8b5d7f50 R08: 0000000000000001 R09: 00007ffd8b5d7fb0 [ 99.467142] [ T2857] R10: 00007f23eb805ac0 R11: 0000000000000202 R12: 0000000000000000 [ 99.467142] [ T2857] R13: 00007ffd8b5d86e8 R14: 00007f23ebd61000 R15: 0000563e4fd2d890 [ 99.467142] [ T2857] </TASK> Bert Karwatzki