These three patches fix all the dmesg warnings (with CONFIG_LOCKDEP) seen when
running the bpf test_progs and do not cause deadlocks without CONFIG_LOCKDEP.

The warning when running the cgroup examples

[ T2916] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
[ T2916] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 2916, name: test_progs
[ T2916] preempt_count: 1, expected: 0
[ T2916] RCU nest depth: 2, expected: 2
[...]
[ T2916] __might_resched.cold+0xfa/0x135
[ T2916] rt_spin_lock+0x5f/0x190
[ T2916] ? task_get_cgroup1+0xe8/0x340
[ T2916] task_get_cgroup1+0xe8/0x340
[ T2916] bpf_task_get_cgroup1+0xe/0x20
[ T2916] bpf_prog_8d22669ef1ee8049_on_enter+0x62/0x1d4
[ T2916] bpf_trace_run2+0xd3/0x260

is fixed by this:

diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h
index 183fa2aa2935..49257cb90209 100644
--- a/include/trace/bpf_probe.h
+++ b/include/trace/bpf_probe.h
@@ -58,9 +58,9 @@ static notrace void					\
 __bpf_trace_##call(void *__data, proto)				\
 {								\
 	might_fault();						\
-	preempt_disable_notrace();				\
+	migrate_disable();					\
 	CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(__data, CAST_TO_U64(args));	\
-	preempt_enable_notrace();				\
+	migrate_enable();					\
 }
 
 #undef DECLARE_EVENT_SYSCALL_CLASS
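For context (my reading, not part of the patch): with PREEMPT_RT spinlock_t is a
sleeping lock, so running the BPF program with preemption disabled makes kfuncs
like bpf_task_get_cgroup1(), which take such a lock, trip the might_resched
check, while migrate_disable() only pins the task to the CPU and leaves it
preemptible. For a hypothetical two-argument tracepoint "foo" the patched glue
would expand to roughly this (illustrative sketch, not the literal preprocessor
output):

static notrace void
__bpf_trace_foo(void *__data, unsigned long arg1, unsigned long arg2)
{
	might_fault();
	migrate_disable();	/* pinned to the CPU, but still preemptible */
	bpf_trace_run2(__data, (u64)arg1, (u64)arg2);
	migrate_enable();
}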
The warning when running the (wq, timer, free_timer, timer_mim, timer_lockup)
examples

[ T4696] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
[ T4696] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 4696, name: test_progs
[ T4696] preempt_count: 1, expected: 0
[ T4696] RCU nest depth: 0, expected: 0
[...]
[ T4696] __kmalloc_node_noprof+0xee/0x490
[ T4696] bpf_map_kmalloc_node+0x72/0x220
[ T4696] __bpf_async_init+0x107/0x310
[ T4696] bpf_timer_init+0x33/0x40
[ T4696] bpf_prog_7e15f1bc7d1d26d0_start_cb+0x5d/0x91
[ T4696] bpf_prog_d85f43676fabf521_start_timer+0x65/0x8a
[ T4696] bpf_prog_test_run_syscall+0x103/0x250

is fixed by this (this might leak memory due to the allocation before the
lock(?)):

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index e3a2662f4e33..94fcd8c8661c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1263,19 +1263,16 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
 		return -EINVAL;
 	}
 
-	__bpf_spin_lock_irqsave(&async->lock);
 	t = async->timer;
-	if (t) {
-		ret = -EBUSY;
-		goto out;
-	}
+	if (t)
+		return -EBUSY;
 
 	/* allocate hrtimer via map_kmalloc to use memcg accounting */
 	cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
-	if (!cb) {
-		ret = -ENOMEM;
-		goto out;
-	}
+	if (!cb)
+		return -ENOMEM;
+
+	__bpf_spin_lock_irqsave(&async->lock);
 
 	switch (type) {
 	case BPF_ASYNC_TYPE_TIMER:
@@ -1315,7 +1312,6 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
 		kfree(cb);
 		ret = -EPERM;
 	}
-out:
 	__bpf_spin_unlock_irqrestore(&async->lock);
 	return ret;
 }
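About the possible leak: two callers racing past the now-unlocked async->timer
check could both allocate, and the later one would leak (or overwrite) its
buffer. One way to close that window would be to re-check the pointer once the
lock is held and free the losing allocation; a rough, untested sketch on top of
the diff above (not part of the posted patch):

	cb = bpf_map_kmalloc_node(map, size, GFP_ATOMIC, map->numa_node);
	if (!cb)
		return -ENOMEM;

	__bpf_spin_lock_irqsave(&async->lock);
	if (async->timer) {
		/* lost the race against a concurrent initializer */
		__bpf_spin_unlock_irqrestore(&async->lock);
		kfree(cb);
		return -EBUSY;
	}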
In addition to these, there's a deadlock warning from lockdep when running the
timer_lockup example

[ 127.373597] [ C1] ============================================
[ 127.373598] [ C1] WARNING: possible recursive locking detected
[ 127.373601] [ C1] 6.15.0-bpf-00006-g31cf22212ed9 #41 Tainted: G O
[ 127.373602] [ C1] --------------------------------------------
[ 127.373603] [ C1] ktimers/1/85 is trying to acquire lock:
[ 127.373605] [ C1] ffff98f62e61c1b8 (&base->softirq_expiry_lock){+...}-{3:3}, at: hrtimer_cancel_wait_running+0x4d/0x80
[ 127.373614] [ C1] but task is already holding lock:
[ 127.373615] [ C1] ffff98f62e65c1b8 (&base->softirq_expiry_lock){+...}-{3:3}, at: hrtimer_run_softirq+0x37/0x100
[ 127.373621] [ C1] other info that might help us debug this:
[ 127.373621] [ C1] Possible unsafe locking scenario:
[ 127.373622] [ C1]        CPU0
[ 127.373623] [ C1]        ----
[ 127.373624] [ C1]   lock(&base->softirq_expiry_lock);
[ 127.373626] [ C1]   lock(&base->softirq_expiry_lock);
[ 127.373627] [ C1] *** DEADLOCK ***
[ 127.373628] [ C1] May be due to missing lock nesting notation

which can be fixed (?) by reintroducing spin_lock_bh_nested() (which was
introduced in v4.0 and removed in v4.11 ...):

diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index d3561c4a080e..b6635466b35f 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -219,6 +219,8 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 # define raw_spin_lock_nested(lock, subclass) \
 	_raw_spin_lock_nested(lock, subclass)
+# define raw_spin_lock_bh_nested(lock, subclass) \
+	_raw_spin_lock_bh_nested(lock, subclass)
 
 # define raw_spin_lock_nest_lock(lock, nest_lock)		\
 	 do {							\
@@ -233,6 +235,8 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
  */
 # define raw_spin_lock_nested(lock, subclass)			\
 	_raw_spin_lock(((void)(subclass), (lock)))
+# define raw_spin_lock_bh_nested(lock, subclass)		\
+	_raw_spin_lock_bh(((void)(subclass), (lock)))
 # define raw_spin_lock_nest_lock(lock, nest_lock)	_raw_spin_lock(lock)
 #endif
 
@@ -366,6 +370,11 @@ do {							\
 	raw_spin_lock_nested(spinlock_check(lock), subclass);	\
 } while (0)
 
+#define spin_lock_bh_nested(lock, subclass)			\
+do {								\
+	raw_spin_lock_bh_nested(spinlock_check(lock), subclass);	\
+} while (0)
+
 #define spin_lock_nest_lock(lock, nest_lock)			\
 do {								\
 	raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock);	\
 } while (0)
 
@@ -552,6 +561,10 @@ DEFINE_LOCK_GUARD_1(raw_spinlock_bh, raw_spinlock_t,
 		    raw_spin_lock_bh(_T->lock),
 		    raw_spin_unlock_bh(_T->lock))
 
+DEFINE_LOCK_GUARD_1(raw_spinlock_bh_nested, raw_spinlock_t,
+		    raw_spin_lock_bh_nested(_T->lock, SINGLE_DEPTH_NESTING),
+		    raw_spin_unlock_bh(_T->lock))
+
 DEFINE_LOCK_GUARD_1_COND(raw_spinlock_bh, _try, raw_spin_trylock_bh(_T->lock))
 
 DEFINE_LOCK_GUARD_1(raw_spinlock_irqsave, raw_spinlock_t,
diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
index 9ecb0ab504e3..791867efad04 100644
--- a/include/linux/spinlock_api_smp.h
+++ b/include/linux/spinlock_api_smp.h
@@ -22,6 +22,8 @@ int in_lock_functions(unsigned long addr);
 void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)		__acquires(lock);
 void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
 								__acquires(lock);
+void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass)
+								__acquires(lock);
 void __lockfunc
 _raw_spin_lock_nest_lock(raw_spinlock_t *lock, struct lockdep_map *map)
 								__acquires(lock);
diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h
index 819aeba1c87e..a42887a0c846 100644
--- a/include/linux/spinlock_api_up.h
+++ b/include/linux/spinlock_api_up.h
@@ -57,6 +57,7 @@
 
 #define _raw_spin_lock(lock)			__LOCK(lock)
 #define _raw_spin_lock_nested(lock, subclass)	__LOCK(lock)
+#define _raw_spin_lock_bh_nested(lock, subclass) __LOCK(lock)
 #define _raw_read_lock(lock)			__LOCK(lock)
 #define _raw_write_lock(lock)			__LOCK(lock)
 #define _raw_write_lock_nested(lock, subclass)	__LOCK(lock)
diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
index f6499c37157d..b89dc9e272cc 100644
--- a/include/linux/spinlock_rt.h
+++ b/include/linux/spinlock_rt.h
@@ -88,6 +88,13 @@ static __always_inline void spin_lock_bh(spinlock_t *lock)
 	rt_spin_lock(lock);
 }
 
+static __always_inline void spin_lock_bh_nested(spinlock_t *lock, int subclass)
+{
+	/* Investigate: Drop bh when blocking ? */
+	local_bh_disable();
+	rt_spin_lock_nested(lock, subclass);
+}
+
 static __always_inline void spin_lock_irq(spinlock_t *lock)
 {
 	rt_spin_lock(lock);
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 7685defd7c52..c4aa80441a1d 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -380,6 +380,14 @@ void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
 }
 EXPORT_SYMBOL(_raw_spin_lock_nested);
 
+void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass)
+{
+	__local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
+	spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+	LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
+}
+EXPORT_SYMBOL(_raw_spin_lock_bh_nested);
+
 unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
 							int subclass)
 {
diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c
index db1e11b45de6..5b76072c4838 100644
--- a/kernel/locking/spinlock_rt.c
+++ b/kernel/locking/spinlock_rt.c
@@ -58,7 +58,6 @@ void __sched rt_spin_lock(spinlock_t *lock)	__acquires(RCU)
 }
 EXPORT_SYMBOL(rt_spin_lock);
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
 void __sched rt_spin_lock_nested(spinlock_t *lock, int subclass)
 {
 	spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
@@ -66,6 +65,7 @@ void __sched rt_spin_lock_nested(spinlock_t *lock, int subclass)
 	__rt_spin_lock(lock);
 }
 EXPORT_SYMBOL(rt_spin_lock_nested);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
 
 void __sched rt_spin_lock_nest_lock(spinlock_t *lock,
 				    struct lockdep_map *nest_lock)
 {
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 30899a8cc52c..d62455f62fc4 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1457,7 +1457,7 @@ void hrtimer_cancel_wait_running(const struct hrtimer *timer)
 	 * unlikely and just causes another wait loop.
 	 */
 	atomic_inc(&base->cpu_base->timer_waiters);
-	spin_lock_bh(&base->cpu_base->softirq_expiry_lock);
+	spin_lock_bh_nested(&base->cpu_base->softirq_expiry_lock, SINGLE_DEPTH_NESTING);
 	atomic_dec(&base->cpu_base->timer_waiters);
 	spin_unlock_bh(&base->cpu_base->softirq_expiry_lock);
 }
diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c
index e1fba28e4a86..7cfb9473a526 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_success.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_success.c
@@ -7,6 +7,7 @@
 #include <bpf/bpf_helpers.h>
 #include <bpf/bpf_tracing.h>
 #include "bpf_misc.h"
+#include "bpf_kfuncs.h"
 #include "errno.h"
 
 char _license[] SEC("license") = "GPL";

Bert Karwatzki