On 7/9/25 03:53, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@xxxxxxxxxx>
>
> kmalloc_nolock() relies on ability of local_lock to detect the situation
> when it's locked.
> In !PREEMPT_RT local_lock_is_locked() is true only when NMI happened in
> irq saved region that protects _that specific_ per-cpu kmem_cache_cpu.
> In that case retry the operation in a different kmalloc bucket.
> The second attempt will likely succeed, since this cpu locked
> different kmem_cache_cpu.
>
> Similarly, in PREEMPT_RT local_lock_is_locked() returns true when
> per-cpu rt_spin_lock is locked by current task. In this case re-entrance
> into the same kmalloc bucket is unsafe, and kmalloc_nolock() tries
> a different bucket that is most likely is not locked by the current
> task. Though it may be locked by a different task it's safe to
> rt_spin_lock() on it.
>
> Similar to alloc_pages_nolock() the kmalloc_nolock() returns NULL
> immediately if called from hard irq or NMI in PREEMPT_RT.
>
> kfree_nolock() defers freeing to irq_work when local_lock_is_locked()
> and in_nmi() or in PREEMPT_RT.
>
> SLUB_TINY config doesn't use local_lock_is_locked() and relies on
> spin_trylock_irqsave(&n->list_lock) to allocate while kfree_nolock()
> always defers to irq_work.
>
> Signed-off-by: Alexei Starovoitov <ast@xxxxxxxxxx>

>  static __fastpath_inline
> @@ -2442,13 +2453,17 @@ static void *setup_object(struct kmem_cache *s, void *object)
>   * Slab allocation and freeing
>   */
>  static inline struct slab *alloc_slab_page(gfp_t flags, int node,
> -					struct kmem_cache_order_objects oo)
> +					struct kmem_cache_order_objects oo,
> +					bool allow_spin)
>  {
>  	struct folio *folio;
>  	struct slab *slab;
>  	unsigned int order = oo_order(oo);
>
> -	if (node == NUMA_NO_NODE)
> +	if (unlikely(!allow_spin)) {
> +		folio = (struct folio *)alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */,
> +								  node, order);
> +	} else if (node == NUMA_NO_NODE)
>  		folio = (struct folio *)alloc_frozen_pages(flags, order);
>  	else
>  		folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL);

Nit: should use { } either for everything or nothing (seems your new branch
would work without them).

>  		stat(s, ALLOC_NODE_MISMATCH);
> @@ -3730,7 +3762,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>  	 * PFMEMALLOC but right now, we are losing the pfmemalloc
>  	 * information when the page leaves the per-cpu allocator
>  	 */
> -	if (unlikely(!pfmemalloc_match(slab, gfpflags)))
> +	if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin))
>  		goto deactivate_slab;
>
>  	/* must check again c->slab in case we got preempted and it changed */
> @@ -3803,7 +3835,12 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>  		slub_set_percpu_partial(c, slab);
>
>  	if (likely(node_match(slab, node) &&
> -		   pfmemalloc_match(slab, gfpflags))) {
> +		   pfmemalloc_match(slab, gfpflags)) ||
> +	    /*
> +	     * Reentrant slub cannot take locks necessary
> +	     * for __put_partials(), hence downgrade to any node
> +	     */
> +	    !allow_spin) {

Uh, this seems rather ugly; I'd move the comment above the whole condition.
Also it's not a "downgrade" like when you assign NUMA_NO_NODE earlier, I'd
say "ignore the preference". Note that it would be bad to ignore it with
__GFP_THISNODE, but that's not allowed for kmalloc_nolock() so that's fine.
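Something like this is what I have in mind (untested sketch, reusing only
the identifiers from the hunk above; exact comment wording is up to you):

	/*
	 * Reentrant slub cannot take the locks necessary for
	 * __put_partials(), so in that case ignore the node and
	 * pfmemalloc preference and use the slab we already have.
	 */
	if (unlikely(!allow_spin) ||
	    likely(node_match(slab, node) &&
		   pfmemalloc_match(slab, gfpflags))) {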
> @@ -3911,6 +3953,12 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>  		void *flush_freelist = c->freelist;
>  		struct slab *flush_slab = c->slab;
>
> +		if (unlikely(!allow_spin))
> +			/*
> +			 * Reentrant slub cannot take locks
> +			 * necessary for deactivate_slab()
> +			 */
> +			return NULL;

Hm, but this is leaking the slab we allocated and have in the "slab"
variable; we need to free it back in that case.

>  		c->slab = NULL;
>  		c->freelist = NULL;
>  		c->tid = next_tid(c->tid);
> @@ -4593,10 +4792,31 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
>  	barrier();
>
>  	if (unlikely(slab != c->slab)) {
> -		__slab_free(s, slab, head, tail, cnt, addr);
> +		/* cnt == 0 signals that it's called from kfree_nolock() */
> +		if (unlikely(!cnt)) {
> +			/*
> +			 * __slab_free() can locklessly cmpxchg16 into a slab,
> +			 * but then it might need to take spin_lock or local_lock
> +			 * in put_cpu_partial() for further processing.
> +			 * Avoid the complexity and simply add to a deferred list.
> +			 */
> +			defer_free(head);
> +		} else {
> +			__slab_free(s, slab, head, tail, cnt, addr);
> +		}
>  		return;
>  	}
>
> +	if (unlikely(!cnt)) {
> +		if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) &&
> +		    local_lock_is_locked(&s->cpu_slab->lock)) {
> +			defer_free(head);
> +			return;
> +		}
> +		cnt = 1;

Hmm, we might end up doing a "goto redo" later and then do the wrong thing
above? (one possible idea sketched at the end)

> +		kasan_slab_free(s, head, false, false, /* skip quarantine */true);
> +	}
> +
>  	if (USE_LOCKLESS_FAST_PATH()) {
>  		freelist = READ_ONCE(c->freelist);
>
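To make the "goto redo" concern concrete: once cnt is set to 1, a retry
that then sees slab != c->slab falls into __slab_free(), which may need
the locks kfree_nolock() is trying to avoid. One possible direction
(completely untested, the locals from_kfree_nolock / poisoned are made up)
would be to keep the kfree_nolock() case in a separate local instead of
overloading cnt, roughly:

	/* before the redo: label */
	bool from_kfree_nolock = !cnt;
	bool poisoned = false;

	if (from_kfree_nolock)
		cnt = 1;

	/* in the slab != c->slab branch */
	if (unlikely(from_kfree_nolock))
		/* __slab_free() may need to lock in put_cpu_partial() */
		defer_free(head);
	else
		__slab_free(s, slab, head, tail, cnt, addr);
	return;

	/* after the slab != c->slab check */
	if (unlikely(from_kfree_nolock) && !poisoned) {
		if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) &&
		    local_lock_is_locked(&s->cpu_slab->lock)) {
			defer_free(head);
			return;
		}
		kasan_slab_free(s, head, false, false, /* skip quarantine */true);
		/* do the poisoning only once even if we "goto redo" later */
		poisoned = true;
	}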