Re: [PATCH v2 6/6] slab: Introduce kmalloc_nolock() and kfree_nolock().

On 7/9/25 03:53, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@xxxxxxxxxx>
> 
> kmalloc_nolock() relies on the ability of local_lock to detect the
> situation when it's locked.
> In !PREEMPT_RT, local_lock_is_locked() is true only when an NMI happened
> in the irq-saved region that protects _that specific_ per-cpu kmem_cache_cpu.
> In that case retry the operation in a different kmalloc bucket.
> The second attempt will likely succeed, since this cpu has locked
> a different kmem_cache_cpu.
> 
> Similarly, in PREEMPT_RT local_lock_is_locked() returns true when the
> per-cpu rt_spin_lock is locked by the current task. In this case re-entrance
> into the same kmalloc bucket is unsafe, so kmalloc_nolock() tries
> a different bucket that is most likely not locked by the current
> task. Though it may be locked by a different task, it's safe to
> rt_spin_lock() on it.
> 
> Similar to alloc_pages_nolock(), kmalloc_nolock() returns NULL
> immediately if called from hard irq or NMI context in PREEMPT_RT.
> 
> kfree_nolock() defers freeing to irq_work when local_lock_is_locked()
> and either in_nmi() or PREEMPT_RT.
> 
> The SLUB_TINY config doesn't use local_lock_is_locked() and relies on
> spin_trylock_irqsave(&n->list_lock) to allocate, while kfree_nolock()
> always defers to irq_work.
> 
> Signed-off-by: Alexei Starovoitov <ast@xxxxxxxxxx>
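
So IIUC the retry idea of the changelog is roughly the following (a sketch
only, not the patch's actual structure; next_kmalloc_bucket() is a made-up
name, while local_lock_is_locked() and s->cpu_slab->lock are from this
series):

	if (local_lock_is_locked(&s->cpu_slab->lock)) {
		/*
		 * This bucket's per-cpu lock is held on this CPU: either an
		 * NMI landed inside its irq-saved section (!PREEMPT_RT) or
		 * the current task already holds the rt_spin_lock (PREEMPT_RT).
		 * A different size class uses a different kmem_cache_cpu,
		 * so a second attempt there is very likely to succeed.
		 */
		s = next_kmalloc_bucket(s);	/* made-up helper */
		goto retry;
	}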

>  static __fastpath_inline
> @@ -2442,13 +2453,17 @@ static void *setup_object(struct kmem_cache *s, void *object)
>   * Slab allocation and freeing
>   */
>  static inline struct slab *alloc_slab_page(gfp_t flags, int node,
> -		struct kmem_cache_order_objects oo)
> +					   struct kmem_cache_order_objects oo,
> +					   bool allow_spin)
>  {
>  	struct folio *folio;
>  	struct slab *slab;
>  	unsigned int order = oo_order(oo);
>  
> -	if (node == NUMA_NO_NODE)
> +	if (unlikely(!allow_spin)) {
> +		folio = (struct folio *)alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */,
> +								  node, order);
> +	} else if (node == NUMA_NO_NODE)
>  		folio = (struct folio *)alloc_frozen_pages(flags, order);
>  	else
>  		folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL);

Nit: braces should be used either on all branches or on none (it seems your
new branch would work without them).
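
For illustration, the all-braces shape of the same hunk (either way works,
this is not the patch's code):

	if (unlikely(!allow_spin)) {
		/* __GFP_COMP is implied, hence flags == 0 */
		folio = (struct folio *)alloc_frozen_pages_nolock(0, node, order);
	} else if (node == NUMA_NO_NODE) {
		folio = (struct folio *)alloc_frozen_pages(flags, order);
	} else {
		folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL);
	}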

>  			stat(s, ALLOC_NODE_MISMATCH);
> @@ -3730,7 +3762,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>  	 * PFMEMALLOC but right now, we are losing the pfmemalloc
>  	 * information when the page leaves the per-cpu allocator
>  	 */
> -	if (unlikely(!pfmemalloc_match(slab, gfpflags)))
> +	if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin))
>  		goto deactivate_slab;
>  
>  	/* must check again c->slab in case we got preempted and it changed */
> @@ -3803,7 +3835,12 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>  		slub_set_percpu_partial(c, slab);
>  
>  		if (likely(node_match(slab, node) &&
> -			   pfmemalloc_match(slab, gfpflags))) {
> +			   pfmemalloc_match(slab, gfpflags)) ||
> +		    /*
> +		     * Reentrant slub cannot take locks necessary
> +		     * for __put_partials(), hence downgrade to any node
> +		     */
> +		    !allow_spin) {

Uh, this seems rather ugly; I'd move the comment above the whole condition.
Also it's not a "downgrade" as in the earlier place where you assign
NUMA_NO_NODE; I'd rather say "ignore the preference".
Note that ignoring the preference would be bad with __GFP_THISNODE, but that
flag is not allowed for kmalloc_nolock(), so that's fine.
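
Something like this, purely to illustrate (comment hoisted above the whole
condition, cheap !allow_spin test first; not a concrete patch):

	/*
	 * Reentrant slub cannot take the locks needed by __put_partials(),
	 * so in that case ignore the node preference and take the slab as is.
	 */
	if (!allow_spin ||
	    likely(node_match(slab, node) &&
		   pfmemalloc_match(slab, gfpflags))) {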

> @@ -3911,6 +3953,12 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>  		void *flush_freelist = c->freelist;
>  		struct slab *flush_slab = c->slab;
>  
> +		if (unlikely(!allow_spin))
> +			/*
> +			 * Reentrant slub cannot take locks
> +			 * necessary for deactivate_slab()
> +			 */
> +			return NULL;

Hm, but this leaks the slab we just allocated and still have in the "slab"
variable; we need to free it back in that case.
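
Roughly (defer_free_new_slab() is a made-up name, only to show where the fix
has to go; whatever is used there must be safe without taking locks):

		if (unlikely(!allow_spin)) {
			/*
			 * Cannot deactivate_slab() here, but the freshly
			 * allocated slab sitting in "slab" must be given
			 * back instead of being leaked.
			 */
			defer_free_new_slab(s, slab);	/* made-up helper */
			return NULL;
		}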

>  		c->slab = NULL;
>  		c->freelist = NULL;
>  		c->tid = next_tid(c->tid);

> @@ -4593,10 +4792,31 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
>  	barrier();
>  
>  	if (unlikely(slab != c->slab)) {
> -		__slab_free(s, slab, head, tail, cnt, addr);
> +		/* cnt == 0 signals that it's called from kfree_nolock() */
> +		if (unlikely(!cnt)) {
> +			/*
> +			 * __slab_free() can locklessly cmpxchg16 into a slab,
> +			 * but then it might need to take spin_lock or local_lock
> +			 * in put_cpu_partial() for further processing.
> +			 * Avoid the complexity and simply add to a deferred list.
> +			 */
> +			defer_free(head);
> +		} else {
> +			__slab_free(s, slab, head, tail, cnt, addr);
> +		}
>  		return;
>  	}
>  
> +	if (unlikely(!cnt)) {
> +		if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) &&
> +		    local_lock_is_locked(&s->cpu_slab->lock)) {
> +			defer_free(head);
> +			return;
> +		}
> +		cnt = 1;

Hmm, but we might end up doing a "goto redo" later and then do the wrong
thing above, since cnt is no longer 0?
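
Rough sketch of what I mean (from_nolock is a made-up local, not a concrete
proposal): cnt doubles as the "called from kfree_nolock()" marker and gets
overwritten before a potential retry, so a separate flag would survive the
"goto redo" (and the kasan_slab_free() call would then also be easy to do
only once):

	bool from_nolock = !cnt;	/* set once, before the redo label */
	...
	if (unlikely(slab != c->slab)) {
		if (unlikely(from_nolock))
			defer_free(head);
		else
			__slab_free(s, slab, head, tail, cnt, addr);
		return;
	}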

> +		kasan_slab_free(s, head, false, false, /* skip quarantine */true);
> +	}
> +
>  	if (USE_LOCKLESS_FAST_PATH()) {
>  		freelist = READ_ONCE(c->freelist);
>  



