Re: [PATCH v2 6/6] slab: Introduce kmalloc_nolock() and kfree_nolock().

On Tue, Jul 08, 2025 at 06:53:03PM -0700, Alexei Starovoitov wrote:
> From: Alexei Starovoitov <ast@xxxxxxxxxx>
> 
> kmalloc_nolock() relies on the ability of local_lock to detect the
> situation when it's locked.
> In !PREEMPT_RT local_lock_is_locked() is true only when an NMI happened
> in the irq-saved region that protects _that specific_ per-cpu
> kmem_cache_cpu.
> In that case retry the operation in a different kmalloc bucket.
> The second attempt will likely succeed, since this cpu has locked a
> different kmem_cache_cpu.
>
> Similarly, in PREEMPT_RT local_lock_is_locked() returns true when the
> per-cpu rt_spin_lock is locked by the current task. In this case
> re-entrance into the same kmalloc bucket is unsafe, and kmalloc_nolock()
> tries a different bucket that is most likely not locked by the current
> task. Though it may be locked by a different task, it's safe to
> rt_spin_lock() on it.
> 
> Similar to alloc_pages_nolock() the kmalloc_nolock() returns NULL
> immediately if called from hard irq or NMI in PREEMPT_RT.
> 
> kfree_nolock() defers freeing to irq_work when local_lock_is_locked()
> and in_nmi() or in PREEMPT_RT.

Does that mean in mm/Kconfig SLUB now needs to select IRQ_WORK?
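
i.e. something along these lines in mm/Kconfig (just a sketch of what I
mean; the exact placement may differ):

config SLUB
	def_bool y
	select IRQ_WORK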

> SLUB_TINY config doesn't use local_lock_is_locked() and relies on
> spin_trylock_irqsave(&n->list_lock) to allocate while kfree_nolock()
> always defers to irq_work.
> 
> Signed-off-by: Alexei Starovoitov <ast@xxxxxxxxxx>
> ---
>  include/linux/kasan.h |  13 +-
>  include/linux/slab.h  |   4 +
>  mm/kasan/common.c     |   5 +-
>  mm/slub.c             | 330 ++++++++++++++++++++++++++++++++++++++----
>  4 files changed, 319 insertions(+), 33 deletions(-)
> 
> diff --git a/include/linux/kasan.h b/include/linux/kasan.h
> index 890011071f2b..acdc8cb0152e 100644
> --- a/include/linux/kasan.h
> +++ b/include/linux/kasan.h
> @@ -200,7 +200,7 @@ static __always_inline bool kasan_slab_pre_free(struct kmem_cache *s,
>  }
>  
>  bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init,
> -		       bool still_accessible);
> +		       bool still_accessible, bool no_quarantine);
>  /**
>   * kasan_slab_free - Poison, initialize, and quarantine a slab object.
>   * @object: Object to be freed.
> @@ -1982,6 +1983,7 @@ static inline void init_slab_obj_exts(struct slab *slab)
>  int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
>  		        gfp_t gfp, bool new_slab)
>  {
> +	bool allow_spin = gfpflags_allow_spinning(gfp);
>  	unsigned int objects = objs_per_slab(s, slab);
>  	unsigned long new_exts;
>  	unsigned long old_exts;
> @@ -1990,8 +1992,14 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
>  	gfp &= ~OBJCGS_CLEAR_MASK;
>  	/* Prevent recursive extension vector allocation */
>  	gfp |= __GFP_NO_OBJ_EXT;
> -	vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp,
> -			   slab_nid(slab));
> +	if (unlikely(!allow_spin)) {
> +		size_t sz = objects * sizeof(struct slabobj_ext);
> +
> +		vec = kmalloc_nolock(sz, __GFP_ZERO, slab_nid(slab));

Missing memset()?
as there is no kcalloc_nolock()...
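
If kmalloc_nolock() doesn't handle __GFP_ZERO itself (I haven't checked
the earlier patches in the series), I'd expect something like this
(untested sketch):

	if (unlikely(!allow_spin)) {
		size_t sz = objects * sizeof(struct slabobj_ext);

		vec = kmalloc_nolock(sz, __GFP_ZERO, slab_nid(slab));
		/* zero explicitly in case __GFP_ZERO is not honored here */
		if (vec)
			memset(vec, 0, sz);
	}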

> +	} else {
> +		vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp,
> +				   slab_nid(slab));
> +	}
>  	if (!vec) {
>  		/* Mark vectors which failed to allocate */
>  		if (new_slab)
> @@ -3911,6 +3953,12 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>  		void *flush_freelist = c->freelist;
>  		struct slab *flush_slab = c->slab;
>  
> +		if (unlikely(!allow_spin))
> +			/*
> +			 * Reentrant slub cannot take locks
> +			 * necessary for deactivate_slab()
> +			 */
> +			return NULL;
>  		c->slab = NULL;
>  		c->freelist = NULL;
>  		c->tid = next_tid(c->tid);
> @@ -4555,6 +4707,53 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
>  	discard_slab(s, slab);
>  }
>  
> +static DEFINE_PER_CPU(struct llist_head, defer_free_objects);
> +static DEFINE_PER_CPU(struct irq_work, defer_free_work);
> +
> +static void free_deferred_objects(struct irq_work *work)
> +{
> +	struct llist_head *llhead = this_cpu_ptr(&defer_free_objects);
> +	struct llist_node *llnode, *pos, *t;
> +
> +	if (llist_empty(llhead))
> +		return;
> +
> +	llnode = llist_del_all(llhead);
> +	llist_for_each_safe(pos, t, llnode) {
> +		struct kmem_cache *s;
> +		struct slab *slab;
> +		void *x = pos;
> +
> +		slab = virt_to_slab(x);
> +		s = slab->slab_cache;
> +
> +		/*
> +		 * memcg, kasan_slab_pre are already done for 'x'.
> +		 * The only thing left is kasan_poison.
> +		 */
> +		kasan_slab_free(s, x, false, false, true);
> +		__slab_free(s, slab, x, x, 1, _THIS_IP_);
> +	}
> +}
> +
> +static void defer_free(void *head)
> +{
> +	if (llist_add(head, this_cpu_ptr(&defer_free_objects)))
> +		irq_work_queue(this_cpu_ptr(&defer_free_work));

By adding the object to the lockless list, llist_add() overwrites the
first word of the freed object with the ->next pointer, and that is not
always safe.

Looking at calculate_sizes():

        if (((flags & SLAB_TYPESAFE_BY_RCU) && !args->use_freeptr_offset) ||    
            (flags & SLAB_POISON) || s->ctor ||                                 
            ((flags & SLAB_RED_ZONE) &&                                         
             (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) {    
                /*                                                              
                 * Relocate free pointer after the object if it is not          
                 * permitted to overwrite the first word of the object on           
                 * kmem_cache_free.                                             
                 *                                                              
                 * This is the case if we do RCU, have a constructor or         
                 * destructor, are poisoning the objects, or are                
                 * redzoning an object smaller than sizeof(void *) or are           
                 * redzoning an object with slub_debug_orig_size() enabled,         
                 * in which case the right redzone may be extended.             
                 *                                                              
                 * The assumption that s->offset >= s->inuse means free         
                 * pointer is outside of the object is used in the              
                 * freeptr_outside_object() function. If that is no             
                 * longer true, the function needs to be modified.              
                 */                                                             
                s->offset = size;                                               
                size += sizeof(void *);  

Only the sizeof(void *) bytes at object + s->offset are always safe to
overwrite.

So either 1) teach defer_free() to link objects through s->offset
instead of offset zero (and teach free_deferred_objects() that the list
can contain objects from different caches), or 2) introduce per-cache
per-CPU lockless lists?
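
For 1), roughly something like this (untested; assumes defer_free() can
be passed the kmem_cache pointer, and that s->offset always points at
the free pointer of these objects):

static void defer_free(struct kmem_cache *s, void *head)
{
	/* link through the free pointer location, which is safe to overwrite */
	struct llist_node *node = head + s->offset;

	if (llist_add(node, this_cpu_ptr(&defer_free_objects)))
		irq_work_queue(this_cpu_ptr(&defer_free_work));
}

static void free_deferred_objects(struct irq_work *work)
{
	struct llist_node *llnode = llist_del_all(this_cpu_ptr(&defer_free_objects));
	struct llist_node *pos, *t;

	llist_for_each_safe(pos, t, llnode) {
		struct slab *slab = virt_to_slab(pos);
		struct kmem_cache *s = slab->slab_cache;
		/* undo the s->offset shift applied in defer_free() */
		void *x = (void *)pos - s->offset;

		kasan_slab_free(s, x, false, false, true);
		__slab_free(s, slab, x, x, 1, _THIS_IP_);
	}
}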

-- 
Cheers,
Harry / Hyeonggon



