On Tue, Aug 19, 2025 at 01:48:00PM -0700, Stanislav Fomichev wrote:
> On 08/19, Maciej Fijalkowski wrote:
> > Eryk reported an issue that I have put under Closes: tag, related to
> > umem addrs being prematurely produced onto pool's completion queue.
> > Let us make the skb's destructor responsible for producing all addrs
> > that given skb used.
> > 
> > Introduce struct xsk_addrs which will carry descriptor count with array
> > of addresses taken from processed descriptors that will be carried via
> > skb_shared_info::destructor_arg. This way we can refer to it within
> > xsk_destruct_skb(). In order to mitigate the overhead that will be
> > coming from memory allocations, let us introduce kmem_cache of
> > xsk_addrs. There will be a single kmem_cache for xsk generic xmit on the
> > system.
> > 
> > Commit from fixes tag introduced the buggy behavior, it was not broken
> > from day 1, but rather when xsk multi-buffer got introduced.
> > 
> > Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path")
> > Reported-by: Eryk Kubanski <e.kubanski@xxxxxxxxxxxxxxxxxxx>
> > Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@xxxxxxxxxxxxxxxxxxx/
> > Acked-by: Magnus Karlsson <magnus.karlsson@xxxxxxxxx>
> > Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@xxxxxxxxx>
> > ---
> > 
> > v1:
> > https://lore.kernel.org/bpf/20250702101648.1942562-1-maciej.fijalkowski@xxxxxxxxx/
> > v2:
> > https://lore.kernel.org/bpf/20250705135512.1963216-1-maciej.fijalkowski@xxxxxxxxx/
> > v3:
> > https://lore.kernel.org/bpf/20250806154127.2161434-1-maciej.fijalkowski@xxxxxxxxx/
> > v4:
> > https://lore.kernel.org/bpf/20250813171210.2205259-1-maciej.fijalkowski@xxxxxxxxx/
> > 
> > v1->v2:
> > * store addrs in array carried via destructor_arg instead having them
> >   stored in skb headroom; cleaner and less hacky approach;
> > v2->v3:
> > * use kmem_cache for xsk_addrs allocation (Stan/Olek)
> > * set err when xsk_addrs allocation fails (Dan)
> > * change xsk_addrs layout to avoid holes
> > * free xsk_addrs on error path
> > * rebase
> > v3->v4:
> > * have kmem_cache as percpu vars
> > * don't drop unnecessary braces (unrelated) (Stan)
> > * use idx + i in xskq_prod_write_addr (Stan)
> > * alloc kmem_cache on bind (Stan)
> > * keep num_descs as first member in xsk_addrs (Magnus)
> > * add ack from Magnus
> > v4->v5:
> > * have a single kmem_cache per xsk subsystem (Stan)
> > 
> > ---
> >  net/xdp/xsk.c       | 91 +++++++++++++++++++++++++++++++++++++--------
> >  net/xdp/xsk_queue.h | 12 ++++++
> >  2 files changed, 87 insertions(+), 16 deletions(-)
> > 
> > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> > index 9c3acecc14b1..012991de9df2 100644
> > --- a/net/xdp/xsk.c
> > +++ b/net/xdp/xsk.c
> > @@ -36,6 +36,13 @@
> >  #define TX_BATCH_SIZE 32
> >  #define MAX_PER_SOCKET_BUDGET 32
> >  
> > +struct xsk_addrs {
> > +        u32 num_descs;
> > +        u64 addrs[MAX_SKB_FRAGS + 1];
> > +};
> > +
> > +static struct kmem_cache *xsk_tx_generic_cache;
> > +
> >  void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
> >  {
> >          if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
> > @@ -532,25 +539,39 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
> >          return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
> >  }
> >  
> > -static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr)
> > +static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
> >  {
> >          unsigned long flags;
> >          int ret;
> >  
> >          spin_lock_irqsave(&pool->cq_lock, flags);
> > -        ret = xskq_prod_reserve_addr(pool->cq, addr);
> > +        ret = xskq_prod_reserve(pool->cq);
> >          spin_unlock_irqrestore(&pool->cq_lock, flags);
> >  
> >          return ret;
> >  }
> >  
> > -static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n)
> > +static void xsk_cq_submit_addr_locked(struct xdp_sock *xs,
> > +                                      struct sk_buff *skb)
> >  {
> > +        struct xsk_buff_pool *pool = xs->pool;
> > +        struct xsk_addrs *xsk_addrs;
> >          unsigned long flags;
> > +        u32 num_desc, i;
> > +        u32 idx;
> > +
> > +        xsk_addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > +        num_desc = xsk_addrs->num_descs;
> >  
> >          spin_lock_irqsave(&pool->cq_lock, flags);
> > -        xskq_prod_submit_n(pool->cq, n);
> > +        idx = xskq_get_prod(pool->cq);
> > +
> > +        for (i = 0; i < num_desc; i++)
> > +                xskq_prod_write_addr(pool->cq, idx + i, xsk_addrs->addrs[i]);
> > +        xskq_prod_submit_n(pool->cq, num_desc);
> > +
> >          spin_unlock_irqrestore(&pool->cq_lock, flags);
> > +        kmem_cache_free(xsk_tx_generic_cache, xsk_addrs);
> >  }
> >  
> >  static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> > @@ -562,11 +583,6 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> >          spin_unlock_irqrestore(&pool->cq_lock, flags);
> >  }
> >  
> > -static u32 xsk_get_num_desc(struct sk_buff *skb)
> > -{
> > -        return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
> > -}
> > -
> >  static void xsk_destruct_skb(struct sk_buff *skb)
> >  {
> >          struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
> > @@ -576,21 +592,37 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> >                  *compl->tx_timestamp = ktime_get_tai_fast_ns();
> >          }
> >  
> > -        xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb));
> > +        xsk_cq_submit_addr_locked(xdp_sk(skb->sk), skb);
> >          sock_wfree(skb);
> >  }
> >  
> > -static void xsk_set_destructor_arg(struct sk_buff *skb)
> > +static u32 xsk_get_num_desc(struct sk_buff *skb)
> >  {
> > -        long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
> > +        struct xsk_addrs *addrs;
> >  
> > -        skb_shinfo(skb)->destructor_arg = (void *)num;
> > +        addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > +        return addrs->num_descs;
> > +}
> > +
> > +static void xsk_set_destructor_arg(struct sk_buff *skb, struct xsk_addrs *addrs)
> > +{
> > +        skb_shinfo(skb)->destructor_arg = (void *)addrs;
> > +}
> > +
> > +static void xsk_inc_skb_descs(struct sk_buff *skb)
> > +{
> > +        struct xsk_addrs *addrs;
> > +
> > +        addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > +        addrs->num_descs++;
> >  }
> >  
> >  static void xsk_consume_skb(struct sk_buff *skb)
> >  {
> >          struct xdp_sock *xs = xdp_sk(skb->sk);
> >  
> > +        kmem_cache_free(xsk_tx_generic_cache,
> > +                        (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg);
> >          skb->destructor = sock_wfree;
> >          xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb));
> >          /* Free skb without triggering the perf drop trace */
> > @@ -609,6 +641,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> >  {
> >          struct xsk_buff_pool *pool = xs->pool;
> >          u32 hr, len, ts, offset, copy, copied;
> > +        struct xsk_addrs *addrs = NULL;
> >          struct sk_buff *skb = xs->skb;
> >          struct page *page;
> >          void *buffer;
> > @@ -623,6 +656,12 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> >                          return ERR_PTR(err);
> >  
> >                  skb_reserve(skb, hr);
> > +
> > +                addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> > +                if (!addrs)
> > +                        return ERR_PTR(-ENOMEM);
> 
> Do we need to kfree the skb that we allocated on line 621 above? (maybe
> not because I always get confused by the mb/overflow handling here)

Awesome catches Stan. I'm fed up with these changes and I managed to
introduce the two bugs you're pointing out :)

My reasoning was that even if we return the errno instead of the skb, I
assumed that the branch below:

free_err:
        if (first_frag && skb)
                kfree_skb(skb);

will be taken, but the thing is we don't set @first_frag = true for
xsk_build_skb_zerocopy(). I will explicitly free the skb where you're
suggesting and, of course, unregister the netdev notifier.

Thanks!

> 
> > +
> > +        xsk_set_destructor_arg(skb, addrs);
> >          }
> 
> (...)
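For anyone following along, a minimal sketch of the explicit free
discussed above could look like the snippet below. This is only an
assumption of how the respin might handle the failure path, not the
actual v6 change; it assumes the skb in question is the one allocated
by sock_alloc_send_skb() a few lines earlier in
xsk_build_skb_zerocopy():

                addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
                if (!addrs) {
                        /* Hypothetical illustration: first_frag is never set
                         * for xsk_build_skb_zerocopy(), so the shared
                         * free_err path would not free this skb; drop it
                         * here before bailing out.
                         */
                        kfree_skb(skb);
                        return ERR_PTR(-ENOMEM);
                }

The exact shape (e.g. jumping to a common error label instead of freeing
inline) is of course up to the respin.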