On Tue, Aug 19, 2025 at 01:48:00PM -0700, Stanislav Fomichev wrote:
> On 08/19, Maciej Fijalkowski wrote:
> > Eryk reported an issue that I have put under Closes: tag, related to
> > umem addrs being prematurely produced onto pool's completion queue.
> > Let us make the skb's destructor responsible for producing all addrs
> > that given skb used.
> > 
> > Introduce struct xsk_addrs which will carry descriptor count with array
> > of addresses taken from processed descriptors that will be carried via
> > skb_shared_info::destructor_arg. This way we can refer to it within
> > xsk_destruct_skb(). In order to mitigate the overhead that will be
> > coming from memory allocations, let us introduce kmem_cache of
> > xsk_addrs. There will be a single kmem_cache for xsk generic xmit on the
> > system.
> > 
> > Commit from fixes tag introduced the buggy behavior, it was not broken
> > from day 1, but rather when xsk multi-buffer got introduced.
> > 
> > Fixes: b7f72a30e9ac ("xsk: introduce wrappers and helpers for supporting multi-buffer in Tx path")
> > Reported-by: Eryk Kubanski <e.kubanski@xxxxxxxxxxxxxxxxxxx>
> > Closes: https://lore.kernel.org/netdev/20250530103456.53564-1-e.kubanski@xxxxxxxxxxxxxxxxxxx/
> > Acked-by: Magnus Karlsson <magnus.karlsson@xxxxxxxxx>
> > Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@xxxxxxxxx>
> > ---
> > 
> > v1:
> > https://lore.kernel.org/bpf/20250702101648.1942562-1-maciej.fijalkowski@xxxxxxxxx/
> > v2:
> > https://lore.kernel.org/bpf/20250705135512.1963216-1-maciej.fijalkowski@xxxxxxxxx/
> > v3:
> > https://lore.kernel.org/bpf/20250806154127.2161434-1-maciej.fijalkowski@xxxxxxxxx/
> > v4:
> > https://lore.kernel.org/bpf/20250813171210.2205259-1-maciej.fijalkowski@xxxxxxxxx/
> > 
> > v1->v2:
> > * store addrs in array carried via destructor_arg instead having them
> >   stored in skb headroom; cleaner and less hacky approach;
> > v2->v3:
> > * use kmem_cache for xsk_addrs allocation (Stan/Olek)
> > * set err when xsk_addrs allocation fails (Dan)
> > * change xsk_addrs layout to avoid holes
> > * free xsk_addrs on error path
> > * rebase
> > v3->v4:
> > * have kmem_cache as percpu vars
> > * don't drop unnecessary braces (unrelated) (Stan)
> > * use idx + i in xskq_prod_write_addr (Stan)
> > * alloc kmem_cache on bind (Stan)
> > * keep num_descs as first member in xsk_addrs (Magnus)
> > * add ack from Magnus
> > v4->v5:
> > * have a single kmem_cache per xsk subsystem (Stan)
> > 
> > ---
> >  net/xdp/xsk.c       | 91 +++++++++++++++++++++++++++++++++++++--------
> >  net/xdp/xsk_queue.h | 12 ++++++
> >  2 files changed, 87 insertions(+), 16 deletions(-)
> > 
> > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> > index 9c3acecc14b1..012991de9df2 100644
> > --- a/net/xdp/xsk.c
> > +++ b/net/xdp/xsk.c
> > @@ -36,6 +36,13 @@
> >  #define TX_BATCH_SIZE 32
> >  #define MAX_PER_SOCKET_BUDGET 32
> >  
> > +struct xsk_addrs {
> > +        u32 num_descs;
> > +        u64 addrs[MAX_SKB_FRAGS + 1];
> > +};
> > +
> > +static struct kmem_cache *xsk_tx_generic_cache;
> > +
> >  void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
> >  {
> >          if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
> > @@ -532,25 +539,39 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
> >          return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
> >  }
> >  
> > -static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr)
> > +static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
> >  {
> >          unsigned long flags;
> >          int ret;
> >  
> >          spin_lock_irqsave(&pool->cq_lock, flags);
> > -        ret = xskq_prod_reserve_addr(pool->cq, addr);
> > +        ret = xskq_prod_reserve(pool->cq);
> >          spin_unlock_irqrestore(&pool->cq_lock, flags);
> >  
> >          return ret;
> >  }
> >  
> > -static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n)
> > +static void xsk_cq_submit_addr_locked(struct xdp_sock *xs,
> > +                                      struct sk_buff *skb)
> >  {
> > +        struct xsk_buff_pool *pool = xs->pool;
> > +        struct xsk_addrs *xsk_addrs;
> >          unsigned long flags;
> > +        u32 num_desc, i;
> > +        u32 idx;
> > +
> > +        xsk_addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > +        num_desc = xsk_addrs->num_descs;
> >  
> >          spin_lock_irqsave(&pool->cq_lock, flags);
> > -        xskq_prod_submit_n(pool->cq, n);
> > +        idx = xskq_get_prod(pool->cq);
> > +
> > +        for (i = 0; i < num_desc; i++)
> > +                xskq_prod_write_addr(pool->cq, idx + i, xsk_addrs->addrs[i]);
> > +        xskq_prod_submit_n(pool->cq, num_desc);
> > +
> >          spin_unlock_irqrestore(&pool->cq_lock, flags);
> > +        kmem_cache_free(xsk_tx_generic_cache, xsk_addrs);
> >  }
> >  
> >  static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> > @@ -562,11 +583,6 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
> >          spin_unlock_irqrestore(&pool->cq_lock, flags);
> >  }
> >  
> > -static u32 xsk_get_num_desc(struct sk_buff *skb)
> > -{
> > -        return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
> > -}
> > -
> >  static void xsk_destruct_skb(struct sk_buff *skb)
> >  {
> >          struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
> > @@ -576,21 +592,37 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> >                  *compl->tx_timestamp = ktime_get_tai_fast_ns();
> >          }
> >  
> > -        xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb));
> > +        xsk_cq_submit_addr_locked(xdp_sk(skb->sk), skb);
> >          sock_wfree(skb);
> >  }
> >  
> > -static void xsk_set_destructor_arg(struct sk_buff *skb)
> > +static u32 xsk_get_num_desc(struct sk_buff *skb)
> >  {
> > -        long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
> > +        struct xsk_addrs *addrs;
> >  
> > -        skb_shinfo(skb)->destructor_arg = (void *)num;
> > +        addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > +        return addrs->num_descs;
> > +}
> > +
> > +static void xsk_set_destructor_arg(struct sk_buff *skb, struct xsk_addrs *addrs)
> > +{
> > +        skb_shinfo(skb)->destructor_arg = (void *)addrs;
> > +}
> > +
> > +static void xsk_inc_skb_descs(struct sk_buff *skb)
> > +{
> > +        struct xsk_addrs *addrs;
> > +
> > +        addrs = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
> > +        addrs->num_descs++;
> >  }
> >  
> >  static void xsk_consume_skb(struct sk_buff *skb)
> >  {
> >          struct xdp_sock *xs = xdp_sk(skb->sk);
> >  
> > +        kmem_cache_free(xsk_tx_generic_cache,
> > +                        (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg);
> >          skb->destructor = sock_wfree;
> >          xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb));
> >          /* Free skb without triggering the perf drop trace */
> > @@ -609,6 +641,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> >  {
> >          struct xsk_buff_pool *pool = xs->pool;
> >          u32 hr, len, ts, offset, copy, copied;
> > +        struct xsk_addrs *addrs = NULL;
> >          struct sk_buff *skb = xs->skb;
> >          struct page *page;
> >          void *buffer;
> > @@ -623,6 +656,12 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> >                          return ERR_PTR(err);
> >  
> >                  skb_reserve(skb, hr);
> > +
> > +                addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
> > +                if (!addrs)
> > +                        return ERR_PTR(-ENOMEM);
> 
> Do we need to kfree the skb that we allocated on line 621 above? (maybe
> not because I always get confused by the mb/overflow handling here)

Awesome catches Stan. I'm fed up with these changes and I managed to
introduce the two bugs you're pointing out :)

My reasoning was that even if we return the errno instead of the skb, I
assumed that the branch below:

free_err:
        if (first_frag && skb)
                kfree_skb(skb);

will be taken, but the thing is we don't set @first_frag = true for
xsk_build_skb_zerocopy(). I will explicitly free the skb where you're
suggesting and, of course, unregister the netdev notifier.

Thanks!

> 
> > +
> > +        xsk_set_destructor_arg(skb, addrs);
> >          }
> 
> (...)
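For anyone following along, a minimal sketch of the explicit free
discussed above could look like the snippet below. This is only an
assumption of how the respin might handle the failure path, not the
actual v6 change; it assumes the skb in question is the one allocated
by sock_alloc_send_skb() a few lines earlier in
xsk_build_skb_zerocopy():

                addrs = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
                if (!addrs) {
                        /* Hypothetical illustration: first_frag is never set
                         * for xsk_build_skb_zerocopy(), so the shared
                         * free_err path would not free this skb; drop it
                         * here before bailing out.
                         */
                        kfree_skb(skb);
                        return ERR_PTR(-ENOMEM);
                }

The exact shape (e.g. jumping to a common error label instead of freeing
inline) is of course up to the respin.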