Jason Xing wrote: > From: Jason Xing <kernelxing@xxxxxxxxxxx> > > The patch does the following things: > - Add XDP_MAX_TX_BUDGET socket option. > - Unify TX_BATCH_SIZE and MAX_PER_SOCKET_BUDGET into a single > max_tx_budget. > - max_tx_budget is set to 32 by default in the initialization phase. > It's a per-socket granular control. > > The idea behind this comes out of real workloads in production. We use a > user-level stack with xsk support to accelerate sending packets and > minimize triggering syscalls. When the packets are aggregated, it's not > hard to hit the upper bound (namely, 32). The moment the user-space stack > fetches the -EAGAIN error number passed from sendto(), it will loop to try > again until all the expected descs from tx ring are sent out to the driver. > Enlarging the XDP_MAX_TX_BUDGET value results in fewer calls to > sendto(). Besides, applications leveraging this setsockopt can adjust > the value in time after noticing that the upper bound is being hit. > > Signed-off-by: Jason Xing <kernelxing@xxxxxxxxxxx> > --- > V3 > Link: https://lore.kernel.org/all/20250618065553.96822-1-kerneljasonxing@xxxxxxxxx/ > 1. use a per-socket control (suggested by Stanislav) > 2. unify both definitions into one > 3. support setsockopt and getsockopt > 4. add more description in commit message +1 on doing this with an XSK setsockopt only. > > V2 > Link: https://lore.kernel.org/all/20250617002236.30557-1-kerneljasonxing@xxxxxxxxx/ > 1. use a per-netns sysctl knob > 2. use sysctl_xsk_max_tx_budget to unify both definitions. 
> --- > include/net/xdp_sock.h | 3 ++- > include/uapi/linux/if_xdp.h | 1 + > net/xdp/xsk.c | 36 +++++++++++++++++++++++++------ > tools/include/uapi/linux/if_xdp.h | 1 + > 4 files changed, 34 insertions(+), 7 deletions(-) > > diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h > index e8bd6ddb7b12..8eecafad92c0 100644 > --- a/include/net/xdp_sock.h > +++ b/include/net/xdp_sock.h > @@ -65,11 +65,12 @@ struct xdp_sock { > struct xsk_queue *tx ____cacheline_aligned_in_smp; > struct list_head tx_list; > /* record the number of tx descriptors sent by this xsk and > - * when it exceeds MAX_PER_SOCKET_BUDGET, an opportunity needs > + * when it exceeds max_tx_budget, an opportunity needs > * to be given to other xsks for sending tx descriptors, thereby > * preventing other XSKs from being starved. > */ > u32 tx_budget_spent; > + u32 max_tx_budget; This probably does not need to be a u32? It does fit in an existing hole, though. Is it also on a warm cacheline wherever it is touched in the hot path? 
> > /* Statistics */ > u64 rx_dropped; > diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h > index 44f2bb93e7e6..07c6d21c2f1c 100644 > --- a/include/uapi/linux/if_xdp.h > +++ b/include/uapi/linux/if_xdp.h > @@ -79,6 +79,7 @@ struct xdp_mmap_offsets { > #define XDP_UMEM_COMPLETION_RING 6 > #define XDP_STATISTICS 7 > #define XDP_OPTIONS 8 > +#define XDP_MAX_TX_BUDGET 9 > > struct xdp_umem_reg { > __u64 addr; /* Start of packet data area */ > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c > index 72c000c0ae5f..7c47f665e9d1 100644 > --- a/net/xdp/xsk.c > +++ b/net/xdp/xsk.c > @@ -33,9 +33,6 @@ > #include "xdp_umem.h" > #include "xsk.h" > > -#define TX_BATCH_SIZE 32 > -#define MAX_PER_SOCKET_BUDGET (TX_BATCH_SIZE) > - > void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) > { > if (pool->cached_need_wakeup & XDP_WAKEUP_RX) > @@ -424,7 +421,9 @@ bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) > rcu_read_lock(); > again: > list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { > - if (xs->tx_budget_spent >= MAX_PER_SOCKET_BUDGET) { > + int max_budget = READ_ONCE(xs->max_tx_budget); > + > + if (xs->tx_budget_spent >= max_budget) { > budget_exhausted = true; > continue; > } > @@ -779,7 +778,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, > static int __xsk_generic_xmit(struct sock *sk) > { > struct xdp_sock *xs = xdp_sk(sk); > - u32 max_batch = TX_BATCH_SIZE; > + u32 max_budget = READ_ONCE(xs->max_tx_budget); > bool sent_frame = false; > struct xdp_desc desc; > struct sk_buff *skb; > @@ -797,7 +796,7 @@ static int __xsk_generic_xmit(struct sock *sk) > goto out; > > while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) { > - if (max_batch-- == 0) { > + if (max_budget-- == 0) { > err = -EAGAIN; > goto out; > } > @@ -1437,6 +1436,18 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, > mutex_unlock(&xs->mutex); > return err; > } > + case XDP_MAX_TX_BUDGET: > + { > + unsigned int budget; > + > 
+ if (optlen < sizeof(budget)) > + return -EINVAL; > + if (copy_from_sockptr(&budget, optval, sizeof(budget))) > + return -EFAULT; > + > + WRITE_ONCE(xs->max_tx_budget, budget); Please sanitize the input: add a bounds check before storing the value. For example, a budget of 0 would make __xsk_generic_xmit() return -EAGAIN on the very first descriptor. > + return 0; > + } > default: > break; > } > @@ -1588,6 +1599,18 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname, > > return 0; > } > + case XDP_MAX_TX_BUDGET: > + { > + unsigned int budget = READ_ONCE(xs->max_tx_budget); > + > + if (copy_to_user(optval, &budget, sizeof(budget))) > + return -EFAULT; > + if (put_user(sizeof(budget), optlen)) > + return -EFAULT; > + > + return 0; > + } > + > default: > break; > } > @@ -1734,6 +1757,7 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, > > xs = xdp_sk(sk); > xs->state = XSK_READY; > + xs->max_tx_budget = 32; > mutex_init(&xs->mutex); > > INIT_LIST_HEAD(&xs->map_list); > diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h > index 44f2bb93e7e6..07c6d21c2f1c 100644 > --- a/tools/include/uapi/linux/if_xdp.h > +++ b/tools/include/uapi/linux/if_xdp.h > @@ -79,6 +79,7 @@ struct xdp_mmap_offsets { > #define XDP_UMEM_COMPLETION_RING 6 > #define XDP_STATISTICS 7 > #define XDP_OPTIONS 8 > +#define XDP_MAX_TX_BUDGET 9 > > struct xdp_umem_reg { > __u64 addr; /* Start of packet data area */ > -- > 2.43.5 >