On Thu, May 22, 2025 at 2:46 PM Tariq Toukan <tariqt@xxxxxxxxxx> wrote:
>
> From: Saeed Mahameed <saeedm@xxxxxxxxxx>
>
> mlx5e_page_frag holds the physical page itself, to naturally support
> zc page pools, remove physical page reference from mlx5 and replace it
> with netmem_ref, to avoid internal handling in mlx5 for net_iov backed
> pages.
>
> SHAMPO can issue packets that are not split into header and data. These
> packets will be dropped if the data part resides in a net_iov as the
> driver can't read into this area.
>
> No performance degradation observed.
>
> Signed-off-by: Saeed Mahameed <saeedm@xxxxxxxxxx>
> Signed-off-by: Dragos Tatulea <dtatulea@xxxxxxxxxx>
> Signed-off-by: Cosmin Ratiu <cratiu@xxxxxxxxxx>
> Signed-off-by: Tariq Toukan <tariqt@xxxxxxxxxx>
> ---
>  drivers/net/ethernet/mellanox/mlx5/core/en.h |   2 +-
>  .../net/ethernet/mellanox/mlx5/core/en_rx.c  | 103 ++++++++++--------
>  2 files changed, 61 insertions(+), 44 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
> index c329de1d4f0a..65a73913b9a2 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
> @@ -553,7 +553,7 @@ struct mlx5e_icosq {
>  } ____cacheline_aligned_in_smp;
>
>  struct mlx5e_frag_page {

nit: maybe this should be renamed now to mlx5e_frag_netmem. Up to you.
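If you do go that way, I was only thinking of a trivial rename, something
like this (just a sketch, same fields as in this patch, only the struct
name changes):

struct mlx5e_frag_netmem {
        netmem_ref netmem;
        u16 frags;
};

No functional change intended, so feel free to skip it if the churn isn't
worth it.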
> -     struct page *page;
> +     netmem_ref netmem;
>       u16 frags;
>  };
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
> index e34ef53ebd0e..75e753adedef 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
> @@ -273,33 +273,33 @@ static inline u32 mlx5e_decompress_cqes_start(struct mlx5e_rq *rq,
>
>  #define MLX5E_PAGECNT_BIAS_MAX (PAGE_SIZE / 64)
>
> -static int mlx5e_page_alloc_fragmented(struct page_pool *pool,
> +static int mlx5e_page_alloc_fragmented(struct page_pool *pp,
>                                         struct mlx5e_frag_page *frag_page)
>  {
> -       struct page *page;
> +       netmem_ref netmem = page_pool_alloc_netmems(pp,
> +                                                   GFP_ATOMIC | __GFP_NOWARN);
>
> -       page = page_pool_dev_alloc_pages(pool);
> -       if (unlikely(!page))
> +       if (unlikely(!netmem))
>                 return -ENOMEM;
>
> -       page_pool_fragment_page(page, MLX5E_PAGECNT_BIAS_MAX);
> +       page_pool_fragment_netmem(netmem, MLX5E_PAGECNT_BIAS_MAX);
>
>         *frag_page = (struct mlx5e_frag_page) {
> -               .page   = page,
> +               .netmem = netmem,
>                 .frags  = 0,
>         };
>
>         return 0;
>  }
>
> -static void mlx5e_page_release_fragmented(struct page_pool *pool,
> +static void mlx5e_page_release_fragmented(struct page_pool *pp,
>                                            struct mlx5e_frag_page *frag_page)
>  {
>         u16 drain_count = MLX5E_PAGECNT_BIAS_MAX - frag_page->frags;
> -       struct page *page = frag_page->page;
> +       netmem_ref netmem = frag_page->netmem;
>
> -       if (page_pool_unref_page(page, drain_count) == 0)
> -               page_pool_put_unrefed_page(pool, page, -1, true);
> +       if (page_pool_unref_netmem(netmem, drain_count) == 0)
> +               page_pool_put_unrefed_netmem(pp, netmem, -1, true);
>  }
>
>  static inline int mlx5e_get_rx_frag(struct mlx5e_rq *rq,
> @@ -359,7 +359,7 @@ static int mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq, struct mlx5e_rx_wqe_cyc *wqe,
>                 frag->flags &= ~BIT(MLX5E_WQE_FRAG_SKIP_RELEASE);
>
>                 headroom = i == 0 ? rq->buff.headroom : 0;
> -               addr = page_pool_get_dma_addr(frag->frag_page->page);
> +               addr = page_pool_get_dma_addr_netmem(frag->frag_page->netmem);
>                 wqe->data[i].addr = cpu_to_be64(addr + frag->offset + headroom);
>         }
>
> @@ -500,9 +500,10 @@ mlx5e_add_skb_shared_info_frag(struct mlx5e_rq *rq, struct skb_shared_info *sinf
>                                struct xdp_buff *xdp, struct mlx5e_frag_page *frag_page,
>                                u32 frag_offset, u32 len)
>  {
> +       netmem_ref netmem = frag_page->netmem;
>         skb_frag_t *frag;
>
> -       dma_addr_t addr = page_pool_get_dma_addr(frag_page->page);
> +       dma_addr_t addr = page_pool_get_dma_addr_netmem(netmem);
>
>         dma_sync_single_for_cpu(rq->pdev, addr + frag_offset, len, rq->buff.map_dir);
>         if (!xdp_buff_has_frags(xdp)) {
> @@ -515,9 +516,9 @@ mlx5e_add_skb_shared_info_frag(struct mlx5e_rq *rq, struct skb_shared_info *sinf
>         }
>
>         frag = &sinfo->frags[sinfo->nr_frags++];
> -       skb_frag_fill_page_desc(frag, frag_page->page, frag_offset, len);
> +       skb_frag_fill_netmem_desc(frag, netmem, frag_offset, len);
>
> -       if (page_is_pfmemalloc(frag_page->page))
> +       if (!netmem_is_net_iov(netmem) && netmem_is_pfmemalloc(netmem))

nit: unnecessary net_iov check AFAICT?

>                 xdp_buff_set_frag_pfmemalloc(xdp);
>         sinfo->xdp_frags_size += len;
>  }
> @@ -528,27 +529,29 @@ mlx5e_add_skb_frag(struct mlx5e_rq *rq, struct sk_buff *skb,
>                    u32 frag_offset, u32 len,
>                    unsigned int truesize)
>  {
> -       dma_addr_t addr = page_pool_get_dma_addr(frag_page->page);
> +       dma_addr_t addr = page_pool_get_dma_addr_netmem(frag_page->netmem);
>         u8 next_frag = skb_shinfo(skb)->nr_frags;
> +       netmem_ref netmem = frag_page->netmem;
>
>         dma_sync_single_for_cpu(rq->pdev, addr + frag_offset, len,
>                                 rq->buff.map_dir);
>
> -       if (skb_can_coalesce(skb, next_frag, frag_page->page, frag_offset)) {
> +       if (skb_can_coalesce_netmem(skb, next_frag, netmem, frag_offset)) {
>                 skb_coalesce_rx_frag(skb, next_frag - 1, len, truesize);
> -       } else {
> -               frag_page->frags++;
> -               skb_add_rx_frag(skb, next_frag, frag_page->page,
> -                               frag_offset, len, truesize);
> +               return;
>         }
> +
> +       frag_page->frags++;
> +       skb_add_rx_frag_netmem(skb, next_frag, netmem,
> +                              frag_offset, len, truesize);
>  }
>
>  static inline void
>  mlx5e_copy_skb_header(struct mlx5e_rq *rq, struct sk_buff *skb,
> -                     struct page *page, dma_addr_t addr,
> +                     netmem_ref netmem, dma_addr_t addr,
>                       int offset_from, int dma_offset, u32 headlen)
>  {
> -       const void *from = page_address(page) + offset_from;
> +       const void *from = netmem_address(netmem) + offset_from;

I think this needs a check that netmem_address != NULL and safe error
handling in case it is NULL? If the netmem is unreadable, netmem_address
will return NULL, and because you add offset_from to it, you can't NULL
check from as well.
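Roughly what I have in mind, assuming the rest of this helper stays as it
is today (untested sketch; whether you bail out silently, warn, or push
the error handling out to the callers is up to you):

static inline void
mlx5e_copy_skb_header(struct mlx5e_rq *rq, struct sk_buff *skb,
                      netmem_ref netmem, dma_addr_t addr,
                      int offset_from, int dma_offset, u32 headlen)
{
        void *netmem_va = netmem_address(netmem);
        const void *from;
        /* Aligning len to sizeof(long) optimizes memcpy performance */
        unsigned int len = ALIGN(headlen, sizeof(long));

        /* Unreadable (net_iov backed) netmem has no kernel mapping, so
         * netmem_address() returns NULL. Check before adding offset_from,
         * otherwise 'from' can no longer be NULL-checked.
         */
        if (unlikely(!netmem_va))
                return;

        from = netmem_va + offset_from;
        dma_sync_single_for_cpu(rq->pdev, addr + dma_offset, len,
                                rq->buff.map_dir);
        skb_copy_to_linear_data(skb, from, len);
}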
>         /* Aligning len to sizeof(long) optimizes memcpy performance */
>         unsigned int len = ALIGN(headlen, sizeof(long));
>
> @@ -685,7 +688,7 @@ static int mlx5e_build_shampo_hd_umr(struct mlx5e_rq *rq,
>                 if (unlikely(err))
>                         goto err_unmap;
>
> -               addr = page_pool_get_dma_addr(frag_page->page);
> +               addr = page_pool_get_dma_addr_netmem(frag_page->netmem);
>
>                 for (int j = 0; j < MLX5E_SHAMPO_WQ_HEADER_PER_PAGE; j++) {
>                         header_offset = mlx5e_shampo_hd_offset(index++);
> @@ -796,7 +799,8 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
>                 err = mlx5e_page_alloc_fragmented(rq->page_pool, frag_page);
>                 if (unlikely(err))
>                         goto err_unmap;
> -               addr = page_pool_get_dma_addr(frag_page->page);
> +
> +               addr = page_pool_get_dma_addr_netmem(frag_page->netmem);
>                 umr_wqe->inline_mtts[i] = (struct mlx5_mtt) {
>                         .ptag = cpu_to_be64(addr | MLX5_EN_WR),
>                 };
> @@ -1216,7 +1220,7 @@ static void *mlx5e_shampo_get_packet_hd(struct mlx5e_rq *rq, u16 header_index)
>         struct mlx5e_frag_page *frag_page = mlx5e_shampo_hd_to_frag_page(rq, header_index);
>         u16 head_offset = mlx5e_shampo_hd_offset(header_index) + rq->buff.headroom;
>
> -       return page_address(frag_page->page) + head_offset;
> +       return netmem_address(frag_page->netmem) + head_offset;

Similar concern here. netmem_address may be NULL, especially when you
enable unreadable netmem support in the later patches. There are a
couple of call sites below as well.
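For mlx5e_shampo_get_packet_hd specifically, something along these lines
would keep the failure visible to the callers (sketch only, on top of
this patch; the callers would then have to tolerate a NULL return):

static void *mlx5e_shampo_get_packet_hd(struct mlx5e_rq *rq, u16 header_index)
{
        struct mlx5e_frag_page *frag_page = mlx5e_shampo_hd_to_frag_page(rq, header_index);
        u16 head_offset = mlx5e_shampo_hd_offset(header_index) + rq->buff.headroom;
        void *va = netmem_address(frag_page->netmem);

        /* SHAMPO headers are expected to sit in readable host memory,
         * but keep the NULL case explicit instead of returning a bogus
         * pointer with head_offset added to NULL.
         */
        if (unlikely(!va))
                return NULL;

        return va + head_offset;
}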
>  }
>
>  static void mlx5e_shampo_update_ipv4_udp_hdr(struct mlx5e_rq *rq, struct iphdr *ipv4)
> @@ -1677,11 +1681,11 @@ mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi,
>         dma_addr_t addr;
>         u32 frag_size;
>
> -       va = page_address(frag_page->page) + wi->offset;
> +       va = netmem_address(frag_page->netmem) + wi->offset;
>         data = va + rx_headroom;
>         frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
>
> -       addr = page_pool_get_dma_addr(frag_page->page);
> +       addr = page_pool_get_dma_addr_netmem(frag_page->netmem);
>         dma_sync_single_range_for_cpu(rq->pdev, addr, wi->offset,
>                                       frag_size, rq->buff.map_dir);
>         net_prefetch(data);
> @@ -1731,10 +1735,10 @@ mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi
>
>         frag_page = wi->frag_page;
>
> -       va = page_address(frag_page->page) + wi->offset;
> +       va = netmem_address(frag_page->netmem) + wi->offset;
>         frag_consumed_bytes = min_t(u32, frag_info->frag_size, cqe_bcnt);
>
> -       addr = page_pool_get_dma_addr(frag_page->page);
> +       addr = page_pool_get_dma_addr_netmem(frag_page->netmem);
>         dma_sync_single_range_for_cpu(rq->pdev, addr, wi->offset,
>                                       rq->buff.frame0_sz, rq->buff.map_dir);
>         net_prefetchw(va); /* xdp_frame data area */
> @@ -2007,13 +2011,14 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
>
>         if (prog) {
>                 /* area for bpf_xdp_[store|load]_bytes */
> -               net_prefetchw(page_address(frag_page->page) + frag_offset);
> +               net_prefetchw(netmem_address(frag_page->netmem) + frag_offset);
>                 if (unlikely(mlx5e_page_alloc_fragmented(rq->page_pool,
>                                                          &wi->linear_page))) {
>                         rq->stats->buff_alloc_err++;
>                         return NULL;
>                 }
> -               va = page_address(wi->linear_page.page);
> +
> +               va = netmem_address(wi->linear_page.netmem);
>                 net_prefetchw(va); /* xdp_frame data area */
>                 linear_hr = XDP_PACKET_HEADROOM;
>                 linear_data_len = 0;
> @@ -2124,8 +2129,8 @@ mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *w
>                 while (++pagep < frag_page);
>         }
>         /* copy header */
> -       addr = page_pool_get_dma_addr(head_page->page);
> -       mlx5e_copy_skb_header(rq, skb, head_page->page, addr,
> +       addr = page_pool_get_dma_addr_netmem(head_page->netmem);
> +       mlx5e_copy_skb_header(rq, skb, head_page->netmem, addr,
>                               head_offset, head_offset, headlen);
>         /* skb linear part was allocated with headlen and aligned to long */
>         skb->tail += headlen;
> @@ -2155,11 +2160,11 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
>                 return NULL;
>         }
>
> -       va = page_address(frag_page->page) + head_offset;
> +       va = netmem_address(frag_page->netmem) + head_offset;
>         data = va + rx_headroom;
>         frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
>
> -       addr = page_pool_get_dma_addr(frag_page->page);
> +       addr = page_pool_get_dma_addr_netmem(frag_page->netmem);
>         dma_sync_single_range_for_cpu(rq->pdev, addr, head_offset,
>                                       frag_size, rq->buff.map_dir);
>         net_prefetch(data);
> @@ -2198,16 +2203,19 @@ mlx5e_skb_from_cqe_shampo(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
>                           struct mlx5_cqe64 *cqe, u16 header_index)
>  {
>         struct mlx5e_frag_page *frag_page = mlx5e_shampo_hd_to_frag_page(rq, header_index);
> -       dma_addr_t page_dma_addr = page_pool_get_dma_addr(frag_page->page);
>         u16 head_offset = mlx5e_shampo_hd_offset(header_index);
> -       dma_addr_t dma_addr = page_dma_addr + head_offset;
>         u16 head_size = cqe->shampo.header_size;
>         u16 rx_headroom = rq->buff.headroom;
>         struct sk_buff *skb = NULL;
> +       dma_addr_t page_dma_addr;
> +       dma_addr_t dma_addr;
>         void *hdr, *data;
>         u32 frag_size;
>
> -       hdr = page_address(frag_page->page) + head_offset;
> +       page_dma_addr = page_pool_get_dma_addr_netmem(frag_page->netmem);
> +       dma_addr = page_dma_addr + head_offset;
> +
> +       hdr = netmem_address(frag_page->netmem) + head_offset;
>         data = hdr + rx_headroom;
>         frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + head_size);
>
> @@ -2232,7 +2240,7 @@ mlx5e_skb_from_cqe_shampo(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
>         }
>
>         net_prefetchw(skb->data);
> -       mlx5e_copy_skb_header(rq, skb, frag_page->page, dma_addr,
> +       mlx5e_copy_skb_header(rq, skb, frag_page->netmem, dma_addr,
>                               head_offset + rx_headroom,
>                               rx_headroom, head_size);
>         /* skb linear part was allocated with headlen and aligned to long */
> @@ -2326,11 +2334,20 @@ static void mlx5e_handle_rx_cqe_mpwrq_shampo(struct mlx5e_rq *rq, struct mlx5_cq
>         }
>
>         if (!*skb) {
> -               if (likely(head_size))
> +               if (likely(head_size)) {
>                         *skb = mlx5e_skb_from_cqe_shampo(rq, wi, cqe, header_index);
> -               else
> -                       *skb = mlx5e_skb_from_cqe_mpwrq_nonlinear(rq, wi, cqe, cqe_bcnt,
> -                                                                 data_offset, page_idx);
> +               } else {
> +                       struct mlx5e_frag_page *frag_page;
> +
> +                       frag_page = &wi->alloc_units.frag_pages[page_idx];
> +                       if (unlikely(netmem_is_net_iov(frag_page->netmem)))
> +                               goto free_hd_entry;
> +                       *skb = mlx5e_skb_from_cqe_mpwrq_nonlinear(rq, wi, cqe,
> +                                                                 cqe_bcnt,
> +                                                                 data_offset,
> +                                                                 page_idx);
> +               }
> +
>                 if (unlikely(!*skb))
>                         goto free_hd_entry;
>
> --
> 2.31.1
>
>

--
Thanks,
Mina