On Mon, 19 May 2025 15:04:19 +0800
lizhe.67@xxxxxxxxxxxxx wrote:

> From: Li Zhe <lizhe.67@xxxxxxxxxxxxx>
>
> When vfio_pin_pages_remote() is called with a range of addresses that
> includes hugetlbfs folios, the function currently performs individual
> statistics counting operations for each page. This can lead to significant
> performance overheads, especially when dealing with large ranges of pages.
>
> This patch optimizes this process by batching the statistics counting
> operations.
>
> The performance test results for completing the 8G VFIO IOMMU DMA mapping,
> obtained through trace-cmd, are as follows. In this case, the 8G virtual
> address space has been mapped to physical memory using hugetlbfs with
> pagesize=2M.
>
> Before this patch:
> funcgraph_entry:      # 33813.703 us |  vfio_pin_map_dma();
>
> After this patch:
> funcgraph_entry:      # 15635.055 us |  vfio_pin_map_dma();
>
> Signed-off-by: Li Zhe <lizhe.67@xxxxxxxxxxxxx>
> ---
> Changelogs:
>
> v1->v2:
> - Fix some issues in comments and formatting.
> - Consolidate vfio_find_vpfn_range() and vfio_find_vpfn().
> - Move the processing logic for hugetlbfs folio into the while(true) loop
>   and use a variable with a default value of 1 to indicate the number of
>   consecutive pages.
>
> v1 patch: https://lore.kernel.org/all/20250513035730.96387-1-lizhe.67@xxxxxxxxxxxxx/
>
>  drivers/vfio/vfio_iommu_type1.c | 70 +++++++++++++++++++++++++++------
>  1 file changed, 58 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 0ac56072af9f..2218ca415366 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -317,17 +317,20 @@ static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
>  }
>
>  /*
> - * Helper Functions for host iova-pfn list
> + * Find the first vfio_pfn that overlapping the range
> + * [iova, iova + PAGE_SIZE * npage) in rb tree
>   */
> -static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
> +static struct vfio_pfn *vfio_find_vpfn_range(struct vfio_dma *dma,
> +                dma_addr_t iova, unsigned long npage)
>  {
>          struct vfio_pfn *vpfn;
>          struct rb_node *node = dma->pfn_list.rb_node;
> +        dma_addr_t end_iova = iova + PAGE_SIZE * npage;
>
>          while (node) {
>                  vpfn = rb_entry(node, struct vfio_pfn, node);
>
> -                if (iova < vpfn->iova)
> +                if (end_iova <= vpfn->iova)
>                          node = node->rb_left;
>                  else if (iova > vpfn->iova)
>                          node = node->rb_right;
> @@ -337,6 +340,14 @@ static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
>          return NULL;
>  }
>
> +/*
> + * Helper Functions for host iova-pfn list
> + */

This comment should still precede the renamed function above; it's in
reference to this section of code related to searching, inserting, and
removing entries from the pfn list.

> +static inline struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
> +{
> +        return vfio_find_vpfn_range(dma, iova, 1);
> +}
> +
>  static void vfio_link_pfn(struct vfio_dma *dma,
>                            struct vfio_pfn *new)
>  {
> @@ -681,32 +692,67 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>           * and rsvd here, and therefore continues to use the batch.
>           */
>          while (true) {
> +                int page_step = 1;
> +                long lock_acct_step = 1;
> +                struct folio *folio = page_folio(batch->pages[batch->offset]);
> +                bool found_vpfn;
> +
>                  if (pfn != *pfn_base + pinned ||
>                      rsvd != is_invalid_reserved_pfn(pfn))
>                          goto out;
>
> +                /* Handle hugetlbfs page */
> +                if (folio_test_hugetlb(folio)) {

Why do we care to specifically test for hugetlb vs
folio_large_nr_pages()? At that point we can just use folio_nr_pages()
directly here.

> +                        unsigned long start_pfn = PHYS_PFN(vaddr);

Using this macro on a vaddr looks wrong.

> +
> +                        /*
> +                         * Note: The current page_step does not achieve the optimal
> +                         * performance in scenarios where folio_nr_pages() exceeds
> +                         * batch->capacity. It is anticipated that future enhancements
> +                         * will address this limitation.
> +                         */
> +                        page_step = min(batch->size,
> +                                ALIGN(start_pfn + 1, folio_nr_pages(folio)) - start_pfn);

Why do we assume start_pfn is the beginning of the folio?

> +                        found_vpfn = !!vfio_find_vpfn_range(dma, iova, page_step);
> +                        if (rsvd || !found_vpfn) {
> +                                lock_acct_step = page_step;
> +                        } else {
> +                                dma_addr_t tmp_iova = iova;
> +                                int i;
> +
> +                                lock_acct_step = 0;
> +                                for (i = 0; i < page_step; ++i, tmp_iova += PAGE_SIZE)
> +                                        if (!vfio_find_vpfn(dma, tmp_iova))
> +                                                lock_acct_step++;
> +                                if (lock_acct_step)
> +                                        found_vpfn = false;

Why are we making this so complicated versus falling back to iterating
page by page?

> +                        }
> +                } else {
> +                        found_vpfn = vfio_find_vpfn(dma, iova);
> +                }
> +
>                  /*
>                   * Reserved pages aren't counted against the user,
>                   * externally pinned pages are already counted against
>                   * the user.
>                   */
> -                if (!rsvd && !vfio_find_vpfn(dma, iova)) {
> +                if (!rsvd && !found_vpfn) {
>                          if (!dma->lock_cap &&
> -                            mm->locked_vm + lock_acct + 1 > limit) {
> +                            mm->locked_vm + lock_acct + lock_acct_step > limit) {
>                                  pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
>                                          __func__, limit << PAGE_SHIFT);
>                                  ret = -ENOMEM;
>                                  goto unpin_out;
>                          }
> -                        lock_acct++;
> +                        lock_acct += lock_acct_step;
>                  }
>
> -                pinned++;
> -                npage--;
> -                vaddr += PAGE_SIZE;
> -                iova += PAGE_SIZE;
> -                batch->offset++;
> -                batch->size--;
> +                pinned += page_step;
> +                npage -= page_step;
> +                vaddr += PAGE_SIZE * page_step;
> +                iova += PAGE_SIZE * page_step;
> +                batch->offset += page_step;
> +                batch->size -= page_step;
>
>                  if (!batch->size)
>                          break;

Why is something like below (untested) not sufficient?

NB. (vaddr - folio_address()) still needs some scrutiny to determine if
it's valid.

@@ -681,32 +692,40 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
          * and rsvd here, and therefore continues to use the batch.
          */
         while (true) {
+                struct folio *folio = page_folio(batch->pages[batch->offset]);
+                long nr_pages;
+
                 if (pfn != *pfn_base + pinned ||
                     rsvd != is_invalid_reserved_pfn(pfn))
                         goto out;

+                nr_pages = min(batch->size, folio_nr_pages(folio) -
+                               (vaddr - folio_address(folio)) >> PAGE_SHIFT);
+                if (nr_pages > 1 && vfio_find_vpfn_range(dma, iova, nr_pages))
+                        nr_pages = 1;
+
                 /*
                  * Reserved pages aren't counted against the user,
                  * externally pinned pages are already counted against
                  * the user.
                  */
-                if (!rsvd && !vfio_find_vpfn(dma, iova)) {
+                if (!rsvd && (nr_pages > 1 || !vfio_find_vpfn(dma, iova))) {
                         if (!dma->lock_cap &&
-                            mm->locked_vm + lock_acct + 1 > limit) {
+                            mm->locked_vm + lock_acct + nr_pages > limit) {
                                 pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
                                         __func__, limit << PAGE_SHIFT);
                                 ret = -ENOMEM;
                                 goto unpin_out;
                         }
-                        lock_acct++;
+                        lock_acct += nr_pages;
                 }

-                pinned++;
-                npage--;
-                vaddr += PAGE_SIZE;
-                iova += PAGE_SIZE;
-                batch->offset++;
-                batch->size--;
+                pinned += nr_pages;
+                npage -= nr_pages;
+                vaddr += PAGE_SIZE * nr_pages;
+                iova += PAGE_SIZE * nr_pages;
+                batch->offset += nr_pages;
+                batch->size -= nr_pages;

                 if (!batch->size)
                         break;
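If the only open question is how to derive the in-folio offset, one
option (an untested sketch on top of the above; it assumes
folio_page_idx() is usable in this path, and the helper name is purely
illustrative) is to take the offset from the page we already hold in
the batch rather than from vaddr. That keeps the arithmetic in units of
pages, so the byte-to-page shift in the min() above (which as written
would also want parentheses around the shift) isn't needed at all:

        /*
         * Hypothetical helper: pages of the current folio still ahead of us,
         * clamped to what remains in the batch.  @page is expected to be
         * batch->pages[batch->offset], i.e. the page about to be pinned.
         */
        static inline long vfio_batch_nr_folio_pages(struct folio *folio,
                                                     struct page *page,
                                                     long batch_size)
        {
                return min_t(long, batch_size,
                             folio_nr_pages(folio) - folio_page_idx(folio, page));
        }

The loop could then do:

        nr_pages = vfio_batch_nr_folio_pages(folio, batch->pages[batch->offset],
                                             batch->size);

which avoids mixing the user vaddr with the folio's kernel mapping and
sidesteps folio_address() entirely.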