Re: [RFC v2] vfio/type1: optimize vfio_unpin_pages_remote() for large folio

lizhe.67@xxxxxxxxxxxxx · Fri, 13 Jun 2025 14:29:20 +0800

On Thu, 12 Jun 2025 16:32:39 -0600, alex.williamson@xxxxxxxxxx wrote:

> >  drivers/vfio/vfio_iommu_type1.c | 53 +++++++++++++++++++++++++--------
> >  1 file changed, 41 insertions(+), 12 deletions(-)
> > 
> > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > index 28ee4b8d39ae..2f6c0074d7b3 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -469,17 +469,28 @@ static bool is_invalid_reserved_pfn(unsigned long pfn)
> >  	return true;
> >  }
> >  
> > -static int put_pfn(unsigned long pfn, int prot)
> > +static inline void _put_pfns(struct page *page, int npages, int prot)
> >  {
> > -	if (!is_invalid_reserved_pfn(pfn)) {
> > -		struct page *page = pfn_to_page(pfn);
> > +	unpin_user_page_range_dirty_lock(page, npages, prot & IOMMU_WRITE);
> > +}
> >  
> > -		unpin_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE);
> > -		return 1;
> > +/*
> > + * The caller must ensure that these npages PFNs belong to the same folio.
> > + */
> > +static inline int put_pfns(unsigned long pfn, int npages, int prot)
> > +{
> > +	if (!is_invalid_reserved_pfn(pfn)) {
> > +		_put_pfns(pfn_to_page(pfn), npages, prot);
> > +		return npages;
> >  	}
> >  	return 0;
> >  }
> >  
> > +static inline int put_pfn(unsigned long pfn, int prot)
> > +{
> > +	return put_pfns(pfn, 1, prot);
> > +}
> > +
> >  #define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))
> >  
> >  static void __vfio_batch_init(struct vfio_batch *batch, bool single)
> > @@ -805,15 +816,33 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
> >  				    unsigned long pfn, unsigned long npage,
> >  				    bool do_accounting)
> >  {
> > -	long unlocked = 0, locked = 0;
> > -	long i;
> > +	long unlocked = 0, locked = vpfn_pages(dma, iova, npage);
> >  
> > -	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
> > -		if (put_pfn(pfn++, dma->prot)) {
> > -			unlocked++;
> > -			if (vfio_find_vpfn(dma, iova))
> > -				locked++;
> > +	while (npage) {
> > +		struct folio *folio;
> > +		struct page *page;
> > +		long step = 1;
> > +
> > +		if (is_invalid_reserved_pfn(pfn))
> > +			goto next;
> > +
> > +		page = pfn_to_page(pfn);
> > +		folio = page_folio(page);
> > +
> > +		if (!folio_test_large(folio)) {
> > +			_put_pfns(page, 1, dma->prot);
> > +		} else {
> > +			step = min_t(long, npage,
> > +				folio_nr_pages(folio) -
> > +				folio_page_idx(folio, page));
> > +			_put_pfns(page, step, dma->prot);
> >  		}
> > +
> > +		unlocked += step;
> > +next:
> 
> Usage of @step is inconsistent, goto isn't really necessary either, how
> about:
> 
> 	while (npage) {
> 		unsigned long step = 1;
> 
> 		if (!is_invalid_reserved_pfn(pfn)) {
> 			struct page *page = pfn_to_page(pfn);
> 			struct folio *folio = page_folio(page);
> 			long nr_pages = folio_nr_pages(folio);
> 
> 			if (nr_pages > 1)
> 				step = min_t(long, npage,
> 					nr_pages -
> 					folio_page_idx(folio, page));
> 
> 			_put_pfns(page, step, dma->prot);
> 			unlocked += step;
> 		}
> 

That's great. This implementation is much better.

I'm a bit uncertain about the best type to use for the 'step'
variable here. I've been trying to keep things consistent with the
put_pfn() function, so I set the type of the second parameter in
_put_pfns() to 'int'(we pass 'step' as the second argument to
_put_pfns()).

Using unsigned long for 'step' should definitely work here, as the
number of pages in a large folio currently falls within the range
that can be represented by an int. However, there is still a
potential risk of truncation that we need to be mindful of.

> > +		pfn += step;
> > +		iova += PAGE_SIZE * step;
> > +		npage -= step;
> >  	}
> >  
> >  	if (do_accounting)
> 
> AIUI, the idea is that we know we have npage contiguous pfns and we
> currently test invalid/reserved, call pfn_to_page(), call
> unpin_user_pages_dirty_lock(), and test vpfn for each individually.
>
> This instead wants to batch the vpfn accounted pfns using the range
> helper added for the mapping patch,

Yes. We use vpfn_pages() just to track the locked pages.

> infer that continuous pfns have the
> same invalid/reserved state, the pages are sequential, and that we can
> use the end of the folio to mark any inflections in those assumptions
> otherwise.  Do I have that correct?

Yes. I think we're definitely on the same page here.

> I think this could be split into two patches, one simply batching the
> vpfn accounting and the next introducing the folio dependency.  The
> contributions of each to the overall performance improvement would be
> interesting.

I've made an initial attempt, and here are the two patches after
splitting them up.

1. batch-vpfn-accounting-patch:

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 28ee4b8d39ae..c8ddcee5aa68 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -805,16 +805,12 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
 				    unsigned long pfn, unsigned long npage,
 				    bool do_accounting)
 {
-	long unlocked = 0, locked = 0;
+	long unlocked = 0, locked = vpfn_pages(dma, iova, npage);
 	long i;
 
-	for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
-		if (put_pfn(pfn++, dma->prot)) {
+	for (i = 0; i < npage; i++, iova += PAGE_SIZE)
+		if (put_pfn(pfn++, dma->prot))
 			unlocked++;
-			if (vfio_find_vpfn(dma, iova))
-				locked++;
-		}
-	}
 
 	if (do_accounting)
 		vfio_lock_acct(dma, locked - unlocked, true);
-----------------

2. large-folio-optimization-patch:

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index c8ddcee5aa68..48c2ba4ba4eb 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -469,17 +469,28 @@ static bool is_invalid_reserved_pfn(unsigned long pfn)
 	return true;
 }
 
-static int put_pfn(unsigned long pfn, int prot)
+static inline void _put_pfns(struct page *page, int npages, int prot)
 {
-	if (!is_invalid_reserved_pfn(pfn)) {
-		struct page *page = pfn_to_page(pfn);
+	unpin_user_page_range_dirty_lock(page, npages, prot & IOMMU_WRITE);
+}
 
-		unpin_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE);
-		return 1;
+/*
+ * The caller must ensure that these npages PFNs belong to the same folio.
+ */
+static inline int put_pfns(unsigned long pfn, int npages, int prot)
+{
+	if (!is_invalid_reserved_pfn(pfn)) {
+		_put_pfns(pfn_to_page(pfn), npages, prot);
+		return npages;
 	}
 	return 0;
 }
 
+static inline int put_pfn(unsigned long pfn, int prot)
+{
+	return put_pfns(pfn, 1, prot);
+}
+
 #define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))
 
 static void __vfio_batch_init(struct vfio_batch *batch, bool single)
@@ -806,11 +817,28 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
 				    bool do_accounting)
 {
 	long unlocked = 0, locked = vpfn_pages(dma, iova, npage);
-	long i;
 
-	for (i = 0; i < npage; i++, iova += PAGE_SIZE)
-		if (put_pfn(pfn++, dma->prot))
-			unlocked++;
+	while (npage) {
+		long step = 1;
+
+		if (!is_invalid_reserved_pfn(pfn)) {
+			struct page *page = pfn_to_page(pfn);
+			struct folio *folio = page_folio(page);
+			long nr_pages = folio_nr_pages(folio);
+
+			if (nr_pages > 1)
+				step = min_t(long, npage,
+					nr_pages -
+					folio_page_idx(folio, page));
+
+			_put_pfns(page, step, dma->prot);
+			unlocked += step;
+		}
+
+		pfn += step;
+		iova += PAGE_SIZE * step;
+		npage -= step;
+	}
 
 	if (do_accounting)
 		vfio_lock_acct(dma, locked - unlocked, true);
-----------------

Here are the results of the performance tests.

Base(v6.15):
./vfio-pci-mem-dma-map 0000:03:00.0 16
------- AVERAGE (MADV_HUGEPAGE) --------
VFIO MAP DMA in 0.048 s (333.5 GB/s)
VFIO UNMAP DMA in 0.139 s (115.1 GB/s)
------- AVERAGE (MAP_POPULATE) --------
VFIO MAP DMA in 0.273 s (58.6 GB/s)
VFIO UNMAP DMA in 0.302 s (52.9 GB/s)
------- AVERAGE (HUGETLBFS) --------
VFIO MAP DMA in 0.052 s (305.3 GB/s)
VFIO UNMAP DMA in 0.141 s (113.8 GB/s)

Base + Map + batch-vpfn-accounting-patch:
------- AVERAGE (MADV_HUGEPAGE) --------
VFIO MAP DMA in 0.027 s (591.1 GB/s)
VFIO UNMAP DMA in 0.138 s (115.7 GB/s)
------- AVERAGE (MAP_POPULATE) --------
VFIO MAP DMA in 0.292 s (54.8 GB/s)
VFIO UNMAP DMA in 0.308 s (52.0 GB/s)
------- AVERAGE (HUGETLBFS) --------
VFIO MAP DMA in 0.032 s (505.5 GB/s)
VFIO UNMAP DMA in 0.140 s (114.1 GB/s)

Base + Map + batch-vpfn-accounting-patch + large-folio-optimization-patch:
------- AVERAGE (MADV_HUGEPAGE) --------
VFIO MAP DMA in 0.027 s (591.2 GB/s)
VFIO UNMAP DMA in 0.049 s (327.6 GB/s)
------- AVERAGE (MAP_POPULATE) --------
VFIO MAP DMA in 0.291 s (55.0 GB/s)
VFIO UNMAP DMA in 0.306 s (52.3 GB/s)
------- AVERAGE (HUGETLBFS) --------
VFIO MAP DMA in 0.032 s (498.3 GB/s)
VFIO UNMAP DMA in 0.049 s (326.2 GB/s)

It seems that batching the vpfn accounting doesn't seem to have much
of an impact in my environment. Perhaps this is because the rbtree
for vfpn is empty, allowing vfio_find_vpfn to execute quickly?

Thanks,
Zhe