On Mon, 19 May 2025 10:07:24 -0600, alex.williamson@xxxxxxxxxx wrote:

>> /*
>> - * Helper Functions for host iova-pfn list
>> + * Find the first vfio_pfn that overlaps the range
>> + * [iova, iova + PAGE_SIZE * npage) in rb tree
>>   */
>> -static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
>> +static struct vfio_pfn *vfio_find_vpfn_range(struct vfio_dma *dma,
>> +		dma_addr_t iova, unsigned long npage)
>>  {
>>  	struct vfio_pfn *vpfn;
>>  	struct rb_node *node = dma->pfn_list.rb_node;
>> +	dma_addr_t end_iova = iova + PAGE_SIZE * npage;
>>
>>  	while (node) {
>>  		vpfn = rb_entry(node, struct vfio_pfn, node);
>>
>> -		if (iova < vpfn->iova)
>> +		if (end_iova <= vpfn->iova)
>>  			node = node->rb_left;
>>  		else if (iova > vpfn->iova)
>>  			node = node->rb_right;
>> @@ -337,6 +340,14 @@ static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
>>  	return NULL;
>>  }
>>
>> +/*
>> + * Helper Functions for host iova-pfn list
>> + */
>
>This comment should still precede the renamed function above, it's in
>reference to this section of code related to searching, inserting, and
>removing entries from the pfn list.

Thanks! I will move the comment so that it still precedes vfio_find_vpfn_range() in the v3 patch.

>> @@ -681,32 +692,67 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>>  	 * and rsvd here, and therefore continues to use the batch.
>>  	 */
>>  	while (true) {
>> +		int page_step = 1;
>> +		long lock_acct_step = 1;
>> +		struct folio *folio = page_folio(batch->pages[batch->offset]);
>> +		bool found_vpfn;
>> +
>>  		if (pfn != *pfn_base + pinned ||
>>  		    rsvd != is_invalid_reserved_pfn(pfn))
>>  			goto out;
>>
>> +		/* Handle hugetlbfs page */
>> +		if (folio_test_hugetlb(folio)) {
>
>Why do we care to specifically test for hugetlb vs
>folio_large_nr_pages(), at which point we can just use folio_nr_pages()
>directly here?
>
>> +			unsigned long start_pfn = PHYS_PFN(vaddr);
>
>Using this macro on a vaddr looks wrong.
>
>> +
>> +			/*
>> +			 * Note: The current page_step does not achieve the optimal
>> +			 * performance in scenarios where folio_nr_pages() exceeds
>> +			 * batch->capacity. It is anticipated that future enhancements
>> +			 * will address this limitation.
>> +			 */
>> +			page_step = min(batch->size,
>> +					ALIGN(start_pfn + 1, folio_nr_pages(folio)) - start_pfn);
>
>Why do we assume start_pfn is the beginning of the folio?
>
>> +			found_vpfn = !!vfio_find_vpfn_range(dma, iova, page_step);
>> +			if (rsvd || !found_vpfn) {
>> +				lock_acct_step = page_step;
>> +			} else {
>> +				dma_addr_t tmp_iova = iova;
>> +				int i;
>> +
>> +				lock_acct_step = 0;
>> +				for (i = 0; i < page_step; ++i, tmp_iova += PAGE_SIZE)
>> +					if (!vfio_find_vpfn(dma, tmp_iova))
>> +						lock_acct_step++;
>> +				if (lock_acct_step)
>> +					found_vpfn = false;
>
>Why are we making this so complicated versus falling back to iterating
>page by page?
>
>> +			}
>> +		} else {
>> +			found_vpfn = vfio_find_vpfn(dma, iova);
>> +		}
>> +
>>  		/*
>>  		 * Reserved pages aren't counted against the user,
>>  		 * externally pinned pages are already counted against
>>  		 * the user.
>>  		 */
>> -		if (!rsvd && !vfio_find_vpfn(dma, iova)) {
>> +		if (!rsvd && !found_vpfn) {
>>  			if (!dma->lock_cap &&
>> -			    mm->locked_vm + lock_acct + 1 > limit) {
>> +			    mm->locked_vm + lock_acct + lock_acct_step > limit) {
>>  				pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
>>  					__func__, limit << PAGE_SHIFT);
>>  				ret = -ENOMEM;
>>  				goto unpin_out;
>>  			}
>> -			lock_acct++;
>> +			lock_acct += lock_acct_step;
>>  		}
>>
>> -		pinned++;
>> -		npage--;
>> -		vaddr += PAGE_SIZE;
>> -		iova += PAGE_SIZE;
>> -		batch->offset++;
>> -		batch->size--;
>> +		pinned += page_step;
>> +		npage -= page_step;
>> +		vaddr += PAGE_SIZE * page_step;
>> +		iova += PAGE_SIZE * page_step;
>> +		batch->offset += page_step;
>> +		batch->size -= page_step;
>>
>>  		if (!batch->size)
>>  			break;
>
>Why is something like below (untested) not sufficient?
>
>NB. (vaddr - folio_address()) still needs some scrutiny to determine if
>it's valid.
>
>@@ -681,32 +692,40 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> 	 * and rsvd here, and therefore continues to use the batch.
> 	 */
> 	while (true) {
>+		struct folio *folio = page_folio(batch->pages[batch->offset]);
>+		long nr_pages;
>+
> 		if (pfn != *pfn_base + pinned ||
> 		    rsvd != is_invalid_reserved_pfn(pfn))
> 			goto out;
>
>+		nr_pages = min(batch->size, folio_nr_pages(folio) -
>+			       folio_page_idx(folio, batch->pages[batch->offset]));
>+		if (nr_pages > 1 && vfio_find_vpfn_range(dma, iova, nr_pages))
>+			nr_pages = 1;
>+
> 		/*
> 		 * Reserved pages aren't counted against the user,
> 		 * externally pinned pages are already counted against
> 		 * the user.
> 		 */
>-		if (!rsvd && !vfio_find_vpfn(dma, iova)) {
>+		if (!rsvd && (nr_pages > 1 || !vfio_find_vpfn(dma, iova))) {
> 			if (!dma->lock_cap &&
>-			    mm->locked_vm + lock_acct + 1 > limit) {
>+			    mm->locked_vm + lock_acct + nr_pages > limit) {
> 				pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> 					__func__, limit << PAGE_SHIFT);
> 				ret = -ENOMEM;
> 				goto unpin_out;
> 			}
>-			lock_acct++;
>+			lock_acct += nr_pages;
> 		}
>
>-		pinned++;
>-		npage--;
>-		vaddr += PAGE_SIZE;
>-		iova += PAGE_SIZE;
>-		batch->offset++;
>-		batch->size--;
>+		pinned += nr_pages;
>+		npage -= nr_pages;
>+		vaddr += PAGE_SIZE * nr_pages;
>+		iova += PAGE_SIZE * nr_pages;
>+		batch->offset += nr_pages;
>+		batch->size -= nr_pages;
>
> 		if (!batch->size)
> 			break;

The input parameter vaddr is a user-space address, while folio_address() returns a kernel-space address, so (vaddr - folio_address()) would not be a valid page offset. folio_page_idx() avoids that problem, and I agree it is the best approach; it simplifies things a lot. I have run basic functionality tests on it and have not found any issues, so I will fold it into the v3 patch. Thanks!
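
To make the step arithmetic above easy to check, here is a small userspace model of the suggested min() computation; model_nr_pages() and the sample values are stand-ins made up for illustration, not the kernel API:

#include <stdio.h>

/*
 * Model of the suggested computation: advance to the end of the
 * current folio, but never past what the batch still holds.
 */
static long model_nr_pages(long batch_size, long folio_nr_pages,
			   long folio_page_idx)
{
	long left_in_folio = folio_nr_pages - folio_page_idx;

	return batch_size < left_in_folio ? batch_size : left_in_folio;
}

int main(void)
{
	/* 2 MiB folio (512 pages), entered 5 pages in, 1024-slot batch. */
	printf("%ld\n", model_nr_pages(1024, 512, 5));	/* 507 */

	/* A small batch caps the step. */
	printf("%ld\n", model_nr_pages(64, 512, 5));	/* 64 */

	/* An order-0 folio degenerates to the original per-page path. */
	printf("%ld\n", model_nr_pages(1024, 1, 0));	/* 1 */

	return 0;
}

The non-zero index in the first two cases is also why folio_page_idx() matters: the page at batch->offset is not necessarily the head page of its folio, so the step has to be measured from the page's index within the folio rather than from an assumed folio start.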