On Wed, 2025-05-14 at 16:42 -0700, Ackerley Tng wrote:
> If a custom allocator is requested at guest_memfd creation time, pages
> from the custom allocator will be used to back guest_memfd.
>
> Change-Id: I59df960b3273790f42fe5bea54a234f40962eb75
> Signed-off-by: Ackerley Tng <ackerleytng@xxxxxxxxxx>

I know it's an RFC, but with future maturity in mind, these commit logs are
pretty thin across the series. Only one sentence for 143 changed lines is way
too limited.

> ---
>  mm/memory.c            |   1 +
>  virt/kvm/guest_memfd.c | 142 +++++++++++++++++++++++++++++++++++++----
>  2 files changed, 132 insertions(+), 11 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index ba3ea0a82f7f..3af45e96913c 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -7249,6 +7249,7 @@ void folio_zero_user(struct folio *folio, unsigned long addr_hint)
>  	else
>  		process_huge_page(addr_hint, nr_pages, clear_subpage, folio);
>  }
> +EXPORT_SYMBOL_GPL(folio_zero_user);
>  
>  static int copy_user_gigantic_page(struct folio *dst, struct folio *src,
>  				   unsigned long addr_hint,
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index c65d93c5a443..24d270b9b725 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -478,15 +478,13 @@ static inline void kvm_gmem_mark_prepared(struct folio *folio)
>   * leaking host data and the up-to-date flag is set.
>   */
>  static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
> -				  gfn_t gfn, struct folio *folio)
> +				  gfn_t gfn, struct folio *folio,
> +				  unsigned long addr_hint)
>  {
> -	unsigned long nr_pages, i;
>  	pgoff_t index;
>  	int r;
>  
> -	nr_pages = folio_nr_pages(folio);
> -	for (i = 0; i < nr_pages; i++)
> -		clear_highpage(folio_page(folio, i));
> +	folio_zero_user(folio, addr_hint);

This is unrelated cleanup.

>  
>  	/*
>  	 * Preparing huge folios should always be safe, since it should
> @@ -554,7 +552,9 @@ static int kvm_gmem_filemap_add_folio(struct address_space *mapping,
>   */
>  static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
>  {
> +	size_t allocated_size;
>  	struct folio *folio;
> +	pgoff_t index_floor;
>  	int ret;
>  
>  repeat:
> @@ -581,8 +581,10 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
>  			return ERR_PTR(ret);
>  		}
>  	}
> +	allocated_size = folio_size(folio);
>  
> -	ret = kvm_gmem_filemap_add_folio(inode->i_mapping, folio, index);
> +	index_floor = round_down(index, folio_nr_pages(folio));
> +	ret = kvm_gmem_filemap_add_folio(inode->i_mapping, folio, index_floor);
>  	if (ret) {
>  		folio_put(folio);
>  
> @@ -598,7 +600,17 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
>  		return ERR_PTR(ret);
>  	}
>  
> -	__folio_set_locked(folio);
> +	spin_lock(&inode->i_lock);
> +	inode->i_blocks += allocated_size / 512;
> +	spin_unlock(&inode->i_lock);
> +
> +	/*
> +	 * folio is the one that is allocated, this gets the folio at the
> +	 * requested index.
> +	 */
> +	folio = page_folio(folio_file_page(folio, index));
> +	folio_lock(folio);
> +
>  	return folio;
>  }
>  
> @@ -736,6 +748,92 @@ static void kvm_gmem_truncate_inode_aligned_pages(struct inode *inode,
>  	spin_unlock(&inode->i_lock);
>  }
>  
> +/**
> + * kvm_gmem_zero_range() - Zeroes all sub-pages in range [@start, @end).
> + *
> + * @mapping: the filemap to remove this range from.
> + * @start: index in filemap for start of range (inclusive).
> + * @end: index in filemap for end of range (exclusive).
> + *
> + * The pages in range may be split. truncate_inode_pages_range() isn't the right
> + * function because it removes pages from the page cache; this function only
> + * zeroes the pages.
> + */
> +static void kvm_gmem_zero_range(struct address_space *mapping,
> +				pgoff_t start, pgoff_t end)
> +{
> +	struct folio_batch fbatch;
> +
> +	folio_batch_init(&fbatch);
> +	while (filemap_get_folios(mapping, &start, end - 1, &fbatch)) {
> +		unsigned int i;
> +
> +		for (i = 0; i < folio_batch_count(&fbatch); ++i) {
> +			struct folio *f;
> +			size_t nr_bytes;
> +
> +			f = fbatch.folios[i];
> +			nr_bytes = offset_in_folio(f, end << PAGE_SHIFT);
> +			if (nr_bytes == 0)
> +				nr_bytes = folio_size(f);
> +
> +			folio_zero_segment(f, 0, nr_bytes);
> +		}
> +
> +		folio_batch_release(&fbatch);
> +		cond_resched();
> +	}
> +}
> +
> +/**
> + * kvm_gmem_truncate_inode_range() - Truncate pages in range [@lstart, @lend).
> + *
> + * @inode: inode to truncate from.
> + * @lstart: offset in inode for start of range (inclusive).
> + * @lend: offset in inode for end of range (exclusive).
> + *
> + * Removes full (huge)pages from the filemap and zeroing incomplete
> + * (huge)pages. The pages in the range may be split.
> + */
> +static void kvm_gmem_truncate_inode_range(struct inode *inode, loff_t lstart,
> +					  loff_t lend)
> +{
> +	pgoff_t full_hpage_start;
> +	size_t nr_per_huge_page;
> +	pgoff_t full_hpage_end;
> +	size_t nr_pages;
> +	pgoff_t start;
> +	pgoff_t end;
> +	void *priv;
> +
> +	priv = kvm_gmem_allocator_private(inode);
> +	nr_per_huge_page = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
> +
> +	start = lstart >> PAGE_SHIFT;
> +	end = min(lend, i_size_read(inode)) >> PAGE_SHIFT;
> +
> +	full_hpage_start = round_up(start, nr_per_huge_page);
> +	full_hpage_end = round_down(end, nr_per_huge_page);

I think it's supposed to zero the start at a byte granularity.
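Something like the below is what I have in mind -- completely untested, and
kvm_gmem_zero_byte_range() is just a made-up name for a byte-granular variant
of the zeroing helper above:

/*
 * Untested sketch, not part of the patch: zero [lstart, lend) at byte
 * granularity, so the partial (huge)pages at either end are only zeroed
 * from lstart onwards / up to lend, rather than as whole pages.
 */
static void kvm_gmem_zero_byte_range(struct address_space *mapping,
				     loff_t lstart, loff_t lend)
{
	struct folio_batch fbatch;
	pgoff_t start = lstart >> PAGE_SHIFT;

	folio_batch_init(&fbatch);
	while (filemap_get_folios(mapping, &start, (lend - 1) >> PAGE_SHIFT,
				  &fbatch)) {
		unsigned int i;

		for (i = 0; i < folio_batch_count(&fbatch); ++i) {
			struct folio *f = fbatch.folios[i];
			/* Clamp the byte range to this folio. */
			size_t from = max_t(loff_t, lstart - folio_pos(f), 0);
			size_t to = min_t(loff_t, lend - folio_pos(f),
					  folio_size(f));

			folio_zero_segment(f, from, to);
		}

		folio_batch_release(&fbatch);
		cond_resched();
	}
}

The caller could then pass the byte offsets (clamped to the full-hugepage
boundaries) straight through instead of converting everything to page indices
up front.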
> +
> +	if (start < full_hpage_start) {
> +		pgoff_t zero_end = min(full_hpage_start, end);
> +
> +		kvm_gmem_zero_range(inode->i_mapping, start, zero_end);
> +	}
> +
> +	if (full_hpage_end > full_hpage_start) {
> +		nr_pages = full_hpage_end - full_hpage_start;
> +		kvm_gmem_truncate_inode_aligned_pages(inode, full_hpage_start,
> +						      nr_pages);
> +	}
> +
> +	if (end > full_hpage_end && end > full_hpage_start) {
> +		pgoff_t zero_start = max(full_hpage_end, start);

This is weird. Could it just round up `end`, then check it and use it instead?
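Roughly this shape is what I mean -- untested, just illustrating the question:

	/* Untested: round `end` up and key the tail zeroing off of that. */
	pgoff_t end_ceil = round_up(end, nr_per_huge_page);

	if (end_ceil != end && end_ceil > full_hpage_start)
		kvm_gmem_zero_range(inode->i_mapping,
				    end_ceil - nr_per_huge_page, end);

That would drop the max() and make the condition read more directly.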
> +
> +		kvm_gmem_zero_range(inode->i_mapping, zero_start, end);
> +	}
> +}
> +
>  static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>  {
>  	struct list_head *gmem_list = &inode->i_mapping->i_private_list;
> @@ -752,7 +850,12 @@ static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
>  	list_for_each_entry(gmem, gmem_list, entry)
>  		kvm_gmem_invalidate_begin(gmem, start, end);
>  
> -	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
> +	if (kvm_gmem_has_custom_allocator(inode)) {
> +		kvm_gmem_truncate_inode_range(inode, offset, offset + len);
> +	} else {
> +		/* Page size is PAGE_SIZE, so use optimized truncation function. */
> +		truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
> +	}
>  
>  	list_for_each_entry(gmem, gmem_list, entry)
>  		kvm_gmem_invalidate_end(gmem, start, end);
>  
> @@ -776,6 +879,16 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
>  
>  	start = offset >> PAGE_SHIFT;
>  	end = (offset + len) >> PAGE_SHIFT;
> +	if (kvm_gmem_has_custom_allocator(inode)) {
> +		size_t nr_pages;
> +		void *p;
> +
> +		p = kvm_gmem_allocator_private(inode);
> +		nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(p);
> +
> +		start = round_down(start, nr_pages);
> +		end = round_down(end, nr_pages);
> +	}
>  
>  	r = 0;
>  	for (index = start; index < end; ) {
> @@ -1570,7 +1683,7 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file,
>  
>  	*pfn = folio_file_pfn(folio, index);
>  	if (max_order)
> -		*max_order = 0;
> +		*max_order = folio_order(folio);

You might be able to have a separate patch that makes existing code work with
larger folio sizes. Then add in the custom allocator/truncator bits in another
one.

>  
>  	*is_prepared = folio_test_uptodate(folio);
>  	return folio;
> @@ -1597,8 +1710,15 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>  		goto out;
>  	}
>  
> -	if (!is_prepared)
> -		r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
> +	if (!is_prepared) {
> +		/*
> +		 * Use the same address as hugetlb for zeroing private pages
> +		 * that won't be mapped to userspace anyway.
> +		 */
> +		unsigned long addr_hint = folio->index << PAGE_SHIFT;

This could use some more explanation.

> +
> +		r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio, addr_hint);
> +	}
>  
>  	folio_unlock(folio);
> 