If a custom allocator is requested at guest_memfd creation time, pages
from the custom allocator will be used to back guest_memfd.

Change-Id: I59df960b3273790f42fe5bea54a234f40962eb75
Signed-off-by: Ackerley Tng <ackerleytng@xxxxxxxxxx>
---
 mm/memory.c            |   1 +
 virt/kvm/guest_memfd.c | 142 +++++++++++++++++++++++++++++++++++++----
 2 files changed, 132 insertions(+), 11 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index ba3ea0a82f7f..3af45e96913c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -7249,6 +7249,7 @@ void folio_zero_user(struct folio *folio, unsigned long addr_hint)
 	else
 		process_huge_page(addr_hint, nr_pages, clear_subpage, folio);
 }
+EXPORT_SYMBOL_GPL(folio_zero_user);
 
 static int copy_user_gigantic_page(struct folio *dst, struct folio *src,
 				   unsigned long addr_hint,
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index c65d93c5a443..24d270b9b725 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -478,15 +478,13 @@ static inline void kvm_gmem_mark_prepared(struct folio *folio)
  * leaking host data and the up-to-date flag is set.
  */
 static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
-				  gfn_t gfn, struct folio *folio)
+				  gfn_t gfn, struct folio *folio,
+				  unsigned long addr_hint)
 {
-	unsigned long nr_pages, i;
 	pgoff_t index;
 	int r;
 
-	nr_pages = folio_nr_pages(folio);
-	for (i = 0; i < nr_pages; i++)
-		clear_highpage(folio_page(folio, i));
+	folio_zero_user(folio, addr_hint);
 
 	/*
 	 * Preparing huge folios should always be safe, since it should
@@ -554,7 +552,9 @@ static int kvm_gmem_filemap_add_folio(struct address_space *mapping,
  */
 static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
 {
+	size_t allocated_size;
 	struct folio *folio;
+	pgoff_t index_floor;
 	int ret;
 
 repeat:
@@ -581,8 +581,10 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
 			return ERR_PTR(ret);
 		}
 	}
+	allocated_size = folio_size(folio);
 
-	ret = kvm_gmem_filemap_add_folio(inode->i_mapping, folio, index);
+	index_floor = round_down(index, folio_nr_pages(folio));
+	ret = kvm_gmem_filemap_add_folio(inode->i_mapping, folio, index_floor);
 	if (ret) {
 		folio_put(folio);
 
@@ -598,7 +600,17 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
 		return ERR_PTR(ret);
 	}
 
-	__folio_set_locked(folio);
+	spin_lock(&inode->i_lock);
+	inode->i_blocks += allocated_size / 512;
+	spin_unlock(&inode->i_lock);
+
+	/*
+	 * folio is the allocated folio; this gets the folio at the
+	 * requested index.
+	 */
+	folio = page_folio(folio_file_page(folio, index));
+	folio_lock(folio);
+
 	return folio;
 }
 
@@ -736,6 +748,92 @@ static void kvm_gmem_truncate_inode_aligned_pages(struct inode *inode,
 	spin_unlock(&inode->i_lock);
 }
 
+/**
+ * kvm_gmem_zero_range() - Zeroes all sub-pages in range [@start, @end).
+ *
+ * @mapping: the filemap containing the range to zero.
+ * @start: index in filemap for start of range (inclusive).
+ * @end: index in filemap for end of range (exclusive).
+ *
+ * The pages in range may be split. truncate_inode_pages_range() isn't the
+ * right function because it removes pages from the page cache; this function
+ * only zeroes the pages.
+ */
+static void kvm_gmem_zero_range(struct address_space *mapping,
+				pgoff_t start, pgoff_t end)
+{
+	struct folio_batch fbatch;
+
+	folio_batch_init(&fbatch);
+	while (filemap_get_folios(mapping, &start, end - 1, &fbatch)) {
+		unsigned int i;
+
+		for (i = 0; i < folio_batch_count(&fbatch); ++i) {
+			struct folio *f;
+			size_t nr_bytes;
+
+			f = fbatch.folios[i];
+			nr_bytes = offset_in_folio(f, end << PAGE_SHIFT);
+			if (nr_bytes == 0)
+				nr_bytes = folio_size(f);
+
+			folio_zero_segment(f, 0, nr_bytes);
+		}
+
+		folio_batch_release(&fbatch);
+		cond_resched();
+	}
+}
+
+/**
+ * kvm_gmem_truncate_inode_range() - Truncate pages in range [@lstart, @lend).
+ *
+ * @inode: inode to truncate from.
+ * @lstart: offset in inode for start of range (inclusive).
+ * @lend: offset in inode for end of range (exclusive).
+ *
+ * Removes full (huge)pages from the filemap and zeroes incomplete
+ * (huge)pages. The pages in the range may be split.
+ */
+static void kvm_gmem_truncate_inode_range(struct inode *inode, loff_t lstart,
+					  loff_t lend)
+{
+	pgoff_t full_hpage_start;
+	size_t nr_per_huge_page;
+	pgoff_t full_hpage_end;
+	size_t nr_pages;
+	pgoff_t start;
+	pgoff_t end;
+	void *priv;
+
+	priv = kvm_gmem_allocator_private(inode);
+	nr_per_huge_page = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(priv);
+
+	start = lstart >> PAGE_SHIFT;
+	end = min(lend, i_size_read(inode)) >> PAGE_SHIFT;
+
+	full_hpage_start = round_up(start, nr_per_huge_page);
+	full_hpage_end = round_down(end, nr_per_huge_page);
+
+	if (start < full_hpage_start) {
+		pgoff_t zero_end = min(full_hpage_start, end);
+
+		kvm_gmem_zero_range(inode->i_mapping, start, zero_end);
+	}
+
+	if (full_hpage_end > full_hpage_start) {
+		nr_pages = full_hpage_end - full_hpage_start;
+		kvm_gmem_truncate_inode_aligned_pages(inode, full_hpage_start,
+						      nr_pages);
+	}
+
+	if (end > full_hpage_end && end > full_hpage_start) {
+		pgoff_t zero_start = max(full_hpage_end, start);
+
+		kvm_gmem_zero_range(inode->i_mapping, zero_start, end);
+	}
+}
+
 static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 {
 	struct list_head *gmem_list = &inode->i_mapping->i_private_list;
@@ -752,7 +850,12 @@ static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	list_for_each_entry(gmem, gmem_list, entry)
 		kvm_gmem_invalidate_begin(gmem, start, end);
 
-	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
+	if (kvm_gmem_has_custom_allocator(inode)) {
+		kvm_gmem_truncate_inode_range(inode, offset, offset + len);
+	} else {
+		/* Page size is PAGE_SIZE, so use optimized truncation function. */
+		truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
+	}
 
 	list_for_each_entry(gmem, gmem_list, entry)
 		kvm_gmem_invalidate_end(gmem, start, end);
@@ -776,6 +879,16 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
 	start = offset >> PAGE_SHIFT;
 	end = (offset + len) >> PAGE_SHIFT;
 
+	if (kvm_gmem_has_custom_allocator(inode)) {
+		size_t nr_pages;
+		void *p;
+
+		p = kvm_gmem_allocator_private(inode);
+		nr_pages = kvm_gmem_allocator_ops(inode)->nr_pages_in_folio(p);
+
+		start = round_down(start, nr_pages);
+		end = round_down(end, nr_pages);
+	}
 
 	r = 0;
 	for (index = start; index < end; ) {
@@ -1570,7 +1683,7 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file,
 
 	*pfn = folio_file_pfn(folio, index);
 	if (max_order)
-		*max_order = 0;
+		*max_order = folio_order(folio);
 
 	*is_prepared = folio_test_uptodate(folio);
 	return folio;
@@ -1597,8 +1710,15 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
 		goto out;
 	}
 
-	if (!is_prepared)
-		r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
+	if (!is_prepared) {
+		/*
+		 * Use the same address as hugetlb for zeroing private pages
+		 * that won't be mapped to userspace anyway.
+		 */
+		unsigned long addr_hint = folio->index << PAGE_SHIFT;
+
+		r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio, addr_hint);
+	}
 
 	folio_unlock(folio);
 
-- 
2.49.0.1045.g170613ef41-goog
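Not part of the patch: below is a minimal, self-contained userspace sketch of
the head/full/tail split that kvm_gmem_truncate_inode_range() performs when
punching a hole, assuming 4K base pages and 2M huge pages. The helper names
round_up_idx()/round_down_idx() and the example offsets are made up for
illustration only; the partial head and tail of the hole are merely zeroed,
while fully covered huge pages are truncated.

/*
 * Illustration of the range split in kvm_gmem_truncate_inode_range():
 * a punch-hole request in bytes is converted to page indices, then split
 * into a partial head (zeroed only), a run of fully covered huge pages
 * (truncated), and a partial tail (zeroed only).
 */
#include <stdio.h>
#include <stddef.h>

#define PAGE_SHIFT 12
#define HPAGE_NR_PAGES 512	/* 2M huge pages, for illustration */

static size_t round_up_idx(size_t x, size_t align)
{
	return ((x + align - 1) / align) * align;
}

static size_t round_down_idx(size_t x, size_t align)
{
	return (x / align) * align;
}

int main(void)
{
	/* Example: punch a hole covering file offsets [1M, 5M). */
	size_t lstart = 1UL << 20, lend = 5UL << 20;
	size_t start = lstart >> PAGE_SHIFT;
	size_t end = lend >> PAGE_SHIFT;
	size_t full_start = round_up_idx(start, HPAGE_NR_PAGES);
	size_t full_end = round_down_idx(end, HPAGE_NR_PAGES);

	if (start < full_start)
		printf("zero only: pages [%zu, %zu)\n", start,
		       full_start < end ? full_start : end);
	if (full_end > full_start)
		printf("truncate:  pages [%zu, %zu)\n", full_start, full_end);
	if (end > full_end && end > full_start)
		printf("zero only: pages [%zu, %zu)\n",
		       full_end > start ? full_end : start, end);
	return 0;
}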