Hi Ackerley,

On Thu, 3 Apr 2025 at 00:48, Ackerley Tng <ackerleytng@xxxxxxxxxx> wrote:
>
> Fuad Tabba <tabba@xxxxxxxxxx> writes:
>
> > To allow in-place sharing of guest_memfd folios with the host,
> > guest_memfd needs to track their sharing state, because mapping of
> > shared folios will only be allowed where it is safe to access these folios.
> > It is safe to map and access these folios when explicitly shared with
> > the host, or potentially if not yet exposed to the guest (e.g., at
> > initialization).
> >
> > This patch introduces sharing states for guest_memfd folios as well as
> > the functions that manage transitioning between those states.
> >
> > Signed-off-by: Fuad Tabba <tabba@xxxxxxxxxx>
> > ---
> >  include/linux/kvm_host.h |  39 +++++++-
> >  virt/kvm/guest_memfd.c   | 208 ++++++++++++++++++++++++++++++++++++---
> >  virt/kvm/kvm_main.c      |  62 ++++++++++++
> >  3 files changed, 295 insertions(+), 14 deletions(-)
> >
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index bc73d7426363..bf82faf16c53 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -2600,7 +2600,44 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
> >  #endif
> >
> >  #ifdef CONFIG_KVM_GMEM_SHARED_MEM
> > +int kvm_gmem_set_shared(struct kvm *kvm, gfn_t start, gfn_t end);
> > +int kvm_gmem_clear_shared(struct kvm *kvm, gfn_t start, gfn_t end);
> > +int kvm_gmem_slot_set_shared(struct kvm_memory_slot *slot, gfn_t start,
> > +			     gfn_t end);
> > +int kvm_gmem_slot_clear_shared(struct kvm_memory_slot *slot, gfn_t start,
> > +			       gfn_t end);
> > +bool kvm_gmem_slot_is_guest_shared(struct kvm_memory_slot *slot, gfn_t gfn);
> >  void kvm_gmem_handle_folio_put(struct folio *folio);
> > -#endif
> > +#else
> > +static inline int kvm_gmem_set_shared(struct kvm *kvm, gfn_t start, gfn_t end)
> > +{
> > +	WARN_ON_ONCE(1);
> > +	return -EINVAL;
> > +}
> > +static inline int kvm_gmem_clear_shared(struct kvm *kvm, gfn_t start,
> > +					gfn_t end)
> > +{
> > +	WARN_ON_ONCE(1);
> > +	return -EINVAL;
> > +}
> > +static inline int kvm_gmem_slot_set_shared(struct kvm_memory_slot *slot,
> > +					   gfn_t start, gfn_t end)
> > +{
> > +	WARN_ON_ONCE(1);
> > +	return -EINVAL;
> > +}
> > +static inline int kvm_gmem_slot_clear_shared(struct kvm_memory_slot *slot,
> > +					     gfn_t start, gfn_t end)
> > +{
> > +	WARN_ON_ONCE(1);
> > +	return -EINVAL;
> > +}
> > +static inline bool kvm_gmem_slot_is_guest_shared(struct kvm_memory_slot *slot,
> > +						 gfn_t gfn)
> > +{
> > +	WARN_ON_ONCE(1);
> > +	return false;
> > +}
> > +#endif /* CONFIG_KVM_GMEM_SHARED_MEM */
> >
> >  #endif
> > diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> > index cde16ed3b230..3b4d724084a8 100644
> > --- a/virt/kvm/guest_memfd.c
> > +++ b/virt/kvm/guest_memfd.c
> > @@ -29,14 +29,6 @@ static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode)
> >  	return inode->i_mapping->i_private_data;
> >  }
> >
> > -#ifdef CONFIG_KVM_GMEM_SHARED_MEM
> > -void kvm_gmem_handle_folio_put(struct folio *folio)
> > -{
> > -	WARN_ONCE(1, "A placeholder that shouldn't trigger. Work in progress.");
> > -}
> > -EXPORT_SYMBOL_GPL(kvm_gmem_handle_folio_put);
> > -#endif /* CONFIG_KVM_GMEM_SHARED_MEM */
> > -
> >  /**
> >   * folio_file_pfn - like folio_file_page, but return a pfn.
> >   * @folio: The folio which contains this index.
> > @@ -389,22 +381,211 @@ static void kvm_gmem_init_mount(void)
> >  }
> >
> >  #ifdef CONFIG_KVM_GMEM_SHARED_MEM
> > -static bool kvm_gmem_offset_is_shared(struct file *file, pgoff_t index)
> > +/*
> > + * An enum of the valid folio sharing states:
> > + * Bit 0: set if not shared with the guest (guest cannot fault it in)
> > + * Bit 1: set if not shared with the host (host cannot fault it in)
> > + */
> > +enum folio_shareability {
> > +	KVM_GMEM_ALL_SHARED	= 0b00,	/* Shared with the host and the guest. */
> > +	KVM_GMEM_GUEST_SHARED	= 0b10,	/* Shared only with the guest. */
> > +	KVM_GMEM_NONE_SHARED	= 0b11,	/* Not shared, transient state. */
> > +};
> > +
> > +static int kvm_gmem_offset_set_shared(struct inode *inode, pgoff_t index)
> >  {
> > -	struct kvm_gmem *gmem = file->private_data;
> > +	struct xarray *shared_offsets = &kvm_gmem_private(inode)->shared_offsets;
> > +	rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
> > +	void *xval = xa_mk_value(KVM_GMEM_ALL_SHARED);
> > +
> > +	lockdep_assert_held_write(offsets_lock);
> > +
> > +	return xa_err(xa_store(shared_offsets, index, xval, GFP_KERNEL));
> > +}
> > +
> > +/*
> > + * Marks the range [start, end) as shared with both the host and the guest.
> > + * Called when guest shares memory with the host.
> > + */
> > +static int kvm_gmem_offset_range_set_shared(struct inode *inode,
> > +					    pgoff_t start, pgoff_t end)
> > +{
> > +	rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
> > +	pgoff_t i;
> > +	int r = 0;
> > +
> > +	write_lock(offsets_lock);
> > +	for (i = start; i < end; i++) {
> > +		r = kvm_gmem_offset_set_shared(inode, i);
> > +		if (WARN_ON_ONCE(r))
> > +			break;
> > +	}
> > +	write_unlock(offsets_lock);
> > +
> > +	return r;
> > +}
> > +
> > +static int kvm_gmem_offset_clear_shared(struct inode *inode, pgoff_t index)
> > +{
> > +	struct xarray *shared_offsets = &kvm_gmem_private(inode)->shared_offsets;
> > +	rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
> > +	void *xval_guest = xa_mk_value(KVM_GMEM_GUEST_SHARED);
> > +	void *xval_none = xa_mk_value(KVM_GMEM_NONE_SHARED);
> > +	struct folio *folio;
> > +	int refcount;
> > +	int r;
> > +
> > +	lockdep_assert_held_write(offsets_lock);
> > +
> > +	folio = filemap_lock_folio(inode->i_mapping, index);
> > +	if (!IS_ERR(folio)) {
> > +		/* +1 references are expected because of filemap_lock_folio(). */
> > +		refcount = folio_nr_pages(folio) + 1;
> > +	} else {
> > +		r = PTR_ERR(folio);
> > +		if (WARN_ON_ONCE(r != -ENOENT))
> > +			return r;
> > +
> > +		folio = NULL;
> > +	}
> > +
> > +	if (!folio || folio_ref_freeze(folio, refcount)) {
> > +		/*
> > +		 * No outstanding references: transition to guest shared.
> > +		 */
> > +		r = xa_err(xa_store(shared_offsets, index, xval_guest, GFP_KERNEL));
> > +
> > +		if (folio)
> > +			folio_ref_unfreeze(folio, refcount);
> > +	} else {
> > +		/*
> > +		 * Outstanding references: the folio cannot be faulted in by
> > +		 * anyone until they're dropped.
> > +		 */
> > +		r = xa_err(xa_store(shared_offsets, index, xval_none, GFP_KERNEL));
>
> Once we do this on elevated refcounts, truncate needs to be updated to
> handle the case where some folio is still in a KVM_GMEM_NONE_SHARED
> state.
>
> When a folio is found in a KVM_GMEM_NONE_SHARED state, the shareability
> should be fast-forwarded to KVM_GMEM_GUEST_SHARED, and the filemap's
> refcounts restored. The folio can then be truncated from the filemap as
> usual (which will drop the filemap's refcounts)

Ack.
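Roughly something like the sketch below (untested; kvm_gmem_restore_pending_folio()
is a placeholder name, and the exact refcount bookkeeping depends on how the
folio_put handling later in the series ends up tracking the host's references):

static int kvm_gmem_restore_pending_folio(struct inode *inode,
					  struct folio *folio, pgoff_t index)
{
	struct xarray *shared_offsets = &kvm_gmem_private(inode)->shared_offsets;
	rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
	void *xval_guest = xa_mk_value(KVM_GMEM_GUEST_SHARED);

	lockdep_assert_held_write(offsets_lock);

	/* Nothing to do unless the index is still in the transient state. */
	if (xa_to_value(xa_load(shared_offsets, index)) != KVM_GMEM_NONE_SHARED)
		return 0;

	/*
	 * Fast-forward the shareability to guest shared and give the filemap
	 * its references back, so that the regular truncation path can drop
	 * them as it does today.
	 */
	folio_ref_add(folio, folio_nr_pages(folio));

	return xa_err(xa_store(shared_offsets, index, xval_guest, GFP_KERNEL));
}

The punch-hole path would call this for each folio it is about to remove,
with offsets_lock held for write and the folio locked, before dropping the
folio from the filemap.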
Thanks,
/fuad

> > +	}
> > +
> > +	if (folio) {
> > +		folio_unlock(folio);
> > +		folio_put(folio);
> > +	}
> > +
> > +	return r;
> > +}
> >
> > +/*
> > + * Marks the range [start, end) as not shared with the host. If the host doesn't
> > + * have any references to a particular folio, then that folio is marked as
> > + * shared with the guest.
> > + *
> > + * However, if the host still has references to the folio, then the folio is
> > + * marked as not shared with anyone. Marking it as not shared allows draining
> > + * all references from the host, and ensures that the hypervisor does not
> > + * transition the folio to private, since the host still might access it.
> > + *
> > + * Called when guest unshares memory with the host.
> > + */
> > +static int kvm_gmem_offset_range_clear_shared(struct inode *inode,
> > +					      pgoff_t start, pgoff_t end)
> > +{
> > +	rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
> > +	pgoff_t i;
> > +	int r = 0;
> > +
> > +	write_lock(offsets_lock);
> > +	for (i = start; i < end; i++) {
> > +		r = kvm_gmem_offset_clear_shared(inode, i);
> > +		if (WARN_ON_ONCE(r))
> > +			break;
> > +	}
> > +	write_unlock(offsets_lock);
> > +
> > +	return r;
> > +}
> > +
> > +void kvm_gmem_handle_folio_put(struct folio *folio)
> > +{
> > +	WARN_ONCE(1, "A placeholder that shouldn't trigger. Work in progress.");
> > +}
> > +EXPORT_SYMBOL_GPL(kvm_gmem_handle_folio_put);
> > +
> > +/*
> > + * Returns true if the folio is shared with the host and the guest.
> > + *
> > + * Must be called with the offsets_lock lock held.
> > + */
> > +static bool kvm_gmem_offset_is_shared(struct inode *inode, pgoff_t index)
> > +{
> > +	struct xarray *shared_offsets = &kvm_gmem_private(inode)->shared_offsets;
> > +	rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
> > +	unsigned long r;
> > +
> > +	lockdep_assert_held(offsets_lock);
> >
> > -	/* For now, VMs that support shared memory share all their memory. */
> > -	return kvm_arch_gmem_supports_shared_mem(gmem->kvm);
> > +	r = xa_to_value(xa_load(shared_offsets, index));
> > +
> > +	return r == KVM_GMEM_ALL_SHARED;
> > +}
> > +
> > +/*
> > + * Returns true if the folio is shared with the guest (not transitioning).
> > + *
> > + * Must be called with the offsets_lock lock held.
> > + */
> > +static bool kvm_gmem_offset_is_guest_shared(struct inode *inode, pgoff_t index)
> > +{
> > +	struct xarray *shared_offsets = &kvm_gmem_private(inode)->shared_offsets;
> > +	rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
> > +	unsigned long r;
> > +
> > +	lockdep_assert_held(offsets_lock);
> > +
> > +	r = xa_to_value(xa_load(shared_offsets, index));
> > +
> > +	return (r == KVM_GMEM_ALL_SHARED || r == KVM_GMEM_GUEST_SHARED);
> > +}
> > +
> > +int kvm_gmem_slot_set_shared(struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
> > +{
> > +	struct inode *inode = file_inode(READ_ONCE(slot->gmem.file));
> > +	pgoff_t start_off = slot->gmem.pgoff + start - slot->base_gfn;
> > +	pgoff_t end_off = start_off + end - start;
> > +
> > +	return kvm_gmem_offset_range_set_shared(inode, start_off, end_off);
> > +}
> > +
> > +int kvm_gmem_slot_clear_shared(struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
> > +{
> > +	struct inode *inode = file_inode(READ_ONCE(slot->gmem.file));
> > +	pgoff_t start_off = slot->gmem.pgoff + start - slot->base_gfn;
> > +	pgoff_t end_off = start_off + end - start;
> > +
> > +	return kvm_gmem_offset_range_clear_shared(inode, start_off, end_off);
> > +}
> > +
> > +bool kvm_gmem_slot_is_guest_shared(struct kvm_memory_slot *slot, gfn_t gfn)
> > +{
> > +	struct inode *inode = file_inode(READ_ONCE(slot->gmem.file));
> > +	rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
> > +	unsigned long pgoff = slot->gmem.pgoff + gfn - slot->base_gfn;
> > +	bool r;
> > +
> > +	read_lock(offsets_lock);
> > +	r = kvm_gmem_offset_is_guest_shared(inode, pgoff);
> > +	read_unlock(offsets_lock);
> > +
> > +	return r;
> >  }
> >
> >  static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf)
> >  {
> >  	struct inode *inode = file_inode(vmf->vma->vm_file);
> > +	rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
> >  	struct folio *folio;
> >  	vm_fault_t ret = VM_FAULT_LOCKED;
> >
> >  	filemap_invalidate_lock_shared(inode->i_mapping);
> > +	read_lock(offsets_lock);
> >
> >  	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
> >  	if (IS_ERR(folio)) {
> > @@ -423,7 +604,7 @@ static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf)
> >  		goto out_folio;
> >  	}
> >
> > -	if (!kvm_gmem_offset_is_shared(vmf->vma->vm_file, vmf->pgoff)) {
> > +	if (!kvm_gmem_offset_is_shared(inode, vmf->pgoff)) {
> >  		ret = VM_FAULT_SIGBUS;
> >  		goto out_folio;
> >  	}
> > @@ -457,6 +638,7 @@ static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf)
> >  	}
> >
> >  out_filemap:
> > +	read_unlock(offsets_lock);
> >  	filemap_invalidate_unlock_shared(inode->i_mapping);
> >
> >  	return ret;
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 3e40acb9f5c0..90762252381c 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -3091,6 +3091,68 @@ static int next_segment(unsigned long len, int offset)
> >  	return len;
> >  }
> >
> > +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
> > +int kvm_gmem_set_shared(struct kvm *kvm, gfn_t start, gfn_t end)
> > +{
> > +	struct kvm_memslot_iter iter;
> > +	int r = 0;
> > +
> > +	mutex_lock(&kvm->slots_lock);
> > +
> > +	kvm_for_each_memslot_in_gfn_range(&iter, kvm_memslots(kvm), start, end) {
> > +		struct kvm_memory_slot *memslot = iter.slot;
> > +		gfn_t gfn_start, gfn_end;
> > +
> > +		if (!kvm_slot_can_be_private(memslot))
> > +			continue;
> > +
> > +		gfn_start = max(start, memslot->base_gfn);
> > +		gfn_end = min(end, memslot->base_gfn + memslot->npages);
> > +		if (WARN_ON_ONCE(start >= end))
> > +			continue;
> > +
> > +		r = kvm_gmem_slot_set_shared(memslot, gfn_start, gfn_end);
> > +		if (WARN_ON_ONCE(r))
> > +			break;
> > +	}
> > +
> > +	mutex_unlock(&kvm->slots_lock);
> > +
> > +	return r;
> > +}
> > +EXPORT_SYMBOL_GPL(kvm_gmem_set_shared);
> > +
> > +int kvm_gmem_clear_shared(struct kvm *kvm, gfn_t start, gfn_t end)
> > +{
> > +	struct kvm_memslot_iter iter;
> > +	int r = 0;
> > +
> > +	mutex_lock(&kvm->slots_lock);
> > +
> > +	kvm_for_each_memslot_in_gfn_range(&iter, kvm_memslots(kvm), start, end) {
> > +		struct kvm_memory_slot *memslot = iter.slot;
> > +		gfn_t gfn_start, gfn_end;
> > +
> > +		if (!kvm_slot_can_be_private(memslot))
> > +			continue;
> > +
> > +		gfn_start = max(start, memslot->base_gfn);
> > +		gfn_end = min(end, memslot->base_gfn + memslot->npages);
> > +		if (WARN_ON_ONCE(start >= end))
> > +			continue;
> > +
> > +		r = kvm_gmem_slot_clear_shared(memslot, gfn_start, gfn_end);
> > +		if (WARN_ON_ONCE(r))
> > +			break;
> > +	}
> > +
> > +	mutex_unlock(&kvm->slots_lock);
> > +
> > +	return r;
> > +}
> > +EXPORT_SYMBOL_GPL(kvm_gmem_clear_shared);
> > +#endif /* CONFIG_KVM_GMEM_SHARED_MEM */
> > +
> >  /* Copy @len bytes from guest memory at '(@gfn * PAGE_SIZE) + @offset' to @data */
> >  static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
> >  				 void *data, int offset, int len)