Fuad Tabba <tabba@xxxxxxxxxx> writes:

> To allow in-place sharing of guest_memfd folios with the host,
> guest_memfd needs to track their sharing state, because mapping of
> shared folios will only be allowed where it is safe to access these folios.
> It is safe to map and access these folios when explicitly shared with
> the host, or potentially if not yet exposed to the guest (e.g., at
> initialization).
>
> This patch introduces sharing states for guest_memfd folios as well as
> the functions that manage transitioning between those states.
>
> Signed-off-by: Fuad Tabba <tabba@xxxxxxxxxx>
> ---
>  include/linux/kvm_host.h |  39 +++++++-
>  virt/kvm/guest_memfd.c   | 208 ++++++++++++++++++++++++++++++++++++---
>  virt/kvm/kvm_main.c      |  62 ++++++++++++
>  3 files changed, 295 insertions(+), 14 deletions(-)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index bc73d7426363..bf82faf16c53 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -2600,7 +2600,44 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
>  #endif
>
>  #ifdef CONFIG_KVM_GMEM_SHARED_MEM
> +int kvm_gmem_set_shared(struct kvm *kvm, gfn_t start, gfn_t end);
> +int kvm_gmem_clear_shared(struct kvm *kvm, gfn_t start, gfn_t end);
> +int kvm_gmem_slot_set_shared(struct kvm_memory_slot *slot, gfn_t start,
> +			     gfn_t end);
> +int kvm_gmem_slot_clear_shared(struct kvm_memory_slot *slot, gfn_t start,
> +			       gfn_t end);
> +bool kvm_gmem_slot_is_guest_shared(struct kvm_memory_slot *slot, gfn_t gfn);
>  void kvm_gmem_handle_folio_put(struct folio *folio);
> -#endif
> +#else
> +static inline int kvm_gmem_set_shared(struct kvm *kvm, gfn_t start, gfn_t end)
> +{
> +	WARN_ON_ONCE(1);
> +	return -EINVAL;
> +}
> +static inline int kvm_gmem_clear_shared(struct kvm *kvm, gfn_t start,
> +					gfn_t end)
> +{
> +	WARN_ON_ONCE(1);
> +	return -EINVAL;
> +}
> +static inline int kvm_gmem_slot_set_shared(struct kvm_memory_slot *slot,
> +					   gfn_t start, gfn_t end)
> +{
> +	WARN_ON_ONCE(1);
> +	return -EINVAL;
> +}
> +static inline int kvm_gmem_slot_clear_shared(struct kvm_memory_slot *slot,
> +					     gfn_t start, gfn_t end)
> +{
> +	WARN_ON_ONCE(1);
> +	return -EINVAL;
> +}
> +static inline bool kvm_gmem_slot_is_guest_shared(struct kvm_memory_slot *slot,
> +						 gfn_t gfn)
> +{
> +	WARN_ON_ONCE(1);
> +	return false;
> +}
> +#endif /* CONFIG_KVM_GMEM_SHARED_MEM */
>
>  #endif
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index cde16ed3b230..3b4d724084a8 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -29,14 +29,6 @@ static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode)
>  	return inode->i_mapping->i_private_data;
>  }
>
> -#ifdef CONFIG_KVM_GMEM_SHARED_MEM
> -void kvm_gmem_handle_folio_put(struct folio *folio)
> -{
> -	WARN_ONCE(1, "A placeholder that shouldn't trigger. Work in progress.");
> -}
> -EXPORT_SYMBOL_GPL(kvm_gmem_handle_folio_put);
> -#endif /* CONFIG_KVM_GMEM_SHARED_MEM */
> -
>  /**
>   * folio_file_pfn - like folio_file_page, but return a pfn.
>   * @folio: The folio which contains this index.
> @@ -389,22 +381,211 @@ static void kvm_gmem_init_mount(void)
>  }
>
>  #ifdef CONFIG_KVM_GMEM_SHARED_MEM
> -static bool kvm_gmem_offset_is_shared(struct file *file, pgoff_t index)
> +/*
> + * An enum of the valid folio sharing states:
> + * Bit 0: set if not shared with the guest (guest cannot fault it in)
> + * Bit 1: set if not shared with the host (host cannot fault it in)
> + */
> +enum folio_shareability {
> +	KVM_GMEM_ALL_SHARED	= 0b00,	/* Shared with the host and the guest. */
> +	KVM_GMEM_GUEST_SHARED	= 0b10,	/* Shared only with the guest. */
> +	KVM_GMEM_NONE_SHARED	= 0b11,	/* Not shared, transient state. */
> +};
> +
> +static int kvm_gmem_offset_set_shared(struct inode *inode, pgoff_t index)
>  {
> -	struct kvm_gmem *gmem = file->private_data;
> +	struct xarray *shared_offsets = &kvm_gmem_private(inode)->shared_offsets;
> +	rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
> +	void *xval = xa_mk_value(KVM_GMEM_ALL_SHARED);
> +
> +	lockdep_assert_held_write(offsets_lock);
> +
> +	return xa_err(xa_store(shared_offsets, index, xval, GFP_KERNEL));
> +}
> +
> +/*
> + * Marks the range [start, end) as shared with both the host and the guest.
> + * Called when guest shares memory with the host.
> + */
> +static int kvm_gmem_offset_range_set_shared(struct inode *inode,
> +					    pgoff_t start, pgoff_t end)
> +{
> +	rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
> +	pgoff_t i;
> +	int r = 0;
> +
> +	write_lock(offsets_lock);
> +	for (i = start; i < end; i++) {
> +		r = kvm_gmem_offset_set_shared(inode, i);
> +		if (WARN_ON_ONCE(r))
> +			break;
> +	}
> +	write_unlock(offsets_lock);
> +
> +	return r;
> +}
> +
> +static int kvm_gmem_offset_clear_shared(struct inode *inode, pgoff_t index)
> +{
> +	struct xarray *shared_offsets = &kvm_gmem_private(inode)->shared_offsets;
> +	rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
> +	void *xval_guest = xa_mk_value(KVM_GMEM_GUEST_SHARED);
> +	void *xval_none = xa_mk_value(KVM_GMEM_NONE_SHARED);
> +	struct folio *folio;
> +	int refcount;
> +	int r;
> +
> +	lockdep_assert_held_write(offsets_lock);
> +
> +	folio = filemap_lock_folio(inode->i_mapping, index);
> +	if (!IS_ERR(folio)) {
> +		/* +1 references are expected because of filemap_lock_folio(). */
> +		refcount = folio_nr_pages(folio) + 1;
> +	} else {
> +		r = PTR_ERR(folio);
> +		if (WARN_ON_ONCE(r != -ENOENT))
> +			return r;
> +
> +		folio = NULL;
> +	}
> +
> +	if (!folio || folio_ref_freeze(folio, refcount)) {
> +		/*
> +		 * No outstanding references: transition to guest shared.
> +		 */
> +		r = xa_err(xa_store(shared_offsets, index, xval_guest, GFP_KERNEL));
> +
> +		if (folio)
> +			folio_ref_unfreeze(folio, refcount);
> +	} else {
> +		/*
> +		 * Outstanding references: the folio cannot be faulted in by
> +		 * anyone until they're dropped.
> +		 */
> +		r = xa_err(xa_store(shared_offsets, index, xval_none, GFP_KERNEL));

Once we do this for folios with elevated refcounts, truncation needs to be
updated to handle the case where a folio is still in the
KVM_GMEM_NONE_SHARED state. When such a folio is found, its shareability
should be fast-forwarded to KVM_GMEM_GUEST_SHARED and the filemap's
refcounts restored; the folio can then be truncated from the filemap as
usual, which drops the filemap's refcounts. Something along the lines of
the (untested) sketch at the end of this mail, perhaps.

> +	}
> +
> +	if (folio) {
> +		folio_unlock(folio);
> +		folio_put(folio);
> +	}
> +
> +	return r;
> +}
>
> +/*
> + * Marks the range [start, end) as not shared with the host. If the host doesn't
> + * have any references to a particular folio, then that folio is marked as
> + * shared with the guest.
> + *
> + * However, if the host still has references to the folio, then the folio is
> + * marked as not shared with anyone. Marking it as not shared allows draining
> + * all references from the host, and ensures that the hypervisor does not
> + * transition the folio to private, since the host still might access it.
> + *
> + * Called when guest unshares memory with the host.
> + */
> +static int kvm_gmem_offset_range_clear_shared(struct inode *inode,
> +					      pgoff_t start, pgoff_t end)
> +{
> +	rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
> +	pgoff_t i;
> +	int r = 0;
> +
> +	write_lock(offsets_lock);
> +	for (i = start; i < end; i++) {
> +		r = kvm_gmem_offset_clear_shared(inode, i);
> +		if (WARN_ON_ONCE(r))
> +			break;
> +	}
> +	write_unlock(offsets_lock);
> +
> +	return r;
> +}
> +
> +void kvm_gmem_handle_folio_put(struct folio *folio)
> +{
> +	WARN_ONCE(1, "A placeholder that shouldn't trigger. Work in progress.");
> +}
> +EXPORT_SYMBOL_GPL(kvm_gmem_handle_folio_put);
> +
> +/*
> + * Returns true if the folio is shared with the host and the guest.
> + *
> + * Must be called with the offsets_lock lock held.
> + */
> +static bool kvm_gmem_offset_is_shared(struct inode *inode, pgoff_t index)
> +{
> +	struct xarray *shared_offsets = &kvm_gmem_private(inode)->shared_offsets;
> +	rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
> +	unsigned long r;
> +
> +	lockdep_assert_held(offsets_lock);
>
> -	/* For now, VMs that support shared memory share all their memory. */
> -	return kvm_arch_gmem_supports_shared_mem(gmem->kvm);
> +	r = xa_to_value(xa_load(shared_offsets, index));
> +
> +	return r == KVM_GMEM_ALL_SHARED;
> +}
> +
> +/*
> + * Returns true if the folio is shared with the guest (not transitioning).
> + *
> + * Must be called with the offsets_lock lock held.
> + */
> +static bool kvm_gmem_offset_is_guest_shared(struct inode *inode, pgoff_t index)
> +{
> +	struct xarray *shared_offsets = &kvm_gmem_private(inode)->shared_offsets;
> +	rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
> +	unsigned long r;
> +
> +	lockdep_assert_held(offsets_lock);
> +
> +	r = xa_to_value(xa_load(shared_offsets, index));
> +
> +	return (r == KVM_GMEM_ALL_SHARED || r == KVM_GMEM_GUEST_SHARED);
> +}
> +
> +int kvm_gmem_slot_set_shared(struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
> +{
> +	struct inode *inode = file_inode(READ_ONCE(slot->gmem.file));
> +	pgoff_t start_off = slot->gmem.pgoff + start - slot->base_gfn;
> +	pgoff_t end_off = start_off + end - start;
> +
> +	return kvm_gmem_offset_range_set_shared(inode, start_off, end_off);
> +}
> +
> +int kvm_gmem_slot_clear_shared(struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
> +{
> +	struct inode *inode = file_inode(READ_ONCE(slot->gmem.file));
> +	pgoff_t start_off = slot->gmem.pgoff + start - slot->base_gfn;
> +	pgoff_t end_off = start_off + end - start;
> +
> +	return kvm_gmem_offset_range_clear_shared(inode, start_off, end_off);
> +}
> +
> +bool kvm_gmem_slot_is_guest_shared(struct kvm_memory_slot *slot, gfn_t gfn)
> +{
> +	struct inode *inode = file_inode(READ_ONCE(slot->gmem.file));
> +	rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
> +	unsigned long pgoff = slot->gmem.pgoff + gfn - slot->base_gfn;
> +	bool r;
> +
> +	read_lock(offsets_lock);
> +	r = kvm_gmem_offset_is_guest_shared(inode, pgoff);
> +	read_unlock(offsets_lock);
> +
> +	return r;
>  }
>
>  static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf)
>  {
>  	struct inode *inode = file_inode(vmf->vma->vm_file);
> +	rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
>  	struct folio *folio;
>  	vm_fault_t ret = VM_FAULT_LOCKED;
>
>  	filemap_invalidate_lock_shared(inode->i_mapping);
> +	read_lock(offsets_lock);
>
>  	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
>  	if (IS_ERR(folio)) {
> @@ -423,7 +604,7 @@ static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf)
>  		goto out_folio;
>  	}
>
> -	if (!kvm_gmem_offset_is_shared(vmf->vma->vm_file, vmf->pgoff)) {
> +	if (!kvm_gmem_offset_is_shared(inode, vmf->pgoff)) {
>  		ret = VM_FAULT_SIGBUS;
>  		goto out_folio;
>  	}
> @@ -457,6 +638,7 @@ static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf)
>  	}
>
>  out_filemap:
> +	read_unlock(offsets_lock);
>  	filemap_invalidate_unlock_shared(inode->i_mapping);
>
>  	return ret;
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 3e40acb9f5c0..90762252381c 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -3091,6 +3091,68 @@ static int next_segment(unsigned long len, int offset)
>  	return len;
>  }
>
> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
> +int kvm_gmem_set_shared(struct kvm *kvm, gfn_t start, gfn_t end)
> +{
> +	struct kvm_memslot_iter iter;
> +	int r = 0;
> +
> +	mutex_lock(&kvm->slots_lock);
> +
> +	kvm_for_each_memslot_in_gfn_range(&iter, kvm_memslots(kvm), start, end) {
> +		struct kvm_memory_slot *memslot = iter.slot;
> +		gfn_t gfn_start, gfn_end;
> +
> +		if (!kvm_slot_can_be_private(memslot))
> +			continue;
> +
> +		gfn_start = max(start, memslot->base_gfn);
> +		gfn_end = min(end, memslot->base_gfn + memslot->npages);
> +		if (WARN_ON_ONCE(start >= end))
> +			continue;
> +
> +		r = kvm_gmem_slot_set_shared(memslot, gfn_start, gfn_end);
> +		if (WARN_ON_ONCE(r))
> +			break;
> +	}
> +
> +	mutex_unlock(&kvm->slots_lock);
> +
> +	return r;
> +}
> +EXPORT_SYMBOL_GPL(kvm_gmem_set_shared);
> +
> +int kvm_gmem_clear_shared(struct kvm *kvm, gfn_t start, gfn_t end)
> +{
> +	struct kvm_memslot_iter iter;
> +	int r = 0;
> +
> +	mutex_lock(&kvm->slots_lock);
> +
> +	kvm_for_each_memslot_in_gfn_range(&iter, kvm_memslots(kvm), start, end) {
> +		struct kvm_memory_slot *memslot = iter.slot;
> +		gfn_t gfn_start, gfn_end;
> +
> +		if (!kvm_slot_can_be_private(memslot))
> +			continue;
> +
> +		gfn_start = max(start, memslot->base_gfn);
> +		gfn_end = min(end, memslot->base_gfn + memslot->npages);
> +		if (WARN_ON_ONCE(start >= end))
> +			continue;
> +
> +		r = kvm_gmem_slot_clear_shared(memslot, gfn_start, gfn_end);
> +		if (WARN_ON_ONCE(r))
> +			break;
> +	}
> +
> +	mutex_unlock(&kvm->slots_lock);
> +
> +	return r;
> +}
> +EXPORT_SYMBOL_GPL(kvm_gmem_clear_shared);
> +#endif /* CONFIG_KVM_GMEM_SHARED_MEM */
> +
>  /* Copy @len bytes from guest memory at '(@gfn * PAGE_SIZE) + @offset' to @data */
>  static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
>  				 void *data, int offset, int len)
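
For the truncation case mentioned above, I had something roughly like the
below in mind. Completely untested sketch: kvm_gmem_offset_fast_forward_shared()
is a made-up name, it assumes the caller (the truncation path, e.g.
kvm_gmem_punch_hole()) holds offsets_lock for write and has the folio locked,
and how many references need to be handed back to the filemap depends on how
kvm_gmem_handle_folio_put() ends up being implemented:

static int kvm_gmem_offset_fast_forward_shared(struct inode *inode,
					       struct folio *folio,
					       pgoff_t index)
{
	struct xarray *shared_offsets = &kvm_gmem_private(inode)->shared_offsets;
	rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
	void *xval_guest = xa_mk_value(KVM_GMEM_GUEST_SHARED);

	lockdep_assert_held_write(offsets_lock);

	/* Only offsets still transitioning need to be fast-forwarded. */
	if (xa_to_value(xa_load(shared_offsets, index)) != KVM_GMEM_NONE_SHARED)
		return 0;

	/*
	 * Give the filemap its references back so that truncation can drop
	 * them as usual. (folio_nr_pages() is a guess to mirror the
	 * accounting in kvm_gmem_offset_clear_shared(); the exact number
	 * depends on what kvm_gmem_handle_folio_put() does with them.)
	 */
	folio_ref_add(folio, folio_nr_pages(folio));

	/* Fast-forward the shareability state to guest-shared. */
	return xa_err(xa_store(shared_offsets, index, xval_guest, GFP_KERNEL));
}

The truncation path would then call this for each folio it finds in the
filemap for the punched range, right before removing the folio, so the folio
is truncated with the refcounts it would normally have.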