Re: [PATCH v7 4/7] KVM: guest_memfd: Folio sharing states and functions that manage their transition

Fuad Tabba <tabba@xxxxxxxxxx> writes:

> To allow in-place sharing of guest_memfd folios with the host,
> guest_memfd needs to track their sharing state, because mapping of
> shared folios will only be allowed where it is safe to access these folios.
> It is safe to map and access these folios when explicitly shared with
> the host, or potentially if not yet exposed to the guest (e.g., at
> initialization).
>
> This patch introduces sharing states for guest_memfd folios as well as
> the functions that manage transitioning between those states.
>
> Signed-off-by: Fuad Tabba <tabba@xxxxxxxxxx>
> ---
>  include/linux/kvm_host.h |  39 +++++++-
>  virt/kvm/guest_memfd.c   | 208 ++++++++++++++++++++++++++++++++++++---
>  virt/kvm/kvm_main.c      |  62 ++++++++++++
>  3 files changed, 295 insertions(+), 14 deletions(-)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index bc73d7426363..bf82faf16c53 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -2600,7 +2600,44 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
>  #endif
>  
>  #ifdef CONFIG_KVM_GMEM_SHARED_MEM
> +int kvm_gmem_set_shared(struct kvm *kvm, gfn_t start, gfn_t end);
> +int kvm_gmem_clear_shared(struct kvm *kvm, gfn_t start, gfn_t end);
> +int kvm_gmem_slot_set_shared(struct kvm_memory_slot *slot, gfn_t start,
> +			     gfn_t end);
> +int kvm_gmem_slot_clear_shared(struct kvm_memory_slot *slot, gfn_t start,
> +			       gfn_t end);
> +bool kvm_gmem_slot_is_guest_shared(struct kvm_memory_slot *slot, gfn_t gfn);
>  void kvm_gmem_handle_folio_put(struct folio *folio);
> -#endif
> +#else
> +static inline int kvm_gmem_set_shared(struct kvm *kvm, gfn_t start, gfn_t end)
> +{
> +	WARN_ON_ONCE(1);
> +	return -EINVAL;
> +}
> +static inline int kvm_gmem_clear_shared(struct kvm *kvm, gfn_t start,
> +					gfn_t end)
> +{
> +	WARN_ON_ONCE(1);
> +	return -EINVAL;
> +}
> +static inline int kvm_gmem_slot_set_shared(struct kvm_memory_slot *slot,
> +					   gfn_t start, gfn_t end)
> +{
> +	WARN_ON_ONCE(1);
> +	return -EINVAL;
> +}
> +static inline int kvm_gmem_slot_clear_shared(struct kvm_memory_slot *slot,
> +					     gfn_t start, gfn_t end)
> +{
> +	WARN_ON_ONCE(1);
> +	return -EINVAL;
> +}
> +static inline bool kvm_gmem_slot_is_guest_shared(struct kvm_memory_slot *slot,
> +						 gfn_t gfn)
> +{
> +	WARN_ON_ONCE(1);
> +	return false;
> +}
> +#endif /* CONFIG_KVM_GMEM_SHARED_MEM */
>  
>  #endif
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index cde16ed3b230..3b4d724084a8 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -29,14 +29,6 @@ static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode)
>  	return inode->i_mapping->i_private_data;
>  }
>  
> -#ifdef CONFIG_KVM_GMEM_SHARED_MEM
> -void kvm_gmem_handle_folio_put(struct folio *folio)
> -{
> -	WARN_ONCE(1, "A placeholder that shouldn't trigger. Work in progress.");
> -}
> -EXPORT_SYMBOL_GPL(kvm_gmem_handle_folio_put);
> -#endif /* CONFIG_KVM_GMEM_SHARED_MEM */
> -
>  /**
>   * folio_file_pfn - like folio_file_page, but return a pfn.
>   * @folio: The folio which contains this index.
> @@ -389,22 +381,211 @@ static void kvm_gmem_init_mount(void)
>  }
>  
>  #ifdef CONFIG_KVM_GMEM_SHARED_MEM
> -static bool kvm_gmem_offset_is_shared(struct file *file, pgoff_t index)
> +/*
> + * An enum of the valid folio sharing states:
> + * Bit 0: set if not shared with the guest (guest cannot fault it in)
> + * Bit 1: set if not shared with the host (host cannot fault it in)
> + */
> +enum folio_shareability {
> +	KVM_GMEM_ALL_SHARED	= 0b00,	/* Shared with the host and the guest. */
> +	KVM_GMEM_GUEST_SHARED	= 0b10, /* Shared only with the guest. */
> +	KVM_GMEM_NONE_SHARED	= 0b11, /* Not shared, transient state. */
> +};
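
As an aside for readers: the two bits decode into independent "may
fault" predicates. A minimal illustration (the helper names here are
mine, not part of the patch):

	/* Bit 0 blocks the guest; bit 1 blocks the host. */
	static inline bool gmem_guest_may_fault(enum folio_shareability s)
	{
		return !(s & BIT(0));	/* ALL_SHARED or GUEST_SHARED */
	}

	static inline bool gmem_host_may_fault(enum folio_shareability s)
	{
		return !(s & BIT(1));	/* only ALL_SHARED */
	}
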
> +
> +static int kvm_gmem_offset_set_shared(struct inode *inode, pgoff_t index)
>  {
> -	struct kvm_gmem *gmem = file->private_data;
> +	struct xarray *shared_offsets = &kvm_gmem_private(inode)->shared_offsets;
> +	rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
> +	void *xval = xa_mk_value(KVM_GMEM_ALL_SHARED);
> +
> +	lockdep_assert_held_write(offsets_lock);
> +
> +	return xa_err(xa_store(shared_offsets, index, xval, GFP_KERNEL));
> +}
> +
> +/*
> + * Marks the range [start, end) as shared with both the host and the guest.
> + * Called when the guest shares memory with the host.
> + */
> +static int kvm_gmem_offset_range_set_shared(struct inode *inode,
> +					    pgoff_t start, pgoff_t end)
> +{
> +	rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
> +	pgoff_t i;
> +	int r = 0;
> +
> +	write_lock(offsets_lock);
> +	for (i = start; i < end; i++) {
> +		r = kvm_gmem_offset_set_shared(inode, i);
> +		if (WARN_ON_ONCE(r))
> +			break;
> +	}
> +	write_unlock(offsets_lock);
> +
> +	return r;
> +}
> +
> +static int kvm_gmem_offset_clear_shared(struct inode *inode, pgoff_t index)
> +{
> +	struct xarray *shared_offsets = &kvm_gmem_private(inode)->shared_offsets;
> +	rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
> +	void *xval_guest = xa_mk_value(KVM_GMEM_GUEST_SHARED);
> +	void *xval_none = xa_mk_value(KVM_GMEM_NONE_SHARED);
> +	struct folio *folio;
> +	int refcount;
> +	int r;
> +
> +	lockdep_assert_held_write(offsets_lock);
> +
> +	folio = filemap_lock_folio(inode->i_mapping, index);
> +	if (!IS_ERR(folio)) {
> +		/* Expect the filemap's folio_nr_pages() refs, +1 from filemap_lock_folio(). */
> +		refcount = folio_nr_pages(folio) + 1;
> +	} else {
> +		r = PTR_ERR(folio);
> +		if (WARN_ON_ONCE(r != -ENOENT))
> +			return r;
> +
> +		folio = NULL;
> +	}
> +
> +	if (!folio || folio_ref_freeze(folio, refcount)) {
> +		/*
> +		 * No outstanding references: transition to guest shared.
> +		 */
> +		r = xa_err(xa_store(shared_offsets, index, xval_guest, GFP_KERNEL));
> +
> +		if (folio)
> +			folio_ref_unfreeze(folio, refcount);
> +	} else {
> +		/*
> +		 * Outstanding references: the folio cannot be faulted in by
> +		 * anyone until they're dropped.
> +		 */
> +		r = xa_err(xa_store(shared_offsets, index, xval_none, GFP_KERNEL));

Once we do this for folios with elevated refcounts, truncation needs to
be updated to handle the case where a folio is still in the
KVM_GMEM_NONE_SHARED state.

When a folio is found in the KVM_GMEM_NONE_SHARED state, its
shareability should be fast-forwarded to KVM_GMEM_GUEST_SHARED and the
filemap's refcounts restored.  The folio can then be truncated from the
filemap as usual (which will drop the filemap's refcounts).
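
Roughly something like this in the truncate path (a sketch only; the
names come from this patch, and I am hand-waving where the folio lock
and offsets_lock are taken, as well as the exact refcount bookkeeping):

	if (xa_to_value(xa_load(shared_offsets, index)) == KVM_GMEM_NONE_SHARED) {
		xa_store(shared_offsets, index,
			 xa_mk_value(KVM_GMEM_GUEST_SHARED), GFP_KERNEL);
		/* Restore the references the filemap is expected to hold. */
		folio_ref_add(folio, folio_nr_pages(folio));
	}
	/* ... then truncate the folio from the filemap as usual ... */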

> +	}
> +
> +	if (folio) {
> +		folio_unlock(folio);
> +		folio_put(folio);
> +	}
> +
> +	return r;
> +}
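
For anyone unfamiliar with the pattern above: folio_ref_freeze(folio, n)
succeeds only if the refcount is exactly n, atomically replacing it with
zero, so freezing against folio_nr_pages() + 1 is precisely the check
that nobody besides the filemap (and this function, via
filemap_lock_folio()) holds a reference. A conceptual model of the
primitive (the real one operates on the folio's _refcount):

	/* Sketch of what folio_ref_freeze() boils down to. */
	static bool ref_freeze_model(atomic_t *ref, int expected)
	{
		return atomic_cmpxchg(ref, expected, 0) == expected;
	}
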
>  
> +/*
> + * Marks the range [start, end) as not shared with the host. If the host doesn't
> + * have any references to a particular folio, then that folio is marked as
> + * shared with the guest.
> + *
> + * However, if the host still has references to the folio, then the folio is
> + * marked as not shared with anyone. Marking it as not shared allows the
> + * host's remaining references to drain, and ensures that the hypervisor does
> + * not transition the folio to private while the host might still access it.
> + *
> + * Called when the guest unshares memory with the host.
> + */
> +static int kvm_gmem_offset_range_clear_shared(struct inode *inode,
> +					      pgoff_t start, pgoff_t end)
> +{
> +	rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
> +	pgoff_t i;
> +	int r = 0;
> +
> +	write_lock(offsets_lock);
> +	for (i = start; i < end; i++) {
> +		r = kvm_gmem_offset_clear_shared(inode, i);
> +		if (WARN_ON_ONCE(r))
> +			break;
> +	}
> +	write_unlock(offsets_lock);
> +
> +	return r;
> +}
> +
> +void kvm_gmem_handle_folio_put(struct folio *folio)
> +{
> +	WARN_ONCE(1, "A placeholder that shouldn't trigger. Work in progress.");
> +}
> +EXPORT_SYMBOL_GPL(kvm_gmem_handle_folio_put);
> +
> +/*
> + * Returns true if the folio is shared with both the host and the guest.
> + *
> + * Must be called with offsets_lock held.
> + */
> +static bool kvm_gmem_offset_is_shared(struct inode *inode, pgoff_t index)
> +{
> +	struct xarray *shared_offsets = &kvm_gmem_private(inode)->shared_offsets;
> +	rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
> +	unsigned long r;
> +
> +	lockdep_assert_held(offsets_lock);
>  
> -	/* For now, VMs that support shared memory share all their memory. */
> -	return kvm_arch_gmem_supports_shared_mem(gmem->kvm);
> +	r = xa_to_value(xa_load(shared_offsets, index));
> +
> +	return r == KVM_GMEM_ALL_SHARED;
> +}
> +
> +/*
> + * Returns true if the folio is shared with the guest, i.e., not in the
> + * transient KVM_GMEM_NONE_SHARED state.
> + *
> + * Must be called with offsets_lock held.
> + */
> +static bool kvm_gmem_offset_is_guest_shared(struct inode *inode, pgoff_t index)
> +{
> +	struct xarray *shared_offsets = &kvm_gmem_private(inode)->shared_offsets;
> +	rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
> +	unsigned long r;
> +
> +	lockdep_assert_held(offsets_lock);
> +
> +	r = xa_to_value(xa_load(shared_offsets, index));
> +
> +	return (r == KVM_GMEM_ALL_SHARED || r == KVM_GMEM_GUEST_SHARED);
> +}
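
Note that both predicates lean on an xarray default: xa_load() returns
NULL for an index that was never stored, and xa_to_value(NULL) is 0,
which is exactly KVM_GMEM_ALL_SHARED. In other words, untracked offsets
read back as shared with everyone, which I assume is intentional for the
"not yet exposed to the guest" case in the commit message:

	/* An index never written to shared_offsets decodes as ALL_SHARED. */
	WARN_ON(xa_to_value(xa_load(shared_offsets, index)) !=
		KVM_GMEM_ALL_SHARED);
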
> +
> +int kvm_gmem_slot_set_shared(struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
> +{
> +	struct inode *inode = file_inode(READ_ONCE(slot->gmem.file));
> +	pgoff_t start_off = slot->gmem.pgoff + start - slot->base_gfn;
> +	pgoff_t end_off = start_off + end - start;
> +
> +	return kvm_gmem_offset_range_set_shared(inode, start_off, end_off);
> +}
> +
> +int kvm_gmem_slot_clear_shared(struct kvm_memory_slot *slot, gfn_t start, gfn_t end)
> +{
> +	struct inode *inode = file_inode(READ_ONCE(slot->gmem.file));
> +	pgoff_t start_off = slot->gmem.pgoff + start - slot->base_gfn;
> +	pgoff_t end_off = start_off + end - start;
> +
> +	return kvm_gmem_offset_range_clear_shared(inode, start_off, end_off);
> +}
> +
> +bool kvm_gmem_slot_is_guest_shared(struct kvm_memory_slot *slot, gfn_t gfn)
> +{
> +	struct inode *inode = file_inode(READ_ONCE(slot->gmem.file));
> +	rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
> +	unsigned long pgoff = slot->gmem.pgoff + gfn - slot->base_gfn;
> +	bool r;
> +
> +	read_lock(offsets_lock);
> +	r = kvm_gmem_offset_is_guest_shared(inode, pgoff);
> +	read_unlock(offsets_lock);
> +
> +	return r;
>  }
>  
>  static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf)
>  {
>  	struct inode *inode = file_inode(vmf->vma->vm_file);
> +	rwlock_t *offsets_lock = &kvm_gmem_private(inode)->offsets_lock;
>  	struct folio *folio;
>  	vm_fault_t ret = VM_FAULT_LOCKED;
>  
>  	filemap_invalidate_lock_shared(inode->i_mapping);
> +	read_lock(offsets_lock);
>  
>  	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
>  	if (IS_ERR(folio)) {
> @@ -423,7 +604,7 @@ static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf)
>  		goto out_folio;
>  	}
>  
> -	if (!kvm_gmem_offset_is_shared(vmf->vma->vm_file, vmf->pgoff)) {
> +	if (!kvm_gmem_offset_is_shared(inode, vmf->pgoff)) {
>  		ret = VM_FAULT_SIGBUS;
>  		goto out_folio;
>  	}
> @@ -457,6 +638,7 @@ static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf)
>  	}
>  
>  out_filemap:
> +	read_unlock(offsets_lock);
>  	filemap_invalidate_unlock_shared(inode->i_mapping);
>  
>  	return ret;
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 3e40acb9f5c0..90762252381c 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -3091,6 +3091,68 @@ static int next_segment(unsigned long len, int offset)
>  		return len;
>  }
>  
> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
> +int kvm_gmem_set_shared(struct kvm *kvm, gfn_t start, gfn_t end)
> +{
> +	struct kvm_memslot_iter iter;
> +	int r = 0;
> +
> +	mutex_lock(&kvm->slots_lock);
> +
> +	kvm_for_each_memslot_in_gfn_range(&iter, kvm_memslots(kvm), start, end) {
> +		struct kvm_memory_slot *memslot = iter.slot;
> +		gfn_t gfn_start, gfn_end;
> +
> +		if (!kvm_slot_can_be_private(memslot))
> +			continue;
> +
> +		gfn_start = max(start, memslot->base_gfn);
> +		gfn_end = min(end, memslot->base_gfn + memslot->npages);
> +		if (WARN_ON_ONCE(gfn_start >= gfn_end))
> +			continue;
> +
> +		r = kvm_gmem_slot_set_shared(memslot, gfn_start, gfn_end);
> +		if (WARN_ON_ONCE(r))
> +			break;
> +	}
> +
> +	mutex_unlock(&kvm->slots_lock);
> +
> +	return r;
> +}
> +EXPORT_SYMBOL_GPL(kvm_gmem_set_shared);
> +
> +int kvm_gmem_clear_shared(struct kvm *kvm, gfn_t start, gfn_t end)
> +{
> +	struct kvm_memslot_iter iter;
> +	int r = 0;
> +
> +	mutex_lock(&kvm->slots_lock);
> +
> +	kvm_for_each_memslot_in_gfn_range(&iter, kvm_memslots(kvm), start, end) {
> +		struct kvm_memory_slot *memslot = iter.slot;
> +		gfn_t gfn_start, gfn_end;
> +
> +		if (!kvm_slot_can_be_private(memslot))
> +			continue;
> +
> +		gfn_start = max(start, memslot->base_gfn);
> +		gfn_end = min(end, memslot->base_gfn + memslot->npages);
> +		if (WARN_ON_ONCE(gfn_start >= gfn_end))
> +			continue;
> +
> +		r = kvm_gmem_slot_clear_shared(memslot, gfn_start, gfn_end);
> +		if (WARN_ON_ONCE(r))
> +			break;
> +	}
> +
> +	mutex_unlock(&kvm->slots_lock);
> +
> +	return r;
> +}
> +EXPORT_SYMBOL_GPL(kvm_gmem_clear_shared);
> +#endif /* CONFIG_KVM_GMEM_SHARED_MEM */
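
For context on how these entry points get used: as I understand it, an
arch-specific hypercall handler would forward a guest's share/unshare
request more or less directly. A hypothetical caller, not part of this
series:

	/* Hypothetical: wiring a guest "share" hypercall to the helper. */
	static int handle_share_hypercall(struct kvm *kvm, gpa_t gpa, u64 nr_pages)
	{
		gfn_t start = gpa_to_gfn(gpa);

		return kvm_gmem_set_shared(kvm, start, start + nr_pages);
	}
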
> +
>  /* Copy @len bytes from guest memory at '(@gfn * PAGE_SIZE) + @offset' to @data */
>  static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
>  				 void *data, int offset, int len)



