Hi Fuad,
On 5/28/25 4:02 AM, Fuad Tabba wrote:
This patch enables support for shared memory in guest_memfd, including
mapping that memory at the host userspace. This support is gated by the
configuration option KVM_GMEM_SHARED_MEM, and toggled by the guest_memfd
flag GUEST_MEMFD_FLAG_SUPPORT_SHARED, which can be set when creating a
guest_memfd instance.
Co-developed-by: Ackerley Tng <ackerleytng@xxxxxxxxxx>
Signed-off-by: Ackerley Tng <ackerleytng@xxxxxxxxxx>
Signed-off-by: Fuad Tabba <tabba@xxxxxxxxxx>
---
arch/x86/include/asm/kvm_host.h | 10 ++++
arch/x86/kvm/x86.c | 3 +-
include/linux/kvm_host.h | 13 ++++++
include/uapi/linux/kvm.h | 1 +
virt/kvm/Kconfig | 5 ++
virt/kvm/guest_memfd.c | 81 +++++++++++++++++++++++++++++++++
6 files changed, 112 insertions(+), 1 deletion(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 709cc2a7ba66..ce9ad4cd93c5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2255,8 +2255,18 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
#ifdef CONFIG_KVM_GMEM
#define kvm_arch_supports_gmem(kvm) ((kvm)->arch.supports_gmem)
+
+/*
+ * CoCo VMs with hardware support that use guest_memfd only for backing private
+ * memory, e.g., TDX, cannot use guest_memfd with userspace mapping enabled.
+ */
+#define kvm_arch_supports_gmem_shared_mem(kvm) \
+ (IS_ENABLED(CONFIG_KVM_GMEM_SHARED_MEM) && \
+ ((kvm)->arch.vm_type == KVM_X86_SW_PROTECTED_VM || \
+ (kvm)->arch.vm_type == KVM_X86_DEFAULT_VM))
#else
#define kvm_arch_supports_gmem(kvm) false
+#define kvm_arch_supports_gmem_shared_mem(kvm) false
#endif
#define kvm_arch_has_readonly_mem(kvm) (!(kvm)->arch.has_protected_state)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 035ced06b2dd..2a02f2457c42 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -12718,7 +12718,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
return -EINVAL;
kvm->arch.vm_type = type;
- kvm->arch.supports_gmem = (type == KVM_X86_SW_PROTECTED_VM);
+ kvm->arch.supports_gmem =
+ type == KVM_X86_DEFAULT_VM || type == KVM_X86_SW_PROTECTED_VM;
/* Decided by the vendor code for other VM types. */
kvm->arch.pre_fault_allowed =
type == KVM_X86_DEFAULT_VM || type == KVM_X86_SW_PROTECTED_VM;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 80371475818f..ba83547e62b0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -729,6 +729,19 @@ static inline bool kvm_arch_supports_gmem(struct kvm *kvm)
}
#endif
+/*
+ * Returns true if this VM supports shared mem in guest_memfd.
+ *
+ * Arch code must define kvm_arch_supports_gmem_shared_mem if support for
+ * guest_memfd is enabled.
+ */
+#if !defined(kvm_arch_supports_gmem_shared_mem) && !IS_ENABLED(CONFIG_KVM_GMEM)
+static inline bool kvm_arch_supports_gmem_shared_mem(struct kvm *kvm)
+{
+ return false;
+}
+#endif
+
#ifndef kvm_arch_has_readonly_mem
static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm)
{
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index b6ae8ad8934b..c2714c9d1a0e 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1566,6 +1566,7 @@ struct kvm_memory_attributes {
#define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3)
#define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd)
+#define GUEST_MEMFD_FLAG_SUPPORT_SHARED (1ULL << 0)
struct kvm_create_guest_memfd {
__u64 size;
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 559c93ad90be..df225298ab10 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -128,3 +128,8 @@ config HAVE_KVM_ARCH_GMEM_PREPARE
config HAVE_KVM_ARCH_GMEM_INVALIDATE
bool
depends on KVM_GMEM
+
+config KVM_GMEM_SHARED_MEM
+ select KVM_GMEM
+ bool
+ prompt "Enable support for non-private (shared) memory in guest_memfd"
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 6db515833f61..5d34712f64fc 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -312,7 +312,81 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
return gfn - slot->base_gfn + slot->gmem.pgoff;
}
+static bool kvm_gmem_supports_shared(struct inode *inode)
+{
+ u64 flags;
+
+ if (!IS_ENABLED(CONFIG_KVM_GMEM_SHARED_MEM))
+ return false;
+
+ flags = (u64)inode->i_private;
+
+ return flags & GUEST_MEMFD_FLAG_SUPPORT_SHARED;
+}
+
+
+#ifdef CONFIG_KVM_GMEM_SHARED_MEM
+static vm_fault_t kvm_gmem_fault_shared(struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ struct folio *folio;
+ vm_fault_t ret = VM_FAULT_LOCKED;
+
+ folio = kvm_gmem_get_folio(inode, vmf->pgoff);
+ if (IS_ERR(folio)) {
+ int err = PTR_ERR(folio);
+
+ if (err == -EAGAIN)
+ return VM_FAULT_RETRY;
+
+ return vmf_error(err);
+ }
+
+ if (WARN_ON_ONCE(folio_test_large(folio))) {
+ ret = VM_FAULT_SIGBUS;
+ goto out_folio;
+ }
+
+ if (!folio_test_uptodate(folio)) {
+ clear_highpage(folio_page(folio, 0));
+ kvm_gmem_mark_prepared(folio);
+ }
+
+ vmf->page = folio_file_page(folio, vmf->pgoff);
+
+out_folio:
+ if (ret != VM_FAULT_LOCKED) {
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+
+ return ret;
+}
+
+static const struct vm_operations_struct kvm_gmem_vm_ops = {
+ .fault = kvm_gmem_fault_shared,
+};
+
+static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ if (!kvm_gmem_supports_shared(file_inode(file)))
+ return -ENODEV;
+
+ if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
+ (VM_SHARED | VM_MAYSHARE)) {
+ return -EINVAL;
+ }
+
+ vma->vm_ops = &kvm_gmem_vm_ops;
+
+ return 0;
+}
+#else
+#define kvm_gmem_mmap NULL
+#endif /* CONFIG_KVM_GMEM_SHARED_MEM */
+
static struct file_operations kvm_gmem_fops = {
+ .mmap = kvm_gmem_mmap,
.open = generic_file_open,
.release = kvm_gmem_release,
.fallocate = kvm_gmem_fallocate,
@@ -463,6 +537,9 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
u64 flags = args->flags;
u64 valid_flags = 0;
It seems there is an uncovered corner case, which exists in current code (not directly
caused by this patch): After .mmap is hooked, the address space (inode->i_mapping) is
exposed to user space for futher requests like madvise().madvise(MADV_COLLAPSE) can
potentially collapse the pages to a huge page (folio) with the following assumptions.
It's not the expected behavior since huge page isn't supported yet.
- CONFIG_READ_ONLY_THP_FOR_FS = y
- the folios in the pagecache have been fully populated, it can be done by kvm_gmem_fallocate()
or kvm_gmem_get_pfn().
- mmap(0x00000f0100000000, ..., MAP_FIXED_NOREPLACE) on the guest-memfd, and then do
madvise(buf, size, MADV_COLLAPSE).
sys_madvise
do_madvise
madvise_do_behavior
madvise_vma_behavior
madvise_collapse
thp_vma_allowable_order
file_thp_enabled // need to return false to bail from the path earlier at least
hpage_collapse_scan_file
collapse_pte_mapped_thp
The fix would be to increase inode->i_writecount using allow_write_access() in
__kvm_gmem_create() to break the check done by file_thp_enabled().
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 0cd12f94958b..fe706c9f21cf 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -502,6 +502,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
}
file->f_flags |= O_LARGEFILE;
+ allow_write_access(file);
+ if (kvm_arch_supports_gmem_shared_mem(kvm))
+ valid_flags |= GUEST_MEMFD_FLAG_SUPPORT_SHARED;
+
if (flags & ~valid_flags)
return -EINVAL;
@@ -501,6 +578,10 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
offset + size > i_size_read(inode))
goto err;
+ if (kvm_gmem_supports_shared(inode) &&
+ !kvm_arch_supports_gmem_shared_mem(kvm))
+ goto err;
+
filemap_invalidate_lock(inode->i_mapping);
start = offset >> PAGE_SHIFT;
Thanks,
Gavin