On Wed, Jun 11, 2025 at 3:59 AM Steven Price <steven.price@xxxxxxx> wrote:
>
> +static int realm_create_protected_data_page(struct realm *realm,
> +                                            unsigned long ipa,
> +                                            kvm_pfn_t dst_pfn,
> +                                            kvm_pfn_t src_pfn,
> +                                            unsigned long flags)
> +{
> +        unsigned long rd = virt_to_phys(realm->rd);
> +        phys_addr_t dst_phys, src_phys;
> +        bool undelegate_failed = false;
> +        int ret, offset;
> +
> +        dst_phys = __pfn_to_phys(dst_pfn);
> +        src_phys = __pfn_to_phys(src_pfn);
> +
> +        for (offset = 0; offset < PAGE_SIZE; offset += RMM_PAGE_SIZE) {
> +                ret = realm_create_protected_data_granule(realm,
> +                                                          ipa,
> +                                                          dst_phys,
> +                                                          src_phys,
> +                                                          flags);
> +                if (ret)
> +                        goto err;
> +
> +                ipa += RMM_PAGE_SIZE;
> +                dst_phys += RMM_PAGE_SIZE;
> +                src_phys += RMM_PAGE_SIZE;
> +        }
> +
> +        return 0;
> +
> +err:
> +        if (ret == -EIO) {
> +                /* current offset needs undelegating */
> +                if (WARN_ON(rmi_granule_undelegate(dst_phys)))
> +                        undelegate_failed = true;
> +        }
> +        while (offset > 0) {
> +                ipa -= RMM_PAGE_SIZE;
> +                offset -= RMM_PAGE_SIZE;
> +                dst_phys -= RMM_PAGE_SIZE;
> +
> +                rmi_data_destroy(rd, ipa, NULL, NULL);
> +
> +                if (WARN_ON(rmi_granule_undelegate(dst_phys)))
> +                        undelegate_failed = true;
> +        }
> +
> +        if (undelegate_failed) {
> +                /*
> +                 * A granule could not be undelegated,
> +                 * so the page has to be leaked
> +                 */
> +                get_page(pfn_to_page(dst_pfn));

I would like to point out that support for in-place conversion with
guest_memfd using hugetlb pages [1] is under discussion. As part of
in-place conversion, the policy we are rooting for is to avoid any
refcounts from KVM on folios supplied by guest_memfd, because in-place
conversion works by splitting and merging folios during memory
conversion, as per the discussion at LPC [2].

With huge page support around, the best way to prevent further use of
this page would be to either:
1) explicitly inform guest_memfd that a particular pfn is in use by
   KVM, without relying on page refcounts, or
2) mark the page as hwpoisoned (needs further discussion); a rough
   sketch of what that could look like is below.

This page refcounting strategy will have to be revisited depending on
which series lands first. That being said, it would be great if ARM
could review/verify whether series [1] works for backing CCA VMs with
huge pages.
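To make option 2 concrete, here is a minimal sketch of what the leak
path could turn into (this assumes the hwpoison semantics for
guest_memfd/hugetlb folios get agreed on first, and is not a proposal
for the final interface):

        if (undelegate_failed) {
                /*
                 * A granule could not be undelegated. Instead of pinning
                 * the page with get_page(), mark it hwpoisoned so it is
                 * never handed out again (the exact semantics for
                 * guest_memfd/hugetlb folios still need discussion).
                 */
                SetPageHWPoison(pfn_to_page(dst_pfn));
        }

The point being that KVM then holds no long-term refcount on a folio
that guest_memfd may later want to split or merge.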
[1] https://lore.kernel.org/kvm/cover.1747264138.git.ackerleytng@xxxxxxxxxx/
[2] https://lpc.events/event/18/contributions/1764/

> +        }
> +
> +        return -ENXIO;
> +}
> +
> +static int populate_region(struct kvm *kvm,
> +                           phys_addr_t ipa_base,
> +                           phys_addr_t ipa_end,
> +                           unsigned long data_flags)
> +{
> +        struct realm *realm = &kvm->arch.realm;
> +        struct kvm_memory_slot *memslot;
> +        gfn_t base_gfn, end_gfn;
> +        int idx;
> +        phys_addr_t ipa = ipa_base;
> +        int ret = 0;
> +
> +        base_gfn = gpa_to_gfn(ipa_base);
> +        end_gfn = gpa_to_gfn(ipa_end);
> +
> +        idx = srcu_read_lock(&kvm->srcu);
> +        memslot = gfn_to_memslot(kvm, base_gfn);
> +        if (!memslot) {
> +                ret = -EFAULT;
> +                goto out;
> +        }
> +
> +        /* We require the region to be contained within a single memslot */
> +        if (memslot->base_gfn + memslot->npages < end_gfn) {
> +                ret = -EINVAL;
> +                goto out;
> +        }
> +
> +        if (!kvm_slot_can_be_private(memslot)) {
> +                ret = -EPERM;
> +                goto out;
> +        }
> +
> +        while (ipa < ipa_end) {
> +                struct vm_area_struct *vma;
> +                unsigned long hva;
> +                struct page *page;
> +                bool writeable;
> +                kvm_pfn_t pfn;
> +                kvm_pfn_t priv_pfn;
> +                struct page *gmem_page;
> +
> +                hva = gfn_to_hva_memslot(memslot, gpa_to_gfn(ipa));
> +                vma = vma_lookup(current->mm, hva);
> +                if (!vma) {
> +                        ret = -EFAULT;
> +                        break;
> +                }
> +
> +                pfn = __kvm_faultin_pfn(memslot, gpa_to_gfn(ipa), FOLL_WRITE,
> +                                        &writeable, &page);

Is this assuming double backing of guest memory ranges? Is this logic
trying to simulate a shared fault? Does memory population work with
CCA if priv_pfn and pfn are the same? I am curious how memory
population will work once in-place conversion support is available
for guest_memfd files; a rough sketch of what I have in mind is at
the end of this mail.

> +
> +                if (is_error_pfn(pfn)) {
> +                        ret = -EFAULT;
> +                        break;
> +                }
> +
> +                ret = kvm_gmem_get_pfn(kvm, memslot,
> +                                       ipa >> PAGE_SHIFT,
> +                                       &priv_pfn, &gmem_page, NULL);
> +                if (ret)
> +                        break;
> +
> +                ret = realm_create_protected_data_page(realm, ipa,
> +                                                       priv_pfn,
> +                                                       pfn,
> +                                                       data_flags);
> +
> +                kvm_release_page_clean(page);
> +
> +                if (ret)
> +                        break;
> +
> +                ipa += PAGE_SIZE;
> +        }
> +
> +out:
> +        srcu_read_unlock(&kvm->srcu, idx);
> +        return ret;
> +}
> +
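To make that last question concrete, with in-place conversion I would
not expect a separate shared alias to fault in at all, so the loop
body would look roughly like the below (a sketch only, reusing the
declarations and helpers from the patch above; whether the data-create
flow can consume the same granule as both source and destination is
exactly what I am asking):

                /*
                 * Illustrative only: with in-place conversion the gmem
                 * pfn would provide both the initial contents and the
                 * granule that gets delegated, with no hva/VMA lookup.
                 */
                ret = kvm_gmem_get_pfn(kvm, memslot,
                                       ipa >> PAGE_SHIFT,
                                       &priv_pfn, &gmem_page, NULL);
                if (ret)
                        break;

                ret = realm_create_protected_data_page(realm, ipa,
                                                       priv_pfn,
                                                       priv_pfn,
                                                       data_flags);

                /*
                 * Reference handling here would follow whatever lifetime
                 * rule guest_memfd ends up with (see the refcount
                 * discussion above).
                 */
                kvm_release_page_clean(gmem_page);

                if (ret)
                        break;

                ipa += PAGE_SIZE;

If the same granule cannot serve as both the source and the
destination of the data-create flow, it would be good to spell out how
population is expected to work in the in-place case.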