Re: [PATCH v8 20/43] arm64: RME: Runtime faulting of memory

Steven Price <steven.price@xxxxxxx> · Wed, 21 May 2025 09:55:43 +0100

On 20/05/2025 15:48, Suzuki K Poulose wrote:
> On 16/05/2025 16:33, Steven Price wrote:
>> On 01/05/2025 01:16, Gavin Shan wrote:
>>> On 4/16/25 11:41 PM, Steven Price wrote:
>>>> At runtime if the realm guest accesses memory which hasn't yet been
>>>> mapped then KVM needs to either populate the region or fault the guest.
>>>>
>>>> For memory in the lower (protected) region of IPA a fresh page is
>>>> provided to the RMM which will zero the contents. For memory in the
>>>> upper (shared) region of IPA, the memory from the memslot is mapped
>>>> into the realm VM non secure.
>>>>
>>>> Signed-off-by: Steven Price <steven.price@xxxxxxx>
>>>> ---
>>>> Changes since v7:
>>>>    * Remove redundant WARN_ONs for realm_create_rtt_levels() - it will
>>>>      internally WARN when necessary.
>>>> Changes since v6:
>>>>    * Handle PAGE_SIZE being larger than RMM granule size.
>>>>    * Some minor renaming following review comments.
>>>> Changes since v5:
>>>>    * Reduce use of struct page in preparation for supporting the RMM
>>>>      having a different page size to the host.
>>>>    * Handle a race when delegating a page where another CPU has
>>>> faulted on
>>>>      the same page (and already delegated the physical page) but
>>>> not yet
>>>>      mapped it. In this case simply return to the guest to either
>>>> use the
>>>>      mapping from the other CPU (or refault if the race is lost).
>>>>    * The changes to populate_par_region() are moved into the previous
>>>>      patch where they belong.
>>>> Changes since v4:
>>>>    * Code cleanup following review feedback.
>>>>    * Drop the PTE_SHARED bit when creating unprotected page table
>>>> entries.
>>>>      This is now set by the RMM and the host has no control of it
>>>> and the
>>>>      spec requires the bit to be set to zero.
>>>> Changes since v2:
>>>>    * Avoid leaking memory if failing to map it in the realm.
>>>>    * Correctly mask RTT based on LPA2 flag (see rtt_get_phys()).
>>>>    * Adapt to changes in previous patches.
>>>> ---
>>>>    arch/arm64/include/asm/kvm_emulate.h |  10 ++
>>>>    arch/arm64/include/asm/kvm_rme.h     |  10 ++
>>>>    arch/arm64/kvm/mmu.c                 | 127 ++++++++++++++++++-
>>>>    arch/arm64/kvm/rme.c                 | 180 ++++++++++++++++++++++
>>>> +++++
>>>>    4 files changed, 321 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/
>>>> include/asm/kvm_emulate.h
>>>> index c803c8188d9c..def439d6d732 100644
>>>> --- a/arch/arm64/include/asm/kvm_emulate.h
>>>> +++ b/arch/arm64/include/asm/kvm_emulate.h
>>>> @@ -704,6 +704,16 @@ static inline bool kvm_realm_is_created(struct
>>>> kvm *kvm)
>>>>        return kvm_is_realm(kvm) && kvm_realm_state(kvm) !=
>>>> REALM_STATE_NONE;
>>>>    }
>>>>    +static inline gpa_t kvm_gpa_from_fault(struct kvm *kvm, phys_addr_t
>>>> ipa)
>>>> +{
>>>> +    if (kvm_is_realm(kvm)) {
>>>> +        struct realm *realm = &kvm->arch.realm;
>>>> +
>>>> +        return ipa & ~BIT(realm->ia_bits - 1);
>>>> +    }
>>>> +    return ipa;
>>>> +}
>>>> +
>>>>    static inline bool vcpu_is_rec(struct kvm_vcpu *vcpu)
>>>>    {
>>>>        if (static_branch_unlikely(&kvm_rme_is_available))
>>>> diff --git a/arch/arm64/include/asm/kvm_rme.h b/arch/arm64/include/
>>>> asm/kvm_rme.h
>>>> index d86051ef0c5c..47aa6362c6c9 100644
>>>> --- a/arch/arm64/include/asm/kvm_rme.h
>>>> +++ b/arch/arm64/include/asm/kvm_rme.h
>>>> @@ -108,6 +108,16 @@ void kvm_realm_unmap_range(struct kvm *kvm,
>>>>                   unsigned long ipa,
>>>>                   unsigned long size,
>>>>                   bool unmap_private);
>>>> +int realm_map_protected(struct realm *realm,
>>>> +            unsigned long base_ipa,
>>>> +            kvm_pfn_t pfn,
>>>> +            unsigned long size,
>>>> +            struct kvm_mmu_memory_cache *memcache);
>>>> +int realm_map_non_secure(struct realm *realm,
>>>> +             unsigned long ipa,
>>>> +             kvm_pfn_t pfn,
>>>> +             unsigned long size,
>>>> +             struct kvm_mmu_memory_cache *memcache);
>>>>      static inline bool kvm_realm_is_private_address(struct realm
>>>> *realm,
>>>>                            unsigned long addr)
>>>> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
>>>> index 71c04259e39f..02b66ee35426 100644
>>>> --- a/arch/arm64/kvm/mmu.c
>>>> +++ b/arch/arm64/kvm/mmu.c
>>>> @@ -338,8 +338,13 @@ static void __unmap_stage2_range(struct
>>>> kvm_s2_mmu *mmu, phys_addr_t start, u64
>>>>          lockdep_assert_held_write(&kvm->mmu_lock);
>>>>        WARN_ON(size & ~PAGE_MASK);
>>>> -    WARN_ON(stage2_apply_range(mmu, start, end,
>>>> KVM_PGT_FN(kvm_pgtable_stage2_unmap),
>>>> -                   may_block));
>>>> +
>>>> +    if (kvm_is_realm(kvm))
>>>> +        kvm_realm_unmap_range(kvm, start, size, !only_shared);
>>>> +    else
>>>> +        WARN_ON(stage2_apply_range(mmu, start, end,
>>>> +                       KVM_PGT_FN(kvm_pgtable_stage2_unmap),
>>>> +                       may_block));
>>>>    }
>>>>    
>>>
>>> As spotted previously, the parameter @may_block isn't handled by
>>> kvm_realm_unmap_range().
>>
>> Ack.
>>
>>>>    void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t
>>>> start,
>>>> @@ -359,7 +364,10 @@ static void stage2_flush_memslot(struct kvm *kvm,
>>>>        phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
>>>>        phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
>>>>    -    kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
>>>> +    if (kvm_is_realm(kvm))
>>>> +        kvm_realm_unmap_range(kvm, addr, end - addr, false);
>>>> +    else
>>>> +        kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
>>>>    }
>>>>      /**
>>>> @@ -1053,6 +1061,10 @@ void stage2_unmap_vm(struct kvm *kvm)
>>>>        struct kvm_memory_slot *memslot;
>>>>        int idx, bkt;
>>>>    +    /* For realms this is handled by the RMM so nothing to do
>>>> here */
>>>> +    if (kvm_is_realm(kvm))
>>>> +        return;
>>>> +
>>>>        idx = srcu_read_lock(&kvm->srcu);
>>>>        mmap_read_lock(current->mm);
>>>>        write_lock(&kvm->mmu_lock);
>>>> @@ -1078,6 +1090,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
>>>>        if (kvm_is_realm(kvm) &&
>>>>            (kvm_realm_state(kvm) != REALM_STATE_DEAD &&
>>>>             kvm_realm_state(kvm) != REALM_STATE_NONE)) {
>>>> +        kvm_stage2_unmap_range(mmu, 0, (~0ULL) & PAGE_MASK, false);
>>>>            write_unlock(&kvm->mmu_lock);
>>>>            kvm_realm_destroy_rtts(kvm, pgt->ia_bits);
>>>
>>> (~0ULL & PAGE_MASK) wouldn't be a problem since the range will be
>>> limited to
>>> [0, BIT(realm->ia_bits) - 1] in kvm_realm_unmap_range(). I think it's
>>> reasonable
>>> to pass the maximal size here, something like:
>>>
>>>          kvm_stage2_unmap_range(mmu, 0, BIT(realm->ia_bits - 1), false);
> 
> I think this must be, given the end is exclusive:
>        kvm_stage2_unmap_range(mmu, 0, BIT(realm->ia_bits), false);
> 
> BIT(realm->ia_bits - 1) only covers the protected half. The unprotected
> half spans [BIT(realm->ia_bits - 1), BIT(realm->ia_bits)).

The kernel treats the two halves as aliasing. kvm_realm_unmap_range()
caps the end to min(BIT(realm->ia_bits - 1), end). So this wouldn't make
any difference.

This is an unfortunate outcome of the memory slots describing both the
protected region (via guestmem_fd) and the shared region (via VMM maps).
So a single memslot describes two regions which leads to the kernel
treating those regions as aliasing for some purposes.

Thanks,
Steve