On 19/05/2025 18:35, Suzuki K Poulose wrote:
> Hi Steven
> 
> On 16/04/2025 14:41, Steven Price wrote:
>> At runtime if the realm guest accesses memory which hasn't yet been
>> mapped then KVM needs to either populate the region or fault the guest.
>>
>> For memory in the lower (protected) region of IPA a fresh page is
>> provided to the RMM which will zero the contents. For memory in the
>> upper (shared) region of IPA, the memory from the memslot is mapped
>> into the realm VM non secure.
>>
>> Signed-off-by: Steven Price <steven.price@xxxxxxx>
> 
> Please find some comments below.
> 
> ...
> 
>> diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c
>> index f6af3ea6ea8a..b6959cd17a6c 100644
>> --- a/arch/arm64/kvm/rme.c
>> +++ b/arch/arm64/kvm/rme.c
>> @@ -714,6 +714,186 @@ static int realm_create_protected_data_page(struct realm *realm,
>>          return -ENXIO;
>>  }
>> +static int fold_rtt(struct realm *realm, unsigned long addr, int level)
>> +{
>> +        phys_addr_t rtt_addr;
>> +        int ret;
>> +
>> +        ret = realm_rtt_fold(realm, addr, level, &rtt_addr);
>> +        if (ret)
>> +                return ret;
>> +
>> +        free_delegated_granule(rtt_addr);
>> +
>> +        return 0;
>> +}
>> +
>> +int realm_map_protected(struct realm *realm,
>> +                        unsigned long ipa,
>> +                        kvm_pfn_t pfn,
>> +                        unsigned long map_size,
>> +                        struct kvm_mmu_memory_cache *memcache)
>> +{
>> +        phys_addr_t phys = __pfn_to_phys(pfn);
>> +        phys_addr_t rd = virt_to_phys(realm->rd);
>> +        unsigned long base_ipa = ipa;
>> +        unsigned long size;
>> +        int map_level;
>> +        int ret = 0;
>> +
>> +        if (WARN_ON(!IS_ALIGNED(map_size, RMM_PAGE_SIZE)))
>> +                return -EINVAL;
>> +
>> +        if (WARN_ON(!IS_ALIGNED(ipa, map_size)))
>> +                return -EINVAL;
>> +
>> +        if (IS_ALIGNED(map_size, RMM_L2_BLOCK_SIZE))
>> +                map_level = 2;
> 
> minor nit : RMM_RTT_BLOCK_LEVEL
> 
>> +        else
>> +                map_level = 3;
> 
> minor nit: RMM_RTT_MAX_LEVEL ?

Ack
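
i.e. something like the below (assuming an RMM_RTT_BLOCK_LEVEL define
is added alongside the existing RMM_RTT_MAX_LEVEL):

        if (IS_ALIGNED(map_size, RMM_L2_BLOCK_SIZE))
                map_level = RMM_RTT_BLOCK_LEVEL;        /* level 2 */
        else
                map_level = RMM_RTT_MAX_LEVEL;          /* level 3 */

and the same for the equivalent 2/3 literals in realm_map_non_secure()
below.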
>> +
>> +        if (map_level < RMM_RTT_MAX_LEVEL) {
>> +                /*
>> +                 * A temporary RTT is needed during the map, precreate it,
>> +                 * however if there is an error (e.g. missing parent tables)
>> +                 * this will be handled below.
>> +                 */
>> +                realm_create_rtt_levels(realm, ipa, map_level,
>> +                                        RMM_RTT_MAX_LEVEL, memcache);
>> +        }
>> +
>> +        for (size = 0; size < map_size; size += RMM_PAGE_SIZE) {
>> +                if (rmi_granule_delegate(phys)) {
>> +                        /*
>> +                         * It's likely we raced with another VCPU on the same
>> +                         * fault. Assume the other VCPU has handled the fault
>> +                         * and return to the guest.
>> +                         */
>> +                        return 0;
>> +                }
>> +
>> +                ret = rmi_data_create_unknown(rd, phys, ipa);
>> +
>> +                if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
>> +                        /* Create missing RTTs and retry */
>> +                        int level = RMI_RETURN_INDEX(ret);
>> +
>> +                        WARN_ON(level == RMM_RTT_MAX_LEVEL);
>> +
>> +                        ret = realm_create_rtt_levels(realm, ipa, level,
>> +                                                      RMM_RTT_MAX_LEVEL,
>> +                                                      memcache);
>> +                        if (ret)
>> +                                goto err_undelegate;
>> +
>> +                        ret = rmi_data_create_unknown(rd, phys, ipa);
>> +                }
>> +
>> +                if (WARN_ON(ret))
>> +                        goto err_undelegate;
>> +
>> +                phys += RMM_PAGE_SIZE;
>> +                ipa += RMM_PAGE_SIZE;
>> +        }
>> +
>> +        if (map_size == RMM_L2_BLOCK_SIZE) {
>> +                ret = fold_rtt(realm, base_ipa, map_level + 1);
>> +                if (WARN_ON(ret))
>> +                        goto err;
>> +        }
>> +
>> +        return 0;
>> +
>> +err_undelegate:
>> +        if (WARN_ON(rmi_granule_undelegate(phys))) {
>> +                /* Page can't be returned to NS world so is lost */
>> +                get_page(phys_to_page(phys));
>> +        }
>> +err:
>> +        while (size > 0) {
>> +                unsigned long data, top;
>> +
>> +                phys -= RMM_PAGE_SIZE;
>> +                size -= RMM_PAGE_SIZE;
>> +                ipa -= RMM_PAGE_SIZE;
>> +
>> +                WARN_ON(rmi_data_destroy(rd, ipa, &data, &top));
>> +
>> +                if (WARN_ON(rmi_granule_undelegate(phys))) {
>> +                        /* Page can't be returned to NS world so is lost */
>> +                        get_page(phys_to_page(phys));
>> +                }
>> +        }
>> +        return -ENXIO;
>> +}
>> +
>> +int realm_map_non_secure(struct realm *realm,
>> +                         unsigned long ipa,
>> +                         kvm_pfn_t pfn,
>> +                         unsigned long size,
>> +                         struct kvm_mmu_memory_cache *memcache)
>> +{
>> +        phys_addr_t rd = virt_to_phys(realm->rd);
>> +        phys_addr_t phys = __pfn_to_phys(pfn);
>> +        unsigned long offset;
>> +        int map_size, map_level;
>> +        int ret = 0;
>> +
>> +        if (WARN_ON(!IS_ALIGNED(size, RMM_PAGE_SIZE)))
>> +                return -EINVAL;
>> +
>> +        if (WARN_ON(!IS_ALIGNED(ipa, size)))
>> +                return -EINVAL;
>> +
>> +        if (IS_ALIGNED(size, RMM_L2_BLOCK_SIZE)) {
>> +                map_level = 2;
>> +                map_size = RMM_L2_BLOCK_SIZE;
> 
> Same here, stick to the symbols than digits.

Ack

>> +        } else {
>> +                map_level = 3;
>> +                map_size = RMM_PAGE_SIZE;
>> +        }
>> +
>> +        for (offset = 0; offset < size; offset += map_size) {
>> +                /*
>> +                 * realm_map_ipa() enforces that the memory is writable,
> 
> The function names seems to be obsolete, please fix.

realm_map_ipa() is in arch/arm64/kvm/mmu.c and contains...

        /*
         * Write permission is required for now even though it's possible to
         * map unprotected pages (granules) as read-only. It's impossible to
         * map protected pages (granules) as read-only.
         */
        if (WARN_ON(!(prot & KVM_PGTABLE_PROT_W)))
                return -EFAULT;

...which is what this is referring to.

[And now I've read your follow up email ;) ]

>> +                 * so for now we permit both read and write.
>> +                 */
>> +                unsigned long desc = phys |
>> +                                     PTE_S2_MEMATTR(MT_S2_FWB_NORMAL) |
>> +                                     KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R |
>> +                                     KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
>> +                ret = rmi_rtt_map_unprotected(rd, ipa, map_level, desc);
>> +
>> +                if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
> 
> Could we hit the following case and end up in failure ?
> 
>  * Initially a single page is shared and the S2 is mapped
>  * Later the Realm shares the entire L2 block and encounters a fault
>    at a new IPA within the L2 block.
> 
> In this case, we may try to L2 mapping when there is a L3 mapping and
> we could encounter (RMI_ERROR_RTT, 2).

Ugh! You are of course correct, this is broken.

It turns out this is all irrelevant as things stand because we don't
currently support anything bigger than (host) PAGE_SIZE mappings
(forced in user_mem_abort()).

Given this I'm very much tempted to just delete the code supporting
block mappings for now (just set map_level to RMM_RTT_MAX_LEVEL).
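
Concretely, the stop-gap in realm_map_non_secure() would just be
something like the below (untested sketch), replacing the
IS_ALIGNED(size, RMM_L2_BLOCK_SIZE) check:

        /*
         * Only map at the page level for now. Block mappings can't be
         * requested anyway (user_mem_abort() forces PAGE_SIZE mappings
         * for realms), and only ever mapping at RMM_RTT_MAX_LEVEL means
         * the conflicting L2/L3 case described above can't arise.
         */
        map_level = RMM_RTT_MAX_LEVEL;
        map_size = RMM_PAGE_SIZE;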
Longer term, I'm not sure what the best approach is; the options are:

 1. Drop down and map the region at L3 (ignoring conflicting entries),
    followed by attempting to fold.

 2. Destroy the table and directly map at L2.

Option 2 is clearly the most performant, but is potentially racy. In
particular I'm not 100% sure whether you could end up with another
thread attempting to map at L3 between the destroy and the map, and
therefore recreating the table.

Thanks,
Steve

> 
>> +                        /* Create missing RTTs and retry */
>> +                        int level = RMI_RETURN_INDEX(ret);
> 
> So we should probably go down the rtt create step, with the following
> check.
> 
>         if (level < map_level) {
> 
>> +
>> +                        ret = realm_create_rtt_levels(realm, ipa, level,
>> +                                                      map_level, memcache);
>> +                        if (ret)
>> +                                return -ENXIO;
>> +
>> +                        ret = rmi_rtt_map_unprotected(rd, ipa, map_level, desc);
> 
>         } else {
> 
> Otherwise, may be we need to do some more hard work to fix it up.
> 
> 1. If map_level == 3, something is terribly wrong or we raced with
>    another thread ?
> 
> 2. If map_level < 3 and we didn't race :
> 
>    a. Going one level down and creating the mappings there and then
>       folding. But we could endup dealing with ERROR_RTT,3 as in (1).
> 
>    b. Easiest is to destroy the table at "map_level + 1" and retry the map.
> 
> Suzuki
> 
> 
>> +                }
>> +                /*
>> +                 * RMI_ERROR_RTT can be reported for two reasons: either the
>> +                 * RTT tables are not there, or there is an RTTE already
>> +                 * present for the address. The call to
>> +                 * realm_create_rtt_levels() above handles the first case, and
>> +                 * in the second case this indicates that another thread has
>> +                 * already populated the RTTE for us, so we can ignore the
>> +                 * error and continue.
>> +                 */
>> +                if (ret && RMI_RETURN_STATUS(ret) != RMI_ERROR_RTT)
>> +                        return -ENXIO;
>> +
>> +                ipa += map_size;
>> +                phys += map_size;
>> +        }
>> +
>> +        return 0;
>> +}
>> +
>>  static int populate_region(struct kvm *kvm,
>>                             phys_addr_t ipa_base,
>>                             phys_addr_t ipa_end,
> 