On 25/06/2025 10:00, Joey Gouly wrote:
> Hi Steven,
>
> On Wed, Jun 11, 2025 at 11:48:08AM +0100, Steven Price wrote:
>> The RMM maintains a data structure known as the Realm Execution Context
>> (or REC). It is similar to struct kvm_vcpu and tracks the state of the
>> virtual CPUs. KVM must delegate memory and request that the structures
>> are created when vCPUs are created, and suitably tear them down on
>> destruction.
>>
>> RECs must also be supplied with additional pages - auxiliary (or AUX)
>> granules - for storing the larger register state (e.g. for SVE). The
>> number of AUX granules for a REC depends on the parameters with which
>> the Realm was created - the RMM makes this information available via the
>> RMI_REC_AUX_COUNT call performed after creating the Realm Descriptor (RD).
>>
>> Note that only some of the register state for the REC can be set by KVM;
>> the rest is defined by the RMM (zeroed). The register state cannot then
>> be changed by KVM after the REC is created (except when the guest
>> explicitly requests this, e.g. by performing a PSCI call). The RMM also
>> requires that the VMM creates RECs in ascending order of the MPIDR.
>>
>> See the Realm Management Monitor specification (DEN0137) for more
>> information:
>> https://developer.arm.com/documentation/den0137/
>>
>> Signed-off-by: Steven Price <steven.price@xxxxxxx>
>> Reviewed-by: Gavin Shan <gshan@xxxxxxxxxx>
>> ---
>> Changes since v7:
>>  * Add comment explaining the aux_pages array.
>>  * Rename "undeleted_failed" variable to "should_free" to avoid a
>>    confusing double negative.
>> Changes since v6:
>>  * Avoid reporting the KVM_ARM_VCPU_REC feature if the guest isn't a
>>    realm guest.
>>  * Support host page size being larger than RMM's granule size when
>>    allocating/freeing aux granules.
>> Changes since v5:
>>  * Separate the concept of vcpu_is_rec() and
>>    kvm_arm_vcpu_rec_finalized() by using the KVM_ARM_VCPU_REC feature as
>>    the indication that the VCPU is a REC.
>> Changes since v2:
>>  * Free rec->run earlier in kvm_destroy_realm() and adapt to previous
>>    patches.
>> ---
>>  arch/arm64/include/asm/kvm_emulate.h |   7 ++
>>  arch/arm64/include/asm/kvm_host.h    |   3 +
>>  arch/arm64/include/asm/kvm_rme.h     |  27 ++++
>>  arch/arm64/kvm/arm.c                 |  13 +-
>>  arch/arm64/kvm/reset.c               |  11 ++
>>  arch/arm64/kvm/rme.c                 | 180 +++++++++++++++++++++++++++
>>  6 files changed, 239 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
>> index a640bb7dffbc..302a691b3723 100644
>> --- a/arch/arm64/include/asm/kvm_emulate.h
>> +++ b/arch/arm64/include/asm/kvm_emulate.h
>> @@ -711,7 +711,14 @@ static inline bool kvm_realm_is_created(struct kvm *kvm)
>>
>>  static inline bool vcpu_is_rec(struct kvm_vcpu *vcpu)
>>  {
>> +	if (static_branch_unlikely(&kvm_rme_is_available))
>> +		return vcpu_has_feature(vcpu, KVM_ARM_VCPU_REC);
>>  	return false;
>>  }
>>
>> +static inline bool kvm_arm_rec_finalized(struct kvm_vcpu *vcpu)
>> +{
>> +	return vcpu->arch.rec.mpidr != INVALID_HWID;
>> +}
>> +
>>  #endif /* __ARM64_KVM_EMULATE_H__ */
>> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
>> index 18e6d72dabe9..58fe7b216126 100644
>> --- a/arch/arm64/include/asm/kvm_host.h
>> +++ b/arch/arm64/include/asm/kvm_host.h
>> @@ -883,6 +883,9 @@ struct kvm_vcpu_arch {
>>
>>  	/* Per-vcpu TLB for VNCR_EL2 -- NULL when !NV */
>>  	struct vncr_tlb *vncr_tlb;
>> +
>> +	/* Realm meta data */
>> +	struct realm_rec rec;
>>  };
>>
>>  /*
>> diff --git a/arch/arm64/include/asm/kvm_rme.h b/arch/arm64/include/asm/kvm_rme.h
>> index 5f0de9a6d339..f716b890e484 100644
>> --- a/arch/arm64/include/asm/kvm_rme.h
>> +++ b/arch/arm64/include/asm/kvm_rme.h
>> @@ -6,6 +6,7 @@
>>  #ifndef __ASM_KVM_RME_H
>>  #define __ASM_KVM_RME_H
>>
>> +#include <asm/rmi_smc.h>
>>  #include <uapi/linux/kvm.h>
>>
>>  /**
>> @@ -65,6 +66,30 @@ struct realm {
>>  	unsigned int ia_bits;
>>  };
>>
>> +/**
>> + * struct realm_rec - Additional per VCPU data for a Realm
>> + *
>> + * @mpidr: MPIDR (Multiprocessor Affinity Register) value to identify this VCPU
>> + * @rec_page: Kernel VA of the RMM's private page for this REC
>> + * @aux_pages: Additional pages private to the RMM for this REC
>> + * @run: Kernel VA of the RmiRecRun structure shared with the RMM
>> + */
>> +struct realm_rec {
>> +	unsigned long mpidr;
>> +	void *rec_page;
>> +	/*
>> +	 * REC_PARAMS_AUX_GRANULES is the maximum number of granules that the
>> +	 * RMM can require. By using that to size the array we know that it
>> +	 * will be big enough as the page size is always at least as large as
>> +	 * the granule size. In the case of a larger page size than 4k (or an
>> +	 * RMM which requires fewer auxiliary granules), the array will be
>> +	 * bigger than needed however the extra memory required is small and
>> +	 * this keeps the code cleaner.
>> +	 */
>> +	struct page *aux_pages[REC_PARAMS_AUX_GRANULES];
>
> I think that something like this may work, and use the right amount of pages:
>
> struct page *aux_pages[(REC_PARAMS_AUX_GRANULES * RMM_PAGE_SIZE) >> PAGE_SHIFT];

Thanks, yes that should calculate the correct number of pages. I'm not
sure why I didn't figure that out before. A minor issue is that
RMM_PAGE_SIZE is only defined in rme.c, but I think (with a comment)
SZ_4K should suffice.
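
Concretely, something like the below is what I have in mind (untested
sketch, just to illustrate; SZ_4K stands in for RMM_PAGE_SIZE since that
define is local to rme.c):

	/*
	 * Each RMM granule is 4K (RMM_PAGE_SIZE, but that is only defined
	 * in rme.c, so use SZ_4K here). PAGE_SIZE is always at least 4K,
	 * so this gives the number of host pages needed to cover the
	 * maximum number of auxiliary granules the RMM can require.
	 */
	struct page *aux_pages[(REC_PARAMS_AUX_GRANULES * SZ_4K) >> PAGE_SHIFT];

The plain shift relies on REC_PARAMS_AUX_GRANULES * SZ_4K being a
multiple of PAGE_SIZE; if that ever stopped holding it would need
rounding up instead.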

Thanks,
Steve

> Thanks,
> Joey
>
>> +	struct rec_run *run;
>> +};
>> +
>>  void kvm_init_rme(void);
>>  u32 kvm_realm_ipa_limit(void);
>>
>> @@ -72,6 +97,8 @@ int kvm_realm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap);
>>  int kvm_init_realm_vm(struct kvm *kvm);
>>  void kvm_destroy_realm(struct kvm *kvm);
>>  void kvm_realm_destroy_rtts(struct kvm *kvm, u32 ia_bits);
>> +int kvm_create_rec(struct kvm_vcpu *vcpu);
>> +void kvm_destroy_rec(struct kvm_vcpu *vcpu);
>>
>>  static inline bool kvm_realm_is_private_address(struct realm *realm,
>>  						unsigned long addr)
>> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
>> index b3e3323573c6..7be1bdfc5f0b 100644
>> --- a/arch/arm64/kvm/arm.c
>> +++ b/arch/arm64/kvm/arm.c
>> @@ -495,6 +495,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
>>  	/* Force users to call KVM_ARM_VCPU_INIT */
>>  	vcpu_clear_flag(vcpu, VCPU_INITIALIZED);
>>
>> +	vcpu->arch.rec.mpidr = INVALID_HWID;
>> +
>>  	vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO;
>>
>>  	/* Set up the timer */
>> @@ -1457,7 +1459,7 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
>>  	return -EINVAL;
>>  }
>>
>> -static unsigned long system_supported_vcpu_features(void)
>> +static unsigned long system_supported_vcpu_features(struct kvm *kvm)
>>  {
>>  	unsigned long features = KVM_VCPU_VALID_FEATURES;
>>
>> @@ -1478,6 +1480,9 @@ static unsigned long system_supported_vcpu_features(void)
>>  	if (!cpus_have_final_cap(ARM64_HAS_NESTED_VIRT))
>>  		clear_bit(KVM_ARM_VCPU_HAS_EL2, &features);
>>
>> +	if (!kvm_is_realm(kvm))
>> +		clear_bit(KVM_ARM_VCPU_REC, &features);
>> +
>>  	return features;
>>  }
>>
>> @@ -1495,7 +1500,7 @@ static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu,
>>  		return -ENOENT;
>>  	}
>>
>> -	if (features & ~system_supported_vcpu_features())
>> +	if (features & ~system_supported_vcpu_features(vcpu->kvm))
>>  		return -EINVAL;
>>
>>  	/*
>> @@ -1517,6 +1522,10 @@ static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu,
>>  	if (test_bit(KVM_ARM_VCPU_HAS_EL2, &features))
>>  		return -EINVAL;
>>
>> +	/* RME is incompatible with AArch32 */
>> +	if (test_bit(KVM_ARM_VCPU_REC, &features))
>> +		return -EINVAL;
>> +
>>  	return 0;
>>  }
>>
>> diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
>> index 959532422d3a..2e9e855581d4 100644
>> --- a/arch/arm64/kvm/reset.c
>> +++ b/arch/arm64/kvm/reset.c
>> @@ -137,6 +137,11 @@ int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature)
>>  			return -EPERM;
>>
>>  		return kvm_vcpu_finalize_sve(vcpu);
>> +	case KVM_ARM_VCPU_REC:
>> +		if (!kvm_is_realm(vcpu->kvm) || !vcpu_is_rec(vcpu))
>> +			return -EINVAL;
>> +
>> +		return kvm_create_rec(vcpu);
>>  	}
>>
>>  	return -EINVAL;
>> @@ -147,6 +152,11 @@ bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu)
>>  	if (vcpu_has_sve(vcpu) && !kvm_arm_vcpu_sve_finalized(vcpu))
>>  		return false;
>>
>> +	if (kvm_is_realm(vcpu->kvm) &&
>> +	    !(vcpu_is_rec(vcpu) && kvm_arm_rec_finalized(vcpu) &&
>> +	      READ_ONCE(vcpu->kvm->arch.realm.state) == REALM_STATE_ACTIVE))
>> +		return false;
>> +
>>  	return true;
>>  }
>>
>> @@ -161,6 +171,7 @@ void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu)
>>  	free_page((unsigned long)vcpu->arch.ctxt.vncr_array);
>>  	kfree(vcpu->arch.vncr_tlb);
>>  	kfree(vcpu->arch.ccsidr);
>> +	kvm_destroy_rec(vcpu);
>>  }
>>
>>  static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu)
>> diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c
>> index 0f89295fa59c..f094544592b5 100644
>> --- a/arch/arm64/kvm/rme.c
>> +++ b/arch/arm64/kvm/rme.c
>> @@ -484,6 +484,186 @@ void kvm_destroy_realm(struct kvm *kvm)
>>  	kvm_free_stage2_pgd(&kvm->arch.mmu);
>>  }
>>
>> +static void free_rec_aux(struct page **aux_pages,
>> +			 unsigned int num_aux)
>> +{
>> +	unsigned int i, j;
>> +	unsigned int page_count = 0;
>> +
>> +	for (i = 0; i < num_aux;) {
>> +		struct page *aux_page = aux_pages[page_count++];
>> +		phys_addr_t aux_page_phys = page_to_phys(aux_page);
>> +		bool should_free = true;
>> +
>> +		for (j = 0; j < PAGE_SIZE && i < num_aux; j += RMM_PAGE_SIZE) {
>> +			if (WARN_ON(rmi_granule_undelegate(aux_page_phys)))
>> +				should_free = false;
>> +			aux_page_phys += RMM_PAGE_SIZE;
>> +			i++;
>> +		}
>> +		/* Only free if all the undelegate calls were successful */
>> +		if (should_free)
>> +			__free_page(aux_page);
>> +	}
>> +}
>> +
>> +static int alloc_rec_aux(struct page **aux_pages,
>> +			 u64 *aux_phys_pages,
>> +			 unsigned int num_aux)
>> +{
>> +	struct page *aux_page;
>> +	int page_count = 0;
>> +	unsigned int i, j;
>> +	int ret;
>> +
>> +	for (i = 0; i < num_aux;) {
>> +		phys_addr_t aux_page_phys;
>> +
>> +		aux_page = alloc_page(GFP_KERNEL);
>> +		if (!aux_page) {
>> +			ret = -ENOMEM;
>> +			goto out_err;
>> +		}
>> +
>> +		aux_page_phys = page_to_phys(aux_page);
>> +		for (j = 0; j < PAGE_SIZE && i < num_aux; j += RMM_PAGE_SIZE) {
>> +			if (rmi_granule_delegate(aux_page_phys)) {
>> +				ret = -ENXIO;
>> +				goto err_undelegate;
>> +			}
>> +			aux_phys_pages[i++] = aux_page_phys;
>> +			aux_page_phys += RMM_PAGE_SIZE;
>> +		}
>> +		aux_pages[page_count++] = aux_page;
>> +	}
>> +
>> +	return 0;
>> +err_undelegate:
>> +	while (j > 0) {
>> +		j -= RMM_PAGE_SIZE;
>> +		i--;
>> +		if (WARN_ON(rmi_granule_undelegate(aux_phys_pages[i]))) {
>> +			/* Leak the page if the undelegate fails */
>> +			goto out_err;
>> +		}
>> +	}
>> +	__free_page(aux_page);
>> +out_err:
>> +	free_rec_aux(aux_pages, i);
>> +	return ret;
>> +}
>> +
>> +int kvm_create_rec(struct kvm_vcpu *vcpu)
>> +{
>> +	struct user_pt_regs *vcpu_regs = vcpu_gp_regs(vcpu);
>> +	unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
>> +	struct realm *realm = &vcpu->kvm->arch.realm;
>> +	struct realm_rec *rec = &vcpu->arch.rec;
>> +	unsigned long rec_page_phys;
>> +	struct rec_params *params;
>> +	int r, i;
>> +
>> +	if (kvm_realm_state(vcpu->kvm) != REALM_STATE_NEW)
>> +		return -ENOENT;
>> +
>> +	if (rec->run)
>> +		return -EBUSY;
>> +
>> +	/*
>> +	 * The RMM will report PSCI v1.0 to Realms and the KVM_ARM_VCPU_PSCI_0_2
>> +	 * flag covers v0.2 and onwards.
>> +	 */
>> +	if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_PSCI_0_2))
>> +		return -EINVAL;
>> +
>> +	BUILD_BUG_ON(sizeof(*params) > PAGE_SIZE);
>> +	BUILD_BUG_ON(sizeof(*rec->run) > PAGE_SIZE);
>> +
>> +	params = (struct rec_params *)get_zeroed_page(GFP_KERNEL);
>> +	rec->rec_page = (void *)__get_free_page(GFP_KERNEL);
>> +	rec->run = (void *)get_zeroed_page(GFP_KERNEL);
>> +	if (!params || !rec->rec_page || !rec->run) {
>> +		r = -ENOMEM;
>> +		goto out_free_pages;
>> +	}
>> +
>> +	for (i = 0; i < ARRAY_SIZE(params->gprs); i++)
>> +		params->gprs[i] = vcpu_regs->regs[i];
>> +
>> +	params->pc = vcpu_regs->pc;
>> +
>> +	if (vcpu->vcpu_id == 0)
>> +		params->flags |= REC_PARAMS_FLAG_RUNNABLE;
>> +
>> +	rec_page_phys = virt_to_phys(rec->rec_page);
>> +
>> +	if (rmi_granule_delegate(rec_page_phys)) {
>> +		r = -ENXIO;
>> +		goto out_free_pages;
>> +	}
>> +
>> +	r = alloc_rec_aux(rec->aux_pages, params->aux, realm->num_aux);
>> +	if (r)
>> +		goto out_undelegate_rmm_rec;
>> +
>> +	params->num_rec_aux = realm->num_aux;
>> +	params->mpidr = mpidr;
>> +
>> +	if (rmi_rec_create(virt_to_phys(realm->rd),
>> +			   rec_page_phys,
>> +			   virt_to_phys(params))) {
>> +		r = -ENXIO;
>> +		goto out_free_rec_aux;
>> +	}
>> +
>> +	rec->mpidr = mpidr;
>> +
>> +	free_page((unsigned long)params);
>> +	return 0;
>> +
>> +out_free_rec_aux:
>> +	free_rec_aux(rec->aux_pages, realm->num_aux);
>> +out_undelegate_rmm_rec:
>> +	if (WARN_ON(rmi_granule_undelegate(rec_page_phys)))
>> +		rec->rec_page = NULL;
>> +out_free_pages:
>> +	free_page((unsigned long)rec->run);
>> +	free_page((unsigned long)rec->rec_page);
>> +	free_page((unsigned long)params);
>> +	return r;
>> +}
>> +
>> +void kvm_destroy_rec(struct kvm_vcpu *vcpu)
>> +{
>> +	struct realm *realm = &vcpu->kvm->arch.realm;
>> +	struct realm_rec *rec = &vcpu->arch.rec;
>> +	unsigned long rec_page_phys;
>> +
>> +	if (!vcpu_is_rec(vcpu))
>> +		return;
>> +
>> +	if (!rec->run) {
>> +		/* Nothing to do if the VCPU hasn't been finalized */
>> +		return;
>> +	}
>> +
>> +	free_page((unsigned long)rec->run);
>> +
>> +	rec_page_phys = virt_to_phys(rec->rec_page);
>> +
>> +	/*
>> +	 * The REC and any AUX pages cannot be reclaimed until the REC is
>> +	 * destroyed. So if the REC destroy fails then the REC page and any AUX
>> +	 * pages will be leaked.
>> +	 */
>> +	if (WARN_ON(rmi_rec_destroy(rec_page_phys)))
>> +		return;
>> +
>> +	free_rec_aux(rec->aux_pages, realm->num_aux);
>> +
>> +	free_delegated_granule(rec_page_phys);
>> +}
>> +
>>  int kvm_init_realm_vm(struct kvm *kvm)
>>  {
>>  	kvm->arch.realm.params = (void *)get_zeroed_page(GFP_KERNEL);
>> --
>> 2.43.0
>>