On 5/15/25 10:26, Amit Shah wrote:
> From: Amit Shah <amit.shah@xxxxxxx>
>
> AMD CPUs with the Enhanced Return Address Predictor (ERAPS) feature
> (Zen5+) obviate the need for FILL_RETURN_BUFFER sequences right after
> VMEXITs.  The feature adds guest/host tags to entries in the RSB
> (a.k.a. RAP).  This helps with speculation protection across the VM
> boundary, and it also preserves host and guest entries in the RSB,
> which can improve software performance (those entries would otherwise
> be flushed by the FILL_RETURN_BUFFER sequences).  The feature also
> extends the size of the RSB from the older standard of 32 entries to
> a new default enumerated in CPUID leaf 0x80000021:EBX bits 23:16 --
> which is 64 entries on Zen5 CPUs.
>
> In addition to flushing the RSB across VMEXIT boundaries, CPUs with
> this feature also flush the RSB when CR3 is updated (i.e. whenever
> there's a context switch), to prevent one userspace process from
> poisoning the RSB in a way that affects another process.  The
> relevance of this for KVM is explained below in caveat 2.
>
> The hardware feature is always-on, and the host context uses the full
> default RSB size without any software changes.  The presence of this
> feature allows software (in both host and guest contexts) to drop all
> RSB filling routines in favour of the hardware doing it.
>
> For guests to observe and use this feature, the hypervisor needs to
> expose the CPUID bit and also set a VMCB bit.  Without one or both of
> those, guests continue to use the older default RSB size and
> behaviour for backwards compatibility.  This means the hardware RSB
> size is limited to 32 entries for guests that do not have this
> feature exposed to them.
>
> There are two guest/host configurations that need to be addressed
> before allowing a guest to use this feature: nested guests, and hosts
> using shadow paging (i.e. when NPT is disabled):
>
> 1. Nested guests: the ERAPS feature adds host/guest tagging to
>    entries in the RSB, but does not distinguish between guest ASIDs.
>    To prevent an L2 guest from poisoning the RSB to attack the L1
>    guest, the CPU exposes a new VMCB bit (FLUSH_RAP_ON_VMRUN).  The
>    next VMRUN with a VMCB that has this bit set causes the CPU to
>    flush the RSB before entering the guest context.  This patch sets
>    the bit in VMCB01 after a nested #VMEXIT, to ensure that the next
>    time the L1 guest runs, its RSB contents aren't polluted by the
>    L2's contents.  Similarly, when an exit from L1 to the hypervisor
>    happens, it sets that bit for VMCB02, so that the L1 guest's RSB
>    contents are not leaked/used in the L2 context.
>
> 2. Hosts that disable NPT: the ERAPS feature also flushes the RSB
>    entries when CR3 is updated.  When shadow paging is in use, CR3
>    updates within the guest do not update the CPU's CR3 register.  In
>    this case, do not expose the ERAPS feature to guests; those guests
>    continue with the existing mitigations to fill the RSB.
>
> This patch to KVM ensures both those caveats are addressed, and sets
> the new ALLOW_LARGER_RAP VMCB bit that exposes the feature to the
> guest.  That allows the new default RSB size to be used in guest
> contexts as well, and allows the guest to drop its RSB flushing
> routines.
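As an aside, the enumeration described above is easy to sanity-check
from userspace.  A minimal sketch (my own illustration, not part of
the patch), using the leaf and bit positions from the commit message:

        /* Query CPUID leaf 0x80000021 for ERAPS and the RSB size. */
        #include <stdio.h>
        #include <cpuid.h>      /* __get_cpuid(), GCC/clang helper */

        int main(void)
        {
                unsigned int eax, ebx, ecx, edx;

                if (!__get_cpuid(0x80000021, &eax, &ebx, &ecx, &edx))
                        return 1;

                /* EAX bit 24 is ERAPS, matching X86_FEATURE_ERAPS (20*32+24) */
                printf("ERAPS: %s\n", (eax & (1u << 24)) ? "yes" : "no");

                /* EBX bits 23:16 enumerate the RSB size (64 on Zen5) */
                printf("RSB size: %u entries\n", (ebx >> 16) & 0xff);
                return 0;
        }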
>
> Signed-off-by: Amit Shah <amit.shah@xxxxxxx>
> ---
>  arch/x86/include/asm/cpufeatures.h |  1 +
>  arch/x86/include/asm/svm.h         |  6 +++++-
>  arch/x86/kvm/cpuid.c               | 10 +++++++++-
>  arch/x86/kvm/svm/svm.c             | 14 ++++++++++++++
>  arch/x86/kvm/svm/svm.h             | 20 ++++++++++++++++++++
>  5 files changed, 49 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
> index 39e61212ac9a..57264d5ab162 100644
> --- a/arch/x86/include/asm/cpufeatures.h
> +++ b/arch/x86/include/asm/cpufeatures.h
> @@ -457,6 +457,7 @@
>  #define X86_FEATURE_AUTOIBRS (20*32+ 8) /* Automatic IBRS */
>  #define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* SMM_CTL MSR is not present */
>
> +#define X86_FEATURE_ERAPS (20*32+24) /* Enhanced Return Address Predictor Security */
>  #define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction Barrier */
>  #define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */
>  #define X86_FEATURE_SRSO_NO (20*32+29) /* CPU is not affected by SRSO */
> diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
> index 9b7fa99ae951..cf6a94e64e58 100644
> --- a/arch/x86/include/asm/svm.h
> +++ b/arch/x86/include/asm/svm.h
> @@ -130,7 +130,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
>          u64 tsc_offset;
>          u32 asid;
>          u8 tlb_ctl;
> -        u8 reserved_2[3];
> +        u8 erap_ctl;
> +        u8 reserved_2[2];
>          u32 int_ctl;
>          u32 int_vector;
>          u32 int_state;
> @@ -176,6 +177,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
>  #define TLB_CONTROL_FLUSH_ASID 3
>  #define TLB_CONTROL_FLUSH_ASID_LOCAL 7
>
> +#define ERAP_CONTROL_ALLOW_LARGER_RAP BIT(0)
> +#define ERAP_CONTROL_FLUSH_RAP BIT(1)
> +
>  #define V_TPR_MASK 0x0f
>
>  #define V_IRQ_SHIFT 8
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index 571c906ffcbf..0cca1865826e 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -1187,6 +1187,9 @@ void kvm_set_cpu_caps(void)
>                  F(SRSO_USER_KERNEL_NO),
>          );
>
> +        if (tdp_enabled)
> +                kvm_cpu_cap_check_and_set(X86_FEATURE_ERAPS);

Should this be moved to svm_set_cpu_caps()?  And there it can be:

        if (npt_enabled)
                kvm_cpu_cap...

> +
>          kvm_cpu_cap_init(CPUID_8000_0022_EAX,
>                  F(PERFMON_V2),
>          );
> @@ -1756,8 +1759,13 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
>                  entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
>                  break;
>          case 0x80000021:
> -                entry->ebx = entry->ecx = entry->edx = 0;
> +                entry->ecx = entry->edx = 0;
>                  cpuid_entry_override(entry, CPUID_8000_0021_EAX);
> +                if (kvm_cpu_cap_has(X86_FEATURE_ERAPS))
> +                        entry->ebx &= GENMASK(23, 16);
> +                else
> +                        entry->ebx = 0;
> +

Extra blank line.

>                  break;
>          /* AMD Extended Performance Monitoring and Debug */
>          case 0x80000022: {
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index a89c271a1951..a2b075ed4133 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -1363,6 +1363,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
>          if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
>                  set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
>
> +        if (boot_cpu_has(X86_FEATURE_ERAPS) && npt_enabled)

Should this be:

        if (kvm_cpu_cap_has(X86_FEATURE_ERAPS))

?
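Putting those two comments together, the result might look like this
(a sketch of the suggestion only; the exact placement inside
svm_set_cpu_caps() is my assumption):

        /*
         * svm.c, svm_set_cpu_caps(): only advertise ERAPS with NPT,
         * since ERAPS relies on hardware CR3 writes to flush the RSB.
         */
        if (npt_enabled)
                kvm_cpu_cap_check_and_set(X86_FEATURE_ERAPS);

        /* init_vmcb() can then test the cap alone, as suggested above: */
        if (kvm_cpu_cap_has(X86_FEATURE_ERAPS))
                vmcb_enable_extended_rap(svm->vmcb);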
> +                vmcb_enable_extended_rap(svm->vmcb);
> +
>          if (kvm_vcpu_apicv_active(vcpu))
>                  avic_init_vmcb(svm, vmcb);
>
> @@ -3482,6 +3485,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
>          pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
>          pr_err("%-20s%d\n", "asid:", control->asid);
>          pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
> +        pr_err("%-20s%d\n", "erap_ctl:", control->erap_ctl);
>          pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
>          pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
>          pr_err("%-20s%08x\n", "int_state:", control->int_state);
> @@ -3663,6 +3667,11 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
>
>                  trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
>
> +                if (vmcb_is_extended_rap(svm->vmcb01.ptr)) {
> +                        vmcb_set_flush_guest_rap(svm->vmcb01.ptr);
> +                        vmcb_clr_flush_guest_rap(svm->nested.vmcb02.ptr);
> +                }
> +
>                  vmexit = nested_svm_exit_special(svm);
>
>                  if (vmexit == NESTED_EXIT_CONTINUE)
> @@ -3670,6 +3679,11 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
>
>                  if (vmexit == NESTED_EXIT_DONE)
>                          return 1;
> +        } else {
> +                if (vmcb_is_extended_rap(svm->vmcb01.ptr) && svm->nested.initialized) {
> +                        vmcb_set_flush_guest_rap(svm->nested.vmcb02.ptr);
> +                        vmcb_clr_flush_guest_rap(svm->vmcb01.ptr);
> +                }
>          }
>
>          if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index f16b068c4228..7f44f7c9b1d5 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -493,6 +493,26 @@ static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit)
>          return vmcb_is_intercept(&svm->vmcb->control, bit);
>  }
>
> +static inline void vmcb_set_flush_guest_rap(struct vmcb *vmcb)
> +{
> +        vmcb->control.erap_ctl |= ERAP_CONTROL_FLUSH_RAP;
> +}
> +
> +static inline void vmcb_clr_flush_guest_rap(struct vmcb *vmcb)
> +{
> +        vmcb->control.erap_ctl &= ~ERAP_CONTROL_FLUSH_RAP;
> +}
> +
> +static inline void vmcb_enable_extended_rap(struct vmcb *vmcb)

s/extended/larger/ to match the bit name?

> +{
> +        vmcb->control.erap_ctl |= ERAP_CONTROL_ALLOW_LARGER_RAP;
> +}
> +
> +static inline bool vmcb_is_extended_rap(struct vmcb *vmcb)

s/is_extended/has_larger/

Thanks,
Tom

> +{
> +        return !!(vmcb->control.erap_ctl & ERAP_CONTROL_ALLOW_LARGER_RAP);
> +}
> +
>  static inline bool nested_vgif_enabled(struct vcpu_svm *svm)
>  {
>          return guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_VGIF) &&
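For reference, the two suggested renames would read as follows (a
sketch; the helper bodies are unchanged from the patch, only the
names follow the s/extended/larger/ and s/is_extended/has_larger/
suggestions above):

        /* Allow the guest to use the full ERAPS-sized RSB/RAP. */
        static inline void vmcb_enable_larger_rap(struct vmcb *vmcb)
        {
                vmcb->control.erap_ctl |= ERAP_CONTROL_ALLOW_LARGER_RAP;
        }

        /* True if this VMCB has the larger RAP enabled for the guest. */
        static inline bool vmcb_has_larger_rap(struct vmcb *vmcb)
        {
                return !!(vmcb->control.erap_ctl & ERAP_CONTROL_ALLOW_LARGER_RAP);
        }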