On 5/15/25 10:26, Amit Shah wrote:
> From: Amit Shah <amit.shah@xxxxxxx>
>
> AMD CPUs with the Enhanced Return Address Predictor (ERAPS) feature
> (Zen5+) obviate the need for FILL_RETURN_BUFFER sequences right after
> VMEXITs.  The feature adds guest/host tags to entries in the RSB
> (a.k.a. RAP).  This helps with speculation protection across the VM
> boundary, and it also preserves host and guest entries in the RSB,
> which can improve software performance (those entries would otherwise
> be flushed by the FILL_RETURN_BUFFER sequences).  The feature also
> extends the size of the RSB from the older standard of 32 entries to
> a new default enumerated in CPUID leaf 0x80000021:EBX bits 23:16 --
> which is 64 entries on Zen5 CPUs.
>
> In addition to flushing the RSB across VMEXIT boundaries, CPUs with
> this feature also flush the RSB when CR3 is updated (i.e. whenever
> there's a context switch), to prevent one userspace process from
> poisoning the RSB in a way that affects another process.  The
> relevance of this for KVM is explained below in caveat 2.
>
> The hardware feature is always-on, and the host context uses the full
> default RSB size without any software changes.  The presence of this
> feature allows software (in both host and guest contexts) to drop all
> RSB filling routines in favour of the hardware doing it.
>
> For guests to observe and use this feature, the hypervisor needs to
> expose the CPUID bit and also set a VMCB bit.  Without one or both of
> those, guests continue to use the older default RSB size and
> behaviour for backwards compatibility.  This means the hardware RSB
> size is limited to 32 entries for guests that do not have this
> feature exposed to them.
>
> There are two guest/host configurations that need to be addressed
> before allowing a guest to use this feature: nested guests, and hosts
> using shadow paging (i.e. when NPT is disabled):
>
> 1. Nested guests: the ERAPS feature adds host/guest tagging to
>    entries in the RSB, but does not distinguish between guest ASIDs.
>    To prevent an L2 guest from poisoning the RSB to attack the L1
>    guest, the CPU exposes a new VMCB bit (FLUSH_RAP_ON_VMRUN).  The
>    next VMRUN with a VMCB that has this bit set causes the CPU to
>    flush the RSB before entering the guest context.  This patch sets
>    the bit in VMCB01 after a nested #VMEXIT, to ensure that the next
>    time the L1 guest runs, its RSB contents aren't polluted by the
>    L2's contents.  Similarly, when an exit from L1 to the hypervisor
>    happens, it sets that bit for VMCB02, so that the L1 guest's RSB
>    contents are not leaked/used in the L2 context.
>
> 2. Hosts that disable NPT: the ERAPS feature also flushes the RSB
>    entries when CR3 is updated.  When shadow paging is in use, CR3
>    updates within the guest do not update the CPU's CR3 register.  In
>    this case, do not expose the ERAPS feature to guests; those guests
>    continue with the existing mitigations to fill the RSB.
>
> This patch to KVM ensures both those caveats are addressed, and sets
> the new ALLOW_LARGER_RAP VMCB bit that exposes the feature to the
> guest.  That allows the new default RSB size to be used in guest
> contexts as well, and allows the guest to drop its RSB flushing
> routines.
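As an aside, the enumeration described above is easy to sanity-check
from userspace.  A minimal sketch (my own illustration, not part of
the patch), using the leaf and bit positions from the commit message:

        /* Query CPUID leaf 0x80000021 for ERAPS and the RSB size. */
        #include <stdio.h>
        #include <cpuid.h>      /* __get_cpuid(), GCC/clang helper */

        int main(void)
        {
                unsigned int eax, ebx, ecx, edx;

                if (!__get_cpuid(0x80000021, &eax, &ebx, &ecx, &edx))
                        return 1;

                /* EAX bit 24 is ERAPS, matching X86_FEATURE_ERAPS (20*32+24) */
                printf("ERAPS: %s\n", (eax & (1u << 24)) ? "yes" : "no");

                /* EBX bits 23:16 enumerate the RSB size (64 on Zen5) */
                printf("RSB size: %u entries\n", (ebx >> 16) & 0xff);
                return 0;
        }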
>
> Signed-off-by: Amit Shah <amit.shah@xxxxxxx>
> ---
>  arch/x86/include/asm/cpufeatures.h |  1 +
>  arch/x86/include/asm/svm.h         |  6 +++++-
>  arch/x86/kvm/cpuid.c               | 10 +++++++++-
>  arch/x86/kvm/svm/svm.c             | 14 ++++++++++++++
>  arch/x86/kvm/svm/svm.h             | 20 ++++++++++++++++++++
>  5 files changed, 49 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
> index 39e61212ac9a..57264d5ab162 100644
> --- a/arch/x86/include/asm/cpufeatures.h
> +++ b/arch/x86/include/asm/cpufeatures.h
> @@ -457,6 +457,7 @@
>  #define X86_FEATURE_AUTOIBRS (20*32+ 8) /* Automatic IBRS */
>  #define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* SMM_CTL MSR is not present */
>
> +#define X86_FEATURE_ERAPS (20*32+24) /* Enhanced Return Address Predictor Security */
>  #define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction Barrier */
>  #define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */
>  #define X86_FEATURE_SRSO_NO (20*32+29) /* CPU is not affected by SRSO */
> diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
> index 9b7fa99ae951..cf6a94e64e58 100644
> --- a/arch/x86/include/asm/svm.h
> +++ b/arch/x86/include/asm/svm.h
> @@ -130,7 +130,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
>          u64 tsc_offset;
>          u32 asid;
>          u8 tlb_ctl;
> -        u8 reserved_2[3];
> +        u8 erap_ctl;
> +        u8 reserved_2[2];
>          u32 int_ctl;
>          u32 int_vector;
>          u32 int_state;
> @@ -176,6 +177,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
>  #define TLB_CONTROL_FLUSH_ASID 3
>  #define TLB_CONTROL_FLUSH_ASID_LOCAL 7
>
> +#define ERAP_CONTROL_ALLOW_LARGER_RAP BIT(0)
> +#define ERAP_CONTROL_FLUSH_RAP BIT(1)
> +
>  #define V_TPR_MASK 0x0f
>
>  #define V_IRQ_SHIFT 8
> diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
> index 571c906ffcbf..0cca1865826e 100644
> --- a/arch/x86/kvm/cpuid.c
> +++ b/arch/x86/kvm/cpuid.c
> @@ -1187,6 +1187,9 @@ void kvm_set_cpu_caps(void)
>                  F(SRSO_USER_KERNEL_NO),
>          );
>
> +        if (tdp_enabled)
> +                kvm_cpu_cap_check_and_set(X86_FEATURE_ERAPS);

Should this be moved to svm_set_cpu_caps()?  And there it can be:

        if (npt_enabled)
                kvm_cpu_cap...

> +
>          kvm_cpu_cap_init(CPUID_8000_0022_EAX,
>                  F(PERFMON_V2),
>          );
> @@ -1756,8 +1759,13 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
>                  entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
>                  break;
>          case 0x80000021:
> -                entry->ebx = entry->ecx = entry->edx = 0;
> +                entry->ecx = entry->edx = 0;
>                  cpuid_entry_override(entry, CPUID_8000_0021_EAX);
> +                if (kvm_cpu_cap_has(X86_FEATURE_ERAPS))
> +                        entry->ebx &= GENMASK(23, 16);
> +                else
> +                        entry->ebx = 0;
> +

Extra blank line.

>                  break;
>          /* AMD Extended Performance Monitoring and Debug */
>          case 0x80000022: {
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index a89c271a1951..a2b075ed4133 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -1363,6 +1363,9 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
>          if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
>                  set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
>
> +        if (boot_cpu_has(X86_FEATURE_ERAPS) && npt_enabled)

Should this be:

        if (kvm_cpu_cap_has(X86_FEATURE_ERAPS))

?
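Putting those two comments together, the result might look like this
(a sketch of the suggestion only; the exact placement inside
svm_set_cpu_caps() is my assumption):

        /*
         * svm.c, svm_set_cpu_caps(): only advertise ERAPS with NPT,
         * since ERAPS relies on hardware CR3 writes to flush the RSB.
         */
        if (npt_enabled)
                kvm_cpu_cap_check_and_set(X86_FEATURE_ERAPS);

        /* init_vmcb() can then test the cap alone, as suggested above: */
        if (kvm_cpu_cap_has(X86_FEATURE_ERAPS))
                vmcb_enable_extended_rap(svm->vmcb);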
> +                vmcb_enable_extended_rap(svm->vmcb);
> +
>          if (kvm_vcpu_apicv_active(vcpu))
>                  avic_init_vmcb(svm, vmcb);
>
> @@ -3482,6 +3485,7 @@ static void dump_vmcb(struct kvm_vcpu *vcpu)
>          pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
>          pr_err("%-20s%d\n", "asid:", control->asid);
>          pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
> +        pr_err("%-20s%d\n", "erap_ctl:", control->erap_ctl);
>          pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
>          pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
>          pr_err("%-20s%08x\n", "int_state:", control->int_state);
> @@ -3663,6 +3667,11 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
>
>                  trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM);
>
> +                if (vmcb_is_extended_rap(svm->vmcb01.ptr)) {
> +                        vmcb_set_flush_guest_rap(svm->vmcb01.ptr);
> +                        vmcb_clr_flush_guest_rap(svm->nested.vmcb02.ptr);
> +                }
> +
>                  vmexit = nested_svm_exit_special(svm);
>
>                  if (vmexit == NESTED_EXIT_CONTINUE)
> @@ -3670,6 +3679,11 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
>
>                  if (vmexit == NESTED_EXIT_DONE)
>                          return 1;
> +        } else {
> +                if (vmcb_is_extended_rap(svm->vmcb01.ptr) && svm->nested.initialized) {
> +                        vmcb_set_flush_guest_rap(svm->nested.vmcb02.ptr);
> +                        vmcb_clr_flush_guest_rap(svm->vmcb01.ptr);
> +                }
>          }
>
>          if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index f16b068c4228..7f44f7c9b1d5 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -493,6 +493,26 @@ static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit)
>          return vmcb_is_intercept(&svm->vmcb->control, bit);
>  }
>
> +static inline void vmcb_set_flush_guest_rap(struct vmcb *vmcb)
> +{
> +        vmcb->control.erap_ctl |= ERAP_CONTROL_FLUSH_RAP;
> +}
> +
> +static inline void vmcb_clr_flush_guest_rap(struct vmcb *vmcb)
> +{
> +        vmcb->control.erap_ctl &= ~ERAP_CONTROL_FLUSH_RAP;
> +}
> +
> +static inline void vmcb_enable_extended_rap(struct vmcb *vmcb)

s/extended/larger/ to match the bit name?

> +{
> +        vmcb->control.erap_ctl |= ERAP_CONTROL_ALLOW_LARGER_RAP;
> +}
> +
> +static inline bool vmcb_is_extended_rap(struct vmcb *vmcb)

s/is_extended/has_larger/

Thanks,
Tom

> +{
> +        return !!(vmcb->control.erap_ctl & ERAP_CONTROL_ALLOW_LARGER_RAP);
> +}
> +
>  static inline bool nested_vgif_enabled(struct vcpu_svm *svm)
>  {
>          return guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_VGIF) &&
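For reference, the two suggested renames would read as follows (a
sketch; the helper bodies are unchanged from the patch, only the
names follow the s/extended/larger/ and s/is_extended/has_larger/
suggestions above):

        /* Allow the guest to use the full ERAPS-sized RSB/RAP. */
        static inline void vmcb_enable_larger_rap(struct vmcb *vmcb)
        {
                vmcb->control.erap_ctl |= ERAP_CONTROL_ALLOW_LARGER_RAP;
        }

        /* True if this VMCB has the larger RAP enabled for the guest. */
        static inline bool vmcb_has_larger_rap(struct vmcb *vmcb)
        {
                return !!(vmcb->control.erap_ctl & ERAP_CONTROL_ALLOW_LARGER_RAP);
        }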