[RFC PATCH v1 2/5] x86/boot: Move VMXOFF from KVM teardown to CPU shutdown phase

Relocate VMXOFF from the KVM module unload path to the CPU shutdown phase.
This makes it straightforward to properly clean up virtualization state
during system shutdown, CPU hotplug (offline/online cycles), and
suspend-to-disk (S4) transitions.

Because INIT signals are blocked while a CPU is in VMX operation, VMXOFF
must run just before a CPU shuts down so that the CPU can later be brought
back online via INIT-SIPI.
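
For illustration only, the intended ordering in a CPU offline path is
roughly the following sketch (the surrounding function and its other steps
are illustrative, not actual kernel code):

	static void offline_this_cpu(void)
	{
		/* ... stop accepting work, disable the local APIC ... */

		/*
		 * Leave VMX operation before the CPU halts: while the CPU
		 * is post-VMXON, INIT is blocked, so a later INIT-SIPI
		 * sequence to bring this CPU back online would be ignored.
		 */
		cpu_disable_virtualization();

		/* ... final halt, e.g. in native_play_dead() ... */
	}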

As a result, VMX instructions are no longer expected to fault; the VMXOFF
fault handling is retained only as a precaution.
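
Note: the is_vmx_supported() helper used in the new code is not defined in
this patch and is presumably introduced earlier in the series; a minimal
sketch of its assumed semantics (not necessarily the actual implementation)
would be:

	static bool is_vmx_supported(void)
	{
		/*
		 * VMX is usable on this CPU: CPUID reports it and the
		 * kernel has not cleared the feature bit (e.g. because
		 * the BIOS locked VMX off in IA32_FEATURE_CONTROL).
		 */
		return cpu_feature_enabled(X86_FEATURE_VMX);
	}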

Signed-off-by: Xin Li (Intel) <xin@xxxxxxxxx>
---
 arch/x86/include/asm/processor.h |  1 +
 arch/x86/kernel/cpu/common.c     | 37 ++++++++++++++++++++++++++++++++
 arch/x86/kernel/crash.c          |  4 ++++
 arch/x86/kernel/process.c        |  3 +++
 arch/x86/kernel/reboot.c         | 11 ++++++----
 arch/x86/kernel/smp.c            |  5 +++++
 arch/x86/kernel/smpboot.c        |  6 ++++++
 arch/x86/kvm/vmx/vmx.c           | 30 --------------------------
 arch/x86/power/cpu.c             |  3 +++
 9 files changed, 66 insertions(+), 34 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 59660428f46d..0bfd4eb1e9e2 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -231,6 +231,7 @@ void get_cpu_vendor(struct cpuinfo_x86 *c);
 extern void early_cpu_init(void);
 extern void identify_secondary_cpu(unsigned int cpu);
 extern void cpu_enable_virtualization(void);
+extern void cpu_disable_virtualization(void);
 extern void print_cpu_info(struct cpuinfo_x86 *);
 void print_cpu_msr(struct cpuinfo_x86 *);
 
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e36877b5a240..39b9be9a2fb1 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -2002,6 +2002,43 @@ void cpu_enable_virtualization(void)
 	intel_pt_handle_vmx(0);
 }
 
+/*
+ * Because INIT signals are blocked during VMX operation, this function
+ * must be called just before a CPU shuts down to ensure it can be brought
+ * back online later.
+ *
+ * Consequently, VMX instructions are no longer expected to fault.
+ *
+ * Although VMXOFF should not fault, fault handling is retained as a
+ * precaution against any unexpected code paths that might trigger it and
+ * can be removed later if unnecessary.
+ */
+void cpu_disable_virtualization(void)
+{
+	int cpu = raw_smp_processor_id();
+
+	if (!is_vmx_supported())
+		return;
+
+	if (!(cr4_read_shadow() & X86_CR4_VMXE)) {
+		pr_err("VMX not enabled or already disabled on CPU%d\n", cpu);
+		return;
+	}
+
+	asm goto("1: vmxoff\n\t"
+		 _ASM_EXTABLE(1b, %l[fault])
+		 ::: "cc", "memory" : fault);
+
+exit:
+	cr4_clear_bits(X86_CR4_VMXE);
+	intel_pt_handle_vmx(0);
+	return;
+
+fault:
+	pr_err("VMXOFF faulted on CPU%d\n", cpu);
+	goto exit;
+}
+
 /*
  * This does the hard work of actually picking apart the CPU stuff...
  */
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index c6b12bed173d..772c6d350b50 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -111,6 +111,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
 
 	crash_smp_send_stop();
 
+	/* Kept to VMCLEAR any loaded VMCSs */
 	cpu_emergency_disable_virtualization();
 
 	/*
@@ -141,6 +142,9 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
 	x86_platform.guest.enc_kexec_finish();
 
 	crash_save_cpu(regs, smp_processor_id());
+
+	/* Disable virtualization on the last running CPU, usually the BSP */
+	cpu_disable_virtualization();
 }
 
 #if defined(CONFIG_KEXEC_FILE) || defined(CONFIG_CRASH_HOTPLUG)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 1b7960cf6eb0..a0f6397b81ab 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -827,6 +827,9 @@ void __noreturn stop_this_cpu(void *dummy)
 	disable_local_APIC();
 	mcheck_cpu_clear(c);
 
+	/* Disable virtualization; this CPU is usually an AP */
+	cpu_disable_virtualization();
+
 	/*
 	 * Use wbinvd on processors that support SME. This provides support
 	 * for performing a successful kexec when going from SME inactive
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 964f6b0a3d68..7433e634018f 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -764,6 +764,9 @@ void native_machine_shutdown(void)
 
 	if (kexec_in_progress)
 		x86_platform.guest.enc_kexec_finish();
+
+	/* Disable virtualization on the last running CPU, usually the BSP */
+	cpu_disable_virtualization();
 }
 
 static void __machine_emergency_restart(int emergency)
@@ -873,14 +876,14 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
 	if (shootdown_callback)
 		shootdown_callback(cpu, regs);
 
-	/*
-	 * Prepare the CPU for reboot _after_ invoking the callback so that the
-	 * callback can safely use virtualization instructions, e.g. VMCLEAR.
-	 */
+	/* Kept to VMCLEAR any loaded VMCSs */
 	cpu_emergency_disable_virtualization();
 
 	atomic_dec(&waiting_for_crash_ipi);
 
+	/* Disable virtualization; this CPU is usually an AP */
+	cpu_disable_virtualization();
+
 	if (smp_ops.stop_this_cpu) {
 		smp_ops.stop_this_cpu();
 		BUG();
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index b014e6d229f9..eb6a389ba1a9 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -124,7 +124,9 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
 	if (raw_smp_processor_id() == atomic_read(&stopping_cpu))
 		return NMI_HANDLED;
 
+	/* Kept to VMCLEAR any loaded VMCSs */
 	cpu_emergency_disable_virtualization();
+
 	stop_this_cpu(NULL);
 
 	return NMI_HANDLED;
@@ -136,7 +138,10 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
 DEFINE_IDTENTRY_SYSVEC(sysvec_reboot)
 {
 	apic_eoi();
+
+	/* Kept to VMCLEAR any loaded VMCSs */
 	cpu_emergency_disable_virtualization();
+
 	stop_this_cpu(NULL);
 }
 
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 33e166f6ab12..fe3b04f33b3f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1229,6 +1229,12 @@ int native_cpu_disable(void)
          */
 	apic_soft_disable();
 
+	/*
+	 * IPIs have been disabled as mentioned above, so virtualization
+	 * can now be safely shut down.
+	 */
+	cpu_disable_virtualization();
+
 	return 0;
 }
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f6742df0c4ff..26af0a8ae08f 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -674,29 +674,6 @@ static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
 	return ret;
 }
 
-/*
- * Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
- *
- * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
- * atomically track post-VMXON state, e.g. this may be called in NMI context.
- * Eat all faults as all other faults on VMXOFF faults are mode related, i.e.
- * faults are guaranteed to be due to the !post-VMXON check unless the CPU is
- * magically in RM, VM86, compat mode, or at CPL>0.
- */
-static int kvm_cpu_vmxoff(void)
-{
-	asm goto("1: vmxoff\n\t"
-			  _ASM_EXTABLE(1b, %l[fault])
-			  ::: "cc", "memory" : fault);
-
-	cr4_clear_bits(X86_CR4_VMXE);
-	return 0;
-
-fault:
-	cr4_clear_bits(X86_CR4_VMXE);
-	return -EIO;
-}
-
 void vmx_emergency_disable_virtualization_cpu(void)
 {
 	int cpu = raw_smp_processor_id();
@@ -719,8 +696,6 @@ void vmx_emergency_disable_virtualization_cpu(void)
 		if (v->shadow_vmcs)
 			vmcs_clear(v->shadow_vmcs);
 	}
-
-	kvm_cpu_vmxoff();
 }
 
 static void __loaded_vmcs_clear(void *arg)
@@ -2788,12 +2763,7 @@ void vmx_disable_virtualization_cpu(void)
 {
 	vmclear_local_loaded_vmcss();
 
-	if (kvm_cpu_vmxoff())
-		kvm_spurious_fault();
-
 	hv_reset_evmcs();
-
-	intel_pt_handle_vmx(0);
 }
 
 struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 0eec314b79c2..d2c865fdb069 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -129,6 +129,9 @@ static void __save_processor_state(struct saved_context *ctxt)
 	ctxt->misc_enable_saved = !rdmsrq_safe(MSR_IA32_MISC_ENABLE,
 					       &ctxt->misc_enable);
 	msr_save_context(ctxt);
+
+	/* Now that CR4 has been saved, disable VMX and clear CR4.VMXE */
+	cpu_disable_virtualization();
 }
 
 /* Needed by apm.c */
-- 
2.51.0




