/*
 * Fix a NULL VMSA deref bug (which is probably the tip of the iceberg with
 * respect to what all can go wrong) due to a race between KVM_CREATE_VCPU and
 * KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM, where a non-SEV-ES vCPU can be created in
 * an SEV-ES VM.
 *
 * Found by running syzkaller on a bare metal SEV-ES host.  C repro below.
 *
 * Sean Christopherson (2):
 *   KVM: SVM: Reject SEV{-ES} intra host migration if vCPU creation is
 *     in-flight
 *   KVM: SVM: Initialize vmsa_pa in VMCB to INVALID_PAGE if VMSA page is NULL
 *
 *  arch/x86/kvm/svm/sev.c | 12 ++++++++++--
 *  1 file changed, 10 insertions(+), 2 deletions(-)
 *
 * base-commit: 0ff41df1cb268fc69e703a08a57ee14ae967d0ca
 * --
 * 2.49.0.1204.g71687c7c1d-goog
 */

// autogenerated by syzkaller (https://github.com/google/syzkaller)

#define _GNU_SOURCE

#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>

#include <linux/futex.h>
#include <linux/kvm.h>

/* Index of the top-level worker process; only written by main(). */
static unsigned long long procid;

/* Sleep for @ms milliseconds. */
static void sleep_ms(uint64_t ms)
{
	usleep(ms * 1000);
}

/* Return CLOCK_MONOTONIC time in milliseconds; exits the process on failure. */
static uint64_t current_time_ms(void)
{
	struct timespec ts;

	if (clock_gettime(CLOCK_MONOTONIC, &ts))
		exit(1);
	return (uint64_t)ts.tv_sec * 1000 + (uint64_t)ts.tv_nsec / 1000000;
}

/*
 * Start a thread running fn(arg) with a 128 KiB stack.
 *
 * Creation is retried up to 100 times while it fails with EAGAIN; any other
 * failure, or running out of retries, exits the process with status 1.
 */
static void thread_start(void *(*fn)(void *), void *arg)
{
	pthread_t th;
	pthread_attr_t attr;

	pthread_attr_init(&attr);
	pthread_attr_setstacksize(&attr, 128 << 10);
	for (int i = 0; i < 100; i++) {
		/*
		 * pthread_create() reports failure through its return value and
		 * is not required to set errno, so test the return value.
		 */
		int rc = pthread_create(&th, &attr, fn, arg);

		if (rc == 0) {
			pthread_attr_destroy(&attr);
			return;
		}
		if (rc != EAGAIN)
			break;
		usleep(50);
	}
	exit(1);
}

/* One-shot event built on a futex word: 0 = not set, 1 = set. */
typedef struct {
	int state;
} event_t;

/* Put @ev into the "not set" state. */
static void event_init(event_t *ev)
{
	ev->state = 0;
}

/* Clear @ev so it can be set again. */
static void event_reset(event_t *ev)
{
	ev->state = 0;
}

/* Set @ev and wake its waiters; setting an already-set event exits(1). */
static void event_set(event_t *ev)
{
	if (ev->state)
		exit(1);
	__atomic_store_n(&ev->state, 1, __ATOMIC_RELEASE);
	syscall(SYS_futex, &ev->state, FUTEX_WAKE | FUTEX_PRIVATE_FLAG, 1000000);
}

/* Block until @ev is set. */
static void event_wait(event_t *ev)
{
	/*
	 * The timeout argument is a pointer; pass NULL rather than a plain int
	 * 0 so a full pointer-sized value goes through the variadic syscall().
	 */
	while (!__atomic_load_n(&ev->state, __ATOMIC_ACQUIRE))
		syscall(SYS_futex, &ev->state, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 0, NULL);
}

/* Return non-zero if @ev is currently set. */
static int event_isset(event_t *ev)
{
	return __atomic_load_n(&ev->state, __ATOMIC_ACQUIRE);
}

/*
 * Wait up to @timeout milliseconds for @ev to be set.
 *
 * Returns 1 if the event was observed set, 0 if the timeout elapsed first.
 */
static int event_timedwait(event_t *ev, uint64_t timeout)
{
	uint64_t start = current_time_ms();
	uint64_t now = start;

	for (;;) {
		uint64_t remain = timeout - (now - start);
		struct timespec ts;

		ts.tv_sec = remain / 1000;
		ts.tv_nsec = (remain % 1000) * 1000 * 1000;
		syscall(SYS_futex, &ev->state, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 0, &ts);
		if (__atomic_load_n(&ev->state, __ATOMIC_ACQUIRE))
			return 1;
		now = current_time_ms();
		if (now - start > timeout)
			return 0;
	}
}

/* Per-worker-thread state: which call to run plus its ready/done handshake. */
struct thread_t {
	int created, call;
	event_t ready, done;
};

static struct thread_t threads[16];
static void execute_call(int call);
/* Number of calls handed to worker threads that have not finished yet. */
static int running;

/*
 * Worker thread body: wait for a call to be assigned, execute it, then signal
 * completion.  Never returns in practice.
 */
static void *thr(void *arg)
{
	struct thread_t *th = (struct thread_t *)arg;

	for (;;) {
		event_wait(&th->ready);
		event_reset(&th->ready);
		execute_call(th->call);
		__atomic_fetch_sub(&running, 1, __ATOMIC_RELAXED);
		event_set(&th->done);
	}
	return 0;
}

/*
 * Run one pass of the program: hand calls 0..8 to idle worker threads (creating
 * workers lazily), then wait up to ~100 ms for outstanding calls to drain.
 */
static void execute_one(void)
{
	/* The result of write() is deliberately ignored. */
	if (write(1, "executing program\n", sizeof("executing program\n") - 1)) {
	}
	int i, call, thread;

	for (call = 0; call < 9; call++) {
		for (thread = 0; thread < (int)(sizeof(threads) / sizeof(threads[0])); thread++) {
			struct thread_t *th = &threads[thread];

			if (!th->created) {
				th->created = 1;
				event_init(&th->ready);
				event_init(&th->done);
				/* A fresh worker starts out idle, i.e. "done". */
				event_set(&th->done);
				thread_start(thr, th);
			}
			if (!event_isset(&th->done))
				continue;
			event_reset(&th->done);
			th->call = call;
			__atomic_fetch_add(&running, 1, __ATOMIC_RELAXED);
			event_set(&th->ready);
			/*
			 * Calls 2, 5 and 7 are not waited for, so they keep
			 * running while the following calls are issued.
			 */
			if (call == 2 || call == 5 || call == 7)
				break;
			event_timedwait(&th->done, 50);
			break;
		}
	}
	for (i = 0; i < 100 && __atomic_load_n(&running, __ATOMIC_RELAXED); i++)
		sleep_ms(1);
}

/*
 * Kill child @pid and try to reap it, polling for at most ~100 ms so that a
 * child that cannot be reaped does not hang the reproducer.
 */
static void kill_and_reap(int pid)
{
	int status = 0;

	kill(pid, SIGKILL);
	for (int i = 0; i < 100; i++) {
		if (waitpid(pid, &status, WNOHANG | __WALL) == pid)
			return;
		sleep_ms(1);
	}
}

/*
 * Run execute_one() 100 times, each in a freshly forked child, giving every
 * child up to 5 seconds to finish.
 */
static void loop(void)
{
	for (int iter = 0; iter < 100; iter++) {
		int pid = fork();

		if (pid < 0)
			exit(1);
		if (pid == 0) {
			execute_one();
			exit(0);
		}
		int status = 0;
		uint64_t start = current_time_ms();

		for (;;) {
			sleep_ms(10);
			if (waitpid(-1, &status, WNOHANG | __WALL) == pid)
				break;
			if (current_time_ms() - start < 5000)
				continue;
			/* Timed out: do not leave the child running behind us. */
			kill_and_reap(pid);
			break;
		}
	}
}

/*
 * Resources shared between calls: r[0] is the /dev/kvm fd opened in main(),
 * r[1] and r[3] hold the results of the two KVM_CREATE_VM ioctls.
 */
uint64_t r[4] = {0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff};

/*
 * Execute program step @call.  Steps without a case are no-ops.  Arguments are
 * built at fixed addresses inside the region mapped by main().
 */
static void execute_call(int call)
{
	switch (call) {
	case 0:
		r[1] = syscall(__NR_ioctl, /*fd=*/r[0], /*cmd=*/KVM_CREATE_VM, /*type=*/0ul);
		break;
	case 3:
		r[3] = syscall(__NR_ioctl, /*fd=*/r[0], /*cmd=*/KVM_CREATE_VM, /*type=*/0ul);
		break;
	case 5:
		/* Repeatedly try to create vCPU 0 in the VM held in r[3]. */
		syscall(__NR_ioctl, /*fd=*/r[3], /*cmd=*/KVM_CREATE_VCPU, /*id=*/0ul);
		for (int i = 0; i < 32; i++) {
			syscall(__NR_ioctl, /*fd=*/r[3], /*cmd=*/KVM_CREATE_VCPU, /*id=*/0ul);
		}
		break;
	case 6:
		/* Argument block for KVM_MEMORY_ENCRYPT_OP on the VM in r[1]. */
		*(uint64_t *)0x200000000040 = 1;
		*(uint32_t *)0x200000000048 = 8;
		*(uint32_t *)0x20000000004c = 0;
		*(uint64_t *)0x200000000050 = 0x5625e9b0;
		*(uint64_t *)0x200000000058 = 0;
		memset((void *)0x200000000060, 0, 16);
		syscall(__NR_ioctl, /*fd=*/r[1], /*cmd=*/KVM_MEMORY_ENCRYPT_OP, /*arg=*/0x200000000040ul);
		for (int i = 0; i < 32; i++) {
			syscall(__NR_ioctl, /*fd=*/r[1], /*cmd=*/KVM_MEMORY_ENCRYPT_OP, /*arg=*/0x200000000040ul);
		}
		break;
	case 7:
		/*
		 * KVM_ENABLE_CAP(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM) on the VM in
		 * r[3], passing the VM in r[1] as the third 32-bit word.
		 */
		*(uint32_t *)0x200000000080 = KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM;
		*(uint32_t *)0x200000000084 = 0;
		*(uint32_t *)0x200000000088 = r[1];
		syscall(__NR_ioctl, /*fd=*/r[3], /*cmd=*/KVM_ENABLE_CAP, /*arg=*/0x200000000080ul);
		break;
	default:
		/* Remaining call numbers have nothing to execute. */
		break;
	}
}

/*
 * Map the fixed-address argument area (with an inaccessible page on each
 * side), fork 10 worker processes that each open /dev/kvm and run loop(), then
 * sleep so the workers can keep running.
 */
int main(void)
{
	syscall(__NR_mmap, /*addr=*/0x1ffffffff000ul, /*len=*/0x1000ul, /*prot=*/0ul,
		/*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/ 0x32ul, /*fd=*/(intptr_t)-1, /*offset=*/0ul);
	syscall(__NR_mmap, /*addr=*/0x200000000000ul, /*len=*/0x1000000ul,
		/*prot=PROT_WRITE|PROT_READ|PROT_EXEC*/ 7ul,
		/*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/ 0x32ul, /*fd=*/(intptr_t)-1, /*offset=*/0ul);
	syscall(__NR_mmap, /*addr=*/0x200001000000ul, /*len=*/0x1000ul, /*prot=*/0ul,
		/*flags=MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE*/ 0x32ul, /*fd=*/(intptr_t)-1, /*offset=*/0ul);
	for (procid = 0; procid < 10; procid++) {
		if (fork() == 0) {
			r[0] = open("/dev/kvm", O_RDWR);
			loop();
			/*
			 * loop() is bounded and returns; exit here so the worker
			 * does not fall back into this fork loop and spawn
			 * further generations of workers.
			 */
			exit(0);
		}
	}
	sleep(1000000);
	return 0;
}