On Sat, 2 Aug 2025 19:47:02 -0700
Chaitanya Kulkarni <kch@xxxxxxxxxx> wrote:

> Add foundational infrastructure for vfio-nvme, enabling support for live
> migration of NVMe devices via the VFIO framework. The following
> components are included:
>
> - Core driver skeleton for vfio-nvme support under drivers/vfio/pci/nvme/
> - Definitions of basic data structures used in live migration
>   (e.g., nvmevf_pci_core_device and nvmevf_migration_file)
> - Implementation of helper routines for managing migration file state
> - Integration of PCI driver callbacks and error handling logic
> - Registration with vfio-pci-core through nvmevf_pci_ops
> - Initial support for VFIO migration states and device open/close flows
>
> Subsequent patches will build upon this base to implement actual live
> migration commands and complete the vfio device state handling logic.
>
> Signed-off-by: Lei Rao <lei.rao@xxxxxxxxx>
> Signed-off-by: Max Gurtovoy <mgurtovoy@xxxxxxxxxx>
> Signed-off-by: Chaitanya Kulkarni <kch@xxxxxxxxxx>
> ---
>  drivers/vfio/pci/Kconfig       |   2 +
>  drivers/vfio/pci/Makefile      |   2 +
>  drivers/vfio/pci/nvme/Kconfig  |  10 ++
>  drivers/vfio/pci/nvme/Makefile |   3 +
>  drivers/vfio/pci/nvme/nvme.c   | 196 +++++++++++++++++++++++++++++++++
>  drivers/vfio/pci/nvme/nvme.h   |  36 ++++++
>  6 files changed, 249 insertions(+)
>  create mode 100644 drivers/vfio/pci/nvme/Kconfig
>  create mode 100644 drivers/vfio/pci/nvme/Makefile
>  create mode 100644 drivers/vfio/pci/nvme/nvme.c
>  create mode 100644 drivers/vfio/pci/nvme/nvme.h
>
> diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
> index 2b0172f54665..8f94429e7adc 100644
> --- a/drivers/vfio/pci/Kconfig
> +++ b/drivers/vfio/pci/Kconfig
> @@ -67,4 +67,6 @@ source "drivers/vfio/pci/nvgrace-gpu/Kconfig"
>
>  source "drivers/vfio/pci/qat/Kconfig"
>
> +source "drivers/vfio/pci/nvme/Kconfig"
> +
>  endmenu
> diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
> index cf00c0a7e55c..be8c4b5ee0ba 100644
> --- a/drivers/vfio/pci/Makefile
> +++ b/drivers/vfio/pci/Makefile
> @@ -10,6 +10,8 @@ obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
>
>  obj-$(CONFIG_MLX5_VFIO_PCI) += mlx5/
>
> +obj-$(CONFIG_NVME_VFIO_PCI) += nvme/
> +
>  obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/
>
>  obj-$(CONFIG_PDS_VFIO_PCI) += pds/
> diff --git a/drivers/vfio/pci/nvme/Kconfig b/drivers/vfio/pci/nvme/Kconfig
> new file mode 100644
> index 000000000000..12e0eaba0de1
> --- /dev/null
> +++ b/drivers/vfio/pci/nvme/Kconfig
> @@ -0,0 +1,10 @@
> +# SPDX-License-Identifier: GPL-2.0-only
> +config NVME_VFIO_PCI
> +        tristate "VFIO support for NVMe PCI devices"
> +        depends on NVME_CORE
> +        depends on VFIO_PCI_CORE
> +        help
> +          This provides migration support for NVMe devices using the
> +          VFIO framework.
> +
> +          If you don't know what to do here, say N.
> diff --git a/drivers/vfio/pci/nvme/Makefile b/drivers/vfio/pci/nvme/Makefile
> new file mode 100644
> index 000000000000..2f4a0ad3d9cf
> --- /dev/null
> +++ b/drivers/vfio/pci/nvme/Makefile
> @@ -0,0 +1,3 @@
> +# SPDX-License-Identifier: GPL-2.0-only
> +obj-$(CONFIG_NVME_VFIO_PCI) += nvme-vfio-pci.o
> +nvme-vfio-pci-y := nvme.o
> diff --git a/drivers/vfio/pci/nvme/nvme.c b/drivers/vfio/pci/nvme/nvme.c
> new file mode 100644
> index 000000000000..08bee3274207
> --- /dev/null
> +++ b/drivers/vfio/pci/nvme/nvme.c
> @@ -0,0 +1,196 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (c) 2022, INTEL CORPORATION. All rights reserved
> + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved
All rights reserved > + */ > + > +#include <linux/device.h> > +#include <linux/eventfd.h> > +#include <linux/file.h> > +#include <linux/interrupt.h> > +#include <linux/module.h> > +#include <linux/mutex.h> > +#include <linux/pci.h> > +#include <linux/types.h> > +#include <linux/vfio.h> > +#include <linux/anon_inodes.h> > +#include <linux/kernel.h> > +#include <linux/vfio_pci_core.h> > + > +#include "nvme.h" > + > +static void nvmevf_disable_fd(struct nvmevf_migration_file *migf) > +{ > + mutex_lock(&migf->lock); > + > + /* release the device states buffer */ > + kvfree(migf->vf_data); > + migf->vf_data = NULL; > + migf->disabled = true; > + migf->total_length = 0; > + migf->filp->f_pos = 0; > + mutex_unlock(&migf->lock); > +} > + > +static void nvmevf_disable_fds(struct nvmevf_pci_core_device *nvmevf_dev) > +{ > + if (nvmevf_dev->resuming_migf) { > + nvmevf_disable_fd(nvmevf_dev->resuming_migf); > + fput(nvmevf_dev->resuming_migf->filp); > + nvmevf_dev->resuming_migf = NULL; > + } > + > + if (nvmevf_dev->saving_migf) { > + nvmevf_disable_fd(nvmevf_dev->saving_migf); > + fput(nvmevf_dev->saving_migf->filp); > + nvmevf_dev->saving_migf = NULL; > + } > +} > + > +static void nvmevf_state_mutex_unlock(struct nvmevf_pci_core_device *nvmevf_dev) > +{ > + lockdep_assert_held(&nvmevf_dev->state_mutex); > +again: > + spin_lock(&nvmevf_dev->reset_lock); > + if (nvmevf_dev->deferred_reset) { > + nvmevf_dev->deferred_reset = false; > + spin_unlock(&nvmevf_dev->reset_lock); > + nvmevf_dev->mig_state = VFIO_DEVICE_STATE_RUNNING; > + nvmevf_disable_fds(nvmevf_dev); > + goto again; > + } > + mutex_unlock(&nvmevf_dev->state_mutex); > + spin_unlock(&nvmevf_dev->reset_lock); > +} > + > +static struct nvmevf_pci_core_device *nvmevf_drvdata(struct pci_dev *pdev) > +{ > + struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); > + > + return container_of(core_device, struct nvmevf_pci_core_device, > + core_device); > +} > + > +static int nvmevf_pci_open_device(struct vfio_device *core_vdev) > +{ > + struct nvmevf_pci_core_device *nvmevf_dev; > + struct vfio_pci_core_device *vdev; > + int ret; > + > + nvmevf_dev = container_of(core_vdev, struct nvmevf_pci_core_device, > + core_device.vdev); > + vdev = &nvmevf_dev->core_device; > + > + ret = vfio_pci_core_enable(vdev); > + if (ret) > + return ret; > + > + if (nvmevf_dev->migrate_cap) > + nvmevf_dev->mig_state = VFIO_DEVICE_STATE_RUNNING; > + vfio_pci_core_finish_enable(vdev); > + return 0; > +} > + > +static void nvmevf_pci_close_device(struct vfio_device *core_vdev) > +{ > + struct nvmevf_pci_core_device *nvmevf_dev; > + > + nvmevf_dev = container_of(core_vdev, struct nvmevf_pci_core_device, > + core_device.vdev); > + > + if (nvmevf_dev->migrate_cap) { > + mutex_lock(&nvmevf_dev->state_mutex); > + nvmevf_disable_fds(nvmevf_dev); > + nvmevf_state_mutex_unlock(nvmevf_dev); > + } > + > + vfio_pci_core_close_device(core_vdev); > +} > + > +static const struct vfio_device_ops nvmevf_pci_ops = { > + .name = "nvme-vfio-pci", > + .release = vfio_pci_core_release_dev, > + .open_device = nvmevf_pci_open_device, > + .close_device = nvmevf_pci_close_device, > + .ioctl = vfio_pci_core_ioctl, > + .device_feature = vfio_pci_core_ioctl_feature, > + .read = vfio_pci_core_read, > + .write = vfio_pci_core_write, > + .mmap = vfio_pci_core_mmap, > + .request = vfio_pci_core_request, > + .match = vfio_pci_core_match, > +}; > + > +static int nvmevf_pci_probe(struct pci_dev *pdev, > + const struct pci_device_id *id) > +{ > + struct nvmevf_pci_core_device *nvmevf_dev; > + 
> +        int ret;
> +
> +        nvmevf_dev = vfio_alloc_device(nvmevf_pci_core_device, core_device.vdev,
> +                                       &pdev->dev, &nvmevf_pci_ops);
> +        if (IS_ERR(nvmevf_dev))
> +                return PTR_ERR(nvmevf_dev);
> +
> +        dev_set_drvdata(&pdev->dev, &nvmevf_dev->core_device);
> +        ret = vfio_pci_core_register_device(&nvmevf_dev->core_device);
> +        if (ret)
> +                goto out_put_dev;
> +
> +        return 0;
> +
> +out_put_dev:
> +        vfio_put_device(&nvmevf_dev->core_device.vdev);
> +        return ret;
> +}
> +
> +static void nvmevf_pci_remove(struct pci_dev *pdev)
> +{
> +        struct nvmevf_pci_core_device *nvmevf_dev = nvmevf_drvdata(pdev);
> +
> +        vfio_pci_core_unregister_device(&nvmevf_dev->core_device);
> +        vfio_put_device(&nvmevf_dev->core_device.vdev);
> +}
> +
> +static void nvmevf_pci_aer_reset_done(struct pci_dev *pdev)
> +{
> +        struct nvmevf_pci_core_device *nvmevf_dev = nvmevf_drvdata(pdev);
> +
> +        if (!nvmevf_dev->migrate_cap)
> +                return;
> +
> +        /*
> +         * As the higher VFIO layers are holding locks across reset and using
> +         * those same locks with the mm_lock we need to prevent ABBA deadlock
> +         * with the state_mutex and mm_lock.
> +         * In case the state_mutex was taken already we defer the cleanup work
> +         * to the unlock flow of the other running context.
> +         */
> +        spin_lock(&nvmevf_dev->reset_lock);
> +        nvmevf_dev->deferred_reset = true;
> +        if (!mutex_trylock(&nvmevf_dev->state_mutex)) {
> +                spin_unlock(&nvmevf_dev->reset_lock);
> +                return;
> +        }
> +        spin_unlock(&nvmevf_dev->reset_lock);
> +        nvmevf_state_mutex_unlock(nvmevf_dev);
> +}
> +
> +static const struct pci_error_handlers nvmevf_err_handlers = {
> +        .reset_done = nvmevf_pci_aer_reset_done,
> +        .error_detected = vfio_pci_core_aer_err_detected,
> +};
> +
> +static struct pci_driver nvmevf_pci_driver = {
> +        .name = KBUILD_MODNAME,
> +        .probe = nvmevf_pci_probe,
> +        .remove = nvmevf_pci_remove,
> +        .err_handler = &nvmevf_err_handlers,
> +        .driver_managed_dma = true,
> +};
> +
> +module_pci_driver(nvmevf_pci_driver);
> +
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("Chaitanya Kulkarni <kch@xxxxxxxxxx>");
> +MODULE_DESCRIPTION("NVMe VFIO PCI - VFIO PCI driver with live migration support for NVMe");

Without a MODULE_DEVICE_TABLE, what devices are ever going to use this
driver? Userspace needs to be given a clue when to use this driver vs
vfio-pci. We also don't have a fallback mechanism to try a driver until
it fails, so this driver likely needs to take over de facto support for
all NVMe devices from vfio-pci, rather than later rejecting those that
don't support migration as patch 4/ implements in the .init callback.
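For instance, matching on the NVMe class code the way the core nvme
driver does might be the starting point. A rough sketch only -- the
table name below is made up, and whether this driver should claim every
NVMe-class device is exactly the question above:

static const struct pci_device_id nvmevf_pci_table[] = {
        /* any NVMe-class device, as drivers/nvme/host/pci.c matches */
        { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
        {}
};
MODULE_DEVICE_TABLE(pci, nvmevf_pci_table);

with .id_table = nvmevf_pci_table wired into nvmevf_pci_driver.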
All rights reserved > + */ > + > +#ifndef NVME_VFIO_PCI_H > +#define NVME_VFIO_PCI_H > + > +#include <linux/kernel.h> > +#include <linux/vfio_pci_core.h> > +#include <linux/nvme.h> > + > +struct nvmevf_migration_file { > + struct file *filp; > + struct mutex lock; > + bool disabled; > + u8 *vf_data; > + size_t total_length; > +}; > + > +struct nvmevf_pci_core_device { > + struct vfio_pci_core_device core_device; > + int vf_id; > + u8 migrate_cap:1; > + u8 deferred_reset:1; > + /* protect migration state */ > + struct mutex state_mutex; > + enum vfio_device_mig_state mig_state; > + /* protect the reset_done flow */ > + spinlock_t reset_lock; > + struct nvmevf_migration_file *resuming_migf; > + struct nvmevf_migration_file *saving_migf; > +}; > + > +#endif /* NVME_VFIO_PCI_H */