[RFC v2 11/14] vfio/nvidia-vgpu: introduce NVIDIA vGPU VFIO variant driver

A VFIO variant driver module extends the capabilities of the existing
VFIO (Virtual Function I/O) framework, offering device management
interfaces to userspace and support for advanced features.

For userspace to use an NVIDIA vGPU, a new vGPU VFIO variant driver is
needed to provide vGPU management, such as selecting and creating a
vGPU instance, and to support advanced features like live migration.

Introduce the NVIDIA vGPU VFIO variant driver to support the vGPU
lifecycle management UABI and future advanced features. The vGPU type
of a virtual function is selected through the creatable_vgpu_types and
current_vgpu_type sysfs attributes before the VFIO device is opened.
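
The intended sysfs flow on a VF is sketched below (paths follow the ABI
documentation added in this patch; the pciXXXX:XX placeholders stand in
for the real device path):

  # list the vGPU types that can still be created on this VF
  cat /sys/devices/pciXXXX:XX/0000:XX:XX.X/nvidia/creatable_vgpu_types

  # select a vGPU type by its numeric ID before opening the VFIO device
  echo <vGPU type ID> > /sys/devices/pciXXXX:XX/0000:XX:XX.X/nvidia/current_vgpu_type

  # writing 0 releases the previously selected vGPU type
  echo 0 > /sys/devices/pciXXXX:XX/0000:XX:XX.X/nvidia/current_vgpu_type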

Cc: Aniket Agashe <aniketa@xxxxxxxxxx>
Cc: Ankit Agrawal <ankita@xxxxxxxxxx>
Signed-off-by: Zhi Wang <zhiw@xxxxxxxxxx>
---
 .../ABI/stable/sysfs-driver-nvidia-vgpu       |  11 +
 drivers/vfio/pci/nvidia-vgpu/Makefile         |   3 +-
 drivers/vfio/pci/nvidia-vgpu/debug.h          |  10 +
 drivers/vfio/pci/nvidia-vgpu/vfio.h           |  49 ++
 drivers/vfio/pci/nvidia-vgpu/vfio_access.c    | 313 ++++++++
 drivers/vfio/pci/nvidia-vgpu/vfio_main.c      | 688 ++++++++++++++++++
 drivers/vfio/pci/nvidia-vgpu/vfio_sysfs.c     | 209 ++++++
 drivers/vfio/pci/nvidia-vgpu/vgpu.c           |  53 +-
 drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.c       |  68 +-
 drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.h       |  29 +
 10 files changed, 1427 insertions(+), 6 deletions(-)
 create mode 100644 Documentation/ABI/stable/sysfs-driver-nvidia-vgpu
 create mode 100644 drivers/vfio/pci/nvidia-vgpu/vfio.h
 create mode 100644 drivers/vfio/pci/nvidia-vgpu/vfio_access.c
 create mode 100644 drivers/vfio/pci/nvidia-vgpu/vfio_main.c
 create mode 100644 drivers/vfio/pci/nvidia-vgpu/vfio_sysfs.c

diff --git a/Documentation/ABI/stable/sysfs-driver-nvidia-vgpu b/Documentation/ABI/stable/sysfs-driver-nvidia-vgpu
new file mode 100644
index 000000000000..1fc3ac8e234d
--- /dev/null
+++ b/Documentation/ABI/stable/sysfs-driver-nvidia-vgpu
@@ -0,0 +1,11 @@
+What:           /sys/devices/pciXXXX:XX/0000:XX:XX.X/nvidia/creatable_vgpu_types
+Date:		June 2, 2025
+KernelVersion:	6.17
+Contact:	kvm@xxxxxxxxxxxxxxx
+Description:	Query the creatable vGPU types on a virtual function.
+
+What:           /sys/devices/pciXXXX:XX/0000:XX:XX.X/nvidia/current_vgpu_type
+Date:		June 2, 2025
+KernelVersion:	6.17
+Contact:	kvm@xxxxxxxxxxxxxxx
+Description:	Set or query the vGPU type of the virtual function.
diff --git a/drivers/vfio/pci/nvidia-vgpu/Makefile b/drivers/vfio/pci/nvidia-vgpu/Makefile
index 91e57c65ca27..2aba9b4868aa 100644
--- a/drivers/vfio/pci/nvidia-vgpu/Makefile
+++ b/drivers/vfio/pci/nvidia-vgpu/Makefile
@@ -2,4 +2,5 @@
 subdir-ccflags-y += -I$(src)/include
 
 obj-$(CONFIG_NVIDIA_VGPU_VFIO_PCI) += nvidia_vgpu_vfio_pci.o
-nvidia_vgpu_vfio_pci-y := vgpu_mgr.o vgpu.o metadata.o metadata_vgpu_type.o rpc.o
+nvidia_vgpu_vfio_pci-y := vgpu_mgr.o vgpu.o metadata.o metadata_vgpu_type.o rpc.o \
+			  vfio_main.o vfio_access.o vfio_sysfs.o
diff --git a/drivers/vfio/pci/nvidia-vgpu/debug.h b/drivers/vfio/pci/nvidia-vgpu/debug.h
index db9288752384..05cb2ea13543 100644
--- a/drivers/vfio/pci/nvidia-vgpu/debug.h
+++ b/drivers/vfio/pci/nvidia-vgpu/debug.h
@@ -22,4 +22,14 @@
 	pci_err(__v->pdev, "nvidia-vgpu %d: "f, __v->info.id, ##a); \
 })
 
+#define nvdev_debug(n, f, a...) ({ \
+	typeof(n) __n = (n); \
+	pci_dbg(__n->core_dev.pdev, "nvidia-vgpu-vfio: "f, ##a); \
+})
+
+#define nvdev_error(n, f, a...) ({ \
+	typeof(n) __n = (n); \
+	pci_err(__n->core_dev.pdev, "nvidia-vgpu-vfio: "f, ##a); \
+})
+
 #endif
diff --git a/drivers/vfio/pci/nvidia-vgpu/vfio.h b/drivers/vfio/pci/nvidia-vgpu/vfio.h
new file mode 100644
index 000000000000..4c9bf9c80f5c
--- /dev/null
+++ b/drivers/vfio/pci/nvidia-vgpu/vfio.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright © 2025 NVIDIA Corporation
+ */
+
+#ifndef _NVIDIA_VGPU_VFIO_H__
+#define _NVIDIA_VGPU_VFIO_H__
+
+#include <linux/vfio_pci_core.h>
+
+#include "vgpu_mgr.h"
+
+#define PCI_CONFIG_SPACE_LENGTH 4096
+
+#define CAP_LIST_NEXT_PTR_MSIX 0x7c
+#define MSIX_CAP_SIZE   0xc
+
+struct nvidia_vgpu_vfio {
+	struct vfio_pci_core_device core_dev;
+	u8 vconfig[PCI_CONFIG_SPACE_LENGTH];
+	void __iomem *bar0_map;
+
+	struct nvidia_vgpu_mgr *vgpu_mgr;
+	struct nvidia_vgpu_type *vgpu_type;
+
+	/* lock to protect vgpu pointer and following members */
+	struct mutex vfio_vgpu_lock;
+	struct nvidia_vgpu *vgpu;
+	bool vdev_is_opened;
+	bool driver_is_unbound;
+	struct pid *task_pid;
+	struct completion vdev_closing_completion;
+
+	struct nvidia_vgpu_event_listener pf_driver_event_listener;
+};
+
+static inline struct nvidia_vgpu_vfio *core_dev_to_nvdev(struct vfio_pci_core_device *core_dev)
+{
+	return container_of(core_dev, struct nvidia_vgpu_vfio, core_dev);
+}
+
+void nvidia_vgpu_vfio_setup_config(struct nvidia_vgpu_vfio *nvdev);
+ssize_t nvidia_vgpu_vfio_access(struct nvidia_vgpu_vfio *nvdev, char *buf, size_t count,
+				loff_t ppos, bool iswrite);
+
+int nvidia_vgpu_vfio_setup_sysfs(struct nvidia_vgpu_vfio *nvdev);
+void nvidia_vgpu_vfio_clean_sysfs(struct nvidia_vgpu_vfio *nvdev);
+
+#endif /* _NVIDIA_VGPU_VFIO_H__ */
diff --git a/drivers/vfio/pci/nvidia-vgpu/vfio_access.c b/drivers/vfio/pci/nvidia-vgpu/vfio_access.c
new file mode 100644
index 000000000000..4a72575264ba
--- /dev/null
+++ b/drivers/vfio/pci/nvidia-vgpu/vfio_access.c
@@ -0,0 +1,313 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright © 2025 NVIDIA Corporation
+ */
+
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/pci_regs.h>
+
+#include "vfio.h"
+
+#define vconfig_set8(offset, v) \
+	(*(u8 *)(nvdev->vconfig + (offset)) = v)
+
+#define vconfig_set16(offset, v) \
+	(*(u16 *)(nvdev->vconfig + (offset)) = v)
+
+#define vconfig_set32(offset, v) \
+	(*(u32 *)(nvdev->vconfig + (offset)) = v)
+
+void nvidia_vgpu_vfio_setup_config(struct nvidia_vgpu_vfio *nvdev)
+{
+	struct nvidia_vgpu_type *vgpu_type;
+	u8 val8;
+
+	lockdep_assert_held(&nvdev->vfio_vgpu_lock);
+
+	if (WARN_ON(!nvdev->vgpu_type))
+		return;
+
+	vgpu_type = nvdev->vgpu_type;
+
+	memset(nvdev->vconfig, 0, sizeof(nvdev->vconfig));
+
+	/* Header type 0 (normal devices) */
+	vconfig_set16(PCI_VENDOR_ID, PCI_VENDOR_ID_NVIDIA);
+	vconfig_set16(PCI_DEVICE_ID, FIELD_GET(GENMASK(31, 16), vgpu_type->vdev_id));
+	vconfig_set16(PCI_COMMAND, 0x0000);
+	vconfig_set16(PCI_STATUS, 0x0010);
+
+	pci_read_config_byte(nvdev->core_dev.pdev, PCI_CLASS_REVISION, &val8);
+	vconfig_set8(PCI_CLASS_REVISION, val8);
+
+	vconfig_set8(PCI_CLASS_PROG, 0); /* VGA-compatible */
+	vconfig_set8(PCI_CLASS_DEVICE, 0); /* VGA controller */
+	vconfig_set8(PCI_CLASS_DEVICE + 1, 3); /* Display controller */
+
+	/* BAR0: 32-bit */
+	vconfig_set32(PCI_BASE_ADDRESS_0, 0x00000000);
+	/* BAR1: 64-bit, prefetchable */
+	vconfig_set32(PCI_BASE_ADDRESS_1, 0x0000000c);
+	/* BAR2: 64-bit, prefetchable */
+	vconfig_set32(PCI_BASE_ADDRESS_3, 0x0000000c);
+	/* Disable BAR3: I/O */
+	vconfig_set32(PCI_BASE_ADDRESS_5, 0x00000000);
+
+	vconfig_set16(PCI_SUBSYSTEM_VENDOR_ID, PCI_VENDOR_ID_NVIDIA);
+	vconfig_set16(PCI_SUBSYSTEM_ID, FIELD_GET(GENMASK(15, 0),
+		      nvdev->vgpu->info.vgpu_type->vdev_id));
+
+	vconfig_set8(PCI_CAPABILITY_LIST, CAP_LIST_NEXT_PTR_MSIX);
+	vconfig_set8(CAP_LIST_NEXT_PTR_MSIX + 1, 0);
+
+	/* INTx disabled */
+	vconfig_set8(0x3d, 0);
+}
+
+#define PCI_CONFIG_READ(pdev, off, buf, size) \
+	do { \
+		switch (size) { \
+		case 4: pci_read_config_dword((pdev), (off), (u32 *)(buf)); break; \
+		case 2: pci_read_config_word((pdev), (off), (u16 *)(buf)); break; \
+		case 1: pci_read_config_byte((pdev), (off), (u8 *)(buf));  break; \
+		} \
+	} while (0)
+
+#define PCI_CONFIG_WRITE(pdev, off, buf, size) \
+	do { \
+		switch (size) { \
+		case 4: pci_write_config_dword((pdev), (off), *(u32 *)(buf)); break; \
+		case 2: pci_write_config_word((pdev), (off), *(u16 *)(buf)); break; \
+		case 1: pci_write_config_byte((pdev), (off), *(u8 *)(buf));  break; \
+		} \
+	} while (0)
+
+#define MMIO_READ(map, off, buf, size) \
+	do { \
+		switch (size) { \
+		case 4: { u32 val = ioread32((map) + (off)); memcpy((buf), &val, 4); break; } \
+		case 2: { u16 val = ioread16((map) + (off)); memcpy((buf), &val, 2); break; } \
+		case 1: { u8  val = ioread8((map) + (off)); memcpy((buf), &val, 1); break; } \
+		} \
+	} while (0)
+
+#define MMIO_WRITE(map, off, buf, size) \
+	do { \
+		switch (size) { \
+		case 4: iowrite32(*(u32 *)(buf), (map) + (off)); break; \
+		case 2: iowrite16(*(u16 *)(buf), (map) + (off)); break; \
+		case 1: iowrite8 (*(u8  *)(buf), (map) + (off)); break; \
+		} \
+	} while (0)
+
+static ssize_t bar0_rw(struct nvidia_vgpu_vfio *nvdev, char *buf, size_t count, loff_t ppos,
+		       bool iswrite)
+{
+	struct pci_dev *pdev = nvdev->core_dev.pdev;
+	int index = VFIO_PCI_OFFSET_TO_INDEX(ppos);
+	loff_t offset = ppos;
+	void __iomem *map;
+	int ret;
+
+	if (WARN_ON(index != VFIO_PCI_BAR0_REGION_INDEX))
+		return -EINVAL;
+
+	offset &= VFIO_PCI_OFFSET_MASK;
+
+	if (!nvdev->bar0_map) {
+		ret = pci_request_selected_regions(pdev, 1 << index, "nvidia-vgpu-vfio");
+		if (ret)
+			return ret;
+
+		if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM)) {
+			pci_release_selected_regions(pdev, 1 << index);
+			return -EIO;
+		}
+
+		map = ioremap(pci_resource_start(pdev, index), pci_resource_len(pdev, index));
+		if (!map) {
+			pci_err(pdev, "Can't map BAR0 MMIO space\n");
+			pci_release_selected_regions(pdev, 1 << index);
+			return -ENOMEM;
+		}
+		nvdev->bar0_map = map;
+	} else {
+		map = nvdev->bar0_map;
+	}
+
+	if (iswrite)
+		MMIO_WRITE(map, offset, buf, count);
+	else
+		MMIO_READ(map, offset, buf, count);
+
+	return count;
+}
+
+/* Generate mask for 32-bit or 64-bit PCI BAR address range */
+#define GEN_BARMASK(size)        ((u32)((~(size) + 1) & ~0xFUL))
+#define GEN_BARMASK_HI(size)     ((u32)(((~(size) + 1) & ~0xFULL) >> 32))
+
+static u32 emulate_pci_base_reg_write(struct nvidia_vgpu_vfio *nvdev, loff_t offset, u32 cfg_addr)
+{
+	struct pci_dev *pdev = nvdev->core_dev.pdev;
+	struct nvidia_vgpu_type *vgpu_type = nvdev->vgpu->info.vgpu_type;
+	u32 bar_mask;
+
+	switch (offset) {
+	case PCI_BASE_ADDRESS_0:
+		bar_mask = GEN_BARMASK(pci_resource_len(pdev, VFIO_PCI_BAR0_REGION_INDEX));
+		cfg_addr = (cfg_addr & bar_mask) | (nvdev->vconfig[offset] & 0xFUL);
+		break;
+
+	case PCI_BASE_ADDRESS_1:
+		bar_mask = GEN_BARMASK(vgpu_type->bar1_length * SZ_1M);
+		cfg_addr = (cfg_addr & bar_mask) | (nvdev->vconfig[offset] & 0xFUL);
+		break;
+
+	case PCI_BASE_ADDRESS_2:
+		bar_mask = GEN_BARMASK_HI(vgpu_type->bar1_length * SZ_1M);
+		cfg_addr &= bar_mask;
+		break;
+
+	case PCI_BASE_ADDRESS_3:
+		bar_mask = GEN_BARMASK(pci_resource_len(pdev, VFIO_PCI_BAR3_REGION_INDEX));
+		cfg_addr = (cfg_addr & bar_mask) | (nvdev->vconfig[offset] & 0xFUL);
+		break;
+
+	case PCI_BASE_ADDRESS_4:
+		bar_mask = GEN_BARMASK_HI(pci_resource_len(pdev, VFIO_PCI_BAR3_REGION_INDEX));
+		cfg_addr &= bar_mask;
+		break;
+
+	default:
+		WARN_ONCE(1, "Unsupported PCI BAR offset: %llx\n", offset);
+		return 0;
+	}
+
+	return cfg_addr;
+}
+
+static void handle_pci_config_read(struct nvidia_vgpu_vfio *nvdev, char *buf,
+				   size_t count, loff_t offset)
+{
+	struct pci_dev *pdev = nvdev->core_dev.pdev;
+	u32 val = 0;
+
+	memcpy(buf, (u8 *)&nvdev->vconfig[offset], count);
+
+	switch (offset) {
+	case PCI_COMMAND:
+		PCI_CONFIG_READ(pdev, offset, (char *)&val, count);
+
+		switch (count) {
+		case 4:
+			val = (u32)(val & 0xFFFF0000) | (val &
+					(PCI_COMMAND_PARITY | PCI_COMMAND_SERR));
+			break;
+		case 2:
+			val = (val & (PCI_COMMAND_PARITY | PCI_COMMAND_SERR));
+			break;
+		default:
+			WARN_ONCE(1, "Not supported access len\n");
+			break;
+		}
+		break;
+	case PCI_STATUS:
+		PCI_CONFIG_READ(pdev, offset, (char *)&val, count);
+		break;
+	default:
+		break;
+	}
+	*(u32 *)buf = *(u32 *)buf | val;
+}
+
+static void handle_pci_config_write(struct nvidia_vgpu_vfio *nvdev, char *buf,
+				    size_t count, loff_t offset)
+{
+	struct pci_dev *pdev = nvdev->core_dev.pdev;
+	u32 val = 0;
+	u32 cfg_addr;
+
+	switch (offset) {
+	case PCI_VENDOR_ID:
+	case PCI_DEVICE_ID:
+	case PCI_CAPABILITY_LIST:
+		break;
+
+	case PCI_STATUS:
+		PCI_CONFIG_WRITE(pdev, offset, buf, count);
+		break;
+	case PCI_COMMAND:
+		if (count == 4) {
+			val = (u32)((*(u32 *)buf & 0xFFFF0000) >> 16);
+			PCI_CONFIG_WRITE(pdev, PCI_STATUS, (char *)&val, 2);
+
+			val = (u32)(*(u32 *)buf & 0x0000FFFF);
+			*(u32 *)buf = val;
+		}
+
+		memcpy((u8 *)&nvdev->vconfig[offset], buf, count);
+		break;
+	case PCI_BASE_ADDRESS_0:
+	case PCI_BASE_ADDRESS_1:
+	case PCI_BASE_ADDRESS_2:
+	case PCI_BASE_ADDRESS_3:
+	case PCI_BASE_ADDRESS_4:
+		cfg_addr = *(u32 *)buf;
+		cfg_addr = emulate_pci_base_reg_write(nvdev, offset, cfg_addr);
+		*(u32 *)&nvdev->vconfig[offset] = cfg_addr;
+		break;
+	default:
+		break;
+	}
+}
+
+static ssize_t pci_config_rw(struct nvidia_vgpu_vfio *nvdev, char *buf, size_t count,
+			     loff_t ppos, bool iswrite)
+{
+	struct pci_dev *pdev = nvdev->core_dev.pdev;
+	int index = VFIO_PCI_OFFSET_TO_INDEX(ppos);
+	loff_t offset = ppos;
+
+	if (WARN_ON(index != VFIO_PCI_CONFIG_REGION_INDEX))
+		return -EINVAL;
+
+	offset &= VFIO_PCI_OFFSET_MASK;
+
+	if (offset >= CAP_LIST_NEXT_PTR_MSIX &&
+	    offset < CAP_LIST_NEXT_PTR_MSIX + MSIX_CAP_SIZE) {
+		if (!iswrite)
+			PCI_CONFIG_READ(pdev, offset, buf, count);
+		else
+			PCI_CONFIG_WRITE(pdev, offset, buf, count);
+		return count;
+	}
+
+	if (!iswrite)
+		handle_pci_config_read(nvdev, buf, count, offset);
+	else
+		handle_pci_config_write(nvdev, buf, count, offset);
+
+	return count;
+}
+
+ssize_t nvidia_vgpu_vfio_access(struct nvidia_vgpu_vfio *nvdev, char *buf,
+				size_t count, loff_t ppos, bool iswrite)
+{
+	int index = VFIO_PCI_OFFSET_TO_INDEX(ppos);
+
+	if (index >= VFIO_PCI_NUM_REGIONS)
+		return -EINVAL;
+
+	switch (index) {
+	case VFIO_PCI_CONFIG_REGION_INDEX:
+		return pci_config_rw(nvdev, buf, count, ppos,
+				     iswrite);
+	case VFIO_PCI_BAR0_REGION_INDEX:
+		return bar0_rw(nvdev, buf, count, ppos, iswrite);
+	default:
+		return -EINVAL;
+	}
+	return count;
+}
diff --git a/drivers/vfio/pci/nvidia-vgpu/vfio_main.c b/drivers/vfio/pci/nvidia-vgpu/vfio_main.c
new file mode 100644
index 000000000000..b557062a4ac2
--- /dev/null
+++ b/drivers/vfio/pci/nvidia-vgpu/vfio_main.c
@@ -0,0 +1,688 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright © 2025 NVIDIA Corporation
+ */
+
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/vfio_pci_core.h>
+#include <linux/types.h>
+
+#include "debug.h"
+#include "vfio.h"
+
+static inline struct vfio_pci_core_device *vdev_to_core_dev(struct vfio_device *vdev)
+{
+	return container_of(vdev, struct vfio_pci_core_device, vdev);
+}
+
+static int pdev_to_gfid(struct pci_dev *pdev)
+{
+	return pci_iov_vf_id(pdev) + 1;
+}
+
+static int destroy_vgpu(struct nvidia_vgpu_vfio *nvdev)
+{
+	int ret;
+
+	ret = nvidia_vgpu_mgr_destroy_vgpu(nvdev->vgpu);
+	if (ret)
+		return ret;
+
+	kfree(nvdev->vgpu);
+	nvdev->vgpu = NULL;
+	return 0;
+}
+
+static int create_vgpu(struct nvidia_vgpu_vfio *nvdev)
+{
+	struct nvidia_vgpu_mgr *vgpu_mgr = nvdev->vgpu_mgr;
+	struct pci_dev *pdev = nvdev->core_dev.pdev;
+	struct nvidia_vgpu_type *type = nvdev->vgpu_type;
+	struct nvidia_vgpu *vgpu;
+	int ret;
+
+	if (WARN_ON(!type || !nvdev->task_pid))
+		return -ENODEV;
+
+	vgpu = kzalloc(sizeof(*vgpu), GFP_KERNEL);
+	if (!vgpu)
+		return -ENOMEM;
+
+	vgpu->info.id = pci_iov_vf_id(pdev);
+	vgpu->info.dbdf = (0 << 16) | pci_dev_id(pdev);
+	vgpu->info.gfid = pdev_to_gfid(pdev);
+	vgpu->info.vgpu_type = type;
+	vgpu->info.vm_pid = pid_nr(nvdev->task_pid);
+
+	vgpu->vgpu_mgr = vgpu_mgr;
+	vgpu->pdev = pdev;
+
+	ret = nvidia_vgpu_mgr_create_vgpu(vgpu);
+	if (ret) {
+		kfree(vgpu);
+		return ret;
+	}
+
+	nvdev->vgpu = vgpu;
+	return 0;
+}
+
+static inline bool pdev_is_present(struct pci_dev *pdev)
+{
+	struct pci_dev *physfn = (pdev->is_virtfn) ? pdev->physfn : pdev;
+
+	if (pdev->is_virtfn)
+		return (pci_device_is_present(physfn) &&
+				pdev->error_state != pci_channel_io_perm_failure);
+	else
+		return pci_device_is_present(physfn);
+}
+
+/* Wait up to 1000 ms for HW that returns CRS completion status */
+#define MIN_FLR_WAIT_TIME 100
+#define MAX_FLR_WAIT_TIME 1000
+
+static int do_vf_flr(struct vfio_device *vdev)
+{
+	struct vfio_pci_core_device *core_dev = vdev_to_core_dev(vdev);
+	struct nvidia_vgpu_vfio *nvdev = core_dev_to_nvdev(core_dev);
+	struct pci_dev *pdev = core_dev->pdev;
+	u32 data, elapsed_time = 0;
+
+	if (!pdev->is_virtfn)
+		return 0;
+
+	if (!pdev_is_present(pdev))
+		return -ENOTTY;
+
+	pcie_capability_read_dword(pdev, PCI_EXP_DEVCAP, &data);
+	if (!(data & PCI_EXP_DEVCAP_FLR)) {
+		nvdev_error(nvdev, "FLR capability not present on the VF.\n");
+		return -EINVAL;
+	}
+
+	device_lock(&pdev->dev);
+	pci_set_power_state(pdev, PCI_D0);
+	pci_save_state(pdev);
+
+	if (!pci_wait_for_pending_transaction(pdev))
+		nvdev_error(nvdev, "Timed out waiting for transaction pending to go to 0.\n");
+
+	pcie_capability_set_word(pdev, PCI_EXP_DEVCTL, PCI_EXP_DEVCTL_BCR_FLR);
+
+	/*
+	 * If CRS-SV is supported and enabled, then the root-port returns '0001h'
+ * for a PCI config read of the 16-bit vendor_id field. This indicates CRS
+	 * completion status.
+	 *
+	 * If CRS-SV is not supported/enabled, then the root-port will generally
+	 * synthesise ~0 data for any PCI config read.
+	 */
+	do {
+		msleep(MIN_FLR_WAIT_TIME);
+		elapsed_time += MIN_FLR_WAIT_TIME;
+
+		pci_read_config_dword(pdev, PCI_VENDOR_ID, &data);
+	} while (((data & 0xffff) == 0x0001) && (elapsed_time < MAX_FLR_WAIT_TIME));
+
+	if (elapsed_time < MAX_FLR_WAIT_TIME) {
+		/*
+		 * Device is back from the CRS-SV, continue checking
+		 * if device is ready by reading PCI_COMMAND.
+		 */
+		do {
+			pci_read_config_dword(pdev, PCI_COMMAND, &data);
+			if (data != ~0)
+				goto flr_done;
+
+			msleep(MIN_FLR_WAIT_TIME);
+			elapsed_time += MIN_FLR_WAIT_TIME;
+		} while (elapsed_time < MAX_FLR_WAIT_TIME);
+
+		nvdev_error(nvdev, "FLR failed non-CRS case, waited for %d ms\n", elapsed_time);
+	} else {
+		nvdev_error(nvdev, "FLR failed CRS case, waited for %d ms\n", elapsed_time);
+	}
+
+	/* Device is not usable. */
+	xchg(&pdev->error_state, pci_channel_io_perm_failure);
+	device_unlock(&pdev->dev);
+	return -ENOTTY;
+
+flr_done:
+	pci_restore_state(pdev);
+	device_unlock(&pdev->dev);
+
+	return 0;
+}
+
+static int nvidia_vgpu_vfio_open_device(struct vfio_device *vdev)
+{
+	struct vfio_pci_core_device *core_dev = vdev_to_core_dev(vdev);
+	struct nvidia_vgpu_vfio *nvdev = core_dev_to_nvdev(core_dev);
+	struct pci_dev *pdev = core_dev->pdev;
+	u64 pf_dma_mask;
+	int ret;
+
+	nvdev_debug(nvdev, "open device\n");
+
+	mutex_lock(&nvdev->vfio_vgpu_lock);
+	if (!nvdev->vgpu_type) {
+		nvdev_error(nvdev, "a vGPU type must be chosen before opening the VFIO device\n");
+		ret = -ENODEV;
+		goto err_unlock;
+	}
+
+	if (nvdev->driver_is_unbound) {
+		nvdev_error(nvdev, "the driver has been torn down because the PF driver is unbound "
+				   "or the admin is disabling the VF\n");
+		ret = -ENODEV;
+		goto err_unlock;
+	}
+
+	if (nvdev->vdev_is_opened) {
+		ret = -EBUSY;
+		goto err_unlock;
+	}
+
+	ret = pci_enable_device(pdev);
+	if (ret)
+		goto err_unlock;
+
+	pci_set_master(pdev);
+
+	pf_dma_mask = dma_get_mask(&pdev->physfn->dev);
+	dma_set_mask(&pdev->dev, pf_dma_mask);
+	dma_set_coherent_mask(&pdev->dev, pf_dma_mask);
+
+	ret = do_vf_flr(vdev);
+	if (ret)
+		goto err_reset_function;
+
+	nvdev->task_pid = get_task_pid(current, PIDTYPE_PID);
+
+	ret = create_vgpu(nvdev);
+	if (ret)
+		goto err_create_vgpu;
+
+	ret = nvidia_vgpu_mgr_set_bme(nvdev->vgpu, true);
+	if (ret)
+		goto err_enable_bme;
+
+	nvidia_vgpu_vfio_setup_config(nvdev);
+
+	nvdev->vdev_is_opened = true;
+	reinit_completion(&nvdev->vdev_closing_completion);
+
+	nvdev_debug(nvdev, "VFIO device is opened, client pid: %u\n", pid_nr(nvdev->task_pid));
+
+	mutex_unlock(&nvdev->vfio_vgpu_lock);
+	return 0;
+
+err_enable_bme:
+	destroy_vgpu(nvdev);
+err_create_vgpu:
+	put_pid(nvdev->task_pid);
+err_reset_function:
+	pci_clear_master(pdev);
+	pci_disable_device(pdev);
+err_unlock:
+	mutex_unlock(&nvdev->vfio_vgpu_lock);
+	return ret;
+}
+
+static void nvidia_vgpu_vfio_close_device(struct vfio_device *vdev)
+{
+	struct vfio_pci_core_device *core_dev = vdev_to_core_dev(vdev);
+	struct nvidia_vgpu_vfio *nvdev = core_dev_to_nvdev(core_dev);
+	struct pci_dev *pdev = core_dev->pdev;
+
+	nvdev_debug(nvdev, "VFIO device is closing, client pid: %u\n", pid_nr(nvdev->task_pid));
+
+	mutex_lock(&nvdev->vfio_vgpu_lock);
+
+	if (nvdev->bar0_map) {
+		iounmap(nvdev->bar0_map);
+		pci_release_selected_regions(pdev, 1 << 0);
+		nvdev->bar0_map = NULL;
+	}
+
+	destroy_vgpu(nvdev);
+
+	put_pid(nvdev->task_pid);
+	nvdev->task_pid = NULL;
+
+	pci_clear_master(pdev);
+	pci_disable_device(pdev);
+
+	nvdev->vdev_is_opened = false;
+	complete(&nvdev->vdev_closing_completion);
+
+	mutex_unlock(&nvdev->vfio_vgpu_lock);
+
+	nvdev_debug(nvdev, "VFIO device is closed\n");
+}
+
+static int get_region_info(struct vfio_pci_core_device *core_dev, unsigned long arg)
+{
+	struct nvidia_vgpu_vfio *nvdev = core_dev_to_nvdev(core_dev);
+	struct pci_dev *pdev = core_dev->pdev;
+	struct vfio_region_info info;
+	unsigned long minsz;
+	int ret = 0;
+
+	minsz = offsetofend(struct vfio_region_info, offset);
+	if (copy_from_user(&info, (void __user *)arg, minsz))
+		return -EINVAL;
+
+	if (info.argsz < minsz)
+		return -EINVAL;
+
+	switch (info.index) {
+	case VFIO_PCI_CONFIG_REGION_INDEX:
+		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+		info.size = PCI_CONFIG_SPACE_LENGTH;
+		info.flags = VFIO_REGION_INFO_FLAG_READ |
+			VFIO_REGION_INFO_FLAG_WRITE;
+		break;
+	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR4_REGION_INDEX:
+		info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+		info.size = pci_resource_len(pdev, info.index);
+
+		if (info.index == VFIO_PCI_BAR1_REGION_INDEX)
+			info.size = nvdev->vgpu->info.vgpu_type->bar1_length * SZ_1M;
+
+		if (!info.size) {
+			info.flags = 0;
+			break;
+		}
+		info.flags = VFIO_REGION_INFO_FLAG_READ |
+			VFIO_REGION_INFO_FLAG_WRITE |
+			VFIO_REGION_INFO_FLAG_MMAP;
+		break;
+	case VFIO_PCI_BAR5_REGION_INDEX:
+	case VFIO_PCI_ROM_REGION_INDEX:
+	case VFIO_PCI_VGA_REGION_INDEX:
+		info.size = 0;
+		break;
+	default:
+		if (info.index >= VFIO_PCI_NUM_REGIONS)
+			ret = -EINVAL;
+		break;
+	}
+
+	if (!ret)
+		ret = copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0;
+
+	return ret;
+}
+
+static long nvidia_vgpu_vfio_ioctl(struct vfio_device *vdev, unsigned int cmd, unsigned long arg)
+{
+	struct vfio_pci_core_device *core_dev = vdev_to_core_dev(vdev);
+	struct nvidia_vgpu_vfio *nvdev = core_dev_to_nvdev(core_dev);
+	int ret = 0;
+
+	if (WARN_ON(!nvdev->vgpu || !nvdev->vdev_is_opened))
+		return -ENODEV;
+
+	switch (cmd) {
+	case VFIO_DEVICE_GET_REGION_INFO:
+		ret = get_region_info(core_dev, arg);
+		break;
+	case VFIO_DEVICE_GET_PCI_HOT_RESET_INFO:
+	case VFIO_DEVICE_PCI_HOT_RESET:
+		break;
+	case VFIO_DEVICE_RESET:
+		ret = nvidia_vgpu_mgr_reset_vgpu(nvdev->vgpu);
+		break;
+	default:
+		ret = vfio_pci_core_ioctl(vdev, cmd, arg);
+		break;
+	}
+	return ret;
+}
+
+static ssize_t nvidia_vgpu_vfio_read(struct vfio_device *vdev, char __user *buf, size_t count,
+				     loff_t *ppos)
+{
+	struct vfio_pci_core_device *core_dev = vdev_to_core_dev(vdev);
+	struct nvidia_vgpu_vfio *nvdev = core_dev_to_nvdev(core_dev);
+	u64 val;
+	size_t done = 0;
+	int ret = 0, size;
+
+	if (WARN_ON(!nvdev->vgpu || !nvdev->vdev_is_opened))
+		return -ENODEV;
+
+	while (count) {
+		if (count >= 4 && !(*ppos % 4))
+			size = 4;
+		else if (count >= 2 && !(*ppos % 2))
+			size = 2;
+		else
+			size = 1;
+
+		ret = nvidia_vgpu_vfio_access(nvdev, (char *)&val, size, *ppos, false);
+
+		if (ret <= 0)
+			return ret;
+
+		if (copy_to_user(buf, &val, size) != 0)
+			return -EFAULT;
+
+		*ppos += size;
+		buf += size;
+		count -= size;
+		done += size;
+	}
+	return done;
+}
+
+static ssize_t nvidia_vgpu_vfio_write(struct vfio_device *vdev,
+				      const char __user *buf, size_t count,
+				      loff_t *ppos)
+{
+	struct vfio_pci_core_device *core_dev = vdev_to_core_dev(vdev);
+	struct nvidia_vgpu_vfio *nvdev = core_dev_to_nvdev(core_dev);
+	u64 val;
+	size_t done = 0;
+	int ret = 0, size;
+
+	if (WARN_ON(!nvdev->vgpu || !nvdev->vdev_is_opened))
+		return -ENODEV;
+
+	while (count) {
+		if (count >= 4 && !(*ppos % 4))
+			size = 4;
+		else if (count >= 2 && !(*ppos % 2))
+			size = 2;
+		else
+			size = 1;
+
+		if (copy_from_user(&val, buf, size) != 0)
+			return -EFAULT;
+
+		ret = nvidia_vgpu_vfio_access(nvdev, (char *)&val, size, *ppos, true);
+
+		if (ret <= 0)
+			return ret;
+
+		*ppos += size;
+		buf += size;
+		count -= size;
+		done += size;
+	}
+	return done;
+}
+
+static int nvidia_vgpu_vfio_mmap(struct vfio_device *vdev,
+				 struct vm_area_struct *vma)
+{
+	struct vfio_pci_core_device *core_dev = vdev_to_core_dev(vdev);
+	struct nvidia_vgpu_vfio *nvdev = core_dev_to_nvdev(core_dev);
+	struct pci_dev *pdev = core_dev->pdev;
+	u64 phys_len, req_len, pgoff, req_start;
+	unsigned int index;
+
+	if (WARN_ON(!nvdev->vgpu || !nvdev->vdev_is_opened))
+		return -ENODEV;
+
+	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
+
+	if (index >= VFIO_PCI_BAR5_REGION_INDEX)
+		return -EINVAL;
+	if (vma->vm_end < vma->vm_start)
+		return -EINVAL;
+	if ((vma->vm_flags & VM_SHARED) == 0)
+		return -EINVAL;
+
+	phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
+	req_len = vma->vm_end - vma->vm_start;
+	pgoff = vma->vm_pgoff &
+		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+	req_start = pgoff << PAGE_SHIFT;
+
+	if (req_len == 0)
+		return -EINVAL;
+
+	if ((req_start + req_len > phys_len) || phys_len == 0)
+		return -EINVAL;
+
+	vma->vm_private_data = vdev;
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
+	vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
+
+	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, req_len, vma->vm_page_prot);
+}
+
+static const struct vfio_device_ops nvidia_vgpu_vfio_ops = {
+	.name           = "nvidia-vgpu-vfio-pci",
+	.init		= vfio_pci_core_init_dev,
+	.release	= vfio_pci_core_release_dev,
+	.open_device    = nvidia_vgpu_vfio_open_device,
+	.close_device   = nvidia_vgpu_vfio_close_device,
+	.ioctl          = nvidia_vgpu_vfio_ioctl,
+	.device_feature = vfio_pci_core_ioctl_feature,
+	.read           = nvidia_vgpu_vfio_read,
+	.write          = nvidia_vgpu_vfio_write,
+	.mmap           = nvidia_vgpu_vfio_mmap,
+	.request	= vfio_pci_core_request,
+	.match		= vfio_pci_core_match,
+	.bind_iommufd	= vfio_iommufd_physical_bind,
+	.unbind_iommufd	= vfio_iommufd_physical_unbind,
+	.attach_ioas	= vfio_iommufd_physical_attach_ioas,
+	.detach_ioas	= vfio_iommufd_physical_detach_ioas,
+};
+
+static void clean_nvdev_unbound(struct nvidia_vgpu_vfio *nvdev)
+{
+	struct nvidia_vgpu_mgr *vgpu_mgr = nvdev->vgpu_mgr;
+
+	/* driver unbound path is called from the event chain. */
+	lockdep_assert_held(&vgpu_mgr->pf_driver_event_chain.lock);
+	list_del_init(&nvdev->pf_driver_event_listener.list);
+
+	nvidia_vgpu_vfio_clean_sysfs(nvdev);
+
+	nvidia_vgpu_mgr_release(nvdev->vgpu_mgr);
+	nvdev->vgpu_mgr = NULL;
+	nvdev->vgpu_type = NULL;
+}
+
+static void handle_driver_unbound(struct nvidia_vgpu_vfio *nvdev)
+{
+	struct task_struct *task;
+
+	mutex_lock(&nvdev->vfio_vgpu_lock);
+
+	if (nvdev->driver_is_unbound) {
+		mutex_unlock(&nvdev->vfio_vgpu_lock);
+		return;
+	}
+
+	nvdev->driver_is_unbound = true;
+
+	if (nvdev->vdev_is_opened) {
+		task = get_pid_task(nvdev->task_pid, PIDTYPE_PID);
+		if (!task) {
+			mutex_unlock(&nvdev->vfio_vgpu_lock);
+			return;
+		}
+
+		nvdev_debug(nvdev, "Killing client pid: %u\n", pid_nr(nvdev->task_pid));
+
+		send_sig(SIGTERM, task, 1);
+		put_task_struct(task);
+
+		mutex_unlock(&nvdev->vfio_vgpu_lock);
+
+		wait_for_completion(&nvdev->vdev_closing_completion);
+	} else {
+		mutex_unlock(&nvdev->vfio_vgpu_lock);
+	}
+
+	clean_nvdev_unbound(nvdev);
+}
+
+static int handle_pf_driver_event(struct nvidia_vgpu_event_listener *self, unsigned int event,
+				  void *p)
+{
+	struct nvidia_vgpu_vfio *nvdev = container_of(self, struct nvidia_vgpu_vfio,
+			pf_driver_event_listener);
+	struct pci_dev *pdev = nvdev->core_dev.pdev;
+
+	switch (event) {
+	case NVIDIA_VGPU_PF_DRIVER_EVENT_DRIVER_UNBIND:
+		nvdev_debug(nvdev, "handle PF event driver unbind\n");
+
+		handle_driver_unbound(nvdev);
+		break;
+	case NVIDIA_VGPU_PF_DRIVER_EVENT_SRIOV_CONFIGURE: {
+		int num_vfs = *(int *)p;
+
+		nvdev_debug(nvdev, "handle PF event SRIOV configure\n");
+		if (!num_vfs) {
+			handle_driver_unbound(nvdev);
+		} else {
+			/* convert num_vfs to the maximum VF ID */
+			num_vfs--;
+			if (pci_iov_vf_id(pdev) > num_vfs)
+				handle_driver_unbound(nvdev);
+		}
+		break;
+	}
+	}
+	return 0;
+}
+
+static void register_pf_driver_event_listener(struct nvidia_vgpu_vfio *nvdev)
+{
+	struct nvidia_vgpu_mgr *vgpu_mgr = nvdev->vgpu_mgr;
+
+	nvdev->pf_driver_event_listener.func = handle_pf_driver_event;
+	INIT_LIST_HEAD(&nvdev->pf_driver_event_listener.list);
+
+	nvidia_vgpu_event_register_listener(&vgpu_mgr->pf_driver_event_chain,
+					    &nvdev->pf_driver_event_listener);
+}
+
+static void unregister_pf_driver_event_listener(struct nvidia_vgpu_vfio *nvdev)
+{
+	struct nvidia_vgpu_mgr *vgpu_mgr = nvdev->vgpu_mgr;
+
+	nvidia_vgpu_event_unregister_listener(&vgpu_mgr->pf_driver_event_chain,
+					      &nvdev->pf_driver_event_listener);
+}
+
+static void clean_nvdev(struct nvidia_vgpu_vfio *nvdev)
+{
+	if (nvdev->driver_is_unbound)
+		return;
+
+	unregister_pf_driver_event_listener(nvdev);
+	nvidia_vgpu_vfio_clean_sysfs(nvdev);
+
+	nvidia_vgpu_mgr_release(nvdev->vgpu_mgr);
+	nvdev->vgpu_mgr = NULL;
+	nvdev->vgpu_type = NULL;
+}
+
+static int setup_nvdev(void *priv, void *data)
+{
+	struct nvidia_vgpu_mgr *vgpu_mgr = priv;
+	struct nvidia_vgpu_vfio *nvdev = data;
+	int ret;
+
+	mutex_init(&nvdev->vfio_vgpu_lock);
+	init_completion(&nvdev->vdev_closing_completion);
+
+	nvdev->vgpu_mgr = vgpu_mgr;
+
+	ret = nvidia_vgpu_vfio_setup_sysfs(nvdev);
+	if (ret)
+		return ret;
+
+	register_pf_driver_event_listener(nvdev);
+	return 0;
+}
+
+static int nvidia_vgpu_vfio_probe(struct pci_dev *pdev,
+				  const struct pci_device_id *id_table)
+{
+	struct nvidia_vgpu_vfio *nvdev;
+	int ret;
+
+	if (!pdev->is_virtfn)
+		return -EINVAL;
+
+	nvdev = vfio_alloc_device(nvidia_vgpu_vfio, core_dev.vdev,
+				  &pdev->dev, &nvidia_vgpu_vfio_ops);
+	if (IS_ERR(nvdev))
+		return PTR_ERR(nvdev);
+
+	ret = nvidia_vgpu_mgr_setup(pdev, setup_nvdev, nvdev);
+	if (ret)
+		goto err_setup_vgpu_mgr;
+
+	dev_set_drvdata(&pdev->dev, &nvdev->core_dev);
+
+	ret = vfio_pci_core_register_device(&nvdev->core_dev);
+	if (ret)
+		goto err_register_core_device;
+
+	return 0;
+
+err_register_core_device:
+	clean_nvdev(nvdev);
+err_setup_vgpu_mgr:
+	vfio_put_device(&nvdev->core_dev.vdev);
+	pci_err(pdev, "VF probe failed with ret: %d\n", ret);
+	return ret;
+}
+
+static void nvidia_vgpu_vfio_remove(struct pci_dev *pdev)
+{
+	struct vfio_pci_core_device *core_dev = dev_get_drvdata(&pdev->dev);
+	struct nvidia_vgpu_vfio *nvdev = core_dev_to_nvdev(core_dev);
+
+	WARN_ON(nvdev->vgpu || nvdev->vdev_is_opened);
+
+	vfio_pci_core_unregister_device(core_dev);
+	clean_nvdev(nvdev);
+	vfio_put_device(&core_dev->vdev);
+}
+
+struct pci_device_id nvidia_vgpu_vfio_table[] = {
+	{
+		.vendor      = PCI_VENDOR_ID_NVIDIA,
+		.device      = PCI_ANY_ID,
+		.subvendor   = PCI_ANY_ID,
+		.subdevice   = PCI_ANY_ID,
+		.class       = (PCI_CLASS_DISPLAY_3D << 8),
+		.class_mask  = ~0,
+	},
+	{ }
+};
+MODULE_DEVICE_TABLE(pci, nvidia_vgpu_vfio_table);
+
+struct pci_driver nvidia_vgpu_vfio_driver = {
+	.name               = "nvidia-vgpu-vfio",
+	.id_table           = nvidia_vgpu_vfio_table,
+	.probe              = nvidia_vgpu_vfio_probe,
+	.remove             = nvidia_vgpu_vfio_remove,
+	.driver_managed_dma = true,
+};
+
+module_pci_driver(nvidia_vgpu_vfio_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Vinay Kabra <vkabra@xxxxxxxxxx>");
+MODULE_AUTHOR("Kirti Wankhede <kwankhede@xxxxxxxxxx>");
+MODULE_AUTHOR("Zhi Wang <zhiw@xxxxxxxxxx>");
+MODULE_DESCRIPTION("NVIDIA vGPU VFIO Variant Driver - User Level driver for NVIDIA vGPU");
diff --git a/drivers/vfio/pci/nvidia-vgpu/vfio_sysfs.c b/drivers/vfio/pci/nvidia-vgpu/vfio_sysfs.c
new file mode 100644
index 000000000000..271b330f15b1
--- /dev/null
+++ b/drivers/vfio/pci/nvidia-vgpu/vfio_sysfs.c
@@ -0,0 +1,209 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright © 2025 NVIDIA Corporation
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/vfio_pci_core.h>
+#include <linux/types.h>
+
+#include "vfio.h"
+
+static struct nvidia_vgpu_type *find_vgpu_type(struct nvidia_vgpu_vfio *nvdev, u64 type_id)
+{
+	struct nvidia_vgpu_type *vgpu_type;
+	unsigned int i;
+
+	for (i = 0; i < nvdev->vgpu_mgr->num_vgpu_types; i++) {
+		vgpu_type = nvdev->vgpu_mgr->vgpu_types + i;
+		if (vgpu_type->vgpu_type == type_id)
+			return vgpu_type;
+	}
+	return NULL;
+}
+
+static ssize_t creatable_homogeneous_vgpu_types_show(struct nvidia_vgpu_vfio *nvdev, char *buf)
+{
+	struct nvidia_vgpu_mgr *vgpu_mgr = nvdev->vgpu_mgr;
+	ssize_t ret = 0;
+	u64 i;
+
+	mutex_lock(&vgpu_mgr->curr_vgpu_type_lock);
+	/* No vGPU has been created. */
+	if (!vgpu_mgr->curr_vgpu_type) {
+		ret += sprintf(buf, "ID    : vGPU Name\n");
+
+		for (i = 0; i < vgpu_mgr->num_vgpu_types; i++) {
+			struct nvidia_vgpu_type *type = vgpu_mgr->vgpu_types + i;
+
+			ret += sprintf(buf + ret, "%-5d : %s\n", type->vgpu_type,
+				       type->vgpu_type_name);
+		}
+	} else {
+		struct nvidia_vgpu_type *type = vgpu_mgr->curr_vgpu_type;
+
+		/* One or more vGPUs have already been created. */
+		if (vgpu_mgr->num_instances < type->max_instance)
+			ret = sprintf(buf + ret, "%-5d : %s\n", type->vgpu_type,
+				      type->vgpu_type_name);
+	}
+	mutex_unlock(&vgpu_mgr->curr_vgpu_type_lock);
+	return ret;
+}
+
+static int create_homogeneous_instance(struct nvidia_vgpu_vfio *nvdev,
+				       struct nvidia_vgpu_type *type)
+{
+	struct nvidia_vgpu_mgr *vgpu_mgr = nvdev->vgpu_mgr;
+	int ret = 0;
+
+	mutex_lock(&vgpu_mgr->curr_vgpu_type_lock);
+	if (!vgpu_mgr->curr_vgpu_type) {
+		vgpu_mgr->curr_vgpu_type = type;
+		vgpu_mgr->num_instances++;
+		nvdev->vgpu_type = type;
+	} else {
+		if (type != vgpu_mgr->curr_vgpu_type) {
+			ret = -EINVAL;
+		} else if (vgpu_mgr->num_instances >= vgpu_mgr->curr_vgpu_type->max_instance) {
+			ret = -ENOSPC;
+		} else {
+			vgpu_mgr->num_instances++;
+			nvdev->vgpu_type = type;
+		}
+	}
+	mutex_unlock(&vgpu_mgr->curr_vgpu_type_lock);
+	return ret;
+}
+
+static void destroy_homogeneous_instance(struct nvidia_vgpu_vfio *nvdev)
+{
+	struct nvidia_vgpu_mgr *vgpu_mgr = nvdev->vgpu_mgr;
+
+	if (!nvdev->vgpu_type)
+		return;
+
+	mutex_lock(&vgpu_mgr->curr_vgpu_type_lock);
+	if (vgpu_mgr->curr_vgpu_type) {
+		if (!--vgpu_mgr->num_instances)
+			vgpu_mgr->curr_vgpu_type = NULL;
+	}
+	nvdev->vgpu_type = NULL;
+	mutex_unlock(&vgpu_mgr->curr_vgpu_type_lock);
+}
+
+static ssize_t creatable_vgpu_types_show(struct device *dev, struct device_attribute *attr,
+					 char *buf)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct vfio_pci_core_device *core_dev = pci_get_drvdata(pdev);
+	struct nvidia_vgpu_vfio *nvdev = core_dev_to_nvdev(core_dev);
+	ssize_t ret;
+
+	mutex_lock(&nvdev->vfio_vgpu_lock);
+	if (nvdev->vgpu_type) {
+		mutex_unlock(&nvdev->vfio_vgpu_lock);
+		return 0;
+	}
+
+	ret = creatable_homogeneous_vgpu_types_show(nvdev, buf);
+	mutex_unlock(&nvdev->vfio_vgpu_lock);
+	return ret;
+}
+
+static DEVICE_ATTR_RO(creatable_vgpu_types);
+
+static ssize_t current_vgpu_type_store(struct device *dev, struct device_attribute *attr,
+				       const char *buf, size_t count)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct vfio_pci_core_device *core_dev = pci_get_drvdata(pdev);
+	struct nvidia_vgpu_vfio *nvdev = core_dev_to_nvdev(core_dev);
+	struct nvidia_vgpu_type *type;
+	unsigned long vgpu_type_id = ~0;
+	int ret = 0;
+
+	ret = kstrtoul(buf, 10, &vgpu_type_id);
+	if (ret)
+		return ret;
+
+	mutex_lock(&nvdev->vfio_vgpu_lock);
+
+	if (nvdev->vdev_is_opened) {
+		mutex_unlock(&nvdev->vfio_vgpu_lock);
+		return -EBUSY;
+	}
+
+	if (vgpu_type_id) {
+		type = find_vgpu_type(nvdev, vgpu_type_id);
+		if (!type) {
+			ret = -ENODEV;
+			goto out_unlock;
+		}
+		ret = create_homogeneous_instance(nvdev, type);
+	} else {
+		destroy_homogeneous_instance(nvdev);
+	}
+
+out_unlock:
+	mutex_unlock(&nvdev->vfio_vgpu_lock);
+	return ret ? ret : count;
+}
+
+static ssize_t current_vgpu_type_show(struct device *dev, struct device_attribute *attr,
+				      char *buf)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct vfio_pci_core_device *core_dev = pci_get_drvdata(pdev);
+	struct nvidia_vgpu_vfio *nvdev = core_dev_to_nvdev(core_dev);
+	unsigned long type_id;
+
+	mutex_lock(&nvdev->vfio_vgpu_lock);
+
+	type_id = nvdev->vgpu_type ? nvdev->vgpu_type->vgpu_type : 0;
+
+	mutex_unlock(&nvdev->vfio_vgpu_lock);
+
+	return sprintf(buf, "%lu\n", type_id);
+}
+
+static DEVICE_ATTR_RW(current_vgpu_type);
+
+static struct attribute *vf_dev_attrs[] = {
+	&dev_attr_creatable_vgpu_types.attr,
+	&dev_attr_current_vgpu_type.attr,
+	NULL,
+};
+
+static const struct attribute_group vf_dev_group = {
+	.name  = "nvidia",
+	.attrs = vf_dev_attrs,
+};
+
+const struct attribute_group *vf_dev_groups[] = {
+	&vf_dev_group,
+	NULL,
+};
+
+int nvidia_vgpu_vfio_setup_sysfs(struct nvidia_vgpu_vfio *nvdev)
+{
+	struct pci_dev *pdev = nvdev->core_dev.pdev;
+
+	if (WARN_ON(!pdev))
+		return -EINVAL;
+
+	return sysfs_create_groups(&pdev->dev.kobj, vf_dev_groups);
+}
+
+void nvidia_vgpu_vfio_clean_sysfs(struct nvidia_vgpu_vfio *nvdev)
+{
+	struct pci_dev *pdev = nvdev->core_dev.pdev;
+
+	if (WARN_ON(!pdev))
+		return;
+
+	sysfs_remove_groups(&pdev->dev.kobj, vf_dev_groups);
+}
diff --git a/drivers/vfio/pci/nvidia-vgpu/vgpu.c b/drivers/vfio/pci/nvidia-vgpu/vgpu.c
index 9e8ea77bbcc5..72083d300b8a 100644
--- a/drivers/vfio/pci/nvidia-vgpu/vgpu.c
+++ b/drivers/vfio/pci/nvidia-vgpu/vgpu.c
@@ -9,6 +9,7 @@
 #include "vgpu_mgr.h"
 
 #include <nvrm/bootload.h>
+#include <nvrm/vgpu.h>
 
 static void unregister_vgpu(struct nvidia_vgpu *vgpu)
 {
@@ -361,7 +362,7 @@ int nvidia_vgpu_mgr_create_vgpu(struct nvidia_vgpu *vgpu)
 	struct nvidia_vgpu_info *info = &vgpu->info;
 	int ret;
 
-	if (WARN_ON(!info->gfid || !info->dbdf || !info->vgpu_type))
+	if (WARN_ON(!info->gfid || !info->dbdf || !info->vgpu_type || !info->vm_pid))
 		return -EINVAL;
 
 	if (WARN_ON(!vgpu->vgpu_mgr || !vgpu->pdev))
@@ -372,8 +373,8 @@ int nvidia_vgpu_mgr_create_vgpu(struct nvidia_vgpu *vgpu)
 
 	vgpu->info = *info;
 
-	vgpu_debug(vgpu, "create vgpu %s on vgpu_mgr %px\n",
-		   info->vgpu_type->vgpu_type_name, vgpu->vgpu_mgr);
+	vgpu_debug(vgpu, "create vgpu %s on vgpu_mgr %px vm pid %u\n",
+		   info->vgpu_type->vgpu_type_name, vgpu->vgpu_mgr, info->vm_pid);
 
 	ret = register_vgpu(vgpu);
 	if (ret)
@@ -427,3 +428,49 @@ int nvidia_vgpu_mgr_create_vgpu(struct nvidia_vgpu *vgpu)
 	return ret;
 }
 EXPORT_SYMBOL_GPL(nvidia_vgpu_mgr_create_vgpu);
+
+/**
+ * nvidia_vgpu_mgr_reset_vgpu - reset a vGPU instance
+ * @vgpu: the vGPU instance going to be reset.
+ *
+ * Returns: 0 on success, others on failure.
+ */
+int nvidia_vgpu_mgr_reset_vgpu(struct nvidia_vgpu *vgpu)
+{
+	int ret;
+
+	ret = nvidia_vgpu_rpc_call(vgpu, NV_VGPU_CPU_RPC_MSG_RESET, NULL, 0);
+	if (ret) {
+		vgpu_error(vgpu, "failed to reset vgpu, ret %d\n", ret);
+		return ret;
+	}
+
+	vgpu_debug(vgpu, "reset done\n");
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvidia_vgpu_mgr_reset_vgpu);
+
+static int update_bme_state(struct nvidia_vgpu *vgpu, bool enable)
+{
+	NV_VGPU_CPU_RPC_DATA_UPDATE_BME_STATE params = {0};
+
+	params.enable = enable;
+
+	return nvidia_vgpu_rpc_call(vgpu, NV_VGPU_CPU_RPC_MSG_UPDATE_BME_STATE,
+				    &params, sizeof(params));
+}
+
+/**
+ * nvidia_vgpu_mgr_set_bme - handle the BME (Bus Master Enable) sequence
+ * @vgpu: the vGPU instance
+ * @enable: BME enable/disable
+ *
+ * Returns: 0 on success, others on failure.
+ */
+int nvidia_vgpu_mgr_set_bme(struct nvidia_vgpu *vgpu, bool enable)
+{
+	vgpu_debug(vgpu, "set bme, enable %d\n", enable);
+
+	return update_bme_state(vgpu, enable);
+}
+EXPORT_SYMBOL_GPL(nvidia_vgpu_mgr_set_bme);
diff --git a/drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.c b/drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.c
index 6f53bd7ca940..e502a37468e3 100644
--- a/drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.c
+++ b/drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.c
@@ -103,10 +103,32 @@ static struct nvidia_vgpu_mgr *alloc_vgpu_mgr(struct nvidia_vgpu_mgr_handle *han
 	mutex_init(&vgpu_mgr->vgpu_list_lock);
 	INIT_LIST_HEAD(&vgpu_mgr->vgpu_list_head);
 	atomic_set(&vgpu_mgr->num_vgpus, 0);
+	mutex_init(&vgpu_mgr->curr_vgpu_type_lock);
+	nvidia_vgpu_event_init_chain(&vgpu_mgr->pf_driver_event_chain);
 
 	return vgpu_mgr;
 }
 
+static int call_chain(struct nvidia_vgpu_event_chain *chain, unsigned int event, void *data)
+{
+	struct nvidia_vgpu_event_listener *l;
+	struct list_head *pos, *temp;
+	int ret = 0;
+
+	mutex_lock(&chain->lock);
+
+	list_for_each_safe(pos, temp, &chain->head) {
+		l = container_of(pos, struct nvidia_vgpu_event_listener, list);
+		ret = l->func(l, event, data);
+		if (ret)
+			goto out_unlock;
+	}
+
+out_unlock:
+	mutex_unlock(&chain->lock);
+	return ret;
+}
+
 static const char *pf_events_string[NVIDIA_VGPU_PF_EVENT_MAX] = {
 	[NVIDIA_VGPU_PF_DRIVER_EVENT_SRIOV_CONFIGURE] = "SRIOV configure",
 	[NVIDIA_VGPU_PF_DRIVER_EVENT_DRIVER_UNBIND] = "driver unbind",
@@ -115,14 +137,20 @@ static const char *pf_events_string[NVIDIA_VGPU_PF_EVENT_MAX] = {
 static int pf_event_notify_fn(void *priv, unsigned int event, void *data)
 {
 	struct nvidia_vgpu_mgr *vgpu_mgr = priv;
+	int ret = 0;
 
 	if (WARN_ON(event >= NVIDIA_VGPU_PF_EVENT_MAX))
 		return -EINVAL;
 
 	vgpu_mgr_debug(vgpu_mgr, "handle PF event %s\n", pf_events_string[event]);
 
-	/* more to come. */
-	return 0;
+	switch (event) {
+	case NVIDIA_VGPU_PF_DRIVER_EVENT_START...NVIDIA_VGPU_PF_DRIVER_EVENT_END:
+		ret = call_chain(&vgpu_mgr->pf_driver_event_chain, event, data);
+		break;
+	}
+
+	return ret;
 }
 
 static void attach_vgpu_mgr(struct nvidia_vgpu_mgr *vgpu_mgr,
@@ -378,3 +406,39 @@ int nvidia_vgpu_mgr_setup(struct pci_dev *dev, int (*init_vfio_fn)(void *priv, v
 	return nvidia_vgpu_mgr_attach_handle(&handle, &attach_handle_data);
 }
 EXPORT_SYMBOL(nvidia_vgpu_mgr_setup);
+
+/**
+ * nvidia_vgpu_event_init_chain - initialize an event chain
+ * @chain: the event chain.
+ */
+void nvidia_vgpu_event_init_chain(struct nvidia_vgpu_event_chain *chain)
+{
+	mutex_init(&chain->lock);
+	INIT_LIST_HEAD(&chain->head);
+}
+
+/**
+ * nvidia_vgpu_event_register_listener - register an event listener
+ * @chain: the event chain.
+ * @l: the listener.
+ */
+void nvidia_vgpu_event_register_listener(struct nvidia_vgpu_event_chain *chain,
+					 struct nvidia_vgpu_event_listener *l)
+{
+	mutex_lock(&chain->lock);
+	list_add_tail(&l->list, &chain->head);
+	mutex_unlock(&chain->lock);
+}
+
+/**
+ * nvidia_vgpu_event_unregister_listener - unregister an event listener
+ * @chain: the event chain.
+ * @l: the listener.
+ */
+void nvidia_vgpu_event_unregister_listener(struct nvidia_vgpu_event_chain *chain,
+					   struct nvidia_vgpu_event_listener *l)
+{
+	mutex_lock(&chain->lock);
+	list_del_init(&l->list);
+	mutex_unlock(&chain->lock);
+}
diff --git a/drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.h b/drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.h
index fe475f8b2882..dc782f825f2b 100644
--- a/drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.h
+++ b/drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.h
@@ -101,6 +101,17 @@ struct nvidia_vgpu {
 	struct nvidia_vgpu_rpc rpc;
 };
 
+struct nvidia_vgpu_event_listener {
+	int (*func)(struct nvidia_vgpu_event_listener *self, unsigned int event, void *data);
+	struct list_head list;
+};
+
+struct nvidia_vgpu_event_chain {
+	/* lock for PF event listener list */
+	struct mutex lock;
+	struct list_head head;
+};
+
 /**
  * struct nvidia_vgpu_mgr - the vGPU manager
  *
@@ -125,6 +136,10 @@ struct nvidia_vgpu {
  * @num_vgpu_types: number of installed vGPU types
  * @use_alloc_bitmap: use chid allocator for the PF driver doesn't support chid allocation
  * @chid_alloc_bitmap: chid allocator bitmap
+ * @curr_vgpu_type_lock: lock to protect curr_vgpu_type and num_instances
+ * @curr_vgpu_type: type of the currently created vGPUs in homogeneous mode
+ * @num_instances: number of vGPUs created with curr_vgpu_type in homogeneous mode
+ * @pf_driver_event_chain: PF driver event chain
  * @pdev: the PCI device pointer
  * @bar0_vaddr: the virtual address of BAR0
  */
@@ -163,6 +178,13 @@ struct nvidia_vgpu_mgr {
 	bool use_chid_alloc_bitmap;
 	void *chid_alloc_bitmap;
 
+	/* lock for current vGPU type */
+	struct mutex curr_vgpu_type_lock;
+	struct nvidia_vgpu_type *curr_vgpu_type;
+	unsigned int num_instances;
+
+	struct nvidia_vgpu_event_chain pf_driver_event_chain;
+
 	struct pci_dev *pdev;
 	void __iomem *bar0_vaddr;
 };
@@ -173,14 +195,21 @@ struct nvidia_vgpu_mgr {
 int nvidia_vgpu_mgr_setup(struct pci_dev *dev, int (*init_vfio_fn)(void *priv, void *data),
 			  void *init_vfio_fn_data);
 void nvidia_vgpu_mgr_release(struct nvidia_vgpu_mgr *vgpu_mgr);
+void nvidia_vgpu_event_init_chain(struct nvidia_vgpu_event_chain *chain);
+void nvidia_vgpu_event_register_listener(struct nvidia_vgpu_event_chain *chain,
+					 struct nvidia_vgpu_event_listener *l);
+void nvidia_vgpu_event_unregister_listener(struct nvidia_vgpu_event_chain *chain,
+					   struct nvidia_vgpu_event_listener *l);
 
 int nvidia_vgpu_mgr_destroy_vgpu(struct nvidia_vgpu *vgpu);
 int nvidia_vgpu_mgr_create_vgpu(struct nvidia_vgpu *vgpu);
+int nvidia_vgpu_mgr_reset_vgpu(struct nvidia_vgpu *vgpu);
 int nvidia_vgpu_mgr_setup_metadata(struct nvidia_vgpu_mgr *vgpu_mgr);
 void nvidia_vgpu_mgr_clean_metadata(struct nvidia_vgpu_mgr *vgpu_mgr);
 int nvidia_vgpu_rpc_call(struct nvidia_vgpu *vgpu, u32 msg_type,
 			 void *data, u64 size);
 void nvidia_vgpu_clean_rpc(struct nvidia_vgpu *vgpu);
 int nvidia_vgpu_setup_rpc(struct nvidia_vgpu *vgpu);
+int nvidia_vgpu_mgr_set_bme(struct nvidia_vgpu *vgpu, bool enable);
 
 #endif
-- 
2.34.1




