The GSP firmware provides several per-vGPU logging buffers to help with
debugging. Export those buffers to userspace through debugfs so that users
can attach them when reporting bugs.

Signed-off-by: Zhi Wang <zhiw@xxxxxxxxxx>
---
 drivers/vfio/pci/nvidia-vgpu/Makefile       |   4 +-
 drivers/vfio/pci/nvidia-vgpu/debugfs.c      |  86 ++++++++++
 drivers/vfio/pci/nvidia-vgpu/vfio.h         |  20 +++
 drivers/vfio/pci/nvidia-vgpu/vfio_debugfs.c | 170 ++++++++++++++++++++
 drivers/vfio/pci/nvidia-vgpu/vfio_main.c    |  58 ++++++-
 drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.h     |   2 +
 6 files changed, 337 insertions(+), 3 deletions(-)
 create mode 100644 drivers/vfio/pci/nvidia-vgpu/debugfs.c
 create mode 100644 drivers/vfio/pci/nvidia-vgpu/vfio_debugfs.c

diff --git a/drivers/vfio/pci/nvidia-vgpu/Makefile b/drivers/vfio/pci/nvidia-vgpu/Makefile
index 2aba9b4868aa..615712b40128 100644
--- a/drivers/vfio/pci/nvidia-vgpu/Makefile
+++ b/drivers/vfio/pci/nvidia-vgpu/Makefile
@@ -2,5 +2,5 @@
 subdir-ccflags-y += -I$(src)/include
 
 obj-$(CONFIG_NVIDIA_VGPU_VFIO_PCI) += nvidia_vgpu_vfio_pci.o
-nvidia_vgpu_vfio_pci-y := vgpu_mgr.o vgpu.o metadata.o metadata_vgpu_type.o rpc.o \
-	vfio_main.o vfio_access.o vfio_sysfs.o
+nvidia_vgpu_vfio_pci-y := vgpu_mgr.o vgpu.o metadata.o metadata_vgpu_type.o rpc.o debugfs.o \
+	vfio_main.o vfio_access.o vfio_sysfs.o vfio_debugfs.o
diff --git a/drivers/vfio/pci/nvidia-vgpu/debugfs.c b/drivers/vfio/pci/nvidia-vgpu/debugfs.c
new file mode 100644
index 000000000000..e6cdf44cd846
--- /dev/null
+++ b/drivers/vfio/pci/nvidia-vgpu/debugfs.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright © 2025 NVIDIA Corporation
+ */
+
+#include <linux/debugfs.h>
+#include <linux/kref.h>
+#include <linux/mutex.h>
+
+#include "vgpu_mgr.h"
+
+/*
+ * Shared, reference-counted "nvidia-vgpu" debugfs directory. The directory is
+ * created by the first nvidia_vgpu_get_debugfs_root() call and removed when
+ * the last reference is dropped by nvidia_vgpu_put_debugfs_root().
+ */
+struct debugfs_root {
+	/* mutex to protect the debugfs_root */
+	struct mutex mutex;
+	struct kref refcount;
+	/* NULL while the directory does not exist */
+	struct dentry *root;
+};
+
+/* File-local singleton; only reachable through the get/put helpers below. */
+static struct debugfs_root debugfs_root = {
+	.mutex = __MUTEX_INITIALIZER(debugfs_root.mutex),
+};
+
+/*
+ * Take a reference on the shared debugfs directory, creating it on first use.
+ *
+ * Returns the directory dentry, or the ERR_PTR() from debugfs_create_dir() if
+ * the directory could not be created. No reference is held on failure, so
+ * nvidia_vgpu_put_debugfs_root() must only be called after a success.
+ */
+struct dentry *nvidia_vgpu_get_debugfs_root(void)
+{
+	struct debugfs_root *root = &debugfs_root;
+	struct dentry *dentry;
+
+	mutex_lock(&root->mutex);
+	if (root->root) {
+		kref_get(&root->refcount);
+		dentry = root->root;
+		goto out_unlock;
+	}
+
+	dentry = debugfs_create_dir("nvidia-vgpu", NULL);
+	if (IS_ERR(dentry))
+		goto out_unlock;
+
+	kref_init(&root->refcount);
+	root->root = dentry;
+
+out_unlock:
+	mutex_unlock(&root->mutex);
+	return dentry;
+}
+
+/* kref release callback: runs with debugfs_root.mutex held by the put path. */
+static void debugfs_root_release(struct kref *kref)
+{
+	struct debugfs_root *root = container_of(kref, struct debugfs_root, refcount);
+
+	debugfs_remove(root->root);
+	root->root = NULL;
+}
+
+/*
+ * Drop a reference taken by nvidia_vgpu_get_debugfs_root(). The directory is
+ * removed when the last reference goes away. Warns on an unbalanced put.
+ */
+void nvidia_vgpu_put_debugfs_root(void)
+{
+	struct debugfs_root *root = &debugfs_root;
+
+	mutex_lock(&root->mutex);
+	if (WARN_ON(!root->root))
+		goto out_unlock;
+
+	kref_put(&root->refcount, debugfs_root_release);
+
+out_unlock:
+	mutex_unlock(&root->mutex);
+}
diff --git a/drivers/vfio/pci/nvidia-vgpu/vfio.h b/drivers/vfio/pci/nvidia-vgpu/vfio.h
index 4c9bf9c80f5c..8edc8cd6c6dc 100644
--- a/drivers/vfio/pci/nvidia-vgpu/vfio.h
+++ b/drivers/vfio/pci/nvidia-vgpu/vfio.h
@@ -6,6 +6,7 @@
 #ifndef _NVIDIA_VGPU_VFIO_H__
 #define _NVIDIA_VGPU_VFIO_H__
 
+#include <linux/debugfs.h>
 #include <linux/vfio_pci_core.h>
 
 #include "vgpu_mgr.h"
@@ -15,6 +16,16 @@
 #define CAP_LIST_NEXT_PTR_MSIX 0x7c
 #define MSIX_CAP_SIZE 0xc
 
+/*
+ * One exported vGPU log: @blob backs the debugfs file, @mem is the
+ * driver-owned snapshot buffer and @dentry is the debugfs file itself.
+ */
+struct nvidia_vgpu_vfio_log {
+	struct debugfs_blob_wrapper blob;
+	void *mem;
+	struct dentry *dentry;
+};
+
 struct nvidia_vgpu_vfio {
 	struct vfio_pci_core_device core_dev;
 	u8 vconfig[PCI_CONFIG_SPACE_LENGTH];
@@ -32,6 +43,12 @@ struct nvidia_vgpu_vfio {
 
 	struct completion vdev_closing_completion;
 	struct nvidia_vgpu_event_listener pf_driver_event_listener;
+	struct nvidia_vgpu_event_listener pf_event_listener;
+
+	/* Logs */
+	struct nvidia_vgpu_vfio_log log_init_task;
+	struct nvidia_vgpu_vfio_log log_vgpu_task;
+	struct nvidia_vgpu_vfio_log log_kernel;
 };
 
 static inline struct nvidia_vgpu_vfio *core_dev_to_nvdev(struct vfio_pci_core_device *core_dev)
@@ -45,5 +62,8 @@ ssize_t nvidia_vgpu_vfio_access(struct nvidia_vgpu_vfio *nvdev, char __user *buf
 
 int nvidia_vgpu_vfio_setup_sysfs(struct nvidia_vgpu_vfio *nvdev);
 void nvidia_vgpu_vfio_clean_sysfs(struct nvidia_vgpu_vfio *nvdev);
+int nvidia_vgpu_vfio_setup_debugfs(struct nvidia_vgpu_vfio *nvdev);
+void nvidia_vgpu_vfio_clean_debugfs(struct nvidia_vgpu_vfio *nvdev);
+void nvidia_vgpu_vfio_update_logs(struct nvidia_vgpu_vfio *nvdev);
 
 #endif /* _NVIDIA_VGPU_VFIO_H__ */
diff --git a/drivers/vfio/pci/nvidia-vgpu/vfio_debugfs.c b/drivers/vfio/pci/nvidia-vgpu/vfio_debugfs.c
new file mode 100644
index 000000000000..52a80928f74f
--- /dev/null
+++ b/drivers/vfio/pci/nvidia-vgpu/vfio_debugfs.c
@@ -0,0 +1,170 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright © 2025 NVIDIA Corporation
+ */
+
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include "vfio.h"
+
+/*
+ * Remove the debugfs file of @log and release its snapshot buffer. Every
+ * field is reset afterwards, so calling this again on the same log is
+ * harmless.
+ */
+static void free_vgpu_log(struct nvidia_vgpu_vfio_log *log)
+{
+	debugfs_remove(log->dentry);
+	log->dentry = NULL;
+
+	kvfree(log->mem);
+	log->mem = NULL;
+	log->blob.data = NULL;
+	log->blob.size = 0;
+}
+
+/* Tear down all three per-vGPU logs. */
+static void clean_vgpu_logs(struct nvidia_vgpu_vfio *nvdev)
+{
+	free_vgpu_log(&nvdev->log_init_task);
+	free_vgpu_log(&nvdev->log_vgpu_task);
+	free_vgpu_log(&nvdev->log_kernel);
+}
+
+/*
+ * Allocate a zeroed @size byte snapshot buffer for @log and expose it
+ * read-only (0400) as the debugfs blob "<dev_name(@dev)>-<@name>" in @root.
+ *
+ * Returns 0 on success or a negative errno. On failure nothing stays
+ * allocated and @log is left in its freed state.
+ */
+static int alloc_vgpu_log(struct nvidia_vgpu_vfio_log *log, struct device *dev,
+			  struct dentry *root, const char *name, u64 size)
+{
+	struct dentry *dentry;
+	char *path;
+
+	path = kasprintf(GFP_KERNEL, "%s-%s", dev_name(dev), name);
+	if (!path)
+		return -ENOMEM;
+
+	log->mem = kvzalloc(size, GFP_KERNEL);
+	if (!log->mem) {
+		kfree(path);
+		return -ENOMEM;
+	}
+
+	log->blob.size = size;
+	log->blob.data = log->mem;
+
+	dentry = debugfs_create_blob(path, 0400, root, &log->blob);
+	/* the name is only needed while the file is being created */
+	kfree(path);
+
+	if (IS_ERR(dentry)) {
+		/* no file to remove; kvfree() the buffer and reset @log */
+		log->dentry = NULL;
+		free_vgpu_log(log);
+		return PTR_ERR(dentry);
+	}
+
+	log->dentry = dentry;
+	return 0;
+}
+
+/*
+ * Create the init-task, vGPU-task and kernel log files of @nvdev in @root,
+ * sized by the vGPU manager. Logs already created are freed again when a
+ * later one fails.
+ */
+static int setup_vgpu_logs(struct nvidia_vgpu_vfio *nvdev, struct dentry *root)
+{
+	struct nvidia_vgpu_mgr *vgpu_mgr = nvdev->vgpu_mgr;
+	struct device *dev = &nvdev->core_dev.pdev->dev;
+	int ret;
+
+	ret = alloc_vgpu_log(&nvdev->log_init_task, dev, root, "init_task_log",
+			     vgpu_mgr->init_task_log_size);
+	if (ret)
+		return ret;
+
+	ret = alloc_vgpu_log(&nvdev->log_vgpu_task, dev, root, "vgpu_task_log",
+			     vgpu_mgr->vgpu_task_log_size);
+	if (ret)
+		goto err_free_init_task;
+
+	ret = alloc_vgpu_log(&nvdev->log_kernel, dev, root, "kernel_log",
+			     vgpu_mgr->kernel_log_size);
+	if (ret)
+		goto err_free_vgpu_task;
+
+	return 0;
+
+err_free_vgpu_task:
+	free_vgpu_log(&nvdev->log_vgpu_task);
+err_free_init_task:
+	free_vgpu_log(&nvdev->log_init_task);
+	return ret;
+}
+
+/*
+ * Create the debugfs log files of @nvdev in the shared "nvidia-vgpu"
+ * directory. Undone by nvidia_vgpu_vfio_clean_debugfs().
+ *
+ * Returns 0 on success or a negative errno; the reference on the shared
+ * directory is dropped again on failure.
+ *
+ * NOTE(review): debugfs errors are fatal for the caller. With CONFIG_DEBUG_FS
+ * disabled debugfs_create_dir() returns ERR_PTR(-ENODEV) - confirm that
+ * failing device setup in that configuration is intended.
+ */
+int nvidia_vgpu_vfio_setup_debugfs(struct nvidia_vgpu_vfio *nvdev)
+{
+	struct dentry *root = nvidia_vgpu_get_debugfs_root();
+	int ret;
+
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+
+	ret = setup_vgpu_logs(nvdev, root);
+	if (ret) {
+		nvidia_vgpu_put_debugfs_root();
+		return ret;
+	}
+
+	return 0;
+}
+
+/* Remove the log files of @nvdev and drop its shared-directory reference. */
+void nvidia_vgpu_vfio_clean_debugfs(struct nvidia_vgpu_vfio *nvdev)
+{
+	clean_vgpu_logs(nvdev);
+	nvidia_vgpu_put_debugfs_root();
+}
+
+/*
+ * Copy what each debugfs blob currently points at into that log's snapshot
+ * buffer. A log whose blob already points at its snapshot buffer, or that
+ * has no buffer, is skipped: there is nothing to refresh and memcpy() must
+ * not be given overlapping source and destination.
+ */
+void nvidia_vgpu_vfio_update_logs(struct nvidia_vgpu_vfio *nvdev)
+{
+	struct nvidia_vgpu_vfio_log *logs[] = {
+		&nvdev->log_init_task,
+		&nvdev->log_vgpu_task,
+		&nvdev->log_kernel,
+	};
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(logs); i++) {
+		struct nvidia_vgpu_vfio_log *log = logs[i];
+
+		if (!log->mem || !log->blob.data || log->blob.data == log->mem)
+			continue;
+
+		memcpy(log->mem, log->blob.data, log->blob.size);
+	}
+}
diff --git a/drivers/vfio/pci/nvidia-vgpu/vfio_main.c b/drivers/vfio/pci/nvidia-vgpu/vfio_main.c
index b557062a4ac2..4a6d939046e0 100644
--- a/drivers/vfio/pci/nvidia-vgpu/vfio_main.c
+++ b/drivers/vfio/pci/nvidia-vgpu/vfio_main.c
@@ -24,10 +24,55 @@ static int pdev_to_gfid(struct pci_dev *pdev)
 	return pci_iov_vf_id(pdev) + 1;
 }
 
+/*
+ * Stop exposing the live log buffers of the vGPU: take a final snapshot into
+ * the driver-owned buffers and point the debugfs blobs at those snapshots.
+ *
+ * NOTE(review): blob.data is switched without any visible synchronization
+ * against concurrent debugfs readers - confirm this is safe.
+ */
+static void disable_vgpu_logs(struct nvidia_vgpu_vfio *nvdev)
+{
+	if (WARN_ON(!nvdev->vgpu))
+		return;
+
+	/* save the latest vGPU logs before disabling */
+	nvidia_vgpu_vfio_update_logs(nvdev);
+
+	nvdev->log_init_task.blob.data = nvdev->log_init_task.mem;
+	nvdev->log_vgpu_task.blob.data = nvdev->log_vgpu_task.mem;
+	nvdev->log_kernel.blob.data = nvdev->log_kernel.mem;
+}
+
+/*
+ * Point the debugfs blobs at the vGPU's log buffers (mgmt->*_log_vaddr)
+ * and take an initial snapshot of them.
+ */
+static void enable_vgpu_logs(struct nvidia_vgpu_vfio *nvdev)
+{
+	struct nvidia_vgpu *vgpu = nvdev->vgpu;
+	struct nvidia_vgpu_mgmt *mgmt;
+
+	if (WARN_ON(!vgpu))
+		return;
+
+	/* only derive pointers from @vgpu once it is known to be valid */
+	mgmt = &vgpu->mgmt;
+
+	nvdev->log_init_task.blob.data = mgmt->init_task_log_vaddr;
+	nvdev->log_vgpu_task.blob.data = mgmt->vgpu_task_log_vaddr;
+	nvdev->log_kernel.blob.data = mgmt->kernel_log_vaddr;
+
+	/* get the latest vGPU logs after enabling */
+	nvidia_vgpu_vfio_update_logs(nvdev);
+}
+
 static int destroy_vgpu(struct nvidia_vgpu_vfio *nvdev)
 {
 	int ret;
 
+	disable_vgpu_logs(nvdev);
+
 	ret = nvidia_vgpu_mgr_destroy_vgpu(nvdev->vgpu);
 	if (ret)
 		return ret;
@@ -68,6 +113,8 @@ static int create_vgpu(struct nvidia_vgpu_vfio *nvdev)
 	}
 
 	nvdev->vgpu = vgpu;
+
+	enable_vgpu_logs(nvdev);
 	return 0;
 }
 
@@ -582,11 +629,14 @@ static void unregister_pf_driver_event_listener(struct nvidia_vgpu_vfio *nvdev)
 
 static void clean_nvdev(struct nvidia_vgpu_vfio *nvdev)
 {
-	if (nvdev->driver_is_unbound)
+	if (nvdev->driver_is_unbound) {
+		nvidia_vgpu_vfio_clean_debugfs(nvdev);
 		return;
+	}
 
 	unregister_pf_driver_event_listener(nvdev);
 	nvidia_vgpu_vfio_clean_sysfs(nvdev);
+	nvidia_vgpu_vfio_clean_debugfs(nvdev);
 
 	nvidia_vgpu_mgr_release(nvdev->vgpu_mgr);
 	nvdev->vgpu_mgr = NULL;
@@ -608,6 +658,12 @@ static int setup_nvdev(void *priv, void *data)
 	if (ret)
 		return ret;
 
+	ret = nvidia_vgpu_vfio_setup_debugfs(nvdev);
+	if (ret) {
+		nvidia_vgpu_vfio_clean_sysfs(nvdev);
+		return ret;
+	}
+
 	register_pf_driver_event_listener(nvdev);
 	return 0;
 }
diff --git a/drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.h b/drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.h
index b5bcde555a5d..04fef4f69793 100644
--- a/drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.h
+++ b/drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.h
@@ -225,5 +225,7 @@ int nvidia_vgpu_rpc_call(struct nvidia_vgpu *vgpu, u32 msg_type,
 void nvidia_vgpu_clean_rpc(struct nvidia_vgpu *vgpu);
 int nvidia_vgpu_setup_rpc(struct nvidia_vgpu *vgpu);
 int nvidia_vgpu_mgr_set_bme(struct nvidia_vgpu *vgpu, bool enable);
+struct dentry *nvidia_vgpu_get_debugfs_root(void);
+void nvidia_vgpu_put_debugfs_root(void);
 
 #endif
-- 
2.34.1