Creating a vGPU requires allocating a portion of the FB memory from NVKM. The amount of FB memory that a vGPU requires is determined by its vGPU type. Acquire the size of the required FB memory from the vGPU type and allocate that FB memory from NVKM when creating a vGPU. When ECC is enabled on the GPU, re-calculate the size from the total FB memory size and align it to the VMMU segment size, both of which are queried when the vGPU manager is initialized. Signed-off-by: Zhi Wang <zhiw@xxxxxxxxxx> --- drivers/vfio/pci/nvidia-vgpu/debug.h | 5 ++ .../vfio/pci/nvidia-vgpu/include/nvrm/ecc.h | 45 ++++++++++++ .../vfio/pci/nvidia-vgpu/include/nvrm/vmmu.h | 39 +++++++++++ drivers/vfio/pci/nvidia-vgpu/pf.h | 8 +++ drivers/vfio/pci/nvidia-vgpu/vgpu.c | 70 +++++++++++++++++++ drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.c | 56 ++++++++++++++- drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.h | 8 +++ 7 files changed, 229 insertions(+), 2 deletions(-) create mode 100644 drivers/vfio/pci/nvidia-vgpu/include/nvrm/ecc.h create mode 100644 drivers/vfio/pci/nvidia-vgpu/include/nvrm/vmmu.h diff --git a/drivers/vfio/pci/nvidia-vgpu/debug.h b/drivers/vfio/pci/nvidia-vgpu/debug.h index 7cf92c9060ae..db9288752384 100644 --- a/drivers/vfio/pci/nvidia-vgpu/debug.h +++ b/drivers/vfio/pci/nvidia-vgpu/debug.h @@ -17,4 +17,9 @@ pci_dbg(__v->pdev, "nvidia-vgpu %d: "f, __v->info.id, ##a); \ }) +#define vgpu_error(v, f, a...) 
({ \ + typeof(v) __v = (v); \ + pci_err(__v->pdev, "nvidia-vgpu %d: "f, __v->info.id, ##a); \ +}) + #endif diff --git a/drivers/vfio/pci/nvidia-vgpu/include/nvrm/ecc.h b/drivers/vfio/pci/nvidia-vgpu/include/nvrm/ecc.h new file mode 100644 index 000000000000..d2a8316a0f12 --- /dev/null +++ b/drivers/vfio/pci/nvidia-vgpu/include/nvrm/ecc.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2025 NVIDIA Corporation + */ +#ifndef __NVRM_ECC_H__ +#define __NVRM_ECC_H__ + +#include <nvrm/nvtypes.h> + +/* Excerpt of RM headers from https://github.com/NVIDIA/open-gpu-kernel-modules/tree/570.124.04 */ + +typedef struct NV2080_CTRL_GPU_QUERY_ECC_EXCEPTION_STATUS { + NV_DECLARE_ALIGNED(NvU64 count, 8); +} NV2080_CTRL_GPU_QUERY_ECC_EXCEPTION_STATUS; + +typedef struct NV2080_CTRL_GPU_QUERY_ECC_UNIT_STATUS { + NvBool enabled; + NvBool scrubComplete; + NvBool supported; + NV_DECLARE_ALIGNED(NV2080_CTRL_GPU_QUERY_ECC_EXCEPTION_STATUS dbe, 8); + NV_DECLARE_ALIGNED(NV2080_CTRL_GPU_QUERY_ECC_EXCEPTION_STATUS dbeNonResettable, 8); + NV_DECLARE_ALIGNED(NV2080_CTRL_GPU_QUERY_ECC_EXCEPTION_STATUS sbe, 8); + NV_DECLARE_ALIGNED(NV2080_CTRL_GPU_QUERY_ECC_EXCEPTION_STATUS sbeNonResettable, 8); +} NV2080_CTRL_GPU_QUERY_ECC_UNIT_STATUS; + +typedef struct NV0080_CTRL_GR_ROUTE_INFO { + NvU32 flags; + NV_DECLARE_ALIGNED(NvU64 route, 8); +} NV0080_CTRL_GR_ROUTE_INFO; + +typedef NV0080_CTRL_GR_ROUTE_INFO NV2080_CTRL_GR_ROUTE_INFO; + +#define NV2080_CTRL_GPU_ECC_UNIT_COUNT (0x00000024U) + +#define NV2080_CTRL_CMD_GPU_QUERY_ECC_STATUS (0x2080012fU) + +typedef struct NV2080_CTRL_GPU_QUERY_ECC_STATUS_PARAMS { + NV_DECLARE_ALIGNED(NV2080_CTRL_GPU_QUERY_ECC_UNIT_STATUS units[NV2080_CTRL_GPU_ECC_UNIT_COUNT], 8); + NvBool bFatalPoisonError; + NvU8 uncorrectableError; + NvU32 flags; + NV_DECLARE_ALIGNED(NV2080_CTRL_GR_ROUTE_INFO grRouteInfo, 8); +} NV2080_CTRL_GPU_QUERY_ECC_STATUS_PARAMS; + +#endif diff --git a/drivers/vfio/pci/nvidia-vgpu/include/nvrm/vmmu.h 
b/drivers/vfio/pci/nvidia-vgpu/include/nvrm/vmmu.h new file mode 100644 index 000000000000..fb1f100deac4 --- /dev/null +++ b/drivers/vfio/pci/nvidia-vgpu/include/nvrm/vmmu.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: MIT */ + +/* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. */ + +#ifndef __NVRM_VMMU_H__ +#define __NVRM_VMMU_H__ + +#include <nvrm/nvtypes.h> + +/* Excerpt of RM headers from https://github.com/NVIDIA/open-gpu-kernel-modules/tree/570.124.04 */ + +/* + * NV2080_CTRL_CMD_GPU_GET_VMMU_SEGMENT_SIZE + * + * This command returns the VMMU page size + * + * vmmuSegmentSize + * Output parameter. + * Returns the VMMU segment size (in bytes) + * + * Possible status values returned are: + * NV_OK + * NV_ERR_NOT_SUPPORTED + */ +#define NV2080_CTRL_CMD_GPU_GET_VMMU_SEGMENT_SIZE (0x2080017eU) /* finn: Evaluated from "(FINN_NV20_SUBDEVICE_0_GPU_INTERFACE_ID << 8) | NV2080_CTRL_GPU_GET_VMMU_SEGMENT_SIZE_PARAMS_MESSAGE_ID" */ + +#define NV2080_CTRL_GPU_GET_VMMU_SEGMENT_SIZE_PARAMS_MESSAGE_ID (0x7EU) + +typedef struct NV2080_CTRL_GPU_GET_VMMU_SEGMENT_SIZE_PARAMS { + NV_DECLARE_ALIGNED(NvU64 vmmuSegmentSize, 8); +} NV2080_CTRL_GPU_GET_VMMU_SEGMENT_SIZE_PARAMS; + +#define NV2080_CTRL_GPU_VMMU_SEGMENT_SIZE_32MB 0x02000000U +#define NV2080_CTRL_GPU_VMMU_SEGMENT_SIZE_64MB 0x04000000U +#define NV2080_CTRL_GPU_VMMU_SEGMENT_SIZE_128MB 0x08000000U +#define NV2080_CTRL_GPU_VMMU_SEGMENT_SIZE_256MB 0x10000000U +#define NV2080_CTRL_GPU_VMMU_SEGMENT_SIZE_512MB 0x20000000U + +#endif diff --git a/drivers/vfio/pci/nvidia-vgpu/pf.h b/drivers/vfio/pci/nvidia-vgpu/pf.h index b8008d8ee434..ce2728ce969b 100644 --- a/drivers/vfio/pci/nvidia-vgpu/pf.h +++ b/drivers/vfio/pci/nvidia-vgpu/pf.h @@ -95,4 +95,12 @@ static inline int nvidia_vgpu_mgr_init_handle(struct pci_dev *pdev, __m->handle.ops->free_chids(__m->handle.pf_drvdata, o, s); \ }) +#define nvidia_vgpu_mgr_alloc_fbmem(m, info) ({\ + typeof(m) __m = (m); \ + __m->handle.ops->alloc_fbmem(__m->handle.pf_drvdata, info); \ 
+}) + +#define nvidia_vgpu_mgr_free_fbmem(m, h) \ + ((m)->handle.ops->free_fbmem(h)) + #endif diff --git a/drivers/vfio/pci/nvidia-vgpu/vgpu.c b/drivers/vfio/pci/nvidia-vgpu/vgpu.c index 52b946469043..7025c7e2b9ac 100644 --- a/drivers/vfio/pci/nvidia-vgpu/vgpu.c +++ b/drivers/vfio/pci/nvidia-vgpu/vgpu.c @@ -105,7 +105,70 @@ static int setup_chids(struct nvidia_vgpu *vgpu) vgpu_debug(vgpu, "alloc guest channel offset %u size %u\n", chid->chid_offset, chid->num_chid); + return 0; +} + +static void clean_fbmem_heap(struct nvidia_vgpu *vgpu) +{ + struct nvidia_vgpu_mgr *vgpu_mgr = vgpu->vgpu_mgr; + + vgpu_debug(vgpu, "free guest FB memory, offset 0x%llx size 0x%llx\n", + vgpu->fbmem_heap->addr, vgpu->fbmem_heap->size); + + nvidia_vgpu_mgr_free_fbmem(vgpu_mgr, vgpu->fbmem_heap); + vgpu->fbmem_heap = NULL; +} + +static int get_alloc_fbmem_size(struct nvidia_vgpu *vgpu, u64 *size) +{ + struct nvidia_vgpu_mgr *vgpu_mgr = vgpu->vgpu_mgr; + struct nvidia_vgpu_info *info = &vgpu->info; + struct nvidia_vgpu_type *type = info->vgpu_type; + u64 fb_length; + + if (!vgpu_mgr->ecc_enabled) { + *size = type->fb_length; + return 0; + } + + if (!info->vgpu_type->ecc_supported) { + vgpu_error(vgpu, "ECC is enabled. vGPU type %s doesn't support ECC!\n", + type->vgpu_type_name); + return -ENODEV; + } + /* Re-calculate the FB memory length when ECC is enabled. 
*/ + fb_length = ALIGN(vgpu_mgr->total_fbmem_size, vgpu_mgr->vmmu_segment_size); + fb_length = fb_length / type->max_instance - type->fb_reservation - type->gsp_heap_size; + fb_length = min(type->fb_length, fb_length); + fb_length = ALIGN_DOWN(fb_length, vgpu_mgr->vmmu_segment_size); + + *size = fb_length; + return 0; +} + +static int setup_fbmem_heap(struct nvidia_vgpu *vgpu) +{ + struct nvidia_vgpu_mgr *vgpu_mgr = vgpu->vgpu_mgr; + struct nvidia_vgpu_alloc_fbmem_info info = {0}; + struct nvidia_vgpu_mem *mem; + int ret; + + ret = get_alloc_fbmem_size(vgpu, &info.size); + if (ret) + return ret; + + info.align = vgpu_mgr->vmmu_segment_size; + + vgpu_debug(vgpu, "alloc guest FB memory, size 0x%llx\n", info.size); + + mem = nvidia_vgpu_mgr_alloc_fbmem(vgpu_mgr, &info); + if (IS_ERR(mem)) + return PTR_ERR(mem); + + vgpu_debug(vgpu, "guest FB memory offset 0x%llx size 0x%llx\n", mem->addr, mem->size); + + vgpu->fbmem_heap = mem; return 0; } @@ -120,6 +183,7 @@ int nvidia_vgpu_mgr_destroy_vgpu(struct nvidia_vgpu *vgpu) if (!atomic_cmpxchg(&vgpu->status, 1, 0)) return -ENODEV; + clean_fbmem_heap(vgpu); clean_chids(vgpu); unregister_vgpu(vgpu); @@ -164,12 +228,18 @@ int nvidia_vgpu_mgr_create_vgpu(struct nvidia_vgpu *vgpu) if (ret) goto err_setup_chids; + ret = setup_fbmem_heap(vgpu); + if (ret) + goto err_setup_fbmem_heap; + atomic_set(&vgpu->status, 1); vgpu_debug(vgpu, "created\n"); return 0; +err_setup_fbmem_heap: + clean_chids(vgpu); err_setup_chids: unregister_vgpu(vgpu); diff --git a/drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.c b/drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.c index 8565bb881fda..e8b670308b21 100644 --- a/drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.c +++ b/drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.c @@ -6,6 +6,9 @@ #include "debug.h" #include "vgpu_mgr.h" +#include <nvrm/vmmu.h> +#include <nvrm/ecc.h> + static void clean_vgpu_mgr(struct nvidia_vgpu_mgr *vgpu_mgr) { if (vgpu_mgr->use_chid_alloc_bitmap) { @@ -104,6 +107,39 @@ static void attach_vgpu_mgr(struct 
nvidia_vgpu_mgr *vgpu_mgr, handle_data->vfio.pf_detach_handle_fn = pf_detach_handle_fn; } +static int get_vmmu_segment_size(struct nvidia_vgpu_mgr *vgpu_mgr) +{ + NV2080_CTRL_GPU_GET_VMMU_SEGMENT_SIZE_PARAMS *ctrl; + + ctrl = nvidia_vgpu_mgr_rm_ctrl_rd(vgpu_mgr, &vgpu_mgr->gsp_client, + NV2080_CTRL_CMD_GPU_GET_VMMU_SEGMENT_SIZE, + sizeof(*ctrl)); + if (IS_ERR(ctrl)) + return PTR_ERR(ctrl); + + vgpu_mgr->vmmu_segment_size = ctrl->vmmuSegmentSize; + + nvidia_vgpu_mgr_rm_ctrl_done(vgpu_mgr, &vgpu_mgr->gsp_client, ctrl); + + return 0; +} + +static int get_ecc_status(struct nvidia_vgpu_mgr *vgpu_mgr) +{ + NV2080_CTRL_GPU_QUERY_ECC_STATUS_PARAMS *ctrl; + + ctrl = nvidia_vgpu_mgr_rm_ctrl_rd(vgpu_mgr, &vgpu_mgr->gsp_client, + NV2080_CTRL_CMD_GPU_QUERY_ECC_STATUS, + sizeof(*ctrl)); + if (IS_ERR(ctrl)) + return PTR_ERR(ctrl); + + vgpu_mgr->ecc_enabled = ctrl->units[0].enabled; + + nvidia_vgpu_mgr_rm_ctrl_done(vgpu_mgr, &vgpu_mgr->gsp_client, ctrl); + return 0; +} + static int setup_chid_alloc_bitmap(struct nvidia_vgpu_mgr *vgpu_mgr) { if (WARN_ON(!vgpu_mgr->use_chid_alloc_bitmap)) @@ -120,11 +156,27 @@ static int setup_chid_alloc_bitmap(struct nvidia_vgpu_mgr *vgpu_mgr) static int init_vgpu_mgr(struct nvidia_vgpu_mgr *vgpu_mgr) { + int ret; + + ret = get_vmmu_segment_size(vgpu_mgr); + if (ret) + return ret; + + ret = get_ecc_status(vgpu_mgr); + if (ret) + return ret; + + vgpu_mgr_debug(vgpu_mgr, "[GSP RM] VMMU segment size: 0x%llx\n", + vgpu_mgr->vmmu_segment_size); + vgpu_mgr_debug(vgpu_mgr, "[GSP RM] ECC enabled: %d\n", vgpu_mgr->ecc_enabled); + vgpu_mgr->total_avail_chids = nvidia_vgpu_mgr_get_avail_chids(vgpu_mgr); vgpu_mgr->total_fbmem_size = nvidia_vgpu_mgr_get_total_fbmem_size(vgpu_mgr); - vgpu_mgr_debug(vgpu_mgr, "total avail chids %u\n", vgpu_mgr->total_avail_chids); - vgpu_mgr_debug(vgpu_mgr, "total fbmem size 0x%llx\n", vgpu_mgr->total_fbmem_size); + vgpu_mgr_debug(vgpu_mgr, "[core driver] total avail chids %u\n", + vgpu_mgr->total_avail_chids); + 
vgpu_mgr_debug(vgpu_mgr, "[core driver] total fbmem size 0x%llx\n", + vgpu_mgr->total_fbmem_size); return vgpu_mgr->use_chid_alloc_bitmap ? setup_chid_alloc_bitmap(vgpu_mgr) : 0; } diff --git a/drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.h b/drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.h index 5a7a6103a677..356779404cc2 100644 --- a/drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.h +++ b/drivers/vfio/pci/nvidia-vgpu/vgpu_mgr.h @@ -59,6 +59,7 @@ struct nvidia_vgpu_chid { * @info: vGPU info * @vgpu_mgr: pointer to vGPU manager * @chid: vGPU channel IDs + * @fbmem_heap: allocated FB memory for the vGPU */ struct nvidia_vgpu { /* Per-vGPU lock */ @@ -71,6 +72,7 @@ struct nvidia_vgpu { struct nvidia_vgpu_mgr *vgpu_mgr; struct nvidia_vgpu_chid chid; + struct nvidia_vgpu_mem *fbmem_heap; }; /** @@ -80,6 +82,8 @@ struct nvidia_vgpu { * @handle: the driver handle * @total_avail_chids: total available channel IDs * @total_fbmem_size: total FB memory size + * @vmmu_segment_size: VMMU segment size + * @ecc_enabled: ECC is enabled in the GPU * @vgpu_major: vGPU major version * @vgpu_minor: vGPU minor version * @vgpu_list_lock: lock to protect vGPU list @@ -99,6 +103,10 @@ struct nvidia_vgpu_mgr { u32 total_avail_chids; u64 total_fbmem_size; + /* GSP RM configurations */ + u64 vmmu_segment_size; + bool ecc_enabled; + u64 vgpu_major; u64 vgpu_minor; -- 2.34.1