Zhi Wang
2024-Sep-22 12:49 UTC
[RFC 01/29] nvkm/vgpu: introduce NVIDIA vGPU support prelude
NVIDIA GPU virtualization is a technology that allows multiple virtual machines (VMs) to share the power of a single GPU, enabling greater flexibility, efficiency, and cost-effectiveness in data centers and cloud environments. The first step of supporting NVIDIA vGPU in nvkm is to introduce the necessary vGPU data structures and functions to hook into the (de)initialization path of nvkm. Introduce NVIDIA vGPU data structures and functions hooking into the (de)initialization path of nvkm and support the following patches. Cc: Neo Jia <cjia at nvidia.com> Cc: Surath Mitra <smitra at nvidia.com> Signed-off-by: Zhi Wang <zhiw at nvidia.com> --- .../drm/nouveau/include/nvkm/core/device.h | 3 + .../nouveau/include/nvkm/vgpu_mgr/vgpu_mgr.h | 17 +++++ drivers/gpu/drm/nouveau/nvkm/Kbuild | 1 + drivers/gpu/drm/nouveau/nvkm/device/pci.c | 19 +++-- drivers/gpu/drm/nouveau/nvkm/vgpu_mgr/Kbuild | 2 + .../gpu/drm/nouveau/nvkm/vgpu_mgr/vgpu_mgr.c | 76 +++++++++++++++++++ 6 files changed, 112 insertions(+), 6 deletions(-) create mode 100644 drivers/gpu/drm/nouveau/include/nvkm/vgpu_mgr/vgpu_mgr.h create mode 100644 drivers/gpu/drm/nouveau/nvkm/vgpu_mgr/Kbuild create mode 100644 drivers/gpu/drm/nouveau/nvkm/vgpu_mgr/vgpu_mgr.c diff --git a/drivers/gpu/drm/nouveau/include/nvkm/core/device.h b/drivers/gpu/drm/nouveau/include/nvkm/core/device.h index fef8ca74968d..497c52f51593 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/core/device.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/core/device.h @@ -3,6 +3,7 @@ #define __NVKM_DEVICE_H__ #include <core/oclass.h> #include <core/intr.h> +#include <vgpu_mgr/vgpu_mgr.h> enum nvkm_subdev_type; #include <linux/auxiliary_bus.h> @@ -80,6 +81,8 @@ struct nvkm_device { bool legacy_done; } intr; + struct nvkm_vgpu_mgr vgpu_mgr; + struct auxiliary_device auxdev; const struct nvif_driver_func *driver; }; diff --git a/drivers/gpu/drm/nouveau/include/nvkm/vgpu_mgr/vgpu_mgr.h b/drivers/gpu/drm/nouveau/include/nvkm/vgpu_mgr/vgpu_mgr.h new file mode 
100644 index 000000000000..3163fff1085b --- /dev/null +++ b/drivers/gpu/drm/nouveau/include/nvkm/vgpu_mgr/vgpu_mgr.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: MIT */ +#ifndef __NVKM_VGPU_MGR_H__ +#define __NVKM_VGPU_MGR_H__ + +#define NVIDIA_MAX_VGPUS 2 + +struct nvkm_vgpu_mgr { + bool enabled; + struct nvkm_device *nvkm_dev; +}; + +bool nvkm_vgpu_mgr_is_supported(struct nvkm_device *device); +bool nvkm_vgpu_mgr_is_enabled(struct nvkm_device *device); +int nvkm_vgpu_mgr_init(struct nvkm_device *device); +void nvkm_vgpu_mgr_fini(struct nvkm_device *device); + +#endif diff --git a/drivers/gpu/drm/nouveau/nvkm/Kbuild b/drivers/gpu/drm/nouveau/nvkm/Kbuild index 9e1a6ab937e1..d310467487c1 100644 --- a/drivers/gpu/drm/nouveau/nvkm/Kbuild +++ b/drivers/gpu/drm/nouveau/nvkm/Kbuild @@ -8,3 +8,4 @@ include $(src)/nvkm/device/Kbuild include $(src)/nvkm/falcon/Kbuild include $(src)/nvkm/subdev/Kbuild include $(src)/nvkm/engine/Kbuild +include $(src)/nvkm/vgpu_mgr/Kbuild diff --git a/drivers/gpu/drm/nouveau/nvkm/device/pci.c b/drivers/gpu/drm/nouveau/nvkm/device/pci.c index b8d2125a9f59..1543902b20e9 100644 --- a/drivers/gpu/drm/nouveau/nvkm/device/pci.c +++ b/drivers/gpu/drm/nouveau/nvkm/device/pci.c @@ -1688,6 +1688,9 @@ nvkm_device_pci_remove(struct pci_dev *dev) { struct nvkm_device *device = pci_get_drvdata(dev); + if (nvkm_vgpu_mgr_is_enabled(device)) + nvkm_vgpu_mgr_fini(device); + if (device->runpm) { pm_runtime_get_sync(device->dev); pm_runtime_forbid(device->dev); @@ -1835,12 +1838,6 @@ nvkm_device_pci_probe(struct pci_dev *pci_dev, const struct pci_device_id *id) } quirk_broken_nv_runpm(pdev); -done: - if (ret) { - nvkm_device_del(&device); - return ret; - } - pci_set_drvdata(pci_dev, &pdev->device); if (nvkm_runpm) { @@ -1852,12 +1849,22 @@ nvkm_device_pci_probe(struct pci_dev *pci_dev, const struct pci_device_id *id) } } + if (nvkm_vgpu_mgr_is_supported(device)) { + ret = nvkm_vgpu_mgr_init(&pdev->device); + if (ret) + goto done; + } + if (device->runpm) { 
pm_runtime_allow(device->dev); pm_runtime_put(device->dev); } return 0; + +done: + nvkm_device_del(&device); + return ret; } static struct pci_device_id diff --git a/drivers/gpu/drm/nouveau/nvkm/vgpu_mgr/Kbuild b/drivers/gpu/drm/nouveau/nvkm/vgpu_mgr/Kbuild new file mode 100644 index 000000000000..244e967d4edc --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/vgpu_mgr/Kbuild @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: MIT +nvkm-y += nvkm/vgpu_mgr/vgpu_mgr.o diff --git a/drivers/gpu/drm/nouveau/nvkm/vgpu_mgr/vgpu_mgr.c b/drivers/gpu/drm/nouveau/nvkm/vgpu_mgr/vgpu_mgr.c new file mode 100644 index 000000000000..a506414e5ba2 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/vgpu_mgr/vgpu_mgr.c @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: MIT */ +#include <core/device.h> +#include <core/pci.h> +#include <vgpu_mgr/vgpu_mgr.h> + +static bool support_vgpu_mgr = false; +module_param_named(support_vgpu_mgr, support_vgpu_mgr, bool, 0400); + +static inline struct pci_dev *nvkm_to_pdev(struct nvkm_device *device) +{ + struct nvkm_device_pci *pci = container_of(device, typeof(*pci), + device); + + return pci->pdev; +} + +/** + * nvkm_vgpu_mgr_is_supported - check if a platform support vGPU + * @device: the nvkm_device pointer + * + * Returns: true on supported platform which is newer than ADA Lovelace + * with SRIOV support. + */ +bool nvkm_vgpu_mgr_is_supported(struct nvkm_device *device) +{ + struct pci_dev *pdev = nvkm_to_pdev(device); + + if (!support_vgpu_mgr) + return false; + + return device->card_type == AD100 && pci_sriov_get_totalvfs(pdev); +} + +/** + * nvkm_vgpu_mgr_is_enabled - check if vGPU support is enabled on a PF + * @device: the nvkm_device pointer + * + * Returns: true if vGPU enabled. 
+ */ +bool nvkm_vgpu_mgr_is_enabled(struct nvkm_device *device) +{ + return device->vgpu_mgr.enabled; +} + +/** + * nvkm_vgpu_mgr_init - Initialize the vGPU manager support + * @device: the nvkm_device pointer + * + * Returns: 0 on success, -ENODEV on platforms that are not supported. + */ +int nvkm_vgpu_mgr_init(struct nvkm_device *device) +{ + struct nvkm_vgpu_mgr *vgpu_mgr = &device->vgpu_mgr; + + if (!nvkm_vgpu_mgr_is_supported(device)) + return -ENODEV; + + vgpu_mgr->nvkm_dev = device; + vgpu_mgr->enabled = true; + + pci_info(nvkm_to_pdev(device), + "NVIDIA vGPU mananger support is enabled.\n"); + + return 0; +} + +/** + * nvkm_vgpu_mgr_fini - De-initialize the vGPU manager support + * @device: the nvkm_device pointer + */ +void nvkm_vgpu_mgr_fini(struct nvkm_device *device) +{ + struct nvkm_vgpu_mgr *vgpu_mgr = &device->vgpu_mgr; + + vgpu_mgr->enabled = false; +} -- 2.34.1
Greg KH
2024-Sep-26 09:20 UTC
[RFC 01/29] nvkm/vgpu: introduce NVIDIA vGPU support prelude
On Sun, Sep 22, 2024 at 05:49:23AM -0700, Zhi Wang wrote: > NVIDIA GPU virtualization is a technology that allows multiple virtual > machines (VMs) to share the power of a single GPU, enabling greater > flexibility, efficiency, and cost-effectiveness in data centers and cloud > environments. > > The first step of supporting NVIDIA vGPU in nvkm is to introduce the > necessary vGPU data structures and functions to hook into the > (de)initialization path of nvkm. > > Introduce NVIDIA vGPU data structures and functions hooking into the > the (de)initialization path of nvkm and support the following patches. > > Cc: Neo Jia <cjia at nvidia.com> > Cc: Surath Mitra <smitra at nvidia.com> > Signed-off-by: Zhi Wang <zhiw at nvidia.com> Some minor comments that are a hint you all aren't running checkpatch on your code... > --- /dev/null > +++ b/drivers/gpu/drm/nouveau/include/nvkm/vgpu_mgr/vgpu_mgr.h > @@ -0,0 +1,17 @@ > +/* SPDX-License-Identifier: MIT */ Wait, what? Why? Ick. You all also forgot the copyright line :( > --- /dev/null > +++ b/drivers/gpu/drm/nouveau/nvkm/vgpu_mgr/vgpu_mgr.c > @@ -0,0 +1,76 @@ > +/* SPDX-License-Identifier: MIT */ > +#include <core/device.h> > +#include <core/pci.h> > +#include <vgpu_mgr/vgpu_mgr.h> > + > +static bool support_vgpu_mgr = false; A global variable for the whole system? Are you sure that will work well over time? Why isn't this a per-device thing? > +module_param_named(support_vgpu_mgr, support_vgpu_mgr, bool, 0400); This is not the 1990's, please never add new module parameters, use per-device variables. And no documentation? 
That's not ok either even if you did want to have this. > +static inline struct pci_dev *nvkm_to_pdev(struct nvkm_device *device) > +{ > + struct nvkm_device_pci *pci = container_of(device, typeof(*pci), > + device); > + > + return pci->pdev; > +} > + > +/** > + * nvkm_vgpu_mgr_is_supported - check if a platform support vGPU > + * @device: the nvkm_device pointer > + * > + * Returns: true on supported platform which is newer than ADA Lovelace > + * with SRIOV support. > + */ > +bool nvkm_vgpu_mgr_is_supported(struct nvkm_device *device) > +{ > + struct pci_dev *pdev = nvkm_to_pdev(device); > + > + if (!support_vgpu_mgr) > + return false; > + > + return device->card_type == AD100 && pci_sriov_get_totalvfs(pdev); checkpatch please. And "AD100" is an odd #define, as you know. > +} > + > +/** > + * nvkm_vgpu_mgr_is_enabled - check if vGPU support is enabled on a PF > + * @device: the nvkm_device pointer > + * > + * Returns: true if vGPU enabled. > + */ > +bool nvkm_vgpu_mgr_is_enabled(struct nvkm_device *device) > +{ > + return device->vgpu_mgr.enabled; What happens if this changes right after you look at it? > +} > + > +/** > + * nvkm_vgpu_mgr_init - Initialize the vGPU manager support > + * @device: the nvkm_device pointer > + * > + * Returns: 0 on success, -ENODEV on platforms that are not supported. > + */ > +int nvkm_vgpu_mgr_init(struct nvkm_device *device) > +{ > + struct nvkm_vgpu_mgr *vgpu_mgr = &device->vgpu_mgr; > + > + if (!nvkm_vgpu_mgr_is_supported(device)) > + return -ENODEV; > + > + vgpu_mgr->nvkm_dev = device; > + vgpu_mgr->enabled = true; > + > + pci_info(nvkm_to_pdev(device), > + "NVIDIA vGPU mananger support is enabled.\n"); When drivers work properly, they are quiet. Why can't you see this all in the sysfs tree instead to know if support is there or not? You all are properly tying in your "sub driver" logic to the driver model, right? (hint, I don't think so as it looks like that isn't happening, but I could be missing it...) thanks, greg k-h