Update guide to latest 510 driver

This commit is contained in:
PolloLoco 2022-03-25 12:50:50 +01:00
parent c255d59ca2
commit 845180dfcd
5 changed files with 501 additions and 855 deletions

View File

@ -1,452 +0,0 @@
--- ./kernel/Kbuild
+++ ./kernel/Kbuild
@@ -72,6 +72,7 @@ EXTRA_CFLAGS += -Wall -MD $(DEFINES) $(INCLUDES) -Wno-cast-qual -Wno-error -Wno-
EXTRA_CFLAGS += -D__KERNEL__ -DMODULE -DNVRM -DNV_VERSION_STRING=\"450.156\" -Wno-unused-function -Wuninitialized -fno-strict-aliasing -mno-red-zone -mcmodel=kernel -DNV_UVM_ENABLE
EXTRA_CFLAGS += $(call cc-option,-Werror=undef,)
EXTRA_CFLAGS += -DNV_SPECTRE_V2=$(NV_SPECTRE_V2)
+EXTRA_CFLAGS += -DNV_KERNEL_INTERFACE_LAYER -Wfatal-errors
#
# Detect SGI UV systems and apply system-specific optimizations.
--- ./kernel/conftest.sh
+++ ./kernel/conftest.sh
@@ -4576,7 +4576,7 @@ case "$5" in
#
VERBOSE=$6
iommu=CONFIG_VFIO_IOMMU_TYPE1
- mdev=CONFIG_VFIO_MDEV_DEVICE
+ mdev=CONFIG_VFIO_MDEV
kvm=CONFIG_KVM_VFIO
VFIO_IOMMU_PRESENT=0
VFIO_MDEV_DEVICE_PRESENT=0
--- ./kernel/nvidia-vgpu-vfio/nvidia-vgpu-vfio.c
+++ ./kernel/nvidia-vgpu-vfio/nvidia-vgpu-vfio.c
@@ -24,6 +24,10 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/err.h>
+#include <linux/eventfd.h>
+#include <uapi/linux/uuid.h>
+#include <linux/device.h>
+#include <linux/mdev.h>
#include "nvstatus.h"
#include "nv-misc.h"
#include "nv-linux.h"
@@ -37,6 +41,25 @@
struct vgpu_devs vgpu_devices;
struct phys_devs phys_devices;
+struct mdev_parent {
+ struct device *dev;
+ const struct mdev_parent_ops *ops;
+ struct kref ref;
+ struct list_head next;
+ struct kset *mdev_types_kset;
+ struct list_head type_list;
+ /* Synchronize device creation/removal with parent unregistration */
+ struct rw_semaphore unreg_sem;
+};
+
+struct mdev_type {
+ struct kobject kobj;
+ struct kobject *devices_kobj;
+ struct mdev_parent *parent;
+ struct list_head next;
+ unsigned int type_group_id;
+};
+
#define SLEEP_TIME_MILLISECONDS 20
#define VGPU_EXIT_TIMEOUT_MILLISECONDS 5000
#define WAITQUEUE_TIMEOUT_SECONDS 25000
@@ -162,8 +185,8 @@ struct parent_ops vgpu_fops = {
.remove = nv_vgpu_vfio_destroy,
.read = nv_vgpu_vfio_read,
.write = nv_vgpu_vfio_write,
- .open = nv_vgpu_vfio_open,
- .release = nv_vgpu_vfio_close,
+ .open_device = nv_vgpu_vfio_open,
+ .close_device = nv_vgpu_vfio_close,
.ioctl = nv_vgpu_vfio_ioctl,
.mmap = nv_vgpu_vfio_mmap,
};
@@ -368,9 +391,9 @@ static NV_STATUS nv_get_vgpu_type_id(const char *kobj_name, struct device *dev,
return NV_OK;
}
-static ssize_t name_show(struct kobject *kobj, struct device *dev, char *buf)
+static ssize_t name_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf)
{
- struct pci_dev *pdev = to_pci_dev(dev);
+ struct pci_dev *pdev = to_pci_dev(mtype->parent->dev);
struct pci_dev *parent_device;
NvU32 vgpu_type_id;
NV_STATUS status;
@@ -381,7 +404,7 @@ static ssize_t name_show(struct kobject *kobj, struct device *dev, char *buf)
parent_device = pdev;
- if (nv_get_vgpu_type_id(kobj->name, dev, &vgpu_type_id)
+ if (nv_get_vgpu_type_id(mtype->kobj.name, mtype->parent->dev, &vgpu_type_id)
== NV_OK)
status = rm_vgpu_vfio_ops.get_name(parent_device, vgpu_type_id, buf);
else
@@ -394,9 +417,9 @@ static ssize_t name_show(struct kobject *kobj, struct device *dev, char *buf)
}
MDEV_TYPE_ATTR_RO(name);
-static ssize_t description_show(struct kobject *kobj, struct device *dev, char *buf)
+static ssize_t description_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf)
{
- struct pci_dev *pdev = to_pci_dev(dev);
+ struct pci_dev *pdev = to_pci_dev(mtype->parent->dev);
struct pci_dev *parent_device;
NvU32 vgpu_type_id;
NV_STATUS status;
@@ -407,7 +430,7 @@ static ssize_t description_show(struct kobject *kobj, struct device *dev, char *
parent_device = pdev;
- if (nv_get_vgpu_type_id(kobj->name, dev, &vgpu_type_id)
+ if (nv_get_vgpu_type_id(mtype->kobj.name, mtype->parent->dev, &vgpu_type_id)
== NV_OK)
status = rm_vgpu_vfio_ops.get_description(parent_device, vgpu_type_id, buf);
else
@@ -420,13 +443,13 @@ static ssize_t description_show(struct kobject *kobj, struct device *dev, char *
}
MDEV_TYPE_ATTR_RO(description);
-static ssize_t available_instances_show(struct kobject *kobj, struct device *dev, char *buf)
+static ssize_t available_instances_show(struct mdev_type *t, struct mdev_type_attribute *ta, char *buf)
{
- struct pci_dev *pdev = to_pci_dev(dev);
+ struct pci_dev *pdev = to_pci_dev(t->parent->dev);
NvU32 vgpu_type_id;
NV_STATUS status;
- if ((nv_get_vgpu_type_id(kobj->name, dev, &vgpu_type_id)) == NV_OK)
+ if ((nv_get_vgpu_type_id(t->kobj.name, t->parent->dev, &vgpu_type_id)) == NV_OK)
status = rm_vgpu_vfio_ops.get_instances(pdev, vgpu_type_id, buf);
else
return -EINVAL;
@@ -438,8 +461,7 @@ static ssize_t available_instances_show(struct kobject *kobj, struct device *dev
}
MDEV_TYPE_ATTR_RO(available_instances);
-static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
- char *buf)
+static ssize_t device_api_show(struct mdev_type *t, struct mdev_type_attribute *ta, char *buf)
{
return sprintf(buf, "%s\n",
VFIO_DEVICE_API_PCI_STRING);
@@ -534,7 +556,7 @@ destroy_exit:
return ret;
}
-static int nv_vgpu_vfio_create(struct kobject *kobj, struct mdev_device *mdev)
+static int nv_vgpu_vfio_create(struct mdev_device *mdev)
{
NV_STATUS status = NV_OK;
vgpu_dev_t *vgpu_dev = NULL;
@@ -556,7 +578,7 @@ static int nv_vgpu_vfio_create(struct kobject *kobj, struct mdev_device *mdev)
if (!pdev)
return -EINVAL;
- if (nv_get_vgpu_type_id(kobj->name, NV_GET_MDEV_PARENT(mdev), &vgpu_type_id)
+ if (nv_get_vgpu_type_id(mdev->type->kobj.name, NV_GET_MDEV_PARENT(mdev), &vgpu_type_id)
!= NV_OK)
{
ret = -EINVAL;
@@ -631,12 +653,7 @@ static int nv_vgpu_vfio_create(struct kobject *kobj, struct mdev_device *mdev)
if (pdev->is_virtfn)
{
#if defined(NV_MDEV_SET_IOMMU_DEVICE_PRESENT)
- ret = mdev_set_iommu_device(NV_GET_MDEV_DEV(mdev), NV_GET_MDEV_PARENT(mdev));
- if (ret != 0)
- {
- NV_VGPU_DEV_LOG(VGPU_ERR, mdev, "Failed to set IOMMU device. ret: %d \n", ret);
- goto remove_vgpu;
- }
+ mdev_set_iommu_device(mdev, NV_GET_MDEV_PARENT(mdev));
#endif
}
@@ -2479,19 +2496,18 @@ invalidate_exit:
static int vgpu_save_fd(vgpu_dev_t *vgpu_dev, int fd, NvU32 index)
{
- struct fd irqfd;
+ struct eventfd_ctx *evt;
- irqfd = fdget(fd);
- if (!irqfd.file)
- return -EBADF;
+ evt = eventfd_ctx_fdget(fd);
+ if (IS_ERR(evt))
+ return PTR_ERR(evt);
if (index == VFIO_PCI_INTX_IRQ_INDEX)
- vgpu_dev->intr_info.intx_file = irqfd.file;
- else if (index == VFIO_PCI_MSI_IRQ_INDEX)
- vgpu_dev->intr_info.msi_file = irqfd.file;
+ vgpu_dev->intr_info.intx_evtfd = evt;
+ else if (index == VFIO_PCI_MSI_IRQ_INDEX)
+ vgpu_dev->intr_info.msi_evtfd = evt;
vgpu_dev->intr_info.index = index;
- fdput(irqfd);
return 0;
}
@@ -2500,11 +2516,8 @@ static int vgpu_save_fd(vgpu_dev_t *vgpu_dev, int fd, NvU32 index)
static irqreturn_t vgpu_msix_handler(int irq, void *arg)
{
vgpu_dev_t *vgpu_dev = (vgpu_dev_t *)arg;
- struct file *pfile = NULL;
- mm_segment_t old_fs;
- NvU64 val = 1;
+ struct eventfd_ctx *evt = NULL;
int ret = 0;
- loff_t offset = 0;
int i;
unsigned long eflags;
@@ -2512,21 +2525,16 @@ static irqreturn_t vgpu_msix_handler(int irq, void *arg)
{
if (vgpu_dev->intr_info.allocated_irq[i] == irq)
{
- pfile = vgpu_dev->intr_info.msix_fd[i].file;
+ evt = vgpu_dev->intr_info.msix_evtfd[i];
break;
}
}
- if (pfile && pfile->f_op && pfile->f_op->write)
+ if (evt)
{
- old_fs = get_fs();
- set_fs(KERNEL_DS);
-
NV_SAVE_FLAGS(eflags);
- ret = pfile->f_op->write(pfile, (char *)&val, sizeof(val), &offset);
+ ret = eventfd_signal(evt, 1);
NV_RESTORE_FLAGS(eflags);
-
- set_fs(old_fs);
}
return IRQ_HANDLED;
@@ -2537,23 +2545,24 @@ static int vgpu_msix_set_vector_signal(vgpu_dev_t *vgpu_dev,
{
struct pci_dev *pdev;
int irq = INVALID_IRQ, ret;
- struct fd irqfd;
+ struct eventfd_ctx *evt;
pdev = to_pci_dev(NV_GET_MDEV_PARENT(vgpu_dev->mdev));
- if (vgpu_dev->intr_info.msix_fd[vector].file)
+ if (vgpu_dev->intr_info.msix_evtfd[vector])
{
free_irq(vgpu_dev->intr_info.allocated_irq[vector], vgpu_dev);
- vgpu_dev->intr_info.msix_fd[vector].file = NULL;
+ eventfd_ctx_put(vgpu_dev->intr_info.msix_evtfd[vector]);
+ vgpu_dev->intr_info.msix_evtfd[vector] = NULL;
vgpu_dev->intr_info.allocated_irq[vector] = INVALID_IRQ;
}
if (fd < 0)
return 0;
- irqfd = fdget(fd);
- if (!irqfd.file)
- return -EBADF;
+ evt = eventfd_ctx_fdget(fd);
+ if (IS_ERR(evt))
+ return PTR_ERR(evt);
if (vector < 0 || vector >= vgpu_dev->intr_info.num_ctx)
return -EINVAL;
@@ -2569,7 +2578,7 @@ static int vgpu_msix_set_vector_signal(vgpu_dev_t *vgpu_dev,
vgpu_dev->intr_info.allocated_irq[vector] = irq;
- vgpu_dev->intr_info.msix_fd[vector]= irqfd;
+ vgpu_dev->intr_info.msix_evtfd[vector]= evt;
return 0;
}
@@ -2586,7 +2595,12 @@ static void vgpu_msix_disable(vgpu_dev_t *vgpu_dev)
if (vgpu_dev->intr_info.allocated_irq[i] != INVALID_IRQ)
{
free_irq(vgpu_dev->intr_info.allocated_irq[i], vgpu_dev);
- vgpu_dev->intr_info.msix_fd[i].file = NULL;
+
+ if (vgpu_dev->intr_info.msix_evtfd[i]) {
+ eventfd_ctx_put(vgpu_dev->intr_info.msix_evtfd[i]);
+ vgpu_dev->intr_info.msix_evtfd[i] = NULL;
+ }
+
vgpu_dev->intr_info.allocated_irq[i] = INVALID_IRQ;
}
}
@@ -2675,7 +2689,10 @@ static int nv_vgpu_vfio_set_irqs(vgpu_dev_t *vgpu_dev, uint32_t flags,
{
if (flags & VFIO_IRQ_SET_DATA_NONE)
{
- vgpu_dev->intr_info.intx_file = NULL;
+ if (vgpu_dev->intr_info.intx_evtfd) {
+ eventfd_ctx_put(vgpu_dev->intr_info.intx_evtfd);
+ vgpu_dev->intr_info.intx_evtfd = NULL;
+ }
break;
}
@@ -2700,7 +2717,10 @@ static int nv_vgpu_vfio_set_irqs(vgpu_dev_t *vgpu_dev, uint32_t flags,
{
if (flags & VFIO_IRQ_SET_DATA_NONE)
{
- vgpu_dev->intr_info.msi_file = NULL;
+ if (vgpu_dev->intr_info.msi_evtfd) {
+ eventfd_ctx_put(vgpu_dev->intr_info.msi_evtfd);
+ vgpu_dev->intr_info.msi_evtfd = NULL;
+ }
vgpu_dev->intr_info.index = VFIO_PCI_INTX_IRQ_INDEX;
break;
}
@@ -2708,10 +2728,9 @@ static int nv_vgpu_vfio_set_irqs(vgpu_dev_t *vgpu_dev, uint32_t flags,
if (flags & VFIO_IRQ_SET_DATA_EVENTFD)
{
int fd = *(int *)data;
- if (fd > 0)
+ if (fd > 0 && !vgpu_dev->intr_info.msi_evtfd)
{
- if (vgpu_dev->intr_info.msi_file == NULL)
- ret = vgpu_save_fd(vgpu_dev, fd, index);
+ ret = vgpu_save_fd(vgpu_dev, fd, index);
}
}
break;
@@ -2766,12 +2785,9 @@ exit:
NV_STATUS nv_vgpu_inject_interrupt(void *vgpuRef)
{
- mm_segment_t old_fs;
- NvU64 val = 1;
int ret = 0;
- loff_t offset = 0;
NV_STATUS status = NV_OK;
- struct file *pfile = NULL;
+ struct eventfd_ctx *evt = NULL;
vgpu_dev_t *vgpu_dev = vgpuRef;
unsigned long eflags;
@@ -2780,12 +2796,12 @@ NV_STATUS nv_vgpu_inject_interrupt(void *vgpuRef)
NV_SPIN_LOCK_IRQSAVE(&vgpu_dev->intr_info_lock, eflags);
- if ((vgpu_dev->intr_info.index == VFIO_PCI_MSI_IRQ_INDEX) && (vgpu_dev->intr_info.msi_file == NULL))
+ if ((vgpu_dev->intr_info.index == VFIO_PCI_MSI_IRQ_INDEX) && (!vgpu_dev->intr_info.msi_evtfd))
{
NV_SPIN_UNLOCK_IRQRESTORE(&vgpu_dev->intr_info_lock, eflags);
return NV_ERR_INVALID_REQUEST;
}
- else if ((vgpu_dev->intr_info.index == VFIO_PCI_INTX_IRQ_INDEX) && (vgpu_dev->intr_info.intx_file == NULL))
+ else if ((vgpu_dev->intr_info.index == VFIO_PCI_INTX_IRQ_INDEX) && (!vgpu_dev->intr_info.intx_evtfd))
{
NV_SPIN_UNLOCK_IRQRESTORE(&vgpu_dev->intr_info_lock, eflags);
return NV_ERR_INVALID_REQUEST;
@@ -2797,9 +2813,9 @@ NV_STATUS nv_vgpu_inject_interrupt(void *vgpuRef)
}
if (vgpu_dev->intr_info.index == VFIO_PCI_MSI_IRQ_INDEX)
- pfile = vgpu_dev->intr_info.msi_file;
+ evt = vgpu_dev->intr_info.msi_evtfd;
else
- pfile = vgpu_dev->intr_info.intx_file;
+ evt = vgpu_dev->intr_info.intx_evtfd;
// QEMU has exited. So, safe to ignore interrupts.
if (vgpu_dev->intr_info.ignore_interrupts == NV_TRUE)
@@ -2809,19 +2825,14 @@ NV_STATUS nv_vgpu_inject_interrupt(void *vgpuRef)
}
NV_SPIN_UNLOCK_IRQRESTORE(&vgpu_dev->intr_info_lock, eflags);
- old_fs = get_fs();
- set_fs(KERNEL_DS);
-
- if (pfile->f_op && pfile->f_op->write)
- ret = pfile->f_op->write(pfile, (char *)&val, sizeof(val), &offset);
- else
- status = NV_ERR_INVALID_REQUEST;
+ if (evt)
+ ret = eventfd_signal(evt, 1);
+ else
+ status = NV_ERR_INVALID_REQUEST;
if (ret < 0)
status = NV_ERR_INVALID_STATE;
- set_fs(old_fs);
-
return status;
}
@@ -4165,6 +4176,6 @@ static void __exit nv_vgpu_vfio_exit(void)
module_init(nv_vgpu_vfio_init);
module_exit(nv_vgpu_vfio_exit);
-MODULE_LICENSE("MIT");
+MODULE_LICENSE("GPL");
MODULE_INFO(supported, "external");
MODULE_VERSION(NV_VERSION_STRING);
--- ./kernel/nvidia-vgpu-vfio/nvidia-vgpu-vfio.h
+++ ./kernel/nvidia-vgpu-vfio/nvidia-vgpu-vfio.h
@@ -51,7 +51,7 @@ static NV_STATUS nv_vgpu_probe(struct pci_dev *dev, NvU32, NvU32 *);
static NV_STATUS nv_vgpu_vfio_validate_map_request(struct mdev_device *, loff_t, NvU64 *,
NvU64 *, NvU64 *, pgprot_t *, NvBool *);
static void nv_vgpu_remove(struct pci_dev *);
-static int nv_vgpu_vfio_create(struct kobject *, struct mdev_device *);
+static int nv_vgpu_vfio_create(struct mdev_device *);
static int nv_vgpu_vfio_destroy(struct mdev_device *mdev);
static int nv_vgpu_vfio_open(struct mdev_device *);
static void nv_vgpu_vfio_close(struct mdev_device *);
@@ -293,19 +293,20 @@ typedef struct
typedef struct
{
- struct file *intx_file;
- struct file *msi_file;
+ struct eventfd_ctx *intx_evtfd;
+ struct eventfd_ctx *msi_evtfd;
int index;
NvBool ignore_interrupts;
NvU32 allocated_irq[MAX_NUM_VECTORS];
NvU32 num_ctx;
#if defined(NV_VGPU_KVM_BUILD)
- struct fd msix_fd[MAX_NUM_VECTORS];
+ struct eventfd_ctx *msix_evtfd[MAX_NUM_VECTORS];
#endif
} intr_info_t;
+
typedef struct
{
NvU64 pending;
--- ./kernel/nvidia/nv-frontend.c
+++ ./kernel/nvidia/nv-frontend.c
@@ -15,7 +15,7 @@
#include "nv-frontend.h"
#if defined(MODULE_LICENSE)
-MODULE_LICENSE("NVIDIA");
+MODULE_LICENSE("GPL");
#endif
#if defined(MODULE_INFO)
MODULE_INFO(supported, "external");

Binary file not shown.

865
README.md
View File

@ -1,376 +1,491 @@
# NVIDIA vGPU on PVE 7.1 with a NVIDIA T1000 GPU
This tutorial (and included patches) should allow you to use vGPU unlock on PVE 7.1 with the opt-in 5.15 Linux Kernel with a NVIDIA T1000 GPU. The GPU uses the TU117 Chip so other GPUs with the same Chip (T400, T600, GTX 1650 **NOT** Super) will probably work (no guarantees).
### This tutorial assumes you are using a clean install of PVE 7.1 — your mileage may vary on an existing installation. Make sure to always have backups!
## Packages
Make sure to add the community pve repo and get rid of the enterprise repo (you can skip this step if you have a valid enterprise subscription)
```bash
echo "deb http://download.proxmox.com/debian/pve bullseye pve-no-subscription" >> /etc/apt/sources.list
rm /etc/apt/sources.list.d/pve-enterprise.list
```
Update and upgrade
```bash
apt update
apt dist-upgrade
```
PVE 7.1 comes with version 5.13 of the Linux Kernel, that version is incompatible with vGPU. For this guide you will have to install version 5.15, which will probably come with PVE 7.2 (~Q2 2022) but is opt-in on current PVE versions
```bash
apt install -y pve-kernel-5.15 pve-headers-5.15
```
Next we need to install a few more packages like git, a compiler and some other tools
```bash
apt install -y git build-essential dkms jq pve-headers mdevctl
```
## Git repos and glorious [Rust](https://www.rust-lang.org/) compiler
First, clone this repo to your home folder (in this case `/root/`)
```bash
git clone https://gitlab.com/polloloco/vgpu-5.15.git
```
Clone two additional git repos for vGPU unlock
```bash
cd /opt
git clone https://github.com/DualCoder/vgpu_unlock
git clone https://github.com/p0lloloco/vgpu_unlock-rs
```
After that, install the rust compiler
```bash
curl https://sh.rustup.rs -sSf | sh -s -- -y
```
Now make the rust binaries available in your $PATH (you only have to do it the first time after installing rust)
```bash
source $HOME/.cargo/env
```
Enter the `vgpu_unlock-rs` directory and compile the library. Depending on your hardware and internet connection that may take a while
```bash
cd vgpu_unlock-rs/
cargo build --release
```
## Create files for vGPU unlock
The vgpu_unlock-rs library requires a few files and folders in order to work properly, lets create those
First create the folder for your vgpu unlock config and create an empty config file
```bash
mkdir /etc/vgpu_unlock
touch /etc/vgpu_unlock/profile_override.toml
```
Then, create folders and files for systemd to load the vgpu_unlock-rs library when starting the nvidia vgpu services
```bash
mkdir /etc/systemd/system/{nvidia-vgpud.service.d,nvidia-vgpu-mgr.service.d}
echo -e "[Service]\nEnvironment=LD_PRELOAD=/opt/vgpu_unlock-rs/target/release/libvgpu_unlock_rs.so" > /etc/systemd/system/nvidia-vgpud.service.d/vgpu_unlock.conf
echo -e "[Service]\nEnvironment=LD_PRELOAD=/opt/vgpu_unlock-rs/target/release/libvgpu_unlock_rs.so" > /etc/systemd/system/nvidia-vgpu-mgr.service.d/vgpu_unlock.conf
```
## Enabling IOMMU
#### Note: Usually this isn't required for vGPU to work, but it doesn't hurt to enable it. You can skip this section, but if you run into problems later on, make sure to enable IOMMU.
Assuming you installed PVE with ZFS-on-root and efi, you are booting with systemd-boot. All other installations use grub. The following instructions *ONLY* apply to systemd-boot, grub is different.
To enable IOMMU you have to enable it in your UEFI first. Due to it being vendor specific, I am unable to provide instructions for that, but usually for Intel systems the option you are looking for is called something like "Vt-d", AMD systems tend to call it "IOMMU".
After enabling IOMMU in your UEFI, you have to add some options to your kernel to enable it in proxmox. Edit the kernel command line like this
```bash
nano /etc/kernel/cmdline
```
On a clean installation the file might look similar to this:
```
root=ZFS=rpool/ROOT/pve-1 boot=zfs
```
On Intel systems, append this line at the end
```
intel_iommu=on iommu=pt
```
For AMD, use this
```
amd_iommu=on iommu=pt
```
After editing the file, it should look similar to this
```
root=ZFS=rpool/ROOT/pve-1 boot=zfs intel_iommu=on iommu=pt
```
Save and exit using Ctrl+O and then Ctrl+X
## Loading required kernel modules and blacklisting the open source nvidia driver
We have to load the `vfio`, `vfio_iommu_type1`, `vfio_pci` and `vfio_virqfd` kernel modules to get vGPU working
```bash
echo -e "vfio\nvfio_iommu_type1\nvfio_pci\nvfio_virqfd" >> /etc/modules
```
Proxmox comes with the open source nouveau driver for nvidia gpus, however we have to use our patched nvidia driver to enable vGPU. The next line will prevent the nouveau driver from loading
```bash
echo "blacklist nouveau" >> /etc/modprobe.d/blacklist.conf
```
## IMPORTANT: Apply our kernel configuration
#### Note: This only applies to systemd-boot, if you are using grub, you can't use these instructions
```bash
proxmox-boot-tool refresh
```
...and reboot
```bash
reboot
```
## Check if IOMMU is enabled
#### Note: See section "Enabling IOMMU", this is optional
Wait for your server to restart, then type this into a root shell
```bash
dmesg | grep -e DMAR -e IOMMU
```
On my Intel system the output looks like this
```
[ 0.007235] ACPI: DMAR 0x000000009CC98B68 0000B8 (v01 INTEL BDW 00000001 INTL 00000001)
[ 0.007255] ACPI: Reserving DMAR table memory at [mem 0x9cc98b68-0x9cc98c1f]
[ 0.020766] DMAR: IOMMU enabled
[ 0.062294] DMAR: Host address width 39
[ 0.062296] DMAR: DRHD base: 0x000000fed90000 flags: 0x0
[ 0.062300] DMAR: dmar0: reg_base_addr fed90000 ver 1:0 cap c0000020660462 ecap f0101a
[ 0.062302] DMAR: DRHD base: 0x000000fed91000 flags: 0x1
[ 0.062305] DMAR: dmar1: reg_base_addr fed91000 ver 1:0 cap d2008c20660462 ecap f010da
[ 0.062307] DMAR: RMRR base: 0x0000009cc18000 end: 0x0000009cc25fff
[ 0.062309] DMAR: RMRR base: 0x0000009f000000 end: 0x000000af1fffff
[ 0.062312] DMAR-IR: IOAPIC id 8 under DRHD base 0xfed91000 IOMMU 1
[ 0.062314] DMAR-IR: HPET id 0 under DRHD base 0xfed91000
[ 0.062315] DMAR-IR: x2apic is disabled because BIOS sets x2apic opt out bit.
[ 0.062316] DMAR-IR: Use 'intremap=no_x2apic_optout' to override the BIOS setting.
[ 0.062797] DMAR-IR: Enabled IRQ remapping in xapic mode
[ 0.302431] DMAR: No ATSR found
[ 0.302432] DMAR: No SATC found
[ 0.302433] DMAR: IOMMU feature pgsel_inv inconsistent
[ 0.302435] DMAR: IOMMU feature sc_support inconsistent
[ 0.302436] DMAR: IOMMU feature pass_through inconsistent
[ 0.302437] DMAR: dmar0: Using Queued invalidation
[ 0.302443] DMAR: dmar1: Using Queued invalidation
[ 0.333474] DMAR: Intel(R) Virtualization Technology for Directed I/O
[ 3.990175] i915 0000:00:02.0: [drm] DMAR active, disabling use of stolen memory
```
Depending on your mainboard and cpu, the output will be different, in my output the important line is the third one: `DMAR: IOMMU enabled`. If you see something like that, IOMMU is enabled.
## NVIDIA Driver
### Choosing the right driver version
This is the tricky part, at the time of writing (Jan 2022), there are [three active branches](https://docs.nvidia.com/grid/) of the NVIDIA vGPU driver. The latest is branch 13 (long term support branch until mid 2024) with driver version 470. I had no luck getting *any* version of that driver to work with vGPU at all but as always - ymmv.
Branch 12 is a "regular" production branch with support until January of 2022 and has driver version number 460. Lots of people are running that driver in combination with the Linux Kernel 5.15. I got it installed with my gpu, but as soon as I tried to use the gpu in my VM, the display would freeze every 30-ish seconds and `nvidia-vgpu-mgr.service` would report an error similar to `error: vmiop_log: (0x0): XID 43 detected on physical_chid:0x1c, guest_chid:0x14`. At first I thought I messed up some of the driver patches required to get the driver working on kernels newer than 5.11 - so I tried on PVE 6.4 without any patches (5.4 kernel) but got the same errors there. If anyone knows what's causing this error, or even how to fix it, **please** let me know :)
Ruling out those two branches only leaves the older long term support branch 11: It is supported until mid 2023 and has the driver version 450. Like the other branch (12), you have to patch some parts of the driver to get it working on the Linux Kernel 5.15. I tried every patch I could find on the Internet (mostly twelve.patch and fourteen.patch and their variations) but no combination of them allowed me to install the driver - the installer would always complain about my system being incompatible. So I spent a few hours looking at the existing patches and reviewing the files they patch to finally come up with my own patch: Basically, it adapts twelve.patch and fourteen.patch to this older driver (they seem to be designed for the branch 12 driver) and merges them into a single patch.
### Obtaining the driver
I will be using the latest driver from branch 11 (at the time of writing that would be 11.6 / 450.156).
NVIDIA doesn't let you freely download vGPU drivers like they do with GeForce or normal Quadro drivers, instead you have to download them through the [NVIDIA Licensing Portal](https://nvid.nvidia.com/dashboard/) (see: [https://www.nvidia.com/en-us/drivers/vgpu-software-driver/](https://www.nvidia.com/en-us/drivers/vgpu-software-driver/)). You can sign up for a free evaluation to get access to the download page.
After downloading version 11.6 you should have a zip file called `NVIDIA-GRID-Linux-KVM-450.156-450.156.00-453.23.zip`, extract that and copy the file `NVIDIA-Linux-x86_64-450.156-vgpu-kvm.run` to your PVE host into the `/root/` folder
```bash
scp NVIDIA-Linux-x86_64-450.156-vgpu-kvm.run root@pve:/root/
```
### Patching the driver
Now, on the proxmox host, make the driver executable
```bash
chmod +x NVIDIA-Linux-x86_64-450.156-vgpu-kvm.run
```
And then unpack it
```bash
./NVIDIA-Linux-x86_64-450.156-vgpu-kvm.run -x
```
Go inside the extracted folder
```bash
cd NVIDIA-Linux-x86_64-450.156-vgpu-kvm/
```
To be able to install the driver on your proxmox host, apply the driver patch
```bash
patch -p0 < ~/vgpu-5.15/450_5.15.patch
```
If everything went right (and you are using the exact same nvidia driver version 11.6), the output should be exactly this
```
patching file ./kernel/Kbuild
patching file ./kernel/conftest.sh
patching file ./kernel/nvidia-vgpu-vfio/nvidia-vgpu-vfio.c
patching file ./kernel/nvidia-vgpu-vfio/nvidia-vgpu-vfio.h
patching file ./kernel/nvidia/nv-frontend.c
```
There is a second patch you need to apply.
#### Warning: If you followed every step of this tutorial it should be safe to just apply it, but if you did anything different than I, you should check if the paths in the patch are valid for you.
```bash
patch -p0 < ~/vgpu-5.15/unlock.patch
```
The output should be exactly this
```
patching file ./kernel/nvidia/nvidia.Kbuild
patching file ./kernel/nvidia/os-interface.c
```
### Installing the driver
Now that all the required patches are applied, you can install the driver
```bash
./nvidia-installer --dkms
```
The installer will ask you `Would you like to register the kernel module sources with DKMS? This will allow DKMS to automatically build a new module, if you install a different kernel later.`, answer with `Yes`.
Depending on your hardware, the installation could take a minute or two.
If everything went right, you will be presented with this message.
```
Installation of the NVIDIA Accelerated Graphics Driver for Linux-x86_64 (version: 450.156) is now complete.
```
Click `Ok` to exit the installer.
To finish the installation, reboot.
```bash
reboot
```
### Finishing touches
Wait for your server to reboot, then type this into the shell to check if the driver install worked
```bash
nvidia-smi
```
You should get an output similar to this one
```
Mon Jan 3 20:41:15 2022
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.156 Driver Version: 450.156 CUDA Version: N/A |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 T1000 On | 00000000:01:00.0 Off | N/A |
| 32% 39C P8 N/A / 50W | 30MiB / 4095MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
```
To verify if the vGPU unlock worked, type this command
```bash
mdevctl types
```
The output will be similar to this
```
0000:01:00.0
nvidia-256
Available instances: 24
Device API: vfio-pci
Name: GRID RTX6000-1Q
Description: num_heads=4, frl_config=60, framebuffer=1024M, max_resolution=5120x2880, max_instance=24
nvidia-257
Available instances: 12
Device API: vfio-pci
Name: GRID RTX6000-2Q
Description: num_heads=4, frl_config=60, framebuffer=2048M, max_resolution=7680x4320, max_instance=12
nvidia-258
Available instances: 8
Device API: vfio-pci
Name: GRID RTX6000-3Q
Description: num_heads=4, frl_config=60, framebuffer=3072M, max_resolution=7680x4320, max_instance=8
---SNIP---
```
If this command doesn't return any output, vGPU unlock isn't working.
### Bonus: working `nvidia-smi vgpu` command
I've included an adapted version of the `nvidia-smi` [wrapper script](https://github.com/erin-allison/nvidia-merged-arch/blob/d2ce752cd38461b53b7e017612410a3348aa86e5/nvidia-smi) to get useful output from `nvidia-smi vgpu`.
Without that wrapper script, running `nvidia-smi vgpu` in your shell results in this output
```
No supported devices in vGPU mode
```
With the wrapper script, the output looks similar to this
```
Mon Jan 3 20:54:35 2022
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.156 Driver Version: 450.156 |
|---------------------------------+------------------------------+------------+
| GPU Name | Bus-Id | GPU-Util |
| vGPU ID Name | VM ID VM Name | vGPU-Util |
|=================================+==============================+============|
| 0 T1000 | 00000000:01:00.0 | 0% |
+---------------------------------+------------------------------+------------+
```
To install this script, copy the `nvidia-smi` file from this repo to `/usr/local/bin` and make it executable
```bash
cp ~/vgpu-5.15/nvidia-smi /usr/local/bin/
chmod +x /usr/local/bin/nvidia-smi
```
Run this in your shell (you might have to logout and back in first) to see if it worked
```bash
nvidia-smi vgpu
```
## Credits
Thanks to all these people (in no particular order) for making this project possible
- [DualCoder](https://github.com/DualCoder) for his original [vgpu_unlock](https://github.com/DualCoder/vgpu_unlock) repo with the kernel hooks
- [mbilker](https://github.com/mbilker) for the rust version, [vgpu_unlock-rs](https://github.com/mbilker/vgpu_unlock-rs)
- [KrutavShah](https://github.com/KrutavShah) for the [wiki](https://krutavshah.github.io/GPU_Virtualization-Wiki/)
- [HiFiPhile](https://github.com/HiFiPhile) for the [C version](https://gist.github.com/HiFiPhile/b3267ce1e93f15642ce3943db6e60776) of vgpu unlock
- [rupansh](https://github.com/rupansh) for the original [twelve.patch](https://github.com/rupansh/vgpu_unlock_5.12/blob/master/twelve.patch) to patch the driver on kernels >= 5.12
- mbuchel#1878 on the [GPU Unlocking discord](https://discord.gg/5rQsSV3Byq) for [fourteen.patch](https://gist.github.com/erin-allison/5f8acc33fa1ac2e4c0f77fdc5d0a3ed1) to patch the driver on kernels >= 5.14
- [erin-allison](https://github.com/erin-allison) for the [nvidia-smi wrapper script](https://github.com/erin-allison/nvidia-merged-arch/blob/d2ce752cd38461b53b7e017612410a3348aa86e5/nvidia-smi)
If I forgot to mention someone, please create an issue or let me know otherwise.
## TODO (soon tm)
- Add basic profile_override.toml config
- Add proxmox VM installation guide
## Contributing
# NVIDIA vGPU with the 510 driver
Thanks to the great work of `LIL'pingu` in the vGPU unlock Discord, we can finally use the (at the time of writing) latest NVIDIA GRID driver, version 510.47.03, with most consumer GPUs. Personally I have tested the T1000, a Turing-based card, but others from the Discord server got it working with a Pascal-based card as well.
### This tutorial assumes you are using a clean install of Proxmox 7.1 — your mileage may vary on an existing installation. Make sure to always have backups!
The patch included in this repository should work on other Linux systems with kernel versions >= 5.13, but I have only tested it on Proxmox.
If you are not using Proxmox, you will have to adapt some parts of this tutorial for your distribution.
## Packages
Make sure to add the community pve repo and get rid of the enterprise repo (you can skip this step if you have a valid enterprise subscription)
```bash
echo "deb http://download.proxmox.com/debian/pve bullseye pve-no-subscription" >> /etc/apt/sources.list
rm /etc/apt/sources.list.d/pve-enterprise.list
```
Update and upgrade
```bash
apt update
apt dist-upgrade
```
By default Proxmox 7.1 has the 5.13 kernel, but you can opt-in to the newer 5.15 version. Both versions work just fine for vGPU.
If you didn't install a newer kernel on proxmox, then skip the following line. If you have the 5.15 kernel, it is required that you also install the corresponding kernel headers:
```bash
apt install -y pve-headers-5.15
```
Next we need to install a few more packages like git, a compiler and some other tools. This is required no matter which kernel version you are using.
```bash
apt install -y git build-essential dkms pve-headers mdevctl
```
## Git repos and [Rust](https://www.rust-lang.org/) compiler
First, clone this repo to your home folder (in this case `/root/`)
```bash
git clone https://gitlab.com/polloloco/vgpu-5.15.git
```
You also need the vgpu_unlock-rs repo
```bash
cd /opt
git clone https://github.com/mbilker/vgpu_unlock-rs.git
```
After that, install the rust compiler
```bash
curl https://sh.rustup.rs -sSf | sh -s -- -y
```
Now make the rust binaries available in your $PATH (you only have to do it the first time after installing rust)
```bash
source $HOME/.cargo/env
```
Enter the `vgpu_unlock-rs` directory and compile the library. Depending on your hardware and internet connection that may take a while
```bash
cd vgpu_unlock-rs/
cargo build --release
```
## Create files for vGPU unlock
The vgpu_unlock-rs library requires a few files and folders in order to work properly, lets create those
First create the folder for your vgpu unlock config and create an empty config file
```bash
mkdir /etc/vgpu_unlock
touch /etc/vgpu_unlock/profile_override.toml
```
Then, create folders and files for systemd to load the vgpu_unlock-rs library when starting the nvidia vgpu services
```bash
mkdir /etc/systemd/system/{nvidia-vgpud.service.d,nvidia-vgpu-mgr.service.d}
echo -e "[Service]\nEnvironment=LD_PRELOAD=/opt/vgpu_unlock-rs/target/release/libvgpu_unlock_rs.so" > /etc/systemd/system/nvidia-vgpud.service.d/vgpu_unlock.conf
echo -e "[Service]\nEnvironment=LD_PRELOAD=/opt/vgpu_unlock-rs/target/release/libvgpu_unlock_rs.so" > /etc/systemd/system/nvidia-vgpu-mgr.service.d/vgpu_unlock.conf
```
## Enabling IOMMU
#### Note: Usually this isn't required for vGPU to work, but it doesn't hurt to enable it. You can skip this section, but if you run into problems later on, make sure to enable IOMMU.
To enable IOMMU you have to enable it in your BIOS/UEFI first. Due to it being vendor specific, I am unable to provide instructions for that, but usually for Intel systems the option you are looking for is called something like "VT-d", AMD systems tend to call it "IOMMU".
After enabling it in your BIOS/UEFI, you also have to enable it in your kernel. Depending on how your system is booting, there are two ways to do that.
If you installed your system with ZFS-on-root and in UEFI mode, then you are using systemd-boot, everything else is GRUB. GRUB is way more common so if you are unsure, you are probably using that.
Depending on which system you are using to boot, you have to choose from the following two options:
<details>
<summary>GRUB</summary>
Open the file `/etc/default/grub` in your favorite editor
```bash
nano /etc/default/grub
```
The kernel parameters have to be appended to the variable `GRUB_CMDLINE_LINUX_DEFAULT`. On a clean installation that line should look like this
```
GRUB_CMDLINE_LINUX_DEFAULT="quiet"
```
If you are using an Intel system, append this after `quiet`:
```
intel_iommu=on iommu=pt
```
On AMD systems, append this after `quiet`:
```
amd_iommu=on iommu=pt
```
The result should look like this (for intel systems):
```
GRUB_CMDLINE_LINUX_DEFAULT="quiet intel_iommu=on iommu=pt"
```
Now, save and exit from the editor using Ctrl+O and then Ctrl+X and then apply your changes:
```bash
update-grub
```
</details>
<details>
<summary>systemd-boot</summary>
The kernel parameters have to be appended to the commandline in the file `/etc/kernel/cmdline`, so open that in your favorite editor:
```bash
nano /etc/kernel/cmdline
```
On a clean installation the file might look similar to this:
```
root=ZFS=rpool/ROOT/pve-1 boot=zfs
```
On Intel systems, append this at the end
```
intel_iommu=on iommu=pt
```
For AMD, use this
```
amd_iommu=on iommu=pt
```
After editing the file, it should look similar to this
```
root=ZFS=rpool/ROOT/pve-1 boot=zfs intel_iommu=on iommu=pt
```
Now, save and exit from the editor using Ctrl+O and then Ctrl+X and then apply your changes:
```bash
proxmox-boot-tool refresh
```
</details>
## Loading required kernel modules and blacklisting the open source nvidia driver
We have to load the `vfio`, `vfio_iommu_type1`, `vfio_pci` and `vfio_virqfd` kernel modules to get vGPU working
```bash
echo -e "vfio\nvfio_iommu_type1\nvfio_pci\nvfio_virqfd" >> /etc/modules
```
Proxmox comes with the open source nouveau driver for nvidia gpus, however we have to use our patched nvidia driver to enable vGPU. The next line will prevent the nouveau driver from loading
```bash
echo "blacklist nouveau" >> /etc/modprobe.d/blacklist.conf
```
## Applying our kernel configuration
I'm not sure if this is needed, but it doesn't hurt :)
```bash
update-initramfs -u -k all
```
...and reboot
```bash
reboot
```
## Check if IOMMU is enabled
#### Note: See section "Enabling IOMMU", this is optional
Wait for your server to restart, then type this into a root shell
```bash
dmesg | grep -e DMAR -e IOMMU
```
On my Intel system the output looks like this
```
[ 0.007235] ACPI: DMAR 0x000000009CC98B68 0000B8 (v01 INTEL BDW 00000001 INTL 00000001)
[ 0.007255] ACPI: Reserving DMAR table memory at [mem 0x9cc98b68-0x9cc98c1f]
[ 0.020766] DMAR: IOMMU enabled
[ 0.062294] DMAR: Host address width 39
[ 0.062296] DMAR: DRHD base: 0x000000fed90000 flags: 0x0
[ 0.062300] DMAR: dmar0: reg_base_addr fed90000 ver 1:0 cap c0000020660462 ecap f0101a
[ 0.062302] DMAR: DRHD base: 0x000000fed91000 flags: 0x1
[ 0.062305] DMAR: dmar1: reg_base_addr fed91000 ver 1:0 cap d2008c20660462 ecap f010da
[ 0.062307] DMAR: RMRR base: 0x0000009cc18000 end: 0x0000009cc25fff
[ 0.062309] DMAR: RMRR base: 0x0000009f000000 end: 0x000000af1fffff
[ 0.062312] DMAR-IR: IOAPIC id 8 under DRHD base 0xfed91000 IOMMU 1
[ 0.062314] DMAR-IR: HPET id 0 under DRHD base 0xfed91000
[ 0.062315] DMAR-IR: x2apic is disabled because BIOS sets x2apic opt out bit.
[ 0.062316] DMAR-IR: Use 'intremap=no_x2apic_optout' to override the BIOS setting.
[ 0.062797] DMAR-IR: Enabled IRQ remapping in xapic mode
[ 0.302431] DMAR: No ATSR found
[ 0.302432] DMAR: No SATC found
[ 0.302433] DMAR: IOMMU feature pgsel_inv inconsistent
[ 0.302435] DMAR: IOMMU feature sc_support inconsistent
[ 0.302436] DMAR: IOMMU feature pass_through inconsistent
[ 0.302437] DMAR: dmar0: Using Queued invalidation
[ 0.302443] DMAR: dmar1: Using Queued invalidation
[ 0.333474] DMAR: Intel(R) Virtualization Technology for Directed I/O
[ 3.990175] i915 0000:00:02.0: [drm] DMAR active, disabling use of stolen memory
```
Depending on your mainboard and cpu, the output will be different, in my output the important line is the third one: `DMAR: IOMMU enabled`. If you see something like that, IOMMU is enabled.
## NVIDIA Driver
As of the time of this writing (March 2022), the latest available GRID driver is 14.0 with vGPU driver version 510.47.03. You can check for the latest version [here](https://docs.nvidia.com/grid/). I cannot guarantee that newer versions would work without additional patches, this tutorial only covers 14.0 (510.47.03).
### Obtaining the driver
NVIDIA doesn't let you freely download vGPU drivers like they do with GeForce or normal Quadro drivers, instead you have to download them through the [NVIDIA Licensing Portal](https://nvid.nvidia.com/dashboard/) (see: [https://www.nvidia.com/en-us/drivers/vgpu-software-driver/](https://www.nvidia.com/en-us/drivers/vgpu-software-driver/)). You can sign up for a free evaluation to get access to the download page.
The file you are looking for is called `NVIDIA-GRID-Linux-KVM-510.47.03-511.65.zip`, you can get it from the download portal by downloading version 14.0 for `Linux KVM`.
After downloading, extract that and copy the file `NVIDIA-Linux-x86_64-510.47.03-vgpu-kvm.run` to your Proxmox host into the `/root/` folder
```bash
scp NVIDIA-Linux-x86_64-510.47.03-vgpu-kvm.run root@pve:/root/
```
### Patching the driver
Now, on the proxmox host, make the driver executable
```bash
chmod +x NVIDIA-Linux-x86_64-510.47.03-vgpu-kvm.run
```
And then patch it
```bash
./NVIDIA-Linux-x86_64-510.47.03-vgpu-kvm.run --apply-patch ~/vgpu-5.15/NVIDIA-Linux-x86_64-510.47.03-vgpu-kvm.patch
```
That should output a lot of lines ending with
```
Self-extractible archive "NVIDIA-Linux-x86_64-510.47.03-vgpu-kvm-custom.run" successfully created.
```
You should now have a file called `NVIDIA-Linux-x86_64-510.47.03-vgpu-kvm-custom.run`, that is your patched driver.
### Installing the driver
Now that the required patch is applied, you can install the driver
```bash
./NVIDIA-Linux-x86_64-510.47.03-vgpu-kvm-custom.run --dkms
```
The installer will ask you `Would you like to register the kernel module sources with DKMS? This will allow DKMS to automatically build a new module, if you install a different kernel later.`, answer with `Yes`.
Depending on your hardware, the installation could take a minute or two.
If everything went right, you will be presented with this message.
```
Installation of the NVIDIA Accelerated Graphics Driver for Linux-x86_64 (version: 510.47.03) is now complete.
```
Click `Ok` to exit the installer.
To finish the installation, reboot.
```bash
reboot
```
### Finishing touches
Wait for your server to reboot, then type this into the shell to check if the driver install worked
```bash
nvidia-smi
```
You should get an output similar to this one
```
Fri Mar 25 11:39:40 2022
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03 Driver Version: 510.47.03 CUDA Version: N/A |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA T1000 On | 00000000:01:00.0 Off | N/A |
| 0% 36C P8 N/A / 50W | 35MiB / 4096MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
```
To verify if the vGPU unlock worked, type this command
```bash
mdevctl types
```
The output will be similar to this
```
0000:01:00.0
nvidia-256
Available instances: 24
Device API: vfio-pci
Name: GRID RTX6000-1Q
Description: num_heads=4, frl_config=60, framebuffer=1024M, max_resolution=5120x2880, max_instance=24
nvidia-257
Available instances: 12
Device API: vfio-pci
Name: GRID RTX6000-2Q
Description: num_heads=4, frl_config=60, framebuffer=2048M, max_resolution=7680x4320, max_instance=12
nvidia-258
Available instances: 8
Device API: vfio-pci
Name: GRID RTX6000-3Q
Description: num_heads=4, frl_config=60, framebuffer=3072M, max_resolution=7680x4320, max_instance=8
---SNIP---
```
If this command doesn't return any output, vGPU unlock isn't working.
### Bonus: working `nvidia-smi vgpu` command
I've included an adapted version of the `nvidia-smi` [wrapper script](https://github.com/erin-allison/nvidia-merged-arch/blob/d2ce752cd38461b53b7e017612410a3348aa86e5/nvidia-smi) to get useful output from `nvidia-smi vgpu`.
Without that wrapper script, running `nvidia-smi vgpu` in your shell results in this output
```
No supported devices in vGPU mode
```
With the wrapper script, the output looks similar to this
```
Fri Mar 25 11:40:18 2022
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03 Driver Version: 510.47.03 |
|---------------------------------+------------------------------+------------+
| GPU Name | Bus-Id | GPU-Util |
| vGPU ID Name | VM ID VM Name | vGPU-Util |
|=================================+==============================+============|
| 0 NVIDIA T1000 | 00000000:01:00.0 | 0% |
+---------------------------------+------------------------------+------------+
```
To install this script, copy the `nvidia-smi` file from this repo to `/usr/local/bin` and make it executable
```bash
cp ~/vgpu-5.15/nvidia-smi /usr/local/bin/
chmod +x /usr/local/bin/nvidia-smi
```
Run this in your shell (you might have to logout and back in first) to see if it worked
```bash
nvidia-smi vgpu
```
## vGPU overrides
Further up we have created the file `/etc/vgpu_unlock/profile_override.toml` and I didn't explain what it was for yet. Using that file you can override lots of parameters for your vGPU instances: For example you can change the maximum resolution, enable/disable the frame rate limiter, enable/disable support for CUDA or change the vram size of your virtual gpus.
If we take a look at the output of `mdevctl types` we see lots of different types that we can choose from. However, if we for example choose `GRID RTX6000-4Q`, which gives us 4GB of vram in a VM, we are locked to that type for all of our VMs. Meaning we can only have 4GB VMs; it's not possible to mix different types to have one 4GB VM and two 2GB VMs.
All of that changes with the override config file. Technically we are still locked to only using one profile, but now it's possible to change the vram of the profile on a per-VM basis, so even though we have three `GRID RTX6000-4Q` instances, one VM can have 4GB of vram while we override the vram size for the other two VMs to only 2GB.
Lets take a look at this example config override file (its in TOML format)
```toml
[profile.nvidia-259]
num_displays = 1 # Max number of virtual displays. Usually 1 if you want a simple remote gaming VM
display_width = 1920 # Maximum display width in the VM
display_height = 1080 # Maximum display height in the VM
max_pixels = 2073600 # This is the product of display_width and display_height so 1920 * 1080 = 2073600
cuda_enabled = 1 # Enables CUDA support. Either 1 or 0 for enabled/disabled
frl_enabled = 1 # This controls the frame rate limiter, if you enable it your fps in the VM get locked to 60fps. Either 1 or 0 for enabled/disabled
framebuffer = 0x76000000 # VRAM size for the VM. In this case its 2GB
# Other options:
# 1GB: 0x3B000000
# 2GB: 0x76000000
# 3GB: 0xB1000000
# 4GB: 0xEC000000
# 8GB: 0x1D8000000
# 16GB: 0x3B0000000
[mdev.00000000-0000-0000-0000-000000000100]
frl_enabled = 0
# You can override all the options from above here too. If you want to add more overrides for a new VM, just copy this block and change the UUID
```
There are two blocks here, the first being `[profile.nvidia-259]` and the second `[mdev.00000000-0000-0000-0000-000000000100]`.
The first one applies the overrides to all VM instances of the `nvidia-259` type (that's `GRID RTX6000-4Q`) and the second one applies its overrides only to one specific VM, the one with the uuid `00000000-0000-0000-0000-000000000100`.
You don't have to specify all parameters, only the ones you need/want. There are some more that I didn't mention here, you can find them by going through the source code of the `vgpu_unlock-rs` repo.
For a simple 1080p remote gaming VM I recommend going with something like this
```toml
[profile.nvidia-259] # choose the profile you want here
num_displays = 1
display_width = 1920
display_height = 1080
max_pixels = 2073600
```
### Spoofing your vGPU instance
You can very easily spoof your virtual GPU to a different card, so that you could install normal quadro drivers instead of the GRID drivers that require licensing.
For that you just have to add two lines to the override config. In this example I'm spoofing my Turing based card to a normal RTX 6000 Quadro card:
```toml
[profile.nvidia-259]
# insert all of your other overrides here too
pci_device_id = 0x1E30
pci_id = 0x1E3012BA # This is not always required, see below
```
`pci_device_id` is the pci id from the card you want to spoof to. In my case its `0x1E30` which is the `Quadro RTX 6000/8000`.
`pci_id` can be split in two parts: `0x1E30 12BA`, the first part `0x1E30` has to be the same as `pci_device_id`. The second part is the subdevice id. In my case `12BA` means its a RTX 6000 card and not RTX 8000.
You can get the IDs from [here](https://pci-ids.ucw.cz/read/PC/10de/). Just Ctrl+F and search the card you want to spoof to, then copy the id it shows you on the left and use it for `pci_device_id`.
After doing that, click the same id, it should open a new page where it lists the subsystems. If there are none listed, you can remove the `pci_id` entry from above. But if there are some, you have to select the one you want and use its id as the second value for `pci_id` (see above).
## Adding a vGPU to a Proxmox VM
There is only one thing you have to do from the commandline: Open the VM config file and give the VM a uuid.
For that you need your VM ID, in this example I'm using `1000`.
```bash
nano /etc/pve/qemu-server/<VM-ID>.conf
```
So with the VM ID 1000, I have to do this:
```bash
nano /etc/pve/qemu-server/1000.conf
```
In that file, you have to add a new line at the end:
```
args: -uuid 00000000-0000-0000-0000-00000000XXXX
```
You have to replace `XXXX` with your VM ID. With my 1000 ID I have to use this line:
```
args: -uuid 00000000-0000-0000-0000-000000001000
```
Save and exit from the editor. That's all you have to do from the terminal.
Now go to the proxmox webinterface, go to your VM, then to `Hardware`, then to `Add` and select `PCI Device`.
You should be able to choose from a list of pci devices. Choose your GPU there, its entry should say `Yes` in the `Mediated Devices` column.
Now you should be able to also select the `MDev Type`. Choose whatever profile you want, if you don't remember which one you want, you can see the list of all available types with `mdevctl types`.
Finish by clicking `Add`, start the VM and install the required drivers. After installing the drivers you can shut the VM down and remove the virtual display adapter by selecting `Display` in the `Hardware` section and selecting `none (none)`. ONLY do that if you have some other way to access the Virtual Machine like Parsec or Remote Desktop because the Proxmox Console won't work anymore.
Enjoy your new vGPU VM :)
## Credits
Thanks to all these people (in no particular order) for making this project possible
- [DualCoder](https://github.com/DualCoder) for his original [vgpu_unlock](https://github.com/DualCoder/vgpu_unlock) repo with the kernel hooks
- [mbilker](https://github.com/mbilker) for the rust version, [vgpu_unlock-rs](https://github.com/mbilker/vgpu_unlock-rs)
- [KrutavShah](https://github.com/KrutavShah) for the [wiki](https://krutavshah.github.io/GPU_Virtualization-Wiki/)
- [HiFiPhile](https://github.com/HiFiPhile) for the [C version](https://gist.github.com/HiFiPhile/b3267ce1e93f15642ce3943db6e60776) of vgpu unlock
- [rupansh](https://github.com/rupansh) for the original [twelve.patch](https://github.com/rupansh/vgpu_unlock_5.12/blob/master/twelve.patch) to patch the driver on kernels >= 5.12
- mbuchel#1878 on the [GPU Unlocking discord](https://discord.gg/5rQsSV3Byq) for [fourteen.patch](https://gist.github.com/erin-allison/5f8acc33fa1ac2e4c0f77fdc5d0a3ed1) to patch the driver on kernels >= 5.14
- [erin-allison](https://github.com/erin-allison) for the [nvidia-smi wrapper script](https://github.com/erin-allison/nvidia-merged-arch/blob/d2ce752cd38461b53b7e017612410a3348aa86e5/nvidia-smi)
- LIL'pingu#9069 on the [GPU Unlocking discord](https://discord.gg/5rQsSV3Byq) for his patch to nop out code that NVIDIA added to prevent usage of drivers with a version >= 460 with consumer cards
If I forgot to mention someone, please create an issue or let me know otherwise.
## Contributing
Pull requests are welcome (factual errors, amendments, grammar/spelling mistakes etc).

View File

@ -1,12 +1,12 @@
#!/usr/bin/bash
for a in $*
do
case $a in
vgpu)
export LD_PRELOAD="/opt/vgpu_unlock-rs/target/release/libvgpu_unlock_rs.so"
;;
esac
done
#!/usr/bin/bash
# Wrapper around nvidia-smi: when invoked with the "vgpu" subcommand,
# preload the vgpu_unlock-rs library so `nvidia-smi vgpu` reports
# useful output on unlocked consumer cards.
#
# Fix: iterate over "$@" (quoted) instead of bare $* — the unquoted
# form word-splits and glob-expands arguments containing spaces or
# wildcards before the case match.
for a in "$@"
do
    case $a in
        vgpu)
            # Inject the unlock library into the real nvidia-smi process
            export LD_PRELOAD="/opt/vgpu_unlock-rs/target/release/libvgpu_unlock_rs.so"
            ;;
    esac
done
# Replace this shell with the real nvidia-smi, forwarding all
# arguments verbatim ("$@" preserves each argument as one word).
exec /usr/bin/nvidia-smi "$@"

View File

@ -1,17 +0,0 @@
--- ./kernel/nvidia/nvidia.Kbuild
+++ ./kernel/nvidia/nvidia.Kbuild
@@ -203,3 +203,4 @@ NV_CONFTEST_GENERIC_COMPILE_TESTS += get_user_pages_remote
NV_CONFTEST_GENERIC_COMPILE_TESTS += pm_runtime_available
NV_CONFTEST_GENERIC_COMPILE_TESTS += vm_fault_t
NV_CONFTEST_GENERIC_COMPILE_TESTS += pci_class_multimedia_hd_audio
+ldflags-y += -T /opt/vgpu_unlock/kern.ld
--- ./kernel/nvidia/os-interface.c
+++ ./kernel/nvidia/os-interface.c
@@ -16,7 +16,7 @@
#include "nv-time.h"
-
+#include "/opt/vgpu_unlock/vgpu_unlock_hooks.c"