From: Thierry Reding <treding at nvidia.com> Hi Ben, I messed up the ordering of patches in my tree a bit, so these two fixes got separated from the others. I don't consider these particularily urgent because the crash that the first one fixes only happens on gp10b which we don't enable by default yet and the second patch fixes a crash that only happens on module unload (or driver unbind, more accurately), which isn't a terribly common thing to do. I'll be sending out fixes shortly to make the GP10B work more properly on a wider range of Jetson TX2 devices and enable it by default. One thing to mention is that I'm not exactly sure if the first patch is the right thing to do. I haven't seen any issues after that change, but I'm also not exactly sure I understand what BAR2 is used for, so I don't know if I would've even covered those code paths (other than the one causing the crash at probe time) in my tests. It'd be great to get Lyude's feedback on the second patch, since that call to pci_disable_device() was rather oddly placed and I'm not sure if that was essential for things to work or whether the slightly different point in time where it's called after this patch is also okay. It looks to me like it should work fine, but I don't currently have a way to test this on desktop GPUs. Thierry Thierry Reding (2): drm/nouveau: tegra: Fix NULL pointer dereference drm/nouveau: tegra: Do not try to disable PCI device drivers/gpu/drm/nouveau/nouveau_drm.c | 3 +- .../drm/nouveau/nvkm/subdev/instmem/gk20a.c | 30 +++++++++++++++++++ 2 files changed, 31 insertions(+), 2 deletions(-) -- 2.23.0
Thierry Reding
2019-Sep-16 14:36 UTC
[Nouveau] [PATCH 1/2] drm/nouveau: tegra: Fix NULL pointer dereference
From: Thierry Reding <treding at nvidia.com> Fill in BAR2 callbacks for instance memory. There's no BAR2 on Tegra GPUs, but buffers are all in system memory anyway, so just return the plain address. Signed-off-by: Thierry Reding <treding at nvidia.com> --- .../drm/nouveau/nvkm/subdev/instmem/gk20a.c | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c index 985f2990ab0d..b0493f8df1fe 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c @@ -261,6 +261,34 @@ gk20a_instobj_release_iommu(struct nvkm_memory *memory) nvkm_ltc_invalidate(ltc); } +static u64 +gk20a_instobj_bar2_dma(struct nvkm_memory *memory) +{ + struct gk20a_instobj_dma *iobj = gk20a_instobj_dma(memory); + u64 addr = ~0ULL; + + if (gk20a_instobj_acquire_dma(&iobj->base.memory)) + addr = gk20a_instobj_addr(&iobj->base.memory); + + gk20a_instobj_release_dma(&iobj->base.memory); + + return addr; +} + +static u64 +gk20a_instobj_bar2_iommu(struct nvkm_memory *memory) +{ + struct gk20a_instobj_iommu *iobj = gk20a_instobj_iommu(memory); + u64 addr = ~0ULL; + + if (gk20a_instobj_acquire_iommu(&iobj->base.memory)) + addr = gk20a_instobj_addr(&iobj->base.memory); + + gk20a_instobj_release_iommu(&iobj->base.memory); + + return addr; +} + static u32 gk20a_instobj_rd32(struct nvkm_memory *memory, u64 offset) { @@ -353,6 +381,7 @@ static const struct nvkm_memory_func gk20a_instobj_func_dma = { .dtor = gk20a_instobj_dtor_dma, .target = gk20a_instobj_target, + .bar2 = gk20a_instobj_bar2_dma, .page = gk20a_instobj_page, .addr = gk20a_instobj_addr, .size = gk20a_instobj_size, @@ -365,6 +394,7 @@ static const struct nvkm_memory_func gk20a_instobj_func_iommu = { .dtor = gk20a_instobj_dtor_iommu, .target = gk20a_instobj_target, + .bar2 = gk20a_instobj_bar2_iommu, .page = gk20a_instobj_page, .addr = gk20a_instobj_addr, .size = gk20a_instobj_size, -- 2.23.0
Thierry Reding
2019-Sep-16 14:36 UTC
[Nouveau] [PATCH 2/2] drm/nouveau: tegra: Do not try to disable PCI device
From: Thierry Reding <treding at nvidia.com> When Nouveau is instantiated on top of a platform device, the dev->pdev field will be NULL and calling pci_disable_device() will crash. Move the PCI disabling code to the PCI specific driver removal code. Signed-off-by: Thierry Reding <treding at nvidia.com> --- drivers/gpu/drm/nouveau/nouveau_drm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c index 2cd83849600f..b65ae817eabf 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drm.c +++ b/drivers/gpu/drm/nouveau/nouveau_drm.c @@ -715,7 +715,6 @@ static int nouveau_drm_probe(struct pci_dev *pdev, void nouveau_drm_device_remove(struct drm_device *dev) { - struct pci_dev *pdev = dev->pdev; struct nouveau_drm *drm = nouveau_drm(dev); struct nvkm_client *client; struct nvkm_device *device; @@ -727,7 +726,6 @@ nouveau_drm_device_remove(struct drm_device *dev) device = nvkm_device_find(client->device); nouveau_drm_device_fini(dev); - pci_disable_device(pdev); drm_dev_put(dev); nvkm_device_del(&device); } @@ -738,6 +736,7 @@ nouveau_drm_remove(struct pci_dev *pdev) struct drm_device *dev = pci_get_drvdata(pdev); nouveau_drm_device_remove(dev); + pci_disable_device(pdev); } static int -- 2.23.0
On Tue, 17 Sep 2019 at 00:36, Thierry Reding <thierry.reding at gmail.com> wrote:> > From: Thierry Reding <treding at nvidia.com> > > Hi Ben, > > I messed up the ordering of patches in my tree a bit, so these two fixes > got separated from the others. I don't consider these particularily > urgent because the crash that the first one fixes only happens on gp10b > which we don't enable by default yet and the second patch fixes a crash > that only happens on module unload (or driver unbind, more accurately), > which isn't a terribly common thing to do. > > I'll be sending out fixes shortly to make the GP10B work more properly > on a wider range of Jetson TX2 devices and enable it by default. > > One thing to mention is that I'm not exactly sure if the first patch is > the right thing to do. I haven't seen any issues after that change, but > I'm also not exactly sure I understand what BAR2 is used for, so I don't > know if I would've even covered those code paths (other than the one > causing the crash at probe time) in my tests.BAR2 on dGPUs is used to map kernel-level GPU objects in VRAM so they can be accessed by the driver. It's pretty much a smaller version of BAR1, but intended for a different purpose. On dGPUs, there's a couple of places (fault buffer address, and fault method buffer on volta) where the GPU wants PRI regs to be poked with an offset within BAR2 rather than an aperture+offset combination. I'm not 100% sure what Tegra parts do here, but presumably if it's working for you, they're happy to just accept a system memory address instead. I guess this would be the right thing to do here in that situation. The more obvious (from a "reading the code" POV) thing to do would be to write Tegra-specific versions of the functions that use nvkm_memory_bar2() to perform this mapping, and use nvkm_memory_addr() instead but I'm not sure if we need/want to go to that effort. It's conceivable it could be required at some point. Ben.> > It'd be great to get Lyude's feedback on the second patch, since that > call to pci_disable_device() was rather oddly placed and I'm not sure if > that was essential for things to work or whether the slightly different > point in time where it's called after this patch is also okay. It looks > to me like it should work fine, but I don't currently have a way to test > this on desktop GPUs. > > Thierry > > Thierry Reding (2): > drm/nouveau: tegra: Fix NULL pointer dereference > drm/nouveau: tegra: Do not try to disable PCI device > > drivers/gpu/drm/nouveau/nouveau_drm.c | 3 +- > .../drm/nouveau/nvkm/subdev/instmem/gk20a.c | 30 +++++++++++++++++++ > 2 files changed, 31 insertions(+), 2 deletions(-) > > -- > 2.23.0 > > _______________________________________________ > Nouveau mailing list > Nouveau at lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/nouveau
On Tue, Sep 17, 2019 at 04:07:54PM +1000, Ben Skeggs wrote:> On Tue, 17 Sep 2019 at 00:36, Thierry Reding <thierry.reding at gmail.com> wrote: > > > > From: Thierry Reding <treding at nvidia.com> > > > > Hi Ben, > > > > I messed up the ordering of patches in my tree a bit, so these two fixes > > got separated from the others. I don't consider these particularily > > urgent because the crash that the first one fixes only happens on gp10b > > which we don't enable by default yet and the second patch fixes a crash > > that only happens on module unload (or driver unbind, more accurately), > > which isn't a terribly common thing to do. > > > > I'll be sending out fixes shortly to make the GP10B work more properly > > on a wider range of Jetson TX2 devices and enable it by default. > > > > One thing to mention is that I'm not exactly sure if the first patch is > > the right thing to do. I haven't seen any issues after that change, but > > I'm also not exactly sure I understand what BAR2 is used for, so I don't > > know if I would've even covered those code paths (other than the one > > causing the crash at probe time) in my tests. > BAR2 on dGPUs is used to map kernel-level GPU objects in VRAM so they > can be accessed by the driver. It's pretty much a smaller version of > BAR1, but intended for a different purpose. > > On dGPUs, there's a couple of places (fault buffer address, and fault > method buffer on volta) where the GPU wants PRI regs to be poked with > an offset within BAR2 rather than an aperture+offset combination. I'm > not 100% sure what Tegra parts do here, but presumably if it's working > for you, they're happy to just accept a system memory address instead. > > I guess this would be the right thing to do here in that situation. > The more obvious (from a "reading the code" POV) thing to do would be > to write Tegra-specific versions of the functions that use > nvkm_memory_bar2() to perform this mapping, and use nvkm_memory_addr() > instead but I'm not sure if we need/want to go to that effort. It's > conceivable it could be required at some point.Yeah, that sounds slightly more correct. I'll look into it and see if I can come up with something. Thierry> > Ben. > > > > > It'd be great to get Lyude's feedback on the second patch, since that > > call to pci_disable_device() was rather oddly placed and I'm not sure if > > that was essential for things to work or whether the slightly different > > point in time where it's called after this patch is also okay. It looks > > to me like it should work fine, but I don't currently have a way to test > > this on desktop GPUs. > > > > Thierry > > > > Thierry Reding (2): > > drm/nouveau: tegra: Fix NULL pointer dereference > > drm/nouveau: tegra: Do not try to disable PCI device > > > > drivers/gpu/drm/nouveau/nouveau_drm.c | 3 +- > > .../drm/nouveau/nvkm/subdev/instmem/gk20a.c | 30 +++++++++++++++++++ > > 2 files changed, 31 insertions(+), 2 deletions(-) > > > > -- > > 2.23.0 > > > > _______________________________________________ > > Nouveau mailing list > > Nouveau at lists.freedesktop.org > > https://lists.freedesktop.org/mailman/listinfo/nouveau-------------- next part -------------- A non-text attachment was scrubbed... Name: signature.asc Type: application/pgp-signature Size: 833 bytes Desc: not available URL: <https://lists.freedesktop.org/archives/nouveau/attachments/20190917/391e84e7/attachment.sig>
Lyude Paul
2019-Sep-17 18:11 UTC
[Nouveau] [PATCH 2/2] drm/nouveau: tegra: Do not try to disable PCI device
I don't see any problems with this, since nvkm_device_del() is mainly in charge of just releasing memory allocations as is drm_dev_put(). Reviewed-by: Lyude Paul <lyude at redhat.com> On Mon, 2019-09-16 at 16:36 +0200, Thierry Reding wrote:> From: Thierry Reding <treding at nvidia.com> > > When Nouveau is instantiated on top of a platform device, the dev->pdev > field will be NULL and calling pci_disable_device() will crash. Move the > PCI disabling code to the PCI specific driver removal code. > > Signed-off-by: Thierry Reding <treding at nvidia.com> > --- > drivers/gpu/drm/nouveau/nouveau_drm.c | 3 +-- > 1 file changed, 1 insertion(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c > b/drivers/gpu/drm/nouveau/nouveau_drm.c > index 2cd83849600f..b65ae817eabf 100644 > --- a/drivers/gpu/drm/nouveau/nouveau_drm.c > +++ b/drivers/gpu/drm/nouveau/nouveau_drm.c > @@ -715,7 +715,6 @@ static int nouveau_drm_probe(struct pci_dev *pdev, > void > nouveau_drm_device_remove(struct drm_device *dev) > { > - struct pci_dev *pdev = dev->pdev; > struct nouveau_drm *drm = nouveau_drm(dev); > struct nvkm_client *client; > struct nvkm_device *device; > @@ -727,7 +726,6 @@ nouveau_drm_device_remove(struct drm_device *dev) > device = nvkm_device_find(client->device); > > nouveau_drm_device_fini(dev); > - pci_disable_device(pdev); > drm_dev_put(dev); > nvkm_device_del(&device); > } > @@ -738,6 +736,7 @@ nouveau_drm_remove(struct pci_dev *pdev) > struct drm_device *dev = pci_get_drvdata(pdev); > > nouveau_drm_device_remove(dev); > + pci_disable_device(pdev); > } > > static int-- Cheers, Lyude Paul
Lyude Paul
2019-Sep-17 18:11 UTC
[Nouveau] [PATCH 1/2] drm/nouveau: tegra: Fix NULL pointer dereference
Not sure I understand enough about why BAR2 is needed or what it's used for to give a proper review on this one, probably better to wait for skeggsb to take a look at this On Mon, 2019-09-16 at 16:36 +0200, Thierry Reding wrote:> From: Thierry Reding <treding at nvidia.com> > > Fill in BAR2 callbacks for instance memory. There's no BAR2 on Tegra > GPUs, but buffers are all in system memory anyway, so just return the > plain address. > > Signed-off-by: Thierry Reding <treding at nvidia.com> > --- > .../drm/nouveau/nvkm/subdev/instmem/gk20a.c | 30 +++++++++++++++++++ > 1 file changed, 30 insertions(+) > > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c > b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c > index 985f2990ab0d..b0493f8df1fe 100644 > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/instmem/gk20a.c > @@ -261,6 +261,34 @@ gk20a_instobj_release_iommu(struct nvkm_memory *memory) > nvkm_ltc_invalidate(ltc); > } > > +static u64 > +gk20a_instobj_bar2_dma(struct nvkm_memory *memory) > +{ > + struct gk20a_instobj_dma *iobj = gk20a_instobj_dma(memory); > + u64 addr = ~0ULL; > + > + if (gk20a_instobj_acquire_dma(&iobj->base.memory)) > + addr = gk20a_instobj_addr(&iobj->base.memory); > + > + gk20a_instobj_release_dma(&iobj->base.memory); > + > + return addr; > +} > + > +static u64 > +gk20a_instobj_bar2_iommu(struct nvkm_memory *memory) > +{ > + struct gk20a_instobj_iommu *iobj = gk20a_instobj_iommu(memory); > + u64 addr = ~0ULL; > + > + if (gk20a_instobj_acquire_iommu(&iobj->base.memory)) > + addr = gk20a_instobj_addr(&iobj->base.memory); > + > + gk20a_instobj_release_iommu(&iobj->base.memory); > + > + return addr; > +} > + > static u32 > gk20a_instobj_rd32(struct nvkm_memory *memory, u64 offset) > { > @@ -353,6 +381,7 @@ static const struct nvkm_memory_func > gk20a_instobj_func_dma = { > .dtor = gk20a_instobj_dtor_dma, > .target = gk20a_instobj_target, > + .bar2 = gk20a_instobj_bar2_dma, > .page = gk20a_instobj_page, > .addr = gk20a_instobj_addr, > .size = gk20a_instobj_size, > @@ -365,6 +394,7 @@ static const struct nvkm_memory_func > gk20a_instobj_func_iommu = { > .dtor = gk20a_instobj_dtor_iommu, > .target = gk20a_instobj_target, > + .bar2 = gk20a_instobj_bar2_iommu, > .page = gk20a_instobj_page, > .addr = gk20a_instobj_addr, > .size = gk20a_instobj_size,-- Cheers, Lyude Paul
Possibly Parallel Threads
- next/master boot bisection: Oops in nouveau driver on jetson-tk1
- [PATCH 2/3] drm/nouveau: manage nouveau_drm lifetime with devres
- next/master boot bisection: Oops in nouveau driver on jetson-tk1
- next/master boot bisection: Oops in nouveau driver on jetson-tk1
- [PATCH 2/3] drm/nouveau: manage nouveau_drm lifetime with devres