Lyude Paul
2022-Apr-29 19:53 UTC
[Nouveau] [PATCH] drm/nouveau/subdev/bus: Ratelimit logging for fault errors
There's plenty of ways to fudge the GPU when developing on nouveau by mistake, some of which can result in nouveau seriously spamming dmesg with fault errors. This can be somewhat annoying, as it can quickly overrun the message buffer (or your terminal emulator's buffer) and get rid of actually useful feedback from the driver. While working on my new atomic only MST branch, I ran into this issue a couple of times. So, let's fix this by adding nvkm_error_ratelimited(), and using it to ratelimit errors from faults. This should be fine for developers, since it's nearly always only the first few faults that we care about seeing. Plus, you can turn off rate limiting in the kernel if you really need to. Signed-off-by: Lyude Paul <lyude at redhat.com> --- drivers/gpu/drm/nouveau/include/nvkm/core/subdev.h | 2 ++ drivers/gpu/drm/nouveau/nvkm/subdev/bus/gf100.c | 14 +++++++------- drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv31.c | 6 +++--- drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv50.c | 6 +++--- 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/nouveau/include/nvkm/core/subdev.h b/drivers/gpu/drm/nouveau/include/nvkm/core/subdev.h index 1665738948fb..96113c8bee8c 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/core/subdev.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/core/subdev.h @@ -62,4 +62,6 @@ void nvkm_subdev_intr(struct nvkm_subdev *); #define nvkm_debug(s,f,a...) nvkm_printk((s), DEBUG, info, f, ##a) #define nvkm_trace(s,f,a...) nvkm_printk((s), TRACE, info, f, ##a) #define nvkm_spam(s,f,a...) nvkm_printk((s), SPAM, dbg, f, ##a) + +#define nvkm_error_ratelimited(s,f,a...) nvkm_printk((s), ERROR, err_ratelimited, f, ##a) #endif diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bus/gf100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bus/gf100.c index 53a6651ac225..80b5aaceeaad 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bus/gf100.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bus/gf100.c @@ -35,13 +35,13 @@ gf100_bus_intr(struct nvkm_bus *bus) u32 addr = nvkm_rd32(device, 0x009084); u32 data = nvkm_rd32(device, 0x009088); - nvkm_error(subdev, - "MMIO %s of %08x FAULT at %06x [ %s%s%s]\n", - (addr & 0x00000002) ? "write" : "read", data, - (addr & 0x00fffffc), - (stat & 0x00000002) ? "!ENGINE " : "", - (stat & 0x00000004) ? "PRIVRING " : "", - (stat & 0x00000008) ? "TIMEOUT " : ""); + nvkm_error_ratelimited(subdev, + "MMIO %s of %08x FAULT at %06x [ %s%s%s]\n", + (addr & 0x00000002) ? "write" : "read", data, + (addr & 0x00fffffc), + (stat & 0x00000002) ? "!ENGINE " : "", + (stat & 0x00000004) ? "PRIVRING " : "", + (stat & 0x00000008) ? "TIMEOUT " : ""); nvkm_wr32(device, 0x009084, 0x00000000); nvkm_wr32(device, 0x001100, (stat & 0x0000000e)); diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv31.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv31.c index ad8da523bb22..c75e463f3501 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv31.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv31.c @@ -45,9 +45,9 @@ nv31_bus_intr(struct nvkm_bus *bus) u32 addr = nvkm_rd32(device, 0x009084); u32 data = nvkm_rd32(device, 0x009088); - nvkm_error(subdev, "MMIO %s of %08x FAULT at %06x\n", - (addr & 0x00000002) ? "write" : "read", data, - (addr & 0x00fffffc)); + nvkm_error_ratelimited(subdev, "MMIO %s of %08x FAULT at %06x\n", + (addr & 0x00000002) ? "write" : "read", data, + (addr & 0x00fffffc)); stat &= ~0x00000008; nvkm_wr32(device, 0x001100, 0x00000008); diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv50.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv50.c index 3a1e45adeedc..2055d0b100d3 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv50.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv50.c @@ -60,9 +60,9 @@ nv50_bus_intr(struct nvkm_bus *bus) u32 addr = nvkm_rd32(device, 0x009084); u32 data = nvkm_rd32(device, 0x009088); - nvkm_error(subdev, "MMIO %s of %08x FAULT at %06x\n", - (addr & 0x00000002) ? "write" : "read", data, - (addr & 0x00fffffc)); + nvkm_error_ratelimited(subdev, "MMIO %s of %08x FAULT at %06x\n", + (addr & 0x00000002) ? "write" : "read", data, + (addr & 0x00fffffc)); stat &= ~0x00000008; nvkm_wr32(device, 0x001100, 0x00000008); -- 2.35.1
Karol Herbst
2022-Apr-29 22:56 UTC
[Nouveau] [PATCH] drm/nouveau/subdev/bus: Ratelimit logging for fault errors
On Fri, Apr 29, 2022 at 9:54 PM Lyude Paul <lyude at redhat.com> wrote:> > There's plenty of ways to fudge the GPU when developing on nouveau by > mistake, some of which can result in nouveau seriously spamming dmesg with > fault errors. This can be somewhat annoying, as it can quickly overrun the > message buffer (or your terminal emulator's buffer) and get rid of actually > useful feedback from the driver. While working on my new atomic only MST > branch, I ran into this issue a couple of times. > > So, let's fix this by adding nvkm_error_ratelimited(), and using it to > ratelimit errors from faults. This should be fine for developers, since > it's nearly always only the first few faults that we care about seeing. > Plus, you can turn off rate limiting in the kernel if you really need to. >good idea :) Although I think we ultimately need a detection of "GPU state is trashed" where we cease to do anything and disable interrupts as some of those issues actually cause the kernel to get DOSed by nouveau spamming/handling interrupts. Not sure if we should go with this patch in the meantime, or if we should just go straight ahead and disable interrupts, which would also stop the spam. Do you know if this patch here is enough to get the system to reboot normally or is it only spamming less?> Signed-off-by: Lyude Paul <lyude at redhat.com> > --- > drivers/gpu/drm/nouveau/include/nvkm/core/subdev.h | 2 ++ > drivers/gpu/drm/nouveau/nvkm/subdev/bus/gf100.c | 14 +++++++------- > drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv31.c | 6 +++--- > drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv50.c | 6 +++--- > 4 files changed, 15 insertions(+), 13 deletions(-) > > diff --git a/drivers/gpu/drm/nouveau/include/nvkm/core/subdev.h b/drivers/gpu/drm/nouveau/include/nvkm/core/subdev.h > index 1665738948fb..96113c8bee8c 100644 > --- a/drivers/gpu/drm/nouveau/include/nvkm/core/subdev.h > +++ b/drivers/gpu/drm/nouveau/include/nvkm/core/subdev.h > @@ -62,4 +62,6 @@ void nvkm_subdev_intr(struct nvkm_subdev *); > #define nvkm_debug(s,f,a...) nvkm_printk((s), DEBUG, info, f, ##a) > #define nvkm_trace(s,f,a...) nvkm_printk((s), TRACE, info, f, ##a) > #define nvkm_spam(s,f,a...) nvkm_printk((s), SPAM, dbg, f, ##a) > + > +#define nvkm_error_ratelimited(s,f,a...) nvkm_printk((s), ERROR, err_ratelimited, f, ##a) > #endif > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bus/gf100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bus/gf100.c > index 53a6651ac225..80b5aaceeaad 100644 > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bus/gf100.c > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bus/gf100.c > @@ -35,13 +35,13 @@ gf100_bus_intr(struct nvkm_bus *bus) > u32 addr = nvkm_rd32(device, 0x009084); > u32 data = nvkm_rd32(device, 0x009088); > > - nvkm_error(subdev, > - "MMIO %s of %08x FAULT at %06x [ %s%s%s]\n", > - (addr & 0x00000002) ? "write" : "read", data, > - (addr & 0x00fffffc), > - (stat & 0x00000002) ? "!ENGINE " : "", > - (stat & 0x00000004) ? "PRIVRING " : "", > - (stat & 0x00000008) ? "TIMEOUT " : ""); > + nvkm_error_ratelimited(subdev, > + "MMIO %s of %08x FAULT at %06x [ %s%s%s]\n", > + (addr & 0x00000002) ? "write" : "read", data, > + (addr & 0x00fffffc), > + (stat & 0x00000002) ? "!ENGINE " : "", > + (stat & 0x00000004) ? "PRIVRING " : "", > + (stat & 0x00000008) ? "TIMEOUT " : ""); > > nvkm_wr32(device, 0x009084, 0x00000000); > nvkm_wr32(device, 0x001100, (stat & 0x0000000e)); > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv31.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv31.c > index ad8da523bb22..c75e463f3501 100644 > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv31.c > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv31.c > @@ -45,9 +45,9 @@ nv31_bus_intr(struct nvkm_bus *bus) > u32 addr = nvkm_rd32(device, 0x009084); > u32 data = nvkm_rd32(device, 0x009088); > > - nvkm_error(subdev, "MMIO %s of %08x FAULT at %06x\n", > - (addr & 0x00000002) ? "write" : "read", data, > - (addr & 0x00fffffc)); > + nvkm_error_ratelimited(subdev, "MMIO %s of %08x FAULT at %06x\n", > + (addr & 0x00000002) ? "write" : "read", data, > + (addr & 0x00fffffc)); > > stat &= ~0x00000008; > nvkm_wr32(device, 0x001100, 0x00000008); > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv50.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv50.c > index 3a1e45adeedc..2055d0b100d3 100644 > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv50.c > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv50.c > @@ -60,9 +60,9 @@ nv50_bus_intr(struct nvkm_bus *bus) > u32 addr = nvkm_rd32(device, 0x009084); > u32 data = nvkm_rd32(device, 0x009088); > > - nvkm_error(subdev, "MMIO %s of %08x FAULT at %06x\n", > - (addr & 0x00000002) ? "write" : "read", data, > - (addr & 0x00fffffc)); > + nvkm_error_ratelimited(subdev, "MMIO %s of %08x FAULT at %06x\n", > + (addr & 0x00000002) ? "write" : "read", data, > + (addr & 0x00fffffc)); > > stat &= ~0x00000008; > nvkm_wr32(device, 0x001100, 0x00000008); > -- > 2.35.1 >
Karol Herbst
2022-May-03 19:23 UTC
[Nouveau] [PATCH] drm/nouveau/subdev/bus: Ratelimit logging for fault errors
not able to hit any error on my machine, but regardless: Reviewed-by: Karol Herbst <kherbst at redhat.com> I suspect there are more places where we could put it, but we can add those later. Anyway, I think it's valuable to push it through fixes, not sure how far back we want to CC stable though. On Fri, Apr 29, 2022 at 9:54 PM Lyude Paul <lyude at redhat.com> wrote:> > There's plenty of ways to fudge the GPU when developing on nouveau by > mistake, some of which can result in nouveau seriously spamming dmesg with > fault errors. This can be somewhat annoying, as it can quickly overrun the > message buffer (or your terminal emulator's buffer) and get rid of actually > useful feedback from the driver. While working on my new atomic only MST > branch, I ran into this issue a couple of times. > > So, let's fix this by adding nvkm_error_ratelimited(), and using it to > ratelimit errors from faults. This should be fine for developers, since > it's nearly always only the first few faults that we care about seeing. > Plus, you can turn off rate limiting in the kernel if you really need to. > > Signed-off-by: Lyude Paul <lyude at redhat.com> > --- > drivers/gpu/drm/nouveau/include/nvkm/core/subdev.h | 2 ++ > drivers/gpu/drm/nouveau/nvkm/subdev/bus/gf100.c | 14 +++++++------- > drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv31.c | 6 +++--- > drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv50.c | 6 +++--- > 4 files changed, 15 insertions(+), 13 deletions(-) > > diff --git a/drivers/gpu/drm/nouveau/include/nvkm/core/subdev.h b/drivers/gpu/drm/nouveau/include/nvkm/core/subdev.h > index 1665738948fb..96113c8bee8c 100644 > --- a/drivers/gpu/drm/nouveau/include/nvkm/core/subdev.h > +++ b/drivers/gpu/drm/nouveau/include/nvkm/core/subdev.h > @@ -62,4 +62,6 @@ void nvkm_subdev_intr(struct nvkm_subdev *); > #define nvkm_debug(s,f,a...) nvkm_printk((s), DEBUG, info, f, ##a) > #define nvkm_trace(s,f,a...) nvkm_printk((s), TRACE, info, f, ##a) > #define nvkm_spam(s,f,a...) nvkm_printk((s), SPAM, dbg, f, ##a) > + > +#define nvkm_error_ratelimited(s,f,a...) nvkm_printk((s), ERROR, err_ratelimited, f, ##a) > #endif > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bus/gf100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bus/gf100.c > index 53a6651ac225..80b5aaceeaad 100644 > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bus/gf100.c > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bus/gf100.c > @@ -35,13 +35,13 @@ gf100_bus_intr(struct nvkm_bus *bus) > u32 addr = nvkm_rd32(device, 0x009084); > u32 data = nvkm_rd32(device, 0x009088); > > - nvkm_error(subdev, > - "MMIO %s of %08x FAULT at %06x [ %s%s%s]\n", > - (addr & 0x00000002) ? "write" : "read", data, > - (addr & 0x00fffffc), > - (stat & 0x00000002) ? "!ENGINE " : "", > - (stat & 0x00000004) ? "PRIVRING " : "", > - (stat & 0x00000008) ? "TIMEOUT " : ""); > + nvkm_error_ratelimited(subdev, > + "MMIO %s of %08x FAULT at %06x [ %s%s%s]\n", > + (addr & 0x00000002) ? "write" : "read", data, > + (addr & 0x00fffffc), > + (stat & 0x00000002) ? "!ENGINE " : "", > + (stat & 0x00000004) ? "PRIVRING " : "", > + (stat & 0x00000008) ? "TIMEOUT " : ""); > > nvkm_wr32(device, 0x009084, 0x00000000); > nvkm_wr32(device, 0x001100, (stat & 0x0000000e)); > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv31.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv31.c > index ad8da523bb22..c75e463f3501 100644 > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv31.c > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv31.c > @@ -45,9 +45,9 @@ nv31_bus_intr(struct nvkm_bus *bus) > u32 addr = nvkm_rd32(device, 0x009084); > u32 data = nvkm_rd32(device, 0x009088); > > - nvkm_error(subdev, "MMIO %s of %08x FAULT at %06x\n", > - (addr & 0x00000002) ? "write" : "read", data, > - (addr & 0x00fffffc)); > + nvkm_error_ratelimited(subdev, "MMIO %s of %08x FAULT at %06x\n", > + (addr & 0x00000002) ? "write" : "read", data, > + (addr & 0x00fffffc)); > > stat &= ~0x00000008; > nvkm_wr32(device, 0x001100, 0x00000008); > diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv50.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv50.c > index 3a1e45adeedc..2055d0b100d3 100644 > --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv50.c > +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bus/nv50.c > @@ -60,9 +60,9 @@ nv50_bus_intr(struct nvkm_bus *bus) > u32 addr = nvkm_rd32(device, 0x009084); > u32 data = nvkm_rd32(device, 0x009088); > > - nvkm_error(subdev, "MMIO %s of %08x FAULT at %06x\n", > - (addr & 0x00000002) ? "write" : "read", data, > - (addr & 0x00fffffc)); > + nvkm_error_ratelimited(subdev, "MMIO %s of %08x FAULT at %06x\n", > + (addr & 0x00000002) ? "write" : "read", data, > + (addr & 0x00fffffc)); > > stat &= ~0x00000008; > nvkm_wr32(device, 0x001100, 0x00000008); > -- > 2.35.1 >