thr3ads.net - Nouveau - [Nouveau] [PATCH v2 4/4] drm/nouveau: gpu lockup recovery [Apr 2012]

If this information is useful, please help other people find it:
Share via:

Marcin Slusarz

2012-Apr-25 21:20 UTC

[Nouveau] [PATCH v2 4/4] drm/nouveau: gpu lockup recovery

Overall idea:
Detect lockups by watching for timeouts (vm flush / fence), return -EIOs,
handle them at ioctl level, reset the GPU and repeat last ioctl.

GPU reset is done by doing suspend / resume cycle with few tweaks:
- CPU-only bo eviction
- ignoring vm flush / fence timeouts
- shortening waits

Signed-off-by: Marcin Slusarz <marcin.slusarz at gmail.com>
---
 drivers/gpu/drm/nouveau/Makefile           |    2 +-
 drivers/gpu/drm/nouveau/nouveau_bo.c       |    2 +-
 drivers/gpu/drm/nouveau/nouveau_channel.c  |    5 +-
 drivers/gpu/drm/nouveau/nouveau_drv.c      |   56 ++++++++++-
 drivers/gpu/drm/nouveau/nouveau_drv.h      |   45 ++++++++-
 drivers/gpu/drm/nouveau/nouveau_fence.c    |    7 +-
 drivers/gpu/drm/nouveau/nouveau_gem.c      |   14 +++-
 drivers/gpu/drm/nouveau/nouveau_notifier.c |    3 +
 drivers/gpu/drm/nouveau/nouveau_object.c   |    6 +
 drivers/gpu/drm/nouveau/nouveau_reset.c    |  148 ++++++++++++++++++++++++++++
 drivers/gpu/drm/nouveau/nouveau_state.c    |    6 +
 drivers/gpu/drm/nouveau/nv50_graph.c       |   11 +-
 12 files changed, 290 insertions(+), 15 deletions(-)
 create mode 100644 drivers/gpu/drm/nouveau/nouveau_reset.c

diff --git a/drivers/gpu/drm/nouveau/Makefile b/drivers/gpu/drm/nouveau/Makefile
index 03860f5..77d0c33 100644
--- a/drivers/gpu/drm/nouveau/Makefile
+++ b/drivers/gpu/drm/nouveau/Makefile
@@ -9,7 +9,7 @@ nouveau-y := nouveau_drv.o nouveau_state.o nouveau_channel.o
nouveau_mem.o \
              nouveau_bo.o nouveau_fence.o nouveau_gem.o nouveau_ttm.o \
              nouveau_hw.o nouveau_calc.o nouveau_bios.o nouveau_i2c.o \
              nouveau_display.o nouveau_connector.o nouveau_fbcon.o \
-             nouveau_hdmi.o nouveau_dp.o nouveau_ramht.o \
+             nouveau_hdmi.o nouveau_dp.o nouveau_ramht.o nouveau_reset.o \
 	     nouveau_pm.o nouveau_volt.o nouveau_perf.o nouveau_temp.o \
 	     nouveau_mm.o nouveau_vm.o nouveau_mxm.o nouveau_gpio.o \
              nv04_timer.o \
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c
b/drivers/gpu/drm/nouveau/nouveau_bo.c
index 5b0dc50..7de6cad 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -936,7 +936,7 @@ nouveau_bo_move(struct ttm_buffer_object *bo, bool evict,
bool intr,
 	}
 
 	/* Software copy if the card isn't up and running yet. */
-	if (!dev_priv->channel) {
+	if (!dev_priv->channel || nouveau_gpu_reset_in_progress(dev_priv->dev))
{
 		ret = ttm_bo_move_memcpy(bo, evict, no_wait_reserve, no_wait_gpu, new_mem);
 		goto out;
 	}
diff --git a/drivers/gpu/drm/nouveau/nouveau_channel.c
b/drivers/gpu/drm/nouveau/nouveau_channel.c
index 846afb0..c0fa5a7 100644
--- a/drivers/gpu/drm/nouveau/nouveau_channel.c
+++ b/drivers/gpu/drm/nouveau/nouveau_channel.c
@@ -420,7 +420,7 @@ nouveau_ioctl_fifo_alloc(struct drm_device *dev, void *data,
 				    init->fb_ctxdma_handle,
 				    init->tt_ctxdma_handle);
 	if (ret)
-		return ret;
+		goto out;
 	init->channel  = chan->id;
 
 	if (nouveau_vram_pushbuf == 0) {
@@ -450,6 +450,9 @@ nouveau_ioctl_fifo_alloc(struct drm_device *dev, void *data,
 	if (ret == 0)
 		atomic_inc(&chan->users); /* userspace reference */
 	nouveau_channel_put(&chan);
+out:
+	if (ret == -EIO)
+		ret = nouveau_reset_device(dev);
 	return ret;
 }
 
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.c
b/drivers/gpu/drm/nouveau/nouveau_drv.c
index 090fff6..261e1f5 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.c
@@ -237,7 +237,7 @@ nouveau_pci_suspend(struct pci_dev *pdev, pm_message_t
pm_state)
 		if (!dev_priv->eng[e])
 			continue;
 
-		ret = dev_priv->eng[e]->fini(dev, e, true);
+		ret = dev_priv->eng[e]->fini(dev, e,
!nouveau_gpu_reset_in_progress(dev));
 		if (ret) {
 			NV_ERROR(dev, "... engine %d failed: %d\n", e, ret);
 			goto out_abort;
@@ -443,11 +443,63 @@ nouveau_pci_resume(struct pci_dev *pdev)
 	return 0;
 }
 
+void intr_rwsem_init(struct intr_rwsem *r)
+{
+	init_rwsem(&r->rwsem);
+	mutex_init(&r->mutex);
+}
+
+int intr_rwsem_down_read_interruptible(struct intr_rwsem *r)
+{
+	while (down_read_trylock(&r->rwsem) == 0) {
+		int ret = mutex_lock_interruptible(&r->mutex);
+		if (ret)
+			return ret;
+		mutex_unlock(&r->mutex);
+	}
+	return 0;
+}
+
+void intr_rwsem_up_read(struct intr_rwsem *r)
+{
+	up_read(&r->rwsem);
+}
+
+void intr_rwsem_down_write(struct intr_rwsem *r)
+{
+	mutex_lock(&r->mutex);
+	down_write(&r->rwsem);
+}
+
+void intr_rwsem_up_write(struct intr_rwsem *r)
+{
+	up_write(&r->rwsem);
+	mutex_unlock(&r->mutex);
+}
+
+static long nouveau_ioctl(struct file *filp,
+	      unsigned int cmd, unsigned long arg)
+{
+	struct drm_file *file_priv = filp->private_data;
+	struct drm_device *dev = file_priv->minor->dev;
+	struct drm_nouveau_private *dev_priv = dev->dev_private;
+
+	long ret = intr_rwsem_down_read_interruptible(&dev_priv->ioctls_rwsem);
+	if (ret)
+		return ret;
+
+	ret = drm_ioctl(filp, cmd, arg);
+
+	intr_rwsem_up_read(&dev_priv->ioctls_rwsem);
+
+	return ret;
+}
+
 static const struct file_operations nouveau_driver_fops = {
 	.owner = THIS_MODULE,
 	.open = drm_open,
 	.release = drm_release,
-	.unlocked_ioctl = drm_ioctl,
+	.unlocked_ioctl = nouveau_ioctl,
 	.mmap = nouveau_ttm_mmap,
 	.poll = drm_poll,
 	.fasync = drm_fasync,
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h
b/drivers/gpu/drm/nouveau/nouveau_drv.h
index d120baf..ad146e7 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -704,9 +704,25 @@ enum nouveau_card_type {
 	NV_E0      = 0xe0,
 };
 
+struct intr_rwsem {
+	struct rw_semaphore rwsem;
+	struct mutex mutex;
+};
+
+extern void intr_rwsem_init(struct intr_rwsem *r);
+extern int  intr_rwsem_down_read_interruptible(struct intr_rwsem *r);
+extern void intr_rwsem_up_read(struct intr_rwsem *r);
+extern void intr_rwsem_down_write(struct intr_rwsem *r);
+extern void intr_rwsem_up_write(struct intr_rwsem *r);
+
 struct drm_nouveau_private {
 	struct drm_device *dev;
 	bool noaccel;
+	struct intr_rwsem ioctls_rwsem;
+
+	struct mutex reset_lock;
+	atomic_t gpureset_in_progress;
+	unsigned long last_gpu_reset;
 
 	/* the card type, takes NV_* as values */
 	enum nouveau_card_type card_type;
@@ -841,6 +857,7 @@ struct drm_nouveau_private {
 
 	struct {
 		struct dentry *channel_root;
+		struct dentry *reset;
 	} debugfs;
 
 	struct nouveau_fbdev *nfbdev;
@@ -1537,6 +1554,20 @@ int nouveau_display_dumb_map_offset(struct drm_file *,
struct drm_device *,
 				    uint32_t handle, uint64_t *offset);
 int nouveau_display_dumb_destroy(struct drm_file *, struct drm_device *,
 				 uint32_t handle);
+/* nouveau_reset.c */
+#ifdef CONFIG_DRM_NOUVEAU_DEBUG
+void nouveau_reset_debugfs_fini(struct drm_minor *minor);
+void nouveau_reset_debugfs_init(struct drm_minor *minor);
+#else
+static inline void nouveau_reset_debugfs_fini(struct drm_minor *minor) {}
+static inline void nouveau_reset_debugfs_init(struct drm_minor *minor) {}
+#endif
+int  nouveau_reset_device(struct drm_device *dev);
+static inline bool nouveau_gpu_reset_in_progress(struct drm_device *dev)
+{
+	struct drm_nouveau_private *dev_priv = dev->dev_private;
+	return atomic_read(&dev_priv->gpureset_in_progress) != 0;
+}
 
 /* nv10_gpio.c */
 int nv10_gpio_init(struct drm_device *dev);
@@ -1632,12 +1663,20 @@ static inline void nv_wr08(struct drm_device *dev,
unsigned reg, u8 val)
 	iowrite8(val, dev_priv->mmio + reg);
 }
 
+static inline uint64_t nv_timeout(struct drm_device *dev)
+{
+	uint64_t tm = 2000000000ULL;
+	if (nouveau_gpu_reset_in_progress(dev))
+		tm = 50000000; /* 50ms */
+	return tm;
+}
+
 #define nv_wait(dev, reg, mask, val) \
-	nouveau_wait_eq(dev, 2000000000ULL, (reg), (mask), (val))
+	nouveau_wait_eq(dev, nv_timeout(dev), (reg), (mask), (val))
 #define nv_wait_ne(dev, reg, mask, val) \
-	nouveau_wait_ne(dev, 2000000000ULL, (reg), (mask), (val))
+	nouveau_wait_ne(dev, nv_timeout(dev), (reg), (mask), (val))
 #define nv_wait_cb(dev, func, data) \
-	nouveau_wait_cb(dev, 2000000000ULL, (func), (data))
+	nouveau_wait_cb(dev, nv_timeout(dev), (func), (data))
 
 /* PRAMIN access */
 static inline u32 nv_ri32(struct drm_device *dev, unsigned offset)
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c
b/drivers/gpu/drm/nouveau/nouveau_fence.c
index 41ee17d..13d0176 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -233,17 +233,22 @@ int
 __nouveau_fence_wait(void *sync_obj, void *sync_arg, bool lazy, bool intr)
 {
 	struct nouveau_fence *fence = nouveau_fence(sync_obj);
+	struct drm_device *dev = fence->channel->dev;
 	unsigned long timeout = fence->timeout;
 	unsigned long sleep_time = NSEC_PER_MSEC / 1000;
 	ktime_t t;
 	int ret = 0;
 
+	if (nouveau_gpu_reset_in_progress(dev))
+		timeout = jiffies + DRM_HZ / 5;
+
 	while (1) {
 		if (__nouveau_fence_signalled(sync_obj, sync_arg))
 			break;
 
 		if (time_after_eq(jiffies, timeout)) {
-			ret = -EBUSY;
+			if (!nouveau_gpu_reset_in_progress(dev))
+				ret = -EIO;
 			break;
 		}
 
diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c
b/drivers/gpu/drm/nouveau/nouveau_gem.c
index ed52a6f..f9bbcc0 100644
--- a/drivers/gpu/drm/nouveau/nouveau_gem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
@@ -214,7 +214,7 @@ nouveau_gem_ioctl_new(struct drm_device *dev, void *data,
 			      req->info.domain, req->info.tile_mode,
 			      req->info.tile_flags, &nvbo);
 	if (ret)
-		return ret;
+		goto out;
 
 	ret = drm_gem_handle_create(file_priv, nvbo->gem,
&req->info.handle);
 	if (ret == 0) {
@@ -225,6 +225,9 @@ nouveau_gem_ioctl_new(struct drm_device *dev, void *data,
 
 	/* drop reference from allocate - handle holds it now */
 	drm_gem_object_unreference_unlocked(nvbo->gem);
+out:
+	if (ret == -EIO)
+		ret = nouveau_reset_device(dev);
 	return ret;
 }
 
@@ -804,6 +807,9 @@ out_next:
 	}
 
 	nouveau_channel_put(&chan);
+
+	if (ret == -EIO)
+		ret = nouveau_reset_device(dev);
 	return ret;
 }
 
@@ -839,6 +845,9 @@ nouveau_gem_ioctl_cpu_prep(struct drm_device *dev, void
*data,
 	ret = ttm_bo_wait(&nvbo->bo, true, true, no_wait);
 	spin_unlock(&nvbo->bo.bdev->fence_lock);
 	drm_gem_object_unreference_unlocked(gem);
+
+	if (ret == -EIO)
+		ret = nouveau_reset_device(dev);
 	return ret;
 }
 
@@ -863,6 +872,9 @@ nouveau_gem_ioctl_info(struct drm_device *dev, void *data,
 
 	ret = nouveau_gem_info(file_priv, gem, req);
 	drm_gem_object_unreference_unlocked(gem);
+
+	if (ret == -EIO)
+		ret = nouveau_reset_device(dev);
 	return ret;
 }
 
diff --git a/drivers/gpu/drm/nouveau/nouveau_notifier.c
b/drivers/gpu/drm/nouveau/nouveau_notifier.c
index 2ef883c..e224b1c 100644
--- a/drivers/gpu/drm/nouveau/nouveau_notifier.c
+++ b/drivers/gpu/drm/nouveau/nouveau_notifier.c
@@ -200,5 +200,8 @@ nouveau_ioctl_notifier_alloc(struct drm_device *dev, void
*data,
 	ret = nouveau_notifier_alloc(chan, na->handle, na->size, 0, 0x1000,
 				     &na->offset);
 	nouveau_channel_put(&chan);
+
+	if (ret == -EIO)
+		ret = nouveau_reset_device(dev);
 	return ret;
 }
diff --git a/drivers/gpu/drm/nouveau/nouveau_object.c
b/drivers/gpu/drm/nouveau/nouveau_object.c
index cc419fa..ba592b0 100644
--- a/drivers/gpu/drm/nouveau/nouveau_object.c
+++ b/drivers/gpu/drm/nouveau/nouveau_object.c
@@ -973,6 +973,9 @@ int nouveau_ioctl_grobj_alloc(struct drm_device *dev, void
*data,
 
 out:
 	nouveau_channel_put(&chan);
+
+	if (ret == -EIO)
+		ret = nouveau_reset_device(dev);
 	return ret;
 }
 
@@ -992,6 +995,9 @@ int nouveau_ioctl_gpuobj_free(struct drm_device *dev, void
*data,
 
 	ret = nouveau_ramht_remove(chan, objfree->handle);
 	nouveau_channel_put(&chan);
+
+	if (ret == -EIO)
+		ret = nouveau_reset_device(dev);
 	return ret;
 }
 
diff --git a/drivers/gpu/drm/nouveau/nouveau_reset.c
b/drivers/gpu/drm/nouveau/nouveau_reset.c
new file mode 100644
index 0000000..e893096
--- /dev/null
+++ b/drivers/gpu/drm/nouveau/nouveau_reset.c
@@ -0,0 +1,148 @@
+/*
+ * Copyright (C) 2012 Marcin Slusarz <marcin.slusarz at gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction,
including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include <linux/debugfs.h>
+#include "drmP.h"
+#include "nouveau_drv.h"
+
+static bool off(struct drm_device *dev)
+{
+	struct pci_dev *pdev = dev->pdev;
+	struct drm_nouveau_private *dev_priv = dev->dev_private;
+
+	pm_message_t pmm = { .event = PM_EVENT_SUSPEND };
+	atomic_inc(&dev_priv->gpureset_in_progress);
+	intr_rwsem_down_write(&dev_priv->ioctls_rwsem);
+
+	dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
+	if (nouveau_pci_suspend(pdev, pmm))
+		goto fail;
+
+	dev->switch_power_state = DRM_SWITCH_POWER_OFF;
+	return true;
+
+fail:
+	dev->switch_power_state = DRM_SWITCH_POWER_ON;
+	intr_rwsem_up_write(&dev_priv->ioctls_rwsem);
+	return false;
+}
+
+static void on(struct drm_device *dev)
+{
+	struct pci_dev *pdev = dev->pdev;
+	struct drm_nouveau_private *dev_priv = dev->dev_private;
+
+	dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
+	atomic_dec(&dev_priv->gpureset_in_progress);
+	nouveau_pci_resume(pdev);
+	dev->switch_power_state = DRM_SWITCH_POWER_ON;
+
+	dev_priv->last_gpu_reset = jiffies;
+	intr_rwsem_up_write(&dev_priv->ioctls_rwsem);
+}
+
+#ifdef CONFIG_DRM_NOUVEAU_DEBUG
+static ssize_t nouveau_reset_write(struct file *filp, const char __user *ubuf,
+			     size_t cnt, loff_t *ppos)
+{
+	struct drm_device *dev = filp->private_data;
+	struct drm_nouveau_private *dev_priv = dev->dev_private;
+	char usercmd[2];
+	if (cnt > 2)
+		cnt = 2;
+
+	if (copy_from_user(usercmd, ubuf, cnt))
+		return -EFAULT;
+
+	if (usercmd[0] == '1') {
+		int ret = intr_rwsem_down_read_interruptible(&dev_priv->ioctls_rwsem);
+		if (ret)
+			return ret;
+		nouveau_reset_device(dev);
+		intr_rwsem_up_read(&dev_priv->ioctls_rwsem);
+	}
+
+	return cnt;
+}
+
+static const struct file_operations nouveau_reset_fops = {
+	.owner = THIS_MODULE,
+	.open = simple_open,
+	.write = nouveau_reset_write,
+	.llseek = noop_llseek,
+};
+
+void nouveau_reset_debugfs_fini(struct drm_minor *minor)
+{
+	struct drm_device *dev = minor->dev;
+	struct drm_nouveau_private *dev_priv = dev->dev_private;
+
+	if (dev_priv->debugfs.reset) {
+		debugfs_remove(dev_priv->debugfs.reset);
+		dev_priv->debugfs.reset = NULL;
+	}
+}
+
+
+void nouveau_reset_debugfs_init(struct drm_minor *minor)
+{
+	struct drm_device *dev = minor->dev;
+	struct drm_nouveau_private *dev_priv = dev->dev_private;
+
+	dev_priv->debugfs.reset = debugfs_create_file("reset", 0200,
+			minor->debugfs_root, dev, &nouveau_reset_fops);
+	if (IS_ERR_OR_NULL(dev_priv->debugfs.reset))
+		dev_priv->debugfs.reset = NULL;
+
+}
+#endif
+
+int nouveau_reset_device(struct drm_device *dev)
+{
+	struct drm_nouveau_private *dev_priv = dev->dev_private;
+
+	if (mutex_trylock(&dev_priv->reset_lock) == 0)
+		/* gpu reset in progress */
+		goto out;
+
+	if (time_after_eq(jiffies, dev_priv->last_gpu_reset + 10 * DRM_HZ)) {
+		unsigned long start, end;
+
+		intr_rwsem_up_read(&dev_priv->ioctls_rwsem);
+		NV_INFO(dev, "GPU lockup detected, resetting...\n");
+		start = jiffies;
+		while (!off(dev))
+			;
+		on(dev);
+		end = jiffies;
+		NV_INFO(dev, "GPU reset done, took %lu s\n", (end - start) /
DRM_HZ);
+		while (intr_rwsem_down_read_interruptible(&dev_priv->ioctls_rwsem))
+			; /* not possible, we are holding reset_lock */
+	}
+	mutex_unlock(&dev_priv->reset_lock);
+
+out:
+	return -EAGAIN;
+}
diff --git a/drivers/gpu/drm/nouveau/nouveau_state.c
b/drivers/gpu/drm/nouveau/nouveau_state.c
index afec760..2fac5e5 100644
--- a/drivers/gpu/drm/nouveau/nouveau_state.c
+++ b/drivers/gpu/drm/nouveau/nouveau_state.c
@@ -697,6 +697,8 @@ nouveau_card_init(struct drm_device *dev)
 	if (ret)
 		goto out;
 	engine = &dev_priv->engine;
+	intr_rwsem_init(&dev_priv->ioctls_rwsem);
+	mutex_init(&dev_priv->reset_lock);
 	spin_lock_init(&dev_priv->channels.lock);
 	spin_lock_init(&dev_priv->tile.lock);
 	spin_lock_init(&dev_priv->context_switch_lock);
@@ -886,6 +888,7 @@ nouveau_card_init(struct drm_device *dev)
 
 		nouveau_fbcon_init(dev);
 	}
+	nouveau_reset_debugfs_init(dev->primary);
 
 	return 0;
 
@@ -943,6 +946,8 @@ static void nouveau_card_takedown(struct drm_device *dev)
 	struct nouveau_engine *engine = &dev_priv->engine;
 	int e;
 
+	nouveau_reset_debugfs_fini(dev->primary);
+
 	if (dev->mode_config.num_crtc) {
 		nouveau_fbcon_fini(dev);
 		nouveau_display_fini(dev);
@@ -1129,6 +1134,7 @@ int nouveau_load(struct drm_device *dev, unsigned long
flags)
 	}
 	dev->dev_private = dev_priv;
 	dev_priv->dev = dev;
+	atomic_set(&dev_priv->gpureset_in_progress, 0);
 
 	pci_set_master(dev->pdev);
 
diff --git a/drivers/gpu/drm/nouveau/nv50_graph.c
b/drivers/gpu/drm/nouveau/nv50_graph.c
index a61853f..d0a2e50 100644
--- a/drivers/gpu/drm/nouveau/nv50_graph.c
+++ b/drivers/gpu/drm/nouveau/nv50_graph.c
@@ -440,13 +440,14 @@ nv84_graph_tlb_flush(struct drm_device *dev, int engine)
 			ret = -ERESTARTSYS;
 			break;
 		}
-	} while (!idle && !(timeout = ptimer->read(dev) - start >
2000000000));
+	} while (!idle && !(timeout = ptimer->read(dev) - start >
nv_timeout(dev)));
 
 	if (timeout) {
-		NV_ERROR(dev, "PGRAPH TLB flush idle timeout fail: "
-			      "0x%08x 0x%08x 0x%08x 0x%08x\n",
-			 nv_rd32(dev, 0x400700), nv_rd32(dev, 0x400380),
-			 nv_rd32(dev, 0x400384), nv_rd32(dev, 0x400388));
+		if (!nouveau_gpu_reset_in_progress(dev))
+			NV_ERROR(dev, "PGRAPH TLB flush idle timeout fail: "
+				"0x%08x 0x%08x 0x%08x 0x%08x\n",
+				nv_rd32(dev, 0x400700), nv_rd32(dev, 0x400380),
+				nv_rd32(dev, 0x400384), nv_rd32(dev, 0x400388));
 		ret = -EIO;
 	}
 
-- 
1.7.8.5

Marcin Slusarz

2012-Apr-25 21:32 UTC

head link

[Nouveau] [PATCH v2 4/4] drm/nouveau: gpu lockup recovery

On Wed, Apr 25, 2012 at 11:20:36PM +0200, Marcin Slusarz wrote:
> Overall idea:
> Detect lockups by watching for timeouts (vm flush / fence), return -EIOs,
> handle them at ioctl level, reset the GPU and repeat last ioctl.
> 
> GPU reset is done by doing suspend / resume cycle with few tweaks:
> - CPU-only bo eviction
> - ignoring vm flush / fence timeouts
> - shortening waits
> 
> Signed-off-by: Marcin Slusarz <marcin.slusarz at gmail.com>
> ---
What changed from v1:
- moved ioctl locking from drm core to nouveau
- made down_reads interruptible
- fixed build bug on 32-bit systems

Ben Skeggs

2012-Apr-26 07:32 UTC

head link

[Nouveau] [PATCH v2 4/4] drm/nouveau: gpu lockup recovery

On Wed, 2012-04-25 at 23:20 +0200, Marcin Slusarz wrote:
> Overall idea:
> Detect lockups by watching for timeouts (vm flush / fence), return -EIOs,
> handle them at ioctl level, reset the GPU and repeat last ioctl.
> 
> GPU reset is done by doing suspend / resume cycle with few tweaks:
> - CPU-only bo eviction
> - ignoring vm flush / fence timeouts
> - shortening waits

Okay.  I've thought about this a bit for a couple of days and think I'll
be able to coherently share my thoughts on this issue now :)

Firstly, while I agree that we need to become more resilient to errors,
I don't think that following in the radeon/intel footsteps with
something (imo, hackish) like this is the right choice for us
necessarily.

The *vast* majority of "lockups" we have are as a result of us badly
mishandling exceptions reported to us by the GPU.  There are a couple of
exceptions, however, they're very rare..

A very common example is where people gain DMA_PUSHERs for whatever
reason, and things go haywire eventually.  To handle a DMA_PUSHER
sanely, generally you have to drop all pending commands for the channel
(set GET=PUT, etc) and continue on.  However, this leaves us with fences
and semaphores unsignalled etc, causing issues further up the stack with
perfectly good channels hanging on attempting to sync with the crashed
channel etc.

The next most common example I can think of is nv4x hardware, getting a
LIMIT_COLOR/ZETA exception from PGRAPH, and then a hang.  The solution
is simple, learn how to handle the exception, log it, and PGRAPH
survives.

I strongly believe that if we focused our efforts on dealing with what
the GPU reports to us a lot better, we'll find we really don't need such
"lockup recovery".

I am, however, considering pulling the vm flush timeout error
propagation and break-out-of-waits-on-signals that builds on it.  As we
really do need to become better at having killable processes if things
go wrong :)

Ben.
> 
> Signed-off-by: Marcin Slusarz <marcin.slusarz at gmail.com>
> ---
>  drivers/gpu/drm/nouveau/Makefile           |    2 +-
>  drivers/gpu/drm/nouveau/nouveau_bo.c       |    2 +-
>  drivers/gpu/drm/nouveau/nouveau_channel.c  |    5 +-
>  drivers/gpu/drm/nouveau/nouveau_drv.c      |   56 ++++++++++-
>  drivers/gpu/drm/nouveau/nouveau_drv.h      |   45 ++++++++-
>  drivers/gpu/drm/nouveau/nouveau_fence.c    |    7 +-
>  drivers/gpu/drm/nouveau/nouveau_gem.c      |   14 +++-
>  drivers/gpu/drm/nouveau/nouveau_notifier.c |    3 +
>  drivers/gpu/drm/nouveau/nouveau_object.c   |    6 +
>  drivers/gpu/drm/nouveau/nouveau_reset.c    |  148
++++++++++++++++++++++++++++
>  drivers/gpu/drm/nouveau/nouveau_state.c    |    6 +
>  drivers/gpu/drm/nouveau/nv50_graph.c       |   11 +-
>  12 files changed, 290 insertions(+), 15 deletions(-)
>  create mode 100644 drivers/gpu/drm/nouveau/nouveau_reset.c
> 
> diff --git a/drivers/gpu/drm/nouveau/Makefile
b/drivers/gpu/drm/nouveau/Makefile
> index 03860f5..77d0c33 100644
> --- a/drivers/gpu/drm/nouveau/Makefile
> +++ b/drivers/gpu/drm/nouveau/Makefile
> @@ -9,7 +9,7 @@ nouveau-y := nouveau_drv.o nouveau_state.o
nouveau_channel.o nouveau_mem.o \
>               nouveau_bo.o nouveau_fence.o nouveau_gem.o nouveau_ttm.o \
>               nouveau_hw.o nouveau_calc.o nouveau_bios.o nouveau_i2c.o \
>               nouveau_display.o nouveau_connector.o nouveau_fbcon.o \
> -             nouveau_hdmi.o nouveau_dp.o nouveau_ramht.o \
> +             nouveau_hdmi.o nouveau_dp.o nouveau_ramht.o nouveau_reset.o \
>  	     nouveau_pm.o nouveau_volt.o nouveau_perf.o nouveau_temp.o \
>  	     nouveau_mm.o nouveau_vm.o nouveau_mxm.o nouveau_gpio.o \
>               nv04_timer.o \
> diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c
b/drivers/gpu/drm/nouveau/nouveau_bo.c
> index 5b0dc50..7de6cad 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_bo.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
> @@ -936,7 +936,7 @@ nouveau_bo_move(struct ttm_buffer_object *bo, bool
evict, bool intr,
>  	}
>  
>  	/* Software copy if the card isn't up and running yet. */
> -	if (!dev_priv->channel) {
> +	if (!dev_priv->channel ||
nouveau_gpu_reset_in_progress(dev_priv->dev)) {
>  		ret = ttm_bo_move_memcpy(bo, evict, no_wait_reserve, no_wait_gpu,
new_mem);
>  		goto out;
>  	}
> diff --git a/drivers/gpu/drm/nouveau/nouveau_channel.c
b/drivers/gpu/drm/nouveau/nouveau_channel.c
> index 846afb0..c0fa5a7 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_channel.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_channel.c
> @@ -420,7 +420,7 @@ nouveau_ioctl_fifo_alloc(struct drm_device *dev, void
*data,
>  				    init->fb_ctxdma_handle,
>  				    init->tt_ctxdma_handle);
>  	if (ret)
> -		return ret;
> +		goto out;
>  	init->channel  = chan->id;
>  
>  	if (nouveau_vram_pushbuf == 0) {
> @@ -450,6 +450,9 @@ nouveau_ioctl_fifo_alloc(struct drm_device *dev, void
*data,
>  	if (ret == 0)
>  		atomic_inc(&chan->users); /* userspace reference */
>  	nouveau_channel_put(&chan);
> +out:
> +	if (ret == -EIO)
> +		ret = nouveau_reset_device(dev);
>  	return ret;
>  }
>  
> diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.c
b/drivers/gpu/drm/nouveau/nouveau_drv.c
> index 090fff6..261e1f5 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_drv.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_drv.c
> @@ -237,7 +237,7 @@ nouveau_pci_suspend(struct pci_dev *pdev, pm_message_t
pm_state)
>  		if (!dev_priv->eng[e])
>  			continue;
>  
> -		ret = dev_priv->eng[e]->fini(dev, e, true);
> +		ret = dev_priv->eng[e]->fini(dev, e,
!nouveau_gpu_reset_in_progress(dev));
>  		if (ret) {
>  			NV_ERROR(dev, "... engine %d failed: %d\n", e, ret);
>  			goto out_abort;
> @@ -443,11 +443,63 @@ nouveau_pci_resume(struct pci_dev *pdev)
>  	return 0;
>  }
>  
> +void intr_rwsem_init(struct intr_rwsem *r)
> +{
> +	init_rwsem(&r->rwsem);
> +	mutex_init(&r->mutex);
> +}
> +
> +int intr_rwsem_down_read_interruptible(struct intr_rwsem *r)
> +{
> +	while (down_read_trylock(&r->rwsem) == 0) {
> +		int ret = mutex_lock_interruptible(&r->mutex);
> +		if (ret)
> +			return ret;
> +		mutex_unlock(&r->mutex);
> +	}
> +	return 0;
> +}
> +
> +void intr_rwsem_up_read(struct intr_rwsem *r)
> +{
> +	up_read(&r->rwsem);
> +}
> +
> +void intr_rwsem_down_write(struct intr_rwsem *r)
> +{
> +	mutex_lock(&r->mutex);
> +	down_write(&r->rwsem);
> +}
> +
> +void intr_rwsem_up_write(struct intr_rwsem *r)
> +{
> +	up_write(&r->rwsem);
> +	mutex_unlock(&r->mutex);
> +}
> +
> +static long nouveau_ioctl(struct file *filp,
> +	      unsigned int cmd, unsigned long arg)
> +{
> +	struct drm_file *file_priv = filp->private_data;
> +	struct drm_device *dev = file_priv->minor->dev;
> +	struct drm_nouveau_private *dev_priv = dev->dev_private;
> +
> +	long ret =
intr_rwsem_down_read_interruptible(&dev_priv->ioctls_rwsem);
> +	if (ret)
> +		return ret;
> +
> +	ret = drm_ioctl(filp, cmd, arg);
> +
> +	intr_rwsem_up_read(&dev_priv->ioctls_rwsem);
> +
> +	return ret;
> +}
> +
>  static const struct file_operations nouveau_driver_fops = {
>  	.owner = THIS_MODULE,
>  	.open = drm_open,
>  	.release = drm_release,
> -	.unlocked_ioctl = drm_ioctl,
> +	.unlocked_ioctl = nouveau_ioctl,
>  	.mmap = nouveau_ttm_mmap,
>  	.poll = drm_poll,
>  	.fasync = drm_fasync,
> diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h
b/drivers/gpu/drm/nouveau/nouveau_drv.h
> index d120baf..ad146e7 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_drv.h
> +++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
> @@ -704,9 +704,25 @@ enum nouveau_card_type {
>  	NV_E0      = 0xe0,
>  };
>  
> +struct intr_rwsem {
> +	struct rw_semaphore rwsem;
> +	struct mutex mutex;
> +};
> +
> +extern void intr_rwsem_init(struct intr_rwsem *r);
> +extern int  intr_rwsem_down_read_interruptible(struct intr_rwsem *r);
> +extern void intr_rwsem_up_read(struct intr_rwsem *r);
> +extern void intr_rwsem_down_write(struct intr_rwsem *r);
> +extern void intr_rwsem_up_write(struct intr_rwsem *r);
> +
>  struct drm_nouveau_private {
>  	struct drm_device *dev;
>  	bool noaccel;
> +	struct intr_rwsem ioctls_rwsem;
> +
> +	struct mutex reset_lock;
> +	atomic_t gpureset_in_progress;
> +	unsigned long last_gpu_reset;
>  
>  	/* the card type, takes NV_* as values */
>  	enum nouveau_card_type card_type;
> @@ -841,6 +857,7 @@ struct drm_nouveau_private {
>  
>  	struct {
>  		struct dentry *channel_root;
> +		struct dentry *reset;
>  	} debugfs;
>  
>  	struct nouveau_fbdev *nfbdev;
> @@ -1537,6 +1554,20 @@ int nouveau_display_dumb_map_offset(struct drm_file
*, struct drm_device *,
>  				    uint32_t handle, uint64_t *offset);
>  int nouveau_display_dumb_destroy(struct drm_file *, struct drm_device *,
>  				 uint32_t handle);
> +/* nouveau_reset.c */
> +#ifdef CONFIG_DRM_NOUVEAU_DEBUG
> +void nouveau_reset_debugfs_fini(struct drm_minor *minor);
> +void nouveau_reset_debugfs_init(struct drm_minor *minor);
> +#else
> +static inline void nouveau_reset_debugfs_fini(struct drm_minor *minor) {}
> +static inline void nouveau_reset_debugfs_init(struct drm_minor *minor) {}
> +#endif
> +int  nouveau_reset_device(struct drm_device *dev);
> +static inline bool nouveau_gpu_reset_in_progress(struct drm_device *dev)
> +{
> +	struct drm_nouveau_private *dev_priv = dev->dev_private;
> +	return atomic_read(&dev_priv->gpureset_in_progress) != 0;
> +}
>  
>  /* nv10_gpio.c */
>  int nv10_gpio_init(struct drm_device *dev);
> @@ -1632,12 +1663,20 @@ static inline void nv_wr08(struct drm_device *dev,
unsigned reg, u8 val)
>  	iowrite8(val, dev_priv->mmio + reg);
>  }
>  
> +static inline uint64_t nv_timeout(struct drm_device *dev)
> +{
> +	uint64_t tm = 2000000000ULL;
> +	if (nouveau_gpu_reset_in_progress(dev))
> +		tm = 50000000; /* 50ms */
> +	return tm;
> +}
> +
>  #define nv_wait(dev, reg, mask, val) \
> -	nouveau_wait_eq(dev, 2000000000ULL, (reg), (mask), (val))
> +	nouveau_wait_eq(dev, nv_timeout(dev), (reg), (mask), (val))
>  #define nv_wait_ne(dev, reg, mask, val) \
> -	nouveau_wait_ne(dev, 2000000000ULL, (reg), (mask), (val))
> +	nouveau_wait_ne(dev, nv_timeout(dev), (reg), (mask), (val))
>  #define nv_wait_cb(dev, func, data) \
> -	nouveau_wait_cb(dev, 2000000000ULL, (func), (data))
> +	nouveau_wait_cb(dev, nv_timeout(dev), (func), (data))
>  
>  /* PRAMIN access */
>  static inline u32 nv_ri32(struct drm_device *dev, unsigned offset)
> diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c
b/drivers/gpu/drm/nouveau/nouveau_fence.c
> index 41ee17d..13d0176 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_fence.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
> @@ -233,17 +233,22 @@ int
>  __nouveau_fence_wait(void *sync_obj, void *sync_arg, bool lazy, bool intr)
>  {
>  	struct nouveau_fence *fence = nouveau_fence(sync_obj);
> +	struct drm_device *dev = fence->channel->dev;
>  	unsigned long timeout = fence->timeout;
>  	unsigned long sleep_time = NSEC_PER_MSEC / 1000;
>  	ktime_t t;
>  	int ret = 0;
>  
> +	if (nouveau_gpu_reset_in_progress(dev))
> +		timeout = jiffies + DRM_HZ / 5;
> +
>  	while (1) {
>  		if (__nouveau_fence_signalled(sync_obj, sync_arg))
>  			break;
>  
>  		if (time_after_eq(jiffies, timeout)) {
> -			ret = -EBUSY;
> +			if (!nouveau_gpu_reset_in_progress(dev))
> +				ret = -EIO;
>  			break;
>  		}
>  
> diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c
b/drivers/gpu/drm/nouveau/nouveau_gem.c
> index ed52a6f..f9bbcc0 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_gem.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
> @@ -214,7 +214,7 @@ nouveau_gem_ioctl_new(struct drm_device *dev, void
*data,
>  			      req->info.domain, req->info.tile_mode,
>  			      req->info.tile_flags, &nvbo);
>  	if (ret)
> -		return ret;
> +		goto out;
>  
>  	ret = drm_gem_handle_create(file_priv, nvbo->gem,
&req->info.handle);
>  	if (ret == 0) {
> @@ -225,6 +225,9 @@ nouveau_gem_ioctl_new(struct drm_device *dev, void
*data,
>  
>  	/* drop reference from allocate - handle holds it now */
>  	drm_gem_object_unreference_unlocked(nvbo->gem);
> +out:
> +	if (ret == -EIO)
> +		ret = nouveau_reset_device(dev);
>  	return ret;
>  }
>  
> @@ -804,6 +807,9 @@ out_next:
>  	}
>  
>  	nouveau_channel_put(&chan);
> +
> +	if (ret == -EIO)
> +		ret = nouveau_reset_device(dev);
>  	return ret;
>  }
>  
> @@ -839,6 +845,9 @@ nouveau_gem_ioctl_cpu_prep(struct drm_device *dev, void
*data,
>  	ret = ttm_bo_wait(&nvbo->bo, true, true, no_wait);
>  	spin_unlock(&nvbo->bo.bdev->fence_lock);
>  	drm_gem_object_unreference_unlocked(gem);
> +
> +	if (ret == -EIO)
> +		ret = nouveau_reset_device(dev);
>  	return ret;
>  }
>  
> @@ -863,6 +872,9 @@ nouveau_gem_ioctl_info(struct drm_device *dev, void
*data,
>  
>  	ret = nouveau_gem_info(file_priv, gem, req);
>  	drm_gem_object_unreference_unlocked(gem);
> +
> +	if (ret == -EIO)
> +		ret = nouveau_reset_device(dev);
>  	return ret;
>  }
>  
> diff --git a/drivers/gpu/drm/nouveau/nouveau_notifier.c
b/drivers/gpu/drm/nouveau/nouveau_notifier.c
> index 2ef883c..e224b1c 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_notifier.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_notifier.c
> @@ -200,5 +200,8 @@ nouveau_ioctl_notifier_alloc(struct drm_device *dev,
void *data,
>  	ret = nouveau_notifier_alloc(chan, na->handle, na->size, 0, 0x1000,
>  				     &na->offset);
>  	nouveau_channel_put(&chan);
> +
> +	if (ret == -EIO)
> +		ret = nouveau_reset_device(dev);
>  	return ret;
>  }
> diff --git a/drivers/gpu/drm/nouveau/nouveau_object.c
b/drivers/gpu/drm/nouveau/nouveau_object.c
> index cc419fa..ba592b0 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_object.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_object.c
> @@ -973,6 +973,9 @@ int nouveau_ioctl_grobj_alloc(struct drm_device *dev,
void *data,
>  
>  out:
>  	nouveau_channel_put(&chan);
> +
> +	if (ret == -EIO)
> +		ret = nouveau_reset_device(dev);
>  	return ret;
>  }
>  
> @@ -992,6 +995,9 @@ int nouveau_ioctl_gpuobj_free(struct drm_device *dev,
void *data,
>  
>  	ret = nouveau_ramht_remove(chan, objfree->handle);
>  	nouveau_channel_put(&chan);
> +
> +	if (ret == -EIO)
> +		ret = nouveau_reset_device(dev);
>  	return ret;
>  }
>  
> diff --git a/drivers/gpu/drm/nouveau/nouveau_reset.c
b/drivers/gpu/drm/nouveau/nouveau_reset.c
> new file mode 100644
> index 0000000..e893096
> --- /dev/null
> +++ b/drivers/gpu/drm/nouveau/nouveau_reset.c
> @@ -0,0 +1,148 @@
> +/*
> + * Copyright (C) 2012 Marcin Slusarz <marcin.slusarz at gmail.com>
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining
> + * a copy of this software and associated documentation files (the
> + * "Software"), to deal in the Software without restriction,
including
> + * without limitation the rights to use, copy, modify, merge, publish,
> + * distribute, sublicense, and/or sell copies of the Software, and to
> + * permit persons to whom the Software is furnished to do so, subject to
> + * the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the
> + * next paragraph) shall be included in all copies or substantial
> + * portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
> + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
> + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
> + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
> + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +
> +#include <linux/debugfs.h>
> +#include "drmP.h"
> +#include "nouveau_drv.h"
> +
> +static bool off(struct drm_device *dev)
> +{
> +	struct pci_dev *pdev = dev->pdev;
> +	struct drm_nouveau_private *dev_priv = dev->dev_private;
> +
> +	pm_message_t pmm = { .event = PM_EVENT_SUSPEND };
> +	atomic_inc(&dev_priv->gpureset_in_progress);
> +	intr_rwsem_down_write(&dev_priv->ioctls_rwsem);
> +
> +	dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
> +	if (nouveau_pci_suspend(pdev, pmm))
> +		goto fail;
> +
> +	dev->switch_power_state = DRM_SWITCH_POWER_OFF;
> +	return true;
> +
> +fail:
> +	dev->switch_power_state = DRM_SWITCH_POWER_ON;
> +	intr_rwsem_up_write(&dev_priv->ioctls_rwsem);
> +	return false;
> +}
> +
> +static void on(struct drm_device *dev)
> +{
> +	struct pci_dev *pdev = dev->pdev;
> +	struct drm_nouveau_private *dev_priv = dev->dev_private;
> +
> +	dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
> +	atomic_dec(&dev_priv->gpureset_in_progress);
> +	nouveau_pci_resume(pdev);
> +	dev->switch_power_state = DRM_SWITCH_POWER_ON;
> +
> +	dev_priv->last_gpu_reset = jiffies;
> +	intr_rwsem_up_write(&dev_priv->ioctls_rwsem);
> +}
> +
> +#ifdef CONFIG_DRM_NOUVEAU_DEBUG
> +static ssize_t nouveau_reset_write(struct file *filp, const char __user
*ubuf,
> +			     size_t cnt, loff_t *ppos)
> +{
> +	struct drm_device *dev = filp->private_data;
> +	struct drm_nouveau_private *dev_priv = dev->dev_private;
> +	char usercmd[2];
> +	if (cnt > 2)
> +		cnt = 2;
> +
> +	if (copy_from_user(usercmd, ubuf, cnt))
> +		return -EFAULT;
> +
> +	if (usercmd[0] == '1') {
> +		int ret =
intr_rwsem_down_read_interruptible(&dev_priv->ioctls_rwsem);
> +		if (ret)
> +			return ret;
> +		nouveau_reset_device(dev);
> +		intr_rwsem_up_read(&dev_priv->ioctls_rwsem);
> +	}
> +
> +	return cnt;
> +}
> +
> +static const struct file_operations nouveau_reset_fops = {
> +	.owner = THIS_MODULE,
> +	.open = simple_open,
> +	.write = nouveau_reset_write,
> +	.llseek = noop_llseek,
> +};
> +
> +void nouveau_reset_debugfs_fini(struct drm_minor *minor)
> +{
> +	struct drm_device *dev = minor->dev;
> +	struct drm_nouveau_private *dev_priv = dev->dev_private;
> +
> +	if (dev_priv->debugfs.reset) {
> +		debugfs_remove(dev_priv->debugfs.reset);
> +		dev_priv->debugfs.reset = NULL;
> +	}
> +}
> +
> +
> +void nouveau_reset_debugfs_init(struct drm_minor *minor)
> +{
> +	struct drm_device *dev = minor->dev;
> +	struct drm_nouveau_private *dev_priv = dev->dev_private;
> +
> +	dev_priv->debugfs.reset = debugfs_create_file("reset", 0200,
> +			minor->debugfs_root, dev, &nouveau_reset_fops);
> +	if (IS_ERR_OR_NULL(dev_priv->debugfs.reset))
> +		dev_priv->debugfs.reset = NULL;
> +
> +}
> +#endif
> +
> +int nouveau_reset_device(struct drm_device *dev)
> +{
> +	struct drm_nouveau_private *dev_priv = dev->dev_private;
> +
> +	if (mutex_trylock(&dev_priv->reset_lock) == 0)
> +		/* gpu reset in progress */
> +		goto out;
> +
> +	if (time_after_eq(jiffies, dev_priv->last_gpu_reset + 10 * DRM_HZ)) {
> +		unsigned long start, end;
> +
> +		intr_rwsem_up_read(&dev_priv->ioctls_rwsem);
> +		NV_INFO(dev, "GPU lockup detected, resetting...\n");
> +		start = jiffies;
> +		while (!off(dev))
> +			;
> +		on(dev);
> +		end = jiffies;
> +		NV_INFO(dev, "GPU reset done, took %lu s\n", (end - start) /
DRM_HZ);
> +		while
(intr_rwsem_down_read_interruptible(&dev_priv->ioctls_rwsem))
> +			; /* not possible, we are holding reset_lock */
> +	}
> +	mutex_unlock(&dev_priv->reset_lock);
> +
> +out:
> +	return -EAGAIN;
> +}
> diff --git a/drivers/gpu/drm/nouveau/nouveau_state.c
b/drivers/gpu/drm/nouveau/nouveau_state.c
> index afec760..2fac5e5 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_state.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_state.c
> @@ -697,6 +697,8 @@ nouveau_card_init(struct drm_device *dev)
>  	if (ret)
>  		goto out;
>  	engine = &dev_priv->engine;
> +	intr_rwsem_init(&dev_priv->ioctls_rwsem);
> +	mutex_init(&dev_priv->reset_lock);
>  	spin_lock_init(&dev_priv->channels.lock);
>  	spin_lock_init(&dev_priv->tile.lock);
>  	spin_lock_init(&dev_priv->context_switch_lock);
> @@ -886,6 +888,7 @@ nouveau_card_init(struct drm_device *dev)
>  
>  		nouveau_fbcon_init(dev);
>  	}
> +	nouveau_reset_debugfs_init(dev->primary);
>  
>  	return 0;
>  
> @@ -943,6 +946,8 @@ static void nouveau_card_takedown(struct drm_device
*dev)
>  	struct nouveau_engine *engine = &dev_priv->engine;
>  	int e;
>  
> +	nouveau_reset_debugfs_fini(dev->primary);
> +
>  	if (dev->mode_config.num_crtc) {
>  		nouveau_fbcon_fini(dev);
>  		nouveau_display_fini(dev);
> @@ -1129,6 +1134,7 @@ int nouveau_load(struct drm_device *dev, unsigned
long flags)
>  	}
>  	dev->dev_private = dev_priv;
>  	dev_priv->dev = dev;
> +	atomic_set(&dev_priv->gpureset_in_progress, 0);
>  
>  	pci_set_master(dev->pdev);
>  
> diff --git a/drivers/gpu/drm/nouveau/nv50_graph.c
b/drivers/gpu/drm/nouveau/nv50_graph.c
> index a61853f..d0a2e50 100644
> --- a/drivers/gpu/drm/nouveau/nv50_graph.c
> +++ b/drivers/gpu/drm/nouveau/nv50_graph.c
> @@ -440,13 +440,14 @@ nv84_graph_tlb_flush(struct drm_device *dev, int
engine)
>  			ret = -ERESTARTSYS;
>  			break;
>  		}
> -	} while (!idle && !(timeout = ptimer->read(dev) - start >
2000000000));
> +	} while (!idle && !(timeout = ptimer->read(dev) - start >
nv_timeout(dev)));
>  
>  	if (timeout) {
> -		NV_ERROR(dev, "PGRAPH TLB flush idle timeout fail: "
> -			      "0x%08x 0x%08x 0x%08x 0x%08x\n",
> -			 nv_rd32(dev, 0x400700), nv_rd32(dev, 0x400380),
> -			 nv_rd32(dev, 0x400384), nv_rd32(dev, 0x400388));
> +		if (!nouveau_gpu_reset_in_progress(dev))
> +			NV_ERROR(dev, "PGRAPH TLB flush idle timeout fail: "
> +				"0x%08x 0x%08x 0x%08x 0x%08x\n",
> +				nv_rd32(dev, 0x400700), nv_rd32(dev, 0x400380),
> +				nv_rd32(dev, 0x400384), nv_rd32(dev, 0x400388));
>  		ret = -EIO;
>  	}
>

Marcin Slusarz

2012-Apr-28 14:56 UTC

head link

[Nouveau] [PATCH v2 4/4] drm/nouveau: gpu lockup recovery

On Wed, Apr 25, 2012 at 11:20:36PM +0200, Marcin Slusarz wrote:
> Overall idea:
> Detect lockups by watching for timeouts (vm flush / fence), return -EIOs,
> handle them at ioctl level, reset the GPU and repeat last ioctl.
> 
> GPU reset is done by doing suspend / resume cycle with few tweaks:
> - CPU-only bo eviction
> - ignoring vm flush / fence timeouts
> - shortening waits
> 
> Signed-off-by: Marcin Slusarz <marcin.slusarz at gmail.com>
> ---
Martin,

I'm wondering how the patch below (which builds upon the one above) affects
reclocking stability. I can't test it on my card, because it has only
one performance level. Can you test it on yours?

---
From: Marcin Slusarz <marcin.slusarz at gmail.com>
Subject: [PATCH] drm/nouveau: take ioctls_rwsem before reclocking

Signed-off-by: Marcin Slusarz <marcin.slusarz at gmail.com>
---
 drivers/gpu/drm/nouveau/nouveau_pm.c    |    6 ++++++
 drivers/gpu/drm/nouveau/nouveau_reset.c |    2 +-
 2 files changed, 7 insertions(+), 1 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_pm.c
b/drivers/gpu/drm/nouveau/nouveau_pm.c
index 34d591b..4716f39 100644
--- a/drivers/gpu/drm/nouveau/nouveau_pm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_pm.c
@@ -383,9 +383,15 @@ nouveau_pm_set_perflvl(struct device *d, struct
device_attribute *a,
 		       const char *buf, size_t count)
 {
 	struct drm_device *dev = pci_get_drvdata(to_pci_dev(d));
+	struct drm_nouveau_private *dev_priv = dev->dev_private;
 	int ret;
 
+	intr_rwsem_down_write(&dev_priv->ioctls_rwsem);
+
 	ret = nouveau_pm_profile_set(dev, buf);
+
+	intr_rwsem_up_write(&dev_priv->ioctls_rwsem);
+
 	if (ret)
 		return ret;
 	return strlen(buf);
diff --git a/drivers/gpu/drm/nouveau/nouveau_reset.c
b/drivers/gpu/drm/nouveau/nouveau_reset.c
index e893096..7c25a3c 100644
--- a/drivers/gpu/drm/nouveau/nouveau_reset.c
+++ b/drivers/gpu/drm/nouveau/nouveau_reset.c
@@ -139,7 +139,7 @@ int nouveau_reset_device(struct drm_device *dev)
 		end = jiffies;
 		NV_INFO(dev, "GPU reset done, took %lu s\n", (end - start) /
DRM_HZ);
 		while (intr_rwsem_down_read_interruptible(&dev_priv->ioctls_rwsem))
-			; /* not possible, we are holding reset_lock */
+			;
 	}
 	mutex_unlock(&dev_priv->reset_lock);
 
-- 
1.7.8.5

Marcin Slusarz

2012-May-27 19:52 UTC

head link

[Nouveau] [PATCH v2 4/4] drm/nouveau: gpu lockup recovery

From: Marcin Slusarz <marcin.slusarz at gmail.com>
Subject: [PATCH v4] drm/nouveau: gpu lockup recovery

Detect lockups by watching for vm flush / fence timeouts and signal them by
returning EIO. When EIOs are met at ioctl level, reset the card and repeat
last ioctl.

GPU reset is done by going through a suspend / resume cycle with a few tweaks:
- CPU-only bo eviction
- ignoring vm flush / fence timeouts
- shortening wait times

v2:
- move ioctl locking from drm core to nouveau
- make ioctl-side locking interruptible
- fix build bug on 32-bit systems

v3:
- make reset-side locking interruptible
- add module parameter to disable lockup recovery
- move reset code to nouveau_ioctl

v4:
- rebased on top of current nouveau-git

Signed-off-by: Marcin Slusarz <marcin.slusarz at gmail.com>
---
I skipped posting v3 because of a possible alternative approach to the problem,
but I find this patch useful for debugging, so I'm posting a rebased version
for other devs.
---
 drivers/gpu/drm/nouveau/Makefile        |    2 +-
 drivers/gpu/drm/nouveau/nouveau_bo.c    |    2 +-
 drivers/gpu/drm/nouveau/nouveau_drv.c   |   88 ++++++++++++++++-
 drivers/gpu/drm/nouveau/nouveau_drv.h   |   47 ++++++++-
 drivers/gpu/drm/nouveau/nouveau_fence.c |   10 ++-
 drivers/gpu/drm/nouveau/nouveau_reset.c |  166 +++++++++++++++++++++++++++++++
 drivers/gpu/drm/nouveau/nouveau_state.c |    6 +
 drivers/gpu/drm/nouveau/nv50_graph.c    |   11 +-
 8 files changed, 318 insertions(+), 14 deletions(-)
 create mode 100644 drivers/gpu/drm/nouveau/nouveau_reset.c

diff --git a/drivers/gpu/drm/nouveau/Makefile b/drivers/gpu/drm/nouveau/Makefile
index 338450e..1fa707c 100644
--- a/drivers/gpu/drm/nouveau/Makefile
+++ b/drivers/gpu/drm/nouveau/Makefile
@@ -10,7 +10,7 @@ nouveau-y := nouveau_device.o nouveau_subdev.o
nouveau_engine.o \
              nouveau_bo.o nouveau_fence.o nouveau_gem.o nouveau_ttm.o \
              nouveau_hw.o nouveau_calc.o nouveau_bios.o nouveau_i2c.o \
              nouveau_display.o nouveau_connector.o nouveau_fbcon.o \
-             nouveau_hdmi.o nouveau_dp.o nouveau_ramht.o \
+             nouveau_hdmi.o nouveau_dp.o nouveau_ramht.o nouveau_reset.o \
 	     nouveau_pm.o nouveau_volt.o nouveau_perf.o nouveau_therm.o \
 	     nouveau_mm.o nouveau_vm.o nouveau_mxm.o nouveau_gpio.o \
 	     nouveau_fanctl.o nouveau_abi16.o nouveau_agp.o \
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c
b/drivers/gpu/drm/nouveau/nouveau_bo.c
index f30a75a..6827f2e 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -1133,7 +1133,7 @@ nouveau_bo_move(struct ttm_buffer_object *bo, bool evict,
bool intr,
 	}
 
 	/* CPU copy if we have no accelerated method available */
-	if (!ndev->ttm.move) {
+	if (!ndev->ttm.move || nouveau_gpu_reset_in_progress(ndev)) {
 		ret = ttm_bo_move_memcpy(bo, evict, no_wait_reserve, no_wait_gpu, new_mem);
 		goto out;
 	}
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.c
b/drivers/gpu/drm/nouveau/nouveau_drv.c
index 79b3236..1dccfcc 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.c
@@ -131,6 +131,10 @@ MODULE_PARM_DESC(mxmdcb, "Santise DCB table according
to MXM-SIS");
 int nouveau_mxmdcb = 1;
 module_param_named(mxmdcb, nouveau_mxmdcb, int, 0400);
 
+MODULE_PARM_DESC(lockup_recovery, "Reset GPU on lockup (default:
1)\n");
+int nouveau_lockup_recovery = 1;
+module_param_named(lockup_recovery, nouveau_lockup_recovery, int, 0600);
+
 int nouveau_fbpercrtc;
 #if 0
 module_param_named(fbpercrtc, nouveau_fbpercrtc, int, 0400);
@@ -222,7 +226,7 @@ nouveau_pci_suspend(struct pci_dev *pdev, pm_message_t
pm_state)
 	}
 
 	NV_INFO(ndev, "Disabling engines...\n");
-	ret = nouveau_device_fini(ndev, true);
+	ret = nouveau_device_fini(ndev, !nouveau_gpu_reset_in_progress(ndev));
 	if (ret)
 		goto out_abort;
 
@@ -362,11 +366,91 @@ static struct drm_ioctl_desc nouveau_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_INFO, nouveau_gem_ioctl_info,
DRM_UNLOCKED|DRM_AUTH),
 };
 
+void intr_rwsem_init(struct intr_rwsem *r)
+{
+	atomic_set(&r->readers, 0);
+	mutex_init(&r->mutex);
+}
+
+int intr_rwsem_down_read_interruptible(struct intr_rwsem *r)
+{
+	int ret = mutex_lock_interruptible(&r->mutex);
+	if (ret)
+		return ret;
+	atomic_inc(&r->readers);
+	mutex_unlock(&r->mutex);
+	return 0;
+}
+
+void intr_rwsem_down_read(struct intr_rwsem *r)
+{
+	mutex_lock(&r->mutex);
+	atomic_inc(&r->readers);
+	mutex_unlock(&r->mutex);
+}
+
+void intr_rwsem_up_read(struct intr_rwsem *r)
+{
+	atomic_dec(&r->readers);
+}
+
+int intr_rwsem_down_write_interruptible(struct intr_rwsem *r)
+{
+	int ret = mutex_lock_interruptible(&r->mutex);
+	if (ret)
+		return ret;
+	while (atomic_read(&r->readers)) {
+		if (signal_pending(current)) {
+			mutex_unlock(&r->mutex);
+			return -EINTR;
+		}
+		cond_resched();
+	}
+
+	return 0;
+}
+
+void intr_rwsem_down_write(struct intr_rwsem *r)
+{
+	mutex_lock(&r->mutex);
+	while (atomic_read(&r->readers))
+		cond_resched();
+}
+
+void intr_rwsem_up_write(struct intr_rwsem *r)
+{
+	mutex_unlock(&r->mutex);
+}
+
+static long nouveau_ioctl(struct file *filp,
+	      unsigned int cmd, unsigned long arg)
+{
+	struct drm_file *file_priv = filp->private_data;
+	struct drm_device *dev = file_priv->minor->dev;
+	struct nouveau_device *ndev = dev->dev_private;
+
+	long ret = intr_rwsem_down_read_interruptible(&ndev->ioctls_rwsem);
+	if (ret)
+		return -ERESTARTSYS;
+
+	ret = drm_ioctl(filp, cmd, arg);
+
+	intr_rwsem_up_read(&ndev->ioctls_rwsem);
+
+	if (unlikely(ret == -EIO)) {
+		ret = nouveau_reset_device(ndev);
+		if (ret == -EINTR)
+			ret = -ERESTARTSYS;
+	}
+
+	return ret;
+}
+
 static const struct file_operations nouveau_driver_fops = {
 	.owner = THIS_MODULE,
 	.open = drm_open,
 	.release = drm_release,
-	.unlocked_ioctl = drm_ioctl,
+	.unlocked_ioctl = nouveau_ioctl,
 	.mmap = nouveau_ttm_mmap,
 	.poll = drm_poll,
 	.fasync = drm_fasync,
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h
b/drivers/gpu/drm/nouveau/nouveau_drv.h
index c1539b5..83573b5 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -481,8 +481,26 @@ enum nouveau_card_type {
 	NV_E0      = 0xe0,
 };
 
+struct intr_rwsem {
+	struct mutex mutex;
+	atomic_t readers;
+};
+
+extern void intr_rwsem_init(struct intr_rwsem *r);
+extern void intr_rwsem_down_read(struct intr_rwsem *r);
+extern int  intr_rwsem_down_read_interruptible(struct intr_rwsem *r);
+extern void intr_rwsem_up_read(struct intr_rwsem *r);
+extern void intr_rwsem_down_write(struct intr_rwsem *r);
+extern int  intr_rwsem_down_write_interruptible(struct intr_rwsem *r);
+extern void intr_rwsem_up_write(struct intr_rwsem *r);
+
 struct nouveau_device {
 	struct drm_device *dev;
+	struct intr_rwsem ioctls_rwsem;
+
+	struct mutex reset_lock;
+	atomic_t gpureset_in_progress;
+	unsigned long last_gpu_reset;
 
 	/* the card type, takes NV_* as values */
 	enum nouveau_card_type card_type;
@@ -575,6 +593,7 @@ struct nouveau_device {
 
 	struct {
 		struct dentry *channel_root;
+		struct dentry *reset;
 	} debugfs;
 
 	struct nouveau_fbdev *nfbdev;
@@ -652,6 +671,7 @@ extern int nouveau_perflvl_wr;
 extern int nouveau_msi;
 extern int nouveau_ctxfw;
 extern int nouveau_mxmdcb;
+extern int nouveau_lockup_recovery;
 
 int nouveau_pci_suspend(struct pci_dev *pdev, pm_message_t pm_state);
 int nouveau_pci_resume(struct pci_dev *pdev);
@@ -926,6 +946,19 @@ int nouveau_display_dumb_map_offset(struct drm_file *,
struct drm_device *,
 				    u32 handle, u64 *offset);
 int nouveau_display_dumb_destroy(struct drm_file *, struct drm_device *,
 				 u32 handle);
+/* nouveau_reset.c */
+#ifdef CONFIG_DRM_NOUVEAU_DEBUG
+void nouveau_reset_debugfs_fini(struct drm_minor *minor);
+void nouveau_reset_debugfs_init(struct drm_minor *minor);
+#else
+static inline void nouveau_reset_debugfs_fini(struct drm_minor *minor) {}
+static inline void nouveau_reset_debugfs_init(struct drm_minor *minor) {}
+#endif
+int  nouveau_reset_device(struct nouveau_device *ndev);
+static inline bool nouveau_gpu_reset_in_progress(struct nouveau_device *ndev)
+{
+	return atomic_read(&ndev->gpureset_in_progress) != 0;
+}
 
 /* nv50_calc.c */
 int nv50_calc_pll(struct nouveau_device *, struct pll_lims *, int clk,
@@ -1001,12 +1034,20 @@ static inline void nv_wr08(struct nouveau_device *ndev,
unsigned reg, u8 val)
 	iowrite8(val, ndev->mmio + reg);
 }
 
+static inline uint64_t nv_timeout(struct nouveau_device *ndev)
+{
+	uint64_t tm = 2000000000ULL;
+	if (nouveau_gpu_reset_in_progress(ndev))
+		tm = 50000000; /* 50ms */
+	return tm;
+}
+
 #define nv_wait(dev, reg, mask, val) \
-	nouveau_wait_eq(dev, 2000000000ULL, (reg), (mask), (val))
+	nouveau_wait_eq(dev, nv_timeout(dev), (reg), (mask), (val))
 #define nv_wait_ne(dev, reg, mask, val) \
-	nouveau_wait_ne(dev, 2000000000ULL, (reg), (mask), (val))
+	nouveau_wait_ne(dev, nv_timeout(dev), (reg), (mask), (val))
 #define nv_wait_cb(dev, func, data) \
-	nouveau_wait_cb(dev, 2000000000ULL, (func), (data))
+	nouveau_wait_cb(dev, nv_timeout(dev), (func), (data))
 
 /* PRAMIN access */
 static inline u32 nv_ri32(struct nouveau_device *ndev, unsigned offset)
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c
b/drivers/gpu/drm/nouveau/nouveau_fence.c
index 19a2534..e55fc52 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -114,13 +114,19 @@ nouveau_fence_done(struct nouveau_fence *fence)
 int
 nouveau_fence_wait(struct nouveau_fence *fence, bool lazy, bool intr)
 {
+	struct nouveau_device *ndev = fence->channel->device;
+	unsigned long timeout = fence->timeout;
 	unsigned long sleep_time = NSEC_PER_MSEC / 1000;
 	ktime_t t;
 	int ret = 0;
 
+	if (nouveau_gpu_reset_in_progress(ndev))
+		timeout = jiffies + DRM_HZ / 5;
+
 	while (!nouveau_fence_done(fence)) {
-		if (fence->timeout && time_after_eq(jiffies, fence->timeout)) {
-			ret = -EBUSY;
+		if (fence->timeout && time_after_eq(jiffies, timeout)) {
+			if (!nouveau_gpu_reset_in_progress(ndev))
+				ret = -EIO;
 			break;
 		}
 
diff --git a/drivers/gpu/drm/nouveau/nouveau_reset.c
b/drivers/gpu/drm/nouveau/nouveau_reset.c
new file mode 100644
index 0000000..9df93e6
--- /dev/null
+++ b/drivers/gpu/drm/nouveau/nouveau_reset.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright (C) 2012 Marcin Slusarz <marcin.slusarz at gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction,
including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include <linux/debugfs.h>
+#include "drmP.h"
+#include "nouveau_drv.h"
+
+static int off(struct nouveau_device *ndev)
+{
+	struct drm_device *dev = ndev->dev;
+	struct pci_dev *pdev = dev->pdev;
+	int ret;
+
+	pm_message_t pmm = { .event = PM_EVENT_SUSPEND };
+	atomic_inc(&ndev->gpureset_in_progress);
+	ret = intr_rwsem_down_write_interruptible(&ndev->ioctls_rwsem);
+	if (ret)
+		goto fail2;
+
+	dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
+	ret = nouveau_pci_suspend(pdev, pmm);
+	if (ret)
+		goto fail;
+
+	dev->switch_power_state = DRM_SWITCH_POWER_OFF;
+	return 0;
+
+fail:
+	dev->switch_power_state = DRM_SWITCH_POWER_ON;
+	intr_rwsem_up_write(&ndev->ioctls_rwsem);
+fail2:
+	atomic_dec(&ndev->gpureset_in_progress);
+	return ret;
+}
+
+static void on(struct nouveau_device *ndev)
+{
+	struct drm_device *dev = ndev->dev;
+	struct pci_dev *pdev = dev->pdev;
+
+	dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
+	atomic_dec(&ndev->gpureset_in_progress);
+	nouveau_pci_resume(pdev);
+	dev->switch_power_state = DRM_SWITCH_POWER_ON;
+
+	ndev->last_gpu_reset = jiffies;
+	intr_rwsem_up_write(&ndev->ioctls_rwsem);
+}
+
+static int __nouveau_reset_device(struct nouveau_device *ndev, bool manual)
+{
+	int ret = -EAGAIN;
+	unsigned long start, end;
+	int offret;
+
+	if (mutex_trylock(&ndev->reset_lock) == 0)
+		/* gpu reset in progress */
+		return -EAGAIN;
+
+	if (time_before(jiffies, ndev->last_gpu_reset + 10 * DRM_HZ))
+		goto out;
+	if (!(nouveau_lockup_recovery || manual))
+		goto out;
+
+	if (manual)
+		NV_INFO(ndev, "Manual GPU reset invoked...\n");
+	else
+		NV_INFO(ndev, "GPU lockup detected, resetting... (process:
%s[%d])\n",
+				current->comm, task_pid_nr(current));
+
+	start = jiffies;
+	do {
+		offret = off(ndev);
+	} while (offret != 0 && offret != -EINTR);
+
+	if (offret == 0) {
+		on(ndev);
+		end = jiffies;
+		NV_INFO(ndev, "GPU reset done, took %lus\n", (end - start) /
DRM_HZ);
+	} else {
+		ret = offret;
+		end = jiffies;
+		NV_INFO(ndev, "GPU reset interrupted after %lus\n", (end - start) /
DRM_HZ);
+	}
+
+out:
+	mutex_unlock(&ndev->reset_lock);
+	return ret;
+}
+
+int nouveau_reset_device(struct nouveau_device *ndev)
+{
+	return __nouveau_reset_device(ndev, false);
+}
+
+#ifdef CONFIG_DRM_NOUVEAU_DEBUG
+static ssize_t nouveau_reset_write(struct file *filp, const char __user *ubuf,
+			     size_t cnt, loff_t *ppos)
+{
+	struct nouveau_device *ndev = filp->private_data;
+	char usercmd[2];
+	if (cnt > 2)
+		cnt = 2;
+
+	if (copy_from_user(usercmd, ubuf, cnt))
+		return -EFAULT;
+
+	if (usercmd[0] == '1')
+		__nouveau_reset_device(ndev, true);
+
+	return cnt;
+}
+
+static const struct file_operations nouveau_reset_fops = {
+	.owner = THIS_MODULE,
+	.open = simple_open,
+	.write = nouveau_reset_write,
+	.llseek = noop_llseek,
+};
+
+void nouveau_reset_debugfs_fini(struct drm_minor *minor)
+{
+	struct drm_device *dev = minor->dev;
+	struct nouveau_device *ndev = dev->dev_private;
+
+	if (ndev->debugfs.reset) {
+		debugfs_remove(ndev->debugfs.reset);
+		ndev->debugfs.reset = NULL;
+	}
+}
+
+
+void nouveau_reset_debugfs_init(struct drm_minor *minor)
+{
+	struct drm_device *dev = minor->dev;
+	struct nouveau_device *ndev = dev->dev_private;
+
+	ndev->debugfs.reset = debugfs_create_file("reset", 0200,
+			minor->debugfs_root, ndev, &nouveau_reset_fops);
+	if (IS_ERR_OR_NULL(ndev->debugfs.reset))
+		ndev->debugfs.reset = NULL;
+
+}
+#endif
diff --git a/drivers/gpu/drm/nouveau/nouveau_state.c
b/drivers/gpu/drm/nouveau/nouveau_state.c
index 628c46c..304b6a1 100644
--- a/drivers/gpu/drm/nouveau/nouveau_state.c
+++ b/drivers/gpu/drm/nouveau/nouveau_state.c
@@ -241,6 +241,8 @@ nouveau_card_init(struct nouveau_device *ndev)
 	if (ret)
 		goto out;
 	engine = &ndev->subsys;
+	intr_rwsem_init(&ndev->ioctls_rwsem);
+	mutex_init(&ndev->reset_lock);
 	spin_lock_init(&ndev->channels.lock);
 	spin_lock_init(&ndev->tile.lock);
 	spin_lock_init(&ndev->context_switch_lock);
@@ -323,6 +325,7 @@ nouveau_card_init(struct nouveau_device *ndev)
 
 		nouveau_fbcon_init(ndev);
 	}
+	nouveau_reset_debugfs_init(dev->primary);
 
 	return 0;
 
@@ -354,6 +357,8 @@ static void nouveau_card_takedown(struct nouveau_device
*ndev)
 	struct nouveau_subsys *engine = &ndev->subsys;
 	struct drm_device *dev = ndev->dev;
 
+	nouveau_reset_debugfs_fini(dev->primary);
+
 	if (dev->mode_config.num_crtc) {
 		nouveau_fbcon_fini(ndev);
 		nouveau_display_fini(ndev);
@@ -528,6 +533,7 @@ int nouveau_load(struct drm_device *dev, unsigned long
flags)
 	}
 	dev->dev_private = ndev;
 	ndev->dev = dev;
+	atomic_set(&ndev->gpureset_in_progress, 0);
 
 	pci_set_master(dev->pdev);
 
diff --git a/drivers/gpu/drm/nouveau/nv50_graph.c
b/drivers/gpu/drm/nouveau/nv50_graph.c
index ef6757f..26728100 100644
--- a/drivers/gpu/drm/nouveau/nv50_graph.c
+++ b/drivers/gpu/drm/nouveau/nv50_graph.c
@@ -247,13 +247,14 @@ nv84_graph_tlb_flush(struct nouveau_device *ndev, int
engine)
 			break;
 		}
 	} while (!idle &&
-		 !(timeout = ptimer->read(ptimer) - start > 2000000000));
+		 !(timeout = ptimer->read(ptimer) - start > nv_timeout(ndev)));
 
 	if (timeout) {
-		NV_ERROR(ndev, "PGRAPH TLB flush idle timeout fail: "
-			      "0x%08x 0x%08x 0x%08x 0x%08x\n",
-			 nv_rd32(ndev, 0x400700), nv_rd32(ndev, 0x400380),
-			 nv_rd32(ndev, 0x400384), nv_rd32(ndev, 0x400388));
+		if (!nouveau_gpu_reset_in_progress(ndev))
+			NV_ERROR(ndev, "PGRAPH TLB flush idle timeout fail: "
+				      "0x%08x 0x%08x 0x%08x 0x%08x\n",
+				 nv_rd32(ndev, 0x400700), nv_rd32(ndev, 0x400380),
+				 nv_rd32(ndev, 0x400384), nv_rd32(ndev, 0x400388));
 		ret = -EIO;
 	}
 
-- 
1.7.8.6

Marcin Slusarz

2012-Aug-05 21:15 UTC

head link

[Nouveau] [PATCH v2 4/4] drm/nouveau: gpu lockup recovery

Hi

I refreshed this patchset to current nouveau git.
http://people.freedesktop.org/~mslusarz/gpu-lockup-recovery/

Marcin

Apparently Analogous Threads

Search for more seemingly similar threads

Nouveau - Apr 2012 - [PATCH v2 4/4] drm/nouveau: gpu lockup recovery

[Nouveau] [PATCH v2 4/4] drm/nouveau: gpu lockup recovery

[Nouveau] [PATCH v2 4/4] drm/nouveau: gpu lockup recovery

[Nouveau] [PATCH v2 4/4] drm/nouveau: gpu lockup recovery

[Nouveau] [PATCH v2 4/4] drm/nouveau: gpu lockup recovery

[Nouveau] [PATCH v2 4/4] drm/nouveau: gpu lockup recovery

[Nouveau] [PATCH v2 4/4] drm/nouveau: gpu lockup recovery

Apparently Analogous Threads