Lyude
2017-Jan-12  02:25 UTC
[Nouveau] [PATCH v2 1/2] drm/nouveau: Don't enable polling twice on runtime resume
As it turns out, on cards that actually have CRTCs on them we're already
calling drm_kms_helper_poll_enable(drm_dev) from
nouveau_display_resume() before we call it in
nouveau_pmops_runtime_resume(). This leads us to accidentally trying to
enable polling twice, which results in a potential deadlock between the
RPM locks and drm_dev->mode_config.mutex if we end up trying to enable
polling the second time while output_poll_execute is running and holding
the mode_config lock. As such, make sure we only enable polling in
nouveau_pmops_runtime_resume() if we need to.
This fixes hangs observed on the ThinkPad W541.
Signed-off-by: Lyude <lyude at redhat.com>
Cc: Hans de Goede <hdegoede at redhat.com>
Cc: Kilian Singer <kilian.singer at quantumtechnology.info>
Cc: Lukas Wunner <lukas at wunner.de>
Cc: David Airlie <airlied at redhat.com>
---
Changes since v1:
 - Rebase to work with master
 drivers/gpu/drm/nouveau/nouveau_display.c | 3 ++-
 drivers/gpu/drm/nouveau/nouveau_drm.c     | 5 ++++-
 2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/nouveau_display.c
b/drivers/gpu/drm/nouveau/nouveau_display.c
index cef08da..6a15776 100644
--- a/drivers/gpu/drm/nouveau/nouveau_display.c
+++ b/drivers/gpu/drm/nouveau/nouveau_display.c
@@ -411,7 +411,8 @@ nouveau_display_init(struct drm_device *dev)
 		return ret;
 
 	/* enable polling for external displays */
-	drm_kms_helper_poll_enable(dev);
+	if (!dev->mode_config.poll_enabled)
+		drm_kms_helper_poll_enable(dev);
 
 	/* enable hotplug interrupts */
 	list_for_each_entry(connector, &dev->mode_config.connector_list, head)
{
diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c
b/drivers/gpu/drm/nouveau/nouveau_drm.c
index 59348fc..bc85a45 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drm.c
@@ -773,7 +773,10 @@ nouveau_pmops_runtime_resume(struct device *dev)
 	pci_set_master(pdev);
 
 	ret = nouveau_do_resume(drm_dev, true);
-	drm_kms_helper_poll_enable(drm_dev);
+
+	if (!drm_dev->mode_config.poll_enabled)
+		drm_kms_helper_poll_enable(drm_dev);
+
 	/* do magic */
 	nvif_mask(&device->object, 0x088488, (1 << 25), (1 << 25));
 	vga_switcheroo_set_dynamic_switch(pdev, VGA_SWITCHEROO_ON);
-- 
2.9.3
Lyude
2017-Jan-12  02:25 UTC
[Nouveau] [PATCH 2/2] drm/nouveau: Handle fbcon suspend/resume in separate worker
Resuming from RPM can happen while already holding
dev->mode_config.mutex. This means we can't actually handle fbcon in
any RPM resume workers, since restoring fbcon requires grabbing
dev->mode_config.mutex again. So move the fbcon suspend/resume code into
its own worker, and rely on that instead to avoid deadlocking.
This fixes more deadlocks for runtime suspending the GPU on the ThinkPad
W541. Reproduction recipe:
 - Get a machine with both optimus and a nvidia card with connectors
   attached to it
 - Wait for the nvidia GPU to suspend
 - Attempt to manually reprobe any of the connectors on the nvidia GPU
   using sysfs
 - *deadlock*
Signed-off-by: Lyude <lyude at redhat.com>
Cc: Hans de Goede <hdegoede at redhat.com>
Cc: Kilian Singer <kilian.singer at quantumtechnology.info>
Cc: Lukas Wunner <lukas at wunner.de>
Cc: David Airlie <airlied at redhat.com>
---
 drivers/gpu/drm/nouveau/nouveau_drv.h   |  2 ++
 drivers/gpu/drm/nouveau/nouveau_fbcon.c | 43 ++++++++++++++++++++++++++-------
 2 files changed, 36 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h
b/drivers/gpu/drm/nouveau/nouveau_drv.h
index 8d5ed5b..42c1fa5 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -165,6 +165,8 @@ struct nouveau_drm {
 	struct backlight_device *backlight;
 	struct list_head bl_connectors;
 	struct work_struct hpd_work;
+	struct work_struct fbcon_work;
+	int fbcon_new_state;
 #ifdef CONFIG_ACPI
 	struct notifier_block acpi_nb;
 #endif
diff --git a/drivers/gpu/drm/nouveau/nouveau_fbcon.c
b/drivers/gpu/drm/nouveau/nouveau_fbcon.c
index 2f2a3dc..87cd30b 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fbcon.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fbcon.c
@@ -470,19 +470,43 @@ static const struct drm_fb_helper_funcs
nouveau_fbcon_helper_funcs = {
 	.fb_probe = nouveau_fbcon_create,
 };
 
+static void
+nouveau_fbcon_set_suspend_work(struct work_struct *work)
+{
+	struct nouveau_drm *drm = container_of(work, typeof(*drm), fbcon_work);
+	int state = drm->fbcon_new_state;
+
+	if (state == FBINFO_STATE_RUNNING)
+		pm_runtime_get_sync(drm->dev->dev);
+
+	console_lock();
+	if (state == FBINFO_STATE_RUNNING)
+		nouveau_fbcon_accel_restore(drm->dev);
+	drm_fb_helper_set_suspend(&drm->fbcon->helper, state);
+	if (state != FBINFO_STATE_RUNNING)
+		nouveau_fbcon_accel_save_disable(drm->dev);
+	console_unlock();
+
+	if (state == FBINFO_STATE_RUNNING) {
+		pm_runtime_mark_last_busy(drm->dev->dev);
+		pm_runtime_put_sync(drm->dev->dev);
+	}
+}
+
 void
 nouveau_fbcon_set_suspend(struct drm_device *dev, int state)
 {
 	struct nouveau_drm *drm = nouveau_drm(dev);
-	if (drm->fbcon) {
-		console_lock();
-		if (state == FBINFO_STATE_RUNNING)
-			nouveau_fbcon_accel_restore(dev);
-		drm_fb_helper_set_suspend(&drm->fbcon->helper, state);
-		if (state != FBINFO_STATE_RUNNING)
-			nouveau_fbcon_accel_save_disable(dev);
-		console_unlock();
-	}
+
+	if (!drm->fbcon)
+		return;
+
+	drm->fbcon_new_state = state;
+	/* Since runtime resume can happen as a result of a sysfs operation,
+	 * it's possible we already have the console locked. So handle fbcon
+	 * init/deinit from a separate work thread
+	 */
+	schedule_work(&drm->fbcon_work);
 }
 
 int
@@ -502,6 +526,7 @@ nouveau_fbcon_init(struct drm_device *dev)
 		return -ENOMEM;
 
 	drm->fbcon = fbcon;
+	INIT_WORK(&drm->fbcon_work, nouveau_fbcon_set_suspend_work);
 
 	drm_fb_helper_prepare(dev, &fbcon->helper,
&nouveau_fbcon_helper_funcs);
 
-- 
2.9.3
Hans de Goede
2017-Jan-12  08:27 UTC
[Nouveau] [PATCH 2/2] drm/nouveau: Handle fbcon suspend/resume in separate worker
Hi, Good catch (both the previous patch as well as this one). I've one small comment inline: On 12-01-17 03:25, Lyude wrote:> Resuming from RPM can happen while already holding > dev->mode_config.mutex. This means we can't actually handle fbcon in > any RPM resume workers, since restoring fbcon requires grabbing > dev->mode_config.mutex again. So move the fbcon suspend/resume code into > it's own worker, and rely on that instead to avoid deadlocking. > > This fixes more deadlocks for runtime suspending the GPU on the ThinkPad > W541. Reproduction recipe: > > - Get a machine with both optimus and a nvidia card with connectors > attached to it > - Wait for the nvidia GPU to suspend > - Attempt to manually reprobe any of the connectors on the nvidia GPU > using sysfs > - *deadlock* > > Signed-off-by: Lyude <lyude at redhat.com> > Cc: Hans de Goede <hdegoede at redhat.com> > Cc: Kilian Singer <kilian.singer at quantumtechnology.info> > Cc: Lukas Wunner <lukas at wunner.de> > Cc: David Airlie <airlied at redhat.com> > --- > drivers/gpu/drm/nouveau/nouveau_drv.h | 2 ++ > drivers/gpu/drm/nouveau/nouveau_fbcon.c | 43 ++++++++++++++++++++++++++------- > 2 files changed, 36 insertions(+), 9 deletions(-) > > diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h > index 8d5ed5b..42c1fa5 100644 > --- a/drivers/gpu/drm/nouveau/nouveau_drv.h > +++ b/drivers/gpu/drm/nouveau/nouveau_drv.h > @@ -165,6 +165,8 @@ struct nouveau_drm { > struct backlight_device *backlight; > struct list_head bl_connectors; > struct work_struct hpd_work; > + struct work_struct fbcon_work; > + int fbcon_new_state; > #ifdef CONFIG_ACPI > struct notifier_block acpi_nb; > #endif > diff --git a/drivers/gpu/drm/nouveau/nouveau_fbcon.c b/drivers/gpu/drm/nouveau/nouveau_fbcon.c > index 2f2a3dc..87cd30b 100644 > --- a/drivers/gpu/drm/nouveau/nouveau_fbcon.c > +++ b/drivers/gpu/drm/nouveau/nouveau_fbcon.c > @@ -470,19 +470,43 @@ static const struct drm_fb_helper_funcs 
nouveau_fbcon_helper_funcs = { > .fb_probe = nouveau_fbcon_create, > }; > > +static void > +nouveau_fbcon_set_suspend_work(struct work_struct *work) > +{ > + struct nouveau_drm *drm = container_of(work, typeof(*drm), fbcon_work); > + int state = drm->fbcon_new_state;The compiler may decide to optimize away this variable and simply use drm->fbcon_new_state in the if-s below, which is racy. I would fix this by making drm->fbcon_new_state an atomic_t and using atomic_read(&drm->fbcon_new_state) here. (and atomic_set below). Regards, Hans> + > + if (state == FBINFO_STATE_RUNNING) > + pm_runtime_get_sync(drm->dev->dev); > + > + console_lock(); > + if (state == FBINFO_STATE_RUNNING) > + nouveau_fbcon_accel_restore(drm->dev); > + drm_fb_helper_set_suspend(&drm->fbcon->helper, state); > + if (state != FBINFO_STATE_RUNNING) > + nouveau_fbcon_accel_save_disable(drm->dev); > + console_unlock(); > + > + if (state == FBINFO_STATE_RUNNING) { > + pm_runtime_mark_last_busy(drm->dev->dev); > + pm_runtime_put_sync(drm->dev->dev); > + } > +} > + > void > nouveau_fbcon_set_suspend(struct drm_device *dev, int state) > { > struct nouveau_drm *drm = nouveau_drm(dev); > - if (drm->fbcon) { > - console_lock(); > - if (state == FBINFO_STATE_RUNNING) > - nouveau_fbcon_accel_restore(dev); > - drm_fb_helper_set_suspend(&drm->fbcon->helper, state); > - if (state != FBINFO_STATE_RUNNING) > - nouveau_fbcon_accel_save_disable(dev); > - console_unlock(); > - } > + > + if (!drm->fbcon) > + return; > + > + drm->fbcon_new_state = state; > + /* Since runtime resume can happen as a result of a sysfs operation, > + * it's possible we already have the console locked. 
So handle fbcon > + * init/deinit from a seperate work thread > + */ > + schedule_work(&drm->fbcon_work); > } > > int > @@ -502,6 +526,7 @@ nouveau_fbcon_init(struct drm_device *dev) > return -ENOMEM; > > drm->fbcon = fbcon; > + INIT_WORK(&drm->fbcon_work, nouveau_fbcon_set_suspend_work); > > drm_fb_helper_prepare(dev, &fbcon->helper, &nouveau_fbcon_helper_funcs); > >