Philipp Stanner
2025-Mar-27  08:42 UTC
[PATCH] drm/nouveau: Prevent signalled fences in pending list
Nouveau currently relies on the assumption that dma_fences will only
ever get signalled through nouveau_fence_signal(), which takes care of
removing a signalled fence from the list nouveau_fence_chan.pending.
This self-imposed rule is violated in nouveau_fence_done(), where
dma_fence_is_signaled() can signal the fence without removing it from
the list. This enables accesses to already signalled fences through the
list, which is a bug.
Furthermore, it must always be possible to use standard dma_fence
methods on a dma_fence and observe valid behavior. The canonical way of
ensuring that signalling a fence has additional effects is to add those
effects to a callback and register it on the fence.
Move the code from nouveau_fence_signal() into a dma_fence callback.
Register that callback when creating the fence.
Cc: <stable at vger.kernel.org> # 4.10+
Fixes: f54d1867005c ("dma-buf: Rename struct fence to dma_fence")
Signed-off-by: Philipp Stanner <phasta at kernel.org>
---
I'm not entirely sure what Fixes-Tag is appropriate. The last time the
line causing the signalled fence in the list was touched is the commit
listed above.
---
 drivers/gpu/drm/nouveau/nouveau_fence.c | 41 ++++++++++++++++---------
 drivers/gpu/drm/nouveau/nouveau_fence.h |  1 +
 2 files changed, 27 insertions(+), 15 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c
b/drivers/gpu/drm/nouveau/nouveau_fence.c
index 7cc84472cece..b2c2241a8803 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -50,24 +50,22 @@ nouveau_fctx(struct nouveau_fence *fence)
 	return container_of(fence->base.lock, struct nouveau_fence_chan, lock);
 }
 
-static int
-nouveau_fence_signal(struct nouveau_fence *fence)
+static void
+nouveau_fence_cleanup_cb(struct dma_fence *dfence, struct dma_fence_cb *cb)
 {
-	int drop = 0;
+	struct nouveau_fence_chan *fctx;
+	struct nouveau_fence *fence;
+
+	fence = container_of(dfence, struct nouveau_fence, base);
+	fctx = nouveau_fctx(fence);
 
-	dma_fence_signal_locked(&fence->base);
 	list_del(&fence->head);
 	rcu_assign_pointer(fence->channel, NULL);
 
-	if (test_bit(DMA_FENCE_FLAG_USER_BITS, &fence->base.flags)) {
-		struct nouveau_fence_chan *fctx = nouveau_fctx(fence);
-
-		if (!--fctx->notify_ref)
-			drop = 1;
-	}
+	if (test_bit(DMA_FENCE_FLAG_USER_BITS, &fence->base.flags))
+		--fctx->notify_ref;
 
 	dma_fence_put(&fence->base);
-	return drop;
 }
 
 static struct nouveau_fence *
@@ -93,7 +91,8 @@ nouveau_fence_context_kill(struct nouveau_fence_chan *fctx,
int error)
 		if (error)
 			dma_fence_set_error(&fence->base, error);
 
-		if (nouveau_fence_signal(fence))
+		dma_fence_signal_locked(&fence->base);
+		if (fctx->notify_ref == 0)
 			nvif_event_block(&fctx->event);
 	}
 	fctx->killed = 1;
@@ -131,7 +130,6 @@ static int
 nouveau_fence_update(struct nouveau_channel *chan, struct nouveau_fence_chan
*fctx)
 {
 	struct nouveau_fence *fence;
-	int drop = 0;
 	u32 seq = fctx->read(chan);
 
 	while (!list_empty(&fctx->pending)) {
@@ -140,10 +138,10 @@ nouveau_fence_update(struct nouveau_channel *chan, struct
nouveau_fence_chan *fc
 		if ((int)(seq - fence->base.seqno) < 0)
 			break;
 
-		drop |= nouveau_fence_signal(fence);
+		dma_fence_signal_locked(&fence->base);
 	}
 
-	return drop;
+	return fctx->notify_ref == 0 ? 1 : 0;
 }
 
 static void
@@ -235,6 +233,19 @@ nouveau_fence_emit(struct nouveau_fence *fence)
 			       &fctx->lock, fctx->context, ++fctx->sequence);
 	kref_get(&fctx->fence_ref);
 
+	fence->cb.func = nouveau_fence_cleanup_cb;
+	/* Adding a callback runs into __dma_fence_enable_signaling(), which will
+	 * ultimately run into nouveau_fence_no_signaling(), where a WARN_ON
+	 * would fire because the refcount can be dropped there.
+	 *
+	 * Increment the refcount here temporarily to work around that.
+	 */
+	dma_fence_get(&fence->base);
+	ret = dma_fence_add_callback(&fence->base, &fence->cb,
nouveau_fence_cleanup_cb);
+	dma_fence_put(&fence->base);
+	if (ret)
+		return ret;
+
 	ret = fctx->emit(fence);
 	if (!ret) {
 		dma_fence_get(&fence->base);
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.h
b/drivers/gpu/drm/nouveau/nouveau_fence.h
index 8bc065acfe35..e6b2df7fdc42 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.h
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.h
@@ -10,6 +10,7 @@ struct nouveau_bo;
 
 struct nouveau_fence {
 	struct dma_fence base;
+	struct dma_fence_cb cb;
 
 	struct list_head head;
 
-- 
2.48.1
Christian König
2025-Mar-27  08:56 UTC
[PATCH] drm/nouveau: Prevent signalled fences in pending list
Am 27.03.25 um 09:42 schrieb Philipp Stanner:
> Nouveau currently relies on the assumption that dma_fences will only
> ever get signalled through nouveau_fence_signal(), which takes care of
> removing a signalled fence from the list nouveau_fence_chan.pending.
>
> This self-imposed rule is violated in nouveau_fence_done(), where
> dma_fence_is_signaled() can signal the fence without removing it from
> the list. This enables accesses to already signalled fences through the
> list, which is a bug.
>
> Furthermore, it must always be possible to use standard dma_fence
> methods an a dma_fence and observe valid behavior. The canonical way of
> ensuring that signalling a fence has additional effects is to add those
> effects to a callback and register it on the fence.

Good catch.

>
> Move the code from nouveau_fence_signal() into a dma_fence callback.
> Register that callback when creating the fence.

But that's a really ugly approach. Either nouveau shouldn't implement the signaled callback, or it should make sure that when returning true from the signaled callback the fence is also removed from the pending list.

>
> Cc: <stable at vger.kernel.org> # 4.10+
> Fixes: f54d1867005c ("dma-buf: Rename struct fence to dma_fence")
> Signed-off-by: Philipp Stanner <phasta at kernel.org>
> ---
> I'm not entirely sure what Fixes-Tag is appropriate. The last time the
> line causing the signalled fence in the list was touched is the commit
> listed above.

Yeah, that's most likely not correct. My educated guess is that the bug was always there, just never discovered.

Regards,
Christian.

> ---
>  drivers/gpu/drm/nouveau/nouveau_fence.c | 41 ++++++++++++++++---------
>  drivers/gpu/drm/nouveau/nouveau_fence.h |  1 +
>  2 files changed, 27 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
> index 7cc84472cece..b2c2241a8803 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_fence.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
> @@ -50,24 +50,22 @@ nouveau_fctx(struct nouveau_fence *fence)
>  	return container_of(fence->base.lock, struct nouveau_fence_chan, lock);
>  }
>
> -static int
> -nouveau_fence_signal(struct nouveau_fence *fence)
> +static void
> +nouveau_fence_cleanup_cb(struct dma_fence *dfence, struct dma_fence_cb *cb)
>  {
> -	int drop = 0;
> +	struct nouveau_fence_chan *fctx;
> +	struct nouveau_fence *fence;
> +
> +	fence = container_of(dfence, struct nouveau_fence, base);
> +	fctx = nouveau_fctx(fence);
>
> -	dma_fence_signal_locked(&fence->base);
>  	list_del(&fence->head);
>  	rcu_assign_pointer(fence->channel, NULL);
>
> -	if (test_bit(DMA_FENCE_FLAG_USER_BITS, &fence->base.flags)) {
> -		struct nouveau_fence_chan *fctx = nouveau_fctx(fence);
> -
> -		if (!--fctx->notify_ref)
> -			drop = 1;
> -	}
> +	if (test_bit(DMA_FENCE_FLAG_USER_BITS, &fence->base.flags))
> +		--fctx->notify_ref;
>
>  	dma_fence_put(&fence->base);
> -	return drop;
>  }
>
>  static struct nouveau_fence *
> @@ -93,7 +91,8 @@ nouveau_fence_context_kill(struct nouveau_fence_chan *fctx, int error)
>  		if (error)
>  			dma_fence_set_error(&fence->base, error);
>
> -		if (nouveau_fence_signal(fence))
> +		dma_fence_signal_locked(&fence->base);
> +		if (fctx->notify_ref == 0)
>  			nvif_event_block(&fctx->event);
>  	}
>  	fctx->killed = 1;
> @@ -131,7 +130,6 @@ static int
>  nouveau_fence_update(struct nouveau_channel *chan, struct nouveau_fence_chan *fctx)
>  {
>  	struct nouveau_fence *fence;
> -	int drop = 0;
>  	u32 seq = fctx->read(chan);
>
>  	while (!list_empty(&fctx->pending)) {
> @@ -140,10 +138,10 @@ nouveau_fence_update(struct nouveau_channel *chan, struct nouveau_fence_chan *fc
>  		if ((int)(seq - fence->base.seqno) < 0)
>  			break;
>
> -		drop |= nouveau_fence_signal(fence);
> +		dma_fence_signal_locked(&fence->base);
>  	}
>
> -	return drop;
> +	return fctx->notify_ref == 0 ? 1 : 0;
>  }
>
>  static void
> @@ -235,6 +233,19 @@ nouveau_fence_emit(struct nouveau_fence *fence)
>  			       &fctx->lock, fctx->context, ++fctx->sequence);
>  	kref_get(&fctx->fence_ref);
>
> +	fence->cb.func = nouveau_fence_cleanup_cb;
> +	/* Adding a callback runs into __dma_fence_enable_signaling(), which will
> +	 * ultimately run into nouveau_fence_no_signaling(), where a WARN_ON
> +	 * would fire because the refcount can be dropped there.
> +	 *
> +	 * Increment the refcount here temporarily to work around that.
> +	 */
> +	dma_fence_get(&fence->base);
> +	ret = dma_fence_add_callback(&fence->base, &fence->cb, nouveau_fence_cleanup_cb);
> +	dma_fence_put(&fence->base);
> +	if (ret)
> +		return ret;
> +
>  	ret = fctx->emit(fence);
>  	if (!ret) {
>  		dma_fence_get(&fence->base);
> diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.h b/drivers/gpu/drm/nouveau/nouveau_fence.h
> index 8bc065acfe35..e6b2df7fdc42 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_fence.h
> +++ b/drivers/gpu/drm/nouveau/nouveau_fence.h
> @@ -10,6 +10,7 @@ struct nouveau_bo;
>
>  struct nouveau_fence {
>  	struct dma_fence base;
> +	struct dma_fence_cb cb;
>
>  	struct list_head head;
>