Christian König
2025-Sep-04 10:27 UTC
[PATCH v2] Revert "drm/nouveau: Remove waitque for sched teardown"
On 01.09.25 10:31, Philipp Stanner wrote:
> This reverts:
> 
> commit bead88002227 ("drm/nouveau: Remove waitque for sched teardown")
> commit 5f46f5c7af8c ("drm/nouveau: Add new callback for scheduler teardown")
> 
> from the drm/sched teardown leak fix series:
> 
> https://lore.kernel.org/dri-devel/20250710125412.128476-2-phasta at kernel.org/
> 
> The aforementioned series removed a blocking waitqueue from
> nouveau_sched_fini(). It was mistakenly assumed that this waitqueue only
> prevents jobs from leaking, which the series fixed.
> 
> The waitqueue, however, also guarantees that all VM_BIND related jobs
> are finished in order, cleaning up mappings in the GPU's MMU. These jobs
> must be executed sequentially. Without the waitqueue, this is no longer
> guaranteed, because entity and scheduler teardown can race with each
> other.

That sounds like exactly the kind of issue I tried to catch with the
recent dma_fence changes.

Going to keep working on that and potentially using this here as a
blueprint for something it should catch.

Regards,
Christian.

> 
> Revert all patches related to the waitqueue removal.
> 
> Fixes: bead88002227 ("drm/nouveau: Remove waitque for sched teardown")
> Suggested-by: Danilo Krummrich <dakr at kernel.org>
> Signed-off-by: Philipp Stanner <phasta at kernel.org>
> ---
> Changes in v2:
>   - Don't revert commit 89b2675198ab ("drm/nouveau: Make fence container helper usable driver-wide")
>   - Add Fixes-tag
> ---
>  drivers/gpu/drm/nouveau/nouveau_fence.c | 15 -----------
>  drivers/gpu/drm/nouveau/nouveau_fence.h |  1 -
>  drivers/gpu/drm/nouveau/nouveau_sched.c | 35 ++++++++++---------------
>  drivers/gpu/drm/nouveau/nouveau_sched.h |  9 ++++---
>  drivers/gpu/drm/nouveau/nouveau_uvmm.c  |  8 +++---
>  5 files changed, 24 insertions(+), 44 deletions(-)
> 
> diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
> index 9f345a008717..869d4335c0f4 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_fence.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
> @@ -240,21 +240,6 @@ nouveau_fence_emit(struct nouveau_fence *fence)
>  	return ret;
>  }
>  
> -void
> -nouveau_fence_cancel(struct nouveau_fence *fence)
> -{
> -	struct nouveau_fence_chan *fctx = nouveau_fctx(fence);
> -	unsigned long flags;
> -
> -	spin_lock_irqsave(&fctx->lock, flags);
> -	if (!dma_fence_is_signaled_locked(&fence->base)) {
> -		dma_fence_set_error(&fence->base, -ECANCELED);
> -		if (nouveau_fence_signal(fence))
> -			nvif_event_block(&fctx->event);
> -	}
> -	spin_unlock_irqrestore(&fctx->lock, flags);
> -}
> -
>  bool
>  nouveau_fence_done(struct nouveau_fence *fence)
>  {
> diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.h b/drivers/gpu/drm/nouveau/nouveau_fence.h
> index 9957a919bd38..183dd43ecfff 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_fence.h
> +++ b/drivers/gpu/drm/nouveau/nouveau_fence.h
> @@ -29,7 +29,6 @@ void nouveau_fence_unref(struct nouveau_fence **);
>  
>  int  nouveau_fence_emit(struct nouveau_fence *);
>  bool nouveau_fence_done(struct nouveau_fence *);
> -void nouveau_fence_cancel(struct nouveau_fence *fence);
>  int  nouveau_fence_wait(struct nouveau_fence *, bool lazy, bool intr);
>  int  nouveau_fence_sync(struct nouveau_bo *, struct nouveau_channel *, bool exclusive, bool intr);
>  
> diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c b/drivers/gpu/drm/nouveau/nouveau_sched.c
> index 0cc0bc9f9952..e60f7892f5ce 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_sched.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_sched.c
> @@ -11,7 +11,6 @@
>  #include "nouveau_exec.h"
>  #include "nouveau_abi16.h"
>  #include "nouveau_sched.h"
> -#include "nouveau_chan.h"
>  
>  #define NOUVEAU_SCHED_JOB_TIMEOUT_MS 10000
>  
> @@ -122,9 +121,11 @@ nouveau_job_done(struct nouveau_job *job)
>  {
>  	struct nouveau_sched *sched = job->sched;
>  
> -	spin_lock(&sched->job_list.lock);
> +	spin_lock(&sched->job.list.lock);
>  	list_del(&job->entry);
> -	spin_unlock(&sched->job_list.lock);
> +	spin_unlock(&sched->job.list.lock);
> +
> +	wake_up(&sched->job.wq);
>  }
>  
>  void
> @@ -305,9 +306,9 @@ nouveau_job_submit(struct nouveau_job *job)
>  	}
>  
>  	/* Submit was successful; add the job to the schedulers job list. */
> -	spin_lock(&sched->job_list.lock);
> -	list_add(&job->entry, &sched->job_list.head);
> -	spin_unlock(&sched->job_list.lock);
> +	spin_lock(&sched->job.list.lock);
> +	list_add(&job->entry, &sched->job.list.head);
> +	spin_unlock(&sched->job.list.lock);
>  
>  	drm_sched_job_arm(&job->base);
>  	job->done_fence = dma_fence_get(&job->base.s_fence->finished);
> @@ -392,23 +393,10 @@ nouveau_sched_free_job(struct drm_sched_job *sched_job)
>  	nouveau_job_fini(job);
>  }
>  
> -static void
> -nouveau_sched_cancel_job(struct drm_sched_job *sched_job)
> -{
> -	struct nouveau_fence *fence;
> -	struct nouveau_job *job;
> -
> -	job = to_nouveau_job(sched_job);
> -	fence = to_nouveau_fence(job->done_fence);
> -
> -	nouveau_fence_cancel(fence);
> -}
> -
>  static const struct drm_sched_backend_ops nouveau_sched_ops = {
>  	.run_job = nouveau_sched_run_job,
>  	.timedout_job = nouveau_sched_timedout_job,
>  	.free_job = nouveau_sched_free_job,
> -	.cancel_job = nouveau_sched_cancel_job,
>  };
>  
>  static int
> @@ -458,8 +446,9 @@ nouveau_sched_init(struct nouveau_sched *sched, struct nouveau_drm *drm,
>  		goto fail_sched;
>  
>  	mutex_init(&sched->mutex);
> -	spin_lock_init(&sched->job_list.lock);
> -	INIT_LIST_HEAD(&sched->job_list.head);
> +	spin_lock_init(&sched->job.list.lock);
> +	INIT_LIST_HEAD(&sched->job.list.head);
> +	init_waitqueue_head(&sched->job.wq);
>  
>  	return 0;
>  
> @@ -493,12 +482,16 @@ nouveau_sched_create(struct nouveau_sched **psched, struct nouveau_drm *drm,
>  	return 0;
>  }
>  
> +
>  static void
>  nouveau_sched_fini(struct nouveau_sched *sched)
>  {
>  	struct drm_gpu_scheduler *drm_sched = &sched->base;
>  	struct drm_sched_entity *entity = &sched->entity;
>  
> +	rmb(); /* for list_empty to work without lock */
> +	wait_event(sched->job.wq, list_empty(&sched->job.list.head));
> +
>  	drm_sched_entity_fini(entity);
>  	drm_sched_fini(drm_sched);
>  
> diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.h b/drivers/gpu/drm/nouveau/nouveau_sched.h
> index b98c3f0bef30..20cd1da8db73 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_sched.h
> +++ b/drivers/gpu/drm/nouveau/nouveau_sched.h
> @@ -103,9 +103,12 @@ struct nouveau_sched {
>  	struct mutex mutex;
>  
>  	struct {
> -		struct list_head head;
> -		spinlock_t lock;
> -	} job_list;
> +		struct {
> +			struct list_head head;
> +			spinlock_t lock;
> +		} list;
> +		struct wait_queue_head wq;
> +	} job;
>  };
>  
>  int nouveau_sched_create(struct nouveau_sched **psched, struct nouveau_drm *drm,
> diff --git a/drivers/gpu/drm/nouveau/nouveau_uvmm.c b/drivers/gpu/drm/nouveau/nouveau_uvmm.c
> index d94a85509176..79eefdfd08a2 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_uvmm.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_uvmm.c
> @@ -1019,8 +1019,8 @@ bind_validate_map_sparse(struct nouveau_job *job, u64 addr, u64 range)
>  	u64 end = addr + range;
>  
>  again:
> -	spin_lock(&sched->job_list.lock);
> -	list_for_each_entry(__job, &sched->job_list.head, entry) {
> +	spin_lock(&sched->job.list.lock);
> +	list_for_each_entry(__job, &sched->job.list.head, entry) {
>  		struct nouveau_uvmm_bind_job *bind_job = to_uvmm_bind_job(__job);
>  
>  		list_for_each_op(op, &bind_job->ops) {
> @@ -1030,7 +1030,7 @@ bind_validate_map_sparse(struct nouveau_job *job, u64 addr, u64 range)
>  
>  			if (!(end <= op_addr || addr >= op_end)) {
>  				nouveau_uvmm_bind_job_get(bind_job);
> -				spin_unlock(&sched->job_list.lock);
> +				spin_unlock(&sched->job.list.lock);
>  				wait_for_completion(&bind_job->complete);
>  				nouveau_uvmm_bind_job_put(bind_job);
>  				goto again;
> @@ -1038,7 +1038,7 @@ bind_validate_map_sparse(struct nouveau_job *job, u64 addr, u64 range)
>  			}
>  		}
>  	}
> -	spin_unlock(&sched->job_list.lock);
> +	spin_unlock(&sched->job.list.lock);
>  }
>  
>  static int
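The pattern at the heart of the revert is easy to miss in the diff:
nouveau_job_done() wakes a waitqueue after unlinking the job, and
nouveau_sched_fini() sleeps until the job list has drained. Below is a
minimal userspace analogue of that pattern, with a pthread condition
variable standing in for the kernel waitqueue; it is a sketch, not
nouveau code, and all names in it are illustrative.

/* Minimal userspace analogue (not nouveau code): teardown blocks until
 * the pending-job list is empty; a pthread condvar stands in for the
 * kernel waitqueue that the revert brings back. */
#include <pthread.h>
#include <stdio.h>

struct job {
	struct job *next;
};

static struct job *job_list;		/* cf. sched->job.list.head */
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t list_drained = PTHREAD_COND_INITIALIZER;

static void job_submit(struct job *job)	/* cf. nouveau_job_submit() */
{
	pthread_mutex_lock(&list_lock);
	job->next = job_list;
	job_list = job;
	pthread_mutex_unlock(&list_lock);
}

static void job_done(struct job *job)	/* cf. nouveau_job_done() */
{
	pthread_mutex_lock(&list_lock);
	job_list = job->next;		/* sketch: LIFO pop of the head */
	if (!job_list)
		pthread_cond_broadcast(&list_drained);	/* wake_up(&sched->job.wq) */
	pthread_mutex_unlock(&list_lock);
}

static void sched_fini(void)		/* cf. nouveau_sched_fini() */
{
	pthread_mutex_lock(&list_lock);
	while (job_list)		/* wait_event(wq, list_empty(head)) */
		pthread_cond_wait(&list_drained, &list_lock);
	pthread_mutex_unlock(&list_lock);
	printf("job list drained; safe to tear down entity and scheduler\n");
}

int main(void)
{
	struct job j;

	job_submit(&j);
	job_done(&j);	/* in the driver this runs on job completion */
	sched_fini();	/* returns immediately; nothing is pending */
	return 0;
}

Note that the kernel version waits without holding the lock (hence the
rmb() comment in the diff), whereas the condvar version above holds the
mutex across the emptiness check; both close the same window between
the last job completing and teardown proceeding.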
Philipp Stanner
2025-Sep-04 11:12 UTC
[PATCH v2] Revert "drm/nouveau: Remove waitque for sched teardown"
On Thu, 2025-09-04 at 12:27 +0200, Christian König wrote:
> On 01.09.25 10:31, Philipp Stanner wrote:
> > This reverts:
> > 
> > commit bead88002227 ("drm/nouveau: Remove waitque for sched teardown")
> > commit 5f46f5c7af8c ("drm/nouveau: Add new callback for scheduler teardown")
> > 
> > from the drm/sched teardown leak fix series:
> > 
> > https://lore.kernel.org/dri-devel/20250710125412.128476-2-phasta at kernel.org/
> > 
> > The aforementioned series removed a blocking waitqueue from
> > nouveau_sched_fini(). It was mistakenly assumed that this waitqueue only
> > prevents jobs from leaking, which the series fixed.
> > 
> > The waitqueue, however, also guarantees that all VM_BIND related jobs
> > are finished in order, cleaning up mappings in the GPU's MMU. These jobs
> > must be executed sequentially. Without the waitqueue, this is no longer
> > guaranteed, because entity and scheduler teardown can race with each
> > other.
> 
> That sounds like exactly the kind of issue I tried to catch with the
> recent dma_fence changes.

Link? :)

> Going to keep working on that and potentially using this here as a
> blueprint for something it should catch.

This is more like a nouveau-specific issue. The problem is that
unmapping mappings in the GPU's MMU must be done in a specific order,
and all the unmappings must be performed, not canceled.

For EXEC jobs, it's perfectly fine to cancel pending jobs, remove the
waitqueue and just rush through drm_sched_fini().

I don't know the issue you're describing, but I don't think a feature
in dma_fence could help with that specific Nouveau problem. dma_fence
can't force the driver to submit jobs in a specific order or to wait
until they're all completed.

Grüße
P.

> Regards,
> Christian.
> 
> > [snip]
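A toy model of the constraint Philipp describes, not driver code:
VM_BIND ops edit shared page-table state, so cancelling a queued unmap
at teardown leaves a stale entry behind, while running every op in
submission order ends consistent. All names below are invented for
illustration.

#include <stdbool.h>
#include <stdio.h>

#define NPAGES 4

struct bind_op {
	int page;
	bool map;	/* true = map, false = unmap */
	bool cancelled;
};

static bool pte[NPAGES];	/* toy stand-in for the GPU's page tables */

static void teardown(struct bind_op *ops, int n)
{
	/* Apply every op that was not cancelled, in submission order. */
	for (int i = 0; i < n; i++)
		if (!ops[i].cancelled)
			pte[ops[i].page] = ops[i].map;
}

int main(void)
{
	struct bind_op pending[] = {
		{ .page = 1, .map = true },
		{ .page = 1, .map = false },	/* the unmap VM_BIND relies on */
	};

	/* Cancelling the pending unmap (as a cancel_job-style teardown
	 * would) leaves a stale entry behind: */
	pending[1].cancelled = true;
	teardown(pending, 2);
	printf("page 1 %s\n", pte[1] ? "still mapped (stale!)" : "clean");

	/* Running every queued op, in order, ends consistent: */
	pending[1].cancelled = false;
	teardown(pending, 2);
	printf("page 1 %s\n", pte[1] ? "still mapped (stale!)" : "clean");
	return 0;
}

This is also why EXEC jobs are different in the thread above: they do
not mutate mapping state that later teardown depends on, so cancelling
them is harmless.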
Christian König
2025-Sep-04 11:56 UTC
[PATCH v2] Revert "drm/nouveau: Remove waitque for sched teardown"
On 04.09.25 13:12, Philipp Stanner wrote:
> On Thu, 2025-09-04 at 12:27 +0200, Christian König wrote:
>> On 01.09.25 10:31, Philipp Stanner wrote:
>>> This reverts:
>>> 
>>> commit bead88002227 ("drm/nouveau: Remove waitque for sched teardown")
>>> commit 5f46f5c7af8c ("drm/nouveau: Add new callback for scheduler teardown")
>>> 
>>> from the drm/sched teardown leak fix series:
>>> 
>>> https://lore.kernel.org/dri-devel/20250710125412.128476-2-phasta at kernel.org/
>>> 
>>> The aforementioned series removed a blocking waitqueue from
>>> nouveau_sched_fini(). It was mistakenly assumed that this waitqueue only
>>> prevents jobs from leaking, which the series fixed.
>>> 
>>> The waitqueue, however, also guarantees that all VM_BIND related jobs
>>> are finished in order, cleaning up mappings in the GPU's MMU. These jobs
>>> must be executed sequentially. Without the waitqueue, this is no longer
>>> guaranteed, because entity and scheduler teardown can race with each
>>> other.
>>
>> That sounds like exactly the kind of issue I tried to catch with the
>> recent dma_fence changes.
> 
> Link? :)

dma-buf: add warning when dma_fence is signaled from IOCTL

>> Going to keep working on that and potentially using this here as a
>> blueprint for something it should catch.
> 
> This is more like a nouveau-specific issue. The problem is that
> unmapping mappings in the GPU's MMU must be done in a specific order,
> and all the unmappings must be performed, not canceled.
> 
> For EXEC jobs, it's perfectly fine to cancel pending jobs, remove the
> waitqueue and just rush through drm_sched_fini().
> 
> I don't know the issue you're describing, but I don't think a feature
> in dma_fence could help with that specific Nouveau problem. dma_fence
> can't force the driver to submit jobs in a specific order or to wait
> until they're all completed.

Well, the updates are represented by a dma_fence, aren't they?

So the dma_fence framework could potentially warn if a fence from the
same context signals out of order.

Regards,
Christian.

> 
> Grüße
> P.
> 
>> Regards,
>> Christian.
>> 
>>> [snip]
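A rough sketch of the check Christian floats, shown as a self-contained
userspace model rather than an actual dma_fence patch: remember the
last seqno signaled on each fence context and warn when signaling jumps
ahead of a still-pending earlier fence. The types and names below are
hypothetical, not an existing kernel API.

#include <stdint.h>
#include <stdio.h>

struct fence_context {
	uint64_t last_signaled;		/* highest seqno signaled so far */
	uint64_t last_submitted;	/* highest seqno handed out */
};

/* Hand out dense, monotonically increasing seqnos, as fence contexts do. */
static uint64_t fence_alloc(struct fence_context *ctx)
{
	return ++ctx->last_submitted;
}

static void fence_signal(struct fence_context *ctx, uint64_t seqno)
{
	/* Out of order: an earlier fence on this context is still
	 * pending, or something newer already signaled. */
	if (seqno != ctx->last_signaled + 1)
		fprintf(stderr, "WARN: ctx signaled seqno %llu, expected %llu\n",
			(unsigned long long)seqno,
			(unsigned long long)(ctx->last_signaled + 1));
	if (seqno > ctx->last_signaled)
		ctx->last_signaled = seqno;
}

int main(void)
{
	struct fence_context ctx = { 0 };
	uint64_t a = fence_alloc(&ctx);
	uint64_t b = fence_alloc(&ctx);

	fence_signal(&ctx, b);	/* triggers the warning: a is still pending */
	fence_signal(&ctx, a);
	return 0;
}

In the teardown race the revert fixes, the VM_BIND fences of one
context would signal out of submission order; a check of this shape
could plausibly have flagged that, which seems to be the "blueprint"
Christian has in mind.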