thr3ads.net - Nouveau - [Nouveau] [PATCH] drm/nouveau: sched: fix leaking memory of timedout job [Sep 2023]

If this information is useful, please help other people find it:
Share via:

Danilo Krummrich

2023-Sep-16 16:28 UTC

[Nouveau] [PATCH] drm/nouveau: sched: fix leaking memory of timedout job

Always stop and re-start the scheduler in order to let the scheduler
free up the timedout job in case it got signaled. In case of exec jobs
the job type specific callback will take care to signal all fences and
tear down the channel.

Fixes: b88baab82871 ("drm/nouveau: implement new VM_BIND uAPI")
Signed-off-by: Danilo Krummrich <dakr at redhat.com>
---
 drivers/gpu/drm/nouveau/nouveau_exec.c  |  2 +-
 drivers/gpu/drm/nouveau/nouveau_sched.c | 12 +++++++++---
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_exec.c
b/drivers/gpu/drm/nouveau/nouveau_exec.c
index 9c031d15fe0b..49d83ac9e036 100644
--- a/drivers/gpu/drm/nouveau/nouveau_exec.c
+++ b/drivers/gpu/drm/nouveau/nouveau_exec.c
@@ -185,7 +185,7 @@ nouveau_exec_job_timeout(struct nouveau_job *job)
 
 	nouveau_sched_entity_fini(job->entity);
 
-	return DRM_GPU_SCHED_STAT_ENODEV;
+	return DRM_GPU_SCHED_STAT_NOMINAL;
 }
 
 static struct nouveau_job_ops nouveau_exec_job_ops = {
diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c
b/drivers/gpu/drm/nouveau/nouveau_sched.c
index 88217185e0f3..3b7ea5221226 100644
--- a/drivers/gpu/drm/nouveau/nouveau_sched.c
+++ b/drivers/gpu/drm/nouveau/nouveau_sched.c
@@ -375,14 +375,20 @@ nouveau_sched_run_job(struct drm_sched_job *sched_job)
 static enum drm_gpu_sched_stat
 nouveau_sched_timedout_job(struct drm_sched_job *sched_job)
 {
+	struct drm_gpu_scheduler *sched = sched_job->sched;
 	struct nouveau_job *job = to_nouveau_job(sched_job);
+	enum drm_gpu_sched_stat stat = DRM_GPU_SCHED_STAT_NOMINAL;
 
-	NV_PRINTK(warn, job->cli, "Job timed out.\n");
+	drm_sched_stop(sched, sched_job);
 
 	if (job->ops->timeout)
-		return job->ops->timeout(job);
+		stat = job->ops->timeout(job);
+	else
+		NV_PRINTK(warn, job->cli, "Generic job timeout.\n");
+
+	drm_sched_start(sched, true);
 
-	return DRM_GPU_SCHED_STAT_ENODEV;
+	return stat;
 }
 
 static void
-- 
2.41.0

Lyude Paul

2023-Sep-18 22:19 UTC

head link

[Nouveau] [PATCH] drm/nouveau: sched: fix leaking memory of timedout job

BTW - Would you like me to review work like this? I'm totally happy to do
that, although I'm not terribly familiar with these parts of nouveau/drm
(but
I'm always willing to learn, and would like to know more about these areas
anyway :)

?if the answer is yes, this patch looks fine to me so far - I guess the one
question I have that might have an obvious answer - how are jobs without an
job->ops->timeout callback cleaned up?

On Sat, 2023-09-16 at 18:28 +0200, Danilo Krummrich
wrote:> Always stop and re-start the scheduler in order to let the scheduler
> free up the timedout job in case it got signaled. In case of exec jobs
> the job type specific callback will take care to signal all fences and
> tear down the channel.
> 
> Fixes: b88baab82871 ("drm/nouveau: implement new VM_BIND uAPI")
> Signed-off-by: Danilo Krummrich <dakr at redhat.com>
> ---
>  drivers/gpu/drm/nouveau/nouveau_exec.c  |  2 +-
>  drivers/gpu/drm/nouveau/nouveau_sched.c | 12 +++++++++---
>  2 files changed, 10 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/nouveau/nouveau_exec.c
b/drivers/gpu/drm/nouveau/nouveau_exec.c
> index 9c031d15fe0b..49d83ac9e036 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_exec.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_exec.c
> @@ -185,7 +185,7 @@ nouveau_exec_job_timeout(struct nouveau_job *job)
>  
>  	nouveau_sched_entity_fini(job->entity);
>  
> -	return DRM_GPU_SCHED_STAT_ENODEV;
> +	return DRM_GPU_SCHED_STAT_NOMINAL;
>  }
>  
>  static struct nouveau_job_ops nouveau_exec_job_ops = {
> diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c
b/drivers/gpu/drm/nouveau/nouveau_sched.c
> index 88217185e0f3..3b7ea5221226 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_sched.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_sched.c
> @@ -375,14 +375,20 @@ nouveau_sched_run_job(struct drm_sched_job
*sched_job)
>  static enum drm_gpu_sched_stat
>  nouveau_sched_timedout_job(struct drm_sched_job *sched_job)
>  {
> +	struct drm_gpu_scheduler *sched = sched_job->sched;
>  	struct nouveau_job *job = to_nouveau_job(sched_job);
> +	enum drm_gpu_sched_stat stat = DRM_GPU_SCHED_STAT_NOMINAL;
>  
> -	NV_PRINTK(warn, job->cli, "Job timed out.\n");
> +	drm_sched_stop(sched, sched_job);
>  
>  	if (job->ops->timeout)
> -		return job->ops->timeout(job);
> +		stat = job->ops->timeout(job);
> +	else
> +		NV_PRINTK(warn, job->cli, "Generic job timeout.\n");
> +
> +	drm_sched_start(sched, true);
>  
> -	return DRM_GPU_SCHED_STAT_ENODEV;
> +	return stat;
>  }
>  
>  static void
-- 
Cheers,
 Lyude Paul (she/her)
 Software Engineer at Red Hat

Dave Airlie

2023-Sep-19 02:13 UTC

head link

[Nouveau] [PATCH] drm/nouveau: sched: fix leaking memory of timedout job

On Sun, 17 Sept 2023 at 02:28, Danilo Krummrich <dakr at redhat.com>
wrote:>
> Always stop and re-start the scheduler in order to let the scheduler
> free up the timedout job in case it got signaled. In case of exec jobs
> the job type specific callback will take care to signal all fences and
> tear down the channel.
Reviewed-by: Dave Airlie <airlied at redhat.com>>
> Fixes: b88baab82871 ("drm/nouveau: implement new VM_BIND uAPI")
> Signed-off-by: Danilo Krummrich <dakr at redhat.com>
> ---
>  drivers/gpu/drm/nouveau/nouveau_exec.c  |  2 +-
>  drivers/gpu/drm/nouveau/nouveau_sched.c | 12 +++++++++---
>  2 files changed, 10 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_exec.c
b/drivers/gpu/drm/nouveau/nouveau_exec.c
> index 9c031d15fe0b..49d83ac9e036 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_exec.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_exec.c
> @@ -185,7 +185,7 @@ nouveau_exec_job_timeout(struct nouveau_job *job)
>
>         nouveau_sched_entity_fini(job->entity);
>
> -       return DRM_GPU_SCHED_STAT_ENODEV;
> +       return DRM_GPU_SCHED_STAT_NOMINAL;
>  }
>
>  static struct nouveau_job_ops nouveau_exec_job_ops = {
> diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c
b/drivers/gpu/drm/nouveau/nouveau_sched.c
> index 88217185e0f3..3b7ea5221226 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_sched.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_sched.c
> @@ -375,14 +375,20 @@ nouveau_sched_run_job(struct drm_sched_job
*sched_job)
>  static enum drm_gpu_sched_stat
>  nouveau_sched_timedout_job(struct drm_sched_job *sched_job)
>  {
> +       struct drm_gpu_scheduler *sched = sched_job->sched;
>         struct nouveau_job *job = to_nouveau_job(sched_job);
> +       enum drm_gpu_sched_stat stat = DRM_GPU_SCHED_STAT_NOMINAL;
>
> -       NV_PRINTK(warn, job->cli, "Job timed out.\n");
> +       drm_sched_stop(sched, sched_job);
>
>         if (job->ops->timeout)
> -               return job->ops->timeout(job);
> +               stat = job->ops->timeout(job);
> +       else
> +               NV_PRINTK(warn, job->cli, "Generic job
timeout.\n");
> +
> +       drm_sched_start(sched, true);
>
> -       return DRM_GPU_SCHED_STAT_ENODEV;
> +       return stat;
>  }
>
>  static void
> --
> 2.41.0
>

Nouveau - Sep 2023 - [PATCH] drm/nouveau: sched: fix leaking memory of timedout job

[Nouveau] [PATCH] drm/nouveau: sched: fix leaking memory of timedout job

[Nouveau] [PATCH] drm/nouveau: sched: fix leaking memory of timedout job

[Nouveau] [PATCH] drm/nouveau: sched: fix leaking memory of timedout job