Christoph Hellwig
2019-Jul-22 09:44 UTC
[Nouveau] hmm_range_fault related fixes and legacy API removal v2
Hi Jérôme, Ben and Jason, below is a series against the hmm tree which fixes up the mmap_sem locking in nouveau and while at it also removes leftover legacy HMM APIs only used by nouveau. The first 4 patches are a bug fix for nouveau, which I suspect should go into this merge window even if the code is marked as staging, just to avoid people copying the breakage. Changes since v1: - don't return the valid state from hmm_range_unregister - additional nouveau cleanups
Christoph Hellwig
2019-Jul-22 09:44 UTC
[Nouveau] [PATCH 1/6] mm: always return EBUSY for invalid ranges in hmm_range_{fault, snapshot}
We should not have two different error codes for the same condition. In
addition this really complicates the code due to the special handling of
EAGAIN that drops the mmap_sem due to the FAULT_FLAG_ALLOW_RETRY logic
in the core vm.
Signed-off-by: Christoph Hellwig <hch at lst.de>
Reviewed-by: Ralph Campbell <rcampbell at nvidia.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
---
Documentation/vm/hmm.rst | 2 +-
mm/hmm.c | 10 ++++------
2 files changed, 5 insertions(+), 7 deletions(-)
diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
index 7d90964abbb0..710ce1c701bf 100644
--- a/Documentation/vm/hmm.rst
+++ b/Documentation/vm/hmm.rst
@@ -237,7 +237,7 @@ The usage pattern is::
ret = hmm_range_snapshot(&range);
if (ret) {
up_read(&mm->mmap_sem);
- if (ret == -EAGAIN) {
+ if (ret == -EBUSY) {
/*
* No need to check hmm_range_wait_until_valid() return value
* on retry we will get proper error with hmm_range_snapshot()
diff --git a/mm/hmm.c b/mm/hmm.c
index e1eedef129cf..16b6731a34db 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -946,7 +946,7 @@ EXPORT_SYMBOL(hmm_range_unregister);
* @range: range
* Return: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
* permission (for instance asking for write and range is read only),
- * -EAGAIN if you need to retry, -EFAULT invalid (ie either no valid
+ * -EBUSY if you need to retry, -EFAULT invalid (ie either no valid
* vma or it is illegal to access that range), number of valid pages
* in range->pfns[] (from range start address).
*
@@ -967,7 +967,7 @@ long hmm_range_snapshot(struct hmm_range *range)
do {
/* If range is no longer valid force retry. */
if (!range->valid)
- return -EAGAIN;
+ return -EBUSY;
vma = find_vma(hmm->mm, start);
if (vma == NULL || (vma->vm_flags & device_vma))
@@ -1062,10 +1062,8 @@ long hmm_range_fault(struct hmm_range *range, bool block)
do {
/* If range is no longer valid force retry. */
- if (!range->valid) {
- up_read(&hmm->mm->mmap_sem);
- return -EAGAIN;
- }
+ if (!range->valid)
+ return -EBUSY;
vma = find_vma(hmm->mm, start);
if (vma == NULL || (vma->vm_flags & device_vma))
--
2.20.1
Christoph Hellwig
2019-Jul-22 09:44 UTC
[Nouveau] [PATCH 2/6] mm: move hmm_vma_range_done and hmm_vma_fault to nouveau
These two functions are marked as a legacy APIs to get rid of, but seem
to suit the current nouveau flow. Move it to the only user in
preparation for fixing a locking bug involving caller and callee.
All comments referring to the old API have been removed as this now
is a driver private helper.
Signed-off-by: Christoph Hellwig <hch at lst.de>
---
drivers/gpu/drm/nouveau/nouveau_svm.c | 45 +++++++++++++++++++++-
include/linux/hmm.h | 54 ---------------------------
2 files changed, 43 insertions(+), 56 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c
b/drivers/gpu/drm/nouveau/nouveau_svm.c
index 8c92374afcf2..cde09003c06b 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -475,6 +475,47 @@ nouveau_svm_fault_cache(struct nouveau_svm *svm,
fault->inst, fault->addr, fault->access);
}
+static inline bool nouveau_range_done(struct hmm_range *range)
+{
+ bool ret = hmm_range_valid(range);
+
+ hmm_range_unregister(range);
+ return ret;
+}
+
+static int
+nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range,
+ bool block)
+{
+ long ret;
+
+ range->default_flags = 0;
+ range->pfn_flags_mask = -1UL;
+
+ ret = hmm_range_register(range, mirror,
+ range->start, range->end,
+ PAGE_SHIFT);
+ if (ret)
+ return (int)ret;
+
+ if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) {
+ up_read(&range->vma->vm_mm->mmap_sem);
+ return -EAGAIN;
+ }
+
+ ret = hmm_range_fault(range, block);
+ if (ret <= 0) {
+ if (ret == -EBUSY || !ret) {
+ up_read(&range->vma->vm_mm->mmap_sem);
+ ret = -EBUSY;
+ } else if (ret == -EAGAIN)
+ ret = -EBUSY;
+ hmm_range_unregister(range);
+ return ret;
+ }
+ return 0;
+}
+
static int
nouveau_svm_fault(struct nvif_notify *notify)
{
@@ -649,10 +690,10 @@ nouveau_svm_fault(struct nvif_notify *notify)
range.values = nouveau_svm_pfn_values;
range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT;
again:
- ret = hmm_vma_fault(&svmm->mirror, &range, true);
+ ret = nouveau_range_fault(&svmm->mirror, &range, true);
if (ret == 0) {
mutex_lock(&svmm->mutex);
- if (!hmm_vma_range_done(&range)) {
+ if (!nouveau_range_done(&range)) {
mutex_unlock(&svmm->mutex);
goto again;
}
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index b8a08b2a10ca..7ef56dc18050 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -484,60 +484,6 @@ long hmm_range_dma_unmap(struct hmm_range *range,
*/
#define HMM_RANGE_DEFAULT_TIMEOUT 1000
-/* This is a temporary helper to avoid merge conflict between trees. */
-static inline bool hmm_vma_range_done(struct hmm_range *range)
-{
- bool ret = hmm_range_valid(range);
-
- hmm_range_unregister(range);
- return ret;
-}
-
-/* This is a temporary helper to avoid merge conflict between trees. */
-static inline int hmm_vma_fault(struct hmm_mirror *mirror,
- struct hmm_range *range, bool block)
-{
- long ret;
-
- /*
- * With the old API the driver must set each individual entries with
- * the requested flags (valid, write, ...). So here we set the mask to
- * keep intact the entries provided by the driver and zero out the
- * default_flags.
- */
- range->default_flags = 0;
- range->pfn_flags_mask = -1UL;
-
- ret = hmm_range_register(range, mirror,
- range->start, range->end,
- PAGE_SHIFT);
- if (ret)
- return (int)ret;
-
- if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) {
- /*
- * The mmap_sem was taken by driver we release it here and
- * returns -EAGAIN which correspond to mmap_sem have been
- * drop in the old API.
- */
- up_read(&range->vma->vm_mm->mmap_sem);
- return -EAGAIN;
- }
-
- ret = hmm_range_fault(range, block);
- if (ret <= 0) {
- if (ret == -EBUSY || !ret) {
- /* Same as above, drop mmap_sem to match old API. */
- up_read(&range->vma->vm_mm->mmap_sem);
- ret = -EBUSY;
- } else if (ret == -EAGAIN)
- ret = -EBUSY;
- hmm_range_unregister(range);
- return ret;
- }
- return 0;
-}
-
/* Below are for HMM internal use only! Not to be used by device driver! */
static inline void hmm_mm_init(struct mm_struct *mm)
{
--
2.20.1
Christoph Hellwig
2019-Jul-22 09:44 UTC
[Nouveau] [PATCH 3/6] nouveau: remove the block parameter to nouveau_range_fault
The parameter is always false, so remove it as well as the -EAGAIN
handling that can only happen for the non-blocking case.
Signed-off-by: Christoph Hellwig <hch at lst.de>
---
drivers/gpu/drm/nouveau/nouveau_svm.c | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c
b/drivers/gpu/drm/nouveau/nouveau_svm.c
index cde09003c06b..5dd83a46578f 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -484,8 +484,7 @@ static inline bool nouveau_range_done(struct hmm_range
*range)
}
static int
-nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range,
- bool block)
+nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range)
{
long ret;
@@ -503,7 +502,7 @@ nouveau_range_fault(struct hmm_mirror *mirror, struct
hmm_range *range,
return -EAGAIN;
}
- ret = hmm_range_fault(range, block);
+ ret = hmm_range_fault(range, true);
if (ret <= 0) {
if (ret == -EBUSY || !ret) {
up_read(&range->vma->vm_mm->mmap_sem);
@@ -690,7 +689,7 @@ nouveau_svm_fault(struct nvif_notify *notify)
range.values = nouveau_svm_pfn_values;
range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT;
again:
- ret = nouveau_range_fault(&svmm->mirror, &range, true);
+ ret = nouveau_range_fault(&svmm->mirror, &range);
if (ret == 0) {
mutex_lock(&svmm->mutex);
if (!nouveau_range_done(&range)) {
--
2.20.1
Christoph Hellwig
2019-Jul-22 09:44 UTC
[Nouveau] [PATCH 4/6] nouveau: unlock mmap_sem on all errors from nouveau_range_fault
Currently nouveau_svm_fault expects nouveau_range_fault to never unlock
mmap_sem, but the latter unlocks it for a random selection of error
codes. Fix this up by always unlocking mmap_sem for non-zero return
values in nouveau_range_fault, and only unlocking it in the caller
for successful returns.
Signed-off-by: Christoph Hellwig <hch at lst.de>
---
drivers/gpu/drm/nouveau/nouveau_svm.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c
b/drivers/gpu/drm/nouveau/nouveau_svm.c
index 5dd83a46578f..5de2d54b9782 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -494,8 +494,10 @@ nouveau_range_fault(struct hmm_mirror *mirror, struct
hmm_range *range)
ret = hmm_range_register(range, mirror,
range->start, range->end,
PAGE_SHIFT);
- if (ret)
+ if (ret) {
+ up_read(&range->vma->vm_mm->mmap_sem);
return (int)ret;
+ }
if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) {
up_read(&range->vma->vm_mm->mmap_sem);
@@ -504,11 +506,9 @@ nouveau_range_fault(struct hmm_mirror *mirror, struct
hmm_range *range)
ret = hmm_range_fault(range, true);
if (ret <= 0) {
- if (ret == -EBUSY || !ret) {
- up_read(&range->vma->vm_mm->mmap_sem);
- ret = -EBUSY;
- } else if (ret == -EAGAIN)
+ if (ret == 0)
ret = -EBUSY;
+ up_read(&range->vma->vm_mm->mmap_sem);
hmm_range_unregister(range);
return ret;
}
@@ -706,8 +706,8 @@ nouveau_svm_fault(struct nvif_notify *notify)
NULL);
svmm->vmm->vmm.object.client->super = false;
mutex_unlock(&svmm->mutex);
+ up_read(&svmm->mm->mmap_sem);
}
- up_read(&svmm->mm->mmap_sem);
/* Cancel any faults in the window whose pages didn't manage
* to keep their valid bit, or stay writeable when required.
--
2.20.1
Christoph Hellwig
2019-Jul-22 09:44 UTC
[Nouveau] [PATCH 5/6] nouveau: return -EBUSY when hmm_range_wait_until_valid fails
-EAGAIN has a magic meaning for non-blocking faults, so don't overload
it. Given that the caller doesn't check for specific error codes this
change is purely cosmetic.
Signed-off-by: Christoph Hellwig <hch at lst.de>
---
drivers/gpu/drm/nouveau/nouveau_svm.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c
b/drivers/gpu/drm/nouveau/nouveau_svm.c
index 5de2d54b9782..a9c5c58d425b 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -501,7 +501,7 @@ nouveau_range_fault(struct hmm_mirror *mirror, struct
hmm_range *range)
if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) {
up_read(&range->vma->vm_mm->mmap_sem);
- return -EAGAIN;
+ return -EBUSY;
}
ret = hmm_range_fault(range, true);
--
2.20.1
Christoph Hellwig
2019-Jul-22 09:44 UTC
[Nouveau] [PATCH 6/6] mm: remove the legacy hmm_pfn_* APIs
Switch the one remaining user in nouveau over to its replacement,
and remove all the wrappers.
Signed-off-by: Christoph Hellwig <hch at lst.de>
Reviewed-by: Ralph Campbell <rcampbell at nvidia.com>
Reviewed-by: Jason Gunthorpe <jgg at mellanox.com>
---
drivers/gpu/drm/nouveau/nouveau_dmem.c | 2 +-
include/linux/hmm.h | 34 --------------------------
2 files changed, 1 insertion(+), 35 deletions(-)
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c
b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 1333220787a1..345c63cb752a 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -845,7 +845,7 @@ nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
struct page *page;
uint64_t addr;
- page = hmm_pfn_to_page(range, range->pfns[i]);
+ page = hmm_device_entry_to_page(range, range->pfns[i]);
if (page == NULL)
continue;
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 7ef56dc18050..9f32586684c9 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -290,40 +290,6 @@ static inline uint64_t hmm_device_entry_from_pfn(const
struct hmm_range *range,
range->flags[HMM_PFN_VALID];
}
-/*
- * Old API:
- * hmm_pfn_to_page()
- * hmm_pfn_to_pfn()
- * hmm_pfn_from_page()
- * hmm_pfn_from_pfn()
- *
- * This are the OLD API please use new API, it is here to avoid cross-tree
- * merge painfullness ie we convert things to new API in stages.
- */
-static inline struct page *hmm_pfn_to_page(const struct hmm_range *range,
- uint64_t pfn)
-{
- return hmm_device_entry_to_page(range, pfn);
-}
-
-static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range,
- uint64_t pfn)
-{
- return hmm_device_entry_to_pfn(range, pfn);
-}
-
-static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range,
- struct page *page)
-{
- return hmm_device_entry_from_page(range, page);
-}
-
-static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range,
- unsigned long pfn)
-{
- return hmm_device_entry_from_pfn(range, pfn);
-}
-
/*
* Mirroring: how to synchronize device page table with CPU page table.
*
--
2.20.1
Souptick Joarder
2019-Jul-22 14:37 UTC
[Nouveau] [PATCH 1/6] mm: always return EBUSY for invalid ranges in hmm_range_{fault, snapshot}
On Mon, Jul 22, 2019 at 3:14 PM Christoph Hellwig <hch at lst.de> wrote:> > We should not have two different error codes for the same condition. In > addition this really complicates the code due to the special handling of > EAGAIN that drops the mmap_sem due to the FAULT_FLAG_ALLOW_RETRY logic > in the core vm. > > Signed-off-by: Christoph Hellwig <hch at lst.de> > Reviewed-by: Ralph Campbell <rcampbell at nvidia.com> > Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com> > --- > Documentation/vm/hmm.rst | 2 +- > mm/hmm.c | 10 ++++------ > 2 files changed, 5 insertions(+), 7 deletions(-) > > diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst > index 7d90964abbb0..710ce1c701bf 100644 > --- a/Documentation/vm/hmm.rst > +++ b/Documentation/vm/hmm.rst > @@ -237,7 +237,7 @@ The usage pattern is:: > ret = hmm_range_snapshot(&range); > if (ret) { > up_read(&mm->mmap_sem); > - if (ret == -EAGAIN) { > + if (ret == -EBUSY) { > /* > * No need to check hmm_range_wait_until_valid() return value > * on retry we will get proper error with hmm_range_snapshot() > diff --git a/mm/hmm.c b/mm/hmm.c > index e1eedef129cf..16b6731a34db 100644 > --- a/mm/hmm.c > +++ b/mm/hmm.c > @@ -946,7 +946,7 @@ EXPORT_SYMBOL(hmm_range_unregister); > * @range: range > * Return: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid > * permission (for instance asking for write and range is read only), > - * -EAGAIN if you need to retry, -EFAULT invalid (ie either no valid > + * -EBUSY if you need to retry, -EFAULT invalid (ie either no valid > * vma or it is illegal to access that range), number of valid pages > * in range->pfns[] (from range start address). > * > @@ -967,7 +967,7 @@ long hmm_range_snapshot(struct hmm_range *range) > do { > /* If range is no longer valid force retry. */ > if (!range->valid) > - return -EAGAIN; > + return -EBUSY; > > vma = find_vma(hmm->mm, start); > if (vma == NULL || (vma->vm_flags & device_vma)) > @@ -1062,10 +1062,8 @@ long hmm_range_fault(struct hmm_range *range, bool block) > > do { > /* If range is no longer valid force retry. */ > - if (!range->valid) { > - up_read(&hmm->mm->mmap_sem); > - return -EAGAIN; > - } > + if (!range->valid) > + return -EBUSY;Is it fine to remove up_read(&hmm->mm->mmap_sem) ?> > vma = find_vma(hmm->mm, start); > if (vma == NULL || (vma->vm_flags & device_vma)) > -- > 2.20.1 >
Ralph Campbell
2019-Jul-23 01:11 UTC
[Nouveau] hmm_range_fault related fixes and legacy API removal v2
On 7/22/19 2:44 AM, Christoph Hellwig wrote:> Hi Jérôme, Ben and Jason, > > below is a series against the hmm tree which fixes up the mmap_sem > locking in nouveau and while at it also removes leftover legacy HMM APIs > only used by nouveau. > > The first 4 patches are a bug fix for nouveau, which I suspect should > go into this merge window even if the code is marked as staging, just > to avoid people copying the breakage. > > Changes since v1: > - don't return the valid state from hmm_range_unregister > - additional nouveau cleanups >I ran some OpenCL tests from Jerome with nouveau and this series, 5.3.0-rc1, and my two HMM fixes: ("mm/hmm: fix ZONE_DEVICE anon page mapping reuse") ("mm/hmm: Fix bad subpage pointer in try_to_unmap_one") You can add for the series: Tested-by: Ralph Campbell <rcampbell at nvidia.com>
Jason Gunthorpe
2019-Jul-23 14:55 UTC
[Nouveau] [PATCH 2/6] mm: move hmm_vma_range_done and hmm_vma_fault to nouveau
On Mon, Jul 22, 2019 at 11:44:22AM +0200, Christoph Hellwig wrote:> These two functions are marked as a legacy APIs to get rid of, but seem > to suit the current nouveau flow. Move it to the only user in > preparation for fixing a locking bug involving caller and callee. > All comments referring to the old API have been removed as this now > is a driver private helper. > > Signed-off-by: Christoph Hellwig <hch at lst.de> > --- > drivers/gpu/drm/nouveau/nouveau_svm.c | 45 +++++++++++++++++++++- > include/linux/hmm.h | 54 --------------------------- > 2 files changed, 43 insertions(+), 56 deletions(-)Reviewed-by: Jason Gunthorpe <jgg at mellanox.com> Jason
Jason Gunthorpe
2019-Jul-23 14:56 UTC
[Nouveau] [PATCH 3/6] nouveau: remove the block parameter to nouveau_range_fault
On Mon, Jul 22, 2019 at 11:44:23AM +0200, Christoph Hellwig wrote:> The parameter is always false, so remove it as well as the -EAGAIN > handling that can only happen for the non-blocking case.? Did the EAGAIN handling get removed in this patch?> Signed-off-by: Christoph Hellwig <hch at lst.de> > drivers/gpu/drm/nouveau/nouveau_svm.c | 7 +++---- > 1 file changed, 3 insertions(+), 4 deletions(-)Code seems fine Reviewed-by: Jason Gunthorpe <jgg at mellanox.com> Jason
Jason Gunthorpe
2019-Jul-23 15:18 UTC
[Nouveau] [PATCH 4/6] nouveau: unlock mmap_sem on all errors from nouveau_range_fault
On Mon, Jul 22, 2019 at 11:44:24AM +0200, Christoph Hellwig wrote:> Currently nouveau_svm_fault expects nouveau_range_fault to never unlock > mmap_sem, but the latter unlocks it for a random selection of error > codes. Fix this up by always unlocking mmap_sem for non-zero return > values in nouveau_range_fault, and only unlocking it in the caller > for successful returns. > > Signed-off-by: Christoph Hellwig <hch at lst.de> > drivers/gpu/drm/nouveau/nouveau_svm.c | 12 ++++++------ > 1 file changed, 6 insertions(+), 6 deletions(-) > > diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c > index 5dd83a46578f..5de2d54b9782 100644 > +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c > @@ -494,8 +494,10 @@ nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range) > ret = hmm_range_register(range, mirror, > range->start, range->end, > PAGE_SHIFT); > - if (ret) > + if (ret) { > + up_read(&range->vma->vm_mm->mmap_sem); > return (int)ret; > + } > > if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) { > up_read(&range->vma->vm_mm->mmap_sem); > @@ -504,11 +506,9 @@ nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range) > > ret = hmm_range_fault(range, true); > if (ret <= 0) { > - if (ret == -EBUSY || !ret) { > - up_read(&range->vma->vm_mm->mmap_sem); > - ret = -EBUSY; > - } else if (ret == -EAGAIN) > + if (ret == 0) > ret = -EBUSY; > + up_read(&range->vma->vm_mm->mmap_sem); > hmm_range_unregister(range); > return ret;Hum.. The caller does this: again: ret = nouveau_range_fault(&svmm->mirror, &range); if (ret == 0) { mutex_lock(&svmm->mutex); if (!nouveau_range_done(&range)) { mutex_unlock(&svmm->mutex); goto again; And we can't call nouveau_range_fault() -> hmm_range_fault() without holding the mmap_sem, so we can't allow nouveau_range_fault to unlock it. Maybe this instead? diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index a9c5c58d425b3d..92cf760a9bcc5d 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -494,21 +494,16 @@ nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range) ret = hmm_range_register(range, mirror, range->start, range->end, PAGE_SHIFT); - if (ret) { - up_read(&range->vma->vm_mm->mmap_sem); + if (ret) return (int)ret; - } - if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) { - up_read(&range->vma->vm_mm->mmap_sem); + if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) return -EBUSY; - } ret = hmm_range_fault(range, true); if (ret <= 0) { if (ret == 0) ret = -EBUSY; - up_read(&range->vma->vm_mm->mmap_sem); hmm_range_unregister(range); return ret; } @@ -706,8 +701,8 @@ nouveau_svm_fault(struct nvif_notify *notify) NULL); svmm->vmm->vmm.object.client->super = false; mutex_unlock(&svmm->mutex); - up_read(&svmm->mm->mmap_sem); } + up_read(&svmm->mm->mmap_sem); /* Cancel any faults in the window whose pages didn't manage * to keep their valid bit, or stay writeable when required.
Jason Gunthorpe
2019-Jul-23 15:18 UTC
[Nouveau] [PATCH 5/6] nouveau: return -EBUSY when hmm_range_wait_until_valid fails
On Mon, Jul 22, 2019 at 11:44:25AM +0200, Christoph Hellwig wrote:> -EAGAIN has a magic meaning for non-blocking faults, so don't overload > it. Given that the caller doesn't check for specific error codes this > change is purely cosmetic. > > Signed-off-by: Christoph Hellwig <hch at lst.de> > --- > drivers/gpu/drm/nouveau/nouveau_svm.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-)Agree Reviewed-by: Jason Gunthorpe <jgg at mellanox.com> Jason
Jason Gunthorpe
2019-Jul-23 15:20 UTC
[Nouveau] hmm_range_fault related fixes and legacy API removal v2
On Mon, Jul 22, 2019 at 06:11:04PM -0700, Ralph Campbell wrote:> > On 7/22/19 2:44 AM, Christoph Hellwig wrote: > > Hi Jérôme, Ben and Jason, > > > > below is a series against the hmm tree which fixes up the mmap_sem > > locking in nouveau and while at it also removes leftover legacy HMM APIs > > only used by nouveau. > > > > The first 4 patches are a bug fix for nouveau, which I suspect should > > go into this merge window even if the code is marked as staging, just > > to avoid people copying the breakage. > > > > Changes since v1: > > - don't return the valid state from hmm_range_unregister > > - additional nouveau cleanups > > > > I ran some OpenCL tests from Jerome with nouveau and this series, > 5.3.0-rc1, and my two HMM fixes: > ("mm/hmm: fix ZONE_DEVICE anon page mapping reuse") > ("mm/hmm: Fix bad subpage pointer in try_to_unmap_one") > > You can add for the series: > Tested-by: Ralph Campbell <rcampbell at nvidia.com>Thanks, would you also rebase and resend the patch deleting struct hmm_update ? Jason
Jason Gunthorpe
2019-Jul-23 15:27 UTC
[Nouveau] hmm_range_fault related fixes and legacy API removal v2
On Mon, Jul 22, 2019 at 11:44:20AM +0200, Christoph Hellwig wrote:> Hi Jérôme, Ben and Jason, > > below is a series against the hmm tree which fixes up the mmap_sem > locking in nouveau and while at it also removes leftover legacy HMM APIs > only used by nouveau. > > The first 4 patches are a bug fix for nouveau, which I suspect should > go into this merge window even if the code is marked as staging, just > to avoid people copying the breakage.Ignoring the STAGING issue I've tried to use the same guideline as for -stable for -rc .. So this is a real problem, we definitely hit the locking bugs if we retry/etc under stress, so I would be OK to send it to Linus for early-rc. However, it doesn't look like the 1st patch is fixing a current bug though, the only callers uses blocking = true, so just the middle three are -rc? Thanks, Jason
Christoph Hellwig
2019-Jul-23 16:32 UTC
[Nouveau] hmm_range_fault related fixes and legacy API removal v2
On Tue, Jul 23, 2019 at 03:27:41PM +0000, Jason Gunthorpe wrote:> Ignoring the STAGING issue I've tried to use the same guideline as for > -stable for -rc .. > > So this is a real problem, we definitely hit the locking bugs if we > retry/etc under stress, so I would be OK to send it to Linus for > early-rc. > > However, it doesn't look like the 1st patch is fixing a current bug > though, the only callers uses blocking = true, so just the middle > three are -rc?nonblocking isn't used anywher, but it is a major, major API bug. Your call, but if it was my tree I'd probably send it to Linus.