Hi Dan, Jérôme and Jason, below is a series that cleans up the dev_pagemap interface so that it is more easily usable, which removes the need to wrap it in hmm and thus allowing to kill a lot of code Diffstat: 22 files changed, 245 insertions(+), 802 deletions(-) Git tree: git://git.infradead.org/users/hch/misc.git hmm-devmem-cleanup Gitweb: http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/hmm-devmem-cleanup
Christoph Hellwig
2019-Jun-13 09:43 UTC
[Nouveau] [PATCH 01/22] mm: remove the unused ARCH_HAS_HMM_DEVICE Kconfig option
Signed-off-by: Christoph Hellwig <hch at lst.de> --- mm/Kconfig | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index f0c76ba47695..0d2ba7e1f43e 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -675,16 +675,6 @@ config ARCH_HAS_HMM_MIRROR depends on (X86_64 || PPC64) depends on MMU && 64BIT -config ARCH_HAS_HMM_DEVICE - bool - default y - depends on (X86_64 || PPC64) - depends on MEMORY_HOTPLUG - depends on MEMORY_HOTREMOVE - depends on SPARSEMEM_VMEMMAP - depends on ARCH_HAS_ZONE_DEVICE - select XARRAY_MULTI - config ARCH_HAS_HMM bool default y -- 2.20.1
Christoph Hellwig
2019-Jun-13 09:43 UTC
[Nouveau] [PATCH 02/22] mm: remove the struct hmm_device infrastructure
This code is a trivial wrapper around device model helpers, which should have been integrated into the driver device model usage from the start. Assuming it actually had users, which it never had since the code was added more than 1 1/2 years ago. Signed-off-by: Christoph Hellwig <hch at lst.de> --- include/linux/hmm.h | 20 ------------ mm/hmm.c | 80 --------------------------------------------- 2 files changed, 100 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 0fa8ea34ccef..4867b9da1b6c 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -717,26 +717,6 @@ static inline unsigned long hmm_devmem_page_get_drvdata(const struct page *page) { return page->hmm_data; } - - -/* - * struct hmm_device - fake device to hang device memory onto - * - * @device: device struct - * @minor: device minor number - */ -struct hmm_device { - struct device device; - unsigned int minor; -}; - -/* - * A device driver that wants to handle multiple devices memory through a - * single fake device can use hmm_device to do so. This is purely a helper and - * it is not strictly needed, in order to make use of any HMM functionality. - */ -struct hmm_device *hmm_device_new(void *drvdata); -void hmm_device_put(struct hmm_device *hmm_device); #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ #else /* IS_ENABLED(CONFIG_HMM) */ static inline void hmm_mm_destroy(struct mm_struct *mm) {} diff --git a/mm/hmm.c b/mm/hmm.c index 886b18695b97..ff2598eb7377 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1499,84 +1499,4 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, return devmem; } EXPORT_SYMBOL_GPL(hmm_devmem_add_resource); - -/* - * A device driver that wants to handle multiple devices memory through a - * single fake device can use hmm_device to do so. This is purely a helper - * and it is not needed to make use of any HMM functionality. - */ -#define HMM_DEVICE_MAX 256 - -static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX); -static DEFINE_SPINLOCK(hmm_device_lock); -static struct class *hmm_device_class; -static dev_t hmm_device_devt; - -static void hmm_device_release(struct device *device) -{ - struct hmm_device *hmm_device; - - hmm_device = container_of(device, struct hmm_device, device); - spin_lock(&hmm_device_lock); - clear_bit(hmm_device->minor, hmm_device_mask); - spin_unlock(&hmm_device_lock); - - kfree(hmm_device); -} - -struct hmm_device *hmm_device_new(void *drvdata) -{ - struct hmm_device *hmm_device; - - hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL); - if (!hmm_device) - return ERR_PTR(-ENOMEM); - - spin_lock(&hmm_device_lock); - hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX); - if (hmm_device->minor >= HMM_DEVICE_MAX) { - spin_unlock(&hmm_device_lock); - kfree(hmm_device); - return ERR_PTR(-EBUSY); - } - set_bit(hmm_device->minor, hmm_device_mask); - spin_unlock(&hmm_device_lock); - - dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor); - hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt), - hmm_device->minor); - hmm_device->device.release = hmm_device_release; - dev_set_drvdata(&hmm_device->device, drvdata); - hmm_device->device.class = hmm_device_class; - device_initialize(&hmm_device->device); - - return hmm_device; -} -EXPORT_SYMBOL(hmm_device_new); - -void hmm_device_put(struct hmm_device *hmm_device) -{ - put_device(&hmm_device->device); -} -EXPORT_SYMBOL(hmm_device_put); - -static int __init hmm_init(void) -{ - int ret; - - ret = alloc_chrdev_region(&hmm_device_devt, 0, - HMM_DEVICE_MAX, - "hmm_device"); - if (ret) - return ret; - - hmm_device_class = class_create(THIS_MODULE, "hmm_device"); - if (IS_ERR(hmm_device_class)) { - unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX); - return PTR_ERR(hmm_device_class); - } - return 0; -} - -device_initcall(hmm_init); #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ -- 2.20.1
Christoph Hellwig
2019-Jun-13 09:43 UTC
[Nouveau] [PATCH 03/22] mm: remove hmm_devmem_add_resource
This function has never been used since it was first added to the kernel more than a year and a half ago, and if we ever grow a consumer of the MEMORY_DEVICE_PUBLIC infrastructure it can easily use devm_memremap_pages directly now that we've simplified the API for it. Signed-off-by: Christoph Hellwig <hch at lst.de> --- include/linux/hmm.h | 3 --- mm/hmm.c | 54 --------------------------------------------- 2 files changed, 57 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 4867b9da1b6c..5761a39221a6 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -688,9 +688,6 @@ struct hmm_devmem { struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, struct device *device, unsigned long size); -struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, - struct device *device, - struct resource *res); /* * hmm_devmem_page_set_drvdata - set per-page driver data field diff --git a/mm/hmm.c b/mm/hmm.c index ff2598eb7377..0c62426d1257 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1445,58 +1445,4 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, return devmem; } EXPORT_SYMBOL_GPL(hmm_devmem_add); - -struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, - struct device *device, - struct resource *res) -{ - struct hmm_devmem *devmem; - void *result; - int ret; - - if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY) - return ERR_PTR(-EINVAL); - - dev_pagemap_get_ops(); - - devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL); - if (!devmem) - return ERR_PTR(-ENOMEM); - - init_completion(&devmem->completion); - devmem->pfn_first = -1UL; - devmem->pfn_last = -1UL; - devmem->resource = res; - devmem->device = device; - devmem->ops = ops; - - ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, - 0, GFP_KERNEL); - if (ret) - return ERR_PTR(ret); - - ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit, - &devmem->ref); - if (ret) - return ERR_PTR(ret); - - devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; - devmem->pfn_last = devmem->pfn_first + - (resource_size(devmem->resource) >> PAGE_SHIFT); - devmem->page_fault = hmm_devmem_fault; - - devmem->pagemap.type = MEMORY_DEVICE_PUBLIC; - devmem->pagemap.res = *devmem->resource; - devmem->pagemap.page_free = hmm_devmem_free; - devmem->pagemap.altmap_valid = false; - devmem->pagemap.ref = &devmem->ref; - devmem->pagemap.data = devmem; - devmem->pagemap.kill = hmm_devmem_ref_kill; - - result = devm_memremap_pages(devmem->device, &devmem->pagemap); - if (IS_ERR(result)) - return result; - return devmem; -} -EXPORT_SYMBOL_GPL(hmm_devmem_add_resource); #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ -- 2.20.1
Christoph Hellwig
2019-Jun-13 09:43 UTC
[Nouveau] [PATCH 04/22] mm: don't clear ->mapping in hmm_devmem_free
->mapping isn't even used by HMM users, and the field at the same offset in the zone_device part of the union is declared as pad. (Which btw is rather confusing, as DAX uses ->pgmap and ->mapping from two different sides of the union, but DAX doesn't use hmm_devmem_free). Signed-off-by: Christoph Hellwig <hch at lst.de> --- mm/hmm.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index 0c62426d1257..e1dc98407e7b 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1347,8 +1347,6 @@ static void hmm_devmem_free(struct page *page, void *data) { struct hmm_devmem *devmem = data; - page->mapping = NULL; - devmem->ops->free(devmem, page); } -- 2.20.1
Christoph Hellwig
2019-Jun-13 09:43 UTC
[Nouveau] [PATCH 05/22] mm: export alloc_pages_vma
noveau is currently using this through an odd hmm wrapper, and I plan to switch it to the real thing later in this series. Signed-off-by: Christoph Hellwig <hch at lst.de> --- mm/mempolicy.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 01600d80ae01..f9023b5fba37 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2098,6 +2098,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, out: return page; } +EXPORT_SYMBOL_GPL(alloc_pages_vma); /** * alloc_pages_current - Allocate pages. -- 2.20.1
Christoph Hellwig
2019-Jun-13 09:43 UTC
[Nouveau] [PATCH 06/22] mm: factor out a devm_request_free_mem_region helper
Keep the physical address allocation that hmm_add_device does with the rest of the resource code, and allow future reuse of it without the hmm wrapper. Signed-off-by: Christoph Hellwig <hch at lst.de> --- include/linux/ioport.h | 2 ++ kernel/resource.c | 39 +++++++++++++++++++++++++++++++++++++++ mm/hmm.c | 33 ++++----------------------------- 3 files changed, 45 insertions(+), 29 deletions(-) diff --git a/include/linux/ioport.h b/include/linux/ioport.h index da0ebaec25f0..76a33ae3bf6c 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -286,6 +286,8 @@ static inline bool resource_overlaps(struct resource *r1, struct resource *r2) return (r1->start <= r2->end && r1->end >= r2->start); } +struct resource *devm_request_free_mem_region(struct device *dev, + struct resource *base, unsigned long size); #endif /* __ASSEMBLY__ */ #endif /* _LINUX_IOPORT_H */ diff --git a/kernel/resource.c b/kernel/resource.c index 158f04ec1d4f..99c58134ed1c 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1628,6 +1628,45 @@ void resource_list_free(struct list_head *head) } EXPORT_SYMBOL(resource_list_free); +#ifdef CONFIG_DEVICE_PRIVATE +/** + * devm_request_free_mem_region - find free region for device private memory + * + * @dev: device struct to bind the resource too + * @size: size in bytes of the device memory to add + * @base: resource tree to look in + * + * This function tries to find an empty range of physical address big enough to + * contain the new resource, so that it can later be hotpluged as ZONE_DEVICE + * memory, which in turn allocates struct pages. + */ +struct resource *devm_request_free_mem_region(struct device *dev, + struct resource *base, unsigned long size) +{ + resource_size_t end, addr; + struct resource *res; + + size = ALIGN(size, 1UL << PA_SECTION_SHIFT); + end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1); + addr = end - size + 1UL; + + for (; addr > size && addr >= base->start; addr -= size) { + if (region_intersects(addr, size, 0, IORES_DESC_NONE) !+ REGION_DISJOINT) + continue; + + res = devm_request_mem_region(dev, addr, size, dev_name(dev)); + if (!res) + return ERR_PTR(-ENOMEM); + res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; + return res; + } + + return ERR_PTR(-ERANGE); +} +EXPORT_SYMBOL_GPL(devm_request_free_mem_region); +#endif /* CONFIG_DEVICE_PRIVATE */ + static int __init strict_iomem(char *str) { if (strstr(str, "relaxed")) diff --git a/mm/hmm.c b/mm/hmm.c index e1dc98407e7b..13a16faf0a77 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -26,8 +26,6 @@ #include <linux/mmu_notifier.h> #include <linux/memory_hotplug.h> -#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT) - #if IS_ENABLED(CONFIG_HMM_MIRROR) static const struct mmu_notifier_ops hmm_mmu_notifier_ops; @@ -1372,7 +1370,6 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, unsigned long size) { struct hmm_devmem *devmem; - resource_size_t addr; void *result; int ret; @@ -1398,32 +1395,10 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, if (ret) return ERR_PTR(ret); - size = ALIGN(size, PA_SECTION_SIZE); - addr = min((unsigned long)iomem_resource.end, - (1UL << MAX_PHYSMEM_BITS) - 1); - addr = addr - size + 1UL; - - /* - * FIXME add a new helper to quickly walk resource tree and find free - * range - * - * FIXME what about ioport_resource resource ? - */ - for (; addr > size && addr >= iomem_resource.start; addr -= size) { - ret = region_intersects(addr, size, 0, IORES_DESC_NONE); - if (ret != REGION_DISJOINT) - continue; - - devmem->resource = devm_request_mem_region(device, addr, size, - dev_name(device)); - if (!devmem->resource) - return ERR_PTR(-ENOMEM); - break; - } - if (!devmem->resource) - return ERR_PTR(-ERANGE); - - devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; + devmem->resource = devm_request_free_mem_region(device, &iomem_resource, + size); + if (IS_ERR(devmem->resource)) + return ERR_CAST(devmem->resource); devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; devmem->pfn_last = devmem->pfn_first + (resource_size(devmem->resource) >> PAGE_SHIFT); -- 2.20.1
Christoph Hellwig
2019-Jun-13 09:43 UTC
[Nouveau] [PATCH 07/22] memremap: move dev_pagemap callbacks into a separate structure
The dev_pagemap is a growing too many callbacks. Move them into a separate ops structure so that they are not duplicated for multiple instances, and an attacker can't easily overwrite them. Signed-off-by: Christoph Hellwig <hch at lst.de> --- drivers/dax/device.c | 6 +++++- drivers/nvdimm/pmem.c | 18 +++++++++++++----- drivers/pci/p2pdma.c | 5 ++++- include/linux/memremap.h | 29 +++++++++++++++-------------- kernel/memremap.c | 12 ++++++------ mm/hmm.c | 8 ++++++-- tools/testing/nvdimm/test/iomap.c | 2 +- 7 files changed, 50 insertions(+), 30 deletions(-) diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 996d68ff992a..4adab774dade 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -443,6 +443,10 @@ static void dev_dax_kill(void *dev_dax) kill_dev_dax(dev_dax); } +static const struct dev_pagemap_ops dev_dax_pagemap_ops = { + .kill = dev_dax_percpu_kill, +}; + int dev_dax_probe(struct device *dev) { struct dev_dax *dev_dax = to_dev_dax(dev); @@ -471,7 +475,7 @@ int dev_dax_probe(struct device *dev) return rc; dev_dax->pgmap.ref = &dev_dax->ref; - dev_dax->pgmap.kill = dev_dax_percpu_kill; + dev_dax->pgmap.ops = &dev_dax_pagemap_ops; addr = devm_memremap_pages(dev, &dev_dax->pgmap); if (IS_ERR(addr)) { devm_remove_action(dev, dev_dax_percpu_exit, &dev_dax->ref); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index d9d845077b8b..4efbf184ea68 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -316,7 +316,7 @@ static void pmem_release_queue(void *q) blk_cleanup_queue(q); } -static void pmem_freeze_queue(struct percpu_ref *ref) +static void pmem_kill(struct percpu_ref *ref) { struct request_queue *q; @@ -339,19 +339,27 @@ static void pmem_release_pgmap_ops(void *__pgmap) dev_pagemap_put_ops(); } -static void fsdax_pagefree(struct page *page, void *data) +static void pmem_fsdax_page_free(struct page *page, void *data) { wake_up_var(&page->_refcount); } +static const struct dev_pagemap_ops fsdax_pagemap_ops = { + .page_free = pmem_fsdax_page_free, + .kill = pmem_kill, +}; + +static const struct dev_pagemap_ops pmem_legacy_pagemap_ops = { + .kill = pmem_kill, +}; + static int setup_pagemap_fsdax(struct device *dev, struct dev_pagemap *pgmap) { dev_pagemap_get_ops(); if (devm_add_action_or_reset(dev, pmem_release_pgmap_ops, pgmap)) return -ENOMEM; pgmap->type = MEMORY_DEVICE_FS_DAX; - pgmap->page_free = fsdax_pagefree; - + pgmap->ops = &fsdax_pagemap_ops; return 0; } @@ -412,7 +420,6 @@ static int pmem_attach_disk(struct device *dev, pmem->pfn_flags = PFN_DEV; pmem->pgmap.ref = &q->q_usage_counter; - pmem->pgmap.kill = pmem_freeze_queue; if (is_nd_pfn(dev)) { if (setup_pagemap_fsdax(dev, &pmem->pgmap)) return -ENOMEM; @@ -433,6 +440,7 @@ static int pmem_attach_disk(struct device *dev, pmem->pfn_flags |= PFN_MAP; memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res)); } else { + pmem->pgmap.ops = &pmem_legacy_pagemap_ops; addr = devm_memremap(dev, pmem->phys_addr, pmem->size, ARCH_MEMREMAP_PMEM); memcpy(&bb_res, &nsio->res, sizeof(bb_res)); diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 742928d0053e..6e76380f5b97 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -150,6 +150,10 @@ static int pci_p2pdma_setup(struct pci_dev *pdev) return error; } +static const struct dev_pagemap_ops pci_p2pdma_pagemap_ops = { + .kill = pci_p2pdma_percpu_kill, +}; + /** * pci_p2pdma_add_resource - add memory for use as p2p memory * @pdev: the device to add the memory to @@ -196,7 +200,6 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, pgmap->type = MEMORY_DEVICE_PCI_P2PDMA; pgmap->pci_p2pdma_bus_offset = pci_bus_address(pdev, bar) - pci_resource_start(pdev, bar); - pgmap->kill = pci_p2pdma_percpu_kill; addr = devm_memremap_pages(&pdev->dev, pgmap); if (IS_ERR(addr)) { diff --git a/include/linux/memremap.h b/include/linux/memremap.h index f0628660d541..5f7f40875b35 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -63,39 +63,40 @@ enum memory_type { MEMORY_DEVICE_PCI_P2PDMA, }; -/* - * Additional notes about MEMORY_DEVICE_PRIVATE may be found in - * include/linux/hmm.h and Documentation/vm/hmm.rst. There is also a brief - * explanation in include/linux/memory_hotplug.h. - * - * The page_free() callback is called once the page refcount reaches 1 - * (ZONE_DEVICE pages never reach 0 refcount unless there is a refcount bug. - * This allows the device driver to implement its own memory management.) - */ -typedef void (*dev_page_free_t)(struct page *page, void *data); +struct dev_pagemap_ops { + /* + * Called once the page refcount reaches 1. (ZONE_DEVICE pages never + * reach 0 refcount unless there is a refcount bug. This allows the + * device driver to implement its own memory management.) + */ + void (*page_free)(struct page *page, void *data); + + /* + * Transition the percpu_ref in struct dev_pagemap to the dead state. + */ + void (*kill)(struct percpu_ref *ref); +}; /** * struct dev_pagemap - metadata for ZONE_DEVICE mappings - * @page_free: free page callback when page refcount reaches 1 * @altmap: pre-allocated/reserved memory for vmemmap allocations * @res: physical address range covered by @ref * @ref: reference count that pins the devm_memremap_pages() mapping - * @kill: callback to transition @ref to the dead state * @dev: host device of the mapping for debug * @data: private data pointer for page_free() * @type: memory type: see MEMORY_* in memory_hotplug.h + * @ops: method table */ struct dev_pagemap { - dev_page_free_t page_free; struct vmem_altmap altmap; bool altmap_valid; struct resource res; struct percpu_ref *ref; - void (*kill)(struct percpu_ref *ref); struct device *dev; void *data; enum memory_type type; u64 pci_p2pdma_bus_offset; + const struct dev_pagemap_ops *ops; }; #ifdef CONFIG_ZONE_DEVICE diff --git a/kernel/memremap.c b/kernel/memremap.c index 1490e63f69a9..e23286ce0ec4 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -92,7 +92,7 @@ static void devm_memremap_pages_release(void *data) unsigned long pfn; int nid; - pgmap->kill(pgmap->ref); + pgmap->ops->kill(pgmap->ref); for_each_device_pfn(pfn, pgmap) put_page(pfn_to_page(pfn)); @@ -127,8 +127,8 @@ static void devm_memremap_pages_release(void *data) * @pgmap: pointer to a struct dev_pagemap * * Notes: - * 1/ At a minimum the res, ref and type members of @pgmap must be initialized - * by the caller before passing it to this function + * 1/ At a minimum the res, ref and type and ops members of @pgmap must be + * initialized by the caller before passing it to this function * * 2/ The altmap field may optionally be initialized, in which case altmap_valid * must be set to true @@ -156,7 +156,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; - if (!pgmap->ref || !pgmap->kill) + if (!pgmap->ref || !pgmap->ops || !pgmap->ops->kill) return ERR_PTR(-EINVAL); align_start = res->start & ~(SECTION_SIZE - 1); @@ -266,7 +266,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_pfn_remap: pgmap_array_delete(res); err_array: - pgmap->kill(pgmap->ref); + pgmap->ops->kill(pgmap->ref); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(devm_memremap_pages); @@ -353,7 +353,7 @@ void __put_devmap_managed_page(struct page *page) mem_cgroup_uncharge(page); - page->pgmap->page_free(page, page->pgmap->data); + page->pgmap->ops->page_free(page, page->pgmap->data); } else if (!count) __put_page(page); } diff --git a/mm/hmm.c b/mm/hmm.c index 13a16faf0a77..2501ac6045d0 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1348,6 +1348,11 @@ static void hmm_devmem_free(struct page *page, void *data) devmem->ops->free(devmem, page); } +static const struct dev_pagemap_ops hmm_pagemap_ops = { + .page_free = hmm_devmem_free, + .kill = hmm_devmem_ref_kill, +}; + /* * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory * @@ -1406,11 +1411,10 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; devmem->pagemap.res = *devmem->resource; - devmem->pagemap.page_free = hmm_devmem_free; + devmem->pagemap.ops = &hmm_pagemap_ops; devmem->pagemap.altmap_valid = false; devmem->pagemap.ref = &devmem->ref; devmem->pagemap.data = devmem; - devmem->pagemap.kill = hmm_devmem_ref_kill; result = devm_memremap_pages(devmem->device, &devmem->pagemap); if (IS_ERR(result)) diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index c6635fee27d8..7f5ece9b5011 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -108,7 +108,7 @@ static void nfit_test_kill(void *_pgmap) { struct dev_pagemap *pgmap = _pgmap; - pgmap->kill(pgmap->ref); + pgmap->ops->kill(pgmap->ref); } void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) -- 2.20.1
Christoph Hellwig
2019-Jun-13 09:43 UTC
[Nouveau] [PATCH 08/22] memremap: pass a struct dev_pagemap to ->kill
Passing the actual typed structure leads to more understandable code vs the actual references. Signed-off-by: Christoph Hellwig <hch at lst.de> --- drivers/dax/device.c | 7 +++---- drivers/nvdimm/pmem.c | 6 +++--- drivers/pci/p2pdma.c | 6 +++--- include/linux/memremap.h | 2 +- kernel/memremap.c | 4 ++-- mm/hmm.c | 4 ++-- tools/testing/nvdimm/test/iomap.c | 6 ++---- 7 files changed, 16 insertions(+), 19 deletions(-) diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 4adab774dade..e23fa1bd8c97 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -37,13 +37,12 @@ static void dev_dax_percpu_exit(void *data) percpu_ref_exit(ref); } -static void dev_dax_percpu_kill(struct percpu_ref *data) +static void dev_dax_percpu_kill(struct dev_pagemap *pgmap) { - struct percpu_ref *ref = data; - struct dev_dax *dev_dax = ref_to_dev_dax(ref); + struct dev_dax *dev_dax = container_of(pgmap, struct dev_dax, pgmap); dev_dbg(&dev_dax->dev, "%s\n", __func__); - percpu_ref_kill(ref); + percpu_ref_kill(pgmap->ref); } static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 4efbf184ea68..b9638c6553a1 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -316,11 +316,11 @@ static void pmem_release_queue(void *q) blk_cleanup_queue(q); } -static void pmem_kill(struct percpu_ref *ref) +static void pmem_kill(struct dev_pagemap *pgmap) { - struct request_queue *q; + struct request_queue *q + container_of(pgmap->ref, struct request_queue, q_usage_counter); - q = container_of(ref, typeof(*q), q_usage_counter); blk_freeze_queue_start(q); } diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index 6e76380f5b97..3bcacc9222c6 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -82,7 +82,7 @@ static void pci_p2pdma_percpu_release(struct percpu_ref *ref) complete_all(&p2p->devmap_ref_done); } -static void pci_p2pdma_percpu_kill(struct percpu_ref *ref) +static void pci_p2pdma_percpu_kill(struct dev_pagemap *pgmap) { /* * pci_p2pdma_add_resource() may be called multiple times @@ -90,10 +90,10 @@ static void pci_p2pdma_percpu_kill(struct percpu_ref *ref) * times. We only want the first action to actually kill the * percpu_ref. */ - if (percpu_ref_is_dying(ref)) + if (percpu_ref_is_dying(pgmap->ref)) return; - percpu_ref_kill(ref); + percpu_ref_kill(pgmap->ref); } static void pci_p2pdma_release(void *data) diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 5f7f40875b35..96a3a6d564ad 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -74,7 +74,7 @@ struct dev_pagemap_ops { /* * Transition the percpu_ref in struct dev_pagemap to the dead state. */ - void (*kill)(struct percpu_ref *ref); + void (*kill)(struct dev_pagemap *pgmap); }; /** diff --git a/kernel/memremap.c b/kernel/memremap.c index e23286ce0ec4..94b830b6eca5 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -92,7 +92,7 @@ static void devm_memremap_pages_release(void *data) unsigned long pfn; int nid; - pgmap->ops->kill(pgmap->ref); + pgmap->ops->kill(pgmap); for_each_device_pfn(pfn, pgmap) put_page(pfn_to_page(pfn)); @@ -266,7 +266,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_pfn_remap: pgmap_array_delete(res); err_array: - pgmap->ops->kill(pgmap->ref); + pgmap->ops->kill(pgmap); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(devm_memremap_pages); diff --git a/mm/hmm.c b/mm/hmm.c index 2501ac6045d0..c76a1b5defda 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1325,9 +1325,9 @@ static void hmm_devmem_ref_exit(void *data) percpu_ref_exit(ref); } -static void hmm_devmem_ref_kill(struct percpu_ref *ref) +static void hmm_devmem_ref_kill(struct dev_pagemap *pgmap) { - percpu_ref_kill(ref); + percpu_ref_kill(pgmap->ref); } static vm_fault_t hmm_devmem_fault(struct vm_area_struct *vma, diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index 7f5ece9b5011..ee07c4de2b35 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -104,11 +104,9 @@ void *__wrap_devm_memremap(struct device *dev, resource_size_t offset, } EXPORT_SYMBOL(__wrap_devm_memremap); -static void nfit_test_kill(void *_pgmap) +static void nfit_test_kill(void *pgmap) { - struct dev_pagemap *pgmap = _pgmap; - - pgmap->ops->kill(pgmap->ref); + pgmap->ops->kill(pgmap); } void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) -- 2.20.1
Christoph Hellwig
2019-Jun-13 09:43 UTC
[Nouveau] [PATCH 09/22] memremap: lift the devmap_enable manipulation into devm_memremap_pages
Just check if there is a ->page_free operation set and take care of the static key enable, as well as the put using device managed resources. Signed-off-by: Christoph Hellwig <hch at lst.de> --- drivers/nvdimm/pmem.c | 23 +++-------------- include/linux/mm.h | 10 -------- kernel/memremap.c | 59 +++++++++++++++++++++++++++---------------- mm/hmm.c | 2 -- 4 files changed, 41 insertions(+), 53 deletions(-) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index b9638c6553a1..66837eed6375 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -334,11 +334,6 @@ static void pmem_release_disk(void *__pmem) put_disk(pmem->disk); } -static void pmem_release_pgmap_ops(void *__pgmap) -{ - dev_pagemap_put_ops(); -} - static void pmem_fsdax_page_free(struct page *page, void *data) { wake_up_var(&page->_refcount); @@ -353,16 +348,6 @@ static const struct dev_pagemap_ops pmem_legacy_pagemap_ops = { .kill = pmem_kill, }; -static int setup_pagemap_fsdax(struct device *dev, struct dev_pagemap *pgmap) -{ - dev_pagemap_get_ops(); - if (devm_add_action_or_reset(dev, pmem_release_pgmap_ops, pgmap)) - return -ENOMEM; - pgmap->type = MEMORY_DEVICE_FS_DAX; - pgmap->ops = &fsdax_pagemap_ops; - return 0; -} - static int pmem_attach_disk(struct device *dev, struct nd_namespace_common *ndns) { @@ -421,8 +406,8 @@ static int pmem_attach_disk(struct device *dev, pmem->pfn_flags = PFN_DEV; pmem->pgmap.ref = &q->q_usage_counter; if (is_nd_pfn(dev)) { - if (setup_pagemap_fsdax(dev, &pmem->pgmap)) - return -ENOMEM; + pmem->pgmap.type = MEMORY_DEVICE_FS_DAX; + pmem->pgmap.ops = &fsdax_pagemap_ops; addr = devm_memremap_pages(dev, &pmem->pgmap); pfn_sb = nd_pfn->pfn_sb; pmem->data_offset = le64_to_cpu(pfn_sb->dataoff); @@ -434,8 +419,8 @@ static int pmem_attach_disk(struct device *dev, } else if (pmem_should_map_pages(dev)) { memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res)); pmem->pgmap.altmap_valid = false; - if (setup_pagemap_fsdax(dev, &pmem->pgmap)) - return -ENOMEM; + pmem->pgmap.type = MEMORY_DEVICE_FS_DAX; + pmem->pgmap.ops = &fsdax_pagemap_ops; addr = devm_memremap_pages(dev, &pmem->pgmap); pmem->pfn_flags |= PFN_MAP; memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res)); diff --git a/include/linux/mm.h b/include/linux/mm.h index 0e8834ac32b7..edcf2b821647 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -921,8 +921,6 @@ static inline bool is_zone_device_page(const struct page *page) #endif #ifdef CONFIG_DEV_PAGEMAP_OPS -void dev_pagemap_get_ops(void); -void dev_pagemap_put_ops(void); void __put_devmap_managed_page(struct page *page); DECLARE_STATIC_KEY_FALSE(devmap_managed_key); static inline bool put_devmap_managed_page(struct page *page) @@ -969,14 +967,6 @@ static inline bool is_pci_p2pdma_page(const struct page *page) #endif /* CONFIG_PCI_P2PDMA */ #else /* CONFIG_DEV_PAGEMAP_OPS */ -static inline void dev_pagemap_get_ops(void) -{ -} - -static inline void dev_pagemap_put_ops(void) -{ -} - static inline bool put_devmap_managed_page(struct page *page) { return false; diff --git a/kernel/memremap.c b/kernel/memremap.c index 94b830b6eca5..6a3183cac764 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -17,6 +17,37 @@ static DEFINE_XARRAY(pgmap_array); #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) #define SECTION_SIZE (1UL << PA_SECTION_SHIFT) +#ifdef CONFIG_DEV_PAGEMAP_OPS +DEFINE_STATIC_KEY_FALSE(devmap_managed_key); +EXPORT_SYMBOL(devmap_managed_key); +static atomic_t devmap_enable; + +static void dev_pagemap_put_ops(void *data) +{ + if (atomic_dec_and_test(&devmap_enable)) + static_branch_disable(&devmap_managed_key); +} + +/* + * Toggle the static key for ->page_free() callbacks when dev_pagemap + * pages go idle. + */ +static int dev_pagemap_enable(struct device *dev) +{ + if (atomic_inc_return(&devmap_enable) == 1) + static_branch_enable(&devmap_managed_key); + + if (devm_add_action_or_reset(dev, dev_pagemap_put_ops, NULL)) + return -ENOMEM; + return 0; +} +#else +static inline int dev_pagemap_enable(struct device *dev) +{ + return 0; +} +#endif /* CONFIG_DEV_PAGEMAP_OPS */ + #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, unsigned long addr, @@ -159,6 +190,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) if (!pgmap->ref || !pgmap->ops || !pgmap->ops->kill) return ERR_PTR(-EINVAL); + if (pgmap->ops->page_free) { + error = dev_pagemap_enable(dev); + if (error) + return ERR_PTR(error); + } + align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) - align_start; @@ -316,28 +353,6 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, EXPORT_SYMBOL_GPL(get_dev_pagemap); #ifdef CONFIG_DEV_PAGEMAP_OPS -DEFINE_STATIC_KEY_FALSE(devmap_managed_key); -EXPORT_SYMBOL(devmap_managed_key); -static atomic_t devmap_enable; - -/* - * Toggle the static key for ->page_free() callbacks when dev_pagemap - * pages go idle. - */ -void dev_pagemap_get_ops(void) -{ - if (atomic_inc_return(&devmap_enable) == 1) - static_branch_enable(&devmap_managed_key); -} -EXPORT_SYMBOL_GPL(dev_pagemap_get_ops); - -void dev_pagemap_put_ops(void) -{ - if (atomic_dec_and_test(&devmap_enable)) - static_branch_disable(&devmap_managed_key); -} -EXPORT_SYMBOL_GPL(dev_pagemap_put_ops); - void __put_devmap_managed_page(struct page *page) { int count = page_ref_dec_return(page); diff --git a/mm/hmm.c b/mm/hmm.c index c76a1b5defda..6dc769feb2e1 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1378,8 +1378,6 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, void *result; int ret; - dev_pagemap_get_ops(); - devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL); if (!devmem) return ERR_PTR(-ENOMEM); -- 2.20.1
Christoph Hellwig
2019-Jun-13 09:43 UTC
[Nouveau] [PATCH 10/22] memremap: add a migrate callback to struct dev_pagemap_ops
This replaces the hacky ->fault callback, which is currently directly called from common code through a hmm specific data structure as an exercise in layering violations. Signed-off-by: Christoph Hellwig <hch at lst.de> --- include/linux/hmm.h | 6 ------ include/linux/memremap.h | 6 ++++++ include/linux/swapops.h | 15 --------------- kernel/memremap.c | 31 ------------------------------- mm/hmm.c | 13 +++++-------- mm/memory.c | 9 ++------- 6 files changed, 13 insertions(+), 67 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 5761a39221a6..3c9a59dbfdb8 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -658,11 +658,6 @@ struct hmm_devmem_ops { * chunk, as an optimization. It must, however, prioritize the faulting address * over all the others. */ -typedef vm_fault_t (*dev_page_fault_t)(struct vm_area_struct *vma, - unsigned long addr, - const struct page *page, - unsigned int flags, - pmd_t *pmdp); struct hmm_devmem { struct completion completion; @@ -673,7 +668,6 @@ struct hmm_devmem { struct dev_pagemap pagemap; const struct hmm_devmem_ops *ops; struct percpu_ref ref; - dev_page_fault_t page_fault; }; /* diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 96a3a6d564ad..03a4099be701 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -75,6 +75,12 @@ struct dev_pagemap_ops { * Transition the percpu_ref in struct dev_pagemap to the dead state. */ void (*kill)(struct dev_pagemap *pgmap); + + /* + * Used for private (un-addressable) device memory only. Must migrate + * the page back to a CPU accessible page. + */ + vm_fault_t (*migrate)(struct vm_fault *vmf); }; /** diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 4d961668e5fc..15bdb6fe71e5 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -129,12 +129,6 @@ static inline struct page *device_private_entry_to_page(swp_entry_t entry) { return pfn_to_page(swp_offset(entry)); } - -vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, - unsigned long addr, - swp_entry_t entry, - unsigned int flags, - pmd_t *pmdp); #else /* CONFIG_DEVICE_PRIVATE */ static inline swp_entry_t make_device_private_entry(struct page *page, bool write) { @@ -164,15 +158,6 @@ static inline struct page *device_private_entry_to_page(swp_entry_t entry) { return NULL; } - -static inline vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, - unsigned long addr, - swp_entry_t entry, - unsigned int flags, - pmd_t *pmdp) -{ - return VM_FAULT_SIGBUS; -} #endif /* CONFIG_DEVICE_PRIVATE */ #ifdef CONFIG_MIGRATION diff --git a/kernel/memremap.c b/kernel/memremap.c index 6a3183cac764..7167e717647d 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -11,7 +11,6 @@ #include <linux/types.h> #include <linux/wait_bit.h> #include <linux/xarray.h> -#include <linux/hmm.h> static DEFINE_XARRAY(pgmap_array); #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) @@ -48,36 +47,6 @@ static inline int dev_pagemap_enable(struct device *dev) } #endif /* CONFIG_DEV_PAGEMAP_OPS */ -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) -vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, - unsigned long addr, - swp_entry_t entry, - unsigned int flags, - pmd_t *pmdp) -{ - struct page *page = device_private_entry_to_page(entry); - struct hmm_devmem *devmem; - - devmem = container_of(page->pgmap, typeof(*devmem), pagemap); - - /* - * The page_fault() callback must migrate page back to system memory - * so that CPU can access it. This might fail for various reasons - * (device issue, device was unsafely unplugged, ...). When such - * error conditions happen, the callback must return VM_FAULT_SIGBUS. - * - * Note that because memory cgroup charges are accounted to the device - * memory, this should never fail because of memory restrictions (but - * allocation of regular system page might still fail because we are - * out of memory). - * - * There is a more in-depth description of what that callback can and - * cannot do, in include/linux/memremap.h - */ - return devmem->page_fault(vma, addr, page, flags, pmdp); -} -#endif /* CONFIG_DEVICE_PRIVATE */ - static void pgmap_array_delete(struct resource *res) { xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end), diff --git a/mm/hmm.c b/mm/hmm.c index 6dc769feb2e1..aab799677c7d 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1330,15 +1330,12 @@ static void hmm_devmem_ref_kill(struct dev_pagemap *pgmap) percpu_ref_kill(pgmap->ref); } -static vm_fault_t hmm_devmem_fault(struct vm_area_struct *vma, - unsigned long addr, - const struct page *page, - unsigned int flags, - pmd_t *pmdp) +static vm_fault_t hmm_devmem_migrate(struct vm_fault *vmf) { - struct hmm_devmem *devmem = page->pgmap->data; + struct hmm_devmem *devmem = vmf->page->pgmap->data; - return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp); + return devmem->ops->fault(devmem, vmf->vma, vmf->address, vmf->page, + vmf->flags, vmf->pmd); } static void hmm_devmem_free(struct page *page, void *data) @@ -1351,6 +1348,7 @@ static void hmm_devmem_free(struct page *page, void *data) static const struct dev_pagemap_ops hmm_pagemap_ops = { .page_free = hmm_devmem_free, .kill = hmm_devmem_ref_kill, + .migrate = hmm_devmem_migrate, }; /* @@ -1405,7 +1403,6 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; devmem->pfn_last = devmem->pfn_first + (resource_size(devmem->resource) >> PAGE_SHIFT); - devmem->page_fault = hmm_devmem_fault; devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; devmem->pagemap.res = *devmem->resource; diff --git a/mm/memory.c b/mm/memory.c index ddf20bd0c317..cbf3cb598436 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2782,13 +2782,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) migration_entry_wait(vma->vm_mm, vmf->pmd, vmf->address); } else if (is_device_private_entry(entry)) { - /* - * For un-addressable device memory we call the pgmap - * fault handler callback. The callback must migrate - * the page back to some CPU accessible page. - */ - ret = device_private_entry_fault(vma, vmf->address, entry, - vmf->flags, vmf->pmd); + vmf->page = device_private_entry_to_page(entry); + ret = page->pgmap->ops->migrate(vmf); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else { -- 2.20.1
Christoph Hellwig
2019-Jun-13 09:43 UTC
[Nouveau] [PATCH 11/22] memremap: remove the data field in struct dev_pagemap
struct dev_pagemap is always embedded into a containing structure, so there is no need to an additional private data field. Signed-off-by: Christoph Hellwig <hch at lst.de> --- drivers/nvdimm/pmem.c | 2 +- include/linux/memremap.h | 3 +-- kernel/memremap.c | 2 +- mm/hmm.c | 9 +++++---- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 66837eed6375..847d1b2bc10e 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -334,7 +334,7 @@ static void pmem_release_disk(void *__pmem) put_disk(pmem->disk); } -static void pmem_fsdax_page_free(struct page *page, void *data) +static void pmem_fsdax_page_free(struct page *page) { wake_up_var(&page->_refcount); } diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 03a4099be701..75b80de6394a 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -69,7 +69,7 @@ struct dev_pagemap_ops { * reach 0 refcount unless there is a refcount bug. This allows the * device driver to implement its own memory management.) */ - void (*page_free)(struct page *page, void *data); + void (*page_free)(struct page *page); /* * Transition the percpu_ref in struct dev_pagemap to the dead state. @@ -99,7 +99,6 @@ struct dev_pagemap { struct resource res; struct percpu_ref *ref; struct device *dev; - void *data; enum memory_type type; u64 pci_p2pdma_bus_offset; const struct dev_pagemap_ops *ops; diff --git a/kernel/memremap.c b/kernel/memremap.c index 7167e717647d..5c94ad4f5783 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -337,7 +337,7 @@ void __put_devmap_managed_page(struct page *page) mem_cgroup_uncharge(page); - page->pgmap->ops->page_free(page, page->pgmap->data); + page->pgmap->ops->page_free(page); } else if (!count) __put_page(page); } diff --git a/mm/hmm.c b/mm/hmm.c index aab799677c7d..ff0f9568922b 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1332,15 +1332,17 @@ static void hmm_devmem_ref_kill(struct dev_pagemap *pgmap) static vm_fault_t hmm_devmem_migrate(struct vm_fault *vmf) { - struct hmm_devmem *devmem = vmf->page->pgmap->data; + struct hmm_devmem *devmem + container_of(vmf->page->pgmap, struct hmm_devmem, pagemap); return devmem->ops->fault(devmem, vmf->vma, vmf->address, vmf->page, vmf->flags, vmf->pmd); } -static void hmm_devmem_free(struct page *page, void *data) +static void hmm_devmem_free(struct page *page) { - struct hmm_devmem *devmem = data; + struct hmm_devmem *devmem + container_of(page->pgmap, struct hmm_devmem, pagemap); devmem->ops->free(devmem, page); } @@ -1409,7 +1411,6 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, devmem->pagemap.ops = &hmm_pagemap_ops; devmem->pagemap.altmap_valid = false; devmem->pagemap.ref = &devmem->ref; - devmem->pagemap.data = devmem; result = devm_memremap_pages(devmem->device, &devmem->pagemap); if (IS_ERR(result)) -- 2.20.1
Christoph Hellwig
2019-Jun-13 09:43 UTC
[Nouveau] [PATCH 12/22] memremap: provide an optional internal refcount in struct dev_pagemap
Provide an internal refcounting logic if no ->ref field is provided in the pagemap passed into devm_memremap_pages so that callers don't have to reinvent it poorly. Signed-off-by: Christoph Hellwig <hch at lst.de> --- include/linux/memremap.h | 4 +++ kernel/memremap.c | 60 ++++++++++++++++++++++++++----- tools/testing/nvdimm/test/iomap.c | 9 +++-- 3 files changed, 62 insertions(+), 11 deletions(-) diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 75b80de6394a..b77ed00851ce 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -88,6 +88,8 @@ struct dev_pagemap_ops { * @altmap: pre-allocated/reserved memory for vmemmap allocations * @res: physical address range covered by @ref * @ref: reference count that pins the devm_memremap_pages() mapping + * @internal_ref: internal reference if @ref is not provided by the caller + * @done: completion for @internal_ref * @dev: host device of the mapping for debug * @data: private data pointer for page_free() * @type: memory type: see MEMORY_* in memory_hotplug.h @@ -98,6 +100,8 @@ struct dev_pagemap { bool altmap_valid; struct resource res; struct percpu_ref *ref; + struct percpu_ref internal_ref; + struct completion done; struct device *dev; enum memory_type type; u64 pci_p2pdma_bus_offset; diff --git a/kernel/memremap.c b/kernel/memremap.c index 5c94ad4f5783..edca4389da68 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -83,6 +83,14 @@ static unsigned long pfn_next(unsigned long pfn) #define for_each_device_pfn(pfn, map) \ for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn)) +static void dev_pagemap_kill(struct dev_pagemap *pgmap) +{ + if (pgmap->ops && pgmap->ops->kill) + pgmap->ops->kill(pgmap); + else + percpu_ref_kill(pgmap->ref); +} + static void devm_memremap_pages_release(void *data) { struct dev_pagemap *pgmap = data; @@ -92,7 +100,8 @@ static void devm_memremap_pages_release(void *data) unsigned long pfn; int nid; - pgmap->ops->kill(pgmap); + dev_pagemap_kill(pgmap); + for_each_device_pfn(pfn, pgmap) put_page(pfn_to_page(pfn)); @@ -121,20 +130,37 @@ static void devm_memremap_pages_release(void *data) "%s: failed to free all reserved pages\n", __func__); } +static void dev_pagemap_percpu_release(struct percpu_ref *ref) +{ + struct dev_pagemap *pgmap + container_of(ref, struct dev_pagemap, internal_ref); + + complete(&pgmap->done); +} + +static void dev_pagemap_percpu_exit(void *data) +{ + struct dev_pagemap *pgmap = data; + + wait_for_completion(&pgmap->done); + percpu_ref_exit(pgmap->ref); +} + /** * devm_memremap_pages - remap and provide memmap backing for the given resource * @dev: hosting device for @res * @pgmap: pointer to a struct dev_pagemap * * Notes: - * 1/ At a minimum the res, ref and type and ops members of @pgmap must be - * initialized by the caller before passing it to this function + * 1/ At a minimum the res and type members of @pgmap must be initialized + * by the caller before passing it to this function * * 2/ The altmap field may optionally be initialized, in which case altmap_valid * must be set to true * - * 3/ pgmap->ref must be 'live' on entry and will be killed at - * devm_memremap_pages_release() time, or if this routine fails. + * 3/ The ref field may optionally be provided, in which pgmap->ref must be + * 'live' on entry and will be killed at devm_memremap_pages_release() time, + * or if this routine fails. * * 4/ res is expected to be a host memory range that could feasibly be * treated as a "System RAM" range, i.e. not a device mmio range, but @@ -156,10 +182,26 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; - if (!pgmap->ref || !pgmap->ops || !pgmap->ops->kill) - return ERR_PTR(-EINVAL); + if (!pgmap->ref) { + if (pgmap->ops && pgmap->ops->kill) + return ERR_PTR(-EINVAL); + + init_completion(&pgmap->done); + error = percpu_ref_init(&pgmap->internal_ref, + dev_pagemap_percpu_release, 0, GFP_KERNEL); + if (error) + return ERR_PTR(error); + pgmap->ref = &pgmap->internal_ref; + error = devm_add_action_or_reset(dev, dev_pagemap_percpu_exit, + pgmap); + if (error) + return ERR_PTR(error); + } else { + if (!pgmap->ops || !pgmap->ops->kill) + return ERR_PTR(-EINVAL); + } - if (pgmap->ops->page_free) { + if (pgmap->ops && pgmap->ops->page_free) { error = dev_pagemap_enable(dev); if (error) return ERR_PTR(error); @@ -272,7 +314,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_pfn_remap: pgmap_array_delete(res); err_array: - pgmap->ops->kill(pgmap); + dev_pagemap_kill(pgmap); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(devm_memremap_pages); diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index ee07c4de2b35..3d0e916f9fff 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -104,9 +104,14 @@ void *__wrap_devm_memremap(struct device *dev, resource_size_t offset, } EXPORT_SYMBOL(__wrap_devm_memremap); -static void nfit_test_kill(void *pgmap) +static void nfit_test_kill(void *data) { - pgmap->ops->kill(pgmap); + struct dev_pagemap *pgmap = data; + + if (pgmap->ops && pgmap->ops->kill) + pgmap->ops->kill(pgmap); + else + percpu_ref_kill(pgmap->ref); } void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) -- 2.20.1
Christoph Hellwig
2019-Jun-13 09:43 UTC
[Nouveau] [PATCH 13/22] device-dax: use the dev_pagemap internal refcount
The functionality is identical to the one currently open coded in device-dax. Signed-off-by: Christoph Hellwig <hch at lst.de> --- drivers/dax/dax-private.h | 4 --- drivers/dax/device.c | 52 +-------------------------------------- 2 files changed, 1 insertion(+), 55 deletions(-) diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h index a45612148ca0..ed04a18a35be 100644 --- a/drivers/dax/dax-private.h +++ b/drivers/dax/dax-private.h @@ -51,8 +51,6 @@ struct dax_region { * @target_node: effective numa node if dev_dax memory range is onlined * @dev - device core * @pgmap - pgmap for memmap setup / lifetime (driver owned) - * @ref: pgmap reference count (driver owned) - * @cmp: @ref final put completion (driver owned) */ struct dev_dax { struct dax_region *region; @@ -60,8 +58,6 @@ struct dev_dax { int target_node; struct device dev; struct dev_pagemap pgmap; - struct percpu_ref ref; - struct completion cmp; }; static inline struct dev_dax *to_dev_dax(struct device *dev) diff --git a/drivers/dax/device.c b/drivers/dax/device.c index e23fa1bd8c97..a9d7c90ecf1e 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -14,37 +14,6 @@ #include "dax-private.h" #include "bus.h" -static struct dev_dax *ref_to_dev_dax(struct percpu_ref *ref) -{ - return container_of(ref, struct dev_dax, ref); -} - -static void dev_dax_percpu_release(struct percpu_ref *ref) -{ - struct dev_dax *dev_dax = ref_to_dev_dax(ref); - - dev_dbg(&dev_dax->dev, "%s\n", __func__); - complete(&dev_dax->cmp); -} - -static void dev_dax_percpu_exit(void *data) -{ - struct percpu_ref *ref = data; - struct dev_dax *dev_dax = ref_to_dev_dax(ref); - - dev_dbg(&dev_dax->dev, "%s\n", __func__); - wait_for_completion(&dev_dax->cmp); - percpu_ref_exit(ref); -} - -static void dev_dax_percpu_kill(struct dev_pagemap *pgmap) -{ - struct dev_dax *dev_dax = container_of(pgmap, struct dev_dax, pgmap); - - dev_dbg(&dev_dax->dev, "%s\n", __func__); - percpu_ref_kill(pgmap->ref); -} - static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, const char *func) { @@ -442,10 +411,6 @@ static void dev_dax_kill(void *dev_dax) kill_dev_dax(dev_dax); } -static const struct dev_pagemap_ops dev_dax_pagemap_ops = { - .kill = dev_dax_percpu_kill, -}; - int dev_dax_probe(struct device *dev) { struct dev_dax *dev_dax = to_dev_dax(dev); @@ -463,24 +428,9 @@ int dev_dax_probe(struct device *dev) return -EBUSY; } - init_completion(&dev_dax->cmp); - rc = percpu_ref_init(&dev_dax->ref, dev_dax_percpu_release, 0, - GFP_KERNEL); - if (rc) - return rc; - - rc = devm_add_action_or_reset(dev, dev_dax_percpu_exit, &dev_dax->ref); - if (rc) - return rc; - - dev_dax->pgmap.ref = &dev_dax->ref; - dev_dax->pgmap.ops = &dev_dax_pagemap_ops; addr = devm_memremap_pages(dev, &dev_dax->pgmap); - if (IS_ERR(addr)) { - devm_remove_action(dev, dev_dax_percpu_exit, &dev_dax->ref); - percpu_ref_exit(&dev_dax->ref); + if (IS_ERR(addr)) return PTR_ERR(addr); - } inode = dax_inode(dax_dev); cdev = inode->i_cdev; -- 2.20.1
Christoph Hellwig
2019-Jun-13 09:43 UTC
[Nouveau] [PATCH 14/22] nouveau: use alloc_page_vma directly
hmm_vma_alloc_locked_page is scheduled to go away, use the proper mm function directly. Signed-off-by: Christoph Hellwig <hch at lst.de> --- drivers/gpu/drm/nouveau/nouveau_dmem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 40c47d6a7d78..a50f6fd2fe24 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -148,11 +148,12 @@ nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct *vma, if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE)) continue; - dpage = hmm_vma_alloc_locked_page(vma, addr); + dpage = alloc_page_vma(GFP_HIGHUSER, vma, addr); if (!dpage) { dst_pfns[i] = MIGRATE_PFN_ERROR; continue; } + lock_page(dpage); dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED; -- 2.20.1
Christoph Hellwig
2019-Jun-13 09:43 UTC
[Nouveau] [PATCH 15/22] nouveau: use devm_memremap_pages directly
Just use devm_memremap_pages instead of hmm_devmem_add pages to allow killing that wrapper which doesn't provide a whole lot of benefits. Signed-off-by: Christoph Hellwig <hch at lst.de> --- drivers/gpu/drm/nouveau/nouveau_dmem.c | 80 ++++++++++++-------------- 1 file changed, 38 insertions(+), 42 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index a50f6fd2fe24..9e32bc8ecbc7 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -72,7 +72,8 @@ struct nouveau_dmem_migrate { }; struct nouveau_dmem { - struct hmm_devmem *devmem; + struct nouveau_drm *drm; + struct dev_pagemap pagemap; struct nouveau_dmem_migrate migrate; struct list_head chunk_free; struct list_head chunk_full; @@ -80,6 +81,11 @@ struct nouveau_dmem { struct mutex mutex; }; +static inline struct nouveau_dmem *page_to_dmem(struct page *page) +{ + return container_of(page->pgmap, struct nouveau_dmem, pagemap); +} + struct nouveau_dmem_fault { struct nouveau_drm *drm; struct nouveau_fence *fence; @@ -96,8 +102,7 @@ struct nouveau_migrate { unsigned long dma_nr; }; -static void -nouveau_dmem_free(struct hmm_devmem *devmem, struct page *page) +static void nouveau_dmem_page_free(struct page *page) { struct nouveau_dmem_chunk *chunk; unsigned long idx; @@ -260,29 +265,21 @@ static const struct migrate_vma_ops nouveau_dmem_fault_migrate_ops = { .finalize_and_map = nouveau_dmem_fault_finalize_and_map, }; -static vm_fault_t -nouveau_dmem_fault(struct hmm_devmem *devmem, - struct vm_area_struct *vma, - unsigned long addr, - const struct page *page, - unsigned int flags, - pmd_t *pmdp) +static vm_fault_t nouveau_dmem_devmem_migrate(struct vm_fault *vmf) { - struct drm_device *drm_dev = dev_get_drvdata(devmem->device); + struct nouveau_dmem *dmem = page_to_dmem(vmf->page); unsigned long src[1] = {0}, dst[1] = {0}; - struct nouveau_dmem_fault fault = {0}; + struct nouveau_dmem_fault fault = { .drm = dmem->drm }; int ret; - - /* * FIXME what we really want is to find some heuristic to migrate more * than just one page on CPU fault. When such fault happens it is very * likely that more surrounding page will CPU fault too. */ - fault.drm = nouveau_drm(drm_dev); - ret = migrate_vma(&nouveau_dmem_fault_migrate_ops, vma, addr, - addr + PAGE_SIZE, src, dst, &fault); + ret = migrate_vma(&nouveau_dmem_fault_migrate_ops, vmf->vma, + vmf->address, vmf->address + PAGE_SIZE, + src, dst, &fault); if (ret) return VM_FAULT_SIGBUS; @@ -292,10 +289,9 @@ nouveau_dmem_fault(struct hmm_devmem *devmem, return 0; } -static const struct hmm_devmem_ops -nouveau_dmem_devmem_ops = { - .free = nouveau_dmem_free, - .fault = nouveau_dmem_fault, +static const struct dev_pagemap_ops nouveau_dmem_pagemap_ops = { + .page_free = nouveau_dmem_page_free, + .migrate = nouveau_dmem_devmem_migrate, }; static int @@ -581,7 +577,8 @@ void nouveau_dmem_init(struct nouveau_drm *drm) { struct device *device = drm->dev->dev; - unsigned long i, size; + struct resource *res; + unsigned long i, size, pfn_first; int ret; /* This only make sense on PASCAL or newer */ @@ -591,6 +588,7 @@ nouveau_dmem_init(struct nouveau_drm *drm) if (!(drm->dmem = kzalloc(sizeof(*drm->dmem), GFP_KERNEL))) return; + drm->dmem->drm = drm; mutex_init(&drm->dmem->mutex); INIT_LIST_HEAD(&drm->dmem->chunk_free); INIT_LIST_HEAD(&drm->dmem->chunk_full); @@ -600,11 +598,8 @@ nouveau_dmem_init(struct nouveau_drm *drm) /* Initialize migration dma helpers before registering memory */ ret = nouveau_dmem_migrate_init(drm); - if (ret) { - kfree(drm->dmem); - drm->dmem = NULL; - return; - } + if (ret) + goto out_free; /* * FIXME we need some kind of policy to decide how much VRAM we @@ -612,14 +607,16 @@ nouveau_dmem_init(struct nouveau_drm *drm) * and latter if we want to do thing like over commit then we * could revisit this. */ - drm->dmem->devmem = hmm_devmem_add(&nouveau_dmem_devmem_ops, - device, size); - if (IS_ERR(drm->dmem->devmem)) { - kfree(drm->dmem); - drm->dmem = NULL; - return; - } - + res = devm_request_free_mem_region(device, &iomem_resource, size); + if (IS_ERR(res)) + goto out_free; + drm->dmem->pagemap.type = MEMORY_DEVICE_PRIVATE; + drm->dmem->pagemap.res = *res; + drm->dmem->pagemap.ops = &nouveau_dmem_pagemap_ops; + if (IS_ERR(devm_memremap_pages(device, &drm->dmem->pagemap))) + goto out_free; + + pfn_first = res->start >> PAGE_SHIFT; for (i = 0; i < (size / DMEM_CHUNK_SIZE); ++i) { struct nouveau_dmem_chunk *chunk; struct page *page; @@ -632,8 +629,7 @@ nouveau_dmem_init(struct nouveau_drm *drm) } chunk->drm = drm; - chunk->pfn_first = drm->dmem->devmem->pfn_first; - chunk->pfn_first += (i * DMEM_CHUNK_NPAGES); + chunk->pfn_first = pfn_first + (i * DMEM_CHUNK_NPAGES); list_add_tail(&chunk->list, &drm->dmem->chunk_empty); page = pfn_to_page(chunk->pfn_first); @@ -643,6 +639,10 @@ nouveau_dmem_init(struct nouveau_drm *drm) } NV_INFO(drm, "DMEM: registered %ldMB of device memory\n", size >> 20); + return; +out_free: + kfree(drm->dmem); + drm->dmem = NULL; } static void @@ -835,11 +835,7 @@ nouveau_dmem_page(struct nouveau_drm *drm, struct page *page) { if (!is_device_private_page(page)) return false; - - if (drm->dmem->devmem != page->pgmap->data) - return false; - - return true; + return drm->dmem == page_to_dmem(page); } void -- 2.20.1
Christoph Hellwig
2019-Jun-13 09:43 UTC
[Nouveau] [PATCH 16/22] mm: remove hmm_vma_alloc_locked_page
The only user of it has just been removed, and there wasn't really any need to wrap a basic memory allocator to start with. Signed-off-by: Christoph Hellwig <hch at lst.de> --- include/linux/hmm.h | 3 --- mm/hmm.c | 14 -------------- 2 files changed, 17 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 3c9a59dbfdb8..0e61d830b0a9 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -553,9 +553,6 @@ static inline void hmm_mm_init(struct mm_struct *mm) {} #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) struct hmm_devmem; -struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma, - unsigned long addr); - /* * struct hmm_devmem_ops - callback for ZONE_DEVICE memory events * diff --git a/mm/hmm.c b/mm/hmm.c index ff0f9568922b..c15283f9bbf0 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1293,20 +1293,6 @@ EXPORT_SYMBOL(hmm_range_dma_unmap); #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) -struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma, - unsigned long addr) -{ - struct page *page; - - page = alloc_page_vma(GFP_HIGHUSER, vma, addr); - if (!page) - return NULL; - lock_page(page); - return page; -} -EXPORT_SYMBOL(hmm_vma_alloc_locked_page); - - static void hmm_devmem_ref_release(struct percpu_ref *ref) { struct hmm_devmem *devmem; -- 2.20.1
There isn't really much value add in the hmm_devmem_add wrapper. Just factor out a little helper to find the resource, and otherwise let the driver implement the dev_pagemap_ops directly. Signed-off-by: Christoph Hellwig <hch at lst.de> --- Documentation/vm/hmm.rst | 26 -------- include/linux/hmm.h | 129 --------------------------------------- mm/hmm.c | 115 ---------------------------------- 3 files changed, 270 deletions(-) diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 7b6eeda5a7c0..b1c960fe246d 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -336,32 +336,6 @@ directly using struct page for device memory which left most kernel code paths unaware of the difference. We only need to make sure that no one ever tries to map those pages from the CPU side. -HMM provides a set of helpers to register and hotplug device memory as a new -region needing a struct page. This is offered through a very simple API:: - - struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, - struct device *device, - unsigned long size); - void hmm_devmem_remove(struct hmm_devmem *devmem); - -The hmm_devmem_ops is where most of the important things are:: - - struct hmm_devmem_ops { - void (*free)(struct hmm_devmem *devmem, struct page *page); - vm_fault_t (*fault)(struct hmm_devmem *devmem, - struct vm_area_struct *vma, - unsigned long addr, - struct page *page, - unsigned flags, - pmd_t *pmdp); - }; - -The first callback (free()) happens when the last reference on a device page is -dropped. This means the device page is now free and no longer used by anyone. -The second callback happens whenever the CPU tries to access a device page -which it cannot do. This second callback must trigger a migration back to -system memory. - Migration to and from device memory ==================================diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 0e61d830b0a9..13152ab504ec 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -551,135 +551,6 @@ static inline void hmm_mm_init(struct mm_struct *mm) {} #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) -struct hmm_devmem; - -/* - * struct hmm_devmem_ops - callback for ZONE_DEVICE memory events - * - * @free: call when refcount on page reach 1 and thus is no longer use - * @fault: call when there is a page fault to unaddressable memory - * - * Both callback happens from page_free() and page_fault() callback of struct - * dev_pagemap respectively. See include/linux/memremap.h for more details on - * those. - * - * The hmm_devmem_ops callback are just here to provide a coherent and - * uniq API to device driver and device driver should not register their - * own page_free() or page_fault() but rely on the hmm_devmem_ops call- - * back. - */ -struct hmm_devmem_ops { - /* - * free() - free a device page - * @devmem: device memory structure (see struct hmm_devmem) - * @page: pointer to struct page being freed - * - * Call back occurs whenever a device page refcount reach 1 which - * means that no one is holding any reference on the page anymore - * (ZONE_DEVICE page have an elevated refcount of 1 as default so - * that they are not release to the general page allocator). - * - * Note that callback has exclusive ownership of the page (as no - * one is holding any reference). - */ - void (*free)(struct hmm_devmem *devmem, struct page *page); - /* - * fault() - CPU page fault or get user page (GUP) - * @devmem: device memory structure (see struct hmm_devmem) - * @vma: virtual memory area containing the virtual address - * @addr: virtual address that faulted or for which there is a GUP - * @page: pointer to struct page backing virtual address (unreliable) - * @flags: FAULT_FLAG_* (see include/linux/mm.h) - * @pmdp: page middle directory - * Return: VM_FAULT_MINOR/MAJOR on success or one of VM_FAULT_ERROR - * on error - * - * The callback occurs whenever there is a CPU page fault or GUP on a - * virtual address. This means that the device driver must migrate the - * page back to regular memory (CPU accessible). - * - * The device driver is free to migrate more than one page from the - * fault() callback as an optimization. However if the device decides - * to migrate more than one page it must always priotirize the faulting - * address over the others. - * - * The struct page pointer is only given as a hint to allow quick - * lookup of internal device driver data. A concurrent migration - * might have already freed that page and the virtual address might - * no longer be backed by it. So it should not be modified by the - * callback. - * - * Note that mmap semaphore is held in read mode at least when this - * callback occurs, hence the vma is valid upon callback entry. - */ - vm_fault_t (*fault)(struct hmm_devmem *devmem, - struct vm_area_struct *vma, - unsigned long addr, - const struct page *page, - unsigned int flags, - pmd_t *pmdp); -}; - -/* - * struct hmm_devmem - track device memory - * - * @completion: completion object for device memory - * @pfn_first: first pfn for this resource (set by hmm_devmem_add()) - * @pfn_last: last pfn for this resource (set by hmm_devmem_add()) - * @resource: IO resource reserved for this chunk of memory - * @pagemap: device page map for that chunk - * @device: device to bind resource to - * @ops: memory operations callback - * @ref: per CPU refcount - * @page_fault: callback when CPU fault on an unaddressable device page - * - * This is a helper structure for device drivers that do not wish to implement - * the gory details related to hotplugging new memoy and allocating struct - * pages. - * - * Device drivers can directly use ZONE_DEVICE memory on their own if they - * wish to do so. - * - * The page_fault() callback must migrate page back, from device memory to - * system memory, so that the CPU can access it. This might fail for various - * reasons (device issues, device have been unplugged, ...). When such error - * conditions happen, the page_fault() callback must return VM_FAULT_SIGBUS and - * set the CPU page table entry to "poisoned". - * - * Note that because memory cgroup charges are transferred to the device memory, - * this should never fail due to memory restrictions. However, allocation - * of a regular system page might still fail because we are out of memory. If - * that happens, the page_fault() callback must return VM_FAULT_OOM. - * - * The page_fault() callback can also try to migrate back multiple pages in one - * chunk, as an optimization. It must, however, prioritize the faulting address - * over all the others. - */ - -struct hmm_devmem { - struct completion completion; - unsigned long pfn_first; - unsigned long pfn_last; - struct resource *resource; - struct device *device; - struct dev_pagemap pagemap; - const struct hmm_devmem_ops *ops; - struct percpu_ref ref; -}; - -/* - * To add (hotplug) device memory, HMM assumes that there is no real resource - * that reserves a range in the physical address space (this is intended to be - * use by unaddressable device memory). It will reserve a physical range big - * enough and allocate struct page for it. - * - * The device driver can wrap the hmm_devmem struct inside a private device - * driver struct. - */ -struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, - struct device *device, - unsigned long size); - /* * hmm_devmem_page_set_drvdata - set per-page driver data field * diff --git a/mm/hmm.c b/mm/hmm.c index c15283f9bbf0..5b2e9bb6063a 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1290,118 +1290,3 @@ long hmm_range_dma_unmap(struct hmm_range *range, } EXPORT_SYMBOL(hmm_range_dma_unmap); #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ - - -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) -static void hmm_devmem_ref_release(struct percpu_ref *ref) -{ - struct hmm_devmem *devmem; - - devmem = container_of(ref, struct hmm_devmem, ref); - complete(&devmem->completion); -} - -static void hmm_devmem_ref_exit(void *data) -{ - struct percpu_ref *ref = data; - struct hmm_devmem *devmem; - - devmem = container_of(ref, struct hmm_devmem, ref); - wait_for_completion(&devmem->completion); - percpu_ref_exit(ref); -} - -static void hmm_devmem_ref_kill(struct dev_pagemap *pgmap) -{ - percpu_ref_kill(pgmap->ref); -} - -static vm_fault_t hmm_devmem_migrate(struct vm_fault *vmf) -{ - struct hmm_devmem *devmem - container_of(vmf->page->pgmap, struct hmm_devmem, pagemap); - - return devmem->ops->fault(devmem, vmf->vma, vmf->address, vmf->page, - vmf->flags, vmf->pmd); -} - -static void hmm_devmem_free(struct page *page) -{ - struct hmm_devmem *devmem - container_of(page->pgmap, struct hmm_devmem, pagemap); - - devmem->ops->free(devmem, page); -} - -static const struct dev_pagemap_ops hmm_pagemap_ops = { - .page_free = hmm_devmem_free, - .kill = hmm_devmem_ref_kill, - .migrate = hmm_devmem_migrate, -}; - -/* - * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory - * - * @ops: memory event device driver callback (see struct hmm_devmem_ops) - * @device: device struct to bind the resource too - * @size: size in bytes of the device memory to add - * Return: pointer to new hmm_devmem struct ERR_PTR otherwise - * - * This function first finds an empty range of physical address big enough to - * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which - * in turn allocates struct pages. It does not do anything beyond that; all - * events affecting the memory will go through the various callbacks provided - * by hmm_devmem_ops struct. - * - * Device driver should call this function during device initialization and - * is then responsible of memory management. HMM only provides helpers. - */ -struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, - struct device *device, - unsigned long size) -{ - struct hmm_devmem *devmem; - void *result; - int ret; - - devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL); - if (!devmem) - return ERR_PTR(-ENOMEM); - - init_completion(&devmem->completion); - devmem->pfn_first = -1UL; - devmem->pfn_last = -1UL; - devmem->resource = NULL; - devmem->device = device; - devmem->ops = ops; - - ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, - 0, GFP_KERNEL); - if (ret) - return ERR_PTR(ret); - - ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit, &devmem->ref); - if (ret) - return ERR_PTR(ret); - - devmem->resource = devm_request_free_mem_region(device, &iomem_resource, - size); - if (IS_ERR(devmem->resource)) - return ERR_CAST(devmem->resource); - devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; - devmem->pfn_last = devmem->pfn_first + - (resource_size(devmem->resource) >> PAGE_SHIFT); - - devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; - devmem->pagemap.res = *devmem->resource; - devmem->pagemap.ops = &hmm_pagemap_ops; - devmem->pagemap.altmap_valid = false; - devmem->pagemap.ref = &devmem->ref; - - result = devm_memremap_pages(devmem->device, &devmem->pagemap); - if (IS_ERR(result)) - return result; - return devmem; -} -EXPORT_SYMBOL_GPL(hmm_devmem_add); -#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ -- 2.20.1
Christoph Hellwig
2019-Jun-13 09:43 UTC
[Nouveau] [PATCH 18/22] mm: mark DEVICE_PUBLIC as broken
The code hasn't been used since it was added to the tree, and doesn't appear to actually be usable. Mark it as BROKEN until either a user comes along or we finally give up on it. Signed-off-by: Christoph Hellwig <hch at lst.de> --- mm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/Kconfig b/mm/Kconfig index 0d2ba7e1f43e..406fa45e9ecc 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -721,6 +721,7 @@ config DEVICE_PRIVATE config DEVICE_PUBLIC bool "Addressable device memory (like GPU memory)" depends on ARCH_HAS_HMM + depends on BROKEN select HMM select DEV_PAGEMAP_OPS -- 2.20.1
Christoph Hellwig
2019-Jun-13 09:43 UTC
[Nouveau] [PATCH 19/22] mm: simplify ZONE_DEVICE page private data
Remove the clumsy hmm_devmem_page_{get,set}_drvdata helpers, and instead just access the page directly. Also make the page data a void pointer, and thus much easier to use. Signed-off-by: Christoph Hellwig <hch at lst.de> --- drivers/gpu/drm/nouveau/nouveau_dmem.c | 18 +++++++---------- include/linux/hmm.h | 27 -------------------------- include/linux/mm_types.h | 2 +- mm/page_alloc.c | 8 ++++---- 4 files changed, 12 insertions(+), 43 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 9e32bc8ecbc7..27aa4e72abe9 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -104,11 +104,8 @@ struct nouveau_migrate { static void nouveau_dmem_page_free(struct page *page) { - struct nouveau_dmem_chunk *chunk; - unsigned long idx; - - chunk = (void *)hmm_devmem_page_get_drvdata(page); - idx = page_to_pfn(page) - chunk->pfn_first; + struct nouveau_dmem_chunk *chunk = page->zone_device_data; + unsigned long idx = page_to_pfn(page) - chunk->pfn_first; /* * FIXME: @@ -200,7 +197,7 @@ nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct *vma, dst_addr = fault->dma[fault->npages++]; - chunk = (void *)hmm_devmem_page_get_drvdata(spage); + chunk = spage->zone_device_data; src_addr = page_to_pfn(spage) - chunk->pfn_first; src_addr = (src_addr << PAGE_SHIFT) + chunk->bo->bo.offset; @@ -633,9 +630,8 @@ nouveau_dmem_init(struct nouveau_drm *drm) list_add_tail(&chunk->list, &drm->dmem->chunk_empty); page = pfn_to_page(chunk->pfn_first); - for (j = 0; j < DMEM_CHUNK_NPAGES; ++j, ++page) { - hmm_devmem_page_set_drvdata(page, (long)chunk); - } + for (j = 0; j < DMEM_CHUNK_NPAGES; ++j, ++page) + page->zone_device_data = chunk; } NV_INFO(drm, "DMEM: registered %ldMB of device memory\n", size >> 20); @@ -698,7 +694,7 @@ nouveau_dmem_migrate_alloc_and_copy(struct vm_area_struct *vma, if (!dpage || dst_pfns[i] == MIGRATE_PFN_ERROR) continue; - chunk = (void *)hmm_devmem_page_get_drvdata(dpage); + chunk = dpage->zone_device_data; dst_addr = page_to_pfn(dpage) - chunk->pfn_first; dst_addr = (dst_addr << PAGE_SHIFT) + chunk->bo->bo.offset; @@ -864,7 +860,7 @@ nouveau_dmem_convert_pfn(struct nouveau_drm *drm, continue; } - chunk = (void *)hmm_devmem_page_get_drvdata(page); + chunk = page->zone_device_data; addr = page_to_pfn(page) - chunk->pfn_first; addr = (addr + chunk->bo->bo.mem.start) << PAGE_SHIFT; diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 13152ab504ec..e095a8b55dfa 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -550,33 +550,6 @@ static inline void hmm_mm_init(struct mm_struct *mm) static inline void hmm_mm_init(struct mm_struct *mm) {} #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) -/* - * hmm_devmem_page_set_drvdata - set per-page driver data field - * - * @page: pointer to struct page - * @data: driver data value to set - * - * Because page can not be on lru we have an unsigned long that driver can use - * to store a per page field. This just a simple helper to do that. - */ -static inline void hmm_devmem_page_set_drvdata(struct page *page, - unsigned long data) -{ - page->hmm_data = data; -} - -/* - * hmm_devmem_page_get_drvdata - get per page driver data field - * - * @page: pointer to struct page - * Return: driver data value - */ -static inline unsigned long hmm_devmem_page_get_drvdata(const struct page *page) -{ - return page->hmm_data; -} -#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ #else /* IS_ENABLED(CONFIG_HMM) */ static inline void hmm_mm_destroy(struct mm_struct *mm) {} static inline void hmm_mm_init(struct mm_struct *mm) {} diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8ec38b11b361..f33a1289c101 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -158,7 +158,7 @@ struct page { struct { /* ZONE_DEVICE pages */ /** @pgmap: Points to the hosting device page map. */ struct dev_pagemap *pgmap; - unsigned long hmm_data; + void *zone_device_data; unsigned long _zd_pad_1; /* uses mapping */ }; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d66bc8abe0af..d069ee1f4c2e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5887,12 +5887,12 @@ void __ref memmap_init_zone_device(struct zone *zone, __SetPageReserved(page); /* - * ZONE_DEVICE pages union ->lru with a ->pgmap back - * pointer and hmm_data. It is a bug if a ZONE_DEVICE - * page is ever freed or placed on a driver-private list. + * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer + * and zone_device_data. It is a bug if a ZONE_DEVICE page is + * ever freed or placed on a driver-private list. */ page->pgmap = pgmap; - page->hmm_data = 0; + page->zone_device_data = 0; /* * Mark the block movable so that blocks are reserved for -- 2.20.1
Christoph Hellwig
2019-Jun-13 09:43 UTC
[Nouveau] [PATCH 20/22] mm: sort out the DEVICE_PRIVATE Kconfig mess
The ZONE_DEVICE support doesn't depend on anything HMM related, just on various bits of arch support as indicated by the architecture. Also don't select the option from nouveau as it isn't present in many setups, and depend on it instead. Signed-off-by: Christoph Hellwig <hch at lst.de> --- drivers/gpu/drm/nouveau/Kconfig | 2 +- mm/Kconfig | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/nouveau/Kconfig b/drivers/gpu/drm/nouveau/Kconfig index dba2613f7180..6303d203ab1d 100644 --- a/drivers/gpu/drm/nouveau/Kconfig +++ b/drivers/gpu/drm/nouveau/Kconfig @@ -85,10 +85,10 @@ config DRM_NOUVEAU_BACKLIGHT config DRM_NOUVEAU_SVM bool "(EXPERIMENTAL) Enable SVM (Shared Virtual Memory) support" depends on ARCH_HAS_HMM + depends on DEVICE_PRIVATE depends on DRM_NOUVEAU depends on STAGING select HMM_MIRROR - select DEVICE_PRIVATE default n help Say Y here if you want to enable experimental support for diff --git a/mm/Kconfig b/mm/Kconfig index 406fa45e9ecc..4dbd718c8cf4 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -677,13 +677,13 @@ config ARCH_HAS_HMM_MIRROR config ARCH_HAS_HMM bool - default y depends on (X86_64 || PPC64) depends on ZONE_DEVICE depends on MMU && 64BIT depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE depends on SPARSEMEM_VMEMMAP + default y config MIGRATE_VMA_HELPER bool @@ -709,8 +709,7 @@ config HMM_MIRROR config DEVICE_PRIVATE bool "Unaddressable device memory (GPU memory, ...)" - depends on ARCH_HAS_HMM - select HMM + depends on ZONE_DEVICE select DEV_PAGEMAP_OPS help -- 2.20.1
Christoph Hellwig
2019-Jun-13 09:43 UTC
[Nouveau] [PATCH 21/22] mm: remove the HMM config option
All the mm/hmm.c code is better keyed off HMM_MIRROR. Also let nouveau depend on it instead of the mix of a dummy dependency symbol plus the actually selected one. Drop various odd dependencies, as the code is pretty portable. Signed-off-by: Christoph Hellwig <hch at lst.de> --- drivers/gpu/drm/nouveau/Kconfig | 3 +-- include/linux/hmm.h | 14 +++----------- include/linux/mm_types.h | 2 +- mm/Kconfig | 27 +++------------------------ mm/Makefile | 2 +- mm/hmm.c | 2 -- 6 files changed, 9 insertions(+), 41 deletions(-) diff --git a/drivers/gpu/drm/nouveau/Kconfig b/drivers/gpu/drm/nouveau/Kconfig index 6303d203ab1d..66c839d8e9d1 100644 --- a/drivers/gpu/drm/nouveau/Kconfig +++ b/drivers/gpu/drm/nouveau/Kconfig @@ -84,11 +84,10 @@ config DRM_NOUVEAU_BACKLIGHT config DRM_NOUVEAU_SVM bool "(EXPERIMENTAL) Enable SVM (Shared Virtual Memory) support" - depends on ARCH_HAS_HMM depends on DEVICE_PRIVATE depends on DRM_NOUVEAU + depends on HMM_MIRROR depends on STAGING - select HMM_MIRROR default n help Say Y here if you want to enable experimental support for diff --git a/include/linux/hmm.h b/include/linux/hmm.h index e095a8b55dfa..64ea2fa00872 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -62,7 +62,7 @@ #include <linux/kconfig.h> #include <asm/pgtable.h> -#if IS_ENABLED(CONFIG_HMM) +#ifdef CONFIG_HMM_MIRROR #include <linux/device.h> #include <linux/migrate.h> @@ -324,9 +324,6 @@ static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range, return hmm_device_entry_from_pfn(range, pfn); } - - -#if IS_ENABLED(CONFIG_HMM_MIRROR) /* * Mirroring: how to synchronize device page table with CPU page table. * @@ -546,13 +543,8 @@ static inline void hmm_mm_init(struct mm_struct *mm) { mm->hmm = NULL; } -#else /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -static inline void hmm_mm_init(struct mm_struct *mm) {} -#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ - -#else /* IS_ENABLED(CONFIG_HMM) */ -static inline void hmm_mm_destroy(struct mm_struct *mm) {} +#else /* CONFIG_HMM_MIRROR */ static inline void hmm_mm_init(struct mm_struct *mm) {} -#endif /* IS_ENABLED(CONFIG_HMM) */ +#endif /* CONFIG_HMM_MIRROR */ #endif /* LINUX_HMM_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f33a1289c101..8d37182f8dbe 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -501,7 +501,7 @@ struct mm_struct { #endif struct work_struct async_put_work; -#if IS_ENABLED(CONFIG_HMM) +#ifdef CONFIG_HMM_MIRROR /* HMM needs to track a few things per mm */ struct hmm *hmm; #endif diff --git a/mm/Kconfig b/mm/Kconfig index 4dbd718c8cf4..73676cb4693f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -669,37 +669,17 @@ config ZONE_DEVICE If FS_DAX is enabled, then say Y. -config ARCH_HAS_HMM_MIRROR - bool - default y - depends on (X86_64 || PPC64) - depends on MMU && 64BIT - -config ARCH_HAS_HMM - bool - depends on (X86_64 || PPC64) - depends on ZONE_DEVICE - depends on MMU && 64BIT - depends on MEMORY_HOTPLUG - depends on MEMORY_HOTREMOVE - depends on SPARSEMEM_VMEMMAP - default y - config MIGRATE_VMA_HELPER bool config DEV_PAGEMAP_OPS bool -config HMM - bool - select MMU_NOTIFIER - select MIGRATE_VMA_HELPER - config HMM_MIRROR bool "HMM mirror CPU page table into a device page table" - depends on ARCH_HAS_HMM - select HMM + depends on MMU + select MMU_NOTIFIER + select MIGRATE_VMA_HELPER help Select HMM_MIRROR if you want to mirror range of the CPU page table of a process into a device page table. Here, mirror means "keep synchronized". @@ -721,7 +701,6 @@ config DEVICE_PUBLIC bool "Addressable device memory (like GPU memory)" depends on ARCH_HAS_HMM depends on BROKEN - select HMM select DEV_PAGEMAP_OPS help diff --git a/mm/Makefile b/mm/Makefile index ac5e5ba78874..91c99040065c 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -102,5 +102,5 @@ obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o -obj-$(CONFIG_HMM) += hmm.o +obj-$(CONFIG_HMM_MIRROR) += hmm.o obj-$(CONFIG_MEMFD_CREATE) += memfd.o diff --git a/mm/hmm.c b/mm/hmm.c index 5b2e9bb6063a..8d50c482469c 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -26,7 +26,6 @@ #include <linux/mmu_notifier.h> #include <linux/memory_hotplug.h> -#if IS_ENABLED(CONFIG_HMM_MIRROR) static const struct mmu_notifier_ops hmm_mmu_notifier_ops; /** @@ -1289,4 +1288,3 @@ long hmm_range_dma_unmap(struct hmm_range *range, return cpages; } EXPORT_SYMBOL(hmm_range_dma_unmap); -#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -- 2.20.1
Christoph Hellwig
2019-Jun-13 09:43 UTC
[Nouveau] [PATCH 22/22] mm: don't select MIGRATE_VMA_HELPER from HMM_MIRROR
The migrate_vma helper is only used by noveau to migrate device private pages around. Other HMM_MIRROR users like amdgpu or infiniband don't need it. Signed-off-by: Christoph Hellwig <hch at lst.de> --- drivers/gpu/drm/nouveau/Kconfig | 1 + mm/Kconfig | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/Kconfig b/drivers/gpu/drm/nouveau/Kconfig index 66c839d8e9d1..96b9814e6d06 100644 --- a/drivers/gpu/drm/nouveau/Kconfig +++ b/drivers/gpu/drm/nouveau/Kconfig @@ -88,6 +88,7 @@ config DRM_NOUVEAU_SVM depends on DRM_NOUVEAU depends on HMM_MIRROR depends on STAGING + select MIGRATE_VMA_HELPER default n help Say Y here if you want to enable experimental support for diff --git a/mm/Kconfig b/mm/Kconfig index 73676cb4693f..eca88679b624 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -679,7 +679,6 @@ config HMM_MIRROR bool "HMM mirror CPU page table into a device page table" depends on MMU select MMU_NOTIFIER - select MIGRATE_VMA_HELPER help Select HMM_MIRROR if you want to mirror range of the CPU page table of a process into a device page table. Here, mirror means "keep synchronized". -- 2.20.1
On Thu, Jun 13, 2019 at 11:43:03AM +0200, Christoph Hellwig wrote:> Hi Dan, Jérôme and Jason, > > below is a series that cleans up the dev_pagemap interface so that > it is more easily usable, which removes the need to wrap it in hmm > and thus allowing to kill a lot of codeDo you want some of this to run through hmm.git? I see many patches that don't seem to have inter-dependencies.. Thanks, Jason
On Thu, Jun 13, 2019 at 2:43 AM Christoph Hellwig <hch at lst.de> wrote:> > Hi Dan, Jérôme and Jason, > > below is a series that cleans up the dev_pagemap interface so that > it is more easily usable, which removes the need to wrap it in hmm > and thus allowing to kill a lot of code > > Diffstat: > > 22 files changed, 245 insertions(+), 802 deletions(-)Hooray!> Git tree: > > git://git.infradead.org/users/hch/misc.git hmm-devmem-cleanupI just realized this collides with the dev_pagemap release rework in Andrew's tree (commit ids below are from next.git and are not stable) 4422ee8476f0 mm/devm_memremap_pages: fix final page put race 771f0714d0dc PCI/P2PDMA: track pgmap references per resource, not globally af37085de906 lib/genalloc: introduce chunk owners e0047ff8aa77 PCI/P2PDMA: fix the gen_pool_add_virt() failure path 0315d47d6ae9 mm/devm_memremap_pages: introduce devm_memunmap_pages 216475c7eaa8 drivers/base/devres: introduce devm_release_action() CONFLICT (content): Merge conflict in tools/testing/nvdimm/test/iomap.c CONFLICT (content): Merge conflict in mm/hmm.c CONFLICT (content): Merge conflict in kernel/memremap.c CONFLICT (content): Merge conflict in include/linux/memremap.h CONFLICT (content): Merge conflict in drivers/pci/p2pdma.c CONFLICT (content): Merge conflict in drivers/nvdimm/pmem.c CONFLICT (content): Merge conflict in drivers/dax/device.c CONFLICT (content): Merge conflict in drivers/dax/dax-private.h Perhaps we should pull those out and resend them through hmm.git? It also turns out the nvdimm unit tests crash with this signature on that branch where base v5.2-rc3 passes: BUG: kernel NULL pointer dereference, address: 0000000000000008 [..] CPU: 15 PID: 1414 Comm: lt-libndctl Tainted: G OE 5.2.0-rc3+ #3399 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.0.0 02/06/2015 RIP: 0010:percpu_ref_kill_and_confirm+0x1e/0x180 [..] Call Trace: release_nodes+0x234/0x280 device_release_driver_internal+0xe8/0x1b0 bus_remove_device+0xf2/0x160 device_del+0x166/0x370 unregister_dev_dax+0x23/0x50 release_nodes+0x234/0x280 device_release_driver_internal+0xe8/0x1b0 unbind_store+0x94/0x120 kernfs_fop_write+0xf0/0x1a0 vfs_write+0xb7/0x1b0 ksys_write+0x5c/0xd0 do_syscall_64+0x60/0x240 The crash bisects to: d8cc8bbe108c device-dax: use the dev_pagemap internal refcount
Jason Gunthorpe
2019-Jun-13 18:30 UTC
[Nouveau] [PATCH 01/22] mm: remove the unused ARCH_HAS_HMM_DEVICE Kconfig option
On Thu, Jun 13, 2019 at 11:43:04AM +0200, Christoph Hellwig wrote:> Signed-off-by: Christoph Hellwig <hch at lst.de> > --- > mm/Kconfig | 10 ---------- > 1 file changed, 10 deletions(-)So long as we are willing to run a hmm.git we can merge only complete features going forward. Thus we don't need the crazy process described in the commit message that (deliberately) introduced this unused kconfig. I also tried to find something in-flight for 5.3 that would need this and found nothing Reviewed-by: Jason Gunthorpe <jgg at mellanox.com> Jason
Jason Gunthorpe
2019-Jun-13 18:46 UTC
[Nouveau] [PATCH 02/22] mm: remove the struct hmm_device infrastructure
On Thu, Jun 13, 2019 at 11:43:05AM +0200, Christoph Hellwig wrote:> This code is a trivial wrapper around device model helpers, which > should have been integrated into the driver device model usage from > the start. Assuming it actually had users, which it never had since > the code was added more than 1 1/2 years ago. > > Signed-off-by: Christoph Hellwig <hch at lst.de> > --- > include/linux/hmm.h | 20 ------------ > mm/hmm.c | 80 --------------------------------------------- > 2 files changed, 100 deletions(-)I haven't looked in detail at this device memory stuff.. But I did check a bit through the mailing list archives for some clue what this was supposed to be for (wow, this is from 2016!) The commit that added this says: This introduce a dummy HMM device class so device driver can use it to create hmm_device for the sole purpose of registering device memory. Which I just can't understand at all. If we need a 'struct device' for some 'device memory' purpose then it probably ought to be the 'struct pci_device' holding the BAR, not a fake device. I also can't comprehend why a supposed fake device would need a chardev, with a stanadrd 'hmm_deviceX' name, without also defining a core kernel ABI for that char dev.. If this comes back it needs a proper explanation and review, with a user. Reviewed-by: Jason Gunthorpe <jgg at mellanox.com> Jason
Jason Gunthorpe
2019-Jun-13 18:52 UTC
[Nouveau] [PATCH 03/22] mm: remove hmm_devmem_add_resource
On Thu, Jun 13, 2019 at 11:43:06AM +0200, Christoph Hellwig wrote:> This function has never been used since it was first added to the kernel > more than a year and a half ago, and if we ever grow a consumer of the > MEMORY_DEVICE_PUBLIC infrastructure it can easily use devm_memremap_pages > directly now that we've simplified the API for it.nit: Have we simplified the interface for devm_memremap_pages() at this point, or are you talking about later patches in this series. I checked this and all the called functions are exported symbols, so there is no blocker for a future driver to call devm_memremap_pages(), maybe even with all this boiler plate, in future. If we eventually get many users that need some simplified registration then we should add a devm_memremap_pages_simplified() interface and factor out that code when we can see the pattern. Reviewed-by: Jason Gunthorpe <jgg at mellanox.com> Jason
Jason Gunthorpe
2019-Jun-13 19:05 UTC
[Nouveau] [PATCH 04/22] mm: don't clear ->mapping in hmm_devmem_free
On Thu, Jun 13, 2019 at 11:43:07AM +0200, Christoph Hellwig wrote:> ->mapping isn't even used by HMM users, and the field at the same offset > in the zone_device part of the union is declared as pad. (Which btw is > rather confusing, as DAX uses ->pgmap and ->mapping from two different > sides of the union, but DAX doesn't use hmm_devmem_free). > > Signed-off-by: Christoph Hellwig <hch at lst.de> > --- > mm/hmm.c | 2 -- > 1 file changed, 2 deletions(-)Hurm, is hmm following this comment from mm_types.h? * If you allocate the page using alloc_pages(), you can use some of the * space in struct page for your own purposes. The five words in the main * union are available, except for bit 0 of the first word which must be * kept clear. Many users use this word to store a pointer to an object * which is guaranteed to be aligned. If you use the same storage as * page->mapping, you must restore it to NULL before freeing the page. Maybe the assumption was that a driver is using ->mapping ? However, nouveau is the only driver that uses this path, and it never touches page->mapping either (nor in -next). It looks like if a driver were to start using mapping then the driver should be responsible to set it back to NULL before being done with the page. Reviewed-by: Jason Gunthorpe <jgg at mellanox.com> Jason
Jason Gunthorpe
2019-Jun-13 19:16 UTC
[Nouveau] [PATCH 06/22] mm: factor out a devm_request_free_mem_region helper
On Thu, Jun 13, 2019 at 11:43:09AM +0200, Christoph Hellwig wrote:> Keep the physical address allocation that hmm_add_device does with the > rest of the resource code, and allow future reuse of it without the hmm > wrapper. > > Signed-off-by: Christoph Hellwig <hch at lst.de> > include/linux/ioport.h | 2 ++ > kernel/resource.c | 39 +++++++++++++++++++++++++++++++++++++++ > mm/hmm.c | 33 ++++----------------------------- > 3 files changed, 45 insertions(+), 29 deletions(-) > > diff --git a/include/linux/ioport.h b/include/linux/ioport.h > index da0ebaec25f0..76a33ae3bf6c 100644 > +++ b/include/linux/ioport.h > @@ -286,6 +286,8 @@ static inline bool resource_overlaps(struct resource *r1, struct resource *r2) > return (r1->start <= r2->end && r1->end >= r2->start); > } > > +struct resource *devm_request_free_mem_region(struct device *dev, > + struct resource *base, unsigned long size); > > #endif /* __ASSEMBLY__ */ > #endif /* _LINUX_IOPORT_H */ > diff --git a/kernel/resource.c b/kernel/resource.c > index 158f04ec1d4f..99c58134ed1c 100644 > +++ b/kernel/resource.c > @@ -1628,6 +1628,45 @@ void resource_list_free(struct list_head *head) > } > EXPORT_SYMBOL(resource_list_free); > > +#ifdef CONFIG_DEVICE_PRIVATE > +/** > + * devm_request_free_mem_region - find free region for device private memory > + * > + * @dev: device struct to bind the resource too > + * @size: size in bytes of the device memory to add > + * @base: resource tree to look in > + * > + * This function tries to find an empty range of physical address big enough to > + * contain the new resource, so that it can later be hotpluged as ZONE_DEVICE > + * memory, which in turn allocates struct pages. > + */ > +struct resource *devm_request_free_mem_region(struct device *dev, > + struct resource *base, unsigned long size) > +{ > + resource_size_t end, addr; > + struct resource *res; > + > + size = ALIGN(size, 1UL << PA_SECTION_SHIFT); > + end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1);Even fixed it to use min_t> + addr = end - size + 1UL; > + for (; addr > size && addr >= base->start; addr -= size) { > + if (region_intersects(addr, size, 0, IORES_DESC_NONE) !> + REGION_DISJOINT) > + continue;The FIXME about the algorithm cost seems justified though, yikes.> + > + res = devm_request_mem_region(dev, addr, size, dev_name(dev)); > + if (!res) > + return ERR_PTR(-ENOMEM); > + res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;I wonder if IORES_DESC_DEVICE_PRIVATE_MEMORY should be a function argument? Not really any substantive remark, so Reviewed-by: Jason Gunthorpe <jgg at mellanox.com> Jason
Jason Gunthorpe
2019-Jun-13 19:18 UTC
[Nouveau] [PATCH 07/22] memremap: move dev_pagemap callbacks into a separate structure
On Thu, Jun 13, 2019 at 11:43:10AM +0200, Christoph Hellwig wrote:> The dev_pagemap is a growing too many callbacks. Move them into a > separate ops structure so that they are not duplicated for multiple > instances, and an attacker can't easily overwrite them.Reviewed-by: Jason Gunthorpe <jgg at mellanox.com> Jason
Jason Gunthorpe
2019-Jun-13 19:26 UTC
[Nouveau] [PATCH 08/22] memremap: pass a struct dev_pagemap to ->kill
On Thu, Jun 13, 2019 at 11:43:11AM +0200, Christoph Hellwig wrote:> Passing the actual typed structure leads to more understandable code > vs the actual references. > > Signed-off-by: Christoph Hellwig <hch at lst.de> > drivers/dax/device.c | 7 +++---- > drivers/nvdimm/pmem.c | 6 +++--- > drivers/pci/p2pdma.c | 6 +++--- > include/linux/memremap.h | 2 +- > kernel/memremap.c | 4 ++-- > mm/hmm.c | 4 ++-- > tools/testing/nvdimm/test/iomap.c | 6 ++---- > 7 files changed, 16 insertions(+), 19 deletions(-) > > diff --git a/drivers/dax/device.c b/drivers/dax/device.c > index 4adab774dade..e23fa1bd8c97 100644 > +++ b/drivers/dax/device.c > @@ -37,13 +37,12 @@ static void dev_dax_percpu_exit(void *data) > percpu_ref_exit(ref); > } > > -static void dev_dax_percpu_kill(struct percpu_ref *data) > +static void dev_dax_percpu_kill(struct dev_pagemap *pgmap) > {Looks like it was always like this, but I also can't see a reason to use the percpu as a handle for a dev_pagemap callback. Reviewed-by: Jason Gunthorpe <jgg at mellanox.com> Jason
Jason Gunthorpe
2019-Jun-13 19:34 UTC
[Nouveau] [PATCH 09/22] memremap: lift the devmap_enable manipulation into devm_memremap_pages
On Thu, Jun 13, 2019 at 11:43:12AM +0200, Christoph Hellwig wrote:> Just check if there is a ->page_free operation set and take care of the > static key enable, as well as the put using device managed resources. > diff --git a/mm/hmm.c b/mm/hmm.c > index c76a1b5defda..6dc769feb2e1 100644 > +++ b/mm/hmm.c > @@ -1378,8 +1378,6 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, > void *result; > int ret; > > - dev_pagemap_get_ops(); > -Where was the matching dev_pagemap_put_ops() for this hmm case? This is a bug fix too? The nouveau driver is the only one to actually call this hmm function and it does it as part of a probe function. Seems reasonable, however, in the unlikely event that it fails to init 'dmem' the driver will retain a dev_pagemap_get_ops until it unloads. This imbalance doesn't seem worth worrying about. Reviewed-by: Christoph Hellwig <hch at lst.de> Jason
Jason Gunthorpe
2019-Jun-13 19:37 UTC
[Nouveau] [PATCH 11/22] memremap: remove the data field in struct dev_pagemap
On Thu, Jun 13, 2019 at 11:43:14AM +0200, Christoph Hellwig wrote:> struct dev_pagemap is always embedded into a containing structure, so > there is no need to an additional private data field. > > Signed-off-by: Christoph Hellwig <hch at lst.de> > --- > drivers/nvdimm/pmem.c | 2 +- > include/linux/memremap.h | 3 +-- > kernel/memremap.c | 2 +- > mm/hmm.c | 9 +++++---- > 4 files changed, 8 insertions(+), 8 deletions(-)Reviewed-by: Jason Gunthorpe <jgg at mellanox.com> Jason
Jason Gunthorpe
2019-Jun-13 19:39 UTC
[Nouveau] [PATCH 14/22] nouveau: use alloc_page_vma directly
On Thu, Jun 13, 2019 at 11:43:17AM +0200, Christoph Hellwig wrote:> hmm_vma_alloc_locked_page is scheduled to go away, use the proper > mm function directly. > > Signed-off-by: Christoph Hellwig <hch at lst.de> > --- > drivers/gpu/drm/nouveau/nouveau_dmem.c | 3 ++- > 1 file changed, 2 insertions(+), 1 deletion(-)Reviewed-by: Jason Gunthorpe <jgg at mellanox.com> Jason
On Thu, Jun 13, 2019 at 11:43:20AM +0200, Christoph Hellwig wrote:> There isn't really much value add in the hmm_devmem_add wrapper. Just > factor out a little helper to find the resource, and otherwise let the > driver implement the dev_pagemap_ops directly.Was this commit message written when other patches were squashed in here? I think the helper this mentions was from an earlier patch> Signed-off-by: Christoph Hellwig <hch at lst.de> > --- > Documentation/vm/hmm.rst | 26 -------- > include/linux/hmm.h | 129 --------------------------------------- > mm/hmm.c | 115 ---------------------------------- > 3 files changed, 270 deletions(-)I looked for in-flight patches that might be using these APIs and found nothing. To be sent patches can use the new API with no loss in functionality... Reviewed-by: Jason Gunthorpe <jgg at mellanox.com> Jason
Jason Gunthorpe
2019-Jun-13 19:44 UTC
[Nouveau] [PATCH 18/22] mm: mark DEVICE_PUBLIC as broken
On Thu, Jun 13, 2019 at 11:43:21AM +0200, Christoph Hellwig wrote:> The code hasn't been used since it was added to the tree, and doesn't > appear to actually be usable. Mark it as BROKEN until either a user > comes along or we finally give up on it. > > Signed-off-by: Christoph Hellwig <hch at lst.de> > mm/Kconfig | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/mm/Kconfig b/mm/Kconfig > index 0d2ba7e1f43e..406fa45e9ecc 100644 > +++ b/mm/Kconfig > @@ -721,6 +721,7 @@ config DEVICE_PRIVATE > config DEVICE_PUBLIC > bool "Addressable device memory (like GPU memory)" > depends on ARCH_HAS_HMM > + depends on BROKEN > select HMM > select DEV_PAGEMAP_OPSThis seems a bit harsh, we do have another kconfig that selects this one today: config DRM_NOUVEAU_SVM bool "(EXPERIMENTAL) Enable SVM (Shared Virtual Memory) support" depends on ARCH_HAS_HMM depends on DRM_NOUVEAU depends on STAGING select HMM_MIRROR select DEVICE_PRIVATE default n help Say Y here if you want to enable experimental support for Shared Virtual Memory (SVM). Maybe it should be depends on STAGING not broken? or maybe nouveau_svm doesn't actually need DEVICE_PRIVATE? Jason
Jason Gunthorpe
2019-Jun-13 19:55 UTC
[Nouveau] [PATCH 20/22] mm: sort out the DEVICE_PRIVATE Kconfig mess
On Thu, Jun 13, 2019 at 11:43:23AM +0200, Christoph Hellwig wrote:> The ZONE_DEVICE support doesn't depend on anything HMM related, just on > various bits of arch support as indicated by the architecture. Also > don't select the option from nouveau as it isn't present in many setups, > and depend on it instead. > > Signed-off-by: Christoph Hellwig <hch at lst.de> > drivers/gpu/drm/nouveau/Kconfig | 2 +- > mm/Kconfig | 5 ++--- > 2 files changed, 3 insertions(+), 4 deletions(-) > > diff --git a/drivers/gpu/drm/nouveau/Kconfig b/drivers/gpu/drm/nouveau/Kconfig > index dba2613f7180..6303d203ab1d 100644 > +++ b/drivers/gpu/drm/nouveau/Kconfig > @@ -85,10 +85,10 @@ config DRM_NOUVEAU_BACKLIGHT > config DRM_NOUVEAU_SVM > bool "(EXPERIMENTAL) Enable SVM (Shared Virtual Memory) support" > depends on ARCH_HAS_HMM > + depends on DEVICE_PRIVATE > depends on DRM_NOUVEAU > depends on STAGING > select HMM_MIRROR > - select DEVICE_PRIVATE > default n > help > Say Y here if you want to enable experimental support forBen, I heard you might have a patch like this in your tree, but I don't think I could find your tree?? Do you have any nouveau/Kconfig patches that might conflict? Thanks Does this fix the randconfigs failures that have been reported?> diff --git a/mm/Kconfig b/mm/Kconfig > index 406fa45e9ecc..4dbd718c8cf4 100644 > +++ b/mm/Kconfig > @@ -677,13 +677,13 @@ config ARCH_HAS_HMM_MIRROR > > config ARCH_HAS_HMM > bool > - default y > depends on (X86_64 || PPC64) > depends on ZONE_DEVICE > depends on MMU && 64BIT > depends on MEMORY_HOTPLUG > depends on MEMORY_HOTREMOVE > depends on SPARSEMEM_VMEMMAP > + default yWhat is the reason we have this ARCH thing anyhow? Does hmm have arch dependencies someplace? Thanks Jason
Jason Gunthorpe
2019-Jun-13 20:01 UTC
[Nouveau] [PATCH 21/22] mm: remove the HMM config option
On Thu, Jun 13, 2019 at 11:43:24AM +0200, Christoph Hellwig wrote:> All the mm/hmm.c code is better keyed off HMM_MIRROR. Also let nouveau > depend on it instead of the mix of a dummy dependency symbol plus the > actually selected one. Drop various odd dependencies, as the code is > pretty portable.I don't really know, but I thought this needed the arch restriction for the same reason get_user_pages has various unique arch specific implementations (it does seem to have some open coded GUP like thing)? I was hoping we could do this after your common gup series? But sooner is better too. Jason
Jason Gunthorpe
2019-Jun-13 20:04 UTC
[Nouveau] [PATCH 22/22] mm: don't select MIGRATE_VMA_HELPER from HMM_MIRROR
On Thu, Jun 13, 2019 at 11:43:25AM +0200, Christoph Hellwig wrote:> The migrate_vma helper is only used by noveau to migrate device private > pages around. Other HMM_MIRROR users like amdgpu or infiniband don't > need it. > > Signed-off-by: Christoph Hellwig <hch at lst.de> > --- > drivers/gpu/drm/nouveau/Kconfig | 1 + > mm/Kconfig | 1 - > 2 files changed, 1 insertion(+), 1 deletion(-)Yes, the thing that calls migrate_vma() should be the thing that has the kconfig stuff. Reviewed-by: Jason Gunthorpe <jgg at mellanox.com> Jason
Logan Gunthorpe
2019-Jun-13 20:12 UTC
[Nouveau] [PATCH 08/22] memremap: pass a struct dev_pagemap to ->kill
On 2019-06-13 3:43 a.m., Christoph Hellwig wrote:> Passing the actual typed structure leads to more understandable code > vs the actual references.Ha, ok, I originally suggested this to Dan when he introduced the callback[1]. Reviewed-by: Logan Gunthorpe <logang at deltatee.com> Logan [1] https://lore.kernel.org/lkml/8f0cae82-130f-8a64-cfbd-fda5fd76bb79 at deltatee.com/T/#u
Logan Gunthorpe
2019-Jun-13 20:14 UTC
[Nouveau] [PATCH 07/22] memremap: move dev_pagemap callbacks into a separate structure
On 2019-06-13 3:43 a.m., Christoph Hellwig wrote:> The dev_pagemap is a growing too many callbacks. Move them into a > separate ops structure so that they are not duplicated for multiple > instances, and an attacker can't easily overwrite them. > > Signed-off-by: Christoph Hellwig <hch at lst.de> > --- > drivers/dax/device.c | 6 +++++- > drivers/nvdimm/pmem.c | 18 +++++++++++++----- > drivers/pci/p2pdma.c | 5 ++++- > include/linux/memremap.h | 29 +++++++++++++++-------------- > kernel/memremap.c | 12 ++++++------ > mm/hmm.c | 8 ++++++-- > tools/testing/nvdimm/test/iomap.c | 2 +- > 7 files changed, 50 insertions(+), 30 deletions(-)Looks good to me, Reviewed-by: Logan Gunthorpe <logang at deltatee.com> Logan
On 2019-06-13 12:27 p.m., Dan Williams wrote:> On Thu, Jun 13, 2019 at 2:43 AM Christoph Hellwig <hch at lst.de> wrote: >> >> Hi Dan, Jérôme and Jason, >> >> below is a series that cleans up the dev_pagemap interface so that >> it is more easily usable, which removes the need to wrap it in hmm >> and thus allowing to kill a lot of code >> >> Diffstat: >> >> 22 files changed, 245 insertions(+), 802 deletions(-) > > Hooray! > >> Git tree: >> >> git://git.infradead.org/users/hch/misc.git hmm-devmem-cleanup > > I just realized this collides with the dev_pagemap release rework in > Andrew's tree (commit ids below are from next.git and are not stable) > > 4422ee8476f0 mm/devm_memremap_pages: fix final page put race > 771f0714d0dc PCI/P2PDMA: track pgmap references per resource, not globally > af37085de906 lib/genalloc: introduce chunk owners > e0047ff8aa77 PCI/P2PDMA: fix the gen_pool_add_virt() failure path > 0315d47d6ae9 mm/devm_memremap_pages: introduce devm_memunmap_pages > 216475c7eaa8 drivers/base/devres: introduce devm_release_action() > > CONFLICT (content): Merge conflict in tools/testing/nvdimm/test/iomap.c > CONFLICT (content): Merge conflict in mm/hmm.c > CONFLICT (content): Merge conflict in kernel/memremap.c > CONFLICT (content): Merge conflict in include/linux/memremap.h > CONFLICT (content): Merge conflict in drivers/pci/p2pdma.c > CONFLICT (content): Merge conflict in drivers/nvdimm/pmem.c > CONFLICT (content): Merge conflict in drivers/dax/device.c > CONFLICT (content): Merge conflict in drivers/dax/dax-private.h > > Perhaps we should pull those out and resend them through hmm.git?Hmm, I've been waiting for those patches to get in for a little while now ;( Logan
On Thu, Jun 13, 2019 at 11:27:39AM -0700, Dan Williams wrote:> On Thu, Jun 13, 2019 at 2:43 AM Christoph Hellwig <hch at lst.de> wrote: > > > > Hi Dan, Jérôme and Jason, > > > > below is a series that cleans up the dev_pagemap interface so that > > it is more easily usable, which removes the need to wrap it in hmm > > and thus allowing to kill a lot of code > > > > Diffstat: > > > > 22 files changed, 245 insertions(+), 802 deletions(-) > > Hooray! > > > Git tree: > > > > git://git.infradead.org/users/hch/misc.git hmm-devmem-cleanup > > I just realized this collides with the dev_pagemap release rework in > Andrew's tree (commit ids below are from next.git and are not stable) > > 4422ee8476f0 mm/devm_memremap_pages: fix final page put race > 771f0714d0dc PCI/P2PDMA: track pgmap references per resource, not globally > af37085de906 lib/genalloc: introduce chunk owners > e0047ff8aa77 PCI/P2PDMA: fix the gen_pool_add_virt() failure path > 0315d47d6ae9 mm/devm_memremap_pages: introduce devm_memunmap_pages > 216475c7eaa8 drivers/base/devres: introduce devm_release_action() > > CONFLICT (content): Merge conflict in tools/testing/nvdimm/test/iomap.c > CONFLICT (content): Merge conflict in mm/hmm.c > CONFLICT (content): Merge conflict in kernel/memremap.c > CONFLICT (content): Merge conflict in include/linux/memremap.h > CONFLICT (content): Merge conflict in drivers/pci/p2pdma.c > CONFLICT (content): Merge conflict in drivers/nvdimm/pmem.c > CONFLICT (content): Merge conflict in drivers/dax/device.c > CONFLICT (content): Merge conflict in drivers/dax/dax-private.h > > Perhaps we should pull those out and resend them through hmm.git?It could be done - but how bad is the conflict resolution? I'd be more comfortable to take a PR from you to merge into hmm.git, rather than raw patches, then apply CH's series on top. I think. That way if something goes wrong you can send your PR to Linus directly.> It also turns out the nvdimm unit tests crash with this signature on > that branch where base v5.2-rc3 passes: > > BUG: kernel NULL pointer dereference, address: 0000000000000008 > [..] > CPU: 15 PID: 1414 Comm: lt-libndctl Tainted: G OE > 5.2.0-rc3+ #3399 > Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.0.0 02/06/2015 > RIP: 0010:percpu_ref_kill_and_confirm+0x1e/0x180 > [..] > Call Trace: > release_nodes+0x234/0x280 > device_release_driver_internal+0xe8/0x1b0 > bus_remove_device+0xf2/0x160 > device_del+0x166/0x370 > unregister_dev_dax+0x23/0x50 > release_nodes+0x234/0x280 > device_release_driver_internal+0xe8/0x1b0 > unbind_store+0x94/0x120 > kernfs_fop_write+0xf0/0x1a0 > vfs_write+0xb7/0x1b0 > ksys_write+0x5c/0xd0 > do_syscall_64+0x60/0x240Too bad the trace didn't say which devm cleanup triggered it.. Did dev_pagemap_percpu_exit get called with a NULL pgmap->ref ? Jason
John Hubbard
2019-Jun-13 23:06 UTC
[Nouveau] [PATCH 02/22] mm: remove the struct hmm_device infrastructure
On 6/13/19 2:43 AM, Christoph Hellwig wrote:> This code is a trivial wrapper around device model helpers, which > should have been integrated into the driver device model usage from > the start. Assuming it actually had users, which it never had since > the code was added more than 1 1/2 years ago. > > Signed-off-by: Christoph Hellwig <hch at lst.de> > --- > include/linux/hmm.h | 20 ------------ > mm/hmm.c | 80 --------------------------------------------- > 2 files changed, 100 deletions(-) >Yes. This code is definitely unnecessary, and it's a good housecleaning here. (As to the history: I know that there was some early "HMM dummy device" testing when the HMM code was much younger, but such testing has long since been superseded by more elaborate testing with real drivers.) Reviewed-by: John Hubbard <jhubbard at nvidia.com> thanks, -- John Hubbard NVIDIA> diff --git a/include/linux/hmm.h b/include/linux/hmm.h > index 0fa8ea34ccef..4867b9da1b6c 100644 > --- a/include/linux/hmm.h > +++ b/include/linux/hmm.h > @@ -717,26 +717,6 @@ static inline unsigned long hmm_devmem_page_get_drvdata(const struct page *page) > { > return page->hmm_data; > } > - > - > -/* > - * struct hmm_device - fake device to hang device memory onto > - * > - * @device: device struct > - * @minor: device minor number > - */ > -struct hmm_device { > - struct device device; > - unsigned int minor; > -}; > - > -/* > - * A device driver that wants to handle multiple devices memory through a > - * single fake device can use hmm_device to do so. This is purely a helper and > - * it is not strictly needed, in order to make use of any HMM functionality. > - */ > -struct hmm_device *hmm_device_new(void *drvdata); > -void hmm_device_put(struct hmm_device *hmm_device); > #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ > #else /* IS_ENABLED(CONFIG_HMM) */ > static inline void hmm_mm_destroy(struct mm_struct *mm) {} > diff --git a/mm/hmm.c b/mm/hmm.c > index 886b18695b97..ff2598eb7377 100644 > --- a/mm/hmm.c > +++ b/mm/hmm.c > @@ -1499,84 +1499,4 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, > return devmem; > } > EXPORT_SYMBOL_GPL(hmm_devmem_add_resource); > - > -/* > - * A device driver that wants to handle multiple devices memory through a > - * single fake device can use hmm_device to do so. This is purely a helper > - * and it is not needed to make use of any HMM functionality. > - */ > -#define HMM_DEVICE_MAX 256 > - > -static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX); > -static DEFINE_SPINLOCK(hmm_device_lock); > -static struct class *hmm_device_class; > -static dev_t hmm_device_devt; > - > -static void hmm_device_release(struct device *device) > -{ > - struct hmm_device *hmm_device; > - > - hmm_device = container_of(device, struct hmm_device, device); > - spin_lock(&hmm_device_lock); > - clear_bit(hmm_device->minor, hmm_device_mask); > - spin_unlock(&hmm_device_lock); > - > - kfree(hmm_device); > -} > - > -struct hmm_device *hmm_device_new(void *drvdata) > -{ > - struct hmm_device *hmm_device; > - > - hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL); > - if (!hmm_device) > - return ERR_PTR(-ENOMEM); > - > - spin_lock(&hmm_device_lock); > - hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX); > - if (hmm_device->minor >= HMM_DEVICE_MAX) { > - spin_unlock(&hmm_device_lock); > - kfree(hmm_device); > - return ERR_PTR(-EBUSY); > - } > - set_bit(hmm_device->minor, hmm_device_mask); > - spin_unlock(&hmm_device_lock); > - > - dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor); > - hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt), > - hmm_device->minor); > - hmm_device->device.release = hmm_device_release; > - dev_set_drvdata(&hmm_device->device, drvdata); > - hmm_device->device.class = hmm_device_class; > - device_initialize(&hmm_device->device); > - > - return hmm_device; > -} > -EXPORT_SYMBOL(hmm_device_new); > - > -void hmm_device_put(struct hmm_device *hmm_device) > -{ > - put_device(&hmm_device->device); > -} > -EXPORT_SYMBOL(hmm_device_put); > - > -static int __init hmm_init(void) > -{ > - int ret; > - > - ret = alloc_chrdev_region(&hmm_device_devt, 0, > - HMM_DEVICE_MAX, > - "hmm_device"); > - if (ret) > - return ret; > - > - hmm_device_class = class_create(THIS_MODULE, "hmm_device"); > - if (IS_ERR(hmm_device_class)) { > - unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX); > - return PTR_ERR(hmm_device_class); > - } > - return 0; > -} > - > -device_initcall(hmm_init); > #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ >
Ralph Campbell
2019-Jun-13 23:42 UTC
[Nouveau] [PATCH 10/22] memremap: add a migrate callback to struct dev_pagemap_ops
On 6/13/19 2:43 AM, Christoph Hellwig wrote:> This replaces the hacky ->fault callback, which is currently directly > called from common code through a hmm specific data structure as an > exercise in layering violations. > > Signed-off-by: Christoph Hellwig <hch at lst.de> > --- > include/linux/hmm.h | 6 ------ > include/linux/memremap.h | 6 ++++++ > include/linux/swapops.h | 15 --------------- > kernel/memremap.c | 31 ------------------------------- > mm/hmm.c | 13 +++++-------- > mm/memory.c | 9 ++------- > 6 files changed, 13 insertions(+), 67 deletions(-) > > diff --git a/include/linux/hmm.h b/include/linux/hmm.h > index 5761a39221a6..3c9a59dbfdb8 100644 > --- a/include/linux/hmm.h > +++ b/include/linux/hmm.h > @@ -658,11 +658,6 @@ struct hmm_devmem_ops { > * chunk, as an optimization. It must, however, prioritize the faulting address > * over all the others. > */ > -typedef vm_fault_t (*dev_page_fault_t)(struct vm_area_struct *vma, > - unsigned long addr, > - const struct page *page, > - unsigned int flags, > - pmd_t *pmdp); > > struct hmm_devmem { > struct completion completion; > @@ -673,7 +668,6 @@ struct hmm_devmem { > struct dev_pagemap pagemap; > const struct hmm_devmem_ops *ops; > struct percpu_ref ref; > - dev_page_fault_t page_fault; > }; > > /* > diff --git a/include/linux/memremap.h b/include/linux/memremap.h > index 96a3a6d564ad..03a4099be701 100644 > --- a/include/linux/memremap.h > +++ b/include/linux/memremap.h > @@ -75,6 +75,12 @@ struct dev_pagemap_ops { > * Transition the percpu_ref in struct dev_pagemap to the dead state. > */ > void (*kill)(struct dev_pagemap *pgmap); > + > + /* > + * Used for private (un-addressable) device memory only. Must migrate > + * the page back to a CPU accessible page. > + */ > + vm_fault_t (*migrate)(struct vm_fault *vmf); > }; > > /** > diff --git a/include/linux/swapops.h b/include/linux/swapops.h > index 4d961668e5fc..15bdb6fe71e5 100644 > --- a/include/linux/swapops.h > +++ b/include/linux/swapops.h > @@ -129,12 +129,6 @@ static inline struct page *device_private_entry_to_page(swp_entry_t entry) > { > return pfn_to_page(swp_offset(entry)); > } > - > -vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, > - unsigned long addr, > - swp_entry_t entry, > - unsigned int flags, > - pmd_t *pmdp); > #else /* CONFIG_DEVICE_PRIVATE */ > static inline swp_entry_t make_device_private_entry(struct page *page, bool write) > { > @@ -164,15 +158,6 @@ static inline struct page *device_private_entry_to_page(swp_entry_t entry) > { > return NULL; > } > - > -static inline vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, > - unsigned long addr, > - swp_entry_t entry, > - unsigned int flags, > - pmd_t *pmdp) > -{ > - return VM_FAULT_SIGBUS; > -} > #endif /* CONFIG_DEVICE_PRIVATE */ > > #ifdef CONFIG_MIGRATION > diff --git a/kernel/memremap.c b/kernel/memremap.c > index 6a3183cac764..7167e717647d 100644 > --- a/kernel/memremap.c > +++ b/kernel/memremap.c > @@ -11,7 +11,6 @@ > #include <linux/types.h> > #include <linux/wait_bit.h> > #include <linux/xarray.h> > -#include <linux/hmm.h> > > static DEFINE_XARRAY(pgmap_array); > #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) > @@ -48,36 +47,6 @@ static inline int dev_pagemap_enable(struct device *dev) > } > #endif /* CONFIG_DEV_PAGEMAP_OPS */ > > -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) > -vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, > - unsigned long addr, > - swp_entry_t entry, > - unsigned int flags, > - pmd_t *pmdp) > -{ > - struct page *page = device_private_entry_to_page(entry); > - struct hmm_devmem *devmem; > - > - devmem = container_of(page->pgmap, typeof(*devmem), pagemap); > - > - /* > - * The page_fault() callback must migrate page back to system memory > - * so that CPU can access it. This might fail for various reasons > - * (device issue, device was unsafely unplugged, ...). When such > - * error conditions happen, the callback must return VM_FAULT_SIGBUS. > - * > - * Note that because memory cgroup charges are accounted to the device > - * memory, this should never fail because of memory restrictions (but > - * allocation of regular system page might still fail because we are > - * out of memory). > - * > - * There is a more in-depth description of what that callback can and > - * cannot do, in include/linux/memremap.h > - */ > - return devmem->page_fault(vma, addr, page, flags, pmdp); > -} > -#endif /* CONFIG_DEVICE_PRIVATE */ > - > static void pgmap_array_delete(struct resource *res) > { > xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end), > diff --git a/mm/hmm.c b/mm/hmm.c > index 6dc769feb2e1..aab799677c7d 100644 > --- a/mm/hmm.c > +++ b/mm/hmm.c > @@ -1330,15 +1330,12 @@ static void hmm_devmem_ref_kill(struct dev_pagemap *pgmap) > percpu_ref_kill(pgmap->ref); > } > > -static vm_fault_t hmm_devmem_fault(struct vm_area_struct *vma, > - unsigned long addr, > - const struct page *page, > - unsigned int flags, > - pmd_t *pmdp) > +static vm_fault_t hmm_devmem_migrate(struct vm_fault *vmf) > { > - struct hmm_devmem *devmem = page->pgmap->data; > + struct hmm_devmem *devmem = vmf->page->pgmap->data; > > - return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp); > + return devmem->ops->fault(devmem, vmf->vma, vmf->address, vmf->page, > + vmf->flags, vmf->pmd); > } > > static void hmm_devmem_free(struct page *page, void *data) > @@ -1351,6 +1348,7 @@ static void hmm_devmem_free(struct page *page, void *data) > static const struct dev_pagemap_ops hmm_pagemap_ops = { > .page_free = hmm_devmem_free, > .kill = hmm_devmem_ref_kill, > + .migrate = hmm_devmem_migrate, > }; > > /* > @@ -1405,7 +1403,6 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, > devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; > devmem->pfn_last = devmem->pfn_first + > (resource_size(devmem->resource) >> PAGE_SHIFT); > - devmem->page_fault = hmm_devmem_fault; > > devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; > devmem->pagemap.res = *devmem->resource; > diff --git a/mm/memory.c b/mm/memory.c > index ddf20bd0c317..cbf3cb598436 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -2782,13 +2782,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) > migration_entry_wait(vma->vm_mm, vmf->pmd, > vmf->address); > } else if (is_device_private_entry(entry)) { > - /* > - * For un-addressable device memory we call the pgmap > - * fault handler callback. The callback must migrate > - * the page back to some CPU accessible page. > - */ > - ret = device_private_entry_fault(vma, vmf->address, entry, > - vmf->flags, vmf->pmd); > + vmf->page = device_private_entry_to_page(entry); > + ret = page->pgmap->ops->migrate(vmf);This needs to either initialize "page" or be changed to "vmf->page". Otherwise, it is a NULL pointer dereference.> } else if (is_hwpoison_entry(entry)) { > ret = VM_FAULT_HWPOISON; > } else { >You can add: Reviewed-by: Ralph Campbell <rcampbell at nvidia.com>
Ira Weiny
2019-Jun-14 00:22 UTC
[Nouveau] [PATCH 13/22] device-dax: use the dev_pagemap internal refcount
On Thu, Jun 13, 2019 at 11:43:16AM +0200, Christoph Hellwig wrote:> The functionality is identical to the one currently open coded in > device-dax. > > Signed-off-by: Christoph Hellwig <hch at lst.de> > --- > drivers/dax/dax-private.h | 4 --- > drivers/dax/device.c | 52 +-------------------------------------- > 2 files changed, 1 insertion(+), 55 deletions(-) > > diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h > index a45612148ca0..ed04a18a35be 100644 > --- a/drivers/dax/dax-private.h > +++ b/drivers/dax/dax-private.h > @@ -51,8 +51,6 @@ struct dax_region { > * @target_node: effective numa node if dev_dax memory range is onlined > * @dev - device core > * @pgmap - pgmap for memmap setup / lifetime (driver owned) > - * @ref: pgmap reference count (driver owned) > - * @cmp: @ref final put completion (driver owned) > */ > struct dev_dax { > struct dax_region *region; > @@ -60,8 +58,6 @@ struct dev_dax { > int target_node; > struct device dev; > struct dev_pagemap pgmap; > - struct percpu_ref ref; > - struct completion cmp; > }; > > static inline struct dev_dax *to_dev_dax(struct device *dev) > diff --git a/drivers/dax/device.c b/drivers/dax/device.c > index e23fa1bd8c97..a9d7c90ecf1e 100644 > --- a/drivers/dax/device.c > +++ b/drivers/dax/device.c > @@ -14,37 +14,6 @@ > #include "dax-private.h" > #include "bus.h" > > -static struct dev_dax *ref_to_dev_dax(struct percpu_ref *ref) > -{ > - return container_of(ref, struct dev_dax, ref); > -} > - > -static void dev_dax_percpu_release(struct percpu_ref *ref) > -{ > - struct dev_dax *dev_dax = ref_to_dev_dax(ref); > - > - dev_dbg(&dev_dax->dev, "%s\n", __func__); > - complete(&dev_dax->cmp); > -} > - > -static void dev_dax_percpu_exit(void *data) > -{ > - struct percpu_ref *ref = data; > - struct dev_dax *dev_dax = ref_to_dev_dax(ref); > - > - dev_dbg(&dev_dax->dev, "%s\n", __func__); > - wait_for_completion(&dev_dax->cmp); > - percpu_ref_exit(ref); > -} > - > -static void dev_dax_percpu_kill(struct dev_pagemap *pgmap) > -{ > - struct dev_dax *dev_dax = container_of(pgmap, struct dev_dax, pgmap); > - > - dev_dbg(&dev_dax->dev, "%s\n", __func__); > - percpu_ref_kill(pgmap->ref); > -} > - > static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma, > const char *func) > { > @@ -442,10 +411,6 @@ static void dev_dax_kill(void *dev_dax) > kill_dev_dax(dev_dax); > } > > -static const struct dev_pagemap_ops dev_dax_pagemap_ops = { > - .kill = dev_dax_percpu_kill, > -}; > - > int dev_dax_probe(struct device *dev) > { > struct dev_dax *dev_dax = to_dev_dax(dev); > @@ -463,24 +428,9 @@ int dev_dax_probe(struct device *dev) > return -EBUSY; > } > > - init_completion(&dev_dax->cmp); > - rc = percpu_ref_init(&dev_dax->ref, dev_dax_percpu_release, 0, > - GFP_KERNEL); > - if (rc) > - return rc; > - > - rc = devm_add_action_or_reset(dev, dev_dax_percpu_exit, &dev_dax->ref); > - if (rc) > - return rc; > - > - dev_dax->pgmap.ref = &dev_dax->ref;I don't think this exactly correct. pgmap.ref is a pointer to the dev_dax ref structure. Taking it away will cause devm_memremap_pages() to fail AFAICS. I think you need to change struct dev_pagemap as well: diff --git a/include/linux/memremap.h b/include/linux/memremap.h index f0628660d541..5e2120589ddf 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -90,7 +90,7 @@ struct dev_pagemap { struct vmem_altmap altmap; bool altmap_valid; struct resource res; - struct percpu_ref *ref; + struct percpu_ref ref; void (*kill)(struct percpu_ref *ref); struct device *dev; void *data; And all usages of it, right? Ira> - dev_dax->pgmap.ops = &dev_dax_pagemap_ops; > addr = devm_memremap_pages(dev, &dev_dax->pgmap); > - if (IS_ERR(addr)) { > - devm_remove_action(dev, dev_dax_percpu_exit, &dev_dax->ref); > - percpu_ref_exit(&dev_dax->ref); > + if (IS_ERR(addr)) > return PTR_ERR(addr); > - } > > inode = dax_inode(dax_dev); > cdev = inode->i_cdev; > -- > 2.20.1 > > _______________________________________________ > Linux-nvdimm mailing list > Linux-nvdimm at lists.01.org > https://lists.01.org/mailman/listinfo/linux-nvdimm
John Hubbard
2019-Jun-14 00:54 UTC
[Nouveau] [PATCH 03/22] mm: remove hmm_devmem_add_resource
On 6/13/19 2:43 AM, Christoph Hellwig wrote:> This function has never been used since it was first added to the kernel > more than a year and a half ago, and if we ever grow a consumer of the > MEMORY_DEVICE_PUBLIC infrastructure it can easily use devm_memremap_pages > directly now that we've simplified the API for it. > > Signed-off-by: Christoph Hellwig <hch at lst.de> > --- > include/linux/hmm.h | 3 --- > mm/hmm.c | 54 --------------------------------------------- > 2 files changed, 57 deletions(-) >No objections here, good cleanup. Reviewed-by: John Hubbard <jhubbard at nvidia.com> thanks, -- John Hubbard NVIDIA> diff --git a/include/linux/hmm.h b/include/linux/hmm.h > index 4867b9da1b6c..5761a39221a6 100644 > --- a/include/linux/hmm.h > +++ b/include/linux/hmm.h > @@ -688,9 +688,6 @@ struct hmm_devmem { > struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, > struct device *device, > unsigned long size); > -struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, > - struct device *device, > - struct resource *res); > > /* > * hmm_devmem_page_set_drvdata - set per-page driver data field > diff --git a/mm/hmm.c b/mm/hmm.c > index ff2598eb7377..0c62426d1257 100644 > --- a/mm/hmm.c > +++ b/mm/hmm.c > @@ -1445,58 +1445,4 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, > return devmem; > } > EXPORT_SYMBOL_GPL(hmm_devmem_add); > - > -struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, > - struct device *device, > - struct resource *res) > -{ > - struct hmm_devmem *devmem; > - void *result; > - int ret; > - > - if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY) > - return ERR_PTR(-EINVAL); > - > - dev_pagemap_get_ops(); > - > - devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL); > - if (!devmem) > - return ERR_PTR(-ENOMEM); > - > - init_completion(&devmem->completion); > - devmem->pfn_first = -1UL; > - devmem->pfn_last = -1UL; > - devmem->resource = res; > - devmem->device = device; > - devmem->ops = ops; > - > - ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, > - 0, GFP_KERNEL); > - if (ret) > - return ERR_PTR(ret); > - > - ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit, > - &devmem->ref); > - if (ret) > - return ERR_PTR(ret); > - > - devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; > - devmem->pfn_last = devmem->pfn_first + > - (resource_size(devmem->resource) >> PAGE_SHIFT); > - devmem->page_fault = hmm_devmem_fault; > - > - devmem->pagemap.type = MEMORY_DEVICE_PUBLIC; > - devmem->pagemap.res = *devmem->resource; > - devmem->pagemap.page_free = hmm_devmem_free; > - devmem->pagemap.altmap_valid = false; > - devmem->pagemap.ref = &devmem->ref; > - devmem->pagemap.data = devmem; > - devmem->pagemap.kill = hmm_devmem_ref_kill; > - > - result = devm_memremap_pages(devmem->device, &devmem->pagemap); > - if (IS_ERR(result)) > - return result; > - return devmem; > -} > -EXPORT_SYMBOL_GPL(hmm_devmem_add_resource); > #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ >
John Hubbard
2019-Jun-14 01:46 UTC
[Nouveau] [PATCH 04/22] mm: don't clear ->mapping in hmm_devmem_free
On 6/13/19 2:43 AM, Christoph Hellwig wrote:> ->mapping isn't even used by HMM users, and the field at the same offset > in the zone_device part of the union is declared as pad. (Which btw is > rather confusing, as DAX uses ->pgmap and ->mapping from two different > sides of the union, but DAX doesn't use hmm_devmem_free). > > Signed-off-by: Christoph Hellwig <hch at lst.de> > --- > mm/hmm.c | 2 -- > 1 file changed, 2 deletions(-) > > diff --git a/mm/hmm.c b/mm/hmm.c > index 0c62426d1257..e1dc98407e7b 100644 > --- a/mm/hmm.c > +++ b/mm/hmm.c > @@ -1347,8 +1347,6 @@ static void hmm_devmem_free(struct page *page, void *data) > { > struct hmm_devmem *devmem = data; > > - page->mapping = NULL; > - > devmem->ops->free(devmem, page); > } > >Yes, I think that line was unnecessary. I see from git history that it was originally being set to NULL from within __put_devmap_managed_page(), and then in commit 2fa147bdbf672c53386a8f5f2c7fe358004c3ef8, Dan moved it out of there, and stashed in specifically here. But it appears to have been unnecessary from the beginning. Reviewed-by: John Hubbard <jhubbard at nvidia.com> thanks, -- John Hubbard NVIDIA
On 6/13/19 2:43 AM, Christoph Hellwig wrote:> noveau is currently using this through an odd hmm wrapper, and I plan"nouveau"> to switch it to the real thing later in this series. > > Signed-off-by: Christoph Hellwig <hch at lst.de> > ---Reviewed-by: John Hubbard <jhubbard at nvidia.com> thanks, -- John Hubbard NVIDIA> mm/mempolicy.c | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/mm/mempolicy.c b/mm/mempolicy.c > index 01600d80ae01..f9023b5fba37 100644 > --- a/mm/mempolicy.c > +++ b/mm/mempolicy.c > @@ -2098,6 +2098,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, > out: > return page; > } > +EXPORT_SYMBOL_GPL(alloc_pages_vma); > > /** > * alloc_pages_current - Allocate pages. >
John Hubbard
2019-Jun-14 01:53 UTC
[Nouveau] [PATCH 22/22] mm: don't select MIGRATE_VMA_HELPER from HMM_MIRROR
On 6/13/19 2:43 AM, Christoph Hellwig wrote:> The migrate_vma helper is only used by noveau to migrate device private > pages around. Other HMM_MIRROR users like amdgpu or infiniband don't > need it. > > Signed-off-by: Christoph Hellwig <hch at lst.de> > --- > drivers/gpu/drm/nouveau/Kconfig | 1 + > mm/Kconfig | 1 - > 2 files changed, 1 insertion(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/nouveau/Kconfig b/drivers/gpu/drm/nouveau/Kconfig > index 66c839d8e9d1..96b9814e6d06 100644 > --- a/drivers/gpu/drm/nouveau/Kconfig > +++ b/drivers/gpu/drm/nouveau/Kconfig > @@ -88,6 +88,7 @@ config DRM_NOUVEAU_SVM > depends on DRM_NOUVEAU > depends on HMM_MIRROR > depends on STAGING > + select MIGRATE_VMA_HELPER > default n > help > Say Y here if you want to enable experimental support for > diff --git a/mm/Kconfig b/mm/Kconfig > index 73676cb4693f..eca88679b624 100644 > --- a/mm/Kconfig > +++ b/mm/Kconfig > @@ -679,7 +679,6 @@ config HMM_MIRROR > bool "HMM mirror CPU page table into a device page table" > depends on MMU > select MMU_NOTIFIER > - select MIGRATE_VMA_HELPER > help > Select HMM_MIRROR if you want to mirror range of the CPU page table of a > process into a device page table. Here, mirror means "keep synchronized". >For those who have out of tree drivers that need migrate_vma(), but are not Nouveau, could we pretty please allow a way to select that independently? It's not a big deal, as I expect the Nouveau option will normally be selected, but it would be nice. Because there is a valid configuration that involves Nouveau not being selected, but our driver still wanting to run. Maybe we can add something like this on top of what you have? diff --git a/mm/Kconfig b/mm/Kconfig index eca88679b624..330996632513 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -670,7 +670,10 @@ config ZONE_DEVICE If FS_DAX is enabled, then say Y. config MIGRATE_VMA_HELPER - bool + bool "migrate_vma() helper routine" + help + Provides a migrate_vma() routine that GPUs and other + device drivers may need. config DEV_PAGEMAP_OPS bool thanks, -- John Hubbard NVIDIA
On Thu, Jun 13, 2019 at 02:16:27PM +0000, Jason Gunthorpe wrote:> On Thu, Jun 13, 2019 at 11:43:03AM +0200, Christoph Hellwig wrote: > > Hi Dan, Jérôme and Jason, > > > > below is a series that cleans up the dev_pagemap interface so that > > it is more easily usable, which removes the need to wrap it in hmm > > and thus allowing to kill a lot of code > > Do you want some of this to run through hmm.git? I see many patches > that don't seem to have inter-dependencies..I think running it through hmm.git makes sense. While there are not actual functional dependency and just a few cosmetic conflicts keeping the hmm stuff together makes a lot of sense.
On Thu, Jun 13, 2019 at 11:27:39AM -0700, Dan Williams wrote:> It also turns out the nvdimm unit tests crash with this signature on > that branch where base v5.2-rc3 passes:How do you run that test?
John Hubbard
2019-Jun-15 02:21 UTC
[Nouveau] [PATCH 06/22] mm: factor out a devm_request_free_mem_region helper
On 6/13/19 2:43 AM, Christoph Hellwig wrote:> Keep the physical address allocation that hmm_add_device does with the > rest of the resource code, and allow future reuse of it without the hmm > wrapper. > > Signed-off-by: Christoph Hellwig <hch at lst.de> > --- > include/linux/ioport.h | 2 ++ > kernel/resource.c | 39 +++++++++++++++++++++++++++++++++++++++ > mm/hmm.c | 33 ++++----------------------------- > 3 files changed, 45 insertions(+), 29 deletions(-)Some trivial typos noted below, but this accurately moves the code into a helper routine, looks good. Reviewed-by: John Hubbard <jhubbard at nvidia.com>> > diff --git a/include/linux/ioport.h b/include/linux/ioport.h > index da0ebaec25f0..76a33ae3bf6c 100644 > --- a/include/linux/ioport.h > +++ b/include/linux/ioport.h > @@ -286,6 +286,8 @@ static inline bool resource_overlaps(struct resource *r1, struct resource *r2) > return (r1->start <= r2->end && r1->end >= r2->start); > } > > +struct resource *devm_request_free_mem_region(struct device *dev, > + struct resource *base, unsigned long size); > > #endif /* __ASSEMBLY__ */ > #endif /* _LINUX_IOPORT_H */ > diff --git a/kernel/resource.c b/kernel/resource.c > index 158f04ec1d4f..99c58134ed1c 100644 > --- a/kernel/resource.c > +++ b/kernel/resource.c > @@ -1628,6 +1628,45 @@ void resource_list_free(struct list_head *head) > } > EXPORT_SYMBOL(resource_list_free); > > +#ifdef CONFIG_DEVICE_PRIVATE > +/** > + * devm_request_free_mem_region - find free region for device private memory > + * > + * @dev: device struct to bind the resource too"to"> + * @size: size in bytes of the device memory to add > + * @base: resource tree to look in > + * > + * This function tries to find an empty range of physical address big enough to > + * contain the new resource, so that it can later be hotpluged as ZONE_DEVICE"hotplugged"> + * memory, which in turn allocates struct pages. > + */ > +struct resource *devm_request_free_mem_region(struct device *dev, > + struct resource *base, unsigned long size) > +{ > + resource_size_t end, addr; > + struct resource *res; > + > + size = ALIGN(size, 1UL << PA_SECTION_SHIFT); > + end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1); > + addr = end - size + 1UL; > + > + for (; addr > size && addr >= base->start; addr -= size) { > + if (region_intersects(addr, size, 0, IORES_DESC_NONE) !> + REGION_DISJOINT) > + continue; > + > + res = devm_request_mem_region(dev, addr, size, dev_name(dev)); > + if (!res) > + return ERR_PTR(-ENOMEM); > + res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; > + return res; > + } > + > + return ERR_PTR(-ERANGE); > +} > +EXPORT_SYMBOL_GPL(devm_request_free_mem_region); > +#endif /* CONFIG_DEVICE_PRIVATE */ > + > static int __init strict_iomem(char *str) > { > if (strstr(str, "relaxed")) > diff --git a/mm/hmm.c b/mm/hmm.c > index e1dc98407e7b..13a16faf0a77 100644 > --- a/mm/hmm.c > +++ b/mm/hmm.c > @@ -26,8 +26,6 @@ > #include <linux/mmu_notifier.h> > #include <linux/memory_hotplug.h> > > -#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT) > - > #if IS_ENABLED(CONFIG_HMM_MIRROR) > static const struct mmu_notifier_ops hmm_mmu_notifier_ops; > > @@ -1372,7 +1370,6 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, > unsigned long size) > { > struct hmm_devmem *devmem; > - resource_size_t addr; > void *result; > int ret; > > @@ -1398,32 +1395,10 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, > if (ret) > return ERR_PTR(ret); > > - size = ALIGN(size, PA_SECTION_SIZE); > - addr = min((unsigned long)iomem_resource.end, > - (1UL << MAX_PHYSMEM_BITS) - 1); > - addr = addr - size + 1UL; > - > - /* > - * FIXME add a new helper to quickly walk resource tree and find free > - * range > - * > - * FIXME what about ioport_resource resource ? > - */ > - for (; addr > size && addr >= iomem_resource.start; addr -= size) { > - ret = region_intersects(addr, size, 0, IORES_DESC_NONE); > - if (ret != REGION_DISJOINT) > - continue; > - > - devmem->resource = devm_request_mem_region(device, addr, size, > - dev_name(device)); > - if (!devmem->resource) > - return ERR_PTR(-ENOMEM); > - break; > - } > - if (!devmem->resource) > - return ERR_PTR(-ERANGE); > - > - devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; > + devmem->resource = devm_request_free_mem_region(device, &iomem_resource, > + size); > + if (IS_ERR(devmem->resource)) > + return ERR_CAST(devmem->resource); > devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; > devmem->pfn_last = devmem->pfn_first + > (resource_size(devmem->resource) >> PAGE_SHIFT); >thanks, -- John Hubbard NVIDIA
On Thu 13-06-19 11:43:08, Christoph Hellwig wrote:> noveau is currently using this through an odd hmm wrapper, and I plan > to switch it to the real thing later in this series. > > Signed-off-by: Christoph Hellwig <hch at lst.de> > --- > mm/mempolicy.c | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/mm/mempolicy.c b/mm/mempolicy.c > index 01600d80ae01..f9023b5fba37 100644 > --- a/mm/mempolicy.c > +++ b/mm/mempolicy.c > @@ -2098,6 +2098,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, > out: > return page; > } > +EXPORT_SYMBOL_GPL(alloc_pages_vma);All allocator exported symbols are EXPORT_SYMBOL, what is a reason to have this one special? -- Michal Hocko SUSE Labs
Michal Hocko
2019-Jun-20 19:26 UTC
[Nouveau] [PATCH 18/22] mm: mark DEVICE_PUBLIC as broken
On Thu 13-06-19 11:43:21, Christoph Hellwig wrote:> The code hasn't been used since it was added to the tree, and doesn't > appear to actually be usable. Mark it as BROKEN until either a user > comes along or we finally give up on it.I would go even further and simply remove all the DEVICE_PUBLIC code.> Signed-off-by: Christoph Hellwig <hch at lst.de>Anyway Acked-by: Michal Hocko <mhocko at suse.com>> --- > mm/Kconfig | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/mm/Kconfig b/mm/Kconfig > index 0d2ba7e1f43e..406fa45e9ecc 100644 > --- a/mm/Kconfig > +++ b/mm/Kconfig > @@ -721,6 +721,7 @@ config DEVICE_PRIVATE > config DEVICE_PUBLIC > bool "Addressable device memory (like GPU memory)" > depends on ARCH_HAS_HMM > + depends on BROKEN > select HMM > select DEV_PAGEMAP_OPS > > -- > 2.20.1-- Michal Hocko SUSE Labs
Michal Hocko
2019-Jun-20 19:32 UTC
[Nouveau] [PATCH 03/22] mm: remove hmm_devmem_add_resource
On Thu 13-06-19 11:43:06, Christoph Hellwig wrote:> This function has never been used since it was first added to the kernel > more than a year and a half ago, and if we ever grow a consumer of the > MEMORY_DEVICE_PUBLIC infrastructure it can easily use devm_memremap_pages > directly now that we've simplified the API for it. > > Signed-off-by: Christoph Hellwig <hch at lst.de>Acked-by: Michal Hocko <mhocko at suse.com>> --- > include/linux/hmm.h | 3 --- > mm/hmm.c | 54 --------------------------------------------- > 2 files changed, 57 deletions(-) > > diff --git a/include/linux/hmm.h b/include/linux/hmm.h > index 4867b9da1b6c..5761a39221a6 100644 > --- a/include/linux/hmm.h > +++ b/include/linux/hmm.h > @@ -688,9 +688,6 @@ struct hmm_devmem { > struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, > struct device *device, > unsigned long size); > -struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, > - struct device *device, > - struct resource *res); > > /* > * hmm_devmem_page_set_drvdata - set per-page driver data field > diff --git a/mm/hmm.c b/mm/hmm.c > index ff2598eb7377..0c62426d1257 100644 > --- a/mm/hmm.c > +++ b/mm/hmm.c > @@ -1445,58 +1445,4 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, > return devmem; > } > EXPORT_SYMBOL_GPL(hmm_devmem_add); > - > -struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, > - struct device *device, > - struct resource *res) > -{ > - struct hmm_devmem *devmem; > - void *result; > - int ret; > - > - if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY) > - return ERR_PTR(-EINVAL); > - > - dev_pagemap_get_ops(); > - > - devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL); > - if (!devmem) > - return ERR_PTR(-ENOMEM); > - > - init_completion(&devmem->completion); > - devmem->pfn_first = -1UL; > - devmem->pfn_last = -1UL; > - devmem->resource = res; > - devmem->device = device; > - devmem->ops = ops; > - > - ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, > - 0, GFP_KERNEL); > - if (ret) > - return ERR_PTR(ret); > - > - ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit, > - &devmem->ref); > - if (ret) > - return ERR_PTR(ret); > - > - devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; > - devmem->pfn_last = devmem->pfn_first + > - (resource_size(devmem->resource) >> PAGE_SHIFT); > - devmem->page_fault = hmm_devmem_fault; > - > - devmem->pagemap.type = MEMORY_DEVICE_PUBLIC; > - devmem->pagemap.res = *devmem->resource; > - devmem->pagemap.page_free = hmm_devmem_free; > - devmem->pagemap.altmap_valid = false; > - devmem->pagemap.ref = &devmem->ref; > - devmem->pagemap.data = devmem; > - devmem->pagemap.kill = hmm_devmem_ref_kill; > - > - result = devm_memremap_pages(devmem->device, &devmem->pagemap); > - if (IS_ERR(result)) > - return result; > - return devmem; > -} > -EXPORT_SYMBOL_GPL(hmm_devmem_add_resource); > #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ > -- > 2.20.1-- Michal Hocko SUSE Labs
Michal Hocko
2019-Jun-20 19:36 UTC
[Nouveau] [PATCH 04/22] mm: don't clear ->mapping in hmm_devmem_free
On Thu 13-06-19 11:43:07, Christoph Hellwig wrote:> ->mapping isn't even used by HMM users, and the field at the same offset > in the zone_device part of the union is declared as pad. (Which btw is > rather confusing, as DAX uses ->pgmap and ->mapping from two different > sides of the union, but DAX doesn't use hmm_devmem_free). > > Signed-off-by: Christoph Hellwig <hch at lst.de>I cannot really judge here but setting mapping here without any comment is quite confusing. So if this is safe to remove then it is certainly an improvement.> --- > mm/hmm.c | 2 -- > 1 file changed, 2 deletions(-) > > diff --git a/mm/hmm.c b/mm/hmm.c > index 0c62426d1257..e1dc98407e7b 100644 > --- a/mm/hmm.c > +++ b/mm/hmm.c > @@ -1347,8 +1347,6 @@ static void hmm_devmem_free(struct page *page, void *data) > { > struct hmm_devmem *devmem = data; > > - page->mapping = NULL; > - > devmem->ops->free(devmem, page); > } > > -- > 2.20.1-- Michal Hocko SUSE Labs