David Hildenbrand
2022-Feb-14 17:41 UTC
[PATCH v1 0/2] mm: enforce pageblock_order < MAX_ORDER
Having pageblock_order >= MAX_ORDER seems to be able to happen in corner cases and some parts of the kernel are not prepared for it. For example, Aneesh has shown [1] that such kernels can be compiled on ppc64 with 64k base pages by setting FORCE_MAX_ZONEORDER=8, which will run into a WARN_ON_ONCE(order >= MAX_ORDER) in comapction code right during boot. We can get pageblock_order >= MAX_ORDER when the default hugetlb size is bigger than the maximum allocation granularity of the buddy, in which case we are no longer talking about huge pages but instead gigantic pages. Having pageblock_order >= MAX_ORDER can only make alloc_contig_range() of such gigantic pages more likely to succeed. Reliable use of gigantic pages either requires boot time allcoation or CMA, no need to overcomplicate some places in the kernel to optimize for corner cases that are broken in other areas of the kernel. Let's enforce pageblock_order < MAX_ORDER and simplify. Especially patch #1 can be regarded a cleanup before: [PATCH v5 0/6] Use pageblock_order for cma and alloc_contig_range alignment. [2] [1] https://lkml.kernel.org/r/87r189a2ks.fsf at linux.ibm.com [2] https://lkml.kernel.org/r/20220211164135.1803616-1-zi.yan at sent.com Cc: Andrew Morton <akpm at linux-foundation.org> Cc: Aneesh Kumar K V <aneesh.kumar at linux.ibm.com> Cc: Zi Yan <ziy at nvidia.com> Cc: Michael Ellerman <mpe at ellerman.id.au> Cc: Benjamin Herrenschmidt <benh at kernel.crashing.org> Cc: Paul Mackerras <paulus at samba.org> Cc: Rob Herring <robh+dt at kernel.org> Cc: Frank Rowand <frowand.list at gmail.com> Cc: "Michael S. Tsirkin" <mst at redhat.com> Cc: Christoph Hellwig <hch at lst.de> Cc: Marek Szyprowski <m.szyprowski at samsung.com> Cc: Robin Murphy <robin.murphy at arm.com> Cc: Minchan Kim <minchan at kernel.org> Cc: Vlastimil Babka <vbabka at suse.cz> Cc: linuxppc-dev at lists.ozlabs.org Cc: devicetree at vger.kernel.org Cc: virtualization at lists.linux-foundation.org Cc: iommu at lists.linux-foundation.org Cc: linux-mm at kvack.org David Hildenbrand (2): cma: factor out minimum alignment requirement mm: enforce pageblock_order < MAX_ORDER arch/powerpc/include/asm/fadump-internal.h | 5 ---- arch/powerpc/kernel/fadump.c | 2 +- drivers/of/of_reserved_mem.c | 9 ++---- drivers/virtio/virtio_mem.c | 9 ++---- include/linux/cma.h | 8 ++++++ include/linux/pageblock-flags.h | 7 +++-- kernel/dma/contiguous.c | 4 +-- mm/Kconfig | 3 ++ mm/cma.c | 20 ++++---------- mm/page_alloc.c | 32 ++++++---------------- 10 files changed, 37 insertions(+), 62 deletions(-) base-commit: 754e0b0e35608ed5206d6a67a791563c631cec07 -- 2.34.1
David Hildenbrand
2022-Feb-14 17:41 UTC
[PATCH v1 1/2] cma: factor out minimum alignment requirement
Let's factor out determining the minimum alignment requirement for CMA and add a helpful comment. No functional change intended. Signed-off-by: David Hildenbrand <david at redhat.com> --- arch/powerpc/include/asm/fadump-internal.h | 5 ----- arch/powerpc/kernel/fadump.c | 2 +- drivers/of/of_reserved_mem.c | 9 +++------ include/linux/cma.h | 9 +++++++++ kernel/dma/contiguous.c | 4 +--- mm/cma.c | 20 +++++--------------- 6 files changed, 19 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h index 52189928ec08..81bcb9abb371 100644 --- a/arch/powerpc/include/asm/fadump-internal.h +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -19,11 +19,6 @@ #define memblock_num_regions(memblock_type) (memblock.memblock_type.cnt) -/* Alignment per CMA requirement. */ -#define FADUMP_CMA_ALIGNMENT (PAGE_SIZE << \ - max_t(unsigned long, MAX_ORDER - 1, \ - pageblock_order)) - /* FAD commands */ #define FADUMP_REGISTER 1 #define FADUMP_UNREGISTER 2 diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index d03e488cfe9c..7eb67201ea41 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -544,7 +544,7 @@ int __init fadump_reserve_mem(void) if (!fw_dump.nocma) { fw_dump.boot_memory_size ALIGN(fw_dump.boot_memory_size, - FADUMP_CMA_ALIGNMENT); + CMA_MIN_ALIGNMENT_BYTES); } #endif diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index 9c0fb962c22b..75caa6f5d36f 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -22,6 +22,7 @@ #include <linux/slab.h> #include <linux/memblock.h> #include <linux/kmemleak.h> +#include <linux/cma.h> #include "of_private.h" @@ -116,12 +117,8 @@ static int __init __reserved_mem_alloc_size(unsigned long node, if (IS_ENABLED(CONFIG_CMA) && of_flat_dt_is_compatible(node, "shared-dma-pool") && of_get_flat_dt_prop(node, "reusable", NULL) - && !nomap) { - unsigned long order - max_t(unsigned long, MAX_ORDER - 1, pageblock_order); - - align = max(align, (phys_addr_t)PAGE_SIZE << order); - } + && !nomap) + align = max_t(phys_addr_t, align, CMA_MIN_ALIGNMENT_BYTES); prop = of_get_flat_dt_prop(node, "alloc-ranges", &len); if (prop) { diff --git a/include/linux/cma.h b/include/linux/cma.h index bd801023504b..75fe188ec4a1 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -20,6 +20,15 @@ #define CMA_MAX_NAME 64 +/* + * TODO: once the buddy -- especially pageblock merging and alloc_contig_range() + * -- can deal with only some pageblocks of a higher-order page being + * MIGRATE_CMA, we can use pageblock_nr_pages. + */ +#define CMA_MIN_ALIGNMENT_PAGES max_t(phys_addr_t, MAX_ORDER_NR_PAGES, \ + pageblock_nr_pages) +#define CMA_MIN_ALIGNMENT_BYTES (PAGE_SIZE * CMA_MIN_ALIGNMENT_PAGES) + struct cma; extern unsigned long totalcma_pages; diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 3d63d91cba5c..6ea80ae42622 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -399,8 +399,6 @@ static const struct reserved_mem_ops rmem_cma_ops = { static int __init rmem_cma_setup(struct reserved_mem *rmem) { - phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); - phys_addr_t mask = align - 1; unsigned long node = rmem->fdt_node; bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL); struct cma *cma; @@ -416,7 +414,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) of_get_flat_dt_prop(node, "no-map", NULL)) return -EINVAL; - if ((rmem->base & mask) || (rmem->size & mask)) { + if (!IS_ALIGNED(rmem->base | rmem->size, CMA_MIN_ALIGNMENT_BYTES)) { pr_err("Reserved memory: incorrect alignment of CMA region\n"); return -EINVAL; } diff --git a/mm/cma.c b/mm/cma.c index bc9ca8f3c487..5a2cd5851658 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -168,7 +168,6 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, struct cma **res_cma) { struct cma *cma; - phys_addr_t alignment; /* Sanity checks */ if (cma_area_count == ARRAY_SIZE(cma_areas)) { @@ -179,15 +178,12 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, if (!size || !memblock_is_region_reserved(base, size)) return -EINVAL; - /* ensure minimal alignment required by mm core */ - alignment = PAGE_SIZE << - max_t(unsigned long, MAX_ORDER - 1, pageblock_order); - /* alignment should be aligned with order_per_bit */ - if (!IS_ALIGNED(alignment >> PAGE_SHIFT, 1 << order_per_bit)) + if (!IS_ALIGNED(CMA_MIN_ALIGNMENT_PAGES, 1 << order_per_bit)) return -EINVAL; - if (ALIGN(base, alignment) != base || ALIGN(size, alignment) != size) + /* ensure minimal alignment required by mm core */ + if (!IS_ALIGNED(base | size, CMA_MIN_ALIGNMENT_BYTES)) return -EINVAL; /* @@ -262,14 +258,8 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, if (alignment && !is_power_of_2(alignment)) return -EINVAL; - /* - * Sanitise input arguments. - * Pages both ends in CMA area could be merged into adjacent unmovable - * migratetype page by page allocator's buddy algorithm. In the case, - * you couldn't get a contiguous memory, which is not what we want. - */ - alignment = max(alignment, (phys_addr_t)PAGE_SIZE << - max_t(unsigned long, MAX_ORDER - 1, pageblock_order)); + /* Sanitise input arguments. */ + alignment = max_t(phys_addr_t, alignment, CMA_MIN_ALIGNMENT_BYTES); if (fixed && base & (alignment - 1)) { ret = -EINVAL; pr_err("Region at %pa must be aligned to %pa bytes\n", -- 2.34.1
David Hildenbrand
2022-Feb-14 17:41 UTC
[PATCH v1 2/2] mm: enforce pageblock_order < MAX_ORDER
Some places in the kernel don't really expect pageblock_order >MAX_ORDER, and it looks like this is only possible in corner cases: 1) CONFIG_DEFERRED_STRUCT_PAGE_INIT we'll end up freeing pageblock_order pages via __free_pages_core(), which cannot possibly work. 2) find_zone_movable_pfns_for_nodes() will roundup the ZONE_MOVABLE start PFN to MAX_ORDER_NR_PAGES. Consequently with a bigger pageblock_order, we could have a single pageblock partially managed by two zones. 3) compaction code runs into __fragmentation_index() with order >= MAX_ORDER, when checking WARN_ON_ONCE(order >= MAX_ORDER). [1] 4) mm/page_reporting.c won't be reporting any pages with default page_reporting_order == pageblock_order, as we'll be skipping the reporting loop inside page_reporting_process_zone(). 5) __rmqueue_fallback() will never be able to steal with ALLOC_NOFRAGMENT. pageblock_order >= MAX_ORDER is weird either way: it's a pure optimization for making alloc_contig_range(), as used for allcoation of gigantic pages, a little more reliable to succeed. However, if there is demand for somewhat reliable allocation of gigantic pages, affected setups should be using CMA or boottime allocations instead. So let's make sure that pageblock_order < MAX_ORDER and simplify. [1] https://lkml.kernel.org/r/87r189a2ks.fsf at linux.ibm.com Signed-off-by: David Hildenbrand <david at redhat.com> --- drivers/virtio/virtio_mem.c | 9 +++------ include/linux/cma.h | 3 +-- include/linux/pageblock-flags.h | 7 +++++-- mm/Kconfig | 3 +++ mm/page_alloc.c | 32 ++++++++------------------------ 5 files changed, 20 insertions(+), 34 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index 38becd8d578c..e7d6b679596d 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -2476,13 +2476,10 @@ static int virtio_mem_init_hotplug(struct virtio_mem *vm) VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD); /* - * We want subblocks to span at least MAX_ORDER_NR_PAGES and - * pageblock_nr_pages pages. This: - * - Is required for now for alloc_contig_range() to work reliably - - * it doesn't properly handle smaller granularity on ZONE_NORMAL. + * TODO: once alloc_contig_range() works reliably with pageblock + * granularity on ZONE_NORMAL, use pageblock_nr_pages instead. */ - sb_size = max_t(uint64_t, MAX_ORDER_NR_PAGES, - pageblock_nr_pages) * PAGE_SIZE; + sb_size = PAGE_SIZE * MAX_ORDER_NR_PAGES; sb_size = max_t(uint64_t, vm->device_block_size, sb_size); if (sb_size < memory_block_size_bytes() && !force_bbm) { diff --git a/include/linux/cma.h b/include/linux/cma.h index 75fe188ec4a1..b1ba94f1cc9c 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -25,8 +25,7 @@ * -- can deal with only some pageblocks of a higher-order page being * MIGRATE_CMA, we can use pageblock_nr_pages. */ -#define CMA_MIN_ALIGNMENT_PAGES max_t(phys_addr_t, MAX_ORDER_NR_PAGES, \ - pageblock_nr_pages) +#define CMA_MIN_ALIGNMENT_PAGES MAX_ORDER_NR_PAGES #define CMA_MIN_ALIGNMENT_BYTES (PAGE_SIZE * CMA_MIN_ALIGNMENT_PAGES) struct cma; diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 973fd731a520..83c7248053a1 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -37,8 +37,11 @@ extern unsigned int pageblock_order; #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ -/* Huge pages are a constant size */ -#define pageblock_order HUGETLB_PAGE_ORDER +/* + * Huge pages are a constant size, but don't exceed the maximum allocation + * granularity. + */ +#define pageblock_order min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_ORDER - 1) #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ diff --git a/mm/Kconfig b/mm/Kconfig index 3326ee3903f3..4c91b92e7537 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -262,6 +262,9 @@ config HUGETLB_PAGE_SIZE_VARIABLE HUGETLB_PAGE_ORDER when there are multiple HugeTLB page sizes available on a platform. + Note that the pageblock_order cannot exceed MAX_ORDER - 1 and will be + clamped down to MAX_ORDER - 1. + config CONTIG_ALLOC def_bool (MEMORY_ISOLATION && COMPACTION) || CMA diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3589febc6d31..04cf964b57b5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1072,14 +1072,12 @@ static inline void __free_one_page(struct page *page, int migratetype, fpi_t fpi_flags) { struct capture_control *capc = task_capc(zone); + unsigned int max_order = pageblock_order; unsigned long buddy_pfn; unsigned long combined_pfn; - unsigned int max_order; struct page *buddy; bool to_tail; - max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order); - VM_BUG_ON(!zone_is_initialized(zone)); VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); @@ -2260,19 +2258,8 @@ void __init init_cma_reserved_pageblock(struct page *page) } while (++p, --i); set_pageblock_migratetype(page, MIGRATE_CMA); - - if (pageblock_order >= MAX_ORDER) { - i = pageblock_nr_pages; - p = page; - do { - set_page_refcounted(p); - __free_pages(p, MAX_ORDER - 1); - p += MAX_ORDER_NR_PAGES; - } while (i -= MAX_ORDER_NR_PAGES); - } else { - set_page_refcounted(page); - __free_pages(page, pageblock_order); - } + set_page_refcounted(page); + __free_pages(page, pageblock_order); adjust_managed_page_count(page, pageblock_nr_pages); page_zone(page)->cma_pages += pageblock_nr_pages; @@ -7389,16 +7376,15 @@ static inline void setup_usemap(struct zone *zone) {} /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ void __init set_pageblock_order(void) { - unsigned int order; + unsigned int order = MAX_ORDER - 1; /* Check that pageblock_nr_pages has not already been setup */ if (pageblock_order) return; - if (HPAGE_SHIFT > PAGE_SHIFT) + /* Don't let pageblocks exceed the maximum allocation granularity. */ + if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order) order = HUGETLB_PAGE_ORDER; - else - order = MAX_ORDER - 1; /* * Assume the largest contiguous order of interest is a huge page. @@ -8986,14 +8972,12 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page, #ifdef CONFIG_CONTIG_ALLOC static unsigned long pfn_max_align_down(unsigned long pfn) { - return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, - pageblock_nr_pages) - 1); + return ALIGN_DOWN(pfn, MAX_ORDER_NR_PAGES); } static unsigned long pfn_max_align_up(unsigned long pfn) { - return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, - pageblock_nr_pages)); + return ALIGN(pfn, MAX_ORDER_NR_PAGES); } #if defined(CONFIG_DYNAMIC_DEBUG) || \ -- 2.34.1