This patch introduces a per-node layer to the buddy allocator. Xen currently
defines the heap as a two-dimensional array, [zone][order]. This patch adds a
node layer between zone and order. This allows Xen to hand memory out in the
proper zone while preferring local memory allocation, but can fall back on
non-local to satisfy a zone request. When the heap is initialized, for each
page that is added we determine the node to which the page belongs and insert
it into the proper zone, node and order. When allocating memory via the
alloc_heap_pages() function, we try to satisfy the zone request in the target
node, which is determined by the requesting cpu. If no memory is found in the
target node for a given zone, we examine other nodes before increasing the
order of the memory request.

Existing heap API has been preserved and uses smp_processor_id() to supply the
required parameter to alloc_heap_pages() and alloc_domheap_pages(). Also, Xen
code can directly call alloc_heap_pages()/__alloc_domheap_pages(), supplying
the required cpu parameter to request pages local to the processor in question.
avail_heap_pages() and avail_domheap_pages() have been altered to provide an
easier method for querying total available memory given a zone or node.

-- 
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
(512) 838-9253   T/L: 678-9253
ryanh@us.ibm.com

diffstat output:
 common/page_alloc.c |  254 ++++++++++++++++++++++++++++++++++++++++++----------
 include/xen/mm.h    |   10 +-
 include/xen/numa.h  |    2
 3 files changed, 217 insertions(+), 49 deletions(-)

Signed-off-by: Ryan Harper <ryanh@us.ibm.com>
Signed-off-by: Ryan Grimm <grimm@us.ibm.com>
---
# HG changeset patch
# User Ryan Harper <ryanh@us.ibm.com>
# Node ID 44ee2cfd164d091249dd133fa65dfd1a4d3f1e66
# Parent  2a81ffed9e53be432c95c3bc99fa2fadd8f93bb9
This patch introduces a per-node layer to the buddy allocator. Xen currently
defines the heap as a two-dimensional array, [zone][order]. This patch adds a
node layer between zone and order. This allows Xen to hand memory out in the
proper zone while preferring local memory allocation, but can fall back on
non-local to satisfy a zone request. When the heap is initialized, for each
page that is added we determine the node to which the page belongs and insert
it into the proper zone, node and order. When allocating memory via the
alloc_heap_pages() function, we try to satisfy the zone request in the target
node, which is determined by the requesting cpu. If no memory is found in the
target node for a given zone, we examine other nodes before increasing the
order of the memory request.

Existing heap API has been preserved and uses smp_processor_id() to supply the
required parameter to alloc_heap_pages() and alloc_domheap_pages(). Also, Xen
code can directly call alloc_heap_pages()/__alloc_domheap_pages(), supplying
the required cpu parameter to request pages local to the processor in question.
avail_heap_pages() and avail_domheap_pages() have been altered to provide an
easier method for querying total available memory given a zone or node.

diff -r 2a81ffed9e53 -r 44ee2cfd164d xen/common/page_alloc.c
--- a/xen/common/page_alloc.c	Fri Apr 28 17:56:52 2006
+++ b/xen/common/page_alloc.c	Fri Apr 28 18:00:23 2006
@@ -4,6 +4,7 @@
  * Simple buddy heap allocator for Xen.
* * Copyright (c) 2002-2004 K A Fraser + * Copyright (c) 2006 IBM * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -34,6 +35,54 @@ #include <xen/domain_page.h> #include <xen/keyhandler.h> #include <asm/page.h> +#include <xen/nodemask.h> +#ifdef CONFIG_NUMA +#include <xen/numa.h> + +/* min and max paddr per node */ +extern int num_memory_chunks; +extern node_memory_chunk_t node_memory_chunk[]; +extern int cpu_to_node[]; + +/* + * NB: assumes caller used page_spans_chunk to check for + * splitting across chunk boundaries + */ +int page_to_node(struct page_info *pg) +{ + node_memory_chunk_t *c; + u64 pg_paddr = page_to_maddr(pg); + + for (c = node_memory_chunk; c < (node_memory_chunk+num_memory_chunks); c++) { + if ( pg_paddr >= c->start_paddr && pg_paddr <= c->end_paddr ) { + ASSERT(c->nid < num_online_nodes()); + return (int)c->nid; + } + } + return -1; +} + +/* check if the list page is head of spans a chunk */ +int page_spans_chunk(struct page_info *pg, unsigned int order) +{ + node_memory_chunk_t *c; + u64 pg_start = page_to_maddr(pg); + u64 pg_end = pg_start + ((PAGE_SIZE << order)-1); + + if (order == 0) + return 0; /* single page cannot span a chunk */ + + for (c = node_memory_chunk; c < (node_memory_chunk+num_memory_chunks); c++) { + if ( pg_start >= c->start_paddr && pg_start <= c->end_paddr && + pg_end >= c->start_paddr && pg_end <= c->end_paddr ) { + return 0; + } + } + + return 1; +} + +#endif /* * Comma-separated list of hexadecimal page numbers containing bad bytes. @@ -246,9 +295,16 @@ #define pfn_dom_zone_type(_pfn) \ (((_pfn) <= MAX_DMADOM_PFN) ? MEMZONE_DMADOM : MEMZONE_DOM) +/* Up to 2^20 pages can be allocated at once. */ +#ifdef CONFIG_NUMA +static struct list_head heap[NR_ZONES][MAX_NUMNODES][MAX_ORDER+1]; + +static unsigned long avail[NR_ZONES][MAX_NUMNODES]; +#else static struct list_head heap[NR_ZONES][MAX_ORDER+1]; static unsigned long avail[NR_ZONES]; +#endif static spinlock_t heap_lock = SPIN_LOCK_UNLOCKED; @@ -260,8 +316,16 @@ memset(avail, 0, sizeof(avail)); for ( i = 0; i < NR_ZONES; i++ ) +#ifdef CONFIG_NUMA + for ( j = 0; j < MAX_NUMNODES; j++ ) { + unsigned int k; + for ( k = 0; k <= MAX_ORDER; k++ ) + INIT_LIST_HEAD(&heap[i][j][k]); + } +#else for ( j = 0; j <= MAX_ORDER; j++ ) INIT_LIST_HEAD(&heap[i][j]); +#endif /* Pages that are free now go to the domain sub-allocator. */ for ( i = 0; i < max_page; i++ ) @@ -289,11 +353,22 @@ /* Allocate 2^@order contiguous pages. */ -struct page_info *alloc_heap_pages(unsigned int zone, unsigned int order) -{ - int i; +struct page_info *alloc_heap_pages(unsigned int zone, unsigned int cpu, + unsigned int order) +{ + int i,j, node, target_node, nodes_online; struct page_info *pg; - + struct list_head *list; + +#ifdef CONFIG_NUMA + target_node = cpu_to_node[cpu]; + nodes_online = num_online_nodes(); + ASSERT(target_node >= 0); + ASSERT(target_node < num_online_nodes()); +#else + target_node = 0; + nodes_online = 1; +#endif ASSERT(zone < NR_ZONES); if ( unlikely(order > MAX_ORDER) ) @@ -301,50 +376,70 @@ spin_lock(&heap_lock); - /* Find smallest order which can satisfy the request. */ - for ( i = order; i <= MAX_ORDER; i++ ) - if ( !list_empty(&heap[zone][i]) ) - goto found; + /* start with requested node, but exhaust all node memory + * in requested zone before failing */ + for ( i = 0; i < nodes_online; i++ ) { + node = (target_node+i) % nodes_online; + /* Find smallest order which can satisfy the request. 
*/ + for ( j = order; j <= MAX_ORDER; j++ ) { +#ifdef CONFIG_NUMA + list = heap[zone][node]; +#else + list = heap[zone]; +#endif + if ( !list_empty(&list[j]) ) + goto found; + } + } /* No suitable memory blocks. Fail the request. */ spin_unlock(&heap_lock); return NULL; found: - pg = list_entry(heap[zone][i].next, struct page_info, list); + pg = list_entry(list[j].next, struct page_info, list); list_del(&pg->list); /* We may have to halve the chunk a number of times. */ - while ( i != order ) - { - PFN_ORDER(pg) = --i; - list_add_tail(&pg->list, &heap[zone][i]); - pg += 1 << i; + while ( j != order ) + { + PFN_ORDER(pg) = --j; + list_add_tail(&pg->list, &list[j]); + pg += 1 << j; } map_alloc(page_to_mfn(pg), 1 << order); +#ifdef CONFIG_NUMA + avail[zone][node] -= 1 << order; +#else avail[zone] -= 1 << order; +#endif spin_unlock(&heap_lock); return pg; } - -/* Free 2^@order set of pages. */ -void free_heap_pages( - unsigned int zone, struct page_info *pg, unsigned int order) +/* + * helper function for free_heap_pages + * NB: assumes caller holds heap_lock + */ +void merge_pages( + struct page_info *pg, unsigned int zone, unsigned int order) { unsigned long mask; - - ASSERT(zone < NR_ZONES); - ASSERT(order <= MAX_ORDER); - - spin_lock(&heap_lock); - - map_free(page_to_mfn(pg), 1 << order); + struct list_head *list; +#ifdef CONFIG_NUMA + unsigned int node = page_to_node(pg); + + ASSERT((node >= 0) && (node < num_online_nodes())); + avail[zone][node] += 1 << order; + list = heap[zone][node]; +#else avail[zone] += 1 << order; - + list = heap[zone]; +#endif + /* Merge chunks as far as possible. */ while ( order < MAX_ORDER ) { @@ -372,8 +467,42 @@ } PFN_ORDER(pg) = order; - list_add_tail(&pg->list, &heap[zone][order]); - + + list_add_tail(&pg->list, &list[order]); +} + +/* Free 2^@order set of pages. */ +void free_heap_pages( + unsigned int zone, struct page_info *pg, unsigned int order) +{ + ASSERT(zone < NR_ZONES); + ASSERT(order <= MAX_ORDER); + + spin_lock(&heap_lock); + + map_free(page_to_mfn(pg), 1 << order); + +#ifdef CONFIG_NUMA + /* + * If the page list order spans a chunk, halve the region + * until it fits and merge the remaining pages one at a time. + */ + while ( page_spans_chunk(pg, order) ) { + int i; + struct page_info *p; + + ASSERT(order > 0); + + PFN_ORDER(pg) = --order; + for ( i=0; i<(1<<order); i++ ) { + p = pg+(1<<order)+i; + PFN_ORDER(p) = 0; + merge_pages(p, zone, 0); + } + } +#endif + merge_pages(pg, zone, order); + spin_unlock(&heap_lock); } @@ -467,7 +596,7 @@ int i; local_irq_save(flags); - pg = alloc_heap_pages(MEMZONE_XEN, order); + pg = alloc_heap_pages(MEMZONE_XEN, smp_processor_id(), order); local_irq_restore(flags); if ( unlikely(pg == NULL) ) @@ -531,8 +660,8 @@ } -struct page_info *alloc_domheap_pages( - struct domain *d, unsigned int order, unsigned int flags) +struct page_info *__alloc_domheap_pages( + struct domain *d, unsigned int cpu, unsigned int order, unsigned int flags) { struct page_info *pg = NULL; cpumask_t mask; @@ -542,17 +671,17 @@ if ( !(flags & ALLOC_DOM_DMA) ) { - pg = alloc_heap_pages(MEMZONE_DOM, order); + pg = alloc_heap_pages(MEMZONE_DOM, cpu, order); /* Failure? Then check if we can fall back to the DMA pool. 
*/ - if ( unlikely(pg == NULL) && - ((order > MAX_ORDER) || - (avail[MEMZONE_DMADOM] < + if ( unlikely(pg == NULL) + && ((order > MAX_ORDER) || + (avail_heap_pages(MEMZONE_DMADOM,-1) < (lowmem_emergency_pool_pages + (1UL << order)))) ) return NULL; } if ( pg == NULL ) - if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, order)) == NULL ) + if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, cpu, order)) == NULL ) return NULL; mask = pg->u.free.cpumask; @@ -615,6 +744,13 @@ spin_unlock(&d->page_alloc_lock); return pg; +} + +inline struct page_info *alloc_domheap_pages( + struct domain *d, unsigned int order, unsigned int flags) +{ + return __alloc_domheap_pages(d, smp_processor_id(), order, flags); + } @@ -690,13 +826,32 @@ } -unsigned long avail_domheap_pages(void) +u64 avail_heap_pages(int zone, int node) +{ + int i,j; + u64 free_pages = 0; + + for (i=0; i<NR_ZONES; i++) + if ( (zone == -1) || (zone == i) ) + for (j=0; j<num_online_nodes(); j++) + if ( (node == -1) || (node == j) ) +#ifdef CONFIG_NUMA + free_pages += avail[i][j]; +#else + free_pages += avail[i]; +#endif + + return free_pages; +} + +u64 avail_domheap_pages(void) { unsigned long avail_nrm, avail_dma; - - avail_nrm = avail[MEMZONE_DOM]; - - avail_dma = avail[MEMZONE_DMADOM]; + + /* return avail[MEMZONE_DOM] + avail[MEMZONE_DMADOM] */ + avail_nrm = avail_heap_pages(MEMZONE_DOM,-1); + avail_dma = avail_heap_pages(MEMZONE_DMADOM,-1); + if ( avail_dma > lowmem_emergency_pool_pages ) avail_dma -= lowmem_emergency_pool_pages; else @@ -705,16 +860,21 @@ return avail_nrm + avail_dma; } +u64 avail_nodeheap_pages(int node) +{ + return avail_heap_pages(-1, node); +} static void pagealloc_keyhandler(unsigned char key) { printk("Physical memory information:\n"); - printk(" Xen heap: %lukB free\n" - " DMA heap: %lukB free\n" - " Dom heap: %lukB free\n", - avail[MEMZONE_XEN]<<(PAGE_SHIFT-10), - avail[MEMZONE_DMADOM]<<(PAGE_SHIFT-10), - avail[MEMZONE_DOM]<<(PAGE_SHIFT-10)); + + printk(" Xen heap: %"PRIu64"kB free\n" + " DMA heap: %"PRIu64"kB free\n" + " Dom heap: %"PRIu64"kB free\n", + avail_heap_pages(MEMZONE_XEN, -1) << (PAGE_SHIFT-10), + avail_heap_pages(MEMZONE_DMADOM, -1) <<(PAGE_SHIFT-10), + avail_heap_pages(MEMZONE_DOM, -1) <<(PAGE_SHIFT-10)); } diff -r 2a81ffed9e53 -r 44ee2cfd164d xen/include/xen/mm.h --- a/xen/include/xen/mm.h Fri Apr 28 17:56:52 2006 +++ b/xen/include/xen/mm.h Fri Apr 28 18:00:23 2006 @@ -45,7 +45,8 @@ /* Generic allocator. These functions are *not* interrupt-safe. 
 */
 void init_heap_pages(
     unsigned int zone, struct page_info *pg, unsigned long nr_pages);
-struct page_info *alloc_heap_pages(unsigned int zone, unsigned int order);
+struct page_info *alloc_heap_pages(
+    unsigned int zone, unsigned int cpu, unsigned int order);
 void free_heap_pages(
     unsigned int zone, struct page_info *pg, unsigned int order);
 void scrub_heap_pages(void);
@@ -61,8 +62,13 @@
 void init_domheap_pages(paddr_t ps, paddr_t pe);
 struct page_info *alloc_domheap_pages(
     struct domain *d, unsigned int order, unsigned int flags);
+#ifdef CONFIG_NUMA
+struct page_info *__alloc_domheap_pages(
+    struct domain *d, unsigned int cpu, unsigned int order, unsigned int flags);
+#endif
 void free_domheap_pages(struct page_info *pg, unsigned int order);
-unsigned long avail_domheap_pages(void);
+u64 avail_domheap_pages(void);
+u64 avail_heap_pages(int zone, int node);
 
 #define alloc_domheap_page(d) (alloc_domheap_pages(d,0,0))
 #define free_domheap_page(p) (free_domheap_pages(p,0))

diff -r 2a81ffed9e53 -r 44ee2cfd164d xen/include/xen/numa.h
--- a/xen/include/xen/numa.h	Fri Apr 28 17:56:52 2006
+++ b/xen/include/xen/numa.h	Fri Apr 28 18:00:23 2006
@@ -35,6 +35,8 @@
 extern int cpu_to_node[];
 extern cpumask_t node_to_cpumask[];
 
+int page_to_node(struct page_info *pg);
+
 int numa_init(void);
 
 #endif /* _XEN_NUMA_H */
Keir Fraser
2006-May-02 14:17 UTC
Re: [Xen-devel] [PATCH 2/6] xen: Add NUMA support to Xen
On 1 May 2006, at 22:57, Ryan Harper wrote:

> This patch introduces a per-node layer to the buddy allocator. Xen
> currently defines the heap as a two-dimensional array, [zone][order].
> This patch adds a node layer between zone and order. This allows Xen
> to hand memory out in the proper zone while preferring local memory
> allocation, but can fall back on non-local to satisfy a zone request.

Loops over every memory chunk structure on the alloc/free paths aren't
going to get merged. There's no need for it -- in most cases memory
chunks are probably aligned on a MAX_ORDER boundary (or they will be
when I reduce MAX_ORDER, which requires me to fix up our Linux swiotlb
a bit first). When that isn't the case you can simply reserve guard
pages at the start and end of such chunks to avoid cross-chunk merging.

 -- Keir
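For readers following the thread, here is a minimal sketch of the boundary
test being suggested above. It is not part of any posted patch: the chunk
fields, map_alloc() and paddr_to_pfn() come from the patches in this thread,
while MAX_ORDER_MASK, on_max_order_boundary() and reserve_chunk_guards() are
names assumed here for illustration. The point is that a chunk edge only
needs a sacrificial guard page when it is not aligned to a 2^MAX_ORDER-page
block, because buddy merging can never cross an aligned boundary anyway.

    /* Illustrative sketch only -- not from the posted patches. */
    #define MAX_ORDER_MASK  (((u64)PAGE_SIZE << MAX_ORDER) - 1)

    static inline int on_max_order_boundary(u64 paddr)
    {
        /* Aligned to 2^MAX_ORDER pages => buddy merging cannot cross it. */
        return (paddr & MAX_ORDER_MASK) == 0;
    }

    /* Reserve one guard page at each unaligned chunk edge so that free
     * blocks from two different NUMA chunks can never be merged. */
    static void reserve_chunk_guards(node_memory_chunk_t *c)
    {
        if ( !on_max_order_boundary(c->start_paddr) )
            map_alloc(paddr_to_pfn(c->start_paddr), 1);
        /* end_paddr is treated as inclusive here, as page_to_node() does. */
        if ( !on_max_order_boundary(c->end_paddr + 1) )
            map_alloc(paddr_to_pfn(c->end_paddr), 1);
    }

Because the guard page is permanently marked allocated, no free block that
contains it can ever be formed, which is all the allocator needs to keep
blocks from spanning two nodes.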
Ryan Harper
2006-May-02 14:53 UTC
Re: [Xen-devel] [PATCH 2/6] xen: Add NUMA support to Xen
* Keir Fraser <Keir.Fraser@cl.cam.ac.uk> [2006-05-02 09:18]:
> On 1 May 2006, at 22:57, Ryan Harper wrote:
>
> > This patch introduces a per-node layer to the buddy allocator. Xen
> > currently defines the heap as a two-dimensional array, [zone][order].
> > This patch adds a node layer between zone and order. This allows Xen
> > to hand memory out in the proper zone while preferring local memory
> > allocation, but can fall back on non-local to satisfy a zone request.
>
> Loops over every memory chunk structure on the alloc/free paths aren't
> going to get merged. There's no need for it -- in most cases memory
> chunks are probably aligned on a MAX_ORDER boundary (or they will be
> when I reduce MAX_ORDER, which requires me to fix up our Linux swiotlb
> a bit first). When that isn't the case you can simply reserve guard
> pages at the start and end of such chunks to avoid cross-chunk merging.

I'll toss page_spans_chunk() and its user in the free path, use some
guard pages and resubmit. page_to_node() still uses the chunk array to
determine which node a struct page_info belongs to, which is used in
the free path. Is that acceptable?

-- 
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
(512) 838-9253   T/L: 678-9253
ryanh@us.ibm.com
Keir Fraser
2006-May-02 16:22 UTC
Re: [Xen-devel] [PATCH 2/6] xen: Add NUMA support to Xen
On 2 May 2006, at 15:53, Ryan Harper wrote:

>> Loops over every memory chunk structure on the alloc/free paths aren't
>> going to get merged. There's no need for it -- in most cases memory
>> chunks are probably aligned on a MAX_ORDER boundary (or they will be
>> when I reduce MAX_ORDER, which requires me to fix up our Linux swiotlb
>> a bit first). When that isn't the case you can simply reserve guard
>> pages at the start and end of such chunks to avoid cross-chunk merging.
>
> I'll toss page_spans_chunk() and its user in the free path, use some
> guard pages and resubmit.

Great. Please do make it conditional on the start/end not being on a
MAX_ORDER boundary though -- that's a worthwhile optimisation to avoid
the guard page.

> page_to_node() still uses the chunk array to determine which node a
> struct page_info belongs to, which is used in the free path. Is that
> acceptable?

It'll have to do for now. As long as it doesn't suck on small NUMA or
non-NUMA systems we can improve it later (extra flags in each page
structure, or some faster lookup structure).

 -- Keir
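To make the "faster lookup structure" idea concrete, one possible
constant-time page_to_node() is a small byte array that maps each
fixed-size region of physical memory to a node id, in the style of the
memnodemap used by x86_64 Linux. The sketch below is only an illustration
of that idea: memnodemap, memnode_shift, NODEMAPSIZE, fast_page_to_node()
and build_memnodemap() are invented names, not part of the posted patches,
and memnode_shift must be chosen small enough that no chunk boundary falls
inside a single region.

    /* Illustrative sketch only -- not from the posted patches. */
    #define NODEMAPSIZE 0x10000            /* regions covered by the map    */

    static u8  memnodemap[NODEMAPSIZE];    /* region index -> node id       */
    static int memnode_shift;              /* log2 of the region size       */

    static inline int fast_page_to_node(struct page_info *pg)
    {
        return memnodemap[page_to_maddr(pg) >> memnode_shift];
    }

    /* Built once at boot from the SRAT-derived chunk list. */
    static void build_memnodemap(node_memory_chunk_t *chunks, int nr)
    {
        int i;

        memset(memnodemap, 0, sizeof(memnodemap));
        for ( i = 0; i < nr; i++ )
        {
            u64 r, first = chunks[i].start_paddr >> memnode_shift;
            u64 last     = chunks[i].end_paddr   >> memnode_shift;

            for ( r = first; r <= last; r++ )
                memnodemap[r] = chunks[i].nid;
        }
    }

The phys_to_nid() lookup that the final patch in this thread switches to
plays essentially this role, replacing the per-free scan of the chunk array.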
Ryan Harper
2006-May-08 18:16 UTC
Re: [Xen-devel] [PATCH 2/6] xen: Add NUMA support to Xen
* Keir Fraser <Keir.Fraser@cl.cam.ac.uk> [2006-05-02 11:23]:> > On 2 May 2006, at 15:53, Ryan Harper wrote: > > >>Loops over every memory chunk structure on the alloc/free paths aren''t > >>going to get merged. There''s no need for it -- in most cases memory > >>chunks are probably aligned on a MAX_ORDER boundary (or they will be > >>when I reduce MAX_ORDER, which requires me to fix up our Linux swiotlb > >>a bit first). When that isn''t the case you can simply reserve guard > >>pages at the start and end of such chunks to avoid cross-chunk > >>merging. > > > >I''ll toss page_spans_chunk() and the user in the free path, use some > >guard pages and resubmit. > > Great. Please do make it conditional on the start/end not being on a > MAX_ORDER boundary though -- that''s a worthwhile optimisation to avoid > the guard page. >I''ve taken out the CONFIG_NUMA ifdefs and dumped the page_spans_chunk() marking the chunk boundaries if they aren''t on MAX_ORDER boundaries and if not allocated. I''m not clear on the difference between reserving the sensitive pages in the alloc bitmap (via map_alloc()) and the memguard routines. For instance, in init_xenheap_pages(), the range is guarded, and then the range is handed to the heap (call to init_heap_pages() which clears the alloc bitmap). Then in init_domheap_pages(), there are no calls to memguard, just work to set up the range for a call to init_heap_pages(). I''m not sure if I need to use memguard for marking the chunk boundaries, or if just reserving chunk boundaries that weren''t already on a MAX_ORDER edge via map_alloc() is sufficient. Also, I didn''t see a way to ensure reserved pages aren''t freed via a call to init_heap_pages() which just clears out a range of bits in the alloc map. Should we be worried about that? Attached is what the current working patch looks like. Let me know if this is more to your liking. If so, I''ll re-spin the whole patchset and test it across the set of test machines we have (NUMA and non-NUMA). -- Ryan Harper Software Engineer; Linux Technology Center IBM Corp., Austin, Tx (512) 838-9253 T/L: 678-9253 ryanh@us.ibm.com --- diff -r 38ba1fe5009c xen/common/page_alloc.c --- a/xen/common/page_alloc.c Tue May 9 02:23:08 2006 +++ b/xen/common/page_alloc.c Mon May 8 21:27:53 2006 @@ -4,6 +4,7 @@ * Simple buddy heap allocator for Xen. * * Copyright (c) 2002-2004 K A Fraser + * Copyright (c) 2006 IBM * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -34,6 +35,25 @@ #include <xen/domain_page.h> #include <xen/keyhandler.h> #include <asm/page.h> +#include <xen/nodemask.h> +#include <xen/numa.h> + +extern int num_memory_chunks; +extern node_memory_chunk_t node_memory_chunk[]; +extern int cpu_to_node[]; + +/* map a given page_info to the node it came from */ +int page_to_node(struct page_info *pg) +{ + node_memory_chunk_t *c = node_memory_chunk; + u64 pg_paddr = page_to_maddr(pg); + + for (; c < (node_memory_chunk+num_memory_chunks); c++) + if ( (pg_paddr >= c->start_paddr) && (pg_paddr <= c->end_paddr) ) + return (int)c->nid; + + return -1; +} /* * Comma-separated list of hexadecimal page numbers containing bad bytes. @@ -246,22 +266,43 @@ #define pfn_dom_zone_type(_pfn) \ (((_pfn) <= MAX_DMADOM_PFN) ? 
MEMZONE_DMADOM : MEMZONE_DOM) -static struct list_head heap[NR_ZONES][MAX_ORDER+1]; - -static unsigned long avail[NR_ZONES]; +static struct list_head heap[NR_ZONES][MAX_NUMNODES][MAX_ORDER+1]; + +static unsigned long avail[NR_ZONES][MAX_NUMNODES]; static spinlock_t heap_lock = SPIN_LOCK_UNLOCKED; +#define NOT_MAX_ORDER_ALIGNED ((1UL << MAX_ORDER)-1) void end_boot_allocator(void) { - unsigned long i, j; + unsigned long i, j, k; int curr_free = 0, next_free = 0; memset(avail, 0, sizeof(avail)); for ( i = 0; i < NR_ZONES; i++ ) - for ( j = 0; j <= MAX_ORDER; j++ ) - INIT_LIST_HEAD(&heap[i][j]); + for ( j = 0; j < MAX_NUMNODES; j++ ) + for ( k = 0; k <= MAX_ORDER; k++ ) + INIT_LIST_HEAD(&heap[i][j][k]); + + /* mark NUMA chunk boundaries in multi-node systems */ + if ( num_online_nodes() > 1 ) + { + node_memory_chunk_t *c = node_memory_chunk; + + /* sacrifice the ends of a chunk if not MAX_ORDER + aligned to prevent merging across chunks */ + for (; c < (node_memory_chunk+num_memory_chunks); c++ ) + { + if ( (c->start_paddr & NOT_MAX_ORDER_ALIGNED) && + !allocated_in_map(paddr_to_pfn(c->start_paddr)) ) + map_alloc(paddr_to_pfn(c->start_paddr), 1); + + if ( (c->end_paddr & NOT_MAX_ORDER_ALIGNED) && + !allocated_in_map(paddr_to_pfn(c->end_paddr)) ) + map_alloc(paddr_to_pfn(c->end_paddr), 1); + } + } /* Pages that are free now go to the domain sub-allocator. */ for ( i = 0; i < max_page; i++ ) @@ -289,11 +330,14 @@ /* Allocate 2^@order contiguous pages. */ -struct page_info *alloc_heap_pages(unsigned int zone, unsigned int order) -{ - int i; +struct page_info *alloc_heap_pages(unsigned int zone, unsigned int cpu, + unsigned int order) +{ + int i,j, node; struct page_info *pg; + ASSERT(cpu_to_node[cpu] >= 0); + ASSERT(cpu_to_node[cpu] < num_online_nodes()); ASSERT(zone < NR_ZONES); if ( unlikely(order > MAX_ORDER) ) @@ -301,29 +345,36 @@ spin_lock(&heap_lock); - /* Find smallest order which can satisfy the request. */ - for ( i = order; i <= MAX_ORDER; i++ ) - if ( !list_empty(&heap[zone][i]) ) - goto found; + /* start with requested node, but exhaust all node memory + * in requested zone before failing */ + for ( i = 0; i < num_online_nodes(); i++ ) + { + node = (cpu_to_node[cpu]+i) % num_online_nodes(); + /* Find smallest order which can satisfy the request. */ + for ( j = order; j <= MAX_ORDER; j++ ) { + if ( !list_empty(&heap[zone][node][j]) ) + goto found; + } + } /* No suitable memory blocks. Fail the request. */ spin_unlock(&heap_lock); return NULL; found: - pg = list_entry(heap[zone][i].next, struct page_info, list); + pg = list_entry(heap[zone][node][j].next, struct page_info, list); list_del(&pg->list); /* We may have to halve the chunk a number of times. */ - while ( i != order ) - { - PFN_ORDER(pg) = --i; - list_add_tail(&pg->list, &heap[zone][i]); - pg += 1 << i; + while ( j != order ) + { + PFN_ORDER(pg) = --j; + list_add_tail(&pg->list, &heap[zone][node][j]); + pg += 1 << j; } map_alloc(page_to_mfn(pg), 1 << order); - avail[zone] -= 1 << order; + avail[zone][node] -= 1 << order; spin_unlock(&heap_lock); @@ -336,14 +387,17 @@ unsigned int zone, struct page_info *pg, unsigned int order) { unsigned long mask; + int node = page_to_node(pg); ASSERT(zone < NR_ZONES); ASSERT(order <= MAX_ORDER); + ASSERT(node >= 0); + ASSERT(node < num_online_nodes()); spin_lock(&heap_lock); map_free(page_to_mfn(pg), 1 << order); - avail[zone] += 1 << order; + avail[zone][node] += 1 << order; /* Merge chunks as far as possible. 
*/ while ( order < MAX_ORDER ) @@ -372,7 +426,7 @@ } PFN_ORDER(pg) = order; - list_add_tail(&pg->list, &heap[zone][order]); + list_add_tail(&pg->list, &heap[zone][node][order]); spin_unlock(&heap_lock); } @@ -467,7 +521,7 @@ int i; local_irq_save(flags); - pg = alloc_heap_pages(MEMZONE_XEN, order); + pg = alloc_heap_pages(MEMZONE_XEN, smp_processor_id(), order); local_irq_restore(flags); if ( unlikely(pg == NULL) ) @@ -531,8 +585,8 @@ } -struct page_info *alloc_domheap_pages( - struct domain *d, unsigned int order, unsigned int flags) +struct page_info *__alloc_domheap_pages( + struct domain *d, unsigned int cpu, unsigned int order, unsigned int flags) { struct page_info *pg = NULL; cpumask_t mask; @@ -542,17 +596,17 @@ if ( !(flags & ALLOC_DOM_DMA) ) { - pg = alloc_heap_pages(MEMZONE_DOM, order); + pg = alloc_heap_pages(MEMZONE_DOM, cpu, order); /* Failure? Then check if we can fall back to the DMA pool. */ - if ( unlikely(pg == NULL) && - ((order > MAX_ORDER) || - (avail[MEMZONE_DMADOM] < + if ( unlikely(pg == NULL) + && ((order > MAX_ORDER) || + (avail_heap_pages(MEMZONE_DMADOM,-1) < (lowmem_emergency_pool_pages + (1UL << order)))) ) return NULL; } if ( pg == NULL ) - if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, order)) == NULL ) + if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, cpu, order)) == NULL ) return NULL; mask = pg->u.free.cpumask; @@ -615,6 +669,13 @@ spin_unlock(&d->page_alloc_lock); return pg; +} + +inline struct page_info *alloc_domheap_pages( + struct domain *d, unsigned int order, unsigned int flags) +{ + return __alloc_domheap_pages(d, smp_processor_id(), order, flags); + } @@ -690,13 +751,27 @@ } +unsigned long avail_heap_pages(int zone, int node) +{ + int i,j; + unsigned long free_pages = 0; + + for (i=0; i<NR_ZONES; i++) + if ( (zone == -1) || (zone == i) ) + for (j=0; j<num_online_nodes(); j++) + if ( (node == -1) || (node == j) ) + free_pages += avail[i][j]; + + return free_pages; +} + unsigned long avail_domheap_pages(void) { unsigned long avail_nrm, avail_dma; - - avail_nrm = avail[MEMZONE_DOM]; - - avail_dma = avail[MEMZONE_DMADOM]; + + avail_nrm = avail_heap_pages(MEMZONE_DOM,-1); + + avail_dma = avail_heap_pages(MEMZONE_DMADOM,-1); if ( avail_dma > lowmem_emergency_pool_pages ) avail_dma -= lowmem_emergency_pool_pages; else @@ -705,6 +780,10 @@ return avail_nrm + avail_dma; } +unsigned long avail_nodeheap_pages(int node) +{ + return avail_heap_pages(-1, node); +} static void pagealloc_keyhandler(unsigned char key) { @@ -712,9 +791,9 @@ printk(" Xen heap: %lukB free\n" " DMA heap: %lukB free\n" " Dom heap: %lukB free\n", - avail[MEMZONE_XEN]<<(PAGE_SHIFT-10), - avail[MEMZONE_DMADOM]<<(PAGE_SHIFT-10), - avail[MEMZONE_DOM]<<(PAGE_SHIFT-10)); + avail_heap_pages(MEMZONE_XEN, -1) << (PAGE_SHIFT-10), + avail_heap_pages(MEMZONE_DMADOM, -1) <<(PAGE_SHIFT-10), + avail_heap_pages(MEMZONE_DOM, -1) <<(PAGE_SHIFT-10)); } diff -r 38ba1fe5009c xen/include/xen/mm.h --- a/xen/include/xen/mm.h Tue May 9 02:23:08 2006 +++ b/xen/include/xen/mm.h Mon May 8 21:27:53 2006 @@ -45,7 +45,8 @@ /* Generic allocator. These functions are *not* interrupt-safe. 
 */
 void init_heap_pages(
     unsigned int zone, struct page_info *pg, unsigned long nr_pages);
-struct page_info *alloc_heap_pages(unsigned int zone, unsigned int order);
+struct page_info *alloc_heap_pages(
+    unsigned int zone, unsigned int cpu, unsigned int order);
 void free_heap_pages(
     unsigned int zone, struct page_info *pg, unsigned int order);
 void scrub_heap_pages(void);
@@ -61,8 +62,11 @@
 void init_domheap_pages(paddr_t ps, paddr_t pe);
 struct page_info *alloc_domheap_pages(
     struct domain *d, unsigned int order, unsigned int flags);
+struct page_info *__alloc_domheap_pages(
+    struct domain *d, unsigned int cpu, unsigned int order, unsigned int flags);
 void free_domheap_pages(struct page_info *pg, unsigned int order);
 unsigned long avail_domheap_pages(void);
+unsigned long avail_heap_pages(int zone, int node);
 
 #define alloc_domheap_page(d) (alloc_domheap_pages(d,0,0))
 #define free_domheap_page(p) (free_domheap_pages(p,0))

diff -r 38ba1fe5009c xen/include/xen/numa.h
--- a/xen/include/xen/numa.h	Tue May  9 02:23:08 2006
+++ b/xen/include/xen/numa.h	Mon May  8 21:27:53 2006
@@ -35,6 +35,8 @@
 extern int cpu_to_node[];
 extern cpumask_t node_to_cpumask[];
 
+int page_to_node(struct page_info *pg);
+
 int numa_init(void);
 
 #endif /* _XEN_NUMA_H */
Ryan Harper
2006-May-12 15:12 UTC
[Xen-devel] [PATCH 2/6][RESEND] xen: Add NUMA support to Xen
* Ryan Harper <ryanh@us.ibm.com> [2006-05-08 13:18]:> * Keir Fraser <Keir.Fraser@cl.cam.ac.uk> [2006-05-02 11:23]: > > > > On 2 May 2006, at 15:53, Ryan Harper wrote: > > > > >>Loops over every memory chunk structure on the alloc/free paths aren''t > > >>going to get merged. There''s no need for it -- in most cases memory > > >>chunks are probably aligned on a MAX_ORDER boundary (or they will be > > >>when I reduce MAX_ORDER, which requires me to fix up our Linux swiotlb > > >>a bit first). When that isn''t the case you can simply reserve guard > > >>pages at the start and end of such chunks to avoid cross-chunk > > >>merging. > > > > > >I''ll toss page_spans_chunk() and the user in the free path, use some > > >guard pages and resubmit. > > > > Great. Please do make it conditional on the start/end not being on a > > MAX_ORDER boundary though -- that''s a worthwhile optimisation to avoid > > the guard page. > > > > I''ve taken out the CONFIG_NUMA ifdefs and dumped the page_spans_chunk() > marking the chunk boundaries if they aren''t on MAX_ORDER boundaries and > if not allocated. I''m not clear on the difference between > reserving the sensitive pages in the alloc bitmap (via map_alloc()) > and the memguard routines. For instance, in init_xenheap_pages(), > the range is guarded, and then the range is handed to the heap (call to > init_heap_pages() which clears the alloc bitmap). Then in > init_domheap_pages(), there are no calls to memguard, just > work to set up the range for a call to init_heap_pages(). I''m > not sure if I need to use memguard for marking the chunk > boundaries, or if just reserving chunk boundaries that weren''t > already on a MAX_ORDER edge via map_alloc() is sufficient. > > Also, I didn''t see a way to ensure reserved pages aren''t freed via a > call to init_heap_pages() which just clears out a range of bits in > the alloc map. Should we be worried about that? > > Attached is what the current working patch looks like. Let me know if > this is more to your liking. If so, I''ll re-spin the whole patchset and > test it across the set of test machines we have (NUMA and non-NUMA).Same patch, but a proper hg export. Any comments for the above questions? -- Ryan Harper Software Engineer; Linux Technology Center IBM Corp., Austin, Tx (512) 838-9253 T/L: 678-9253 ryanh@us.ibm.com diffstat output: common/page_alloc.c | 153 +++++++++++++++++++++++++++++++++++++++------------- include/xen/mm.h | 6 +- include/xen/numa.h | 2 3 files changed, 123 insertions(+), 38 deletions(-) Signed-off-by: Ryan Harper <ryanh@us.ibm.com> --- # HG changeset patch # User Ryan Harper <ryanh@us.ibm.com> # Node ID 3b140a1d5a7ddb9fcb0b3f63b44accd57bc5b89d # Parent 0f46b7056c02bac728e732c138d392f793679182 This patch introduces a per-node layer to the buddy allocator. Xen currently defines the heap as a two-dimensional array, [zone][order]. This patch adds a node layer between zone and order. This allows Xen to hand memory out in the proper zone while preferring local memory allocation, but can fall-back on non-local to satisfy a zone request. When the heap is initialized, for each page that is added we determine the node to which the page belongs and insert into the proper zone, node and order, while checking NUMA chunk boundaries and reserving a guard page when the ends of the chunk are not aligned with MAX_ORDER preventing cross-chunk merging. When allocating memory via the alloc_heap_pages() function, we try to satisfy the zone request in the target node which is determined by the requesting cpu. 
If no memory is found in the target node for a given zone, we examine other nodes before increasing the order of the memory request. Existing heap API has been preserved and uses smp_processor_id() to supply the required parameter to alloc_heap_pages() and alloc_domheap_pages. Also, Xen code can directly call alloc_heap_pages()/__alloc_domheap_pages() supplying the required cpu parameter to request pages local to the processor in question. avail_heap_pages() and avail_domheap_pages() have been altered to provide an eaiser method for querying total available memory given a zone or node. diff -r 0f46b7056c02 -r 3b140a1d5a7d xen/common/page_alloc.c --- a/xen/common/page_alloc.c Thu May 11 20:43:34 2006 +++ b/xen/common/page_alloc.c Thu May 11 20:47:16 2006 @@ -4,6 +4,7 @@ * Simple buddy heap allocator for Xen. * * Copyright (c) 2002-2004 K A Fraser + * Copyright (c) 2006 IBM * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -34,6 +35,25 @@ #include <xen/domain_page.h> #include <xen/keyhandler.h> #include <asm/page.h> +#include <xen/nodemask.h> +#include <xen/numa.h> + +extern int num_memory_chunks; +extern node_memory_chunk_t node_memory_chunk[]; +extern int cpu_to_node[]; + +/* map a given page_info to the node it came from */ +int page_to_node(struct page_info *pg) +{ + node_memory_chunk_t *c = node_memory_chunk; + u64 pg_paddr = page_to_maddr(pg); + + for (; c < (node_memory_chunk+num_memory_chunks); c++) + if ( (pg_paddr >= c->start_paddr) && (pg_paddr <= c->end_paddr) ) + return (int)c->nid; + + return -1; +} /* * Comma-separated list of hexadecimal page numbers containing bad bytes. @@ -246,22 +266,43 @@ #define pfn_dom_zone_type(_pfn) \ (((_pfn) <= MAX_DMADOM_PFN) ? MEMZONE_DMADOM : MEMZONE_DOM) -static struct list_head heap[NR_ZONES][MAX_ORDER+1]; - -static unsigned long avail[NR_ZONES]; +static struct list_head heap[NR_ZONES][MAX_NUMNODES][MAX_ORDER+1]; + +static unsigned long avail[NR_ZONES][MAX_NUMNODES]; static spinlock_t heap_lock = SPIN_LOCK_UNLOCKED; +#define NOT_MAX_ORDER_ALIGNED ((1UL << MAX_ORDER)-1) void end_boot_allocator(void) { - unsigned long i, j; + unsigned long i, j, k; int curr_free = 0, next_free = 0; memset(avail, 0, sizeof(avail)); for ( i = 0; i < NR_ZONES; i++ ) - for ( j = 0; j <= MAX_ORDER; j++ ) - INIT_LIST_HEAD(&heap[i][j]); + for ( j = 0; j < MAX_NUMNODES; j++ ) + for ( k = 0; k <= MAX_ORDER; k++ ) + INIT_LIST_HEAD(&heap[i][j][k]); + + /* mark NUMA chunk boundaries in multi-node systems */ + if ( num_online_nodes() > 1 ) + { + node_memory_chunk_t *c = node_memory_chunk; + + /* sacrifice the ends of a chunk if not MAX_ORDER + aligned to prevent merging across chunks */ + for (; c < (node_memory_chunk+num_memory_chunks); c++ ) + { + if ( (c->start_paddr & NOT_MAX_ORDER_ALIGNED) && + !allocated_in_map(paddr_to_pfn(c->start_paddr)) ) + map_alloc(paddr_to_pfn(c->start_paddr), 1); + + if ( (c->end_paddr & NOT_MAX_ORDER_ALIGNED) && + !allocated_in_map(paddr_to_pfn(c->end_paddr)) ) + map_alloc(paddr_to_pfn(c->end_paddr), 1); + } + } /* Pages that are free now go to the domain sub-allocator. */ for ( i = 0; i < max_page; i++ ) @@ -289,11 +330,14 @@ /* Allocate 2^@order contiguous pages. 
*/ -struct page_info *alloc_heap_pages(unsigned int zone, unsigned int order) -{ - int i; +struct page_info *alloc_heap_pages(unsigned int zone, unsigned int cpu, + unsigned int order) +{ + int i,j, node; struct page_info *pg; + ASSERT(cpu_to_node[cpu] >= 0); + ASSERT(cpu_to_node[cpu] < num_online_nodes()); ASSERT(zone < NR_ZONES); if ( unlikely(order > MAX_ORDER) ) @@ -301,29 +345,36 @@ spin_lock(&heap_lock); - /* Find smallest order which can satisfy the request. */ - for ( i = order; i <= MAX_ORDER; i++ ) - if ( !list_empty(&heap[zone][i]) ) - goto found; + /* start with requested node, but exhaust all node memory + * in requested zone before failing */ + for ( i = 0; i < num_online_nodes(); i++ ) + { + node = (cpu_to_node[cpu]+i) % num_online_nodes(); + /* Find smallest order which can satisfy the request. */ + for ( j = order; j <= MAX_ORDER; j++ ) { + if ( !list_empty(&heap[zone][node][j]) ) + goto found; + } + } /* No suitable memory blocks. Fail the request. */ spin_unlock(&heap_lock); return NULL; found: - pg = list_entry(heap[zone][i].next, struct page_info, list); + pg = list_entry(heap[zone][node][j].next, struct page_info, list); list_del(&pg->list); /* We may have to halve the chunk a number of times. */ - while ( i != order ) - { - PFN_ORDER(pg) = --i; - list_add_tail(&pg->list, &heap[zone][i]); - pg += 1 << i; + while ( j != order ) + { + PFN_ORDER(pg) = --j; + list_add_tail(&pg->list, &heap[zone][node][j]); + pg += 1 << j; } map_alloc(page_to_mfn(pg), 1 << order); - avail[zone] -= 1 << order; + avail[zone][node] -= 1 << order; spin_unlock(&heap_lock); @@ -336,14 +387,17 @@ unsigned int zone, struct page_info *pg, unsigned int order) { unsigned long mask; + int node = page_to_node(pg); ASSERT(zone < NR_ZONES); ASSERT(order <= MAX_ORDER); + ASSERT(node >= 0); + ASSERT(node < num_online_nodes()); spin_lock(&heap_lock); map_free(page_to_mfn(pg), 1 << order); - avail[zone] += 1 << order; + avail[zone][node] += 1 << order; /* Merge chunks as far as possible. */ while ( order < MAX_ORDER ) @@ -372,7 +426,7 @@ } PFN_ORDER(pg) = order; - list_add_tail(&pg->list, &heap[zone][order]); + list_add_tail(&pg->list, &heap[zone][node][order]); spin_unlock(&heap_lock); } @@ -467,7 +521,7 @@ int i; local_irq_save(flags); - pg = alloc_heap_pages(MEMZONE_XEN, order); + pg = alloc_heap_pages(MEMZONE_XEN, smp_processor_id(), order); local_irq_restore(flags); if ( unlikely(pg == NULL) ) @@ -531,8 +585,8 @@ } -struct page_info *alloc_domheap_pages( - struct domain *d, unsigned int order, unsigned int flags) +struct page_info *__alloc_domheap_pages( + struct domain *d, unsigned int cpu, unsigned int order, unsigned int flags) { struct page_info *pg = NULL; cpumask_t mask; @@ -542,17 +596,17 @@ if ( !(flags & ALLOC_DOM_DMA) ) { - pg = alloc_heap_pages(MEMZONE_DOM, order); + pg = alloc_heap_pages(MEMZONE_DOM, cpu, order); /* Failure? Then check if we can fall back to the DMA pool. 
*/ - if ( unlikely(pg == NULL) && - ((order > MAX_ORDER) || - (avail[MEMZONE_DMADOM] < + if ( unlikely(pg == NULL) + && ((order > MAX_ORDER) || + (avail_heap_pages(MEMZONE_DMADOM,-1) < (lowmem_emergency_pool_pages + (1UL << order)))) ) return NULL; } if ( pg == NULL ) - if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, order)) == NULL ) + if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, cpu, order)) == NULL ) return NULL; mask = pg->u.free.cpumask; @@ -615,6 +669,13 @@ spin_unlock(&d->page_alloc_lock); return pg; +} + +inline struct page_info *alloc_domheap_pages( + struct domain *d, unsigned int order, unsigned int flags) +{ + return __alloc_domheap_pages(d, smp_processor_id(), order, flags); + } @@ -690,13 +751,27 @@ } +unsigned long avail_heap_pages(int zone, int node) +{ + int i,j; + unsigned long free_pages = 0; + + for (i=0; i<NR_ZONES; i++) + if ( (zone == -1) || (zone == i) ) + for (j=0; j<num_online_nodes(); j++) + if ( (node == -1) || (node == j) ) + free_pages += avail[i][j]; + + return free_pages; +} + unsigned long avail_domheap_pages(void) { unsigned long avail_nrm, avail_dma; - - avail_nrm = avail[MEMZONE_DOM]; - - avail_dma = avail[MEMZONE_DMADOM]; + + avail_nrm = avail_heap_pages(MEMZONE_DOM,-1); + + avail_dma = avail_heap_pages(MEMZONE_DMADOM,-1); if ( avail_dma > lowmem_emergency_pool_pages ) avail_dma -= lowmem_emergency_pool_pages; else @@ -705,6 +780,10 @@ return avail_nrm + avail_dma; } +unsigned long avail_nodeheap_pages(int node) +{ + return avail_heap_pages(-1, node); +} static void pagealloc_keyhandler(unsigned char key) { @@ -712,9 +791,9 @@ printk(" Xen heap: %lukB free\n" " DMA heap: %lukB free\n" " Dom heap: %lukB free\n", - avail[MEMZONE_XEN]<<(PAGE_SHIFT-10), - avail[MEMZONE_DMADOM]<<(PAGE_SHIFT-10), - avail[MEMZONE_DOM]<<(PAGE_SHIFT-10)); + avail_heap_pages(MEMZONE_XEN, -1) << (PAGE_SHIFT-10), + avail_heap_pages(MEMZONE_DMADOM, -1) <<(PAGE_SHIFT-10), + avail_heap_pages(MEMZONE_DOM, -1) <<(PAGE_SHIFT-10)); } diff -r 0f46b7056c02 -r 3b140a1d5a7d xen/include/xen/mm.h --- a/xen/include/xen/mm.h Thu May 11 20:43:34 2006 +++ b/xen/include/xen/mm.h Thu May 11 20:47:16 2006 @@ -45,7 +45,8 @@ /* Generic allocator. These functions are *not* interrupt-safe. 
 */
 void init_heap_pages(
     unsigned int zone, struct page_info *pg, unsigned long nr_pages);
-struct page_info *alloc_heap_pages(unsigned int zone, unsigned int order);
+struct page_info *alloc_heap_pages(
+    unsigned int zone, unsigned int cpu, unsigned int order);
 void free_heap_pages(
     unsigned int zone, struct page_info *pg, unsigned int order);
 void scrub_heap_pages(void);
@@ -61,8 +62,11 @@
 void init_domheap_pages(paddr_t ps, paddr_t pe);
 struct page_info *alloc_domheap_pages(
     struct domain *d, unsigned int order, unsigned int flags);
+struct page_info *__alloc_domheap_pages(
+    struct domain *d, unsigned int cpu, unsigned int order, unsigned int flags);
 void free_domheap_pages(struct page_info *pg, unsigned int order);
 unsigned long avail_domheap_pages(void);
+unsigned long avail_heap_pages(int zone, int node);
 
 #define alloc_domheap_page(d) (alloc_domheap_pages(d,0,0))
 #define free_domheap_page(p) (free_domheap_pages(p,0))

diff -r 0f46b7056c02 -r 3b140a1d5a7d xen/include/xen/numa.h
--- a/xen/include/xen/numa.h	Thu May 11 20:43:34 2006
+++ b/xen/include/xen/numa.h	Thu May 11 20:47:16 2006
@@ -35,6 +35,8 @@
 extern int cpu_to_node[];
 extern cpumask_t node_to_cpumask[];
 
+int page_to_node(struct page_info *pg);
+
 int numa_init(void);
 
 #endif /* _XEN_NUMA_H */
Keir Fraser
2006-May-13 09:27 UTC
[Xen-devel] Re: [PATCH 2/6][RESEND] xen: Add NUMA support to Xen
On 12 May 2006, at 16:12, Ryan Harper wrote:

>> Then in init_domheap_pages(), there are no calls to memguard, just
>> work to set up the range for a call to init_heap_pages(). I'm not
>> sure if I need to use memguard for marking the chunk boundaries, or
>> if just reserving chunk boundaries that weren't already on a
>> MAX_ORDER edge via map_alloc() is sufficient.

Just reserving is sufficient. memguard is a debug-build aid for the xen
heap only.

>> Also, I didn't see a way to ensure reserved pages aren't freed via a
>> call to init_heap_pages() which just clears out a range of bits in
>> the alloc map. Should we be worried about that?

Well, in theory. It's only called in one place though right now, and
probably with a physical range below 128MB in all cases. So it's
unlikely to straddle a NUMA boundary.

Another comment on this patch (2/2): page_to_node() should be defined
in numa.c (for now) rather than page_alloc.c (where it will never
belong). I know you also directly scan the chunk array to find
boundaries to reserve: perhaps for now we could have
end_boot_allocator() call init_heap_pages() for each page, then
init_heap_pages() can compare page_to_node(page) with
page_to_node(page-1). If they differ and the latter is not -1 and page
is not on a MAX_ORDER boundary, then you do not free the page to the
buddy allocator.

Clearly this will be crappily slow, but it's only used at boot time and
as long as it's not *too* bad (which it certainly won't be for short
chunk lists) then it'll do until we improve things (probably with a
fast constant-time page_to_node() implementation).

The main thing here is to get the interfaces right, and the current
export of the memory_chunk structure and array is definitely not right.
I'd like it to remain hidden in numa.c/srat.c as it is on Linux, or at
least hidden in arch/x86. It also lets us stub out page_to_node()
easily for ia64, and breaks them a whole lot less.

 -- Keir

>> Attached is what the current working patch looks like. Let me know if
>> this is more to your liking. If so, I'll re-spin the whole patchset
>> and test it across the set of test machines we have (NUMA and non-NUMA).
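The interface split being asked for could look something like the sketch
below. This is an assumption about the intended shape rather than code from
the thread: a NUMA-capable architecture provides a real lookup backed by its
SRAT data, while other architectures or non-NUMA builds supply a trivial
stub, so page_alloc.c never sees node_memory_chunk[] directly. The
phys_to_nid() name matches the reworked patch later in the thread; the stub
itself is illustrative.

    /* Illustrative sketch only -- not from the posted patches. */

    /* asm/numa.h on a NUMA-capable architecture: */
    extern int phys_to_nid(paddr_t paddr);  /* backed by the SRAT chunk list */

    /* asm/numa.h on an architecture without NUMA support (or a !NUMA build): */
    static inline int phys_to_nid(paddr_t paddr)
    {
        return 0;                           /* everything lives on node 0 */
    }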
Ryan Harper
2006-May-31 19:14 UTC
Re: [Xen-devel] Re: [PATCH 2/6][RESEND] xen: Add NUMA support to Xen
* Keir Fraser <Keir.Fraser@cl.cam.ac.uk> [2006-05-13 04:33]:> > On 12 May 2006, at 16:12, Ryan Harper wrote: > > >>Then in > >>init_domheap_pages(), there are no calls to memguard, just > >>work to set up the range for a call to init_heap_pages(). I''m > >>not sure if I need to use memguard for marking the chunk > >>boundaries, or if just reserving chunk boundaries that weren''t > >>already on a MAX_ORDER edge via map_alloc() is sufficient. > > Just reserving is sufficient. memguard is a debug-build aid for xen > heap only. > > >>Also, I didn''t see a way to ensure reserved pages aren''t freed via a > >>call to init_heap_pages() which just clears out a range of bits in > >>the alloc map. Should we be worried about that? > > Well, in theory. It''s only called in one place though right now, and > probably with a physical range below 128MB in all cases. So it''s > unlikely to straddle a NUMA boundary.OK.> > Another comment on this patch (2/2): page_to_node() should be defined > in numa.c (for now) rather than page_alloc.c (where it will never > belong). I know you also directly scan the chunk array to find > boundaries to reserve: perhaps for now we could have > end_boot_allocator() call init_heap_pages() for each page, then > initheap_pages() can compare page_to_node(page) with > page_to_node(page-1). If they differ and the latter is not -1 and page > is not on a MAX_ORDER boundary, then you do not free the page to the > buddy allocator. > > Clearly this will be crappily slow, but it''s only used at boot time and > as long as it''s not *too* bad (which it certainly won''t be for short > chunk lists) then it''ll do until we improve things (probably with a > fast constant-time page_to_node() implementation).I''ve re-worked this patch to use the new ACPI/x86_64 NUMA infrastructure I just resent. Now page_alloc.c only uses MAX_NUMNODES and the phys_to_nid() function. I''ve modifed end_boot_allocator/init_heap_pages as described above. -- Ryan Harper Software Engineer; Linux Technology Center IBM Corp., Austin, Tx (512) 838-9253 T/L: 678-9253 ryanh@us.ibm.com diffstat output: common/page_alloc.c | 198 ++++++++++++++++++++++++++++++++++++++++------------ include/xen/mm.h | 6 + 2 files changed, 160 insertions(+), 44 deletions(-) Signed-off-by: Ryan Harper <ryanh@us.ibm.com> --- diff -r 99e60f5df1d8 xen/common/page_alloc.c --- a/xen/common/page_alloc.c Wed May 31 15:36:41 2006 +++ b/xen/common/page_alloc.c Wed May 31 14:10:01 2006 @@ -4,6 +4,7 @@ * Simple buddy heap allocator for Xen. * * Copyright (c) 2002-2004 K A Fraser + * Copyright (c) 2006 IBM Ryan Harper <ryanh@us.ibm.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -33,6 +34,7 @@ #include <xen/shadow.h> #include <xen/domain_page.h> #include <xen/keyhandler.h> +#include <asm/numa.h> #include <asm/page.h> /* @@ -246,22 +248,23 @@ #define pfn_dom_zone_type(_pfn) \ (((_pfn) <= MAX_DMADOM_PFN) ? 
MEMZONE_DMADOM : MEMZONE_DOM) -static struct list_head heap[NR_ZONES][MAX_ORDER+1]; - -static unsigned long avail[NR_ZONES]; +static struct list_head heap[NR_ZONES][MAX_NUMNODES][MAX_ORDER+1]; + +static unsigned long avail[NR_ZONES][MAX_NUMNODES]; static spinlock_t heap_lock = SPIN_LOCK_UNLOCKED; void end_boot_allocator(void) { - unsigned long i, j; + unsigned long i, j, k; int curr_free = 0, next_free = 0; memset(avail, 0, sizeof(avail)); for ( i = 0; i < NR_ZONES; i++ ) - for ( j = 0; j <= MAX_ORDER; j++ ) - INIT_LIST_HEAD(&heap[i][j]); + for ( j = 0; j < MAX_NUMNODES; j++ ) + for ( k = 0; k <= MAX_ORDER; k++ ) + INIT_LIST_HEAD(&heap[i][j][k]); /* Pages that are free now go to the domain sub-allocator. */ for ( i = 0; i < max_page; i++ ) @@ -271,29 +274,58 @@ if ( next_free ) map_alloc(i+1, 1); /* prevent merging in free_heap_pages() */ if ( curr_free ) - free_heap_pages(pfn_dom_zone_type(i), mfn_to_page(i), 0); - } -} - -/* Hand the specified arbitrary page range to the specified heap zone. */ + init_heap_pages(pfn_dom_zone_type(i), mfn_to_page(i), 1); + } +} + +/* + * Hand the specified arbitrary page range to the specified heap zone + * checking the node_id of the previous page. If they differ and the + * latter is not on a MAX_ORDER boundary, then we reserve the page by + * not freeing it to the buddy allocator. + */ +#define MAX_ORDER_ALIGNED (1UL << (MAX_ORDER)) void init_heap_pages( unsigned int zone, struct page_info *pg, unsigned long nr_pages) { + unsigned int nid_curr,nid_prev; unsigned long i; ASSERT(zone < NR_ZONES); + if ( likely(page_to_mfn(pg) != 0) ) + nid_prev = phys_to_nid(page_to_maddr(pg-1)); + else + nid_prev = phys_to_nid(page_to_maddr(pg)); + for ( i = 0; i < nr_pages; i++ ) - free_heap_pages(zone, pg+i, 0); -} - + { + nid_curr = phys_to_nid(page_to_maddr(pg+i)); + + /* + * free pages of the same node, or if they differ, but are on a + * MAX_ORDER alignement boundary (which already get reserved) + */ + if ( (nid_curr == nid_prev) || (page_to_maddr(pg+i) & + MAX_ORDER_ALIGNED) ) + free_heap_pages(zone, pg+i, 0); + else + printk("Reserving non-aligned node boundary @ mfn %lu\n", + page_to_mfn(pg+i)); + + nid_prev = nid_curr; + } +} /* Allocate 2^@order contiguous pages. */ -struct page_info *alloc_heap_pages(unsigned int zone, unsigned int order) -{ - int i; +struct page_info *alloc_heap_pages(unsigned int zone, unsigned int cpu, + unsigned int order) +{ + int i,j, node; struct page_info *pg; + ASSERT(cpu_to_node[cpu] >= 0); + ASSERT(cpu_to_node[cpu] < num_online_nodes()); ASSERT(zone < NR_ZONES); if ( unlikely(order > MAX_ORDER) ) @@ -301,29 +333,38 @@ spin_lock(&heap_lock); - /* Find smallest order which can satisfy the request. */ - for ( i = order; i <= MAX_ORDER; i++ ) - if ( !list_empty(&heap[zone][i]) ) - goto found; + /* start with requested node, but exhaust all node memory + * in requested zone before failing */ + for ( i = 0; i < num_online_nodes(); i++ ) + { + node = (cpu_to_node[cpu]+i) % num_online_nodes(); + /* Find smallest order which can satisfy the request. */ + for ( j = order; j <= MAX_ORDER; j++ ) + { + if ( !list_empty(&heap[zone][node][j]) ) + goto found; + } + } /* No suitable memory blocks. Fail the request. */ spin_unlock(&heap_lock); return NULL; found: - pg = list_entry(heap[zone][i].next, struct page_info, list); + pg = list_entry(heap[zone][node][j].next, struct page_info, list); list_del(&pg->list); /* We may have to halve the chunk a number of times. 
*/ - while ( i != order ) - { - PFN_ORDER(pg) = --i; - list_add_tail(&pg->list, &heap[zone][i]); - pg += 1 << i; + while ( j != order ) + { + PFN_ORDER(pg) = --j; + list_add_tail(&pg->list, &heap[zone][node][j]); + pg += 1 << j; } map_alloc(page_to_mfn(pg), 1 << order); - avail[zone] -= 1 << order; + ASSERT(avail[zone][node] >= (1 << order)); + avail[zone][node] -= 1 << order; spin_unlock(&heap_lock); @@ -336,14 +377,17 @@ unsigned int zone, struct page_info *pg, unsigned int order) { unsigned long mask; + int node = phys_to_nid(page_to_maddr(pg)); ASSERT(zone < NR_ZONES); ASSERT(order <= MAX_ORDER); + ASSERT(node >= 0); + ASSERT(node < num_online_nodes()); spin_lock(&heap_lock); map_free(page_to_mfn(pg), 1 << order); - avail[zone] += 1 << order; + avail[zone][node] += 1 << order; /* Merge chunks as far as possible. */ while ( order < MAX_ORDER ) @@ -369,10 +413,13 @@ } order++; + + /* after merging, pg should be in the same node */ + ASSERT(phys_to_nid(page_to_maddr(pg)) == node ); } PFN_ORDER(pg) = order; - list_add_tail(&pg->list, &heap[zone][order]); + list_add_tail(&pg->list, &heap[zone][node][order]); spin_unlock(&heap_lock); } @@ -467,7 +514,7 @@ int i; local_irq_save(flags); - pg = alloc_heap_pages(MEMZONE_XEN, order); + pg = alloc_heap_pages(MEMZONE_XEN, smp_processor_id(), order); local_irq_restore(flags); if ( unlikely(pg == NULL) ) @@ -531,8 +578,8 @@ } -struct page_info *alloc_domheap_pages( - struct domain *d, unsigned int order, unsigned int flags) +struct page_info *__alloc_domheap_pages( + struct domain *d, unsigned int cpu, unsigned int order, unsigned int flags) { struct page_info *pg = NULL; cpumask_t mask; @@ -542,17 +589,17 @@ if ( !(flags & ALLOC_DOM_DMA) ) { - pg = alloc_heap_pages(MEMZONE_DOM, order); + pg = alloc_heap_pages(MEMZONE_DOM, cpu, order); /* Failure? Then check if we can fall back to the DMA pool. 
*/ if ( unlikely(pg == NULL) && ((order > MAX_ORDER) || - (avail[MEMZONE_DMADOM] < + (avail_heap_pages(MEMZONE_DMADOM,-1) < (lowmem_emergency_pool_pages + (1UL << order)))) ) return NULL; } if ( pg == NULL ) - if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, order)) == NULL ) + if ( (pg = alloc_heap_pages(MEMZONE_DMADOM, cpu, order)) == NULL ) return NULL; mask = pg->u.free.cpumask; @@ -615,6 +662,13 @@ spin_unlock(&d->page_alloc_lock); return pg; +} + +inline struct page_info *alloc_domheap_pages( + struct domain *d, unsigned int order, unsigned int flags) +{ + return __alloc_domheap_pages(d, smp_processor_id(), order, flags); + } @@ -690,13 +744,27 @@ } +unsigned long avail_heap_pages(int zone, int node) +{ + int i,j; + unsigned long free_pages = 0; + + for (i=0; i<NR_ZONES; i++) + if ( (zone == -1) || (zone == i) ) + for (j=0; j<num_online_nodes(); j++) + if ( (node == -1) || (node == j) ) + free_pages += avail[i][j]; + + return free_pages; +} + unsigned long avail_domheap_pages(void) { unsigned long avail_nrm, avail_dma; - - avail_nrm = avail[MEMZONE_DOM]; - - avail_dma = avail[MEMZONE_DMADOM]; + + avail_nrm = avail_heap_pages(MEMZONE_DOM,-1); + + avail_dma = avail_heap_pages(MEMZONE_DMADOM,-1); if ( avail_dma > lowmem_emergency_pool_pages ) avail_dma -= lowmem_emergency_pool_pages; else @@ -705,6 +773,10 @@ return avail_nrm + avail_dma; } +unsigned long avail_nodeheap_pages(int node) +{ + return avail_heap_pages(-1, node); +} static void pagealloc_keyhandler(unsigned char key) { @@ -712,9 +784,9 @@ printk(" Xen heap: %lukB free\n" " DMA heap: %lukB free\n" " Dom heap: %lukB free\n", - avail[MEMZONE_XEN]<<(PAGE_SHIFT-10), - avail[MEMZONE_DMADOM]<<(PAGE_SHIFT-10), - avail[MEMZONE_DOM]<<(PAGE_SHIFT-10)); + avail_heap_pages(MEMZONE_XEN, -1) << (PAGE_SHIFT-10), + avail_heap_pages(MEMZONE_DMADOM, -1) <<(PAGE_SHIFT-10), + avail_heap_pages(MEMZONE_DOM, -1) <<(PAGE_SHIFT-10)); } @@ -776,6 +848,46 @@ } while ( (NOW() - start) < MILLISECS(1) ); } +static unsigned long count_bucket(struct list_head* l, int order) +{ + unsigned long total_pages = 0; + int pages = 1 << order; + struct page_info *pg; + + list_for_each_entry(pg, l, list) + total_pages += pages; + + return total_pages; +} + +static void dump_heap(unsigned char key) +{ + s_time_t now = NOW(); + int i,j,k; + unsigned long total; + + printk("''%c'' pressed -> dumping heap info (now-0x%X:%08X)\n", key, + (u32)(now>>32), (u32)now); + + for (i=0; i<NR_ZONES; i++ ) + for (j=0;j<MAX_NUMNODES;j++) + for (k=0;k<=MAX_ORDER;k++) + if ( !list_empty(&heap[i][j][k]) ) + { + total = count_bucket(&heap[i][j][k], k); + printk("heap[%d][%d][%d]-> %lu pages\n", + i, j, k, total); + } +} + +static __init int register_heap_trigger(void) +{ + register_keyhandler(''H'', dump_heap, "dump heap info"); + return 0; +} +__initcall(register_heap_trigger); + + static __init int page_scrub_init(void) { open_softirq(PAGE_SCRUB_SOFTIRQ, page_scrub_softirq); diff -r 99e60f5df1d8 xen/include/xen/mm.h --- a/xen/include/xen/mm.h Wed May 31 15:36:41 2006 +++ b/xen/include/xen/mm.h Wed May 31 14:10:01 2006 @@ -45,7 +45,8 @@ /* Generic allocator. These functions are *not* interrupt-safe. 
 */
 void init_heap_pages(
     unsigned int zone, struct page_info *pg, unsigned long nr_pages);
-struct page_info *alloc_heap_pages(unsigned int zone, unsigned int order);
+struct page_info *alloc_heap_pages(
+    unsigned int zone, unsigned int cpu, unsigned int order);
 void free_heap_pages(
     unsigned int zone, struct page_info *pg, unsigned int order);
 void scrub_heap_pages(void);
@@ -61,8 +62,11 @@
 void init_domheap_pages(paddr_t ps, paddr_t pe);
 struct page_info *alloc_domheap_pages(
     struct domain *d, unsigned int order, unsigned int flags);
+struct page_info *__alloc_domheap_pages(
+    struct domain *d, unsigned int cpu, unsigned int order, unsigned int flags);
 void free_domheap_pages(struct page_info *pg, unsigned int order);
 unsigned long avail_domheap_pages(void);
+unsigned long avail_heap_pages(int zone, int node);
 
 #define alloc_domheap_page(d) (alloc_domheap_pages(d,0,0))
 #define free_domheap_page(p) (free_domheap_pages(p,0))