Malcolm Crossley
2013-Sep-30 12:35 UTC
[PATCH v2] Xen: Spread boot time page scrubbing across all available CPUs
The page scrubbing is done in 128MB chunks in lockstep across all the CPUs.
This allows the boot CPU to hold the heap_lock whilst each chunk is being
scrubbed and then release the heap_lock when all CPUs have finished
scrubbing their individual chunk. This means the heap_lock is not held
continuously and pending softirqs are serviced periodically across all CPUs.

The page scrub memory chunks are allocated to the CPUs in a NUMA aware
fashion to reduce socket interconnect overhead and improve performance.

This patch reduces the boot page scrub time on a 128GB 64 core AMD Opteron
6386 machine from 49 seconds to 3 seconds.

Changes in v2
- Reduced default chunk size to 128MB
- Added code to scrub NUMA nodes with no active CPU linked to them
- Be robust to boot CPU not being linked to a NUMA node

diff -r a03cc3136759 -r ee1108d26fc5 docs/misc/xen-command-line.markdown
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -188,6 +188,16 @@ Scrub free RAM during boot. This is a s
 accidentally leaking sensitive VM data into other VMs if Xen crashes
 and reboots.
 
+### bootscrub_blocksize
+> `= <size>`
+
+> Default: `128MiB`
+
+Maximum RAM block size to be scrubbed whilst holding the page heap lock and not
+running softirqs. Reduce this if softirqs are not being run frequently enough.
+Setting this to a high value may cause boot failure, particularly if the
+NMI watchdog is also enabled.
+
 ### cachesize
 > `= <size>`
 
diff -r a03cc3136759 -r ee1108d26fc5 xen/common/page_alloc.c
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -65,6 +65,12 @@ static bool_t opt_bootscrub __initdata
 boolean_param("bootscrub", opt_bootscrub);
 
 /*
+ * bootscrub_blocksize -> Size (bytes) of mem block to scrub with heaplock held
+ */
+static unsigned int __initdata opt_bootscrub_blocksize = 128 * 1024 * 1024;
+size_param("bootscrub_blocksize", opt_bootscrub_blocksize);
+
+/*
  * Bit width of the DMA heap -- used to override NUMA-node-first.
  * allocation strategy, which can otherwise exhaust low memory.
  */
@@ -90,6 +96,16 @@ static struct bootmem_region {
 } *__initdata bootmem_region_list;
 static unsigned int __initdata nr_bootmem_regions;
 
+static atomic_t __initdata bootscrub_count = ATOMIC_INIT(0);
+
+struct scrub_region {
+    u64 offset;
+    u64 start;
+    u64 chunk_size;
+    u64 cpu_block_size;
+};
+static struct scrub_region __initdata region[MAX_NUMNODES];
+
 static void __init boot_bug(int line)
 {
     panic("Boot BUG at %s:%d\n", __FILE__, line);
@@ -1254,28 +1270,44 @@ void __init end_boot_allocator(void)
     printk("\n");
 }
 
-/*
- * Scrub all unallocated pages in all heap zones. This function is more
- * convoluted than appears necessary because we do not want to continuously
- * hold the lock while scrubbing very large memory areas.
- */
-void __init scrub_heap_pages(void)
+void __init smp_scrub_heap_pages(void *data)
 {
-    unsigned long mfn;
+    unsigned long mfn, start_mfn, end_mfn;
     struct page_info *pg;
+    struct scrub_region *region = data;
+    unsigned int temp_cpu, local_node, local_cpu_index = 0;
+    unsigned int cpu = smp_processor_id();
 
-    if ( !opt_bootscrub )
-        return;
+    ASSERT(region != NULL);
 
-    printk("Scrubbing Free RAM: ");
+    local_node = cpu_to_node(cpu);
+    /* Determine if we are scrubbing using the boot CPU */
+    if ( region->cpu_block_size != ~0ULL )
+        /* Determine the current CPU's index into CPU's linked to this node */
+        for_each_cpu( temp_cpu, &node_to_cpumask(local_node) )
+        {
+            if ( cpu == temp_cpu )
+                break;
+            local_cpu_index++;
+        }
 
-    for ( mfn = first_valid_mfn; mfn < max_page; mfn++ )
+    /* Calculate the starting mfn for this CPU's memory block */
+    start_mfn = region->start + (region->cpu_block_size * local_cpu_index)
+                + region->offset;
+
+    /* Calculate the end mfn into this CPU's memory block for this iteration */
+    if ( region->offset + region->chunk_size > region->cpu_block_size )
+        end_mfn = region->start + (region->cpu_block_size * local_cpu_index)
+                  + region->cpu_block_size;
+    else
+        end_mfn = start_mfn + region->chunk_size;
+
+
+    for ( mfn = start_mfn; mfn < end_mfn; mfn++ )
     {
-        process_pending_softirqs();
-
         pg = mfn_to_page(mfn);
 
-        /* Quick lock-free check. */
+        /* Check the mfn is valid and page is free. */
        if ( !mfn_valid(mfn) || !page_state_is(pg, free) )
            continue;
 
@@ -1283,15 +1315,124 @@ void __init scrub_heap_pages(void)
         if ( (mfn % ((100*1024*1024)/PAGE_SIZE)) == 0 )
             printk(".");
 
+        /* Do the scrub if possible */
+        if ( page_state_is(pg, free) )
+            scrub_one_page(pg);
+    }
+    /* Decrement count to indicate scrubbing complete on this CPU */
+    atomic_dec(&bootscrub_count);
+}
+
+/*
+ * Scrub all unallocated pages in all heap zones. This function uses all
+ * online cpu's to scrub the memory in parallel.
+ */
+void __init scrub_heap_pages(void)
+{
+    cpumask_t node_cpus, total_node_cpus_mask = {{ 0 }};
+    unsigned int i, boot_cpu_node, total_node_cpus, cpu = smp_processor_id();
+    unsigned long mfn, mfn_off, chunk_size, max_cpu_blk_size = 0;
+    unsigned long mem_start, mem_end;
+
+    if ( !opt_bootscrub )
+        return;
+
+    boot_cpu_node = cpu_to_node(cpu);
+
+    printk("Scrubbing Free RAM: ");
+
+    /* Scrub block size */
+    chunk_size = opt_bootscrub_blocksize >> PAGE_SHIFT;
+    if ( chunk_size == 0 )
+        chunk_size = 1;
+
+    /* Determine the amount of memory to scrub, per CPU on each Node */
+    for_each_online_node ( i )
+    {
+        /* Calculate Node memory start and end address */
+        mem_start = max(node_start_pfn(i), first_valid_mfn);
+        mem_end = min(mem_start + node_spanned_pages(i), max_page);
+        /* Divide by number of CPU's for this node */
+        node_cpus = node_to_cpumask(i);
+        /* It's possible a node has no CPU's */
+        if ( cpumask_empty(&node_cpus) )
+            continue;
+        cpumask_or(&total_node_cpus_mask, &total_node_cpus_mask, &node_cpus);
+
+        region[i].cpu_block_size = (mem_end - mem_start) /
+                                   cpumask_weight(&node_cpus);
+        region[i].start = mem_start;
+
+        if ( region[i].cpu_block_size > max_cpu_blk_size )
+            max_cpu_blk_size = region[i].cpu_block_size;
+    }
+
+    /* Round default chunk size down if required */
+    if ( max_cpu_blk_size && chunk_size > max_cpu_blk_size )
+        chunk_size = max_cpu_blk_size;
+
+    total_node_cpus = cpumask_weight(&total_node_cpus_mask);
+    /* Start all CPU's scrubbing memory, chunk_size at a time */
+    for ( mfn_off = 0; mfn_off < max_cpu_blk_size; mfn_off += chunk_size )
+    {
+        process_pending_softirqs();
+
+        atomic_set(&bootscrub_count, total_node_cpus);
+
         spin_lock(&heap_lock);
 
-        /* Re-check page status with lock held. */
-        if ( page_state_is(pg, free) )
-            scrub_one_page(pg);
+        /* Start all other CPU's on all nodes */
+        for_each_online_node ( i )
+        {
+            region[i].chunk_size = chunk_size;
+            region[i].offset = mfn_off;
+            node_cpus = node_to_cpumask(i);
+            /* Clear local cpu ID */
+            cpumask_clear_cpu(cpu, &node_cpus);
+            /* Start page scrubbing on all other CPU's */
+            on_selected_cpus(&node_cpus, smp_scrub_heap_pages, &region[i], 0);
+        }
+
+        /* Start scrub on local CPU if CPU linked to a memory node */
+        if ( boot_cpu_node != NUMA_NO_NODE )
+            smp_scrub_heap_pages(&region[boot_cpu_node]);
+
+        /* Wait for page scrubbing to complete on all other CPU's */
+        while ( atomic_read(&bootscrub_count) > 0 )
+            cpu_relax();
 
         spin_unlock(&heap_lock);
     }
 
+    /* Use the boot CPU to scrub any nodes which have no CPU's linked to them */
+    for_each_online_node ( i )
+    {
+        node_cpus = node_to_cpumask(i);
+
+        if ( !cpumask_empty(&node_cpus) )
+            continue;
+
+        mem_start = max(node_start_pfn(i), first_valid_mfn);
+        mem_end = min(mem_start + node_spanned_pages(i), max_page);
+
+        region[0].offset = 0;
+        region[0].cpu_block_size = ~0ULL;
+
+        for ( mfn = mem_start; mfn < mem_end; mfn += chunk_size )
+        {
+            spin_lock(&heap_lock);
+            if ( mfn + chunk_size > mem_end )
+                region[0].chunk_size = mem_end - mfn;
+            else
+                region[0].chunk_size = chunk_size;
+
+            region[0].start = mfn;
+
+            smp_scrub_heap_pages(&region[0]);
+            spin_unlock(&heap_lock);
+            process_pending_softirqs();
+        }
+    }
     printk("done.\n");
 
     /* Now that the heap is initialized, run checks and set bounds
Jan Beulich
2013-Sep-30 13:26 UTC
Re: [PATCH v2] Xen: Spread boot time page scrubbing across all available CPUs
>>> On 30.09.13 at 14:35, Malcolm Crossley <malcolm.crossley@citrix.com> wrote:
> The page scrubbing is done in 128MB chunks in lockstep across all the CPUs.
> This allows the boot CPU to hold the heap_lock whilst each chunk is being
> scrubbed and then release the heap_lock when all CPUs have finished
> scrubbing their individual chunk. This means the heap_lock is not held
> continuously and pending softirqs are serviced periodically across all CPUs.
>
> The page scrub memory chunks are allocated to the CPUs in a NUMA aware
> fashion to reduce socket interconnect overhead and improve performance.
>
> This patch reduces the boot page scrub time on a 128GB 64 core AMD Opteron
> 6386 machine from 49 seconds to 3 seconds.

And is this a NUMA system with heavily different access times
between local and remote memory?

What I'm trying to understand before reviewing the actual patch
is whether what you do is really necessary: Generally it ought to
be sufficient to have one CPU on each node scrub that node's
memory, as a CPU should be able to saturate the bus if it does
(almost) nothing but memory writes. Hence having multiple cores
on the same socket (not to speak of multiple threads in a core)
do this work in parallel is likely not going to be beneficial, and
hence the logic you're adding here might be more complex than
necessary.

Jan
Malcolm Crossley
2013-Sep-30 13:56 UTC
Re: [PATCH v2] Xen: Spread boot time page scrubbing across all available CPUs
On 30/09/13 14:26, Jan Beulich wrote:
>>>> On 30.09.13 at 14:35, Malcolm Crossley <malcolm.crossley@citrix.com> wrote:
>> The page scrubbing is done in 128MB chunks in lockstep across all the CPUs.
>> This allows the boot CPU to hold the heap_lock whilst each chunk is being
>> scrubbed and then release the heap_lock when all CPUs have finished
>> scrubbing their individual chunk. This means the heap_lock is not held
>> continuously and pending softirqs are serviced periodically across all CPUs.
>>
>> The page scrub memory chunks are allocated to the CPUs in a NUMA aware
>> fashion to reduce socket interconnect overhead and improve performance.
>>
>> This patch reduces the boot page scrub time on a 128GB 64 core AMD Opteron
>> 6386 machine from 49 seconds to 3 seconds.
> And is this a NUMA system with heavily different access times
> between local and remote memory?

The AMD 64 core system has 8 NUMA nodes with up to 2 hops between the nodes.

This page shows some data on the round-trip bandwidth between different cores:
http://www.cl.cam.ac.uk/research/srg/netos/ipc-bench/details/tmpn2YlFp.html

This paper also shows the difference in memory bandwidth with NUMA aware
threads:
http://www.cs.uchicago.edu/files/tr_authentic/TR-2011-02.pdf
The unstrided results are the only ones we're interested in.

> What I'm trying to understand before reviewing the actual patch
> is whether what you do is really necessary: Generally it ought to
> be sufficient to have one CPU on each node scrub that node's
> memory, as a CPU should be able to saturate the bus if it does
> (almost) nothing but memory writes. Hence having multiple cores
> on the same socket (not to speak of multiple threads in a core)
> do this work in parallel is likely not going to be beneficial, and
> hence the logic you're adding here might be more complex than
> necessary.

The second paper cited above shows that around three times as many cores as
NUMA nodes are required to reach peak NUMA memory bandwidth.

The difference between 1 core per node bandwidth and 3 cores per node
bandwidth is:

AMD: 30000MB/s (1-core) vs 48000MB/s (3-core)
Intel: 12000MB/s (1-core) vs 38000MB/s (3-core)

So I think it's worth the extra complexity to have multiple cores per node
scrubbing memory.

Malcolm
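As a rough cross-check of those figures against the timings in the patch
description (illustrative arithmetic only, not numbers taken from the thread),
the reported boot times on the 128GB machine correspond to effective scrub
throughputs of about:

\[
\frac{128\,\mathrm{GB}}{49\,\mathrm{s}} \approx 2.6\,\mathrm{GB/s}\ \text{(serial)}
\qquad
\frac{128\,\mathrm{GB}}{3\,\mathrm{s}} \approx 43\,\mathrm{GB/s}\ \text{(parallel)}
\]

The parallel figure is still well below eight nodes at the ~48000MB/s quoted
above, which suggests the lockstep barriers, page-state checks and softirq
servicing, rather than raw memory bandwidth, bound the parallel case.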
Jan Beulich
2013-Sep-30 15:35 UTC
Re: [PATCH v2] Xen: Spread boot time page scrubbing across all available CPUs
>>> On 30.09.13 at 15:56, Malcolm Crossley <malcolm.crossley@citrix.com> wrote:
> The difference between 1 core per node bandwidth and 3 cores per node
> bandwidth is:
>
> AMD: 30000MB/s (1-core) vs 48000MB/s (3-core)
> Intel: 12000MB/s (1-core) vs 38000MB/s (3-core)
>
> So I think it's worth the extra complexity to have multiple cores per node
> scrubbing memory.

The numbers are convincing. Which means that the only request I'd have is
to avoid picking more than one hyperthread per core for doing the
scrubbing.

Jan
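A minimal, self-contained sketch of the filtering Jan is asking for, assuming
a hypothetical cpu_to_core[] topology table and CPU count (in Xen itself this
would presumably be a cpumask operation using the per-CPU sibling masks):

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

/* Hypothetical topology table: cpu -> physical core (siblings share a core). */
static const unsigned int cpu_to_core[NR_CPUS] = { 0, 0, 1, 1, 2, 2, 3, 3 };

/*
 * From the set of CPUs on a node, keep only the first hyperthread seen on
 * each physical core, so at most one thread per core does the scrubbing.
 */
static void pick_one_thread_per_core(const bool node_cpus[NR_CPUS],
                                     bool scrub_cpus[NR_CPUS])
{
    bool core_taken[NR_CPUS] = { false };

    for ( unsigned int cpu = 0; cpu < NR_CPUS; cpu++ )
    {
        unsigned int core = cpu_to_core[cpu];

        scrub_cpus[cpu] = node_cpus[cpu] && !core_taken[core];
        if ( scrub_cpus[cpu] )
            core_taken[core] = true;
    }
}

int main(void)
{
    bool node_cpus[NR_CPUS] = { true, true, true, true, true, true, true, true };
    bool scrub_cpus[NR_CPUS];

    pick_one_thread_per_core(node_cpus, scrub_cpus);
    for ( unsigned int cpu = 0; cpu < NR_CPUS; cpu++ )
        if ( scrub_cpus[cpu] )
            printf("CPU%u scrubs\n", cpu);
    return 0;
}

Keeping only the first sibling per core matches Jan's suggestion; dropping the
filter altogether is Andrew's position in the next message.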
Andrew Cooper
2013-Sep-30 15:42 UTC
Re: [PATCH v2] Xen: Spread boot time page scrubbing across all available CPUs
On 30/09/13 16:35, Jan Beulich wrote:
>>>> On 30.09.13 at 15:56, Malcolm Crossley <malcolm.crossley@citrix.com> wrote:
>> The difference between 1 core per node bandwidth and 3 cores per node
>> bandwidth is:
>>
>> AMD: 30000MB/s (1-core) vs 48000MB/s (3-core)
>> Intel: 12000MB/s (1-core) vs 38000MB/s (3-core)
>>
>> So I think it's worth the extra complexity to have multiple cores per node
>> scrubbing memory.
> The numbers are convincing. Which means that the only request I'd have is
> to avoid picking more than one hyperthread per core for doing the
> scrubbing.

Why? The scrubbing operations are independent, so they will still be sped up
by hyperthreading.

As scrubbing RAM is simply a race to get it done as quickly as possible,
unless it can be demonstrated that using all cores is actively detrimental
to the overall time taken, we really should use all cores where possible.

~Andrew
Jan Beulich
2013-Sep-30 16:08 UTC
Re: [PATCH v2] Xen: Spread boot time page scrubbing across all available CPUs
>>> On 30.09.13 at 17:42, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
> On 30/09/13 16:35, Jan Beulich wrote:
>>>>> On 30.09.13 at 15:56, Malcolm Crossley <malcolm.crossley@citrix.com> wrote:
>>> The difference between 1 core per node bandwidth and 3 cores per node
>>> bandwidth is:
>>>
>>> AMD: 30000MB/s (1-core) vs 48000MB/s (3-core)
>>> Intel: 12000MB/s (1-core) vs 38000MB/s (3-core)
>>>
>>> So I think it's worth the extra complexity to have multiple cores per node
>>> scrubbing memory.
>> The numbers are convincing. Which means that the only request I'd have is
>> to avoid picking more than one hyperthread per core for doing the
>> scrubbing.
>
> Why? The scrubbing operations are independent, so they will still be sped up
> by hyperthreading.
>
> As scrubbing RAM is simply a race to get it done as quickly as possible,
> unless it can be demonstrated that using all cores is actively detrimental
> to the overall time taken, we really should use all cores where possible.

As long as the actual scrubbing routine is written well enough (to saturate
the store bandwidth of a single core), having two threads compete with one
another for the store ports can only hurt performance. _If_ multiple
hyperthreads really do turn out to be beneficial, that would thus rather be
an indication to improve clear_page() (or whatever is being used for the
actual scrubbing).

Jan
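To illustrate the kind of routine Jan has in mind -- a sketch only, not Xen's
clear_page() or scrub_one_page() -- here is a single-page clear using SSE2
non-temporal stores, which bypass the cache and let one core get close to its
peak store bandwidth:

#include <emmintrin.h>  /* SSE2: _mm_stream_si128, _mm_sfence */
#include <stdlib.h>     /* aligned_alloc, free, size_t */

#define PAGE_SIZE 4096

/* Clear one page with streaming (non-temporal) stores; 'page' must be
 * 16-byte aligned (a page-aligned allocation trivially is). */
static void clear_page_nt(void *page)
{
    const __m128i zero = _mm_setzero_si128();
    char *p = page;

    for ( size_t i = 0; i < PAGE_SIZE; i += 64 )
    {
        /* Four 16-byte streaming stores per 64-byte cache line. */
        _mm_stream_si128((__m128i *)(p + i +  0), zero);
        _mm_stream_si128((__m128i *)(p + i + 16), zero);
        _mm_stream_si128((__m128i *)(p + i + 32), zero);
        _mm_stream_si128((__m128i *)(p + i + 48), zero);
    }

    /* Order the weakly-ordered stores before the page is handed out. */
    _mm_sfence();
}

int main(void)
{
    void *page = aligned_alloc(PAGE_SIZE, PAGE_SIZE);

    if ( page )
        clear_page_nt(page);
    free(page);
    return 0;
}

Whether an extra hyperthread helps then comes down to whether the existing
scrub routine already saturates the core's store ports; if it does, the
second thread only adds contention.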
Konrad Rzeszutek Wilk
2013-Sep-30 17:43 UTC
Re: [PATCH v2] Xen: Spread boot time page scrubbing across all available CPUs
On Mon, Sep 30, 2013 at 01:35:17PM +0100, Malcolm Crossley wrote:
> The page scrubbing is done in 128MB chunks in lockstep across all the CPUs.
> This allows the boot CPU to hold the heap_lock whilst each chunk is being
> scrubbed and then release the heap_lock when all CPUs have finished
> scrubbing their individual chunk. This means the heap_lock is not held
> continuously and pending softirqs are serviced periodically across all CPUs.
>
> The page scrub memory chunks are allocated to the CPUs in a NUMA aware
> fashion to reduce socket interconnect overhead and improve performance.
>
> This patch reduces the boot page scrub time on a 128GB 64 core AMD Opteron
> 6386 machine from 49 seconds to 3 seconds.

A bit older version of this one cut the scrubbing on a 1TB machine down from
minutes (I think it was 5 or 10 - I gave up on counting) to less than a
minute.
Tim Deegan
2013-Oct-03 11:39 UTC
Re: [PATCH v2] Xen: Spread boot time page scrubbing across all available CPUs
Hi,

Again, I'm very much in favour of this in principle, but I have some
comments on implementation -- some of which I made last time as well!

At 13:35 +0100 on 30 Sep (1380548117), Malcolm Crossley wrote:
> +static atomic_t __initdata bootscrub_count = ATOMIC_INIT(0);
> +
> +struct scrub_region {
> +    u64 offset;
> +    u64 start;
> +    u64 chunk_size;
> +    u64 cpu_block_size;

Unsigned long, please: your actual scrubbing code uses unsigned long local
variables to work with these numbers anyway.

> +    local_node = cpu_to_node(cpu);
> +    /* Determine if we are scrubbing using the boot CPU */
> +    if ( region->cpu_block_size != ~0ULL )

Please define a correctly typed macro for this constant. (Actually, having
read the calling side I think this constant won't be needed at all.)

> @@ -1283,15 +1315,124 @@ void __init scrub_heap_pages(void)
>          if ( (mfn % ((100*1024*1024)/PAGE_SIZE)) == 0 )
>              printk(".");

Since the scrub order is pretty much arbitrary, this printk isn't much of
an indicator of progress. Maybe replace it with a printk in the dispatch
loop instead?

> +        /* Do the scrub if possible */
> +        if ( page_state_is(pg, free) )

You already checked this a couple of lines above, and since you're not
taking the lock there's no need to re-check.

> +            scrub_one_page(pg);
> +    }

There should be a wmb() here, to make sure the main scrub dispatcher can't
exit while the last worker is still issuing writes.

> +    /* Decrement count to indicate scrubbing complete on this CPU */
> +    atomic_dec(&bootscrub_count);
> +}
> +
> +/*
> + * Scrub all unallocated pages in all heap zones. This function uses all
> + * online cpu's to scrub the memory in parallel.
> + */
> +void __init scrub_heap_pages(void)
> +{
> +    cpumask_t node_cpus, total_node_cpus_mask = {{ 0 }};
> +    unsigned int i, boot_cpu_node, total_node_cpus, cpu = smp_processor_id();
> +    unsigned long mfn, mfn_off, chunk_size, max_cpu_blk_size = 0;
> +    unsigned long mem_start, mem_end;
> +
> +    if ( !opt_bootscrub )
> +        return;
> +
> +    boot_cpu_node = cpu_to_node(cpu);
> +
> +    printk("Scrubbing Free RAM: ");
> +
> +    /* Scrub block size */
> +    chunk_size = opt_bootscrub_blocksize >> PAGE_SHIFT;

The naming is a little distracting here; in general you're using 'block'
to mean the amount that a CPU will scrub altogether, and 'chunk' for the
amount scrubbed at once -- maybe this command-line option should be
'chunk' too?

> +    if ( chunk_size == 0 )
> +        chunk_size = 1;
> +
> +    /* Determine the amount of memory to scrub, per CPU on each Node */
> +    for_each_online_node ( i )
> +    {
> +        /* Calculate Node memory start and end address */
> +        mem_start = max(node_start_pfn(i), first_valid_mfn);
> +        mem_end = min(mem_start + node_spanned_pages(i), max_page);

DYM min(node_start_pfn(i) + node_spanned_pages(i), max_page)?

> +        /* Divide by number of CPU's for this node */
> +        node_cpus = node_to_cpumask(i);

Shouldn't this be masked with cpu_online_map?

> +        /* It's possible a node has no CPU's */
> +        if ( cpumask_empty(&node_cpus) )
> +            continue;
> +        cpumask_or(&total_node_cpus_mask, &total_node_cpus_mask, &node_cpus);
> +
> +        region[i].cpu_block_size = (mem_end - mem_start) /
> +                                   cpumask_weight(&node_cpus);

What if there's a remainder in this division? I don't see anything that
will scrub the leftover frames.

> +        region[i].start = mem_start;
> +
> +        if ( region[i].cpu_block_size > max_cpu_blk_size )
> +            max_cpu_blk_size = region[i].cpu_block_size;
> +    }
> +
> +    /* Round default chunk size down if required */
> +    if ( max_cpu_blk_size && chunk_size > max_cpu_blk_size )
> +        chunk_size = max_cpu_blk_size;

Is this necessary? The worker will confine itself to the block size anyway.

> +    total_node_cpus = cpumask_weight(&total_node_cpus_mask);
> +    /* Start all CPU's scrubbing memory, chunk_size at a time */
> +    for ( mfn_off = 0; mfn_off < max_cpu_blk_size; mfn_off += chunk_size )
> +    {
> +        process_pending_softirqs();
> +
> +        atomic_set(&bootscrub_count, total_node_cpus);
> +
>          spin_lock(&heap_lock);
>
> -        /* Re-check page status with lock held. */
> -        if ( page_state_is(pg, free) )
> -            scrub_one_page(pg);
> +        /* Start all other CPU's on all nodes */
> +        for_each_online_node ( i )
> +        {
> +            region[i].chunk_size = chunk_size;
> +            region[i].offset = mfn_off;

Not sure these need to be per-node values since they're always the same.
Also, since they're always the same, this doesn't need to be in a
for_each_online_node() loop: you could just set the shared offset variable
and use total_node_cpus_mask to trigger all the workers at once.

> +            node_cpus = node_to_cpumask(i);

Again, masked with cpu_online_map?

> +            /* Clear local cpu ID */
> +            cpumask_clear_cpu(cpu, &node_cpus);

AIUI, on_selected_cpus will DTRT here even if you're calling yourself. You
could also call it with @wait == 1 and then you don't need to maintain
your own bootscrub_count.

> +            /* Start page scrubbing on all other CPU's */
> +            on_selected_cpus(&node_cpus, smp_scrub_heap_pages, &region[i], 0);
> +        }
> +
> +        /* Start scrub on local CPU if CPU linked to a memory node */
> +        if ( boot_cpu_node != NUMA_NO_NODE )
> +            smp_scrub_heap_pages(&region[boot_cpu_node]);
> +
> +        /* Wait for page scrubbing to complete on all other CPU's */
> +        while ( atomic_read(&bootscrub_count) > 0 )
> +            cpu_relax();
>
>          spin_unlock(&heap_lock);
>      }
>
> +    /* Use the boot CPU to scrub any nodes which have no CPU's linked to them */
> +    for_each_online_node ( i )
> +    {
> +        node_cpus = node_to_cpumask(i);
> +
> +        if ( !cpumask_empty(&node_cpus) )
> +            continue;
> +
> +        mem_start = max(node_start_pfn(i), first_valid_mfn);
> +        mem_end = min(mem_start + node_spanned_pages(i), max_page);

Again, ITYM node_start_pfn(i) + node_spanned_pages(i).

> +        region[0].offset = 0;
> +        region[0].cpu_block_size = ~0ULL;
> +
> +        for ( mfn = mem_start; mfn < mem_end; mfn += chunk_size )
> +        {
> +            spin_lock(&heap_lock);
> +            if ( mfn + chunk_size > mem_end )
> +                region[0].chunk_size = mem_end - mfn;
> +            else
> +                region[0].chunk_size = chunk_size;
> +
> +            region[0].start = mfn;
> +

You're struggling a little to force this information into the SMP
protocol; I think it would be cleaner if you carved the central loop of
smp_scrub_heap_pages() out into its own function and called that here
instead.

> +            smp_scrub_heap_pages(&region[0]);
> +            spin_unlock(&heap_lock);
> +            process_pending_softirqs();

This belongs at the top of the loop, I think, since the last softirq check
was before the last iteration of the main loop.

Cheers,

Tim.
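On the remainder point above, a minimal sketch (a hypothetical helper, not
code from the patch) of one way to avoid losing the leftover frames: give
each CPU an equal block and fold the remainder into the last CPU's block.

#include <stdio.h>

/*
 * Split the frame range [start, end) across n_cpus scrubbers: each CPU
 * gets an equal block except the last, which also takes the remainder,
 * so the whole range is covered.
 */
static void split_node_range(unsigned long start, unsigned long end,
                             unsigned int n_cpus, unsigned int cpu_index,
                             unsigned long *blk_start, unsigned long *blk_end)
{
    unsigned long block = (end - start) / n_cpus;

    *blk_start = start + block * cpu_index;
    *blk_end = (cpu_index == n_cpus - 1) ? end : *blk_start + block;
}

int main(void)
{
    unsigned long s, e;

    for ( unsigned int i = 0; i < 3; i++ )
    {
        split_node_range(0, 10, 3, i, &s, &e);
        printf("CPU%u scrubs [%lu, %lu)\n", i, s, e);
    }
    return 0;
}

For example, 10 frames across 3 CPUs become blocks of 3, 3 and 4 frames,
instead of 3, 3 and 3 with one frame never scrubbed.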