Malcolm Crossley
2013-Sep-30 12:35 UTC
[PATCH v2] Xen: Spread boot time page scrubbing across all available CPUs
The page scrubbing is done in 128MB chunks in lockstep across all the CPUs.
This allows the boot CPU to hold the heap_lock whilst each chunk is being
scrubbed and then release the heap_lock when all CPUs have finished
scrubbing their individual chunk. This means the heap_lock is not held
continuously and pending softirqs are serviced periodically across all CPUs.

The page scrub memory chunks are allocated to the CPUs in a NUMA aware
fashion to reduce socket interconnect overhead and improve performance.

This patch reduces the boot page scrub time on a 128GB 64 core AMD Opteron
6386 machine from 49 seconds to 3 seconds.

Changes in v2
- Reduced default chunk size to 128MB
- Added code to scrub NUMA nodes with no active CPU linked to them
- Be robust to boot CPU not being linked to a NUMA node

diff -r a03cc3136759 -r ee1108d26fc5 docs/misc/xen-command-line.markdown
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -188,6 +188,16 @@ Scrub free RAM during boot. This is a s
 accidentally leaking sensitive VM data into other VMs if Xen crashes
 and reboots.
 
+### bootscrub_blocksize
+> `= <size>`
+
+> Default: `128MiB`
+
+Maximum RAM block size to be scrubbed whilst holding the page heap lock and not
+running softirqs. Reduce this if softirqs are not being run frequently enough.
+Setting this to a high value may cause boot failure, particularly if the
+NMI watchdog is also enabled.
+
 ### cachesize
 > `= <size>`
 
diff -r a03cc3136759 -r ee1108d26fc5 xen/common/page_alloc.c
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -65,6 +65,12 @@ static bool_t opt_bootscrub __initdata
 boolean_param("bootscrub", opt_bootscrub);
 
 /*
+ * bootscrub_blocksize -> Size (bytes) of mem block to scrub with heaplock held
+ */
+static unsigned int __initdata opt_bootscrub_blocksize = 128 * 1024 * 1024;
+size_param("bootscrub_blocksize", opt_bootscrub_blocksize);
+
+/*
  * Bit width of the DMA heap -- used to override NUMA-node-first.
  * allocation strategy, which can otherwise exhaust low memory.
  */
@@ -90,6 +96,16 @@ static struct bootmem_region {
 } *__initdata bootmem_region_list;
 static unsigned int __initdata nr_bootmem_regions;
 
+static atomic_t __initdata bootscrub_count = ATOMIC_INIT(0);
+
+struct scrub_region {
+    u64 offset;
+    u64 start;
+    u64 chunk_size;
+    u64 cpu_block_size;
+};
+static struct scrub_region __initdata region[MAX_NUMNODES];
+
 static void __init boot_bug(int line)
 {
     panic("Boot BUG at %s:%d\n", __FILE__, line);
@@ -1254,28 +1270,44 @@ void __init end_boot_allocator(void)
     printk("\n");
 }
 
-/*
- * Scrub all unallocated pages in all heap zones. This function is more
- * convoluted than appears necessary because we do not want to continuously
- * hold the lock while scrubbing very large memory areas.
- */
-void __init scrub_heap_pages(void)
+void __init smp_scrub_heap_pages(void *data)
 {
-    unsigned long mfn;
+    unsigned long mfn, start_mfn, end_mfn;
     struct page_info *pg;
+    struct scrub_region *region = data;
+    unsigned int temp_cpu, local_node, local_cpu_index = 0;
+    unsigned int cpu = smp_processor_id();
 
-    if ( !opt_bootscrub )
-        return;
+    ASSERT(region != NULL);
 
-    printk("Scrubbing Free RAM: ");
+    local_node = cpu_to_node(cpu);
+    /* Determine if we are scrubbing using the boot CPU */
+    if ( region->cpu_block_size != ~0ULL )
+        /* Determine the current CPU's index into CPU's linked to this node */
+        for_each_cpu( temp_cpu, &node_to_cpumask(local_node) )
+        {
+            if ( cpu == temp_cpu )
+                break;
+            local_cpu_index++;
+        }
 
-    for ( mfn = first_valid_mfn; mfn < max_page; mfn++ )
+    /* Calculate the starting mfn for this CPU's memory block */
+    start_mfn = region->start + (region->cpu_block_size * local_cpu_index)
+                + region->offset;
+
+    /* Calculate the end mfn into this CPU's memory block for this iteration */
+    if ( region->offset + region->chunk_size > region->cpu_block_size )
+        end_mfn = region->start + (region->cpu_block_size * local_cpu_index)
+                  + region->cpu_block_size;
+    else
+        end_mfn = start_mfn + region->chunk_size;
+
+
+    for ( mfn = start_mfn; mfn < end_mfn; mfn++ )
     {
-        process_pending_softirqs();
-
         pg = mfn_to_page(mfn);
 
-        /* Quick lock-free check. */
+        /* Check the mfn is valid and page is free. */
        if ( !mfn_valid(mfn) || !page_state_is(pg, free) )
            continue;
 
@@ -1283,15 +1315,124 @@ void __init scrub_heap_pages(void)
         if ( (mfn % ((100*1024*1024)/PAGE_SIZE)) == 0 )
             printk(".");
 
+        /* Do the scrub if possible */
+        if ( page_state_is(pg, free) )
+            scrub_one_page(pg);
+    }
+    /* Decrement count to indicate scrubbing complete on this CPU */
+    atomic_dec(&bootscrub_count);
+}
+
+/*
+ * Scrub all unallocated pages in all heap zones. This function uses all
+ * online cpu's to scrub the memory in parallel.
+ */
+void __init scrub_heap_pages(void)
+{
+    cpumask_t node_cpus, total_node_cpus_mask = {{ 0 }};
+    unsigned int i, boot_cpu_node, total_node_cpus, cpu = smp_processor_id();
+    unsigned long mfn, mfn_off, chunk_size, max_cpu_blk_size = 0;
+    unsigned long mem_start, mem_end;
+
+    if ( !opt_bootscrub )
+        return;
+
+    boot_cpu_node = cpu_to_node(cpu);
+
+    printk("Scrubbing Free RAM: ");
+
+    /* Scrub block size */
+    chunk_size = opt_bootscrub_blocksize >> PAGE_SHIFT;
+    if ( chunk_size == 0 )
+        chunk_size = 1;
+
+    /* Determine the amount of memory to scrub, per CPU on each Node */
+    for_each_online_node ( i )
+    {
+        /* Calculate Node memory start and end address */
+        mem_start = max(node_start_pfn(i), first_valid_mfn);
+        mem_end = min(mem_start + node_spanned_pages(i), max_page);
+        /* Divide by number of CPU's for this node */
+        node_cpus = node_to_cpumask(i);
+        /* It's possible a node has no CPU's */
+        if ( cpumask_empty(&node_cpus) )
+            continue;
+        cpumask_or(&total_node_cpus_mask, &total_node_cpus_mask, &node_cpus);
+
+        region[i].cpu_block_size = (mem_end - mem_start) /
+                                   cpumask_weight(&node_cpus);
+        region[i].start = mem_start;
+
+        if ( region[i].cpu_block_size > max_cpu_blk_size )
+            max_cpu_blk_size = region[i].cpu_block_size;
+    }
+
+    /* Round default chunk size down if required */
+    if ( max_cpu_blk_size && chunk_size > max_cpu_blk_size )
+        chunk_size = max_cpu_blk_size;
+
+    total_node_cpus = cpumask_weight(&total_node_cpus_mask);
+    /* Start all CPU's scrubbing memory, chunk_size at a time */
+    for ( mfn_off = 0; mfn_off < max_cpu_blk_size; mfn_off += chunk_size )
+    {
+        process_pending_softirqs();
+
+        atomic_set(&bootscrub_count, total_node_cpus);
+
         spin_lock(&heap_lock);
 
-        /* Re-check page status with lock held. */
-        if ( page_state_is(pg, free) )
-            scrub_one_page(pg);
+        /* Start all other CPU's on all nodes */
+        for_each_online_node ( i )
+        {
+            region[i].chunk_size = chunk_size;
+            region[i].offset = mfn_off;
+            node_cpus = node_to_cpumask(i);
+            /* Clear local cpu ID */
+            cpumask_clear_cpu(cpu, &node_cpus);
+            /* Start page scrubbing on all other CPU's */
+            on_selected_cpus(&node_cpus, smp_scrub_heap_pages, &region[i], 0);
+        }
+
+        /* Start scrub on local CPU if CPU linked to a memory node */
+        if ( boot_cpu_node != NUMA_NO_NODE )
+            smp_scrub_heap_pages(&region[boot_cpu_node]);
+
+        /* Wait for page scrubbing to complete on all other CPU's */
+        while ( atomic_read(&bootscrub_count) > 0 )
+            cpu_relax();
 
         spin_unlock(&heap_lock);
     }
 
+    /* Use the boot CPU to scrub any nodes which have no CPU's linked to them */
+    for_each_online_node ( i )
+    {
+        node_cpus = node_to_cpumask(i);
+
+        if ( !cpumask_empty(&node_cpus) )
+            continue;
+
+        mem_start = max(node_start_pfn(i), first_valid_mfn);
+        mem_end = min(mem_start + node_spanned_pages(i), max_page);
+
+        region[0].offset = 0;
+        region[0].cpu_block_size = ~0ULL;
+
+        for ( mfn = mem_start; mfn < mem_end; mfn += chunk_size )
+        {
+            spin_lock(&heap_lock);
+            if ( mfn + chunk_size > mem_end )
+                region[0].chunk_size = mem_end - mfn;
+            else
+                region[0].chunk_size = chunk_size;
+
+            region[0].start = mfn;
+
+            smp_scrub_heap_pages(&region[0]);
+            spin_unlock(&heap_lock);
+            process_pending_softirqs();
+        }
+    }
     printk("done.\n");
 
     /* Now that the heap is initialized, run checks and set bounds
Jan Beulich
2013-Sep-30 13:26 UTC
Re: [PATCH v2] Xen: Spread boot time page scrubbing across all available CPUs
>>> On 30.09.13 at 14:35, Malcolm Crossley <malcolm.crossley@citrix.com> wrote:
> The page scrubbing is done in 128MB chunks in lockstep across all the CPUs.
> This allows the boot CPU to hold the heap_lock whilst each chunk is being
> scrubbed and then release the heap_lock when all CPUs have finished
> scrubbing their individual chunk. This means the heap_lock is not held
> continuously and pending softirqs are serviced periodically across all CPUs.
>
> The page scrub memory chunks are allocated to the CPUs in a NUMA aware
> fashion to reduce socket interconnect overhead and improve performance.
>
> This patch reduces the boot page scrub time on a 128GB 64 core AMD Opteron
> 6386 machine from 49 seconds to 3 seconds.

And is this a NUMA system with heavily different access times
between local and remote memory?

What I'm trying to understand before reviewing the actual patch
is whether what you do is really necessary: Generally it ought to
be sufficient to have one CPU on each node scrub that node's
memory, as a CPU should be able to saturate the bus if it does
(almost) nothing but memory writes. Hence having multiple cores
on the same socket (not to speak of multiple threads in a core)
do this work in parallel is likely not going to be beneficial, and
hence the logic you're adding here might be more complex than
necessary.

Jan
Malcolm Crossley
2013-Sep-30 13:56 UTC
Re: [PATCH v2] Xen: Spread boot time page scrubbing across all available CPUs
On 30/09/13 14:26, Jan Beulich wrote:
>>>> On 30.09.13 at 14:35, Malcolm Crossley <malcolm.crossley@citrix.com> wrote:
>> The page scrubbing is done in 128MB chunks in lockstep across all the CPUs.
>> This allows the boot CPU to hold the heap_lock whilst each chunk is being
>> scrubbed and then release the heap_lock when all CPUs have finished
>> scrubbing their individual chunk. This means the heap_lock is not held
>> continuously and pending softirqs are serviced periodically across all CPUs.
>>
>> The page scrub memory chunks are allocated to the CPUs in a NUMA aware
>> fashion to reduce socket interconnect overhead and improve performance.
>>
>> This patch reduces the boot page scrub time on a 128GB 64 core AMD Opteron
>> 6386 machine from 49 seconds to 3 seconds.
> And is this a NUMA system with heavily different access times
> between local and remote memory?

The AMD 64 core system has 8 NUMA nodes with up to 2 hops between the nodes.

This page shows some data on the round-trip bandwidth between different cores:
http://www.cl.cam.ac.uk/research/srg/netos/ipc-bench/details/tmpn2YlFp.html

This paper also shows the difference in memory bandwidth with NUMA aware
threads:
http://www.cs.uchicago.edu/files/tr_authentic/TR-2011-02.pdf
The unstrided results are the only ones we're interested in.

> What I'm trying to understand before reviewing the actual patch
> is whether what you do is really necessary: Generally it ought to
> be sufficient to have one CPU on each node scrub that node's
> memory, as a CPU should be able to saturate the bus if it does
> (almost) nothing but memory writes. Hence having multiple cores
> on the same socket (not to speak of multiple threads in a core)
> do this work in parallel is likely not going to be beneficial, and
> hence the logic you're adding here might be more complex than
> necessary.

The second paper cited above shows that around three times as many cores as
NUMA nodes are required to reach peak NUMA memory bandwidth.

The difference between 1 core per node bandwidth and 3 cores per node
bandwidth is:

AMD: 30000MB/s (1-core) vs 48000MB/s (3-core)
Intel: 12000MB/s (1-core) vs 38000MB/s (3-core)

So I think it's worth the extra complexity to have multiple cores per node
scrubbing memory.

Malcolm
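As a rough cross-check of those figures against the timings in the patch
description (illustrative arithmetic only, not numbers taken from the thread),
the reported boot times on the 128GB machine correspond to effective scrub
throughputs of about:

\[
\frac{128\,\mathrm{GB}}{49\,\mathrm{s}} \approx 2.6\,\mathrm{GB/s}\ \text{(serial)}
\qquad
\frac{128\,\mathrm{GB}}{3\,\mathrm{s}} \approx 43\,\mathrm{GB/s}\ \text{(parallel)}
\]

The parallel figure is still well below eight nodes at the ~48000MB/s quoted
above, which suggests the lockstep barriers, page-state checks and softirq
servicing, rather than raw memory bandwidth, bound the parallel case.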
Jan Beulich
2013-Sep-30 15:35 UTC
Re: [PATCH v2] Xen: Spread boot time page scrubbing across all available CPUs
>>> On 30.09.13 at 15:56, Malcolm Crossley <malcolm.crossley@citrix.com> wrote:
> The difference between 1 core per node bandwidth and 3 cores per node
> bandwidth is:
>
> AMD: 30000MB/s (1-core) vs 48000MB/s (3-core)
> Intel: 12000MB/s (1-core) vs 38000MB/s (3-core)
>
> So I think it's worth the extra complexity to have multiple cores per node
> scrubbing memory.

The numbers are convincing. Which means that the only request I'd have is
to avoid picking more than one hyperthread per core for doing the
scrubbing.

Jan
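A minimal, self-contained sketch of the filtering Jan is asking for, assuming
a hypothetical cpu_to_core[] topology table and CPU count (in Xen itself this
would presumably be a cpumask operation using the per-CPU sibling masks):

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

/* Hypothetical topology table: cpu -> physical core (siblings share a core). */
static const unsigned int cpu_to_core[NR_CPUS] = { 0, 0, 1, 1, 2, 2, 3, 3 };

/*
 * From the set of CPUs on a node, keep only the first hyperthread seen on
 * each physical core, so at most one thread per core does the scrubbing.
 */
static void pick_one_thread_per_core(const bool node_cpus[NR_CPUS],
                                     bool scrub_cpus[NR_CPUS])
{
    bool core_taken[NR_CPUS] = { false };

    for ( unsigned int cpu = 0; cpu < NR_CPUS; cpu++ )
    {
        unsigned int core = cpu_to_core[cpu];

        scrub_cpus[cpu] = node_cpus[cpu] && !core_taken[core];
        if ( scrub_cpus[cpu] )
            core_taken[core] = true;
    }
}

int main(void)
{
    bool node_cpus[NR_CPUS] = { true, true, true, true, true, true, true, true };
    bool scrub_cpus[NR_CPUS];

    pick_one_thread_per_core(node_cpus, scrub_cpus);
    for ( unsigned int cpu = 0; cpu < NR_CPUS; cpu++ )
        if ( scrub_cpus[cpu] )
            printf("CPU%u scrubs\n", cpu);
    return 0;
}

Keeping only the first sibling per core matches Jan's suggestion; dropping the
filter altogether is Andrew's position in the next message.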
Andrew Cooper
2013-Sep-30 15:42 UTC
Re: [PATCH v2] Xen: Spread boot time page scrubbing across all available CPUs
On 30/09/13 16:35, Jan Beulich wrote:
>>>> On 30.09.13 at 15:56, Malcolm Crossley <malcolm.crossley@citrix.com> wrote:
>> The difference between 1 core per node bandwidth and 3 cores per node
>> bandwidth is:
>>
>> AMD: 30000MB/s (1-core) vs 48000MB/s (3-core)
>> Intel: 12000MB/s (1-core) vs 38000MB/s (3-core)
>>
>> So I think it's worth the extra complexity to have multiple cores per node
>> scrubbing memory.
> The numbers are convincing. Which means that the only request I'd have is
> to avoid picking more than one hyperthread per core for doing the
> scrubbing.

Why? The scrubbing operations are independent, so they will still be sped up
by hyperthreading.

As scrubbing RAM is simply a race to get it done as quickly as possible,
unless it can be demonstrated that using all cores is actively detrimental
to the overall time taken, we really should use all cores where possible.

~Andrew
Jan Beulich
2013-Sep-30 16:08 UTC
Re: [PATCH v2] Xen: Spread boot time page scrubbing across all available CPUs
>>> On 30.09.13 at 17:42, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
> On 30/09/13 16:35, Jan Beulich wrote:
>>>>> On 30.09.13 at 15:56, Malcolm Crossley <malcolm.crossley@citrix.com> wrote:
>>> The difference between 1 core per node bandwidth and 3 cores per node
>>> bandwidth is:
>>>
>>> AMD: 30000MB/s (1-core) vs 48000MB/s (3-core)
>>> Intel: 12000MB/s (1-core) vs 38000MB/s (3-core)
>>>
>>> So I think it's worth the extra complexity to have multiple cores per node
>>> scrubbing memory.
>> The numbers are convincing. Which means that the only request I'd have is
>> to avoid picking more than one hyperthread per core for doing the
>> scrubbing.
>
> Why? The scrubbing operations are independent, so they will still be sped up
> by hyperthreading.
>
> As scrubbing RAM is simply a race to get it done as quickly as possible,
> unless it can be demonstrated that using all cores is actively detrimental
> to the overall time taken, we really should use all cores where possible.

As long as the actual scrubbing routine is written well enough (to saturate
the store bandwidth of a single core), having two threads compete with one
another for the store ports can only hurt performance. _If_ multiple
hyperthreads really do turn out to be beneficial, that would thus rather be
an indication to improve clear_page() (or whatever is being used for the
actual scrubbing).

Jan
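To illustrate the kind of routine Jan has in mind -- a sketch only, not Xen's
clear_page() or scrub_one_page() -- here is a single-page clear using SSE2
non-temporal stores, which bypass the cache and let one core get close to its
peak store bandwidth:

#include <emmintrin.h>  /* SSE2: _mm_stream_si128, _mm_sfence */
#include <stdlib.h>     /* aligned_alloc, free, size_t */

#define PAGE_SIZE 4096

/* Clear one page with streaming (non-temporal) stores; 'page' must be
 * 16-byte aligned (a page-aligned allocation trivially is). */
static void clear_page_nt(void *page)
{
    const __m128i zero = _mm_setzero_si128();
    char *p = page;

    for ( size_t i = 0; i < PAGE_SIZE; i += 64 )
    {
        /* Four 16-byte streaming stores per 64-byte cache line. */
        _mm_stream_si128((__m128i *)(p + i +  0), zero);
        _mm_stream_si128((__m128i *)(p + i + 16), zero);
        _mm_stream_si128((__m128i *)(p + i + 32), zero);
        _mm_stream_si128((__m128i *)(p + i + 48), zero);
    }

    /* Order the weakly-ordered stores before the page is handed out. */
    _mm_sfence();
}

int main(void)
{
    void *page = aligned_alloc(PAGE_SIZE, PAGE_SIZE);

    if ( page )
        clear_page_nt(page);
    free(page);
    return 0;
}

Whether an extra hyperthread helps then comes down to whether the existing
scrub routine already saturates the core's store ports; if it does, the
second thread only adds contention.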
Konrad Rzeszutek Wilk
2013-Sep-30 17:43 UTC
Re: [PATCH v2] Xen: Spread boot time page scrubbing across all available CPUs
On Mon, Sep 30, 2013 at 01:35:17PM +0100, Malcolm Crossley wrote:
> The page scrubbing is done in 128MB chunks in lockstep across all the CPUs.
> This allows the boot CPU to hold the heap_lock whilst each chunk is being
> scrubbed and then release the heap_lock when all CPUs have finished
> scrubbing their individual chunk. This means the heap_lock is not held
> continuously and pending softirqs are serviced periodically across all CPUs.
>
> The page scrub memory chunks are allocated to the CPUs in a NUMA aware
> fashion to reduce socket interconnect overhead and improve performance.
>
> This patch reduces the boot page scrub time on a 128GB 64 core AMD Opteron
> 6386 machine from 49 seconds to 3 seconds.

A bit older version of this one cut the scrubbing on a 1TB machine down from
minutes (I think it was 5 or 10 - I gave up on counting) to less than a
minute.
Tim Deegan
2013-Oct-03 11:39 UTC
Re: [PATCH v2] Xen: Spread boot time page scrubbing across all available CPUs
Hi,

Again, I'm very much in favour of this in principle, but I have some
comments on implementation -- some of which I made last time as well!

At 13:35 +0100 on 30 Sep (1380548117), Malcolm Crossley wrote:
> +static atomic_t __initdata bootscrub_count = ATOMIC_INIT(0);
> +
> +struct scrub_region {
> +    u64 offset;
> +    u64 start;
> +    u64 chunk_size;
> +    u64 cpu_block_size;

Unsigned long, please: your actual scrubbing code uses unsigned long local
variables to work with these numbers anyway.

> +    local_node = cpu_to_node(cpu);
> +    /* Determine if we are scrubbing using the boot CPU */
> +    if ( region->cpu_block_size != ~0ULL )

Please define a correctly typed macro for this constant. (Actually, having
read the calling side I think this constant won't be needed at all.)

> @@ -1283,15 +1315,124 @@ void __init scrub_heap_pages(void)
>          if ( (mfn % ((100*1024*1024)/PAGE_SIZE)) == 0 )
>              printk(".");

Since the scrub order is pretty much arbitrary, this printk isn't much of
an indicator of progress. Maybe replace it with a printk in the dispatch
loop instead?

> +        /* Do the scrub if possible */
> +        if ( page_state_is(pg, free) )

You already checked this a couple of lines above, and since you're not
taking the lock there's no need to re-check.

> +            scrub_one_page(pg);
> +    }

There should be a wmb() here, to make sure the main scrub dispatcher can't
exit while the last worker is still issuing writes.

> +    /* Decrement count to indicate scrubbing complete on this CPU */
> +    atomic_dec(&bootscrub_count);
> +}
> +
> +/*
> + * Scrub all unallocated pages in all heap zones. This function uses all
> + * online cpu's to scrub the memory in parallel.
> + */
> +void __init scrub_heap_pages(void)
> +{
> +    cpumask_t node_cpus, total_node_cpus_mask = {{ 0 }};
> +    unsigned int i, boot_cpu_node, total_node_cpus, cpu = smp_processor_id();
> +    unsigned long mfn, mfn_off, chunk_size, max_cpu_blk_size = 0;
> +    unsigned long mem_start, mem_end;
> +
> +    if ( !opt_bootscrub )
> +        return;
> +
> +    boot_cpu_node = cpu_to_node(cpu);
> +
> +    printk("Scrubbing Free RAM: ");
> +
> +    /* Scrub block size */
> +    chunk_size = opt_bootscrub_blocksize >> PAGE_SHIFT;

The naming is a little distracting here; in general you're using 'block'
to mean the amount that a CPU will scrub altogether, and 'chunk' for the
amount scrubbed at once -- maybe this command-line option should be
'chunk' too?

> +    if ( chunk_size == 0 )
> +        chunk_size = 1;
> +
> +    /* Determine the amount of memory to scrub, per CPU on each Node */
> +    for_each_online_node ( i )
> +    {
> +        /* Calculate Node memory start and end address */
> +        mem_start = max(node_start_pfn(i), first_valid_mfn);
> +        mem_end = min(mem_start + node_spanned_pages(i), max_page);

DYM min(node_start_pfn(i) + node_spanned_pages(i), max_page)?

> +        /* Divide by number of CPU's for this node */
> +        node_cpus = node_to_cpumask(i);

Shouldn't this be masked with cpu_online_map?

> +        /* It's possible a node has no CPU's */
> +        if ( cpumask_empty(&node_cpus) )
> +            continue;
> +        cpumask_or(&total_node_cpus_mask, &total_node_cpus_mask, &node_cpus);
> +
> +        region[i].cpu_block_size = (mem_end - mem_start) /
> +                                   cpumask_weight(&node_cpus);

What if there's a remainder in this division? I don't see anything that
will scrub the leftover frames.

> +        region[i].start = mem_start;
> +
> +        if ( region[i].cpu_block_size > max_cpu_blk_size )
> +            max_cpu_blk_size = region[i].cpu_block_size;
> +    }
> +
> +    /* Round default chunk size down if required */
> +    if ( max_cpu_blk_size && chunk_size > max_cpu_blk_size )
> +        chunk_size = max_cpu_blk_size;

Is this necessary? The worker will confine itself to the block size anyway.

> +    total_node_cpus = cpumask_weight(&total_node_cpus_mask);
> +    /* Start all CPU's scrubbing memory, chunk_size at a time */
> +    for ( mfn_off = 0; mfn_off < max_cpu_blk_size; mfn_off += chunk_size )
> +    {
> +        process_pending_softirqs();
> +
> +        atomic_set(&bootscrub_count, total_node_cpus);
> +
>          spin_lock(&heap_lock);
>
> -        /* Re-check page status with lock held. */
> -        if ( page_state_is(pg, free) )
> -            scrub_one_page(pg);
> +        /* Start all other CPU's on all nodes */
> +        for_each_online_node ( i )
> +        {
> +            region[i].chunk_size = chunk_size;
> +            region[i].offset = mfn_off;

Not sure these need to be per-node values since they're always the same.
Also, since they're always the same, this doesn't need to be in a
for_each_online_node() loop: you could just set the shared offset variable
and use total_node_cpus_mask to trigger all the workers at once.

> +            node_cpus = node_to_cpumask(i);

Again, masked with cpu_online_map?

> +            /* Clear local cpu ID */
> +            cpumask_clear_cpu(cpu, &node_cpus);

AIUI, on_selected_cpus will DTRT here even if you're calling yourself. You
could also call it with @wait == 1 and then you don't need to maintain
your own bootscrub_count.

> +            /* Start page scrubbing on all other CPU's */
> +            on_selected_cpus(&node_cpus, smp_scrub_heap_pages, &region[i], 0);
> +        }
> +
> +        /* Start scrub on local CPU if CPU linked to a memory node */
> +        if ( boot_cpu_node != NUMA_NO_NODE )
> +            smp_scrub_heap_pages(&region[boot_cpu_node]);
> +
> +        /* Wait for page scrubbing to complete on all other CPU's */
> +        while ( atomic_read(&bootscrub_count) > 0 )
> +            cpu_relax();
>
>          spin_unlock(&heap_lock);
>      }
>
> +    /* Use the boot CPU to scrub any nodes which have no CPU's linked to them */
> +    for_each_online_node ( i )
> +    {
> +        node_cpus = node_to_cpumask(i);
> +
> +        if ( !cpumask_empty(&node_cpus) )
> +            continue;
> +
> +        mem_start = max(node_start_pfn(i), first_valid_mfn);
> +        mem_end = min(mem_start + node_spanned_pages(i), max_page);

Again, ITYM node_start_pfn(i) + node_spanned_pages(i).

> +        region[0].offset = 0;
> +        region[0].cpu_block_size = ~0ULL;
> +
> +        for ( mfn = mem_start; mfn < mem_end; mfn += chunk_size )
> +        {
> +            spin_lock(&heap_lock);
> +            if ( mfn + chunk_size > mem_end )
> +                region[0].chunk_size = mem_end - mfn;
> +            else
> +                region[0].chunk_size = chunk_size;
> +
> +            region[0].start = mfn;
> +

You're struggling a little to force this information into the SMP
protocol; I think it would be cleaner if you carved the central loop of
smp_scrub_heap_pages() out into its own function and called that here
instead.

> +            smp_scrub_heap_pages(&region[0]);
> +            spin_unlock(&heap_lock);
> +            process_pending_softirqs();

This belongs at the top of the loop, I think, since the last softirq check
was before the last iteration of the main loop.

Cheers,

Tim.
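On the remainder point above, a minimal sketch (a hypothetical helper, not
code from the patch) of one way to avoid losing the leftover frames: give
each CPU an equal block and fold the remainder into the last CPU's block.

#include <stdio.h>

/*
 * Split the frame range [start, end) across n_cpus scrubbers: each CPU
 * gets an equal block except the last, which also takes the remainder,
 * so the whole range is covered.
 */
static void split_node_range(unsigned long start, unsigned long end,
                             unsigned int n_cpus, unsigned int cpu_index,
                             unsigned long *blk_start, unsigned long *blk_end)
{
    unsigned long block = (end - start) / n_cpus;

    *blk_start = start + block * cpu_index;
    *blk_end = (cpu_index == n_cpus - 1) ? end : *blk_start + block;
}

int main(void)
{
    unsigned long s, e;

    for ( unsigned int i = 0; i < 3; i++ )
    {
        split_node_range(0, 10, 3, i, &s, &e);
        printf("CPU%u scrubs [%lu, %lu)\n", i, s, e);
    }
    return 0;
}

For example, 10 frames across 3 CPUs become blocks of 3, 3 and 4 frames,
instead of 3, 3 and 3 with one frame never scrubbed.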