thr3ads.net - Xen devel - [Xen-devel] [PATCH] x86: fix NUMA handling (c/s 20599) [Jan 2010]

If this information is useful, please help other people find it:
Share via:

Jan Beulich

2010-Jan-06 15:36 UTC

[Xen-devel] [PATCH] x86: fix NUMA handling (c/s 20599)

c/s 20599 caused the hash shift to become significantly smaller on
systems with an SRAT like this

(XEN) SRAT: Node 0 PXM 0 0-a0000
(XEN) SRAT: Node 0 PXM 0 100000-80000000
(XEN) SRAT: Node 1 PXM 1 80000000-d0000000
(XEN) SRAT: Node 1 PXM 1 100000000-130000000

Comined with the static size of the memnodemap[] array, NUMA got
therefore disabled on such systems. The backport from Linux was really
incomplete, as Linux much earlier had already introduced a dynamcially
allocated memnodemap[].

Further, doing to/from pdx translations on addresses just past a valid
range is not correct, as it may strip/fail to insert non-zero bits in
this case.

Finally, using 63 as the cover-it-all shift value is invalid on 32bit,
since pdx values are unsigned long.

Signed-off-by: Jan Beulich <jbeulich@novell.com>

--- 2010-01-06.orig/xen/arch/x86/numa.c	2010-01-06 15:16:18.000000000 +0100
+++ 2010-01-06/xen/arch/x86/numa.c	2010-01-06 16:10:59.000000000 +0100
@@ -30,7 +30,9 @@ struct node_data node_data[MAX_NUMNODES]
 
 /* Mapping from pdx to node id */
 int memnode_shift;
-u8  memnodemap[NODEMAPSIZE];
+static typeof(*memnodemap) _memnodemap[2];
+unsigned long memnodemapsize;
+u8 *memnodemap;
 
 unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
 	[0 ... NR_CPUS-1] = NUMA_NO_NODE
@@ -62,13 +64,13 @@ static int __init populate_memnodemap(co
 	unsigned long spdx, epdx;
 	int i, res = -1;
 
-	memset(memnodemap, NUMA_NO_NODE, sizeof(memnodemap));
+	memset(memnodemap, NUMA_NO_NODE, memnodemapsize * sizeof(*memnodemap));
 	for (i = 0; i < numnodes; i++) {
 		spdx = paddr_to_pdx(nodes[i].start);
-		epdx = paddr_to_pdx(nodes[i].end);
+		epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
 		if (spdx >= epdx)
 			continue;
-		if ((epdx >> shift) >= NODEMAPSIZE)
+		if ((epdx >> shift) >= memnodemapsize)
 			return 0;
 		do {
 			if (memnodemap[spdx >> shift] != NUMA_NO_NODE)
@@ -86,6 +88,28 @@ static int __init populate_memnodemap(co
 	return res;
 }
 
+static int __init allocate_cachealigned_memnodemap(void)
+{
+	unsigned long size = PFN_UP(memnodemapsize * sizeof(*memnodemap));
+	unsigned long mfn = alloc_boot_pages(size, 1);
+
+	if (!mfn) {
+		printk(KERN_ERR
+		       "NUMA: Unable to allocate Memory to Node hash map\n");
+		memnodemapsize = 0;
+		return -1;
+	}
+
+	memnodemap = mfn_to_virt(mfn);
+	mfn <<= PAGE_SHIFT;
+	size <<= PAGE_SHIFT;
+	printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
+	       mfn, mfn + size);
+	memnodemapsize = size / sizeof(*memnodemap);
+
+	return 0;
+}
+
 /*
  * The LSB of all start and end addresses in the node map is the value of the
  * maximum possible shift.
@@ -99,7 +123,7 @@ static int __init extract_lsb_from_nodes
 
 	for (i = 0; i < numnodes; i++) {
 		spdx = paddr_to_pdx(nodes[i].start);
-		epdx = paddr_to_pdx(nodes[i].end);
+		epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
 		if (spdx >= epdx)
 			continue;
 		bitfield |= spdx;
@@ -108,9 +132,10 @@ static int __init extract_lsb_from_nodes
 			memtop = epdx;
 	}
 	if (nodes_used <= 1)
-		i = 63;
+		i = BITS_PER_LONG - 1;
 	else
 		i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
+	memnodemapsize = (memtop >> i) + 1;
 	return i;
 }
 
@@ -120,6 +145,10 @@ int __init compute_hash_shift(struct nod
 	int shift;
 
 	shift = extract_lsb_from_nodes(nodes, numnodes);
+	if (memnodemapsize <= ARRAY_SIZE(_memnodemap))
+		memnodemap = _memnodemap;
+	else if (allocate_cachealigned_memnodemap())
+		return -1;
 	printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
 		shift);
 
@@ -233,8 +262,8 @@ void __init numa_initmem_init(unsigned l
 	       (u64)start_pfn << PAGE_SHIFT,
 	       (u64)end_pfn << PAGE_SHIFT);
 	/* setup dummy node covering all memory */ 
-	memnode_shift = 63; 
-	memnodemap[0] = 0;
+	memnode_shift = BITS_PER_LONG - 1;
+	memnodemap = _memnodemap;
 	nodes_clear(node_online_map);
 	node_set_online(0);
 	for (i = 0; i < NR_CPUS; i++)
--- 2010-01-06.orig/xen/include/asm-x86/numa.h	2010-01-06 15:16:18.000000000
+0100
+++ 2010-01-06/xen/include/asm-x86/numa.h	2010-01-06 15:15:41.000000000 +0100
@@ -25,7 +25,6 @@ extern int pxm_to_node(int nid);
 
 #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
 #define VIRTUAL_BUG_ON(x) 
-#define NODEMAPSIZE 0xfff
 
 extern void numa_add_cpu(int cpu);
 extern void numa_init_array(void);
@@ -51,7 +50,8 @@ static inline void clear_node_cpumask(in
 
 /* Simple perfect hash to map pdx to node numbers */
 extern int memnode_shift; 
-extern u8  memnodemap[NODEMAPSIZE]; 
+extern unsigned long memnodemapsize;
+extern u8 *memnodemap;
 
 struct node_data {
     unsigned long node_start_pfn;
@@ -64,7 +64,7 @@ extern struct node_data node_data[];
 static inline __attribute__((pure)) int phys_to_nid(paddr_t addr) 
 { 
 	unsigned nid;
-	VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >= NODEMAPSIZE);
+	VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >=
memnodemapsize);
 	nid = memnodemap[paddr_to_pdx(addr) >> memnode_shift]; 
 	VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); 
 	return nid; 




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Jiang, Yunhong

2010-Jan-07 03:03 UTC

head link

[Xen-devel] RE: [PATCH] x86: fix NUMA handling (c/s 20599)

I didn''t expect node with so smcall hunk, as explained in the comments
to 20599.

But for this specific SRAT, seems we can also improve constructing of
node_memblk_range[], so that we can merge 0~a0000 with 100000-80000000, and 
80000000-d0000000 with 100000000-130000000. Depends on whether we need keep
BIOS''s memory affinity information, we can either create a new
structure to hold the result, or we can simply compact node_memblk_range[]. The
benefit is, we can reduce the sizeof memnodemap, but I''m not sure if it
is needed still after this patch.

Thanks
--jyh

>-----Original Message-----
>From: Jan Beulich [mailto:JBeulich@novell.com]
>Sent: Wednesday, January 06, 2010 11:36 PM
>To: xen-devel@lists.xensource.com
>Cc: Jiang, Yunhong
>Subject: [PATCH] x86: fix NUMA handling (c/s 20599)
>
>c/s 20599 caused the hash shift to become significantly smaller on
>systems with an SRAT like this
>
>(XEN) SRAT: Node 0 PXM 0 0-a0000
>(XEN) SRAT: Node 0 PXM 0 100000-80000000
>(XEN) SRAT: Node 1 PXM 1 80000000-d0000000
>(XEN) SRAT: Node 1 PXM 1 100000000-130000000
>
>Comined with the static size of the memnodemap[] array, NUMA got
>therefore disabled on such systems. The backport from Linux was really
>incomplete, as Linux much earlier had already introduced a dynamcially
>allocated memnodemap[].
>
>Further, doing to/from pdx translations on addresses just past a valid
>range is not correct, as it may strip/fail to insert non-zero bits in
>this case.
>
>Finally, using 63 as the cover-it-all shift value is invalid on 32bit,
>since pdx values are unsigned long.
>
>Signed-off-by: Jan Beulich <jbeulich@novell.com>
>
>--- 2010-01-06.orig/xen/arch/x86/numa.c	2010-01-06 15:16:18.000000000 +0100
>+++ 2010-01-06/xen/arch/x86/numa.c	2010-01-06 16:10:59.000000000 +0100
>@@ -30,7 +30,9 @@ struct node_data node_data[MAX_NUMNODES]
>
> /* Mapping from pdx to node id */
> int memnode_shift;
>-u8  memnodemap[NODEMAPSIZE];
>+static typeof(*memnodemap) _memnodemap[2];
>+unsigned long memnodemapsize;
>+u8 *memnodemap;
>
> unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
> 	[0 ... NR_CPUS-1] = NUMA_NO_NODE
>@@ -62,13 +64,13 @@ static int __init populate_memnodemap(co
> 	unsigned long spdx, epdx;
> 	int i, res = -1;
>
>-	memset(memnodemap, NUMA_NO_NODE, sizeof(memnodemap));
>+	memset(memnodemap, NUMA_NO_NODE, memnodemapsize *
>sizeof(*memnodemap));
> 	for (i = 0; i < numnodes; i++) {
> 		spdx = paddr_to_pdx(nodes[i].start);
>-		epdx = paddr_to_pdx(nodes[i].end);
>+		epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
> 		if (spdx >= epdx)
> 			continue;
>-		if ((epdx >> shift) >= NODEMAPSIZE)
>+		if ((epdx >> shift) >= memnodemapsize)
> 			return 0;
> 		do {
> 			if (memnodemap[spdx >> shift] != NUMA_NO_NODE)
>@@ -86,6 +88,28 @@ static int __init populate_memnodemap(co
> 	return res;
> }
>
>+static int __init allocate_cachealigned_memnodemap(void)
>+{
>+	unsigned long size = PFN_UP(memnodemapsize * sizeof(*memnodemap));
>+	unsigned long mfn = alloc_boot_pages(size, 1);
>+
>+	if (!mfn) {
>+		printk(KERN_ERR
>+		       "NUMA: Unable to allocate Memory to Node hash map\n");
>+		memnodemapsize = 0;
>+		return -1;
>+	}
>+
>+	memnodemap = mfn_to_virt(mfn);
>+	mfn <<= PAGE_SHIFT;
>+	size <<= PAGE_SHIFT;
>+	printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
>+	       mfn, mfn + size);
>+	memnodemapsize = size / sizeof(*memnodemap);
>+
>+	return 0;
>+}
>+
> /*
>  * The LSB of all start and end addresses in the node map is the value of
the
>  * maximum possible shift.
>@@ -99,7 +123,7 @@ static int __init extract_lsb_from_nodes
>
> 	for (i = 0; i < numnodes; i++) {
> 		spdx = paddr_to_pdx(nodes[i].start);
>-		epdx = paddr_to_pdx(nodes[i].end);
>+		epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
> 		if (spdx >= epdx)
> 			continue;
> 		bitfield |= spdx;
>@@ -108,9 +132,10 @@ static int __init extract_lsb_from_nodes
> 			memtop = epdx;
> 	}
> 	if (nodes_used <= 1)
>-		i = 63;
>+		i = BITS_PER_LONG - 1;
> 	else
> 		i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
>+	memnodemapsize = (memtop >> i) + 1;
> 	return i;
> }
>
>@@ -120,6 +145,10 @@ int __init compute_hash_shift(struct nod
> 	int shift;
>
> 	shift = extract_lsb_from_nodes(nodes, numnodes);
>+	if (memnodemapsize <= ARRAY_SIZE(_memnodemap))
>+		memnodemap = _memnodemap;
>+	else if (allocate_cachealigned_memnodemap())
>+		return -1;
> 	printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
> 		shift);
>
>@@ -233,8 +262,8 @@ void __init numa_initmem_init(unsigned l
> 	       (u64)start_pfn << PAGE_SHIFT,
> 	       (u64)end_pfn << PAGE_SHIFT);
> 	/* setup dummy node covering all memory */
>-	memnode_shift = 63;
>-	memnodemap[0] = 0;
>+	memnode_shift = BITS_PER_LONG - 1;
>+	memnodemap = _memnodemap;
> 	nodes_clear(node_online_map);
> 	node_set_online(0);
> 	for (i = 0; i < NR_CPUS; i++)
>--- 2010-01-06.orig/xen/include/asm-x86/numa.h	2010-01-06 15:16:18.000000000
>+0100
>+++ 2010-01-06/xen/include/asm-x86/numa.h	2010-01-06 15:15:41.000000000
>+0100
>@@ -25,7 +25,6 @@ extern int pxm_to_node(int nid);
>
> #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
> #define VIRTUAL_BUG_ON(x)
>-#define NODEMAPSIZE 0xfff
>
> extern void numa_add_cpu(int cpu);
> extern void numa_init_array(void);
>@@ -51,7 +50,8 @@ static inline void clear_node_cpumask(in
>
> /* Simple perfect hash to map pdx to node numbers */
> extern int memnode_shift;
>-extern u8  memnodemap[NODEMAPSIZE];
>+extern unsigned long memnodemapsize;
>+extern u8 *memnodemap;
>
> struct node_data {
>     unsigned long node_start_pfn;
>@@ -64,7 +64,7 @@ extern struct node_data node_data[];
> static inline __attribute__((pure)) int phys_to_nid(paddr_t addr)
> {
> 	unsigned nid;
>-	VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift)
>>NODEMAPSIZE);
>+	VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift)
>>memnodemapsize);
> 	nid = memnodemap[paddr_to_pdx(addr) >> memnode_shift];
> 	VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
> 	return nid;
>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Jan Beulich

2010-Jan-07 07:42 UTC

head link

[Xen-devel] RE: [PATCH] x86: fix NUMA handling (c/s 20599)

>>> "Jiang, Yunhong" <yunhong.jiang@intel.com> 07.01.10
04:03 >>>
>But for this specific SRAT, seems we can also improve constructing of
>node_memblk_range[], so that we can merge 0~a0000 with
>100000-80000000, and  80000000-d0000000 with
>100000000-130000000. Depends on whether we need keep BIOS''s
>memory affinity information, we can either create a new structure to
>hold the result, or we can simply compact node_memblk_range[]. The
>benefit is, we can reduce the sizeof memnodemap, but I''m not sure
>if it is needed still after this patch.
Yes, I had considered this too. But since this is code ported from Linux,
I''d like to get buy-off on this on the Linux side first. (And yes, I do
think this would be an improvement - not just because of the memory
savings, but also because of the [perhaps significantly] reduced
cache footprint resulting from the array accesses: Only two array
elements are really needed for the shown memory layout.)

Why would you, btw., think that BIOS affinity information would get
lost when merging entries in this case?

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Jiang, Yunhong

2010-Jan-07 08:01 UTC

head link

[Xen-devel] RE: [PATCH] x86: fix NUMA handling (c/s 20599)

>-----Original Message-----
>From: Jan Beulich [mailto:JBeulich@novell.com]
>Sent: Thursday, January 07, 2010 3:42 PM
>To: Jiang, Yunhong
>Cc: xen-devel@lists.xensource.com
>Subject: RE: [PATCH] x86: fix NUMA handling (c/s 20599)
>
>>>> "Jiang, Yunhong" <yunhong.jiang@intel.com>
07.01.10 04:03 >>>
>>But for this specific SRAT, seems we can also improve constructing of
>>node_memblk_range[], so that we can merge 0~a0000 with
>>100000-80000000, and  80000000-d0000000 with
>>100000000-130000000. Depends on whether we need keep BIOS''s
>>memory affinity information, we can either create a new structure to
>>hold the result, or we can simply compact node_memblk_range[]. The
>>benefit is, we can reduce the sizeof memnodemap, but I''m not
sure
>>if it is needed still after this patch.
>
>Yes, I had considered this too. But since this is code ported from Linux,
>I''d like to get buy-off on this on the Linux side first. (And yes,
I do
>think this would be an improvement - not just because of the memory
>savings, but also because of the [perhaps significantly] reduced
>cache footprint resulting from the array accesses: Only two array
>elements are really needed for the shown memory layout.)
>Why would you, btw., think that BIOS affinity information would get
>lost when merging entries in this case?
After all, for this SRAT table, address 0xb0000 does not belongs to node 0,
while it belongs to 0 after the merge. Yes, I agree it does not effect software
(OS/VMM), but still, it is different.
>
>Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

Xen devel - Jan 2010 - [PATCH] x86: fix NUMA handling (c/s 20599)

[Xen-devel] [PATCH] x86: fix NUMA handling (c/s 20599)

[Xen-devel] RE: [PATCH] x86: fix NUMA handling (c/s 20599)

[Xen-devel] RE: [PATCH] x86: fix NUMA handling (c/s 20599)

[Xen-devel] RE: [PATCH] x86: fix NUMA handling (c/s 20599)