Elena Ufimtseva
2013-Nov-14 03:36 UTC
[PATCH 1/2] xen: vnuma support for PV guests running as domU.
Issues Xen hypercall subop XENMEM_get_vnumainfo and sets the
NUMA topology, otherwise sets dummy NUMA node and prevents
numa_init from calling other numa initializators as they may
break other guests.

Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
---
 arch/x86/include/asm/xen/vnuma.h |   12 ++++
 arch/x86/mm/numa.c               |    5 ++
 arch/x86/xen/Makefile            |    2 +-
 arch/x86/xen/vnuma.c             |  119 ++++++++++++++++++++++++++++++++++++++
 include/xen/interface/memory.h   |   28 +++++++++
 5 files changed, 165 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/include/asm/xen/vnuma.h
 create mode 100644 arch/x86/xen/vnuma.c

diff --git a/arch/x86/include/asm/xen/vnuma.h b/arch/x86/include/asm/xen/vnuma.h
new file mode 100644
index 0000000..1ba1e06
--- /dev/null
+++ b/arch/x86/include/asm/xen/vnuma.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_VNUMA_H
+#define _ASM_X86_VNUMA_H
+
+#ifdef CONFIG_XEN
+int xen_vnuma_supported(void);
+int xen_numa_init(void);
+#else
+int xen_vnuma_supported(void) { return 0; };
+int xen_numa_init(void) { return -1; };
+#endif
+
+#endif /* _ASM_X86_VNUMA_H */
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 8bf93ba..c8a61dc 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -19,6 +19,7 @@
 #include <asm/amd_nb.h>
 
 #include "numa_internal.h"
+#include "asm/xen/vnuma.h"
 
 int __initdata numa_off;
 nodemask_t numa_nodes_parsed __initdata;
@@ -621,6 +622,10 @@ static int __init dummy_numa_init(void)
 void __init x86_numa_init(void)
 {
 	if (!numa_off) {
+#ifdef CONFIG_XEN
+		if (xen_vnuma_supported() && !numa_init(xen_numa_init))
+			return;
+#endif
 #ifdef CONFIG_X86_NUMAQ
 		if (!numa_init(numaq_numa_init))
 			return;
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 96ab2c0..de9deab 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -13,7 +13,7 @@ CFLAGS_mmu.o		:= $(nostackp)
 obj-y		:= enlighten.o setup.o multicalls.o mmu.o irq.o \
 			time.o xen-asm.o xen-asm_$(BITS).o \
 			grant-table.o suspend.o platform-pci-unplug.o \
-			p2m.o
+			p2m.o vnuma.o
 
 obj-$(CONFIG_EVENT_TRACING) += trace.o
 
diff --git a/arch/x86/xen/vnuma.c b/arch/x86/xen/vnuma.c
new file mode 100644
index 0000000..b4fc667
--- /dev/null
+++ b/arch/x86/xen/vnuma.c
@@ -0,0 +1,119 @@
+#include <linux/err.h>
+#include <linux/memblock.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/memory.h>
+#include <asm/xen/interface.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/vnuma.h>
+
+#ifdef CONFIG_NUMA
+
+/* Checks if hypercall is suported */
+int xen_vnuma_supported()
+{
+	return HYPERVISOR_memory_op(XENMEM_get_vnuma_info, NULL) == -ENOSYS ?
+		0 : 1;
+}
+
+int __init xen_numa_init(void)
+{
+	int rc;
+	unsigned int i, j, nr_nodes, cpu, idx, pcpus;
+	u64 physm, physd, physc;
+	unsigned int *vdistance, *cpu_to_node;
+	unsigned long mem_size, dist_size, cpu_to_node_size;
+	struct vmemrange *vblock;
+
+	struct vnuma_topology_info numa_topo = {
+		.domid = DOMID_SELF,
+		.__pad = 0
+	};
+	rc = -EINVAL;
+
+	/* For now only PV guests are supported */
+	if (!xen_pv_domain())
+		return rc;
+
+	pcpus = num_possible_cpus();
+
+	mem_size = pcpus * sizeof(struct vmemrange);
+	dist_size = pcpus * pcpus * sizeof(*numa_topo.vdistance);
+	cpu_to_node_size = pcpus * sizeof(*numa_topo.cpu_to_node);
+
+	physm = memblock_alloc(mem_size, PAGE_SIZE);
+	vblock = __va(physm);
+
+	physd = memblock_alloc(dist_size, PAGE_SIZE);
+	vdistance = __va(physd);
+
+	physc = memblock_alloc(cpu_to_node_size, PAGE_SIZE);
+	cpu_to_node = __va(physc);
+
+	if (!physm || !physc || !physd)
+		goto vnumaout;
+
+	set_xen_guest_handle(numa_topo.nr_nodes, &nr_nodes);
+	set_xen_guest_handle(numa_topo.vmemblks, vblock);
+	set_xen_guest_handle(numa_topo.vdistance, vdistance);
+	set_xen_guest_handle(numa_topo.cpu_to_node, cpu_to_node);
+
+	rc = HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo);
+
+	if (rc < 0)
+		goto vnumaout;
+	if (*numa_topo.nr_nodes == 0) {
+		/* will pass to dummy_numa_init */
+		goto vnumaout;
+	}
+	if (*numa_topo.nr_nodes > num_possible_cpus()) {
+		pr_debug("vNUMA: Node without cpu is not supported in this version.\n");
+		goto vnumaout;
+	}
+	/*
+	 * NUMA nodes memory ranges are in pfns, constructed and
+	 * aligned based on e820 ram domain map.
+	 */
+	for (i = 0; i < *numa_topo.nr_nodes; i++) {
+		if (numa_add_memblk(i, vblock[i].start, vblock[i].end))
+			/* pass to numa_dummy_init */
+			goto vnumaout;
+		node_set(i, numa_nodes_parsed);
+	}
+	setup_nr_node_ids();
+	/* Setting the cpu, apicid to node */
+	for_each_cpu(cpu, cpu_possible_mask) {
+		set_apicid_to_node(cpu, cpu_to_node[cpu]);
+		numa_set_node(cpu, cpu_to_node[cpu]);
+		cpumask_set_cpu(cpu, node_to_cpumask_map[cpu_to_node[cpu]]);
+	}
+	for (i = 0; i < *numa_topo.nr_nodes; i++) {
+		for (j = 0; j < *numa_topo.nr_nodes; j++) {
+			idx = (j * *numa_topo.nr_nodes) + i;
+			numa_set_distance(i, j, *(vdistance + idx));
+		}
+	}
+	rc = 0;
+vnumaout:
+	if (physm)
+		memblock_free(__pa(physm), mem_size);
+	if (physd)
+		memblock_free(__pa(physd), dist_size);
+	if (physc)
+		memblock_free(__pa(physc), cpu_to_node_size);
+	/*
+	 * Set the "dummy" node and exit without error so Linux
+	 * will not try any NUMA init functions which might break
+	 * guests in the future. This will discard all previous
+	 * settings.
+	 */
+	if (rc != 0) {
+		for (i = 0; i < MAX_LOCAL_APIC; i++)
+			set_apicid_to_node(i, NUMA_NO_NODE);
+		nodes_clear(numa_nodes_parsed);
+		nodes_clear(node_possible_map);
+		nodes_clear(node_online_map);
+		node_set(0, numa_nodes_parsed);
+		numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
+	}
+	return 0;
+}
+#endif
diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
index 2ecfe4f..3974e9a 100644
--- a/include/xen/interface/memory.h
+++ b/include/xen/interface/memory.h
@@ -263,4 +263,32 @@ struct xen_remove_from_physmap {
 };
 DEFINE_GUEST_HANDLE_STRUCT(xen_remove_from_physmap);
 
+/* vNUMA structures */
+struct vmemrange {
+	uint64_t start, end;
+	struct vmemrange *next;
+};
+DEFINE_GUEST_HANDLE_STRUCT(vmemrange);
+
+struct vnuma_topology_info {
+	/* OUT */
+	domid_t domid;
+	uint32_t __pad;
+	/* IN */
+	GUEST_HANDLE(uint) nr_nodes; /* number of virtual numa nodes */
+	/* distance table */
+	GUEST_HANDLE(uint) vdistance;
+	/* cpu mapping to vnodes */
+	GUEST_HANDLE(uint) cpu_to_node;
+	/*
+	 * array of numa memory areas constructed by Xen
+	 * where start and end are pfn numbers of the area
+	 * Xen takes into account domains e820 map
+	 */
+	GUEST_HANDLE(vmemrange) vmemblks;
+};
+DEFINE_GUEST_HANDLE_STRUCT(vnuma_topology_info);
+
+#define XENMEM_get_vnuma_info 25
+
 #endif /* __XEN_PUBLIC_MEMORY_H__ */
-- 
1.7.10.4
Dario Faggioli
2013-Nov-14 07:26 UTC
Re: [PATCH 1/2] xen: vnuma support for PV guests running as domU.
On mer, 2013-11-13 at 22:36 -0500, Elena Ufimtseva wrote:
> Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
> ---
> +++ b/arch/x86/include/asm/xen/vnuma.h
> @@ -0,0 +1,12 @@
> +#ifndef _ASM_X86_VNUMA_H
> +#define _ASM_X86_VNUMA_H
> +
> +#ifdef CONFIG_XEN
> +int xen_vnuma_supported(void);
> +int xen_numa_init(void);
> +#else
> +int xen_vnuma_supported(void) { return 0; };
> +int xen_numa_init(void) { return -1; };
> +#endif
> +
> +#endif /* _ASM_X86_VNUMA_H */
> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
>
> @@ -621,6 +622,10 @@ static int __init dummy_numa_init(void)
>  void __init x86_numa_init(void)
>  {
>  	if (!numa_off) {
> +#ifdef CONFIG_XEN
> +		if (xen_vnuma_supported() && !numa_init(xen_numa_init))
> +			return;
> +#endif
>
This looks a bit weird (looking at both the hunks above, I mean). Isn't
it that, if !CONFIG_XEN, xen_vnuma_supported() and xen_numa_init() are
never called, thus there is very little point in defining a specific
behavior for them?

Oh, having looked at the other patch, I appreciate that at least
xen_vnuma_supported() is (possibly) called even when !CONFIG_XEN.
Well, the above still holds for xen_numa_init(), I guess. :-)

> diff --git a/arch/x86/xen/vnuma.c b/arch/x86/xen/vnuma.c
> +int __init xen_numa_init(void)
> +{
> +	mem_size = pcpus * sizeof(struct vmemrange);
> +	dist_size = pcpus * pcpus * sizeof(*numa_topo.vdistance);
> +	cpu_to_node_size = pcpus * sizeof(*numa_topo.cpu_to_node);
> +
> +	physm = memblock_alloc(mem_size, PAGE_SIZE);
> +	vblock = __va(physm);
> +
> +	physd = memblock_alloc(dist_size, PAGE_SIZE);
> +	vdistance = __va(physd);
> +
> +	physc = memblock_alloc(cpu_to_node_size, PAGE_SIZE);
> +	cpu_to_node = __va(physc);
> +
> +	if (!physm || !physc || !physd)
> +		goto vnumaout;
> +
It's probably not a big deal, but I think names like 'out', 'err',
'fail', etc. are just fine for labels... No need for having that sort
of func name prefix.

> +	set_xen_guest_handle(numa_topo.nr_nodes, &nr_nodes);
> +	set_xen_guest_handle(numa_topo.vmemblks, vblock);
> +	set_xen_guest_handle(numa_topo.vdistance, vdistance);
> +	set_xen_guest_handle(numa_topo.cpu_to_node, cpu_to_node);
> +
> +	rc = HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo);
> +
> +	if (rc < 0)
> +		goto vnumaout;
> +	if (*numa_topo.nr_nodes == 0) {
> +		/* will pass to dummy_numa_init */
> +		goto vnumaout;
> +	}
> +	if (*numa_topo.nr_nodes > num_possible_cpus()) {
> +		pr_debug("vNUMA: Node without cpu is not supported in this version.\n");
> +		goto vnumaout;
> +	}
Blank line here?

> +	/*
> +	 * NUMA nodes memory ranges are in pfns, constructed and
> +	 * aligned based on e820 ram domain map.
> +	 */
> +	for (i = 0; i < *numa_topo.nr_nodes; i++) {
> +		if (numa_add_memblk(i, vblock[i].start, vblock[i].end))
> +			/* pass to numa_dummy_init */
> +			goto vnumaout;
> +		node_set(i, numa_nodes_parsed);
> +	}
And here...

> +	setup_nr_node_ids();
...And perhaps even here (but it's less important, I think)...

> +	/* Setting the cpu, apicid to node */
> +	for_each_cpu(cpu, cpu_possible_mask) {
> +		set_apicid_to_node(cpu, cpu_to_node[cpu]);
> +		numa_set_node(cpu, cpu_to_node[cpu]);
> +		cpumask_set_cpu(cpu, node_to_cpumask_map[cpu_to_node[cpu]]);
> +	}
...And here...

> +	for (i = 0; i < *numa_topo.nr_nodes; i++) {
> +		for (j = 0; j < *numa_topo.nr_nodes; j++) {
> +			idx = (j * *numa_topo.nr_nodes) + i;
> +			numa_set_distance(i, j, *(vdistance + idx));
> +		}
> +	}
...And probably here too...

> +	rc = 0;
> +vnumaout:
> +	if (physm)
> +		memblock_free(__pa(physm), mem_size);
> +	if (physd)
> +		memblock_free(__pa(physd), dist_size);
> +	if (physc)
> +		memblock_free(__pa(physc), cpu_to_node_size);
...And here.

> +	/*
> +	 * Set the "dummy" node and exit without error so Linux
> +	 * will not try any NUMA init functions which might break
> +	 * guests in the future. This will discard all previous
> +	 * settings.
> +	 */
IIRC, it's more something that was already happening (the breakage, I
mean), than a "safety net" for the unforeseeable future. Might be worth
giving some context about it, perhaps referencing the email thread or
the git commit hash in the comment.

> +	if (rc != 0) {
> +		for (i = 0; i < MAX_LOCAL_APIC; i++)
> +			set_apicid_to_node(i, NUMA_NO_NODE);
> +		nodes_clear(numa_nodes_parsed);
> +		nodes_clear(node_possible_map);
> +		nodes_clear(node_online_map);
> +		node_set(0, numa_nodes_parsed);
> +		numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
> +	}
> +	return 0;
>
Ok, so, we always return 'success', as we were saying during last round.
However, we do not call dummy_numa_init() directly, and instead we do
all this stuff, with the last two statements being exactly what
dummy_numa_init() does. Reason is linking, i.e., the fact that
dummy_numa_init() is not exported and you can't reach it from here,
right?

Personally, I think this is fine, but I'd probably:
 - say something about it in a comment (the fact that you're
   cut'n'pasting the dummy_numa_init() code);
 - pick up the printk (or at least something like that) from there too.

Thanks and Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
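For reference, the dummy_numa_init() fallback Dario refers to lives in
arch/x86/mm/numa.c. In kernels of this vintage it reads roughly as
follows (paraphrased from the tree; the exact printk wording may differ
slightly):

static int __init dummy_numa_init(void)
{
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
	printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
	       0LLU, PFN_PHYS(max_pfn) - 1);

	/* Register a single node covering all of memory. */
	node_set(0, numa_nodes_parsed);
	numa_add_memblk(0, 0, PFN_PHYS(max_pfn));

	return 0;
}

The last two statements are the ones duplicated in the patch's fallback
path; the printk is what Dario suggests picking up as well.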
David Vrabel
2013-Nov-14 11:18 UTC
Re: [PATCH 1/2] xen: vnuma support for PV guests running as domU.
On 14/11/13 03:36, Elena Ufimtseva wrote:
> Issues Xen hypercall subop XENMEM_get_vnumainfo and sets the
> NUMA topology, otherwise sets dummy NUMA node and prevents
> numa_init from calling other numa initializators as they may
> break other guests.

"break other guests" doesn't seem correct to me. "...which prevents
numa_init() from calling other hardware-specific initializers (which do
not work in PV guests)." reads better I think.

> --- /dev/null
> +++ b/arch/x86/include/asm/xen/vnuma.h
> @@ -0,0 +1,12 @@
> +#ifndef _ASM_X86_VNUMA_H
> +#define _ASM_X86_VNUMA_H
> +
> +#ifdef CONFIG_XEN
> +int xen_vnuma_supported(void);
> +int xen_numa_init(void);
> +#else
> +int xen_vnuma_supported(void) { return 0; };

Return bool.

> +int xen_numa_init(void) { return -1; };

I don't think you need this stub.

> +#endif
> +
> +#endif /* _ASM_X86_VNUMA_H */
> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> index 8bf93ba..c8a61dc 100644
> --- a/arch/x86/mm/numa.c
> +++ b/arch/x86/mm/numa.c
> @@ -19,6 +19,7 @@
>  #include <asm/amd_nb.h>

#include <asm/xen/vnuma.h> here..

>  #include "numa_internal.h"
> +#include "asm/xen/vnuma.h"

...not here.

>  int __initdata numa_off;
>  nodemask_t numa_nodes_parsed __initdata;
> @@ -621,6 +622,10 @@ static int __init dummy_numa_init(void)
>  void __init x86_numa_init(void)
>  {
>  	if (!numa_off) {
> +#ifdef CONFIG_XEN
> +		if (xen_vnuma_supported() && !numa_init(xen_numa_init))
> +			return;
> +#endif

I would put the xen_vnuma_supported() call into xen_numa_init().

>  #ifdef CONFIG_X86_NUMAQ
>  		if (!numa_init(numaq_numa_init))
>  			return;
> diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
> index 96ab2c0..de9deab 100644
> --- a/arch/x86/xen/Makefile
> +++ b/arch/x86/xen/Makefile
> @@ -13,7 +13,7 @@ CFLAGS_mmu.o		:= $(nostackp)
>  obj-y		:= enlighten.o setup.o multicalls.o mmu.o irq.o \
>  			time.o xen-asm.o xen-asm_$(BITS).o \
>  			grant-table.o suspend.o platform-pci-unplug.o \
> -			p2m.o
> +			p2m.o vnuma.o

obj-$(CONFIG_NUMA) += vnuma.o

Then you can remove the #ifdef CONFIG_NUMA from xen/vnuma.c

> --- a/include/xen/interface/memory.h
> +++ b/include/xen/interface/memory.h
> @@ -263,4 +263,32 @@ struct xen_remove_from_physmap {
>  };
>  DEFINE_GUEST_HANDLE_STRUCT(xen_remove_from_physmap);
>
> +/* vNUMA structures */
> +struct vmemrange {
> +	uint64_t start, end;
> +	struct vmemrange *next;

A pointer in an ABI structure looks wrong.

> +};
> +DEFINE_GUEST_HANDLE_STRUCT(vmemrange);
> +
> +struct vnuma_topology_info {
> +	/* OUT */
> +	domid_t domid;
> +	uint32_t __pad;
> +	/* IN */
> +	GUEST_HANDLE(uint) nr_nodes; /* number of virtual numa nodes */
> +	/* distance table */
> +	GUEST_HANDLE(uint) vdistance;
> +	/* cpu mapping to vnodes */
> +	GUEST_HANDLE(uint) cpu_to_node;
> +	/*
> +	 * array of numa memory areas constructed by Xen
> +	 * where start and end are pfn numbers of the area
> +	 * Xen takes into account domains e820 map
> +	 */
> +	GUEST_HANDLE(vmemrange) vmemblks;
> +};

The structure has different size on 32-bit and 64-bit x86 guests. Is
this intentional? The __pad field suggests not.

David
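Folding David's header comments together, a revised vnuma.h might look
like the sketch below (hypothetical, not a posted revision; dropping the
!CONFIG_XEN xen_numa_init() stub assumes the call site keeps its #ifdef
CONFIG_XEN guard):

/* arch/x86/include/asm/xen/vnuma.h -- sketch of a possible v2 */
#ifndef _ASM_X86_VNUMA_H
#define _ASM_X86_VNUMA_H

#include <linux/types.h>

#ifdef CONFIG_XEN
bool xen_vnuma_supported(void);
int xen_numa_init(void);
#else
/* Keep generic code compiling when Xen support is configured out. */
static inline bool xen_vnuma_supported(void) { return false; }
#endif

#endif /* _ASM_X86_VNUMA_H */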
David Vrabel
2013-Nov-14 11:21 UTC
Re: [PATCH 1/2] xen: vnuma support for PV guests running as domU.
On 14/11/13 07:26, Dario Faggioli wrote:
> On mer, 2013-11-13 at 22:36 -0500, Elena Ufimtseva wrote:
>> +	/*
>> +	 * Set the "dummy" node and exit without error so Linux
>> +	 * will not try any NUMA init functions which might break
>> +	 * guests in the future. This will discard all previous
>> +	 * settings.
>> +	 */
> IIRC, it's more something that was already happening (the breakage, I
> mean), than a "safety net" for the unforeseeable future. Might be worth
> giving some context about it, perhaps referencing the email thread or
> the git commit hash in the comment.

Yes, a comment like:

/*
 * Set a dummy node and return success. This prevents calling any
 * hardware-specific initializers which do not work in a PV guest.
 */

is better. No need to refer to any specific threads. It's pretty clear
that any hardware-specific init isn't appropriate for a PV guest.

>> +	if (rc != 0) {
>> +		for (i = 0; i < MAX_LOCAL_APIC; i++)
>> +			set_apicid_to_node(i, NUMA_NO_NODE);
>> +		nodes_clear(numa_nodes_parsed);
>> +		nodes_clear(node_possible_map);
>> +		nodes_clear(node_online_map);
>> +		node_set(0, numa_nodes_parsed);
>> +		numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
>> +	}
>> +	return 0;
>>
> Ok, so, we always return 'success', as we were saying during last round.
> However, we do not call dummy_numa_init() directly, and instead we do
> all this stuff, with the last two statements being exactly what
> dummy_numa_init() does. Reason is linking, i.e., the fact that
> dummy_numa_init() is not exported and you can't reach it from here,
> right?

I think this bit is fine as-is.

David
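With David's wording folded into the patch, the fallback block would
presumably end up looking like this (a sketch that simply combines the
quoted code with the suggested comment):

	if (rc != 0) {
		/*
		 * Set a dummy node and return success. This prevents
		 * calling any hardware-specific initializers which do
		 * not work in a PV guest.
		 */
		for (i = 0; i < MAX_LOCAL_APIC; i++)
			set_apicid_to_node(i, NUMA_NO_NODE);
		nodes_clear(numa_nodes_parsed);
		nodes_clear(node_possible_map);
		nodes_clear(node_online_map);
		node_set(0, numa_nodes_parsed);
		numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
	}
	return 0;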
Dario Faggioli
2013-Nov-14 11:48 UTC
Re: [PATCH 1/2] xen: vnuma support for PV guests running as domU.
On gio, 2013-11-14 at 11:21 +0000, David Vrabel wrote:
> On 14/11/13 07:26, Dario Faggioli wrote:
> > IIRC, it's more something that was already happening (the breakage, I
> > mean), than a "safety net" for the unforeseeable future. Might be worth
> > giving some context about it, perhaps referencing the email thread or
> > the git commit hash in the comment.
>
> Yes, a comment like:
>
> /*
>  * Set a dummy node and return success. This prevents calling any
>  * hardware-specific initializers which do not work in a PV guest.
>  */
>
> is better. No need to refer to any specific threads. It's pretty clear
> that any hardware-specific init isn't appropriate for a PV guest.
>
Ok.

> >> +	if (rc != 0) {
> >> +		for (i = 0; i < MAX_LOCAL_APIC; i++)
> >> +			set_apicid_to_node(i, NUMA_NO_NODE);
> >> +		nodes_clear(numa_nodes_parsed);
> >> +		nodes_clear(node_possible_map);
> >> +		nodes_clear(node_online_map);
> >> +		node_set(0, numa_nodes_parsed);
> >> +		numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
> >> +	}
> >> +	return 0;
> >>
> > Ok, so, we always return 'success', as we were saying during last round.
> > However, we do not call dummy_numa_init() directly, and instead we do
> > all this stuff, with the last two statements being exactly what
> > dummy_numa_init() does. Reason is linking, i.e., the fact that
> > dummy_numa_init() is not exported and you can't reach it from here,
> > right?
>
> I think this bit is fine as-is.
>
Ok, cool. :-) Shouldn't we then kill or reformulate the comments where
dummy_numa_init is explicitly referenced?

E.g., this one: /* will pass to dummy_numa_init */

It might be me, but I find it rather confusing. After seeing that, I'd
expect to see that, at some point, either the function returns failure
(which of course we don't want), or a direct call to dummy_numa_init().

Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
Stefano Stabellini
2013-Nov-14 14:11 UTC
Re: [PATCH 1/2] xen: vnuma support for PV guests running as domU.
On Wed, 13 Nov 2013, Elena Ufimtseva wrote:
> Issues Xen hypercall subop XENMEM_get_vnumainfo and sets the
> NUMA topology, otherwise sets dummy NUMA node and prevents
> numa_init from calling other numa initializators as they may
> break other guests.
>
> Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
> ---
>  arch/x86/include/asm/xen/vnuma.h |   12 ++++
>  arch/x86/mm/numa.c               |    5 ++
>  arch/x86/xen/Makefile            |    2 +-
>  arch/x86/xen/vnuma.c             |  119 ++++++++++++++++++++++++++++++++++++++
>  include/xen/interface/memory.h   |   28 +++++++++
>  5 files changed, 165 insertions(+), 1 deletion(-)
>  create mode 100644 arch/x86/include/asm/xen/vnuma.h
>  create mode 100644 arch/x86/xen/vnuma.c
>
> diff --git a/arch/x86/include/asm/xen/vnuma.h b/arch/x86/include/asm/xen/vnuma.h
> new file mode 100644
> index 0000000..1ba1e06
> --- /dev/null
> +++ b/arch/x86/include/asm/xen/vnuma.h
> @@ -0,0 +1,12 @@
> +#ifndef _ASM_X86_VNUMA_H
> +#define _ASM_X86_VNUMA_H
> +
> +#ifdef CONFIG_XEN
> +int xen_vnuma_supported(void);
> +int xen_numa_init(void);
> +#else
> +int xen_vnuma_supported(void) { return 0; };
> +int xen_numa_init(void) { return -1; };

static inline?

> +#endif
>
> +#endif /* _ASM_X86_VNUMA_H */
> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> index 8bf93ba..c8a61dc 100644
> --- a/arch/x86/mm/numa.c
> +++ b/arch/x86/mm/numa.c
> @@ -19,6 +19,7 @@
>  #include <asm/amd_nb.h>
>
>  #include "numa_internal.h"
> +#include "asm/xen/vnuma.h"
>
>  int __initdata numa_off;
>  nodemask_t numa_nodes_parsed __initdata;
> @@ -621,6 +622,10 @@ static int __init dummy_numa_init(void)
>  void __init x86_numa_init(void)
>  {
>  	if (!numa_off) {
> +#ifdef CONFIG_XEN
> +		if (xen_vnuma_supported() && !numa_init(xen_numa_init))
> +			return;
> +#endif

Given the non-Xen function definitions above, you can remove the ifdef
CONFIG_XEN here.

>  #ifdef CONFIG_X86_NUMAQ
>  		if (!numa_init(numaq_numa_init))
>  			return;
> diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
> index 96ab2c0..de9deab 100644
> --- a/arch/x86/xen/Makefile
> +++ b/arch/x86/xen/Makefile
> @@ -13,7 +13,7 @@ CFLAGS_mmu.o		:= $(nostackp)
>  obj-y		:= enlighten.o setup.o multicalls.o mmu.o irq.o \
>  			time.o xen-asm.o xen-asm_$(BITS).o \
>  			grant-table.o suspend.o platform-pci-unplug.o \
> -			p2m.o
> +			p2m.o vnuma.o
>
>  obj-$(CONFIG_EVENT_TRACING) += trace.o
>
> diff --git a/arch/x86/xen/vnuma.c b/arch/x86/xen/vnuma.c
> new file mode 100644
> index 0000000..b4fc667
> --- /dev/null
> +++ b/arch/x86/xen/vnuma.c
> @@ -0,0 +1,119 @@
> +#include <linux/err.h>
> +#include <linux/memblock.h>
> +#include <xen/interface/xen.h>
> +#include <xen/interface/memory.h>
> +#include <asm/xen/interface.h>
> +#include <asm/xen/hypercall.h>
> +#include <asm/xen/vnuma.h>
> +
> +#ifdef CONFIG_NUMA
> +
> +/* Checks if hypercall is suported */
                              ^ supported

> +int xen_vnuma_supported()
> +{
> +	return HYPERVISOR_memory_op(XENMEM_get_vnuma_info, NULL) == -ENOSYS ?
> +		0 : 1;
> +}
> +
> +int __init xen_numa_init(void)
> +{
> +	int rc;
> +	unsigned int i, j, nr_nodes, cpu, idx, pcpus;
> +	u64 physm, physd, physc;
> +	unsigned int *vdistance, *cpu_to_node;
> +	unsigned long mem_size, dist_size, cpu_to_node_size;

physm, physd and physc need to be initialized to 0, otherwise the
vnumaout error path below is erroneous

> +	struct vmemrange *vblock;
> +
> +	struct vnuma_topology_info numa_topo = {
> +		.domid = DOMID_SELF,
> +		.__pad = 0
> +	};
> +	rc = -EINVAL;
> +
> +	/* For now only PV guests are supported */
> +	if (!xen_pv_domain())
> +		return rc;
> +
> +	pcpus = num_possible_cpus();
> +
> +	mem_size = pcpus * sizeof(struct vmemrange);
> +	dist_size = pcpus * pcpus * sizeof(*numa_topo.vdistance);
> +	cpu_to_node_size = pcpus * sizeof(*numa_topo.cpu_to_node);
> +
> +	physm = memblock_alloc(mem_size, PAGE_SIZE);
> +	vblock = __va(physm);
> +
> +	physd = memblock_alloc(dist_size, PAGE_SIZE);
> +	vdistance = __va(physd);
> +
> +	physc = memblock_alloc(cpu_to_node_size, PAGE_SIZE);
> +	cpu_to_node = __va(physc);
> +
> +	if (!physm || !physc || !physd)
> +		goto vnumaout;
> +
> +	set_xen_guest_handle(numa_topo.nr_nodes, &nr_nodes);
> +	set_xen_guest_handle(numa_topo.vmemblks, vblock);
> +	set_xen_guest_handle(numa_topo.vdistance, vdistance);
> +	set_xen_guest_handle(numa_topo.cpu_to_node, cpu_to_node);
> +
> +	rc = HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo);
> +
> +	if (rc < 0)
> +		goto vnumaout;
> +	if (*numa_topo.nr_nodes == 0) {
> +		/* will pass to dummy_numa_init */
> +		goto vnumaout;
> +	}
> +	if (*numa_topo.nr_nodes > num_possible_cpus()) {
> +		pr_debug("vNUMA: Node without cpu is not supported in this version.\n");
> +		goto vnumaout;
> +	}
> +	/*
> +	 * NUMA nodes memory ranges are in pfns, constructed and
> +	 * aligned based on e820 ram domain map.
> +	 */
> +	for (i = 0; i < *numa_topo.nr_nodes; i++) {
> +		if (numa_add_memblk(i, vblock[i].start, vblock[i].end))
> +			/* pass to numa_dummy_init */
> +			goto vnumaout;
> +		node_set(i, numa_nodes_parsed);
> +	}
> +	setup_nr_node_ids();
> +	/* Setting the cpu, apicid to node */
> +	for_each_cpu(cpu, cpu_possible_mask) {
> +		set_apicid_to_node(cpu, cpu_to_node[cpu]);
> +		numa_set_node(cpu, cpu_to_node[cpu]);
> +		cpumask_set_cpu(cpu, node_to_cpumask_map[cpu_to_node[cpu]]);
> +	}
> +	for (i = 0; i < *numa_topo.nr_nodes; i++) {
> +		for (j = 0; j < *numa_topo.nr_nodes; j++) {
> +			idx = (j * *numa_topo.nr_nodes) + i;
> +			numa_set_distance(i, j, *(vdistance + idx));
> +		}
> +	}
> +	rc = 0;
> +vnumaout:
> +	if (physm)
> +		memblock_free(__pa(physm), mem_size);
> +	if (physd)
> +		memblock_free(__pa(physd), dist_size);
> +	if (physc)
> +		memblock_free(__pa(physc), cpu_to_node_size);
> +	/*
> +	 * Set the "dummy" node and exit without error so Linux
> +	 * will not try any NUMA init functions which might break
> +	 * guests in the future. This will discard all previous
> +	 * settings.
> +	 */
> +	if (rc != 0) {
> +		for (i = 0; i < MAX_LOCAL_APIC; i++)
> +			set_apicid_to_node(i, NUMA_NO_NODE);
> +		nodes_clear(numa_nodes_parsed);
> +		nodes_clear(node_possible_map);
> +		nodes_clear(node_online_map);
> +		node_set(0, numa_nodes_parsed);
> +		numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
> +	}
> +	return 0;
> +}
> +#endif
> diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
> index 2ecfe4f..3974e9a 100644
> --- a/include/xen/interface/memory.h
> +++ b/include/xen/interface/memory.h
> @@ -263,4 +263,32 @@ struct xen_remove_from_physmap {
>  };
>  DEFINE_GUEST_HANDLE_STRUCT(xen_remove_from_physmap);
>
> +/* vNUMA structures */
> +struct vmemrange {
> +	uint64_t start, end;
> +	struct vmemrange *next;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(vmemrange);
> +
> +struct vnuma_topology_info {
> +	/* OUT */
> +	domid_t domid;
> +	uint32_t __pad;
> +	/* IN */
> +	GUEST_HANDLE(uint) nr_nodes; /* number of virtual numa nodes */
> +	/* distance table */
> +	GUEST_HANDLE(uint) vdistance;
> +	/* cpu mapping to vnodes */
> +	GUEST_HANDLE(uint) cpu_to_node;
> +	/*
> +	 * array of numa memory areas constructed by Xen
> +	 * where start and end are pfn numbers of the area
> +	 * Xen takes into account domains e820 map
> +	 */
> +	GUEST_HANDLE(vmemrange) vmemblks;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(vnuma_topology_info);
> +
> +#define XENMEM_get_vnuma_info 25
> +
>  #endif /* __XEN_PUBLIC_MEMORY_H__ */
> --
> 1.7.10.4
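A minimal sketch of the initialization fix Stefano asks for, assuming
the rest of xen_numa_init() stays as posted (zero-initialize the
physical addresses, and translate them only after the allocation check,
which also avoids applying __va() to a failed allocation):

	u64 physm = 0, physd = 0, physc = 0;

	/* ... */

	physm = memblock_alloc(mem_size, PAGE_SIZE);
	physd = memblock_alloc(dist_size, PAGE_SIZE);
	physc = memblock_alloc(cpu_to_node_size, PAGE_SIZE);
	if (!physm || !physd || !physc)
		goto vnumaout;	/* zeroed values keep the frees below safe */

	vblock = __va(physm);
	vdistance = __va(physd);
	cpu_to_node = __va(physc);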
Elena Ufimtseva
2013-Nov-14 14:11 UTC
Re: [PATCH 1/2] xen: vnuma support for PV guests running as domU.
On Thu, Nov 14, 2013 at 6:48 AM, Dario Faggioli
<dario.faggioli@citrix.com> wrote:
> On gio, 2013-11-14 at 11:21 +0000, David Vrabel wrote:
>> On 14/11/13 07:26, Dario Faggioli wrote:
>> > IIRC, it's more something that was already happening (the breakage, I
>> > mean), than a "safety net" for the unforeseeable future. Might be worth
>> > giving some context about it, perhaps referencing the email thread or
>> > the git commit hash in the comment.
>>
>> Yes, a comment like:
>>
>> /*
>>  * Set a dummy node and return success. This prevents calling any
>>  * hardware-specific initializers which do not work in a PV guest.
>>  */
>>
>> is better. No need to refer to any specific threads. It's pretty clear
>> that any hardware-specific init isn't appropriate for a PV guest.
>>
> Ok.
>
>> >> +	if (rc != 0) {
>> >> +		for (i = 0; i < MAX_LOCAL_APIC; i++)
>> >> +			set_apicid_to_node(i, NUMA_NO_NODE);
>> >> +		nodes_clear(numa_nodes_parsed);
>> >> +		nodes_clear(node_possible_map);
>> >> +		nodes_clear(node_online_map);
>> >> +		node_set(0, numa_nodes_parsed);
>> >> +		numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
>> >> +	}
>> >> +	return 0;
>> >>
>> > Ok, so, we always return 'success', as we were saying during last round.
>> > However, we do not call dummy_numa_init() directly, and instead we do
>> > all this stuff, with the last two statements being exactly what
>> > dummy_numa_init() does. Reason is linking, i.e., the fact that
>> > dummy_numa_init() is not exported and you can't reach it from here,
>> > right?

Ah, my bad, I left these comments and they don't make sense :)

>>
>> I think this bit is fine as-is.
>>
> Ok, cool. :-) Shouldn't we then kill or reformulate the comments where
> dummy_numa_init is explicitly referenced?
>
> E.g., this one: /* will pass to dummy_numa_init */
>
> It might be me, but I find it rather confusing. After seeing that, I'd
> expect to see that, at some point, either the function returns failure
> (which of course we don't want), or a direct call to dummy_numa_init().
>
> Dario
>
> --
> <<This happens because I choose it to happen!>> (Raistlin Majere)
> -----------------------------------------------------------------
> Dario Faggioli, Ph.D, http://about.me/dario.faggioli
> Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
>

-- 
Elena