Elena Ufimtseva
2013-Aug-27 08:52 UTC
[PATCH RFC 0/2] linux/vnuma: vnuma topology PV guest
Enables PV guest to discover NUMA topology provided by XEN
and inits this topology on boot.
Xen provides numa memblocks aligned for guest, distance table,
cpu setup per node.
On failure defaults to dummy linux numa init.

TODO:
Change XENMEM hypercall subop to SYSCTL.

Elena Ufimtseva (2):
  vNUMA PV Guest Linux support
  Enables NUMA for PV guest

 arch/x86/include/asm/xen/vnuma-xen.h |   36 +++++++++++
 arch/x86/mm/numa.c                   |    8 +++
 arch/x86/xen/enlighten.c             |  115 ++++++++++++++++++++++++++++++++++
 arch/x86/xen/setup.c                 |    2 +-
 include/xen/interface/memory.h       |    1 +
 5 files changed, 161 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/include/asm/xen/vnuma-xen.h

-- 
1.7.10.4
Elena Ufimtseva
2013-Aug-27 08:52 UTC
[PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
Uses subop hypercall to request XEN about vnuma topology. Sets the memory blocks (aligned by XEN), cpus, distance table on boot. NUMA support should be compiled in kernel. NUMA support should be compiled in kernel. Memory blocks are aligned by xen. Xen is aware of guest initial RAM layout. If the memory blocks are incorrect, call for default linux numa dummy init. Requires XEN with vNUMA support. Applies to 3.10 git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen.git latest commit: 8bb495e3f02401ee6f76d1b1d77f3ac9f079e376 TODO: Change subop from XENMEM to SYSCTL. Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com> --- arch/x86/include/asm/xen/vnuma-xen.h | 36 +++++++++++ arch/x86/mm/numa.c | 8 +++ arch/x86/xen/enlighten.c | 115 ++++++++++++++++++++++++++++++++++ include/xen/interface/memory.h | 1 + 4 files changed, 160 insertions(+) create mode 100644 arch/x86/include/asm/xen/vnuma-xen.h diff --git a/arch/x86/include/asm/xen/vnuma-xen.h b/arch/x86/include/asm/xen/vnuma-xen.h new file mode 100644 index 0000000..73c1bde --- /dev/null +++ b/arch/x86/include/asm/xen/vnuma-xen.h @@ -0,0 +1,36 @@ +#ifndef _ASM_X86_VNUMA_XEN_H +#define _ASM_X86_VNUMA_XEN_H + +#ifdef CONFIG_XEN +int __initdata xen_numa_init(void); + +struct vnuma_memblk { + uint64_t start, end; +}; +DEFINE_GUEST_HANDLE_STRUCT(vnuma_memblk); + +struct vnuma_topology_info { + domid_t domid; + uint16_t nr_nodes; + GUEST_HANDLE(vnuma_memblk) vmemblk; + GUEST_HANDLE(int) vdistance; + GUEST_HANDLE(int) cpu_to_node; + GUEST_HANDLE(int) vnode_to_pnode; +}; +DEFINE_GUEST_HANDLE_STRUCT(vnuma_topology_info); + +struct xen_domctl { + uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */ + domid_t domain; + union { + struct vnuma_topology_info vnuma; + } u; +}; +typedef struct xen_domctl xen_domctl_t; +DEFINE_GUEST_HANDLE_STRUCT(xen_domctl_t); + +#else +int __init xen_numa_init(void) {} +#endif + +#endif /* _ASM_X86_VNUMA_XEN_H */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 8bf93ba..3ec7855 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -20,6 +20,10 @@ #include "numa_internal.h" +#ifdef CONFIG_XEN +#include "asm/xen/vnuma-xen.h" +#endif + int __initdata numa_off; nodemask_t numa_nodes_parsed __initdata; @@ -621,6 +625,10 @@ static int __init dummy_numa_init(void) void __init x86_numa_init(void) { if (!numa_off) { +#ifdef CONFIG_XEN + if (xen_pv_domain() && !numa_init(xen_numa_init)) + return; +#endif #ifdef CONFIG_X86_NUMAQ if (!numa_init(numaq_numa_init)) return; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 193097e..4658e9d 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -33,6 +33,7 @@ #include <linux/memblock.h> #include <linux/edd.h> +#include <asm/numa.h> #include <xen/xen.h> #include <xen/events.h> #include <xen/interface/xen.h> @@ -69,6 +70,7 @@ #include <asm/mwait.h> #include <asm/pci_x86.h> #include <asm/pat.h> +#include <asm/xen/vnuma-xen.h> #ifdef CONFIG_ACPI #include <linux/acpi.h> @@ -1750,4 +1752,117 @@ const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = { .x2apic_available = xen_x2apic_para_available, }; EXPORT_SYMBOL(x86_hyper_xen_hvm); + +/* Xen PV NUMA topology initialization */ +int __initdata xen_numa_init(void) +{ + struct vnuma_memblk *vblk; + struct vnuma_topology_info numa_topo; + uint64_t size, start, end; + int i, j, cpu, rc; + u64 phys, physd, physc; + u64 new_end_pfn; + size_t mem_size; + + int *vdistance, *cpu_to_node; + unsigned long dist_size, cpu_to_node_size; + numa_topo.domid = DOMID_SELF; + rc = -EINVAL; + /* 
block mem reservation for memblks */ + mem_size = num_possible_cpus() * sizeof(struct vnuma_memblk); + phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), mem_size, PAGE_SIZE); + if (!phys) { + pr_info("vNUMA: can''t allocate memory for membloks size %zu\n", mem_size); + rc = -ENOMEM; + goto errout; + } + if (memblock_reserve(phys, mem_size) < 0) { + pr_info("vNUMA: Cannot reserver mem for blocks of size %zu \n", mem_size); + rc = -ENOMEM; + goto errout; + } + vblk = __va(phys); + set_xen_guest_handle(numa_topo.vmemblk, vblk); + + dist_size = num_possible_cpus() * num_possible_cpus() * sizeof(*numa_topo.vdistance); + physd = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), dist_size, PAGE_SIZE); + if (!physd) { + pr_info("vNUMA: can''t allocate memory for distance size %zu\n", dist_size); + rc = -ENOMEM; + goto errout; + } + if (memblock_reserve(physd, dist_size) < 0) { + pr_info("vNUMA: Cannot reserver mem for blocks of size %zu \n", dist_size); + rc = -ENOMEM; + goto errout; + } + vdistance = __va(physd); + set_xen_guest_handle(numa_topo.vdistance, (int *)vdistance); + + /* allocate memblock for vcpu to node mask of max size */ + cpu_to_node_size = num_possible_cpus() * sizeof(*numa_topo.cpu_to_node); + physc = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), cpu_to_node_size, PAGE_SIZE); + if (!physc) { + pr_info("vNUMA: can''t allocate memory for distance size %zu\n", cpu_to_node_size); + rc = -ENOMEM; + goto errout; + } + if (memblock_reserve(physc, cpu_to_node_size) < 0) { + pr_info("vNUMA: Cannot reserver mem for blocks of size %zu\n", cpu_to_node_size); + goto errout; + } + cpu_to_node = __va(physc); + set_xen_guest_handle(numa_topo.cpu_to_node, (int *)cpu_to_node); + if (HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo) < 0) + goto errout; + if (numa_topo.nr_nodes == 0) + /* Pass to DUMMY numa */ + goto errout; + if (numa_topo.nr_nodes > num_possible_cpus()) { + pr_info("vNUMA: Node without cpu is not supported, nodes = %d\n", numa_topo.nr_nodes); + goto errout; + } + new_end_pfn = 0; + /* We have sizes in pages received from hypervisor, no holes and ordered */ + for (i = 0; i < numa_topo.nr_nodes; i++) { + start = vblk[i].start; + end = vblk[i].end; + size = end - start; + pr_info("VNMA blk[%d] size is %#010Lx start = %#010Lx end = %#010Lx\n", + i, size, start, end); + if (numa_add_memblk(i, start, end)) { + pr_info("vNUMA: Could not add memblk[%d]\n", i); + rc = -EAGAIN; + goto errout; + } + node_set(i, numa_nodes_parsed); + } + setup_nr_node_ids(); + /* Setting the cpu, apicid to node */ + for_each_cpu(cpu, cpu_possible_mask) { + pr_info("Setting up vcpu[%d] to node[%d]\n", cpu, cpu_to_node[cpu]); + set_apicid_to_node(cpu, cpu_to_node[cpu]); + numa_set_node(cpu, cpu_to_node[cpu]); + __apicid_to_node[cpu] = cpu_to_node[cpu]; + cpumask_set_cpu(cpu, node_to_cpumask_map[cpu_to_node[cpu]]); + } + setup_nr_node_ids(); + rc = 0; + for (i = 0; i < numa_topo.nr_nodes; i++) + for (j = 0; j < numa_topo.nr_nodes; j++) { + numa_set_distance(i, j, *(vdistance + j*numa_topo.nr_nodes + i)); + pr_info("distance %d <-> %d = %d\n", i, j, *(vdistance + j*numa_topo.nr_nodes + i)); + } +errout: + if (phys) + memblock_free(__pa(phys), mem_size); + if (physd) + memblock_free(__pa(physd), dist_size); + if (physc) + memblock_free(__pa(physc), cpu_to_node_size); + if (rc) + pr_debug("XEN: hypercall failed to get vNUMA topology.\n"); + return rc; +} + #endif diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h index 2ecfe4f..16b8d87 100644 --- 
a/include/xen/interface/memory.h +++ b/include/xen/interface/memory.h @@ -263,4 +263,5 @@ struct xen_remove_from_physmap { }; DEFINE_GUEST_HANDLE_STRUCT(xen_remove_from_physmap); +#define XENMEM_get_vnuma_info 25 #endif /* __XEN_PUBLIC_MEMORY_H__ */ -- 1.7.10.4
Elena Ufimtseva
2013-Aug-27 08:53 UTC
[PATCH RFC 2/2] linux/vnuma: Enables NUMA support for PV guest
Enables NUMA support for PV guest and uses vNUMA
interface to discover topology provided by XEN.

Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
---
 arch/x86/xen/setup.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 056d11f..7e0c93b 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -561,6 +561,6 @@ void __init xen_arch_setup(void)
 	WARN_ON(xen_set_default_idle());
 	fiddle_vdso();
 #ifdef CONFIG_NUMA
-	numa_off = 1;
+	numa_off = 0;
 #endif
 }
-- 
1.7.10.4
David Vrabel
2013-Aug-27 13:35 UTC
Re: [PATCH RFC 2/2] linux/vnuma: Enables NUMA support for PV guest
On 27/08/13 09:53, Elena Ufimtseva wrote:
> Enables NUMA support for PV guest and uses vNUMA
> interface to discover topology provided by XEN.
>
> Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
> ---
>  arch/x86/xen/setup.c |    2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
> index 056d11f..7e0c93b 100644
> --- a/arch/x86/xen/setup.c
> +++ b/arch/x86/xen/setup.c
> @@ -561,6 +561,6 @@ void __init xen_arch_setup(void)
>  	WARN_ON(xen_set_default_idle());
>  	fiddle_vdso();
>  #ifdef CONFIG_NUMA
> -	numa_off = 1;
> +	numa_off = 0;
>  #endif
>  }

Hmm. This is either safe to turn on now or it will break on hosts
without vNUMA support as it will cause a fallback to try all the
hardware specific NUMA initialization.

Which is it?

David
David Vrabel
2013-Aug-27 13:48 UTC
Re: [PATCH RFC 0/2] linux/vnuma: vnuma topology PV guest
On 27/08/13 09:52, Elena Ufimtseva wrote:
> Enables PV guest to discover NUMA topology provided by XEN
> and inits this topology on boot.
> Xen provides numa memblocks aligned for guest, distance table,
> cpu setup per node.
> On failure defaults to dummy linux numa init.
>
> TODO:
> Change XENMEM hypercall subop to SYSCTL.

Please Cc the Xen maintainers in future: Konrad, Boris and myself.

You will also need to Cc the x86 maintainers when it's no longer an RFC.

David

>
> Elena Ufimtseva (2):
>   vNUMA PV Guest Linux support
>   Enables NUMA for PV guest
>
>  arch/x86/include/asm/xen/vnuma-xen.h |   36 +++++++++++
>  arch/x86/mm/numa.c                   |    8 +++
>  arch/x86/xen/enlighten.c             |  115 ++++++++++++++++++++++++++++++++++
>  arch/x86/xen/setup.c                 |    2 +-
>  include/xen/interface/memory.h       |    1 +
>  5 files changed, 161 insertions(+), 1 deletion(-)
>  create mode 100644 arch/x86/include/asm/xen/vnuma-xen.h
>
David Vrabel
2013-Aug-27 13:52 UTC
Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
On 27/08/13 09:52, Elena Ufimtseva wrote:> Uses subop hypercall to request XEN about vnuma topology. > Sets the memory blocks (aligned by XEN), cpus, distance table > on boot. NUMA support should be compiled in kernel. > > NUMA support should be compiled in kernel. > > Memory blocks are aligned by xen. Xen is aware of guest initial > RAM layout. > If the memory blocks are incorrect, call for default linux numa > dummy init. > > Requires XEN with vNUMA support. > > Applies to 3.10 git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen.git > latest commit: 8bb495e3f02401ee6f76d1b1d77f3ac9f079e376 > > TODO: > Change subop from XENMEM to SYSCTL.XENMEM subop seems right here.> Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com> > --- > arch/x86/include/asm/xen/vnuma-xen.h | 36 +++++++++++ > arch/x86/mm/numa.c | 8 +++ > arch/x86/xen/enlighten.c | 115 ++++++++++++++++++++++++++++++++++ > include/xen/interface/memory.h | 1 + > 4 files changed, 160 insertions(+) > create mode 100644 arch/x86/include/asm/xen/vnuma-xen.h > > diff --git a/arch/x86/include/asm/xen/vnuma-xen.h b/arch/x86/include/asm/xen/vnuma-xen.h > new file mode 100644 > index 0000000..73c1bde > --- /dev/null > +++ b/arch/x86/include/asm/xen/vnuma-xen.h > @@ -0,0 +1,36 @@ > +#ifndef _ASM_X86_VNUMA_XEN_H > +#define _ASM_X86_VNUMA_XEN_H > + > +#ifdef CONFIG_XEN > +int __initdata xen_numa_init(void); > + > +struct vnuma_memblk { > + uint64_t start, end; > +}; > +DEFINE_GUEST_HANDLE_STRUCT(vnuma_memblk); > + > +struct vnuma_topology_info { > + domid_t domid; > + uint16_t nr_nodes; > + GUEST_HANDLE(vnuma_memblk) vmemblk; > + GUEST_HANDLE(int) vdistance; > + GUEST_HANDLE(int) cpu_to_node; > + GUEST_HANDLE(int) vnode_to_pnode; > +}; > +DEFINE_GUEST_HANDLE_STRUCT(vnuma_topology_info); > + > +struct xen_domctl { > + uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */ > + domid_t domain; > + union { > + struct vnuma_topology_info vnuma; > + } u; > +}; > +typedef struct xen_domctl xen_domctl_t; > +DEFINE_GUEST_HANDLE_STRUCT(xen_domctl_t);Most of this is the public interface provided by Xen, this should be in include/xen/interface/memory.h. 
And only the xen_numa_init() prototype in this header.> + > +#else > +int __init xen_numa_init(void) {} > +#endif > + > +#endif /* _ASM_X86_VNUMA_XEN_H */ > diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c > index 8bf93ba..3ec7855 100644 > --- a/arch/x86/mm/numa.c > +++ b/arch/x86/mm/numa.c > @@ -20,6 +20,10 @@#include <asm/xen/vnuma.h> here.> #include "numa_internal.h" > > +#ifdef CONFIG_XEN > +#include "asm/xen/vnuma-xen.h" > +#endifInstead of this.> + > int __initdata numa_off; > nodemask_t numa_nodes_parsed __initdata; > > @@ -621,6 +625,10 @@ static int __init dummy_numa_init(void) > void __init x86_numa_init(void) > { > if (!numa_off) { > +#ifdef CONFIG_XEN > + if (xen_pv_domain() && !numa_init(xen_numa_init)) > + return; > +#endifMove the test for xen_pv_domain() into xen_numa_init.> #ifdef CONFIG_X86_NUMAQ > if (!numa_init(numaq_numa_init)) > return; > diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c > index 193097e..4658e9d 100644 > --- a/arch/x86/xen/enlighten.c > +++ b/arch/x86/xen/enlighten.c > @@ -33,6 +33,7 @@ > #include <linux/memblock.h> > #include <linux/edd.h> > > +#include <asm/numa.h> > #include <xen/xen.h> > #include <xen/events.h> > #include <xen/interface/xen.h> > @@ -69,6 +70,7 @@ > #include <asm/mwait.h> > #include <asm/pci_x86.h> > #include <asm/pat.h> > +#include <asm/xen/vnuma-xen.h> > > #ifdef CONFIG_ACPI > #include <linux/acpi.h> > @@ -1750,4 +1752,117 @@ const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = { > .x2apic_available = xen_x2apic_para_available, > }; > EXPORT_SYMBOL(x86_hyper_xen_hvm); > + > +/* Xen PV NUMA topology initialization */ > +int __initdata xen_numa_init(void)This function needs to more clearly separate the different parts. This could be by splitting it into several functions or by improving the comments and spacing. I''d also move it to a new file: arch/x86/xen/vnuma.c.> +{ > + struct vnuma_memblk *vblk; > + struct vnuma_topology_info numa_topo; > + uint64_t size, start, end; > + int i, j, cpu, rc; > + u64 phys, physd, physc; > + u64 new_end_pfn; > + size_t mem_size; > + > + int *vdistance, *cpu_to_node; > + unsigned long dist_size, cpu_to_node_size; > + numa_topo.domid = DOMID_SELF; > + rc = -EINVAL; > + /* block mem reservation for memblks */ > + mem_size = num_possible_cpus() * sizeof(struct vnuma_memblk); > + phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), mem_size, PAGE_SIZE); > + if (!phys) { > + pr_info("vNUMA: can''t allocate memory for membloks size %zu\n", mem_size); > + rc = -ENOMEM; > + goto errout; > + } > + if (memblock_reserve(phys, mem_size) < 0) { > + pr_info("vNUMA: Cannot reserver mem for blocks of size %zu \n", mem_size); > + rc = -ENOMEM; > + goto errout; > + }Does mem_block_alloc() work here instead of the find_in_range/reserve combo? 
Similarly for the other allocations.> + vblk = __va(phys); > + set_xen_guest_handle(numa_topo.vmemblk, vblk);Please group all the memory allocations followed by grouping the initialization of the hypercall arguments.> + if (HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo) < 0) > + goto errout;ret = HYPERVISOR_memory_op(...); if (ret < 0) goto out;> + if (numa_topo.nr_nodes == 0) > + /* Pass to DUMMY numa */ > + goto errout; > + if (numa_topo.nr_nodes > num_possible_cpus()) { > + pr_info("vNUMA: Node without cpu is not supported, nodes = %d\n", numa_topo.nr_nodes); > + goto errout; > + }Is this a constraint of the Linux NUMA support?> + new_end_pfn = 0; > + /* We have sizes in pages received from hypervisor, no holes and ordered */How does this play with the relocate of frames done for dom0 during initial setup?> + for (i = 0; i < numa_topo.nr_nodes; i++) { > + start = vblk[i].start; > + end = vblk[i].end; > + size = end - start; > + pr_info("VNMA blk[%d] size is %#010Lx start = %#010Lx end = %#010Lx\n", > + i, size, start, end); > + if (numa_add_memblk(i, start, end)) { > + pr_info("vNUMA: Could not add memblk[%d]\n", i); > + rc = -EAGAIN; > + goto errout; > + } > + node_set(i, numa_nodes_parsed); > + } > + setup_nr_node_ids(); > + /* Setting the cpu, apicid to node */ > + for_each_cpu(cpu, cpu_possible_mask) { > + pr_info("Setting up vcpu[%d] to node[%d]\n", cpu, cpu_to_node[cpu]); > + set_apicid_to_node(cpu, cpu_to_node[cpu]); > + numa_set_node(cpu, cpu_to_node[cpu]); > + __apicid_to_node[cpu] = cpu_to_node[cpu]; > + cpumask_set_cpu(cpu, node_to_cpumask_map[cpu_to_node[cpu]]); > + } > + setup_nr_node_ids(); > + rc = 0; > + for (i = 0; i < numa_topo.nr_nodes; i++)Style: {} around this multi-line body.> + for (j = 0; j < numa_topo.nr_nodes; j++) { > + numa_set_distance(i, j, *(vdistance + j*numa_topo.nr_nodes + i)); > + pr_info("distance %d <-> %d = %d\n", i, j, *(vdistance + j*numa_topo.nr_nodes + i)); > + } > +errout:This isn''t (always) an error path, name this label "out".> + if (phys) > + memblock_free(__pa(phys), mem_size); > + if (physd) > + memblock_free(__pa(physd), dist_size); > + if (physc) > + memblock_free(__pa(physc), cpu_to_node_size); > + if (rc) > + pr_debug("XEN: hypercall failed to get vNUMA topology.\n");Remove this debug print, you already have prints for specific failures.> + return rc; > +} > + > #endifDavid
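For illustration only: a rough sketch of what David's memblock_alloc() suggestion could look like, assuming the 3.10-era memblock_alloc() that takes (size, align) and returns a physical address (it panics rather than returning 0 on failure, so the check below is only defensive). The vnuma_alloc() helper is hypothetical and not part of the posted patch.

/* One allocation helper instead of three find_in_range/reserve pairs. */
static void * __init vnuma_alloc(size_t size)
{
	phys_addr_t phys;

	phys = memblock_alloc(size, PAGE_SIZE);	/* <linux/memblock.h> */
	if (!phys)
		return NULL;
	return __va(phys);
}

	/* ... then in xen_numa_init(): */
	vblk = vnuma_alloc(mem_size);
	if (!vblk) {
		rc = -ENOMEM;
		goto out;
	}
	set_xen_guest_handle(numa_topo.vmemblk, vblk);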
Konrad Rzeszutek Wilk
2013-Aug-27 14:17 UTC
Re: [PATCH RFC 2/2] linux/vnuma: Enables NUMA support for PV guest
On Tue, Aug 27, 2013 at 04:53:00AM -0400, Elena Ufimtseva wrote:
> Enables NUMA support for PV guest and uses vNUMA
> interface to discover topology provided by XEN.

I think this should depend on a hypercall to figure out whether Xen
supports this mechanism ?

>
> Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
> ---
>  arch/x86/xen/setup.c |    2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
> index 056d11f..7e0c93b 100644
> --- a/arch/x86/xen/setup.c
> +++ b/arch/x86/xen/setup.c
> @@ -561,6 +561,6 @@ void __init xen_arch_setup(void)
>  	WARN_ON(xen_set_default_idle());
>  	fiddle_vdso();
>  #ifdef CONFIG_NUMA
> -	numa_off = 1;
> +	numa_off = 0;
>  #endif
>  }
> -- 
> 1.7.10.4
>
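One possible shape of the probe Konrad asks for, sketched under two assumptions that the series does not confirm: that an unimplemented XENMEM_ subop surfaces as -ENOSYS, and that a bare query with no buffers filled in is acceptable to Xen as a capability check. xen_vnuma_supported() is a hypothetical helper, not part of the posted patches.

static bool __init xen_vnuma_supported(void)
{
	struct vnuma_topology_info numa_topo = {
		.domid = DOMID_SELF,
	};

	/*
	 * A real probe would likely pass properly sized buffers or use a
	 * dedicated "query only" mode; this only checks for -ENOSYS.
	 */
	return HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo) != -ENOSYS;
}

	/* ... and in xen_arch_setup(): */
#ifdef CONFIG_NUMA
	numa_off = xen_vnuma_supported() ? 0 : 1;
#endif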
Konrad Rzeszutek Wilk
2013-Aug-27 14:33 UTC
Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
On Tue, Aug 27, 2013 at 04:52:59AM -0400, Elena Ufimtseva wrote:> Uses subop hypercall to request XEN about vnuma topology. > Sets the memory blocks (aligned by XEN), cpus, distance table > on boot. NUMA support should be compiled in kernel.Needs a bit more details (which I am sure you will add in the next postings). Mostly: - How E820 and NUMA information mesh - What the hypercall provides - What changeset in the Xen provides this hypercall -> > NUMA support should be compiled in kernel. > > Memory blocks are aligned by xen. Xen is aware of guest initial > RAM layout. > If the memory blocks are incorrect, call for default linux numa > dummy init.Unfortunatly that won''t happen if you boot this under dom0. It will find on some AMD machines the AMD Northbridge and try to scan that. And blow up. If you look at the git commit that did the ''numa = 0'' you will see the backstory. I think part of enablement of '' numa = 1'' is to wrap it with if (xen_initial_domain() && xen_supports_this_hypercall()) numa = 1; in the #2 patch.> > Requires XEN with vNUMA support. > > Applies to 3.10 git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen.git > latest commit: 8bb495e3f02401ee6f76d1b1d77f3ac9f079e376 > > TODO: > Change subop from XENMEM to SYSCTL. > > Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com> > --- > arch/x86/include/asm/xen/vnuma-xen.h | 36 +++++++++++ > arch/x86/mm/numa.c | 8 +++ > arch/x86/xen/enlighten.c | 115 ++++++++++++++++++++++++++++++++++ > include/xen/interface/memory.h | 1 + > 4 files changed, 160 insertions(+) > create mode 100644 arch/x86/include/asm/xen/vnuma-xen.h > > diff --git a/arch/x86/include/asm/xen/vnuma-xen.h b/arch/x86/include/asm/xen/vnuma-xen.h > new file mode 100644 > index 0000000..73c1bde > --- /dev/null > +++ b/arch/x86/include/asm/xen/vnuma-xen.h > @@ -0,0 +1,36 @@ > +#ifndef _ASM_X86_VNUMA_XEN_H > +#define _ASM_X86_VNUMA_XEN_H > + > +#ifdef CONFIG_XEN > +int __initdata xen_numa_init(void); > + > +struct vnuma_memblk { > + uint64_t start, end; > +}; > +DEFINE_GUEST_HANDLE_STRUCT(vnuma_memblk); > + > +struct vnuma_topology_info { > + domid_t domid; > + uint16_t nr_nodes; > + GUEST_HANDLE(vnuma_memblk) vmemblk; > + GUEST_HANDLE(int) vdistance; > + GUEST_HANDLE(int) cpu_to_node; > + GUEST_HANDLE(int) vnode_to_pnode;A comment explaining what those values are meant to have would be good. Perhaps with a simple example.> +};The structure is not very 32-bit friendly. Would it be possible to add some padding so that the size of this structure is the same on 32-bit and 64-bit please? Perhaps: domid_t domid; uint16_t nr_nodes; uint32_t _pad; GUEST_HANDLE(vnuma_memblk) vmemblk; GUEST_HANDLE(int) vdistance; GUEST_HANDLE(int) cpu_to_node; GUEST_HANDLE(int) vnode_to_pnode; That should make the offsets be the same on both 32 and 64-bit I think.> +DEFINE_GUEST_HANDLE_STRUCT(vnuma_topology_info); > + > +struct xen_domctl { > + uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */ > + domid_t domain; > + union { > + struct vnuma_topology_info vnuma; > + } u;Move that ''u'' one tab back please.> +}; > +typedef struct xen_domctl xen_domctl_t;Ewww. 
No typdefs in the Linux code please.> +DEFINE_GUEST_HANDLE_STRUCT(xen_domctl_t); > + > +#else > +int __init xen_numa_init(void) {} > +#endif > + > +#endif /* _ASM_X86_VNUMA_XEN_H */ > diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c > index 8bf93ba..3ec7855 100644 > --- a/arch/x86/mm/numa.c > +++ b/arch/x86/mm/numa.c > @@ -20,6 +20,10 @@ > > #include "numa_internal.h" > > +#ifdef CONFIG_XEN > +#include "asm/xen/vnuma-xen.h" > +#endif > + > int __initdata numa_off; > nodemask_t numa_nodes_parsed __initdata; > > @@ -621,6 +625,10 @@ static int __init dummy_numa_init(void) > void __init x86_numa_init(void) > { > if (!numa_off) { > +#ifdef CONFIG_XEN > + if (xen_pv_domain() && !numa_init(xen_numa_init))I would remove the xen_pv_domain() check and add that in enlighten.c code.> + return; > +#endif > #ifdef CONFIG_X86_NUMAQ > if (!numa_init(numaq_numa_init)) > return; > diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c > index 193097e..4658e9d 100644 > --- a/arch/x86/xen/enlighten.c > +++ b/arch/x86/xen/enlighten.c > @@ -33,6 +33,7 @@ > #include <linux/memblock.h> > #include <linux/edd.h> > > +#include <asm/numa.h> > #include <xen/xen.h> > #include <xen/events.h> > #include <xen/interface/xen.h> > @@ -69,6 +70,7 @@ > #include <asm/mwait.h> > #include <asm/pci_x86.h> > #include <asm/pat.h> > +#include <asm/xen/vnuma-xen.h> > > #ifdef CONFIG_ACPI > #include <linux/acpi.h> > @@ -1750,4 +1752,117 @@ const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = { > .x2apic_available = xen_x2apic_para_available, > }; > EXPORT_SYMBOL(x86_hyper_xen_hvm); > + > +/* Xen PV NUMA topology initialization */ > +int __initdata xen_numa_init(void) > +{ > + struct vnuma_memblk *vblk; > + struct vnuma_topology_info numa_topo; > + uint64_t size, start, end; > + int i, j, cpu, rc; > + u64 phys, physd, physc; > + u64 new_end_pfn; > + size_t mem_size; > + > + int *vdistance, *cpu_to_node; > + unsigned long dist_size, cpu_to_node_size;You need an extra line here;> + numa_topo.domid = DOMID_SELF; > + rc = -EINVAL;Both of those can be part of the decleration of variables above. 
Like so: int rc = -EINVAL; struct vnuma_topology_info numa_topo = { .domid = DOMID_SELF; };> + /* block mem reservation for memblks */ > + mem_size = num_possible_cpus() * sizeof(struct vnuma_memblk); > + phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), mem_size, PAGE_SIZE); > + if (!phys) { > + pr_info("vNUMA: can''t allocate memory for membloks size %zu\n", mem_size); > + rc = -ENOMEM; > + goto errout; > + } > + if (memblock_reserve(phys, mem_size) < 0) { > + pr_info("vNUMA: Cannot reserver mem for blocks of size %zu \n", mem_size); > + rc = -ENOMEM; > + goto errout; > + } > + vblk = __va(phys); > + set_xen_guest_handle(numa_topo.vmemblk, vblk); > + > + dist_size = num_possible_cpus() * num_possible_cpus() * sizeof(*numa_topo.vdistance); > + physd = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), dist_size, PAGE_SIZE); > + if (!physd) { > + pr_info("vNUMA: can''t allocate memory for distance size %zu\n", dist_size); > + rc = -ENOMEM; > + goto errout; > + } > + if (memblock_reserve(physd, dist_size) < 0) { > + pr_info("vNUMA: Cannot reserver mem for blocks of size %zu \n", dist_size); > + rc = -ENOMEM; > + goto errout; > + } > + vdistance = __va(physd); > + set_xen_guest_handle(numa_topo.vdistance, (int *)vdistance); > + > + /* allocate memblock for vcpu to node mask of max size */ > + cpu_to_node_size = num_possible_cpus() * sizeof(*numa_topo.cpu_to_node); > + physc = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), cpu_to_node_size, PAGE_SIZE); > + if (!physc) { > + pr_info("vNUMA: can''t allocate memory for distance size %zu\n", cpu_to_node_size); > + rc = -ENOMEM; > + goto errout; > + } > + if (memblock_reserve(physc, cpu_to_node_size) < 0) { > + pr_info("vNUMA: Cannot reserver mem for blocks of size %zu\n", cpu_to_node_size); > + goto errout; > + } > + cpu_to_node = __va(physc); > + set_xen_guest_handle(numa_topo.cpu_to_node, (int *)cpu_to_node); > + if (HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo) < 0)I think you need to do rc = HYPERVISOR... and then if you get rc = -ENOSYS (so not implemented) change the rc to zero.> + goto errout; > + if (numa_topo.nr_nodes == 0) > + /* Pass to DUMMY numa */Also change rc to zero.> + goto errout; > + if (numa_topo.nr_nodes > num_possible_cpus()) { > + pr_info("vNUMA: Node without cpu is not supported, nodes = %d\n", numa_topo.nr_nodes); > + goto errout; > + } > + new_end_pfn = 0; > + /* We have sizes in pages received from hypervisor, no holes and ordered */From toolstack you mean. It is the one setting it up. And I hope it sets it up based on the E820 it has constructed as well? Otherwise it would be a bit awkward if those two were different.> + for (i = 0; i < numa_topo.nr_nodes; i++) { > + start = vblk[i].start; > + end = vblk[i].end; > + size = end - start; > + pr_info("VNMA blk[%d] size is %#010Lx start = %#010Lx end = %#010Lx\n", > + i, size, start, end);pr_debug perhaps?> + if (numa_add_memblk(i, start, end)) { > + pr_info("vNUMA: Could not add memblk[%d]\n", i); > + rc = -EAGAIN;Can you unroll the NUMA topology if this fails? 
Or is that not a problem?> + goto errout; > + } > + node_set(i, numa_nodes_parsed); > + } > + setup_nr_node_ids(); > + /* Setting the cpu, apicid to node */ > + for_each_cpu(cpu, cpu_possible_mask) { > + pr_info("Setting up vcpu[%d] to node[%d]\n", cpu, cpu_to_node[cpu]);pr_debug> + set_apicid_to_node(cpu, cpu_to_node[cpu]); > + numa_set_node(cpu, cpu_to_node[cpu]); > + __apicid_to_node[cpu] = cpu_to_node[cpu]; > + cpumask_set_cpu(cpu, node_to_cpumask_map[cpu_to_node[cpu]]); > + } > + setup_nr_node_ids();How come this is being called twice? It should have a comment explaining this extra call.> + rc = 0; > + for (i = 0; i < numa_topo.nr_nodes; i++) > + for (j = 0; j < numa_topo.nr_nodes; j++) { > + numa_set_distance(i, j, *(vdistance + j*numa_topo.nr_nodes + i));You could simply this by just doing: int idx = (j * numa_topo.nr_nodes) + i; ... *(vdistance + idx));> + pr_info("distance %d <-> %d = %d\n", i, j, *(vdistance + j*numa_topo.nr_nodes + i));pr_debug> + } > +errout: > + if (phys) > + memblock_free(__pa(phys), mem_size); > + if (physd) > + memblock_free(__pa(physd), dist_size); > + if (physc) > + memblock_free(__pa(physc), cpu_to_node_size); > + if (rc) > + pr_debug("XEN: hypercall failed to get vNUMA topology.\n");And include the ''rc'' value in the output please.> + return rc; > +} > + > #endif > diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h > index 2ecfe4f..16b8d87 100644 > --- a/include/xen/interface/memory.h > +++ b/include/xen/interface/memory.h > @@ -263,4 +263,5 @@ struct xen_remove_from_physmap { > }; > DEFINE_GUEST_HANDLE_STRUCT(xen_remove_from_physmap); > > +#define XENMEM_get_vnuma_info 25 > #endif /* __XEN_PUBLIC_MEMORY_H__ */ > -- > 1.7.10.4 >
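Folding a few of the mechanical suggestions above together (the explicit padding, the declaration-time initializer and a flat index into the distance matrix), the relevant fragments might end up looking like the sketch below. This is a reconstruction of the review comments, not code from the series.

/* Padded so the handle offsets match between 32- and 64-bit guests. */
struct vnuma_topology_info {
	domid_t domid;
	uint16_t nr_nodes;
	uint32_t _pad;
	GUEST_HANDLE(vnuma_memblk) vmemblk;
	GUEST_HANDLE(int) vdistance;
	GUEST_HANDLE(int) cpu_to_node;
	GUEST_HANDLE(int) vnode_to_pnode;
};

	/* In xen_numa_init(): initialize at declaration time. */
	int rc = -EINVAL;
	struct vnuma_topology_info numa_topo = {
		.domid = DOMID_SELF,
	};

	/* ... */

	/* Distance table walk with a flat index instead of pointer arithmetic. */
	for (i = 0; i < numa_topo.nr_nodes; i++) {
		for (j = 0; j < numa_topo.nr_nodes; j++) {
			int idx = (j * numa_topo.nr_nodes) + i;

			numa_set_distance(i, j, vdistance[idx]);
			pr_debug("distance %d <-> %d = %d\n", i, j, vdistance[idx]);
		}
	}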
Konrad Rzeszutek Wilk
2013-Aug-27 15:36 UTC
Re: [PATCH RFC 2/2] linux/vnuma: Enables NUMA support for PV guest
On Tue, Aug 27, 2013 at 02:35:18PM +0100, David Vrabel wrote:
> On 27/08/13 09:53, Elena Ufimtseva wrote:
> > Enables NUMA support for PV guest and uses vNUMA
> > interface to discover topology provided by XEN.
> >
> > Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
> > ---
> >  arch/x86/xen/setup.c |    2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
> > index 056d11f..7e0c93b 100644
> > --- a/arch/x86/xen/setup.c
> > +++ b/arch/x86/xen/setup.c
> > @@ -561,6 +561,6 @@ void __init xen_arch_setup(void)
> >  	WARN_ON(xen_set_default_idle());
> >  	fiddle_vdso();
> >  #ifdef CONFIG_NUMA
> > -	numa_off = 1;
> > +	numa_off = 0;
> >  #endif
> >  }
>
> Hmm. This is either safe to turn on now or it will break on hosts
> without vNUMA support as it will cause a fallback to try all the
> hardware specific NUMA initialization.
>
> Which is it?

It will blow up when booting as dom0 on certain AMD machines.

>
> David
Matt Wilson
2013-Aug-28 01:27 UTC
Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
On Tue, Aug 27, 2013 at 04:52:59AM -0400, Elena Ufimtseva wrote:
> Uses subop hypercall to request XEN about vnuma topology.
> Sets the memory blocks (aligned by XEN), cpus, distance table
> on boot. NUMA support should be compiled in kernel.

Are we *really sure* that we want to go this route for PV vNUMA?
Couldn't we build just enough(tm) of the ACPI tables to express the
NUMA topology when constructing the domain? That's what we do for the
e820 map.

--msw

> NUMA support should be compiled in kernel.
>
> Memory blocks are aligned by xen. Xen is aware of guest initial
> RAM layout.
> If the memory blocks are incorrect, call for default linux numa
> dummy init.
>
> Requires XEN with vNUMA support.
>
> Applies to 3.10 git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen.git
> latest commit: 8bb495e3f02401ee6f76d1b1d77f3ac9f079e376
Matt Wilson
2013-Aug-28 01:37 UTC
Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
On Tue, Aug 27, 2013 at 06:27:15PM -0700, Matt Wilson wrote:
> On Tue, Aug 27, 2013 at 04:52:59AM -0400, Elena Ufimtseva wrote:
> > Uses subop hypercall to request XEN about vnuma topology.
> > Sets the memory blocks (aligned by XEN), cpus, distance table
> > on boot. NUMA support should be compiled in kernel.
>
> Are we *really sure* that we want to go this route for PV vNUMA?
> Couldn't we build just enough(tm) of the ACPI tables to express the
> NUMA topology when constructing the domain? That's what we do for the
> e820 map.

Ignore me somewhat, since the e820 information is retrieved via
hypercall similar to what you're proposing.

Still, if there's some way that we can reuse existing Linux code
rather than bolting on a completely parallel mechanism to set this up
under PV I think it'd be better.

--msw
Dario Faggioli
2013-Aug-28 13:08 UTC
Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
On mar, 2013-08-27 at 18:37 -0700, Matt Wilson wrote:
> On Tue, Aug 27, 2013 at 06:27:15PM -0700, Matt Wilson wrote:
> > On Tue, Aug 27, 2013 at 04:52:59AM -0400, Elena Ufimtseva wrote:
> > > Uses subop hypercall to request XEN about vnuma topology.
> > > Sets the memory blocks (aligned by XEN), cpus, distance table
> > > on boot. NUMA support should be compiled in kernel.
> >
> > Are we *really sure* that we want to go this route for PV vNUMA?
> > Couldn't we build just enough(tm) of the ACPI tables to express the
> > NUMA topology when constructing the domain? That's what we do for the
> > e820 map.
>
> Ignore me somewhat, since the e820 information is retrieved via
> hypercall similar to what you're proposing.
>
:-)

> Still, if there's some way that we can reuse existing Linux code
> rather than bolting on a completely parallel mechanism to set this up
> under PV I think it'd be better.
>
Well, it looks to me that Elena is reusing quite a bit of it, isn't she?
All she's providing is a new initialization function ( xen_numa_init()
), as it is happening already for ACPI NUMA, NUMAQ, and other NUMA
implementations.

In practice, while ACPI based NUMA code parses the ACPI tables in
acpi_numa_init(), PV vNUMA parses the information coming from an
hypercall xen_numa_init(). From that point on, Linux that steps-in and
do everything else "as usual".

Isn't that enough sharing?

Thanks and Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
George Dunlap
2013-Aug-28 13:39 UTC
Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
On Wed, Aug 28, 2013 at 2:08 PM, Dario Faggioli
<dario.faggioli@citrix.com> wrote:
> On mar, 2013-08-27 at 18:37 -0700, Matt Wilson wrote:
>> On Tue, Aug 27, 2013 at 06:27:15PM -0700, Matt Wilson wrote:
>> > On Tue, Aug 27, 2013 at 04:52:59AM -0400, Elena Ufimtseva wrote:
>> > > Uses subop hypercall to request XEN about vnuma topology.
>> > > Sets the memory blocks (aligned by XEN), cpus, distance table
>> > > on boot. NUMA support should be compiled in kernel.
>> >
>> > Are we *really sure* that we want to go this route for PV vNUMA?
>> > Couldn't we build just enough(tm) of the ACPI tables to express the
>> > NUMA topology when constructing the domain? That's what we do for the
>> > e820 map.
>>
>> Ignore me somewhat, since the e820 information is retrieved via
>> hypercall similar to what you're proposing.
>>
> :-)
>
>> Still, if there's some way that we can reuse existing Linux code
>> rather than bolting on a completely parallel mechanism to set this up
>> under PV I think it'd be better.
>>
> Well, it looks to me that Elena is reusing quite a bit of it, isn't she?
> All she's providing is a new initialization function ( xen_numa_init()
> ), as it is happening already for ACPI NUMA, NUMAQ, and other NUMA
> implementations.
>
> In practice, while ACPI based NUMA code parses the ACPI tables in
> acpi_numa_init(), PV vNUMA parses the information coming from an
> hypercall xen_numa_init(). From that point on, Linux that steps-in and
> do everything else "as usual".
>
> Isn't that enough sharing?

I think the only way to "share" more would be to have Xen do some
crazy ACPI table fake-up scheme, which sounds like kind of a
nightmare; and also completely pointless, since there are already
nice clean interfaces we can just hook into and pass nice clean data
structures straight from Xen.

 -George
Elena Ufimtseva
2013-Aug-28 13:41 UTC
Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
Hi Konrad Thank you for comments, I will resend the fix in the new patch. Regarding memblocks contrustion they are done taking into account e820 map for this domain. I will place some comments in code as you mentioned. Elena On Tue, Aug 27, 2013 at 10:33 AM, Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:> On Tue, Aug 27, 2013 at 04:52:59AM -0400, Elena Ufimtseva wrote: >> Uses subop hypercall to request XEN about vnuma topology. >> Sets the memory blocks (aligned by XEN), cpus, distance table >> on boot. NUMA support should be compiled in kernel. > > Needs a bit more details (which I am sure you will add in the > next postings). Mostly: > - How E820 and NUMA information mesh > - What the hypercall provides > - What changeset in the Xen provides this hypercall > - >> >> NUMA support should be compiled in kernel. >> >> Memory blocks are aligned by xen. Xen is aware of guest initial >> RAM layout. >> If the memory blocks are incorrect, call for default linux numa >> dummy init. > > Unfortunatly that won''t happen if you boot this under dom0. It will > find on some AMD machines the AMD Northbridge and try to scan that. > And blow up. > > If you look at the git commit that did the ''numa = 0'' you will see > the backstory. > > I think part of enablement of '' numa = 1'' is to wrap it with > > if (xen_initial_domain() && xen_supports_this_hypercall()) > numa = 1; > > in the #2 patch. > >> >> Requires XEN with vNUMA support. >> >> Applies to 3.10 git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen.git >> latest commit: 8bb495e3f02401ee6f76d1b1d77f3ac9f079e376 >> >> TODO: >> Change subop from XENMEM to SYSCTL. >> >> Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com> >> --- >> arch/x86/include/asm/xen/vnuma-xen.h | 36 +++++++++++ >> arch/x86/mm/numa.c | 8 +++ >> arch/x86/xen/enlighten.c | 115 ++++++++++++++++++++++++++++++++++ >> include/xen/interface/memory.h | 1 + >> 4 files changed, 160 insertions(+) >> create mode 100644 arch/x86/include/asm/xen/vnuma-xen.h >> >> diff --git a/arch/x86/include/asm/xen/vnuma-xen.h b/arch/x86/include/asm/xen/vnuma-xen.h >> new file mode 100644 >> index 0000000..73c1bde >> --- /dev/null >> +++ b/arch/x86/include/asm/xen/vnuma-xen.h >> @@ -0,0 +1,36 @@ >> +#ifndef _ASM_X86_VNUMA_XEN_H >> +#define _ASM_X86_VNUMA_XEN_H >> + >> +#ifdef CONFIG_XEN >> +int __initdata xen_numa_init(void); >> + >> +struct vnuma_memblk { >> + uint64_t start, end; >> +}; >> +DEFINE_GUEST_HANDLE_STRUCT(vnuma_memblk); >> + >> +struct vnuma_topology_info { >> + domid_t domid; >> + uint16_t nr_nodes; >> + GUEST_HANDLE(vnuma_memblk) vmemblk; >> + GUEST_HANDLE(int) vdistance; >> + GUEST_HANDLE(int) cpu_to_node; >> + GUEST_HANDLE(int) vnode_to_pnode; > > A comment explaining what those values are meant to have would be good. > Perhaps with a simple example. > >> +}; > > The structure is not very 32-bit friendly. Would it be possible > to add some padding so that the size of this structure is the > same on 32-bit and 64-bit please? > > Perhaps: > domid_t domid; > uint16_t nr_nodes; > uint32_t _pad; > GUEST_HANDLE(vnuma_memblk) vmemblk; > GUEST_HANDLE(int) vdistance; > GUEST_HANDLE(int) cpu_to_node; > GUEST_HANDLE(int) vnode_to_pnode; > > That should make the offsets be the same on both 32 and 64-bit > I think. > >> +DEFINE_GUEST_HANDLE_STRUCT(vnuma_topology_info); >> + >> +struct xen_domctl { >> + uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */ >> + domid_t domain; >> + union { >> + struct vnuma_topology_info vnuma; >> + } u; > > Move that ''u'' one tab back please. 
> >> +}; >> +typedef struct xen_domctl xen_domctl_t; > > Ewww. No typdefs in the Linux code please. > >> +DEFINE_GUEST_HANDLE_STRUCT(xen_domctl_t); >> + >> +#else >> +int __init xen_numa_init(void) {} >> +#endif >> + >> +#endif /* _ASM_X86_VNUMA_XEN_H */ >> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c >> index 8bf93ba..3ec7855 100644 >> --- a/arch/x86/mm/numa.c >> +++ b/arch/x86/mm/numa.c >> @@ -20,6 +20,10 @@ >> >> #include "numa_internal.h" >> >> +#ifdef CONFIG_XEN >> +#include "asm/xen/vnuma-xen.h" >> +#endif >> + >> int __initdata numa_off; >> nodemask_t numa_nodes_parsed __initdata; >> >> @@ -621,6 +625,10 @@ static int __init dummy_numa_init(void) >> void __init x86_numa_init(void) >> { >> if (!numa_off) { >> +#ifdef CONFIG_XEN >> + if (xen_pv_domain() && !numa_init(xen_numa_init)) > > I would remove the xen_pv_domain() check and add that in enlighten.c code. > >> + return; >> +#endif >> #ifdef CONFIG_X86_NUMAQ >> if (!numa_init(numaq_numa_init)) >> return; >> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c >> index 193097e..4658e9d 100644 >> --- a/arch/x86/xen/enlighten.c >> +++ b/arch/x86/xen/enlighten.c >> @@ -33,6 +33,7 @@ >> #include <linux/memblock.h> >> #include <linux/edd.h> >> >> +#include <asm/numa.h> >> #include <xen/xen.h> >> #include <xen/events.h> >> #include <xen/interface/xen.h> >> @@ -69,6 +70,7 @@ >> #include <asm/mwait.h> >> #include <asm/pci_x86.h> >> #include <asm/pat.h> >> +#include <asm/xen/vnuma-xen.h> >> >> #ifdef CONFIG_ACPI >> #include <linux/acpi.h> >> @@ -1750,4 +1752,117 @@ const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = { >> .x2apic_available = xen_x2apic_para_available, >> }; >> EXPORT_SYMBOL(x86_hyper_xen_hvm); >> + >> +/* Xen PV NUMA topology initialization */ >> +int __initdata xen_numa_init(void) >> +{ >> + struct vnuma_memblk *vblk; >> + struct vnuma_topology_info numa_topo; >> + uint64_t size, start, end; >> + int i, j, cpu, rc; >> + u64 phys, physd, physc; >> + u64 new_end_pfn; >> + size_t mem_size; >> + >> + int *vdistance, *cpu_to_node; >> + unsigned long dist_size, cpu_to_node_size; > > You need an extra line here; > >> + numa_topo.domid = DOMID_SELF; >> + rc = -EINVAL; > > Both of those can be part of the decleration of variables above. 
Like so: > int rc = -EINVAL; > struct vnuma_topology_info numa_topo = { > .domid = DOMID_SELF; > }; > >> + /* block mem reservation for memblks */ >> + mem_size = num_possible_cpus() * sizeof(struct vnuma_memblk); >> + phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), mem_size, PAGE_SIZE); >> + if (!phys) { >> + pr_info("vNUMA: can''t allocate memory for membloks size %zu\n", mem_size); >> + rc = -ENOMEM; >> + goto errout; >> + } >> + if (memblock_reserve(phys, mem_size) < 0) { >> + pr_info("vNUMA: Cannot reserver mem for blocks of size %zu \n", mem_size); >> + rc = -ENOMEM; >> + goto errout; >> + } >> + vblk = __va(phys); >> + set_xen_guest_handle(numa_topo.vmemblk, vblk); >> + >> + dist_size = num_possible_cpus() * num_possible_cpus() * sizeof(*numa_topo.vdistance); >> + physd = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), dist_size, PAGE_SIZE); >> + if (!physd) { >> + pr_info("vNUMA: can''t allocate memory for distance size %zu\n", dist_size); >> + rc = -ENOMEM; >> + goto errout; >> + } >> + if (memblock_reserve(physd, dist_size) < 0) { >> + pr_info("vNUMA: Cannot reserver mem for blocks of size %zu \n", dist_size); >> + rc = -ENOMEM; >> + goto errout; >> + } >> + vdistance = __va(physd); >> + set_xen_guest_handle(numa_topo.vdistance, (int *)vdistance); >> + >> + /* allocate memblock for vcpu to node mask of max size */ >> + cpu_to_node_size = num_possible_cpus() * sizeof(*numa_topo.cpu_to_node); >> + physc = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), cpu_to_node_size, PAGE_SIZE); >> + if (!physc) { >> + pr_info("vNUMA: can''t allocate memory for distance size %zu\n", cpu_to_node_size); >> + rc = -ENOMEM; >> + goto errout; >> + } >> + if (memblock_reserve(physc, cpu_to_node_size) < 0) { >> + pr_info("vNUMA: Cannot reserver mem for blocks of size %zu\n", cpu_to_node_size); >> + goto errout; >> + } >> + cpu_to_node = __va(physc); >> + set_xen_guest_handle(numa_topo.cpu_to_node, (int *)cpu_to_node); >> + if (HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &numa_topo) < 0) > > I think you need to do rc = HYPERVISOR... > > and then if you get rc = -ENOSYS (so not implemented) change the rc to zero. > >> + goto errout; >> + if (numa_topo.nr_nodes == 0) >> + /* Pass to DUMMY numa */ > > Also change rc to zero. > >> + goto errout; >> + if (numa_topo.nr_nodes > num_possible_cpus()) { >> + pr_info("vNUMA: Node without cpu is not supported, nodes = %d\n", numa_topo.nr_nodes); >> + goto errout; >> + } >> + new_end_pfn = 0; >> + /* We have sizes in pages received from hypervisor, no holes and ordered */ > > From toolstack you mean. It is the one setting it up. And I hope it sets it up > based on the E820 it has constructed as well? Otherwise it would be a bit > awkward if those two were different. > >> + for (i = 0; i < numa_topo.nr_nodes; i++) { >> + start = vblk[i].start; >> + end = vblk[i].end; >> + size = end - start; >> + pr_info("VNMA blk[%d] size is %#010Lx start = %#010Lx end = %#010Lx\n", >> + i, size, start, end); > > pr_debug perhaps? > >> + if (numa_add_memblk(i, start, end)) { >> + pr_info("vNUMA: Could not add memblk[%d]\n", i); >> + rc = -EAGAIN; > > Can you unroll the NUMA topology if this fails? Or is that not a problem? 
> >> + goto errout; >> + } >> + node_set(i, numa_nodes_parsed); >> + } >> + setup_nr_node_ids(); >> + /* Setting the cpu, apicid to node */ >> + for_each_cpu(cpu, cpu_possible_mask) { >> + pr_info("Setting up vcpu[%d] to node[%d]\n", cpu, cpu_to_node[cpu]); > > pr_debug >> + set_apicid_to_node(cpu, cpu_to_node[cpu]); >> + numa_set_node(cpu, cpu_to_node[cpu]); >> + __apicid_to_node[cpu] = cpu_to_node[cpu]; >> + cpumask_set_cpu(cpu, node_to_cpumask_map[cpu_to_node[cpu]]); >> + } >> + setup_nr_node_ids(); > > How come this is being called twice? It should have a comment > explaining this extra call. > >> + rc = 0; >> + for (i = 0; i < numa_topo.nr_nodes; i++) >> + for (j = 0; j < numa_topo.nr_nodes; j++) { >> + numa_set_distance(i, j, *(vdistance + j*numa_topo.nr_nodes + i)); > > You could simply this by just doing: > > int idx = (j * numa_topo.nr_nodes) + i; > > ... *(vdistance + idx)); > >> + pr_info("distance %d <-> %d = %d\n", i, j, *(vdistance + j*numa_topo.nr_nodes + i)); > > pr_debug > >> + } >> +errout: >> + if (phys) >> + memblock_free(__pa(phys), mem_size); >> + if (physd) >> + memblock_free(__pa(physd), dist_size); >> + if (physc) >> + memblock_free(__pa(physc), cpu_to_node_size); >> + if (rc) >> + pr_debug("XEN: hypercall failed to get vNUMA topology.\n"); > > And include the ''rc'' value in the output please. >> + return rc; >> +} >> + >> #endif >> diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h >> index 2ecfe4f..16b8d87 100644 >> --- a/include/xen/interface/memory.h >> +++ b/include/xen/interface/memory.h >> @@ -263,4 +263,5 @@ struct xen_remove_from_physmap { >> }; >> DEFINE_GUEST_HANDLE_STRUCT(xen_remove_from_physmap); >> >> +#define XENMEM_get_vnuma_info 25 >> #endif /* __XEN_PUBLIC_MEMORY_H__ */ >> -- >> 1.7.10.4 >>-- Elena
Konrad Rzeszutek Wilk
2013-Aug-28 16:01 UTC
Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
On Tue, Aug 27, 2013 at 06:37:35PM -0700, Matt Wilson wrote:
> On Tue, Aug 27, 2013 at 06:27:15PM -0700, Matt Wilson wrote:
> > On Tue, Aug 27, 2013 at 04:52:59AM -0400, Elena Ufimtseva wrote:
> > > Uses subop hypercall to request XEN about vnuma topology.
> > > Sets the memory blocks (aligned by XEN), cpus, distance table
> > > on boot. NUMA support should be compiled in kernel.
> >
> > Are we *really sure* that we want to go this route for PV vNUMA?
> > Couldn't we build just enough(tm) of the ACPI tables to express the
> > NUMA topology when constructing the domain? That's what we do for the
> > e820 map.
>
> Ignore me somewhat, since the e820 information is retrieved via
> hypercall similar to what you're proposing.
>
> Still, if there's some way that we can reuse existing Linux code
> rather than bolting on a completely parallel mechanism to set this up
> under PV I think it'd be better.

That would also parallel the work you do with ACPI right?

We could enable ACPI parsing in a PV guest and provide one table - the
SLIT (or SRAT).

But I don't know enough about SRAT to know whether this is something
that represents truly everything we need?

>
> --msw
Matt Wilson
2013-Aug-28 16:38 UTC
Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
On Wed, Aug 28, 2013 at 12:01:48PM -0400, Konrad Rzeszutek Wilk wrote:
> On Tue, Aug 27, 2013 at 06:37:35PM -0700, Matt Wilson wrote:
> > On Tue, Aug 27, 2013 at 06:27:15PM -0700, Matt Wilson wrote:
> > > On Tue, Aug 27, 2013 at 04:52:59AM -0400, Elena Ufimtseva wrote:
> > > > Uses subop hypercall to request XEN about vnuma topology.
> > > > Sets the memory blocks (aligned by XEN), cpus, distance table
> > > > on boot. NUMA support should be compiled in kernel.
> > >
> > > Are we *really sure* that we want to go this route for PV vNUMA?
> > > Couldn't we build just enough(tm) of the ACPI tables to express the
> > > NUMA topology when constructing the domain? That's what we do for the
> > > e820 map.
> >
> > Ignore me somewhat, since the e820 information is retrieved via
> > hypercall similar to what you're proposing.
> >
> > Still, if there's some way that we can reuse existing Linux code
> > rather than bolting on a completely parallel mechanism to set this up
> > under PV I think it'd be better.
>
> That would also parallel the work you do with ACPI right?

Yes.

> We could enable ACPI parsing in a PV guest and provide one table - the
> SLIT (or SRAT).

Right, it'd be the SRAT table for the resource affinity and a SLIT
table for the locality/distance information.

> But I don't know enough about SRAT to know whether this is something
> that represents truly everything we need?

The SRAT table has processor objects and memory objects. A processor
object maps a logical processor number to its initial APIC ID and
provides the node information. A memory object specifies the start and
length for a memory region and provides the node information.

For SLIT, the entries are a matrix of distances.

Here are the structs:

struct acpi_20_srat_processor {
    uint8_t type;
    uint8_t length;
    uint8_t domain;
    uint8_t apic_id;
    uint32_t flags;
    uint8_t sapic_id;
    uint8_t domain_hi[3];
    uint32_t reserved;
};

struct acpi_20_srat_memory {
    uint8_t type;
    uint8_t length;
    uint8_t domain;
    uint8_t domain_hi[3]; /* this is ACPI 3.0, reserved in 2.0 */
    uint16_t reserved;
    uint32_t base_address_lo;
    uint32_t base_address_hi;
    uint32_t length_lo;
    uint32_t length_hi;
    uint32_t reserved2;
    uint32_t flags;
    uint32_t reserved3[2];
};

struct acpi_20_slit {
    struct acpi_header header;
    uint64_t nr_sys_loc;
    uint8_t entry[];
};

--msw
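For what it's worth, the SLIT body is an N x N byte matrix of relative distances (10 meaning "local" in the ACPI definition), so a consumer would index it roughly as below. This is a sketch based on the ACPI 2.0 layout quoted above, not on anything in this series.

/* Distance from locality "from" to locality "to" in an acpi_20_slit. */
static uint8_t slit_distance(const struct acpi_20_slit *slit,
                             unsigned int from, unsigned int to)
{
    /* entry[] holds nr_sys_loc * nr_sys_loc relative distances. */
    return slit->entry[from * slit->nr_sys_loc + to];
}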
Elena Ufimtseva
2013-Aug-28 20:10 UTC
Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
Hi Matt Do you construct these tables via hypercalls? Will be good to see the details on how you do it and if I can use it as well. Thank you. Elena On Wed, Aug 28, 2013 at 12:38 PM, Matt Wilson <msw@amazon.com> wrote:> On Wed, Aug 28, 2013 at 12:01:48PM -0400, Konrad Rzeszutek Wilk wrote: >> On Tue, Aug 27, 2013 at 06:37:35PM -0700, Matt Wilson wrote: >> > On Tue, Aug 27, 2013 at 06:27:15PM -0700, Matt Wilson wrote: >> > > On Tue, Aug 27, 2013 at 04:52:59AM -0400, Elena Ufimtseva wrote: >> > > > Uses subop hypercall to request XEN about vnuma topology. >> > > > Sets the memory blocks (aligned by XEN), cpus, distance table >> > > > on boot. NUMA support should be compiled in kernel. >> > > >> > > Are we *really sure* that we want to go this route for PV vNUMA? >> > > Couldn''t we build just enough(tm) of the ACPI tables to express the >> > > NUMA topology when constructing the domain? That''s what we do for the >> > > e820 map. >> > >> > Ignore me somewhat, since the e820 information is retrieved via >> > hypercall similar to what you''re proposing. >> > >> > Still, if there''s some way that we can reuse existing Linux code >> > rather than bolting on a completely parallel mechanism to set this up >> > under PV I think it''d be better. >> >> That would also parallel the work you do with ACPI right? > > Yes. > >> We could enable ACPI parsing in a PV guest and provide one table - the >> SLIT (or SRAT). > > Right, it''d be the SRAT table for the resource affinity and a SLIT > table for the locality/distance information. > >> But I don''t know enough about SRAT to know whether this is something >> that represents truly everything we need? > > The SRAT table has processor objects and memory objects. A processor > object maps a logical processor number to its initial APIC ID and > provides the node information. A memory object specifies the start and > length for a memory region and provides the node information. > > For SLIT, the entries are a matrix of distances. > > Here are the structs: > > struct acpi_20_srat_processor { > uint8_t type; > uint8_t length; > uint8_t domain; > uint8_t apic_id; > uint32_t flags; > uint8_t sapic_id; > uint8_t domain_hi[3]; > uint32_t reserved; > }; > > struct acpi_20_srat_memory { > uint8_t type; > uint8_t length; > uint8_t domain; > uint8_t domain_hi[3]; /* this is ACPI 3.0, reserved in 2.0 */ > uint16_t reserved; > uint32_t base_address_lo; > uint32_t base_address_hi; > uint32_t length_lo; > uint32_t length_hi; > uint32_t reserved2; > uint32_t flags; > uint32_t reserved3[2]; > }; > > struct acpi_20_slit { > struct acpi_header header; > uint64_t nr_sys_loc; > uint8_t entry[]; > }; > > --msw-- Elena
Dario Faggioli
2013-Aug-28 22:21 UTC
Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
On mer, 2013-08-28 at 09:38 -0700, Matt Wilson wrote:
> On Wed, Aug 28, 2013 at 12:01:48PM -0400, Konrad Rzeszutek Wilk wrote:
> > That would also parallel the work you do with ACPI right?
>
> Yes.
>
I see. It's hard to comment, since I have only seen some previous (off
list) versiona of Elena's code (and won't have time to properly review
this one untill Monday), and I haven't seen Matt's code at all...

Anyway...

> > We could enable ACPI parsing in a PV guest and provide one table - the
> > SLIT (or SRAT).
>
> Right, it'd be the SRAT table for the resource affinity and a SLIT
> table for the locality/distance information.
>
... I see the point in sharing code for HVM and PV. However, I'm still
not convinced this would be something valuable to do with this specific
hunk, mostly because it looks really easy and clean to hook up at the
numa_init() stage (i.e., what Elena is doing), that anything else I can
think of looks like more work. :-P

> > But I don't know enough about SRAT to know whether this is something
> > that represents truly everything we need?
>
> The SRAT table has processor objects and memory objects. A processor
> object maps a logical processor number to its initial APIC ID and
> provides the node information. A memory object specifies the start and
> length for a memory region and provides the node information.
>
> For SLIT, the entries are a matrix of distances.
>
> Here are the structs:
>
> [snip]
>
Ok, thanks for the very useful info. What would be interesting to know
is where and how Linux reads the information from ACPI and fill these
structures.

The current Elena's approach is 1 hypercall, during early NUMA
initialization, and that is pretty self-contained (which is the thing I
like most about it).

How easy is it to look up the places where each one of the tables gets
filled, intercept the code/calls doing that, and replace it properly for
our use case? How easy is it to "xen-ify" those call sites (stuff like
'#ifdef CONFIG_XEN' or/and is_xen_domain() )? How many hypercalls would
it require? Is it possible to have one doing all the work, or would we
need something like one for each table?

As I said, I can't check the details about it right now, but it sounds
like more work than Elena's xen_numa_init().

Thanks and Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
Dario Faggioli
2013-Aug-28 23:12 UTC
Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
On mar, 2013-08-27 at 10:33 -0400, Konrad Rzeszutek Wilk wrote:
> Unfortunatly that won't happen if you boot this under dom0. It will
> find on some AMD machines the AMD Northbridge and try to scan that.
> And blow up.
>
Well, perhaps this was not evident enough in Elena's mails and patches,
but her current target is actually DomU. :-P

However, that's only because we're proceeding in steps, so yes, once
DomU will be working, Dom0 will become the target, and we'll have to
start caring about not making things explode! :-O

> If you look at the git commit that did the 'numa = 0' you will see
> the backstory.
>
That's something interesting to look at... I remember having seen that
discussion but not the details. I'll check when back from vacation.

> I think part of enablement of 'numa = 1' is to wrap it with
>
> if (xen_initial_domain() && xen_supports_this_hypercall())
> 	numa = 1;
>
> in the #2 patch.
>
Mmm... Did you mean '!xen_initial_domain()' ? Or is it me that did not
understand your comment about Dom0 above?

In case you did mean '!', again, this would be fine for now, since the
target is DomUs. At the point when we will start chasing Dom0, it would
be interesting to find a solution to this issue... :-/

Anyway, thanks for pointing this out.

Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
Matt Wilson
2013-Aug-28 23:25 UTC
Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
On Wed, Aug 28, 2013 at 04:10:11PM -0400, Elena Ufimtseva wrote:
> Hi Matt
>
> Do you construct these tables via hypercalls?
> Will be good to see the details on how you do it and if I can use it as well.
> Thank you.

All the ACPI tables get built in guest memory by hvmloader. The
configuration is passed from libxc into hvmloader via the hvm_info
table.

I can't take credit for all of this work. Most of it was written and
proposed by Andre Przywara in 2010.

http://lists.xen.org/archives/html/xen-devel/2010-02/msg00280.html

--msw
Matt Wilson
2013-Aug-29 00:11 UTC
Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
On Thu, Aug 29, 2013 at 12:21:40AM +0200, Dario Faggioli wrote:
> On mer, 2013-08-28 at 09:38 -0700, Matt Wilson wrote:
> > On Wed, Aug 28, 2013 at 12:01:48PM -0400, Konrad Rzeszutek Wilk wrote:
> > > That would also parallel the work you do with ACPI right?
> >
> > Yes.
> >
> I see. It's hard to comment, since I have only seen some previous (off
> list) versions of Elena's code (and won't have time to properly review
> this one until Monday), and I haven't seen Matt's code at all...

See the link I gave in my other reply for the older proposed patch I
based this work on. Here's the head of the patchset:

http://lists.xen.org/archives/html/xen-devel/2010-02/msg00279.html

> > > We could enable ACPI parsing in a PV guest and provide one table - the
> > > SLIT (or SRAT).
> >
> > Right, it'd be the SRAT table for the resource affinity and a SLIT
> > table for the locality/distance information.
> >
> ... I see the point in sharing code for HVM and PV. However, I'm still
> not convinced this would be something valuable to do with this specific
> hunk, mostly because it looks really easy and clean to hook up at the
> numa_init() stage (i.e., what Elena is doing), that anything else I can
> think of looks like more work. :-P

I agree.

> > > But I don't know enough about SRAT to know whether this is something
> > > that represents truly everything we need?
> >
> > The SRAT table has processor objects and memory objects. A processor
> > object maps a logical processor number to its initial APIC ID and
> > provides the node information. A memory object specifies the start and
> > length for a memory region and provides the node information.
> >
> > For SLIT, the entries are a matrix of distances.
> >
> > Here are the structs:
> >
> > [snip]
> >
> Ok, thanks for the very useful info. What would be interesting to know
> is where and how Linux reads the information from ACPI and fills these
> structures.
>
> Elena's current approach is one hypercall, during early NUMA
> initialization, and that is pretty self-contained (which is the thing I
> like most about it).
>
> How easy is it to look up the places where each one of the tables gets
> filled, intercept the code/calls doing that, and replace it properly for
> our use case? How easy is it to "xen-ify" those call sites (stuff like
> '#ifdef CONFIG_XEN' or/and is_xen_domain())? How many hypercalls would
> it require? Is it possible to have one doing all the work, or would we
> need something like one for each table?

I think it wouldn't be too hard to construct the static ACPI tables
and provide them in acpi_os_get_root_pointer().

> As I said, I can't check the details about it right now, but it sounds
> like more work than Elena's xen_numa_init().

Yes, it's probably a bit more work.

--msw
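To make the "matrix of distances" remark concrete: SLIT stores an N x N
byte matrix flattened row-major, with 10 conventionally meaning "local".
The following toy, standalone program (not Xen, hvmloader, or kernel
code; the two-node values are made up) just shows how a vNUMA distance
table would map onto such a flat entry[] array.

/*
 * Standalone illustration of SLIT-style distance flattening.
 * entry[i * N + j] holds the distance from node i to node j;
 * 10 = local, 20 = remote are the usual example values.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const unsigned int nr_nodes = 2;
	const uint8_t entry[] = { 10, 20,
	                          20, 10 };

	for (unsigned int i = 0; i < nr_nodes; i++)
		for (unsigned int j = 0; j < nr_nodes; j++)
			printf("distance(node %u -> node %u) = %u\n",
			       i, j, entry[i * nr_nodes + j]);
	return 0;
}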
George Dunlap
2013-Aug-29 09:16 UTC
Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
On Thu, Aug 29, 2013 at 12:25 AM, Matt Wilson <msw@amazon.com> wrote:
> On Wed, Aug 28, 2013 at 04:10:11PM -0400, Elena Ufimtseva wrote:
>> Hi Matt
>>
>> Do you construct these tables via hypercalls?
>> Will be good to see the details on how you do it and if I can use it as well.
>> Thank you.
>
> All the ACPI tables get built in guest memory by hvmloader.

Our end goal is to be able to have dom0 boot with NUMA, so whatever we
do will have to be constructed in Xen as well. If making the ACPI
tables is fairly straightforward, that might be an option; but if it's
really complicated, we probably want to stick with what we have.

 -George
David Vrabel
2013-Aug-29 13:41 UTC
Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
On 29/08/13 01:11, Matt Wilson wrote:
>
> I think it wouldn't be too hard to construct the static ACPI tables
> and provide them in acpi_os_get_root_pointer().

Elena's approach looks fine to me from a Linux point of view.

It's still not clear how presenting the information via ACPI would be
an improvement. I also think it is preferable for the hypercall ABI
for PV guests to be independent of architecture specifics where
possible.

If you posted Xen and Linux series producing/using the ACPI SRAT and
SLIT tables for dom0 and domU PV guests we could better evaluate that
option but until then I'm inclined to consider Elena's approach as the
better one.

David
Konrad Rzeszutek Wilk
2013-Aug-29 14:07 UTC
Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
On Thu, Aug 29, 2013 at 01:12:15AM +0200, Dario Faggioli wrote:
> On mar, 2013-08-27 at 10:33 -0400, Konrad Rzeszutek Wilk wrote:
> > Unfortunately that won't happen if you boot this under dom0. It will
> > find on some AMD machines the AMD Northbridge and try to scan that.
> > And blow up.
> >
> Well, perhaps this was not evident enough in Elena's mails and patches,
> but her current target is actually DomU. :-P
>
> However, that's only because we're proceeding in steps, so yes, once
> DomU is working, Dom0 will become the target, and we'll have to start
> caring about not making things explode! :-O
>
> > If you look at the git commit that did the 'numa = 0' you will see
> > the backstory.
> >
> That's something interesting to look at... I remember having seen that
> discussion but not the details. I'll check when back from vacation.
>
> > I think part of enablement of 'numa = 1' is to wrap it with
> >
> >	if (xen_initial_domain() && xen_supports_this_hypercall())
> >		numa = 1;
> >
> > in the #2 patch.
> >
> Mmm... Did you mean '!xen_initial_domain()'? Or is it me that did not
> understand your comment about Dom0 above?

For right now '!' would suffice.

> In case you did mean '!', again, this would be fine for now, since the
> target is DomUs. At the point when we start chasing Dom0, it would be
> interesting to find a solution to this issue... :-/
>
> Anyway, thanks for pointing this out.
>
> Regards,
> Dario
>
> --
> <<This happens because I choose it to happen!>> (Raistlin Majere)
> -----------------------------------------------------------------
> Dario Faggioli, Ph.D, http://about.me/dario.faggioli
> Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
>
Konrad Rzeszutek Wilk
2013-Aug-29 14:23 UTC
Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
On Thu, Aug 29, 2013 at 02:41:23PM +0100, David Vrabel wrote:
> On 29/08/13 01:11, Matt Wilson wrote:
> >
> > I think it wouldn't be too hard to construct the static ACPI tables
> > and provide them in acpi_os_get_root_pointer().
>
> Elena's approach looks fine to me from a Linux point of view.

From a maintenance perspective the question is:

 - Why not re-use existing code? As in, if both PV and HVM can use
   SRAT, why not do it that way? This way you are exercising the same
   code path in both guests and it means less bugs to chase.

   Incidentally this also means that the mechanism to fetch the
   NUMA information from the hypervisor and construct the SRAT tables
   from can be done the same in both hvmloader and Linux kernel.
   Which is nice - b/c if you find bugs in one, there is a chance they
   are the same in the other. Also this means one hypercall for both
   PV and HVM, which is neat as well.

 - Lastly, the question is - a new 'interface' in the generic code is
   added to deal exclusively with Xen. Why special-case it when one
   can use the SRAT, for example? Is there a compelling technical
   argument? (Is SRAT inferior to vNUMA?)

Of course the counter argument is:

 - It will require more work as now Linux has to use the hypercall to
   construct an ACPI SRAT table. And we need to create a "fake" SRAT
   table and as well an RS.. something structure so that ACPI code can
   find it.

   Or perhaps this is something libxl can do? Which would mean the code
   to construct this and for HVM is both in the Xen code base. And the
   only thing Linux needs to do is turn NUMA on for PV guests if it
   finds an ACPI table (perhaps by scanning the E820?).

 - Will it work? As in, will a PV domU be able to run with just one
   ACPI table - just the SRAT and nothing else? Or will it fall flat
   on its face?

> It's still not clear how presenting the information via ACPI would be an
> improvement. I also think it is preferable for the hypercall ABI for
> PV guests to be independent of architecture specifics where possible.

I concur on the hypercall ABI - but I think the hypercall should be
available on both PV and HVM so that it can be used in both.

> If you posted Xen and Linux series producing/using the ACPI SRAT and
> SLIT tables for dom0 and domU PV guests we could better evaluate that
> option but until then I'm inclined to consider Elena's approach as the
> better one.

I think short term Elena's option is the better one as it gets it going
and we can experiment with it. Long term I think stashing the data in
ACPI SRAT/SLIT is right. But Elena might not get to it in the next
couple of months - which means somebody else will have to sign up for
that.

P.S.
I did for fun enable ACPI support in the PV domU guest and it boots
fine. It does complain about missing tables, but that was it.
George Dunlap
2013-Aug-29 14:32 UTC
Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
On 29/08/13 15:23, Konrad Rzeszutek Wilk wrote:
> On Thu, Aug 29, 2013 at 02:41:23PM +0100, David Vrabel wrote:
>> On 29/08/13 01:11, Matt Wilson wrote:
>>> I think it wouldn't be too hard to construct the static ACPI tables
>>> and provide them in acpi_os_get_root_pointer().
>> Elena's approach looks fine to me from a Linux point of view.
> From a maintenance perspective the question is:
>
> - Why not re-use existing code? As in, if both PV and HVM can use
>   SRAT, why not do it that way? This way you are exercising the same
>   code path in both guests and it means less bugs to chase.
>
>   Incidentally this also means that the mechanism to fetch the
>   NUMA information from the hypervisor and construct the SRAT tables
>   from can be done the same in both hvmloader and Linux kernel.

If the SRAT tables are built by hvmloader for HVM guests, couldn't
hvmloader make this hypercall and construct the tables, instead of
having the domain builder do it? That would mean that most of the
codepath is the same; HVM just has the extra step of encoding and
decoding the information in SRAT form.

>> If you posted Xen and Linux series producing/using the ACPI SRAT and
>> SLIT tables for dom0 and domU PV guests we could better evaluate that
>> option but until then I'm inclined to consider Elena's approach as the
>> better one.
> I think short term Elena's option is the better one as it gets it going
> and we can experiment with it. Long term I think stashing the data in
> ACPI SRAT/SLIT is right. But Elena might not get to it in the next
> couple of months - which means somebody else will have to sign up for
> that.

I definitely think that Elena needs to continue on the same path for
now, so she can actually have closure on the project in a reasonable
amount of time.

 -George
Konrad Rzeszutek Wilk
2013-Aug-29 14:51 UTC
Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
On Thu, Aug 29, 2013 at 03:32:13PM +0100, George Dunlap wrote:
> On 29/08/13 15:23, Konrad Rzeszutek Wilk wrote:
> >On Thu, Aug 29, 2013 at 02:41:23PM +0100, David Vrabel wrote:
> >>On 29/08/13 01:11, Matt Wilson wrote:
> >>>I think it wouldn't be too hard to construct the static ACPI tables
> >>>and provide them in acpi_os_get_root_pointer().
> >>Elena's approach looks fine to me from a Linux point of view.
> > From a maintenance perspective the question is:
> >
> > - Why not re-use existing code? As in, if both PV and HVM can use
> >   SRAT, why not do it that way? This way you are exercising the same
> >   code path in both guests and it means less bugs to chase.
> >
> >   Incidentally this also means that the mechanism to fetch the
> >   NUMA information from the hypervisor and construct the SRAT tables
> >   from can be done the same in both hvmloader and Linux kernel.
>
> If the SRAT tables are built by hvmloader for HVM guests, couldn't
> hvmloader make this hypercall and construct the tables, instead of
> having the domain builder do it? That would mean that most of the
> codepath is the same; HVM just has the extra step of encoding and
> decoding the information in SRAT form.

Correct. Or the domain builder can build it for both PV and HVM. We do
have full control of the E820 for PV - so we can make an "E820_ACPI"
region and stash the ACPI tables there. Either way would work.
Whichever one means more code sharing is probably the best bang for
the buck, so to say.

> >>If you posted Xen and Linux series producing/using the ACPI SRAT and
> >>SLIT tables for dom0 and domU PV guests we could better evaluate that
> >>option but until then I'm inclined to consider Elena's approach as the
> >>better one.
> >I think short term Elena's option is the better one as it gets it going
> >and we can experiment with it. Long term I think stashing the data in
> >ACPI SRAT/SLIT is right. But Elena might not get to it in the next
> >couple of months - which means somebody else will have to sign up for
> >that.
>
> I definitely think that Elena needs to continue on the same path for
> now, so she can actually have closure on the project in a reasonable
> amount of time.

I concur. The caveat is that it could mean that the x86 maintainers
might object to the generic path having the special case for Xen and
that particular patch part won't be upstreamed until it has been taken
care of.

If Elena is OK with that possibility - then that is fine with me.
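To illustrate the E820 suggestion above: the struct below mirrors the
Linux/PV e820entry layout, and E820_ACPI (type 3) is the standard "ACPI
reclaimable" type for memory holding ACPI tables. The placement and
size are made up; nothing here comes from posted Xen or Linux code.

/*
 * Standalone illustration only: a region the domain builder might add
 * so a PV guest could discover Xen-built SRAT/SLIT tables by walking
 * its E820 map.  Address and size are purely hypothetical.
 */
#include <stdio.h>
#include <stdint.h>

struct e820entry {
	uint64_t addr;      /* start of the region */
	uint64_t size;      /* length of the region */
	uint32_t type;      /* E820_RAM, E820_RESERVED, E820_ACPI, ... */
} __attribute__((packed));

#define E820_ACPI 3

int main(void)
{
	struct e820entry xen_acpi_region = {
		.addr = 0xfc000000ULL,   /* hypothetical placement */
		.size = 2 * 4096,        /* assumed room for RSDP + SRAT + SLIT */
		.type = E820_ACPI,
	};

	printf("E820: %#llx - %#llx (type %u)\n",
	       (unsigned long long)xen_acpi_region.addr,
	       (unsigned long long)(xen_acpi_region.addr + xen_acpi_region.size),
	       xen_acpi_region.type);
	return 0;
}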
Dario Faggioli
2013-Aug-29 22:29 UTC
Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
On gio, 2013-08-29 at 10:51 -0400, Konrad Rzeszutek Wilk wrote:
> On Thu, Aug 29, 2013 at 03:32:13PM +0100, George Dunlap wrote:
> > On 29/08/13 15:23, Konrad Rzeszutek Wilk wrote:
> > > - Why not re-use existing code? As in, if both PV and HVM can use
> > >   SRAT, why not do it that way? This way you are exercising the same
> > >   code path in both guests and it means less bugs to chase.
> > >
> > >   Incidentally this also means that the mechanism to fetch the
> > >   NUMA information from the hypervisor and construct the SRAT tables
> > >   from can be done the same in both hvmloader and Linux kernel.
> >
> > If the SRAT tables are built by hvmloader for HVM guests, couldn't
> > hvmloader make this hypercall and construct the tables, instead of
> > having the domain builder do it? That would mean that most of the
> > codepath is the same; HVM just has the extra step of encoding and
> > decoding the information in SRAT form.
>
> Correct.
>
Indeed. Also, Matt mentioned the HVM implementation does most of the
work in libxc and hvmloader, is that the case? (Matt, I knew about the
"old" HVM-NUMA series from Andre, but I don't have the details fresh
enough right now... Will take another look ASAP.)

If yes, please consider that, when talking about PV-vNUMA, although
right now the series addresses DomU only, we plan to make it possible
for Dom0 to have a virtual NUMA topology too. In that case, I don't
think any code from libxc and hvmloader could be shared, could it?

So, to me, it sort of looks like we risk introducing more code
duplication than what we're trying to avoid! :-P

> > >I think short term Elena's option is the better one as it gets it going
> > >and we can experiment with it. Long term I think stashing the data in
> > >ACPI SRAT/SLIT is right. But Elena might not get to it in the next
> > >couple of months - which means somebody else will have to sign up for
> > >that.
> >
> > I definitely think that Elena needs to continue on the same path for
> > now, so she can actually have closure on the project in a reasonable
> > amount of time.
>
> I concur. The caveat is that it could mean that the x86 maintainers might
> object to the generic path having the special case for Xen and that
> particular patch part won't be upstreamed until it has been taken care of.
>
> If Elena is OK with that possibility - then that is fine with me.
>
Sorry, I'm not sure I understand what you mean here. When talking about
PV-NUMA, with Elena's approach, we have a Xen special case in
numa_init(). With the fake table approach, we'd have a Xen special case
in the ACPI parsing code (Matt was talking about something like
acpi_os_get_root_pointer()), wouldn't we?

So, either way, we'll have a Xen special case; the only difference I see
is that, in the latter, we'd probably have to deal with the ACPI
maintainers instead of with the x86 ones. :-)

FWIW, x86_numa_init() already looks like this:

void __init x86_numa_init(void)
{
	if (!numa_off) {
#ifdef CONFIG_X86_NUMAQ
		if (!numa_init(numaq_numa_init))
			return;
#endif
#ifdef CONFIG_ACPI_NUMA
		if (!numa_init(x86_acpi_numa_init))
			return;
#endif
#ifdef CONFIG_AMD_NUMA
		if (!numa_init(amd_numa_init))
			return;
#endif
	}

	numa_init(dummy_numa_init);
}

I.e., quite a bit of architecture/implementation special casing already!
So, perhaps, having some more `#ifdef CONFIG_XEN' and/or `if
(xen_domain())' in there won't be that much upsetting after all. :-)

Thanks and Regards,
Dario

--
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
Konrad Rzeszutek Wilk
2013-Aug-30 12:57 UTC
Re: [PATCH RFC 1/2] linux/vnuma: vnuma support for pv guest
On Fri, Aug 30, 2013 at 12:29:09AM +0200, Dario Faggioli wrote:
> On gio, 2013-08-29 at 10:51 -0400, Konrad Rzeszutek Wilk wrote:
> > On Thu, Aug 29, 2013 at 03:32:13PM +0100, George Dunlap wrote:
> > > On 29/08/13 15:23, Konrad Rzeszutek Wilk wrote:
> > > > - Why not re-use existing code? As in, if both PV and HVM can use
> > > >   SRAT, why not do it that way? This way you are exercising the same
> > > >   code path in both guests and it means less bugs to chase.
> > > >
> > > >   Incidentally this also means that the mechanism to fetch the
> > > >   NUMA information from the hypervisor and construct the SRAT tables
> > > >   from can be done the same in both hvmloader and Linux kernel.
> > >
> > > If the SRAT tables are built by hvmloader for HVM guests, couldn't
> > > hvmloader make this hypercall and construct the tables, instead of
> > > having the domain builder do it? That would mean that most of the
> > > codepath is the same; HVM just has the extra step of encoding and
> > > decoding the information in SRAT form.
> >
> > Correct.
> >
> Indeed. Also, Matt mentioned the HVM implementation does most of the
> work in libxc and hvmloader, is that the case? (Matt, I knew about the
> "old" HVM-NUMA series from Andre, but I don't have the details fresh
> enough right now... Will take another look ASAP.)
>
> If yes, please consider that, when talking about PV-vNUMA, although
> right now the series addresses DomU only, we plan to make it possible
> for Dom0 to have a virtual NUMA topology too. In that case, I don't
> think any code from libxc and hvmloader could be shared, could it?

But it could be stuck in the hypervisor at that point. So all of that
would reside within the Xen source base - in three places. Yuck.

> So, to me, it sort of looks like we risk introducing more code
> duplication than what we're trying to avoid! :-P

Would be nice if this could be somehow stuck in a library that all
three (hvmloader, libxc and hypervisor) could be built with.

> > > >I think short term Elena's option is the better one as it gets it going
> > > >and we can experiment with it. Long term I think stashing the data in
> > > >ACPI SRAT/SLIT is right. But Elena might not get to it in the next
> > > >couple of months - which means somebody else will have to sign up for
> > > >that.
> > >
> > > I definitely think that Elena needs to continue on the same path for
> > > now, so she can actually have closure on the project in a reasonable
> > > amount of time.
> >
> > I concur. The caveat is that it could mean that the x86 maintainers might
> > object to the generic path having the special case for Xen and that
> > particular patch part won't be upstreamed until it has been taken care of.
> >
> > If Elena is OK with that possibility - then that is fine with me.
> >
> Sorry, I'm not sure I understand what you mean here. When talking about
> PV-NUMA, with Elena's approach, we have a Xen special case in
> numa_init(). With the fake table approach, we'd have a Xen special case
> in the ACPI parsing code (Matt was talking about something like
> acpi_os_get_root_pointer()), wouldn't we?
>
> So, either way, we'll have a Xen special case; the only difference I see
> is that, in the latter, we'd probably have to deal with the ACPI
> maintainers instead of with the x86 ones. :-)

Correct.

> FWIW, x86_numa_init() already looks like this:
>
> void __init x86_numa_init(void)
> {
> 	if (!numa_off) {
> #ifdef CONFIG_X86_NUMAQ
> 		if (!numa_init(numaq_numa_init))
> 			return;
> #endif
> #ifdef CONFIG_ACPI_NUMA
> 		if (!numa_init(x86_acpi_numa_init))
> 			return;
> #endif
> #ifdef CONFIG_AMD_NUMA
> 		if (!numa_init(amd_numa_init))
> 			return;
> #endif
> 	}
>
> 	numa_init(dummy_numa_init);
> }
>
> I.e., quite a bit of architecture/implementation special casing already!
> So, perhaps, having some more `#ifdef CONFIG_XEN' and/or `if
> (xen_domain())' in there won't be that much upsetting after all. :-)

Or perhaps fix that special casing and have something similar to the
iommu_detect API. And make that 'iommu_detect' be more generic now.

> Thanks and Regards,
> Dario
>
> --
> <<This happens because I choose it to happen!>> (Raistlin Majere)
> -----------------------------------------------------------------
> Dario Faggioli, Ph.D, http://about.me/dario.faggioli
> Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
>
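For what it's worth, a rough sketch of the shape such an
iommu_detect-like cleanup could take. This is not a proposed patch:
the struct, the *_detect() predicates and the table are invented for
illustration; only numa_init(), numa_off, dummy_numa_init(),
x86_acpi_numa_init(), amd_numa_init() and the xen_numa_init() from this
series are names that actually appear in the discussion.

/*
 * Sketch only: table-driven replacement for the #ifdef chain in
 * x86_numa_init().  Each backend supplies a predicate saying whether it
 * applies on this boot and an init function that fills the NUMA
 * memblock/distance information; the first backend that both applies
 * and succeeds wins, otherwise we fall back to dummy_numa_init().
 */
struct numa_detect {
	const char *name;
	bool (*applicable)(void);   /* is this backend usable on this boot? */
	int (*init)(void);          /* fills memblocks/distances on success */
};

static struct numa_detect numa_detectors[] __initdata = {
#ifdef CONFIG_XEN
	{ "xen",  xen_numa_detect,  xen_numa_init      },  /* detect() invented */
#endif
#ifdef CONFIG_ACPI_NUMA
	{ "acpi", acpi_numa_detect, x86_acpi_numa_init },  /* detect() invented */
#endif
#ifdef CONFIG_AMD_NUMA
	{ "amd",  amd_numa_detect,  amd_numa_init      },  /* detect() invented */
#endif
};

void __init x86_numa_init(void)
{
	unsigned int i;

	if (!numa_off) {
		for (i = 0; i < ARRAY_SIZE(numa_detectors); i++) {
			if (numa_detectors[i].applicable() &&
			    !numa_init(numa_detectors[i].init))
				return;
		}
	}

	numa_init(dummy_numa_init);
}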