Elena Ufimtseva
2013-Sep-13 08:50 UTC
[PATCH RFC v2 4/7] libxl/vNUMA: vNUMA libxl functionality.
vNUMA libxl supporting functionality. libxl supporting functionality for vNUMA includes: * having vNUMA memory areas sizes, transforms it to start and end pfns based on domain e820 map; * contructs vnode_to_pnode map for vNUMA nodes memory allocation and pass it to Xen; the mechanism considers automatic NUMA placement in case of presence of hardware NUMA; In best case scenario all vnodes will be allocated within one pnode. If the domain spans different pnodes, the vnodes will be one-by-one placed to pnodes. If such allocation is impossible due to the memory constraints, the allocation will use default mechanism; this is worst case scenario. * passes to Xen vNUMA topology of domain; Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com> --- Changes since v1: * chnged libxl__init_vnodemap to take into account automatic NUMA placement; vnode_to_pnode map is formed based on nodemap mask instead of all set of NUMA nodes; * defined LIBXL_BUILD_HAVE_VNUMA for external applications; * fixed int to unsigned int for non-negative structure members; TODO: * implement proposed by Dario mechanism where vnode_to_pnode mapping will be turned off in worst case when some of the vnodes cannot be bound to pnode instead of falling back to default (sparse) allocation for that node (if other ideas will not emerge); * define xc_dominfo vnuma related fields and add vnuma initiliaztion to xcinfo2xlinfo; --- tools/libxl/libxl.c | 19 ++++++++ tools/libxl/libxl.h | 20 +++++++- tools/libxl/libxl_arch.h | 5 ++ tools/libxl/libxl_dom.c | 105 +++++++++++++++++++++++++++++++++++++++++- tools/libxl/libxl_internal.h | 3 ++ tools/libxl/libxl_types.idl | 5 +- tools/libxl/libxl_x86.c | 86 ++++++++++++++++++++++++++++++++++ 7 files changed, 240 insertions(+), 3 deletions(-) diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c index 81785df..de809db 100644 --- a/tools/libxl/libxl.c +++ b/tools/libxl/libxl.c @@ -4293,6 +4293,25 @@ static int libxl__set_vcpuonline_qmp(libxl__gc *gc, uint32_t domid, } return 
0; } +int libxl_domain_setvnodes(libxl_ctx *ctx, + uint32_t domid, + uint16_t nr_vnodes, + uint16_t nr_vcpus, + vnuma_memblk_t *vnuma_memblks, + unsigned int *vdistance, + unsigned int *vcpu_to_vnode, + unsigned int *vnode_to_pnode) +{ + GC_INIT(ctx); + int ret; + ret = xc_domain_setvnodes(ctx->xch, domid, nr_vnodes, + nr_vcpus, vnuma_memblks, + vdistance, + vcpu_to_vnode, + vnode_to_pnode); + GC_FREE; + return ret; +} int libxl_set_vcpuonline(libxl_ctx *ctx, uint32_t domid, libxl_bitmap *cpumap) { diff --git a/tools/libxl/libxl.h b/tools/libxl/libxl.h index be19bf5..722b655 100644 --- a/tools/libxl/libxl.h +++ b/tools/libxl/libxl.h @@ -90,6 +90,14 @@ #define LIBXL_HAVE_BUILDINFO_HVM_VENDOR_DEVICE 1 /* + * LIBXL_HAVE_BUILDINFO_VNUMA indicates that vnuma topology will be + * build for the guest upon request and with VM configuration. + * It will try to define best allocation for vNUMA + * nodes on real NUMA nodes. + */ +#define LIBXL_HAVE_BUILDINFO_VNUMA 1 + +/* * libxl ABI compatibility * * The only guarantee which libxl makes regarding ABI compatibility @@ -275,7 +283,7 @@ #include <errno.h> #include <netinet/in.h> #include <sys/wait.h> /* for pid_t */ - +#include <xen/memory.h> #include <xentoollog.h> #include <libxl_uuid.h> @@ -706,6 +714,16 @@ void libxl_vcpuinfo_list_free(libxl_vcpuinfo *, int nr_vcpus); void libxl_device_vtpm_list_free(libxl_device_vtpm*, int nr_vtpms); void libxl_vtpminfo_list_free(libxl_vtpminfo *, int nr_vtpms); +/* vNUMA topology */ +int libxl_domain_setvnodes(libxl_ctx *ctx, + uint32_t domid, + uint16_t nr_vnodes, + uint16_t nr_vcpus, + vnuma_memblk_t *vnuma_memblks, + unsigned int *vdistance, + unsigned int *vcpu_to_vnode, + unsigned int *vnode_to_pnode); + /* * Devices * ======diff --git a/tools/libxl/libxl_arch.h b/tools/libxl/libxl_arch.h index abe6685..8710270 100644 --- a/tools/libxl/libxl_arch.h +++ b/tools/libxl/libxl_arch.h @@ -18,5 +18,10 @@ /* arch specific internal domain creation function */ int 
libxl__arch_domain_create(libxl__gc *gc, libxl_domain_config *d_config, uint32_t domid); +int libxl__vnuma_align_mem(libxl__gc *gc, + uint32_t domid, + struct libxl_domain_build_info *b_info, + vnuma_memblk_t *memblks); +unsigned long e820_memory_hole_size(unsigned long start, unsigned long end, struct e820entry e820[], int nr); #endif diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c index 6e2252a..0a21c44 100644 --- a/tools/libxl/libxl_dom.c +++ b/tools/libxl/libxl_dom.c @@ -201,6 +201,74 @@ static int numa_place_domain(libxl__gc *gc, uint32_t domid, return rc; } +#define set_all_vnodes(n) for(i=0; i< info->nr_vnodes; i++) \ + info->vnode_to_pnode[i] = n + +/* prepares vnode to pnode map for domain vNUMA memory allocation */ +int libxl__init_vnodemap(libxl__gc *gc, uint32_t domid, + libxl_domain_build_info *info) +{ + int i, n, start, nr_nodes, rc; + uint64_t *mems; + unsigned long long *claim; + libxl_numainfo *ninfo = NULL; + + rc = -EINVAL; + if (info->vnode_to_pnode == NULL) + info->vnode_to_pnode = calloc(info->nr_vnodes, sizeof(*info->vnode_to_pnode)); + set_all_vnodes(NUMA_NO_NODE); + /* + * check if we have any hardware NUMA nodes selected, + * otherwise NUMA_NO_NODE set and used default allocation + */ + if (libxl_bitmap_is_empty(&info->nodemap)) + return 0; + claim = calloc(info->nr_vnodes, sizeof(*claim)); + if (claim == NULL) + return rc; + mems = info->vnuma_memszs; + ninfo = libxl_get_numainfo(CTX, &nr_nodes); + if (ninfo == NULL) { + LOG(DEBUG, "No HW NUMA found\n"); + goto vnmapout; + } + /* check if all vnodes will fit in one node */ + libxl_for_each_set_bit(n, info->nodemap) { + if(ninfo[n].free/1024 >= info->max_memkb && + libxl_bitmap_test(&info->nodemap, n)) + { + /* + * all domain v-nodes will fit one p-node, + * p-node is a best candidate selected by automatic + * NUMA placement. + */ + set_all_vnodes(n); + return 0; + } + } + /* TODO: change algorithm. 
The current just fits the nodes + * Will be nice to have them also sorted by size */ + /* If no p-node found, will be set to NUMA_NO_NODE and allocation will fail */ + start = 0; + libxl_for_each_set_bit(n, info->nodemap) + { + for ( i = start; i < info->nr_vnodes; i++ ) + { + if ( ((claim[n] + mems[i]) <= ninfo[n].free) && + /*vnode was not set yet */ + (info->vnode_to_pnode[i] == NUMA_NO_NODE ) ) + { + info->vnode_to_pnode[i] = n; + claim[n] += mems[i]; + } + } + } + rc = 0; +vnmapout: + if (claim) free(claim); + return rc; +} + int libxl__build_pre(libxl__gc *gc, uint32_t domid, libxl_domain_config *d_config, libxl__domain_build_state *state) { @@ -232,6 +300,29 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid, if (rc) return rc; } + + if (info->nr_vnodes != 0) { + vnuma_memblk_t *memblks = libxl__calloc(gc, info->nr_vnodes, + sizeof(*memblks)); + libxl__vnuma_align_mem(gc, domid, info, memblks); + if (libxl__init_vnodemap(gc, domid, info)) { + LOG(DEBUG, "Failed to call init_vnodemap\n"); + rc = libxl_domain_setvnodes(ctx, domid, info->nr_vnodes, + info->max_vcpus, memblks, + info->vdistance, info->vcpu_to_vnode, + NULL); + } + else + rc = libxl_domain_setvnodes(ctx, domid, info->nr_vnodes, + info->max_vcpus, memblks, + info->vdistance, info->vcpu_to_vnode, + info->vnode_to_pnode); + if (rc < 0 ) LOG(DEBUG, "Failed to call xc_domain_setvnodes\n"); + } + else { + LOG(DEBUG, "Will not construct vNUMA topology with nodes %d\n", info->nr_vnodes); + } + libxl_domain_set_nodeaffinity(ctx, domid, &info->nodemap); libxl_set_vcpuaffinity_all(ctx, domid, info->max_vcpus, &info->cpumap); @@ -368,7 +459,19 @@ int libxl__build_pv(libxl__gc *gc, uint32_t domid, } } } - + if (info->nr_vnodes != 0) { + dom->nr_vnodes = info->nr_vnodes; + dom->vnode_to_pnode = malloc(info->nr_vnodes * sizeof(*info->vnode_to_pnode)); + dom->vmemsizes = malloc(info->nr_vnodes * sizeof(*info->vnuma_memszs)); + if (dom->vmemsizes == NULL || dom->vnode_to_pnode == NULL) { + LOGE(ERROR, "Failed 
to allocate memory for vNUMA domain image.\n"); + goto out; + } + memcpy(dom->vmemsizes, info->vnuma_memszs, + sizeof(*info->vnuma_memszs) * info->nr_vnodes); + memcpy(dom->vnode_to_pnode, info->vnode_to_pnode, + sizeof(*info->vnode_to_pnode) * info->nr_vnodes); + } dom->flags = flags; dom->console_evtchn = state->console_port; dom->console_domid = state->console_domid; diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h index f051d91..7f94de7 100644 --- a/tools/libxl/libxl_internal.h +++ b/tools/libxl/libxl_internal.h @@ -2709,6 +2709,7 @@ static inline void libxl__ctx_unlock(libxl_ctx *ctx) { #define CTX_LOCK (libxl__ctx_lock(CTX)) #define CTX_UNLOCK (libxl__ctx_unlock(CTX)) +#define NUMA_NO_NODE ~((uint32_t)0) /* * Automatic NUMA placement * @@ -2832,6 +2833,8 @@ void libxl__numa_candidate_put_nodemap(libxl__gc *gc, libxl_bitmap_copy(CTX, &cndt->nodemap, nodemap); } +int libxl__init_vnodemap(libxl__gc *gc, uint32_t domid, + libxl_domain_build_info *info); /* * Inserts "elm_new" into the sorted list "head". 
* diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl index 85341a0..5173c09 100644 --- a/tools/libxl/libxl_types.idl +++ b/tools/libxl/libxl_types.idl @@ -279,7 +279,10 @@ libxl_domain_build_info = Struct("domain_build_info",[ ("disable_migrate", libxl_defbool), ("cpuid", libxl_cpuid_policy_list), ("blkdev_start", string), - + ("vnuma_memszs", Array(uint64, "nr_vnodes")), + ("vcpu_to_vnode", Array(uint32, "nr_vnodemap")), + ("vdistance", Array(uint32, "nr_vdist")), + ("vnode_to_pnode", Array(uint32, "nr_vnode_to_pnode")), ("device_model_version", libxl_device_model_version), ("device_model_stubdomain", libxl_defbool), # if you set device_model you must set device_model_version too diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c index a78c91d..f7b1aeb 100644 --- a/tools/libxl/libxl_x86.c +++ b/tools/libxl/libxl_x86.c @@ -308,3 +308,89 @@ int libxl__arch_domain_create(libxl__gc *gc, libxl_domain_config *d_config, return ret; } + +unsigned long e820_memory_hole_size(unsigned long start, unsigned long end, struct e820entry e820[], int nr) +{ +#define clamp(val, min, max) ({ \ + typeof(val) __val = (val); \ + typeof(min) __min = (min); \ + typeof(max) __max = (max); \ + (void) (&__val == &__min); \ + (void) (&__val == &__max); \ + __val = __val < __min ? __min: __val; \ + __val > __max ? __max: __val; }) + int i; + unsigned long absent, start_pfn, end_pfn; + absent = start - end; + for(i = 0; i < nr; i++) { + if(e820[i].type == E820_RAM) { + start_pfn = clamp(e820[i].addr, start, end); + end_pfn = clamp(e820[i].addr + e820[i].size, start, end); + absent -= end_pfn - start_pfn; + } + } + return absent; +} + +/* + * Aligns memory blocks in domain info for linux NUMA build image. + * Takes libxl_domain_build_info memory sizes and returns aligned to + * domain e820 map linux numa blocks in guest pfns. 
+ */ +int libxl__vnuma_align_mem(libxl__gc *gc, + uint32_t domid, + libxl_domain_build_info *b_info, /* IN: mem sizes */ + vnuma_memblk_t *memblks) /* OUT: linux numa blocks in pfn */ +{ +#ifndef roundup +#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) +#endif + int i, rc; + unsigned long shift = 0, size, node_min_size = 1, limit; + unsigned long end_max; + uint32_t nr; + struct e820entry map[E820MAX]; + + libxl_ctx *ctx = libxl__gc_owner(gc); + rc = xc_get_machine_memory_map(ctx->xch, map, E820MAX); + if (rc < 0) { + errno = rc; + return -EINVAL; + } + nr = rc; + rc = e820_sanitize(ctx, map, &nr, b_info->target_memkb, + (b_info->max_memkb - b_info->target_memkb) + + b_info->u.pv.slack_memkb); + if (rc) + return ERROR_FAIL; + end_max = map[nr-1].addr + map[nr-1].size; + shift = 0; + memset(memblks, 0, sizeof(*memblks)*b_info->nr_vnodes); + memblks[0].start = map[0].addr; + for(i = 0; i < b_info->nr_vnodes; i++) { + memblks[i].start += shift; + memblks[i].end += shift + b_info->vnuma_memszs[i]; + limit = size = memblks[i].end - memblks[i].start; + while (memblks[i].end - memblks[i].start - + e820_memory_hole_size(memblks[i].start, memblks[i].end, map, nr) + < size) { + memblks[i].end += node_min_size; + shift += node_min_size; + if (memblks[i].end - memblks[i].start >= limit) { + memblks[i].end = memblks[i].start + limit; + break; + } + if (memblks[i].end == end_max) { + memblks[i].end = end_max; + break; + } + } + shift = memblks[i].end; + memblks[i].start = roundup(memblks[i].start, 4*1024); + LIBXL__LOG(ctx, LIBXL__LOG_DEBUG,"start = %#010lx, end = %#010lx\n", + memblks[i].start, memblks[i].end); + } + if(memblks[i-1].end > end_max) + memblks[i-1].end = end_max; + return 0; +} -- 1.7.10.4
George Dunlap
2013-Sep-17 16:36 UTC
Re: [PATCH RFC v2 4/7] libxl/vNUMA: vNUMA libxl functionality.
On Fri, Sep 13, 2013 at 9:50 AM, Elena Ufimtseva <ufimtseva@gmail.com> wrote:> vNUMA libxl supporting functionality. > libxl supporting functionality for vNUMA includes: > * having vNUMA memory areas sizes, transforms it to > start and end pfns based on domain e820 map; > * contructs vnode_to_pnode map for vNUMA nodes memory > allocation and pass it to Xen; the mechanism considers > automatic NUMA placement in case of presence of hardware > NUMA; In best case scenario all vnodes will be allocated > within one pnode. If the domain spans different pnodes, > the vnodes will be one-by-one placed to pnodes. If such > allocation is impossible due to the memory constraints, > the allocation will use default mechanism; this is worst > case scenario.Why would someone want to make a VM with two vnodes and then put them on the same pnode? Apart from testing, of course, but our defaults should be for the common case of real users. [snip]> diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c > index a78c91d..f7b1aeb 100644 > --- a/tools/libxl/libxl_x86.c > +++ b/tools/libxl/libxl_x86.c > @@ -308,3 +308,89 @@ int libxl__arch_domain_create(libxl__gc *gc, libxl_domain_config *d_config, > > return ret; > } > + > +unsigned long e820_memory_hole_size(unsigned long start, unsigned long end, struct e820entry e820[], int nr) > +{ > +#define clamp(val, min, max) ({ \ > + typeof(val) __val = (val); \ > + typeof(min) __min = (min); \ > + typeof(max) __max = (max); \ > + (void) (&__val == &__min); \ > + (void) (&__val == &__max); \ > + __val = __val < __min ? __min: __val; \ > + __val > __max ? __max: __val; })What on earth is going on here? Are these comparison of pointers to work around some gdb "unused variable" warnings or something? 
This would be much better as a static inline function that explicitly returns a value.> + int i; > + unsigned long absent, start_pfn, end_pfn; > + absent = start - end; > + for(i = 0; i < nr; i++) { > + if(e820[i].type == E820_RAM) { > + start_pfn = clamp(e820[i].addr, start, end); > + end_pfn = clamp(e820[i].addr + e820[i].size, start, end); > + absent -= end_pfn - start_pfn; > + } > + } > + return absent; > +} > + > +/* > + * Aligns memory blocks in domain info for linux NUMA build image. > + * Takes libxl_domain_build_info memory sizes and returns aligned to > + * domain e820 map linux numa blocks in guest pfns. > + */ > +int libxl__vnuma_align_mem(libxl__gc *gc, > + uint32_t domid, > + libxl_domain_build_info *b_info, /* IN: mem sizes */ > + vnuma_memblk_t *memblks) /* OUT: linux numa blocks in pfn */ > +{ > +#ifndef roundup > +#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) > +#endif > + int i, rc; > + unsigned long shift = 0, size, node_min_size = 1, limit; > + unsigned long end_max; > + uint32_t nr; > + struct e820entry map[E820MAX]; > + > + libxl_ctx *ctx = libxl__gc_owner(gc); > + rc = xc_get_machine_memory_map(ctx->xch, map, E820MAX); > + if (rc < 0) { > + errno = rc; > + return -EINVAL; > + } > + nr = rc; > + rc = e820_sanitize(ctx, map, &nr, b_info->target_memkb, > + (b_info->max_memkb - b_info->target_memkb) + > + b_info->u.pv.slack_memkb); > + if (rc) > + return ERROR_FAIL; > + end_max = map[nr-1].addr + map[nr-1].size; > + shift = 0; > + memset(memblks, 0, sizeof(*memblks)*b_info->nr_vnodes); > + memblks[0].start = map[0].addr; > + for(i = 0; i < b_info->nr_vnodes; i++) { > + memblks[i].start += shift;OK, at this point memblks[i] should be 0, since we zeroed it up there... why are you += instead of just =? And the code below is confusing and I think isn''t doing what you think it does. Walk me through this.> + memblks[i].end += shift + b_info->vnuma_memszs[i];.end started at 0, so this is effectively an obfuscated ''=''. 
So suppose vnuma_memszs is 256MiB; so now .end == .start + 256MiB.> + limit = size = memblks[i].end - memblks[i].start;So now limit and size are both 256MiB.> + while (memblks[i].end - memblks[i].start - > + e820_memory_hole_size(memblks[i].start, memblks[i].end, map, nr) > + < size) {And suppose there was a hole that overlapped here, so we enter this loop.> + memblks[i].end += node_min_size; > + shift += node_min_size; > + if (memblks[i].end - memblks[i].start >= limit) {Now, at this point, unless node_min_size is negative, this is *always* going to be true.> + memblks[i].end = memblks[i].start + limit; > + break;So we set .end back to what it was before (.start + 256MiB) and break out of the loop?> + } > + if (memblks[i].end == end_max) { > + memblks[i].end = end_max; > + break; > + } > + } > + shift = memblks[i].end;And throw away whatever we did to shift before? What was it you were actually trying to do here? -George
Ian Campbell
2013-Sep-17 16:38 UTC
Re: [PATCH RFC v2 4/7] libxl/vNUMA: vNUMA libxl functionality.
On Tue, 2013-09-17 at 17:36 +0100, George Dunlap wrote:> On Fri, Sep 13, 2013 at 9:50 AM, Elena Ufimtseva <ufimtseva@gmail.com> wrote: > > vNUMA libxl supporting functionality. > > libxl supporting functionality for vNUMA includes: > > * having vNUMA memory areas sizes, transforms it to > > start and end pfns based on domain e820 map; > > * contructs vnode_to_pnode map for vNUMA nodes memory > > allocation and pass it to Xen; the mechanism considers > > automatic NUMA placement in case of presence of hardware > > NUMA; In best case scenario all vnodes will be allocated > > within one pnode. If the domain spans different pnodes, > > the vnodes will be one-by-one placed to pnodes. If such > > allocation is impossible due to the memory constraints, > > the allocation will use default mechanism; this is worst > > case scenario. > > Why would someone want to make a VM with two vnodes and then put them > on the same pnode? Apart from testing, of course, but our defaults > should be for the common case of real users.If your pool of machines included 1- and 2-node systems might you want to do this so that when you migrate the vm to a two node system it can make use of it? Ian.
George Dunlap
2013-Sep-17 16:56 UTC
Re: [PATCH RFC v2 4/7] libxl/vNUMA: vNUMA libxl functionality.
On 09/17/2013 05:38 PM, Ian Campbell wrote:> On Tue, 2013-09-17 at 17:36 +0100, George Dunlap wrote: >> On Fri, Sep 13, 2013 at 9:50 AM, Elena Ufimtseva <ufimtseva@gmail.com> wrote: >>> vNUMA libxl supporting functionality. >>> libxl supporting functionality for vNUMA includes: >>> * having vNUMA memory areas sizes, transforms it to >>> start and end pfns based on domain e820 map; >>> * contructs vnode_to_pnode map for vNUMA nodes memory >>> allocation and pass it to Xen; the mechanism considers >>> automatic NUMA placement in case of presence of hardware >>> NUMA; In best case scenario all vnodes will be allocated >>> within one pnode. If the domain spans different pnodes, >>> the vnodes will be one-by-one placed to pnodes. If such >>> allocation is impossible due to the memory constraints, >>> the allocation will use default mechanism; this is worst >>> case scenario. >> >> Why would someone want to make a VM with two vnodes and then put them >> on the same pnode? Apart from testing, of course, but our defaults >> should be for the common case of real users. > > If your pool of machines included 1- and 2-node systems might you want > to do this so that when you migrate the vm to a two node system it can > make use of it?Yes, but from the description (and from a brief glance at the code) it looks like if it can happen to fit all the vnodes on a single pnode, it will do so -- even if the node affinity for the domain spans several nodes. I think if I was a user, and made a VM with 2 vnodes, and then set its node affinity to two pnodes, I would expect the tools to place each vnode on one pnode. At this point it may be bike-shedding, though... at some point we''ll have the ability to specify node affinity for individual vcpus, at which time we can specify a pnode affinity for individual vnodes. As long as we have something not barking mad here, we can revisit it before the release (as long as someone writes down to do it). -George
Dario Faggioli
2013-Sep-18 08:42 UTC
Re: [PATCH RFC v2 4/7] libxl/vNUMA: vNUMA libxl functionality.
On mar, 2013-09-17 at 17:56 +0100, George Dunlap wrote:> On 09/17/2013 05:38 PM, Ian Campbell wrote: > > On Tue, 2013-09-17 at 17:36 +0100, George Dunlap wrote: > > >> Why would someone want to make a VM with two vnodes and then put them > >> on the same pnode? Apart from testing, of course, but our defaults > >> should be for the common case of real users. > > > > If your pool of machines included 1- and 2-node systems might you want > > to do this so that when you migrate the vm to a two node system it can > > make use of it? > > Yes, but from the description (and from a brief glance at the code) it > looks like if it can happen to fit all the vnodes on a single pnode, it > will do so -- even if the node affinity for the domain spans several nodes. >That is something we discussed with Elena (I think you were copied on the thread, but anyway), and it's not an easy thing to wrap one's mind around! Actually, how to come up with a decent interface, sensible default values, etc., is really quite a piece of work, and I think we should have some discussion about it (perhaps in a new thread). So, initially, I raised exactly the same objection: 2 vnodes means trying to put and map the guest to 2 pnodes. However, if the purpose of the whole thing (i.e., the combined action of automatic NUMA placement and vNUMA) is to maximize the performance, well, no matter what the vNUMA topology is, having the guest on just one physical node (if it fits there, of course) is always the best solution... So why shouldn't we do that?> I think if I was a user, and made a VM with 2 vnodes, and then set its > node affinity to two pnodes, I would expect the tools to place each > vnode on one pnode. >That is exactly the question: is the fact that the user asked for 2 vnodes enough for disallowing allocating the VM on just one pnode?
As it has been pointed out by Jan already, we really want another parameter, or in general something that will allow the user to specify the exact pnode-to-vnode mapping, and in case such parameter is provided, all bets are off. But what if it is not... What should the default behaviour be? And that is a genuine question, i.e., I really can't decide which solution is better, as I see both advantages and disadvantages on both of them.> At this point it may be bike-shedding, though... at some point we'll > have the ability to specify node affinity for individual vcpus, at which > time we can specify a pnode affinity for individual vnodes. As long as > we have something not barking mad here, we can revisit it before the > release (as long as someone writes down to do it). >I have per-vcpu NUMA node-affinity almost ready, and I really plan to submit it either today or tomorrow. However, I'm not sure I understood how you think that is going to help... Being able to specify a node-affinity for each single vcpu of a domain (which surely means being able to specify it consistently for all the vcpus that form a vnode, which I think is what you call 'specify a pnode affinity for individual vnodes') does not forbid mapping two or more vnodes on the same pnode (and, most likely, specifying the same node-affinity for all their vcpus, although that is not mandatory). So, it looks to me that, even at that point, it will still be our call to decide what to do and what the most sensible default behaviour is, won't it? Regards, Dario -- <<This happens because I choose it to happen!>> (Raistlin Majere) ----------------------------------------------------------------- Dario Faggioli, Ph.D, http://about.me/dario.faggioli Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK) _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Konrad Rzeszutek Wilk
2013-Sep-25 17:50 UTC
Re: [PATCH RFC v2 4/7] libxl/vNUMA: vNUMA libxl functionality.
On Tue, Sep 17, 2013 at 05:36:18PM +0100, George Dunlap wrote:> On Fri, Sep 13, 2013 at 9:50 AM, Elena Ufimtseva <ufimtseva@gmail.com> wrote: > > vNUMA libxl supporting functionality. > > libxl supporting functionality for vNUMA includes: > > * having vNUMA memory areas sizes, transforms it to > > start and end pfns based on domain e820 map; > > * contructs vnode_to_pnode map for vNUMA nodes memory > > allocation and pass it to Xen; the mechanism considers > > automatic NUMA placement in case of presence of hardware > > NUMA; In best case scenario all vnodes will be allocated > > within one pnode. If the domain spans different pnodes, > > the vnodes will be one-by-one placed to pnodes. If such > > allocation is impossible due to the memory constraints, > > the allocation will use default mechanism; this is worst > > case scenario. > > Why would someone want to make a VM with two vnodes and then put them > on the same pnode? Apart from testing, of course, but our defaults > should be for the common case of real users.Resource allocation within the guest. You can bind each application to use the "fake" NUMA memory and in that way partition the applications to be within their memory pools. You can do that right now with the fake NUMA option that the Linux kernel provides.