* Provides verification and construction of the vnode-to-pnode mapping.
  It will be used for vNUMA node allocation when running on a NUMA
  machine; if the mapping can be used, automatic NUMA placement is
  disabled;
* Verifies the correctness of the memory block pfns for a Linux guest
  by requesting the e820 map for that domain;
* Provides Xen with the domain's vNUMA topology;

TODO: add an additional check for vcpu pinning before disabling the
automatic NUMA placement mechanism;

Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
---
Changes since RFC v2:
- added the vnode_to_pnode map and its verification;
- if the vnode_to_pnode map can be used, automatic NUMA placement is
  turned off;
- removed bogus memory block pfn alignment;
---
 tools/libxl/libxl.c          |   19 +++++
 tools/libxl/libxl.h          |   18 ++++
 tools/libxl/libxl_arch.h     |    8 ++
 tools/libxl/libxl_dom.c      |  186 +++++++++++++++++++++++++++++++++++++++++-
 tools/libxl/libxl_internal.h |    3 +
 tools/libxl/libxl_types.idl  |    5 +-
 tools/libxl/libxl_x86.c      |   53 ++++++++++++
 7 files changed, 290 insertions(+), 2 deletions(-)

diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c
index 29e66f2..5f11641 100644
--- a/tools/libxl/libxl.c
+++ b/tools/libxl/libxl.c
@@ -4306,6 +4306,25 @@ static int libxl__set_vcpuonline_qmp(libxl__gc *gc, uint32_t domid,
     }
     return 0;
 }
+int libxl_domain_setvnodes(libxl_ctx *ctx,
+                           uint32_t domid,
+                           uint16_t nr_vnodes,
+                           uint16_t nr_vcpus,
+                           vnuma_memblk_t *vnuma_memblks,
+                           unsigned int *vdistance,
+                           unsigned int *vcpu_to_vnode,
+                           unsigned int *vnode_to_pnode)
+{
+    GC_INIT(ctx);
+    int ret;
+    ret = xc_domain_setvnodes(ctx->xch, domid, nr_vnodes,
+                              nr_vcpus, vnuma_memblks,
+                              vdistance,
+                              vcpu_to_vnode,
+                              vnode_to_pnode);
+    GC_FREE;
+    return ret;
+}
 
 int libxl_set_vcpuonline(libxl_ctx *ctx, uint32_t domid, libxl_bitmap *cpumap)
 {
diff --git a/tools/libxl/libxl.h b/tools/libxl/libxl.h
index 1c6675d..ceb4e38 100644
--- a/tools/libxl/libxl.h
+++ b/tools/libxl/libxl.h
@@ -281,6 +281,7 @@
 #include <netinet/in.h>
 #include <sys/wait.h> /* for pid_t */
 
+#include <xen/memory.h>
 #include <xentoollog.h>
 
 #include <libxl_uuid.h>
@@ -376,6 +377,14 @@
 #define LIBXL_EXTERNAL_CALLERS_ONLY /* disappears for callers outside libxl */
 #endif
 
+/*
+ * LIBXL_HAVE_BUILDINFO_VNUMA indicates that a vNUMA topology will be
+ * built for the guest on request, according to the VM configuration.
+ * libxl will try to find the best placement of the vNUMA
+ * nodes on the physical NUMA nodes.
+ */
+#define LIBXL_HAVE_BUILDINFO_VNUMA 1
+
 typedef uint8_t libxl_mac[6];
 #define LIBXL_MAC_FMT "%02hhx:%02hhx:%02hhx:%02hhx:%02hhx:%02hhx"
 #define LIBXL_MAC_FMTLEN ((2*6)+5) /* 6 hex bytes plus 5 colons */
@@ -753,6 +762,15 @@ void libxl_vcpuinfo_list_free(libxl_vcpuinfo *, int nr_vcpus);
 void libxl_device_vtpm_list_free(libxl_device_vtpm*, int nr_vtpms);
 void libxl_vtpminfo_list_free(libxl_vtpminfo *, int nr_vtpms);
 
+int libxl_domain_setvnodes(libxl_ctx *ctx,
+                           uint32_t domid,
+                           uint16_t nr_vnodes,
+                           uint16_t nr_vcpus,
+                           vnuma_memblk_t *vnuma_memblks,
+                           unsigned int *vdistance,
+                           unsigned int *vcpu_to_vnode,
+                           unsigned int *vnode_to_pnode);
+
 /*
  * Devices
  * ======
diff --git a/tools/libxl/libxl_arch.h b/tools/libxl/libxl_arch.h
index abe6685..442aaec 100644
--- a/tools/libxl/libxl_arch.h
+++ b/tools/libxl/libxl_arch.h
@@ -19,4 +19,12 @@
 int libxl__arch_domain_create(libxl__gc *gc, libxl_domain_config *d_config,
                               uint32_t domid);
 
+int libxl__vnuma_align_mem(libxl__gc *gc,
+                           uint32_t domid,
+                           struct libxl_domain_build_info *b_info,
+                           vnuma_memblk_t *memblks);
+
+int libxl__vnodemap_is_usable(libxl__gc *gc,
+                              libxl_domain_build_info *info);
+
 #endif
diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
index 356f920..12dc12a 100644
--- a/tools/libxl/libxl_dom.c
+++ b/tools/libxl/libxl_dom.c
@@ -201,6 +201,91 @@ static int numa_place_domain(libxl__gc *gc, uint32_t domid,
     return rc;
 }
 
+/* Prepares the vnode-to-pnode map for domain vNUMA memory allocation */
+int libxl__init_vnodemap(libxl__gc *gc, uint32_t domid,
+                         libxl_domain_build_info *info)
+{
+    int i, n, nr_nodes, rc;
+    uint64_t *mems;
+    unsigned long long *claim = NULL;
+    libxl_numainfo *ninfo = NULL;
+
+    rc = -EINVAL;
+    if (info->vnode_to_pnode == NULL) {
+        info->vnode_to_pnode = calloc(info->nr_vnodes,
+                                      sizeof(*info->vnode_to_pnode));
+        if (info->vnode_to_pnode == NULL)
+            return rc;
+    }
+    else
+        return 0;
+
+    /*
+     * If this is not a NUMA machine, the vnode_to_pnode map will
+     * be initialized with VNUMA_NO_NODE
+     */
+
+    /* Get NUMA info */
+    ninfo = libxl_get_numainfo(CTX, &nr_nodes);
+    if (ninfo == NULL || info->nr_vnodes == 0) {
+        for (i = 0; i < info->nr_vnodes; i++)
+            info->vnode_to_pnode[i] = VNUMA_NO_NODE;
+        LOG(DEBUG, "No HW NUMA found\n");
+        goto vnmapout;
+    }
+    claim = calloc(info->nr_vnodes, sizeof(*claim));
+    if (claim == NULL)
+        return rc;
+
+    for (i = 0; i < info->nr_vnodes; i++)
+        info->vnode_to_pnode[i] = VNUMA_NO_NODE;
+
+    /*
+     * Check if we have any hardware NUMA nodes selected,
+     * otherwise VNUMA_NO_NODE is kept and the default allocation is used
+     */
+    if (libxl_bitmap_is_empty(&info->nodemap))
+        return 0;
+    mems = info->vnuma_memszs;
+
+    /* Check if all vnodes will fit in one node */
+    libxl_for_each_set_bit(n, info->nodemap) {
+        if (ninfo[n].free/1024 >= info->max_memkb &&
+            libxl_bitmap_test(&info->nodemap, n))
+        {
+            /*
+             * All domain v-nodes will fit in one p-node;
+             * that p-node is the best candidate selected by automatic
+             * NUMA placement.
+             */
+            for (i = 0; i < info->nr_vnodes; i++)
+                info->vnode_to_pnode[i] = n;
+            return 0;
+        }
+    }
+    /* TODO: change the algorithm. The current one just fits the nodes;
+     * it would be nice to have them sorted by size as well.
+     * If no p-node is found, the vnode is left set to VNUMA_NO_NODE.
+     */
+    libxl_for_each_set_bit(n, info->nodemap)
+    {
+        for ( i = 0; i < info->nr_vnodes; i++ )
+        {
+            if ( ((claim[n] + (mems[i] << 20)) <= ninfo[n].free) &&
+                 /* vnode was not set yet */
+                 (info->vnode_to_pnode[i] == VNUMA_NO_NODE) )
+            {
+                info->vnode_to_pnode[i] = n;
+                claim[n] += (mems[i] << 20);
+            }
+        }
+    }
+    rc = 0;
+vnmapout:
+    if (claim) free(claim);
+    return rc;
+}
+
 int libxl__build_pre(libxl__gc *gc, uint32_t domid,
               libxl_domain_config *d_config, libxl__domain_build_state *state)
 {
@@ -209,8 +294,29 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
     char *xs_domid, *con_domid;
     int rc;
 
+    rc = -EINVAL;
     xc_domain_max_vcpus(ctx->xch, domid, info->max_vcpus);
 
+    /*
+     * If a vNUMA vnode_to_pnode map is defined, determine whether we
+     * can disable automatic NUMA placement and place the vnodes
+     * on the specified pnodes.
+     * For now, if a vcpu affinity is specified, we will use the
+     * specified vnode-to-pnode map.
+     */
+    if (info->nr_vnodes != 0) {
+        if ( libxl__vnodemap_is_usable(gc, info) ) {
+            LOG(DETAIL, "vNUMA automatic placement disabled\n");
+            libxl_defbool_set(&info->numa_placement, false);
+        }
+        else {
+            /* release the map as unusable */
+            free(info->vnode_to_pnode);
+            LOG(DETAIL, "vNUMA will use the default vnode to pnode map\n");
+            info->vnode_to_pnode = NULL;
+        }
+    }
+
     /*
      * Check if the domain has any CPU affinity. If not, try to build
      * up one. In case numa_place_domain() find at least a suitable
@@ -232,6 +338,26 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
         if (rc)
             return rc;
     }
+    if (info->nr_vnodes != 0) {
+        /* The memory blocks will be formed here from the sizes */
+        vnuma_memblk_t *memblks = libxl__calloc(gc, info->nr_vnodes,
+                                                sizeof(*memblks));
+
+        libxl__vnuma_align_mem(gc, domid, info, memblks);
+        /* Construct the vnode-to-pnode mapping if possible */
+        if (libxl__init_vnodemap(gc, domid, info) < 0) {
+            LOG(DEBUG, "Failed to call init_vnodemap\n");
+            info->nr_vnodes = 0;
+        }
+        /* plumb the domain with the vNUMA topology */
+        libxl_domain_setvnodes(ctx, domid, info->nr_vnodes,
+                               info->max_vcpus, memblks,
+                               info->vdistance, info->vcpu_to_vnode,
+                               info->vnode_to_pnode);
+    }
+    else
+        LOG(DEBUG, "Will not construct vNUMA topology with 0 nodes.\n");
+
     libxl_domain_set_nodeaffinity(ctx, domid, &info->nodemap);
     libxl_set_vcpuaffinity_all(ctx, domid, info->max_vcpus, &info->cpumap);
 
@@ -253,6 +379,48 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
 
     return rc;
 }
+
+int libxl__vnodemap_is_usable(libxl__gc *gc, libxl_domain_build_info *info)
+{
+    int rc, nr_nodes, i;
+    libxl_numainfo *ninfo = NULL;
+    unsigned long long *claim;
+    unsigned int node;
+    uint64_t *mems;
+
+    rc = 0;
+    if (info->vnode_to_pnode == NULL)
+        return rc;
+    /*
+     * Cannot use the specified mapping if this is not a NUMA machine
+     */
+    ninfo = libxl_get_numainfo(CTX, &nr_nodes);
+    if (ninfo == NULL) {
+        return rc;
+    }
+    mems = info->vnuma_memszs;
+    claim = calloc(info->nr_vnodes, sizeof(*claim));
+    if (claim == NULL)
+        return rc;
+    /* Sum the memory requested on a per-pnode basis */
+    for ( i = 0; i < info->nr_vnodes; i++ )
+    {
+        node = info->vnode_to_pnode[i];
+        /* Is the pnode number valid? */
+        if (node < nr_nodes)
+            claim[node] += (mems[i] << 20);
+        else
+            goto vmapu;
+    }
+    for ( i = 0; i < nr_nodes; i++)
+        if (claim[i] > ninfo[i].free)
+            /* Cannot satisfy the user request, falling back to the default */
+            goto vmapu;
+    rc = 1;
+vmapu:
+    if (claim) free(claim);
+    return rc;
+
+}
 int libxl__build_post(libxl__gc *gc, uint32_t domid,
                       libxl_domain_build_info *info,
                       libxl__domain_build_state *state,
@@ -375,7 +543,23 @@ int libxl__build_pv(libxl__gc *gc, uint32_t domid,
             }
         }
     }
-
+    if (info->nr_vnodes != 0) {
+        dom->nr_vnodes = info->nr_vnodes;
+        dom->vnode_to_pnode = malloc(dom->nr_vnodes * sizeof(*dom->vnode_to_pnode));
+        dom->vnuma_memszs = malloc(dom->nr_vnodes * sizeof(*dom->vnuma_memszs));
+        if (dom->vnuma_memszs == NULL || dom->vnode_to_pnode == NULL) {
+            LOGE(ERROR, "Failed to allocate memory for vNUMA domain image.\n");
+            dom->nr_vnodes = 0;
+            info->nr_vnodes = 0;
+            if (dom->vnode_to_pnode) free(dom->vnode_to_pnode);
+            if (dom->vnuma_memszs) free(dom->vnuma_memszs);
+            goto out;
+        }
+        memcpy(dom->vnuma_memszs, info->vnuma_memszs,
+               sizeof(*dom->vnuma_memszs) * dom->nr_vnodes);
+        memcpy(dom->vnode_to_pnode, info->vnode_to_pnode,
+               sizeof(*dom->vnode_to_pnode) * dom->nr_vnodes);
+    }
     dom->flags = flags;
     dom->console_evtchn = state->console_port;
     dom->console_domid = state->console_domid;
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index 165dc00..19ac0fe 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -2710,6 +2710,7 @@ static inline void libxl__ctx_unlock(libxl_ctx *ctx) {
 #define CTX_LOCK (libxl__ctx_lock(CTX))
 #define CTX_UNLOCK (libxl__ctx_unlock(CTX))
 
+#define VNUMA_NO_NODE ~((unsigned int)0)
 /*
  * Automatic NUMA placement
  *
@@ -2833,6 +2834,8 @@ void libxl__numa_candidate_put_nodemap(libxl__gc *gc,
     libxl_bitmap_copy(CTX, &cndt->nodemap, nodemap);
 }
 
+int libxl__init_vnodemap(libxl__gc *gc, uint32_t domid,
+                         libxl_domain_build_info *info);
 /*
  * Inserts "elm_new" into the sorted list "head".
  *
diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
index d2cea8a..5418966 100644
--- a/tools/libxl/libxl_types.idl
+++ b/tools/libxl/libxl_types.idl
@@ -292,7 +292,10 @@ libxl_domain_build_info = Struct("domain_build_info",[
     ("disable_migrate", libxl_defbool),
     ("cpuid",           libxl_cpuid_policy_list),
     ("blkdev_start",    string),
-
+    ("vnuma_memszs",    Array(uint64, "nr_vnodes")),
+    ("vcpu_to_vnode",   Array(uint32, "nr_vnodemap")),
+    ("vdistance",       Array(uint32, "nr_vdist")),
+    ("vnode_to_pnode",  Array(uint32, "nr_vnode_to_pnode")),
     ("device_model_version", libxl_device_model_version),
     ("device_model_stubdomain", libxl_defbool),
     # if you set device_model you must set device_model_version too
diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c
index a78c91d..01edc2b 100644
--- a/tools/libxl/libxl_x86.c
+++ b/tools/libxl/libxl_x86.c
@@ -308,3 +308,56 @@ int libxl__arch_domain_create(libxl__gc *gc, libxl_domain_config *d_config,
 
     return ret;
 }
+
+/*
+ * Checks for the beginning and end of RAM in the e820 map for the domain
+ * and aligns the start of the first and the end of the last vNUMA memory
+ * block to that map. vnode memory sizes are passed here in megabytes.
+ */
+int libxl__vnuma_align_mem(libxl__gc *gc,
+                           uint32_t domid,
+                           /* IN: mem sizes in megabytes */
+                           libxl_domain_build_info *b_info,
+                           /* OUT: Linux NUMA blocks in pfn */
+                           vnuma_memblk_t *memblks)
+{
+#ifndef roundup
+#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
+#endif
+    int i, rc;
+    unsigned long shift = 0;
+    unsigned long end_max;
+    uint32_t nr;
+    struct e820entry map[E820MAX];
+
+    libxl_ctx *ctx = libxl__gc_owner(gc);
+    rc = xc_get_machine_memory_map(ctx->xch, map, E820MAX);
+    if (rc < 0) {
+        errno = rc;
+        return -EINVAL;
+    }
+    nr = rc;
+    rc = e820_sanitize(ctx, map, &nr, b_info->target_memkb,
+                       (b_info->max_memkb - b_info->target_memkb) +
+                       b_info->u.pv.slack_memkb);
+    if (rc)
+        return -EINVAL;
+    end_max = map[nr-1].addr + map[nr-1].size;
+    shift = 0;
+    memset(memblks, 0, sizeof(*memblks) * b_info->nr_vnodes);
+    memblks[0].start = map[0].addr;
+
+    for (i = 0; i < b_info->nr_vnodes; i++) {
+        memblks[i].start += shift;
+        memblks[i].end += shift + (b_info->vnuma_memszs[i] << 20);
+        shift = memblks[i].end;
+        memblks[i].start = roundup(memblks[i].start, 1024 * 4);
+        LIBXL__LOG(ctx, LIBXL__LOG_DEBUG,
+                   "start = %#010lx, end = %#010lx, size MB = %#010lx\n",
+                   memblks[i].start, memblks[i].end, b_info->vnuma_memszs[i]);
+    }
+
+    if (memblks[i-1].end > end_max)
+        memblks[i-1].end = end_max;
+    return 0;
+}
-- 
1.7.10.4
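
For context (not part of the patch): a minimal sketch of how a toolstack caller might describe a two-vnode guest and push the topology to Xen through the libxl_domain_setvnodes() interface added above. The helper name example_set_vnuma, the memory block layout, the distance values and the vcpu/vnode assignment below are illustrative assumptions only; the memblk field names follow their usage in libxl__vnuma_align_mem().

    /*
     * Illustrative sketch only: build a 2-vnode, 4-vcpu topology and
     * hand it to Xen via the new libxl_domain_setvnodes() call.
     */
    #include <libxl.h>

    static int example_set_vnuma(libxl_ctx *ctx, uint32_t domid)
    {
        uint16_t nr_vnodes = 2, nr_vcpus = 4;
        /* two 512MB vnodes covering the first 1GB of guest memory (example) */
        vnuma_memblk_t memblks[2] = {
            { .start = 0,            .end = 512ULL << 20  },
            { .start = 512ULL << 20, .end = 1024ULL << 20 },
        };
        /* symmetric 2x2 distance table: 10 local, 20 remote (example values) */
        unsigned int vdistance[2 * 2]  = { 10, 20, 20, 10 };
        /* vcpus 0-1 on vnode 0, vcpus 2-3 on vnode 1 */
        unsigned int vcpu_to_vnode[4]  = { 0, 0, 1, 1 };
        /* back both vnodes with physical node 0 */
        unsigned int vnode_to_pnode[2] = { 0, 0 };

        return libxl_domain_setvnodes(ctx, domid, nr_vnodes, nr_vcpus,
                                      memblks, vdistance,
                                      vcpu_to_vnode, vnode_to_pnode);
    }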