* Provides construction and verification of the vnode to pnode mapping.
It is used for vNUMA node allocation when running on a NUMA machine;
if the mapping is usable, automatic NUMA placement is disabled;
* Verifies the correctness of the memory block pfns for a Linux guest
by requesting the e820 map for that domain;
* Provides Xen with the domain vNUMA topology;
A usage sketch of the new libxl_domain_setvnodes() call follows the
changelog below.
TODO:
Add an additional check, for vcpu pinning, before disabling the automatic
NUMA placement mechanism;
Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
---
Changes since RFC v2:
- added the vnode_to_pnode map and its verification;
- if the vnode_to_pnode map can be used, automatic
  NUMA placement is turned off;
- removed bogus memory block pfn alignment;
---
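For illustration, a minimal sketch of how a caller could hand a two-vnode
topology to Xen through the new libxl_domain_setvnodes() entry point. It
assumes the vnuma_memblk_t start/end layout used by libxl__vnuma_align_mem()
below and the libxc/hypervisor side of this series; the helper name is made
up and error handling is elided:

#include <libxl.h>
#include <xen/memory.h>    /* vnuma_memblk_t, from the hypervisor side */

/* Illustrative helper: two 512MB vnodes pinned to pnodes 0 and 1 */
static int set_two_vnodes(libxl_ctx *ctx, uint32_t domid)
{
    vnuma_memblk_t memblks[2] = {
        { .start = 0,             .end = 512ULL << 20  },
        { .start = 512ULL << 20,  .end = 1024ULL << 20 },
    };
    /* 2x2 distance matrix, row-major: local distance 10, remote 20 */
    unsigned int vdistance[4]      = { 10, 20, 20, 10 };
    unsigned int vcpu_to_vnode[2]  = { 0, 1 };    /* one vcpu per vnode */
    unsigned int vnode_to_pnode[2] = { 0, 1 };

    return libxl_domain_setvnodes(ctx, domid, 2 /* nr_vnodes */,
                                  2 /* nr_vcpus */, memblks, vdistance,
                                  vcpu_to_vnode, vnode_to_pnode);
}

libxl__build_pre() below makes the same call with the arrays carried in
libxl_domain_build_info, after libxl__vnuma_align_mem() has formed the
memory blocks and libxl__init_vnodemap() the vnode to pnode map.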
tools/libxl/libxl.c | 19 +++++
tools/libxl/libxl.h | 18 ++++
tools/libxl/libxl_arch.h | 8 ++
tools/libxl/libxl_dom.c | 186 +++++++++++++++++++++++++++++++++++++++++-
tools/libxl/libxl_internal.h | 3 +
tools/libxl/libxl_types.idl | 5 +-
tools/libxl/libxl_x86.c | 53 ++++++++++++
7 files changed, 290 insertions(+), 2 deletions(-)
diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c
index 29e66f2..5f11641 100644
--- a/tools/libxl/libxl.c
+++ b/tools/libxl/libxl.c
@@ -4306,6 +4306,25 @@ static int libxl__set_vcpuonline_qmp(libxl__gc *gc, uint32_t domid,
}
return 0;
}
+int libxl_domain_setvnodes(libxl_ctx *ctx,
+ uint32_t domid,
+ uint16_t nr_vnodes,
+ uint16_t nr_vcpus,
+ vnuma_memblk_t *vnuma_memblks,
+ unsigned int *vdistance,
+ unsigned int *vcpu_to_vnode,
+ unsigned int *vnode_to_pnode)
+{
+ GC_INIT(ctx);
+ int ret;
+ ret = xc_domain_setvnodes(ctx->xch, domid, nr_vnodes,
+ nr_vcpus, vnuma_memblks,
+ vdistance,
+ vcpu_to_vnode,
+ vnode_to_pnode);
+ GC_FREE;
+ return ret;
+}
int libxl_set_vcpuonline(libxl_ctx *ctx, uint32_t domid, libxl_bitmap *cpumap)
{
diff --git a/tools/libxl/libxl.h b/tools/libxl/libxl.h
index 1c6675d..ceb4e38 100644
--- a/tools/libxl/libxl.h
+++ b/tools/libxl/libxl.h
@@ -281,6 +281,7 @@
#include <netinet/in.h>
#include <sys/wait.h> /* for pid_t */
+#include <xen/memory.h>
#include <xentoollog.h>
#include <libxl_uuid.h>
@@ -376,6 +377,14 @@
#define LIBXL_EXTERNAL_CALLERS_ONLY /* disappears for callers outside libxl */
#endif
+/*
+ * LIBXL_HAVE_BUILDINFO_VNUMA indicates that a vNUMA topology can be
+ * built for the guest when requested via the VM configuration.
+ * libxl will try to find the best placement of the vNUMA nodes on
+ * the physical NUMA nodes.
+ */
+#define LIBXL_HAVE_BUILDINFO_VNUMA 1
+
typedef uint8_t libxl_mac[6];
#define LIBXL_MAC_FMT "%02hhx:%02hhx:%02hhx:%02hhx:%02hhx:%02hhx"
#define LIBXL_MAC_FMTLEN ((2*6)+5) /* 6 hex bytes plus 5 colons */
@@ -753,6 +762,15 @@ void libxl_vcpuinfo_list_free(libxl_vcpuinfo *, int nr_vcpus);
void libxl_device_vtpm_list_free(libxl_device_vtpm*, int nr_vtpms);
void libxl_vtpminfo_list_free(libxl_vtpminfo *, int nr_vtpms);
+int libxl_domain_setvnodes(libxl_ctx *ctx,
+ uint32_t domid,
+ uint16_t nr_vnodes,
+ uint16_t nr_vcpus,
+ vnuma_memblk_t *vnuma_memblks,
+ unsigned int *vdistance,
+ unsigned int *vcpu_to_vnode,
+ unsigned int *vnode_to_pnode);
+
/*
* Devices
 * ======
diff --git a/tools/libxl/libxl_arch.h b/tools/libxl/libxl_arch.h
index abe6685..442aaec 100644
--- a/tools/libxl/libxl_arch.h
+++ b/tools/libxl/libxl_arch.h
@@ -19,4 +19,12 @@
int libxl__arch_domain_create(libxl__gc *gc, libxl_domain_config *d_config,
uint32_t domid);
+int libxl__vnuma_align_mem(libxl__gc *gc,
+ uint32_t domid,
+ struct libxl_domain_build_info *b_info,
+ vnuma_memblk_t *memblks);
+
+int libxl__vnodemap_is_usable(libxl__gc *gc,
+ libxl_domain_build_info *info);
+
#endif
diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
index 356f920..12dc12a 100644
--- a/tools/libxl/libxl_dom.c
+++ b/tools/libxl/libxl_dom.c
@@ -201,6 +201,91 @@ static int numa_place_domain(libxl__gc *gc, uint32_t domid,
return rc;
}
+/* prepares vnode to pnode map for domain vNUMA memory allocation */
+int libxl__init_vnodemap(libxl__gc *gc, uint32_t domid,
+ libxl_domain_build_info *info)
+{
+ int i, n, nr_nodes, rc;
+ uint64_t *mems;
+ unsigned long long *claim = NULL;
+ libxl_numainfo *ninfo = NULL;
+
+ rc = -EINVAL;
+ if (info->vnode_to_pnode == NULL) {
+ info->vnode_to_pnode = calloc(info->nr_vnodes,
+ sizeof(*info->vnode_to_pnode));
+ if (info->vnode_to_pnode == NULL)
+ return rc;
+ }
+ else
+ return 0;
+
+    /*
+     * If this is not a NUMA machine, the vnode_to_pnode map will
+     * be initialized with VNUMA_NO_NODE.
+     */
+
+ /* Get NUMA info */
+ ninfo = libxl_get_numainfo(CTX, &nr_nodes);
+    if (ninfo == NULL || nr_nodes <= 0) {
+ for (i=0; i< info->nr_vnodes; i++)
+ info->vnode_to_pnode[i] = VNUMA_NO_NODE;
+ LOG(DEBUG, "No HW NUMA found\n");
+ goto vnmapout;
+ }
+    claim = calloc(nr_nodes, sizeof(*claim));
+ if (claim == NULL)
+ return rc;
+
+ for (i=0; i< info->nr_vnodes; i++)
+ info->vnode_to_pnode[i] = VNUMA_NO_NODE;
+
+    /*
+     * Check if any hardware NUMA nodes have been selected; otherwise
+     * leave VNUMA_NO_NODE set and use the default allocation.
+     */
+    if (libxl_bitmap_is_empty(&info->nodemap)) {
+        rc = 0;
+        goto vnmapout;
+    }
+ mems = info->vnuma_memszs;
+
+ /* check if all vnodes will fit in one node */
+ libxl_for_each_set_bit(n, info->nodemap) {
+ if (ninfo[n].free/1024 >= info->max_memkb &&
+ libxl_bitmap_test(&info->nodemap, n))
+ {
+ /*
+ * all domain v-nodes will fit one p-node,
+ * p-node is a best candidate selected by automatic
+ * NUMA placement.
+ */
+            for (i = 0; i < info->nr_vnodes; i++)
+                info->vnode_to_pnode[i] = n;
+            rc = 0;
+            goto vnmapout;
+        }
+ }
+    /*
+     * TODO: improve the algorithm. It currently just first-fits the
+     * vnodes; it would be nice to have them sorted by size as well.
+     * If no p-node is found, the vnode remains set to VNUMA_NO_NODE.
+     */
+ libxl_for_each_set_bit(n, info->nodemap)
+ {
+ for ( i = 0; i < info->nr_vnodes; i++ )
+ {
+            if ( ((claim[n] + (mems[i] << 20)) <= ninfo[n].free) &&
+                 /* vnode was not set yet */
+                 (info->vnode_to_pnode[i] == VNUMA_NO_NODE) )
+ {
+ info->vnode_to_pnode[i] = n;
+ claim[n] += (mems[i] << 20);
+ }
+ }
+ }
+ rc = 0;
+vnmapout:
+ if (claim) free(claim);
+ return rc;
+}
+
int libxl__build_pre(libxl__gc *gc, uint32_t domid,
libxl_domain_config *d_config, libxl__domain_build_state *state)
{
@@ -209,8 +294,29 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
char *xs_domid, *con_domid;
int rc;
+ rc = -EINVAL;
xc_domain_max_vcpus(ctx->xch, domid, info->max_vcpus);
+    /*
+     * If a vNUMA vnode_to_pnode map is defined, determine whether we
+     * can disable automatic NUMA placement and place the vnodes on
+     * the specified pnodes.
+     * For now, if a vcpu affinity is specified, the specified vnode
+     * to pnode map will be used.
+     */
+ if (info->nr_vnodes != 0) {
+ if ( libxl__vnodemap_is_usable(gc, info) ) {
+ LOG(DETAIL, "vNUMA automatic placement disabled\n");
+ libxl_defbool_set(&info->numa_placement, false);
+ }
+ else {
+ /* release the map as unusable */
+ free(info->vnode_to_pnode);
+ LOG(DETAIL, "vNUMA will use default vnode to pnode map\n");
+ info->vnode_to_pnode = NULL;
+ }
+ }
+
/*
* Check if the domain has any CPU affinity. If not, try to build
* up one. In case numa_place_domain() find at least a suitable
@@ -232,6 +338,26 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
if (rc)
return rc;
}
+ if (info->nr_vnodes != 0) {
+ /* The memory blocks will be formed here from sizes */
+ vnuma_memblk_t *memblks = libxl__calloc(gc, info->nr_vnodes,
+ sizeof(*memblks));
+
+ libxl__vnuma_align_mem(gc, domid, info, memblks);
+ /* Construct the vnode to pnode mapping if possible */
+ if (libxl__init_vnodemap(gc, domid, info) < 0) {
+            LOG(DEBUG, "Failed to build the vnode to pnode map\n");
+ info->nr_vnodes = 0;
+ }
+ /* plumb domain with vNUMA topology */
+ libxl_domain_setvnodes(ctx, domid, info->nr_vnodes,
+ info->max_vcpus, memblks,
+ info->vdistance, info->vcpu_to_vnode,
+ info->vnode_to_pnode);
+ }
+ else
+        LOG(DEBUG, "Will not construct vNUMA topology with 0 nodes\n");
+
libxl_domain_set_nodeaffinity(ctx, domid, &info->nodemap);
libxl_set_vcpuaffinity_all(ctx, domid, info->max_vcpus,
&info->cpumap);
@@ -253,6 +379,48 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
return rc;
}
+int libxl__vnodemap_is_usable(libxl__gc *gc, libxl_domain_build_info *info)
+{
+ int rc, nr_nodes, i;
+ libxl_numainfo *ninfo = NULL;
+ unsigned long long *claim;
+ unsigned int node;
+ uint64_t *mems;
+
+ rc = 0;
+ if (info->vnode_to_pnode == NULL)
+ return rc;
+    /*
+     * Cannot use the specified mapping on a non-NUMA machine.
+     */
+ ninfo = libxl_get_numainfo(CTX, &nr_nodes);
+ if (ninfo == NULL) {
+ return rc;
+ }
+ mems = info->vnuma_memszs;
+    claim = calloc(nr_nodes, sizeof(*claim));
+ if (claim == NULL)
+ return rc;
+    /* Sum the memory requests on a per-pnode basis */
+ for ( i = 0; i < info->nr_vnodes; i++ )
+ {
+ node = info->vnode_to_pnode[i];
+ /* Correct pnode number? */
+ if (node < nr_nodes)
+ claim[node] += (mems[i] << 20);
+ else
+ goto vmapu;
+ }
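+    /*
+     * The map is usable only if every pnode can satisfy the memory
+     * claimed from it.
+     */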
+ for ( i = 0; i < nr_nodes; i++)
+ if (claim[i] > ninfo[i].free)
+            /* Cannot satisfy the request; fall back to the default */
+ goto vmapu;
+ rc = 1;
+vmapu:
+    free(claim);
+    return rc;
+}
int libxl__build_post(libxl__gc *gc, uint32_t domid,
libxl_domain_build_info *info,
libxl__domain_build_state *state,
@@ -375,7 +543,23 @@ int libxl__build_pv(libxl__gc *gc, uint32_t domid,
}
}
}
-
+    if (info->nr_vnodes != 0) {
+        dom->nr_vnodes = info->nr_vnodes;
+        dom->vnode_to_pnode = malloc(dom->nr_vnodes *
+                                     sizeof(*dom->vnode_to_pnode));
+        dom->vnuma_memszs = malloc(dom->nr_vnodes *
+                                   sizeof(*dom->vnuma_memszs));
+        if (dom->vnuma_memszs == NULL || dom->vnode_to_pnode == NULL) {
+            LOGE(ERROR, "Failed to allocate memory for vNUMA domain image\n");
+            dom->nr_vnodes = 0;
+            info->nr_vnodes = 0;
+            free(dom->vnode_to_pnode);
+            free(dom->vnuma_memszs);
+            goto out;
+        }
+        memcpy(dom->vnuma_memszs, info->vnuma_memszs,
+               sizeof(*dom->vnuma_memszs) * dom->nr_vnodes);
+        memcpy(dom->vnode_to_pnode, info->vnode_to_pnode,
+               sizeof(*dom->vnode_to_pnode) * dom->nr_vnodes);
+    }
dom->flags = flags;
dom->console_evtchn = state->console_port;
dom->console_domid = state->console_domid;
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index 165dc00..19ac0fe 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -2710,6 +2710,7 @@ static inline void libxl__ctx_unlock(libxl_ctx *ctx) {
#define CTX_LOCK (libxl__ctx_lock(CTX))
#define CTX_UNLOCK (libxl__ctx_unlock(CTX))
+#define VNUMA_NO_NODE ~((unsigned int)0)
/*
* Automatic NUMA placement
*
@@ -2833,6 +2834,8 @@ void libxl__numa_candidate_put_nodemap(libxl__gc *gc,
libxl_bitmap_copy(CTX, &cndt->nodemap, nodemap);
}
+int libxl__init_vnodemap(libxl__gc *gc, uint32_t domid,
+ libxl_domain_build_info *info);
/*
* Inserts "elm_new" into the sorted list "head".
*
diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
index d2cea8a..5418966 100644
--- a/tools/libxl/libxl_types.idl
+++ b/tools/libxl/libxl_types.idl
@@ -292,7 +292,10 @@ libxl_domain_build_info = Struct("domain_build_info",[
("disable_migrate", libxl_defbool),
("cpuid", libxl_cpuid_policy_list),
("blkdev_start", string),
-
+ ("vnuma_memszs", Array(uint64, "nr_vnodes")),
+ ("vcpu_to_vnode", Array(uint32, "nr_vnodemap")),
+ ("vdistance", Array(uint32, "nr_vdist")),
+ ("vnode_to_pnode", Array(uint32,
"nr_vnode_to_pnode")),
("device_model_version", libxl_device_model_version),
("device_model_stubdomain", libxl_defbool),
# if you set device_model you must set device_model_version too
diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c
index a78c91d..01edc2b 100644
--- a/tools/libxl/libxl_x86.c
+++ b/tools/libxl/libxl_x86.c
@@ -308,3 +308,56 @@ int libxl__arch_domain_create(libxl__gc *gc, libxl_domain_config *d_config,
return ret;
}
+
+/*
+ * Checks for the beginning and end of RAM in the e820 map for the domain
+ * and aligns the start of the first and the end of the last vNUMA memory
+ * block to that map. The vnode memory sizes are passed here in Megabytes.
+ */
+int libxl__vnuma_align_mem(libxl__gc *gc,
+ uint32_t domid,
+ /* IN: mem sizes in Mbytes*/
+ libxl_domain_build_info *b_info,
+ /* OUT: linux numa blocks in pfn */
+ vnuma_memblk_t *memblks)
+{
+#ifndef roundup
+#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
+#endif
+ int i, rc;
+ unsigned long shift = 0;
+ unsigned long end_max;
+ uint32_t nr;
+ struct e820entry map[E820MAX];
+
+ libxl_ctx *ctx = libxl__gc_owner(gc);
+ rc = xc_get_machine_memory_map(ctx->xch, map, E820MAX);
+    if (rc < 0) {
+        errno = rc;
+        return -EINVAL;
+    }
+ nr = rc;
+ rc = e820_sanitize(ctx, map, &nr, b_info->target_memkb,
+ (b_info->max_memkb - b_info->target_memkb) +
+ b_info->u.pv.slack_memkb);
+ if (rc)
+ return -EINVAL;
+ end_max = map[nr-1].addr + map[nr-1].size;
+ shift = 0;
+ memset(memblks, 0, sizeof(*memblks)*b_info->nr_vnodes);
+ memblks[0].start = map[0].addr;
+
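+    /*
+     * 'shift' accumulates the running end address so that each block is
+     * placed right after the previous one; sizes are given in MB.
+     */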
+    for (i = 0; i < b_info->nr_vnodes; i++) {
+ memblks[i].start += shift;
+ memblks[i].end += shift + (b_info->vnuma_memszs[i] << 20);
+ shift = memblks[i].end;
+ memblks[i].start = roundup(memblks[i].start, 1024 * 4);
+        LIBXL__LOG(ctx, LIBXL__LOG_DEBUG,
+                   "start = %#010lx, end = %#010lx, size MB = %#010lx\n",
+                   memblks[i].start, memblks[i].end, b_info->vnuma_memszs[i]);
+ }
+
+    if (memblks[i-1].end > end_max)
+ memblks[i-1].end = end_max;
+ return 0;
+}
--
1.7.10.4