On 11/14/2013 03:26 AM, Elena Ufimtseva wrote:
> * Checks whether the user-defined map for physical NUMA node
> allocation is correct. If it is not, tries the automatic NUMA
> placement mechanism, otherwise falls back to the default allocation,
> not bound to any nodes. If the user-defined allocation map can be
> used, disables automatic NUMA placement.
>
> * Verifies the correctness of the memory range pfns for a PV guest
> by requesting the e820 map for that domain;
>
> * Provides Xen with the vNUMA topology information and the
> allocation map used for vnodes;
>
> Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
> ---
> tools/libxl/libxl.c | 20 +++++
> tools/libxl/libxl.h | 20 +++++
> tools/libxl/libxl_arch.h | 8 ++
> tools/libxl/libxl_dom.c | 189 +++++++++++++++++++++++++++++++++++++++++-
> tools/libxl/libxl_internal.h | 3 +
> tools/libxl/libxl_types.idl | 5 +-
> tools/libxl/libxl_vnuma.h | 7 ++
> tools/libxl/libxl_x86.c | 58 +++++++++++++
> 8 files changed, 308 insertions(+), 2 deletions(-)
> create mode 100644 tools/libxl/libxl_vnuma.h
>
> diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c
> index 0f0f56c..e420806 100644
> --- a/tools/libxl/libxl.c
> +++ b/tools/libxl/libxl.c
> @@ -4324,6 +4324,26 @@ static int libxl__set_vcpuonline_qmp(libxl__gc *gc, uint32_t domid,
> return 0;
> }
>
> +int libxl_domain_setvnodes(libxl_ctx *ctx,
> + uint32_t domid,
> + uint16_t nr_vnodes,
> + uint16_t nr_vcpus,
> + vmemrange_t *vmemrange,
> + unsigned int *vdistance,
> + unsigned int *vcpu_to_vnode,
> + unsigned int *vnode_numamap)
> +{
> + GC_INIT(ctx);
> + int ret;
> + ret = xc_domain_setvnodes(ctx->xch, domid, nr_vnodes,
> + nr_vcpus, vmemrange,
> + vdistance,
> + vcpu_to_vnode,
> + vnode_numamap);
> + GC_FREE;
> + return ret;
> +}
> +
> int libxl_set_vcpuonline(libxl_ctx *ctx, uint32_t domid, libxl_bitmap *cpumap)
> {
> GC_INIT(ctx);
> diff --git a/tools/libxl/libxl.h b/tools/libxl/libxl.h
> index 9379694..5206f41 100644
> --- a/tools/libxl/libxl.h
> +++ b/tools/libxl/libxl.h
> @@ -281,11 +281,14 @@
> #include <netinet/in.h>
> #include <sys/wait.h> /* for pid_t */
>
> +#include <xen/memory.h>
> #include <xentoollog.h>
>
> #include <libxl_uuid.h>
> #include <_libxl_list.h>
>
> +#include <xen/vnuma.h>
> +
> /* API compatibility. */
> #ifdef LIBXL_API_VERSION
> #if LIBXL_API_VERSION != 0x040200 && LIBXL_API_VERSION != 0x040300 && \
> @@ -376,6 +379,14 @@
> #define LIBXL_EXTERNAL_CALLERS_ONLY /* disappears for callers outside libxl */
> #endif
>
> +/*
> + * LIBXL_HAVE_BUILDINFO_VNUMA indicates that a vNUMA topology will be
> + * built for the guest upon request, based on the VM configuration.
> + * It will try to define the best allocation of vNUMA
> + * nodes on real NUMA nodes.
> + */
> +#define LIBXL_HAVE_BUILDINFO_VNUMA 1
> +
> typedef uint8_t libxl_mac[6];
> #define LIBXL_MAC_FMT "%02hhx:%02hhx:%02hhx:%02hhx:%02hhx:%02hhx"
> #define LIBXL_MAC_FMTLEN ((2*6)+5) /* 6 hex bytes plus 5 colons */
> @@ -735,6 +746,15 @@ void libxl_vcpuinfo_list_free(libxl_vcpuinfo *, int nr_vcpus);
> void libxl_device_vtpm_list_free(libxl_device_vtpm*, int nr_vtpms);
> void libxl_vtpminfo_list_free(libxl_vtpminfo *, int nr_vtpms);
>
> +int libxl_domain_setvnodes(libxl_ctx *ctx,
> + uint32_t domid,
> + uint16_t nr_vnodes,
> + uint16_t nr_vcpus,
> + vmemrange_t *vmemrange,
> + unsigned int *vdistance,
> + unsigned int *vcpu_to_vnode,
> + unsigned int *vnode_numamap);
So it looks like the interface you're presenting here is similar to the
libxc interface, in that the caller has to pass the topology into the
structure that they're using to build the domain, and *then* they have
to call a function to copy that topology into the hypervisor.

In the case of libxc that's probably fine, but in the case of libxl,
wouldn't it make sense to have libxl take the information from the
domain config struct and automatically pass that into Xen?
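
Something like the rough sketch below is what I have in mind -- not a
real patch; the helper name libxl__vnuma_set_topology is made up, and
it just assumes the xc_domain_setvnodes() call and the build_info
fields introduced by this series:

    /* Hypothetical internal helper: libxl reads the vNUMA topology
     * straight from libxl_domain_build_info and pushes it into Xen
     * itself, so callers only fill in the config struct and never
     * have to call a separate "setvnodes" entry point. */
    static int libxl__vnuma_set_topology(libxl__gc *gc, uint32_t domid,
                                         libxl_domain_build_info *info,
                                         vmemrange_t *memrange)
    {
        /* Everything comes from the build_info the caller filled in. */
        return xc_domain_setvnodes(CTX->xch, domid, info->nr_vnodes,
                                   info->max_vcpus, memrange,
                                   info->vdistance, info->vcpu_to_vnode,
                                   info->vnode_numamap);
    }

libxl__build_pre() could then call that helper after building memrange,
and the public libxl_domain_setvnodes() wouldn't need to exist at all.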
> +
> /*
> * Devices
> * ======
> diff --git a/tools/libxl/libxl_arch.h b/tools/libxl/libxl_arch.h
> index abe6685..c289612 100644
> --- a/tools/libxl/libxl_arch.h
> +++ b/tools/libxl/libxl_arch.h
> @@ -19,4 +19,12 @@
> int libxl__arch_domain_create(libxl__gc *gc, libxl_domain_config *d_config,
> uint32_t domid);
>
> +int libxl__vnuma_align_mem(libxl__gc *gc,
> + uint32_t domid,
> + struct libxl_domain_build_info *b_info,
> + vmemrange_t *memblks);
> +
> +int libxl__vnodemap_is_usable(libxl__gc *gc,
> + libxl_domain_build_info *info);
> +
> #endif
> diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
> index 1812bdc..187b498 100644
> --- a/tools/libxl/libxl_dom.c
> +++ b/tools/libxl/libxl_dom.c
> @@ -23,6 +23,7 @@
> #include <xc_dom.h>
> #include <xen/hvm/hvm_info_table.h>
> #include <xen/hvm/hvm_xs_strings.h>
> +#include <libxl_vnuma.h>
>
> libxl_domain_type libxl__domain_type(libxl__gc *gc, uint32_t domid)
> {
> @@ -201,6 +202,93 @@ static int numa_place_domain(libxl__gc *gc, uint32_t domid,
> return rc;
> }
>
> +/* prepares vnode to pnode map for domain vNUMA memory allocation */
> +int libxl__init_vnodemap(libxl__gc *gc, uint32_t domid,
> + libxl_domain_build_info *info)
> +{
> + int i, n, nr_nodes, rc;
> + uint64_t *mems;
> + unsigned long long *claim = NULL;
> + libxl_numainfo *ninfo = NULL;
> +
> + rc = -EINVAL;
> + if (info->vnode_numamap == NULL) {
> + info->vnode_numamap = calloc(info->nr_vnodes,
> + sizeof(*info->vnode_numamap));
> + if (info->vnode_numamap == NULL)
> + return rc;
> + }
> +
> + /*
> + * If this is not a NUMA machine, vnode_numamap will
> + * be initialized with VNUMA_NO_NODE.
> + */
> +
> + /* Get NUMA info */
> + ninfo = libxl_get_numainfo(CTX, &nr_nodes);
> + if (ninfo == NULL || nr_nodes == 0) {
> + for (i = 0; i < info->nr_vnodes; i++)
> + info->vnode_numamap[i] = VNUMA_NO_NODE;
> + LOG(DEBUG, "No HW NUMA found.\n");
> + rc = 0;
> + goto vnmapout;
> + }
> +
> + for (i=0; i< info->nr_vnodes; i++)
> + info->vnode_numamap[i] = VNUMA_NO_NODE;
> +
> + /*
> + * Check if we have any hardware NUMA nodes selected;
> + * otherwise VNUMA_NO_NODE is set and the default allocation is used.
> + */
> + if (libxl_bitmap_is_empty(&info->nodemap))
> + return 0;
> + mems = info->vnuma_memszs;
> +
> + /* check if all vnodes will fit in one node */
> + libxl_for_each_set_bit(n, info->nodemap) {
> + if(ninfo[n].free/1024 >= info->max_memkb &&
> + libxl_bitmap_test(&info->nodemap, n))
> + {
> + /*
> + * all domain v-nodes will fit one p-node n,
> + * p-node n is a best candidate selected by automatic
> + * NUMA placement.
> + */
> + for (i = 0; i < info->nr_vnodes; i++)
> + info->vnode_numamap[i] = n;
> + /* we can exit, as we are happy with placement */
> + return 0;
> + }
> + }
> + /* Determine the best nodes to fit vNUMA nodes */
> + /* TODO: change the algorithm. The current one just fits the nodes.
> + * It would be nice to have them also sorted by size.
> + * If no p-node is found, the vnode will be set to VNUMA_NO_NODE.
> + */
> + claim = calloc(info->nr_vnodes, sizeof(*claim));
> + if (claim == NULL)
> + return rc;
> +
> + libxl_for_each_set_bit(n, info->nodemap)
> + {
> + for ( i = 0; i < info->nr_vnodes; i++ )
> + {
> + if ( ((claim[n] + (mems[i] << 20)) <= ninfo[n].free) &&
> + /*vnode was not set yet */
> + (info->vnode_numamap[i] == VNUMA_NO_NODE ) )
> + {
> + info->vnode_numamap[i] = n;
> + claim[n] += (mems[i] << 20);
> + }
> + }
> + }
> + rc = 0;
> +vnmapout:
> + if (claim) free(claim);
> + return rc;
> +}
> +
> int libxl__build_pre(libxl__gc *gc, uint32_t domid,
> libxl_domain_config *d_config, libxl__domain_build_state *state)
> {
> @@ -214,6 +302,8 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
> return ERROR_FAIL;
> }
>
> +
> +
At a high level, the rest looks OK -- I'll do a more detailed review
tomorrow probably.
-George
> /*
> * Check if the domain has any CPU affinity. If not, try to build
> * up one. In case numa_place_domain() find at least a suitable
> @@ -235,6 +325,41 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
> if (rc)
> return rc;
> }
> + if (info->nr_vnodes != 0) {
> + /* The memory blocks will be formed here from sizes */
> + vmemrange_t *memrange = libxl__calloc(gc, info->nr_vnodes,
> + sizeof(*memrange));
> +
> + libxl__vnuma_align_mem(gc, domid, info, memrange);
> +
> + /*
> + * If a vNUMA vnode_numamap is defined, determine whether we
> + * can disable automatic NUMA placement and place vnodes
> + * on the specified pnodes.
> + * For now, if vcpu affinity is specified, we will use the
> + * specified vnode to pnode map.
> + */
> + if ( libxl__vnodemap_is_usable(gc, info) ) {
> + LOG(DETAIL, "vNUMA automatic placement disabled\n");
> + libxl_defbool_set(&info->numa_placement, false);
> + }
> + else {
> + /* Construct the vnode to pnode mapping if possible */
> + if (libxl__init_vnodemap(gc, domid, info) < 0) {
> + LOG(DEBUG, "Failed to call init_vnodemap\n");
> + /* vnode_numamap will not be used if nr_vnodes == 0 */
> + info->nr_vnodes = 0;
> + }
> + }
> + /* plumb domain with vNUMA topology */
> + libxl_domain_setvnodes(ctx, domid, info->nr_vnodes,
> + info->max_vcpus, memrange,
> + info->vdistance, info->vcpu_to_vnode,
> + info->vnode_numamap);
> + }
> + else
> + LOG(DEBUG, "Will not construct vNUMA topology.\n");
> +
> libxl_domain_set_nodeaffinity(ctx, domid, &info->nodemap);
> libxl_set_vcpuaffinity_all(ctx, domid, info->max_vcpus, &info->cpumap);
>
> @@ -255,7 +380,53 @@ int libxl__build_pre(libxl__gc *gc, uint32_t domid,
>
> return rc;
> }
> +/* Checks if the vnode_numamap defined in info can be used
> + * for allocation of vnodes on physical NUMA nodes by
> + * verifying that there is enough memory on the corresponding
> + * NUMA nodes.
> + */
> +int libxl__vnodemap_is_usable(libxl__gc *gc, libxl_domain_build_info *info)
> +{
> + int rc, nr_nodes, i;
> + libxl_numainfo *ninfo = NULL;
> + unsigned long long *claim;
> + unsigned int node;
> + uint64_t *mems;
>
> + rc = 0;
> + if (info->vnode_numamap == NULL)
> + return rc;
> + /*
> + * Cannot use specified mapping if not NUMA machine
> + */
> + ninfo = libxl_get_numainfo(CTX, &nr_nodes);
> + if (ninfo == NULL) {
> + return rc;
> + }
> + mems = info->vnuma_memszs;
> + claim = calloc(info->nr_vnodes, sizeof(*claim));
> + if (claim == NULL)
> + return rc;
> + /* Sum memory request on per pnode basis */
> + for ( i = 0; i < info->nr_vnodes; i++ )
> + {
> + node = info->vnode_numamap[i];
> + /* Correct pnode number? */
> + if (node < nr_nodes)
> + claim[node] += (mems[i] << 20);
> + else
> + goto vmapu;
> + }
> + for ( i = 0; i < nr_nodes; i++)
> + if (claim[i] > ninfo[i].free)
> + /* Cannot complete user request, falling back to default */
> + goto vmapu;
> + rc = 1;
> +vmapu:
> + if(claim) free(claim);
> + return rc;
> +
> +}
> int libxl__build_post(libxl__gc *gc, uint32_t domid,
> libxl_domain_build_info *info,
> libxl__domain_build_state *state,
> @@ -378,7 +549,23 @@ int libxl__build_pv(libxl__gc *gc, uint32_t domid,
> }
> }
> }
> -
> + if (info->nr_vnodes != 0) {
> + dom->nr_vnodes = info->nr_vnodes;
> + dom->vnode_numamap = malloc(dom->nr_vnodes * sizeof(*dom->vnode_numamap));
> + dom->vnuma_memszs = malloc(dom->nr_vnodes * sizeof(*dom->vnuma_memszs));
> + if (dom->vnuma_memszs == NULL || dom->vnode_numamap == NULL) {
> + LOGE(ERROR, "Failed to allocate memory for vNUMA domain image.\n");
> + dom->nr_vnodes = 0;
> + info->nr_vnodes = 0;
> + if (dom->vnode_numamap) free(dom->vnode_numamap);
> + if (dom->vnuma_memszs) free(dom->vnuma_memszs);
> + goto out;
> + }
> + memcpy(dom->vnuma_memszs, info->vnuma_memszs,
> + sizeof(*dom->vnuma_memszs) * dom->nr_vnodes);
> + memcpy(dom->vnode_numamap, info->vnode_numamap,
> + sizeof(*dom->vnode_numamap) * dom->nr_vnodes);
> + }
> dom->flags = flags;
> dom->console_evtchn = state->console_port;
> dom->console_domid = state->console_domid;
> diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
> index 4729566..27f7e2d 100644
> --- a/tools/libxl/libxl_internal.h
> +++ b/tools/libxl/libxl_internal.h
> @@ -2838,6 +2838,9 @@ void libxl__numa_candidate_put_nodemap(libxl__gc *gc,
> libxl_bitmap_copy(CTX, &cndt->nodemap, nodemap);
> }
>
> +int libxl__init_vnodemap(libxl__gc *gc, uint32_t domid,
> + libxl_domain_build_info *info);
> +
> /*
> * Inserts "elm_new" into the sorted list "head".
> *
> diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
> index ce003c6..0126b2b 100644
> --- a/tools/libxl/libxl_types.idl
> +++ b/tools/libxl/libxl_types.idl
> @@ -309,7 +309,10 @@ libxl_domain_build_info = Struct("domain_build_info",[
> ("disable_migrate", libxl_defbool),
> ("cpuid", libxl_cpuid_policy_list),
> ("blkdev_start", string),
> -
> + ("vnuma_memszs", Array(uint64, "nr_vnodes")),
> + ("vcpu_to_vnode", Array(uint32, "nr_vnodemap")),
> + ("vdistance", Array(uint32, "nr_vdist")),
> + ("vnode_numamap", Array(uint32,
"nr_vnode_to_pnode")),
> ("device_model_version", libxl_device_model_version),
> ("device_model_stubdomain", libxl_defbool),
> # if you set device_model you must set device_model_version too
> diff --git a/tools/libxl/libxl_vnuma.h b/tools/libxl/libxl_vnuma.h
> new file mode 100644
> index 0000000..d84b808
> --- /dev/null
> +++ b/tools/libxl/libxl_vnuma.h
> @@ -0,0 +1,7 @@
> +#include "libxl_osdeps.h" /* must come before any other headers
*/
> +
> +#define VNUMA_NO_NODE ~((unsigned int)0)
> +/* Min vNUMA node size in MB, taken as it is in Linux for a fake node */
> +#define MIN_VNODE_SIZE 32U
> +#define MAX_VNUMA_NODES ((unsigned int)1 << 10)
> +
> diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c
> index a78c91d..675a67a 100644
> --- a/tools/libxl/libxl_x86.c
> +++ b/tools/libxl/libxl_x86.c
> @@ -308,3 +308,61 @@ int libxl__arch_domain_create(libxl__gc *gc, libxl_domain_config *d_config,
>
> return ret;
> }
> +
> +/*
> + * Checks for the beginning and end of RAM in the e820 map for the domain
> + * and aligns the start of the first and the end of the last vNUMA memory
> + * block to that map. vnode memory sizes are passed here in Megabytes.
> + * For a PV guest the e820 map has fixed hole sizes.
> + */
> +int libxl__vnuma_align_mem(libxl__gc *gc,
> + uint32_t domid,
> + /* IN: mem sizes in Mbytes*/
> + libxl_domain_build_info *b_info,
> + /* OUT: linux numa blocks in pfn */
> + vmemrange_t *memblks)
> +{
> +#ifndef roundup
> +#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
> +#endif
> + int i, rc;
> + unsigned long shift = 0;
> + unsigned long end_max;
> + uint32_t nr;
> + struct e820entry map[E820MAX];
> +
> + /* retrieve domain e820 memory map */
> + libxl_ctx *ctx = libxl__gc_owner(gc);
> + rc = xc_get_machine_memory_map(ctx->xch, map, E820MAX);
> + if (rc < 0) {
> +
> + errno = rc;
> + return -EINVAL;
> + }
> + nr = rc;
> + /* sanitize map here as it was done for this domain */
> + rc = e820_sanitize(ctx, map, &nr, b_info->target_memkb,
> + (b_info->max_memkb - b_info->target_memkb) +
> + b_info->u.pv.slack_memkb);
> + if (rc)
> + return -EINVAL;
> + end_max = map[nr-1].addr + map[nr-1].size;
> + shift = 0;
> + memset(memblks, 0, sizeof(*memblks) * b_info->nr_vnodes);
> + memblks[0].start = map[0].addr;
> +
> + for(i = 0; i < b_info->nr_vnodes; i++) {
> + memblks[i].start += shift;
> + memblks[i].end += shift + (b_info->vnuma_memszs[i] << 20);
> + /* As of now linked list is not supported here */
> + memblks[i].next = NULL;
> + shift = memblks[i].end;
> + memblks[i].start = roundup(memblks[i].start, 1024 * 4);
> + LIBXL__LOG(ctx, LIBXL__LOG_DEBUG,"start = %#010lx, end =
%#010lx, size MB = %#010lx\n",
> + memblks[i].start, memblks[i].end,
b_info->vnuma_memszs[i]);
> + }
> +
> + if(memblks[i-1].end > end_max)
> + memblks[i-1].end = end_max;
> + return 0;
> +}
>