Hi,

Just reposting with, this time, _really_ all the Ack-s. :-)

Thanks everyone and Regards,
Dario

--
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
Dario Faggioli
2013-Apr-17 10:57 UTC
[PATCH 01 of 11 v6] xen, libxc: rename xenctl_cpumap to xenctl_bitmap
More specifically: 1. replaces xenctl_cpumap with xenctl_bitmap; 2. provides bitmap_to_xenctl_bitmap and the reverse; 3. re-implements cpumask_to_xenctl_bitmap with bitmap_to_xenctl_bitmap and the reverse. Other than #3, no functional changes. The interface is only slightly affected. This is in preparation for introducing NUMA node-affinity maps. Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com> Acked-by: George Dunlap <george.dunlap@eu.citrix.com> Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com> Acked-by: Keir Fraser <keir@xen.org> --- Changes since v3: * when renaming xenctl_cpumap to xenctl_bitmap, use nr_bits in place of nr_cpus (instead of nr_elems, as in v3 and before), as requested during review. Changes since v2: * Took care of tools/tests/mce-test/tools/xen-mceinj.c, which had been forgotten, as requested during review. diff --git a/tools/libxc/xc_cpupool.c b/tools/libxc/xc_cpupool.c --- a/tools/libxc/xc_cpupool.c +++ b/tools/libxc/xc_cpupool.c @@ -90,7 +90,7 @@ xc_cpupoolinfo_t *xc_cpupool_getinfo(xc_ sysctl.u.cpupool_op.op = XEN_SYSCTL_CPUPOOL_OP_INFO; sysctl.u.cpupool_op.cpupool_id = poolid; set_xen_guest_handle(sysctl.u.cpupool_op.cpumap.bitmap, local); - sysctl.u.cpupool_op.cpumap.nr_cpus = local_size * 8; + sysctl.u.cpupool_op.cpumap.nr_bits = local_size * 8; err = do_sysctl_save(xch, &sysctl); @@ -184,7 +184,7 @@ xc_cpumap_t xc_cpupool_freeinfo(xc_inter sysctl.cmd = XEN_SYSCTL_cpupool_op; sysctl.u.cpupool_op.op = XEN_SYSCTL_CPUPOOL_OP_FREEINFO; set_xen_guest_handle(sysctl.u.cpupool_op.cpumap.bitmap, local); - sysctl.u.cpupool_op.cpumap.nr_cpus = mapsize * 8; + sysctl.u.cpupool_op.cpumap.nr_bits = mapsize * 8; err = do_sysctl_save(xch, &sysctl); diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c --- a/tools/libxc/xc_domain.c +++ b/tools/libxc/xc_domain.c @@ -142,7 +142,7 @@ int xc_vcpu_setaffinity(xc_interface *xc set_xen_guest_handle(domctl.u.vcpuaffinity.cpumap.bitmap, local); - domctl.u.vcpuaffinity.cpumap.nr_cpus = cpusize * 8; + domctl.u.vcpuaffinity.cpumap.nr_bits = cpusize * 8; ret = do_domctl(xch, &domctl); @@ -182,7 +182,7 @@ int xc_vcpu_getaffinity(xc_interface *xc domctl.u.vcpuaffinity.vcpu = vcpu; set_xen_guest_handle(domctl.u.vcpuaffinity.cpumap.bitmap, local); - domctl.u.vcpuaffinity.cpumap.nr_cpus = cpusize * 8; + domctl.u.vcpuaffinity.cpumap.nr_bits = cpusize * 8; ret = do_domctl(xch, &domctl); diff --git a/tools/libxc/xc_tbuf.c b/tools/libxc/xc_tbuf.c --- a/tools/libxc/xc_tbuf.c +++ b/tools/libxc/xc_tbuf.c @@ -134,7 +134,7 @@ int xc_tbuf_set_cpu_mask(xc_interface *x bitmap_64_to_byte(bytemap, &mask64, sizeof (mask64) * 8); set_xen_guest_handle(sysctl.u.tbuf_op.cpu_mask.bitmap, bytemap); - sysctl.u.tbuf_op.cpu_mask.nr_cpus = sizeof(bytemap) * 8; + sysctl.u.tbuf_op.cpu_mask.nr_bits = sizeof(bytemap) * 8; ret = do_sysctl(xch, &sysctl); diff --git a/tools/tests/mce-test/tools/xen-mceinj.c b/tools/tests/mce-test/tools/xen-mceinj.c --- a/tools/tests/mce-test/tools/xen-mceinj.c +++ b/tools/tests/mce-test/tools/xen-mceinj.c @@ -161,7 +161,7 @@ static int inject_cmci(xc_interface *xc_ mc.u.mc_inject_v2.flags |= XEN_MC_INJECT_CPU_BROADCAST; mc.u.mc_inject_v2.flags |= XEN_MC_INJECT_TYPE_CMCI; - mc.u.mc_inject_v2.cpumap.nr_cpus = nr_cpus; + mc.u.mc_inject_v2.cpumap.nr_bits = nr_cpus; return xc_mca_op(xc_handle, &mc); } diff --git a/xen/arch/x86/cpu/mcheck/mce.c b/xen/arch/x86/cpu/mcheck/mce.c --- a/xen/arch/x86/cpu/mcheck/mce.c +++ b/xen/arch/x86/cpu/mcheck/mce.c @@ -1453,8 +1453,7 @@ long do_mca(XEN_GUEST_HANDLE_PARAM(xen_m cpumap =
&cpu_online_map; else { - ret = xenctl_cpumap_to_cpumask(&cmv, - &op->u.mc_inject_v2.cpumap); + ret = xenctl_bitmap_to_cpumask(&cmv, &op->u.mc_inject_v2.cpumap); if ( ret ) break; cpumap = cmv; diff --git a/xen/arch/x86/platform_hypercall.c b/xen/arch/x86/platform_hypercall.c --- a/xen/arch/x86/platform_hypercall.c +++ b/xen/arch/x86/platform_hypercall.c @@ -336,7 +336,7 @@ ret_t do_platform_op(XEN_GUEST_HANDLE_PA { uint32_t cpu; uint64_t idletime, now = NOW(); - struct xenctl_cpumap ctlmap; + struct xenctl_bitmap ctlmap; cpumask_var_t cpumap; XEN_GUEST_HANDLE(uint8) cpumap_bitmap; XEN_GUEST_HANDLE(uint64) idletimes; @@ -345,11 +345,11 @@ ret_t do_platform_op(XEN_GUEST_HANDLE_PA if ( cpufreq_controller != FREQCTL_dom0_kernel ) break; - ctlmap.nr_cpus = op->u.getidletime.cpumap_nr_cpus; + ctlmap.nr_bits = op->u.getidletime.cpumap_nr_cpus; guest_from_compat_handle(cpumap_bitmap, op->u.getidletime.cpumap_bitmap); ctlmap.bitmap.p = cpumap_bitmap.p; /* handle -> handle_64 conversion */ - if ( (ret = xenctl_cpumap_to_cpumask(&cpumap, &ctlmap)) != 0 ) + if ( (ret = xenctl_bitmap_to_cpumask(&cpumap, &ctlmap)) != 0 ) goto out; guest_from_compat_handle(idletimes, op->u.getidletime.idletime); @@ -368,7 +368,7 @@ ret_t do_platform_op(XEN_GUEST_HANDLE_PA op->u.getidletime.now = now; if ( ret == 0 ) - ret = cpumask_to_xenctl_cpumap(&ctlmap, cpumap); + ret = cpumask_to_xenctl_bitmap(&ctlmap, cpumap); free_cpumask_var(cpumap); if ( ret == 0 && __copy_field_to_guest(u_xenpf_op, op, u.getidletime) ) diff --git a/xen/common/cpupool.c b/xen/common/cpupool.c --- a/xen/common/cpupool.c +++ b/xen/common/cpupool.c @@ -493,7 +493,7 @@ int cpupool_do_sysctl(struct xen_sysctl_ op->cpupool_id = c->cpupool_id; op->sched_id = c->sched->sched_id; op->n_dom = c->n_dom; - ret = cpumask_to_xenctl_cpumap(&op->cpumap, c->cpu_valid); + ret = cpumask_to_xenctl_bitmap(&op->cpumap, c->cpu_valid); cpupool_put(c); } break; @@ -588,7 +588,7 @@ int cpupool_do_sysctl(struct xen_sysctl_ case XEN_SYSCTL_CPUPOOL_OP_FREEINFO: { - ret = cpumask_to_xenctl_cpumap( + ret = cpumask_to_xenctl_bitmap( &op->cpumap, &cpupool_free_cpus); } break; diff --git a/xen/common/domctl.c b/xen/common/domctl.c --- a/xen/common/domctl.c +++ b/xen/common/domctl.c @@ -32,28 +32,29 @@ static DEFINE_SPINLOCK(domctl_lock); DEFINE_SPINLOCK(vcpu_alloc_lock); -int cpumask_to_xenctl_cpumap( - struct xenctl_cpumap *xenctl_cpumap, const cpumask_t *cpumask) +int bitmap_to_xenctl_bitmap(struct xenctl_bitmap *xenctl_bitmap, + const unsigned long *bitmap, + unsigned int nbits) { unsigned int guest_bytes, copy_bytes, i; uint8_t zero = 0; int err = 0; - uint8_t *bytemap = xmalloc_array(uint8_t, (nr_cpu_ids + 7) / 8); + uint8_t *bytemap = xmalloc_array(uint8_t, (nbits + 7) / 8); if ( !bytemap ) return -ENOMEM; - guest_bytes = (xenctl_cpumap->nr_cpus + 7) / 8; - copy_bytes = min_t(unsigned int, guest_bytes, (nr_cpu_ids + 7) / 8); + guest_bytes = (xenctl_bitmap->nr_bits + 7) / 8; + copy_bytes = min_t(unsigned int, guest_bytes, (nbits + 7) / 8); - bitmap_long_to_byte(bytemap, cpumask_bits(cpumask), nr_cpu_ids); + bitmap_long_to_byte(bytemap, bitmap, nbits); if ( copy_bytes != 0 ) - if ( copy_to_guest(xenctl_cpumap->bitmap, bytemap, copy_bytes) ) + if ( copy_to_guest(xenctl_bitmap->bitmap, bytemap, copy_bytes) ) err = -EFAULT; for ( i = copy_bytes; !err && i < guest_bytes; i++ ) - if ( copy_to_guest_offset(xenctl_cpumap->bitmap, i, &zero, 1) ) + if ( copy_to_guest_offset(xenctl_bitmap->bitmap, i, &zero, 1) ) err = -EFAULT; xfree(bytemap); @@ -61,36 +62,58 @@ int 
cpumask_to_xenctl_cpumap( return err; } -int xenctl_cpumap_to_cpumask( - cpumask_var_t *cpumask, const struct xenctl_cpumap *xenctl_cpumap) +int xenctl_bitmap_to_bitmap(unsigned long *bitmap, + const struct xenctl_bitmap *xenctl_bitmap, + unsigned int nbits) { unsigned int guest_bytes, copy_bytes; int err = 0; - uint8_t *bytemap = xzalloc_array(uint8_t, (nr_cpu_ids + 7) / 8); + uint8_t *bytemap = xzalloc_array(uint8_t, (nbits + 7) / 8); if ( !bytemap ) return -ENOMEM; - guest_bytes = (xenctl_cpumap->nr_cpus + 7) / 8; - copy_bytes = min_t(unsigned int, guest_bytes, (nr_cpu_ids + 7) / 8); + guest_bytes = (xenctl_bitmap->nr_bits + 7) / 8; + copy_bytes = min_t(unsigned int, guest_bytes, (nbits + 7) / 8); if ( copy_bytes != 0 ) { - if ( copy_from_guest(bytemap, xenctl_cpumap->bitmap, copy_bytes) ) + if ( copy_from_guest(bytemap, xenctl_bitmap->bitmap, copy_bytes) ) err = -EFAULT; - if ( (xenctl_cpumap->nr_cpus & 7) && (guest_bytes == copy_bytes) ) - bytemap[guest_bytes-1] &= ~(0xff << (xenctl_cpumap->nr_cpus & 7)); + if ( (xenctl_bitmap->nr_bits & 7) && (guest_bytes == copy_bytes) ) + bytemap[guest_bytes-1] &= ~(0xff << (xenctl_bitmap->nr_bits & 7)); } - if ( err ) - /* nothing */; - else if ( alloc_cpumask_var(cpumask) ) - bitmap_byte_to_long(cpumask_bits(*cpumask), bytemap, nr_cpu_ids); + if ( !err ) + bitmap_byte_to_long(bitmap, bytemap, nbits); + + xfree(bytemap); + + return err; +} + +int cpumask_to_xenctl_bitmap(struct xenctl_bitmap *xenctl_cpumap, + const cpumask_t *cpumask) +{ + return bitmap_to_xenctl_bitmap(xenctl_cpumap, cpumask_bits(cpumask), + nr_cpu_ids); +} + +int xenctl_bitmap_to_cpumask(cpumask_var_t *cpumask, + const struct xenctl_bitmap *xenctl_cpumap) +{ + int err = 0; + + if ( alloc_cpumask_var(cpumask) ) { + err = xenctl_bitmap_to_bitmap(cpumask_bits(*cpumask), xenctl_cpumap, + nr_cpu_ids); + /* In case of error, cleanup is up to us, as the caller won''t care! 
*/ + if ( err ) + free_cpumask_var(*cpumask); + } else err = -ENOMEM; - xfree(bytemap); - return err; } @@ -540,7 +563,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe { cpumask_var_t new_affinity; - ret = xenctl_cpumap_to_cpumask( + ret = xenctl_bitmap_to_cpumask( &new_affinity, &op->u.vcpuaffinity.cpumap); if ( !ret ) { @@ -550,7 +573,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe } else { - ret = cpumask_to_xenctl_cpumap( + ret = cpumask_to_xenctl_bitmap( &op->u.vcpuaffinity.cpumap, v->cpu_affinity); } } diff --git a/xen/common/trace.c b/xen/common/trace.c --- a/xen/common/trace.c +++ b/xen/common/trace.c @@ -384,7 +384,7 @@ int tb_control(xen_sysctl_tbuf_op_t *tbc { cpumask_var_t mask; - rc = xenctl_cpumap_to_cpumask(&mask, &tbc->cpu_mask); + rc = xenctl_bitmap_to_cpumask(&mask, &tbc->cpu_mask); if ( !rc ) { cpumask_copy(&tb_cpu_mask, mask); diff --git a/xen/include/public/arch-x86/xen-mca.h b/xen/include/public/arch-x86/xen-mca.h --- a/xen/include/public/arch-x86/xen-mca.h +++ b/xen/include/public/arch-x86/xen-mca.h @@ -414,7 +414,7 @@ struct xen_mc_mceinject { struct xen_mc_inject_v2 { uint32_t flags; - struct xenctl_cpumap cpumap; + struct xenctl_bitmap cpumap; }; #endif diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h --- a/xen/include/public/domctl.h +++ b/xen/include/public/domctl.h @@ -285,7 +285,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_domctl_getvc /* XEN_DOMCTL_getvcpuaffinity */ struct xen_domctl_vcpuaffinity { uint32_t vcpu; /* IN */ - struct xenctl_cpumap cpumap; /* IN/OUT */ + struct xenctl_bitmap cpumap; /* IN/OUT */ }; typedef struct xen_domctl_vcpuaffinity xen_domctl_vcpuaffinity_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpuaffinity_t); diff --git a/xen/include/public/sysctl.h b/xen/include/public/sysctl.h --- a/xen/include/public/sysctl.h +++ b/xen/include/public/sysctl.h @@ -71,7 +71,7 @@ struct xen_sysctl_tbuf_op { #define XEN_SYSCTL_TBUFOP_disable 5 uint32_t cmd; /* IN/OUT variables */ - struct xenctl_cpumap cpu_mask; + struct xenctl_bitmap cpu_mask; uint32_t evt_mask; /* OUT variables */ uint64_aligned_t buffer_mfn; @@ -532,7 +532,7 @@ struct xen_sysctl_cpupool_op { uint32_t domid; /* IN: M */ uint32_t cpu; /* IN: AR */ uint32_t n_dom; /* OUT: I */ - struct xenctl_cpumap cpumap; /* OUT: IF */ + struct xenctl_bitmap cpumap; /* OUT: IF */ }; typedef struct xen_sysctl_cpupool_op xen_sysctl_cpupool_op_t; DEFINE_XEN_GUEST_HANDLE(xen_sysctl_cpupool_op_t); diff --git a/xen/include/public/xen.h b/xen/include/public/xen.h --- a/xen/include/public/xen.h +++ b/xen/include/public/xen.h @@ -869,9 +869,9 @@ typedef uint8_t xen_domain_handle_t[16]; #endif #ifndef __ASSEMBLY__ -struct xenctl_cpumap { +struct xenctl_bitmap { XEN_GUEST_HANDLE_64(uint8) bitmap; - uint32_t nr_cpus; + uint32_t nr_bits; }; #endif diff --git a/xen/include/xen/cpumask.h b/xen/include/xen/cpumask.h --- a/xen/include/xen/cpumask.h +++ b/xen/include/xen/cpumask.h @@ -424,8 +424,8 @@ extern cpumask_t cpu_present_map; #define for_each_present_cpu(cpu) for_each_cpu(cpu, &cpu_present_map) /* Copy to/from cpumap provided by control tools. 
*/ -struct xenctl_cpumap; -int cpumask_to_xenctl_cpumap(struct xenctl_cpumap *, const cpumask_t *); -int xenctl_cpumap_to_cpumask(cpumask_var_t *, const struct xenctl_cpumap *); +struct xenctl_bitmap; +int cpumask_to_xenctl_bitmap(struct xenctl_bitmap *, const cpumask_t *); +int xenctl_bitmap_to_cpumask(cpumask_var_t *, const struct xenctl_bitmap *); #endif /* __XEN_CPUMASK_H */ diff --git a/xen/include/xlat.lst b/xen/include/xlat.lst --- a/xen/include/xlat.lst +++ b/xen/include/xlat.lst @@ -2,7 +2,7 @@ # ! - needs translation # ? - needs checking ? dom0_vga_console_info xen.h -? xenctl_cpumap xen.h +? xenctl_bitmap xen.h ? mmu_update xen.h ! mmuext_op xen.h ! start_info xen.h
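To make the nr_bits convention concrete, here is a small standalone sketch in plain C (not Xen or libxc code; mask_trailing_bits() is a hypothetical helper written only for illustration). It shows the bookkeeping the converters above rely on: a xenctl_bitmap of nr_bits bits travels in (nr_bits + 7) / 8 bytes, and any stray bits in the final byte are cleared, just as xenctl_bitmap_to_bitmap() does with its "& ~(0xff << (nr_bits & 7))".

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical helper mirroring the last-byte masking done by
 * xenctl_bitmap_to_bitmap(): bits beyond nr_bits must not leak through. */
static void mask_trailing_bits(uint8_t *bytemap, unsigned int nr_bits)
{
    unsigned int nr_bytes = (nr_bits + 7) / 8;

    if ( nr_bits & 7 )
        bytemap[nr_bytes - 1] &= ~(0xff << (nr_bits & 7));
}

int main(void)
{
    uint8_t bytemap[2];
    unsigned int nr_bits = 10;  /* e.g. a 10-CPU host */

    memset(bytemap, 0xff, sizeof(bytemap));  /* guest passed "all ones" */
    mask_trailing_bits(bytemap, nr_bits);

    /* Bits 10..15 are now clear: prints "ff 03" */
    printf("%02x %02x\n", (unsigned)bytemap[0], (unsigned)bytemap[1]);
    return 0;
}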
Dario Faggioli
2013-Apr-17 10:57 UTC
[PATCH 02 of 11 v6] xen, libxc: introduce xc_nodemap_t
And its handling functions, following suit from xc_cpumap_t. Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com> Acked-by: George Dunlap <george.dunlap@eu.citrix.com> Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com> Acked-by: Keir Fraser <keir@xen.org> --- Changes from v5: * fixed the wrong usage of cpumask_bits() instead of nodes_addr(). Changes from v2: * Discriminating between statically allocated nodemask_t and dynamically allocated nodemask_var_t is not really necessary, given the maximum number of nodes won't ever grow that much, so killed that hunk, as suggested during review. diff --git a/tools/libxc/xc_misc.c b/tools/libxc/xc_misc.c --- a/tools/libxc/xc_misc.c +++ b/tools/libxc/xc_misc.c @@ -54,6 +54,11 @@ int xc_get_cpumap_size(xc_interface *xch return (xc_get_max_cpus(xch) + 7) / 8; } +int xc_get_nodemap_size(xc_interface *xch) +{ + return (xc_get_max_nodes(xch) + 7) / 8; +} + xc_cpumap_t xc_cpumap_alloc(xc_interface *xch) { int sz; @@ -64,6 +69,16 @@ xc_cpumap_t xc_cpumap_alloc(xc_interface return calloc(1, sz); } +xc_nodemap_t xc_nodemap_alloc(xc_interface *xch) +{ + int sz; + + sz = xc_get_nodemap_size(xch); + if (sz == 0) + return NULL; + return calloc(1, sz); +} + int xc_readconsolering(xc_interface *xch, char *buffer, unsigned int *pnr_chars, diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h --- a/tools/libxc/xenctrl.h +++ b/tools/libxc/xenctrl.h @@ -334,12 +334,20 @@ int xc_get_cpumap_size(xc_interface *xch /* allocate a cpumap */ xc_cpumap_t xc_cpumap_alloc(xc_interface *xch); - /* +/* * NODEMAP handling */ +typedef uint8_t *xc_nodemap_t; + /* return maximum number of NUMA nodes the hypervisor supports */ int xc_get_max_nodes(xc_interface *xch); +/* return array size for nodemap */ +int xc_get_nodemap_size(xc_interface *xch); + +/* allocate a nodemap */ +xc_nodemap_t xc_nodemap_alloc(xc_interface *xch); + /* * DOMAIN DEBUGGING FUNCTIONS */ diff --git a/xen/common/domctl.c b/xen/common/domctl.c --- a/xen/common/domctl.c +++ b/xen/common/domctl.c @@ -117,6 +117,20 @@ int xenctl_bitmap_to_cpumask(cpumask_var return err; } +int nodemask_to_xenctl_bitmap(struct xenctl_bitmap *xenctl_nodemap, + const nodemask_t *nodemask) +{ + return bitmap_to_xenctl_bitmap(xenctl_nodemap, nodes_addr(*nodemask), + MAX_NUMNODES); +} + +int xenctl_bitmap_to_nodemask(nodemask_t *nodemask, + const struct xenctl_bitmap *xenctl_nodemap) +{ + return xenctl_bitmap_to_bitmap(nodes_addr(*nodemask), xenctl_nodemap, + MAX_NUMNODES); +} + static inline int is_free_domid(domid_t dom) { struct domain *d;
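For reference, a minimal usage sketch of the new nodemap helpers, assuming this patch is applied and the program is linked against libxc on a Xen host (error handling kept to a minimum). The bit layout is the same one byte per 8 nodes convention xc_cpumap_t already uses for CPUs.

#include <stdio.h>
#include <stdlib.h>
#include <xenctrl.h>

int main(void)
{
    xc_interface *xch = xc_interface_open(NULL, NULL, 0);
    xc_nodemap_t nodemap;
    int node = 0;

    if ( !xch )
        return 1;

    /* One byte per 8 nodes, as computed by xc_get_nodemap_size() */
    printf("nodemap size: %d bytes (max nodes: %d)\n",
           xc_get_nodemap_size(xch), xc_get_max_nodes(xch));

    nodemap = xc_nodemap_alloc(xch);
    if ( nodemap )
    {
        /* Mark node 0 in the map, the same way xc_cpumap_t bits are set */
        nodemap[node / 8] |= 1 << (node % 8);
        free(nodemap);  /* xc_nodemap_alloc() uses calloc() */
    }

    xc_interface_close(xch);
    return 0;
}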
Dario Faggioli
2013-Apr-17 10:57 UTC
[PATCH 03 of 11 v6] xen: sched_credit: when picking, make sure we get an idle one, if any
The pcpu picking algorithm treats two threads of an SMT core the same. More specifically, if one is idle and the other one is busy, both will be assigned a weight of 1. Therefore, when picking begins, if the first target pcpu is the busy thread (and if there are no idle pcpus other than its sibling), that will never change. This change fixes that by ensuring that, before entering the core of the picking algorithm, the target pcpu is an idle one (if there is an idle pcpu at all, of course). Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com> Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com> Acked-by: George Dunlap <george.dunlap@eu.citrix.com> Acked-by: Keir Fraser <keir@xen.org> --- Changes from v4: * cpumask_test_cpu() comes before cpumask_empty(), being the less expensive of the two, as suggested during review. diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c --- a/xen/common/sched_credit.c +++ b/xen/common/sched_credit.c @@ -532,6 +532,21 @@ static int if ( vc->processor == cpu && IS_RUNQ_IDLE(cpu) ) cpumask_set_cpu(cpu, &idlers); cpumask_and(&cpus, &cpus, &idlers); + + /* + * It is important that cpu points to an idle processor, if a suitable + * one exists (and we can use cpus to check and, possibly, choose a new + * CPU, as we just &&-ed it with idlers). In fact, if we are on SMT, and + * cpu points to a busy thread with an idle sibling, both the threads + * will be considered the same, from the "idleness" calculation point + * of view", preventing vcpu from being moved to the thread that is + * actually idle. + * + * Notice that cpumask_test_cpu() is quicker than cpumask_empty(), so + * we check for it first. + */ + if ( !cpumask_test_cpu(cpu, &cpus) && !cpumask_empty(&cpus) ) + cpu = cpumask_cycle(cpu, &cpus); cpumask_clear_cpu(cpu, &cpus); while ( !cpumask_empty(&cpus) )
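As a toy illustration of the problem and of the fix (plain C with CPUs as bits in an integer, not the scheduler code itself; the ternary stands in for cpumask_cycle()):

#include <stdio.h>

int main(void)
{
    /* Two SMT siblings, CPUs 0 and 1, both allowed by affinity. */
    unsigned int cpus   = 0x3;
    unsigned int idlers = 0x2;  /* only CPU 1 is idle */
    int cpu = 0;                /* current candidate: the busy thread */

    cpus &= idlers;             /* as in the patch: cpumask_and(&cpus, &cpus, &idlers) */

    /* The fix: make sure the candidate itself is idle, if any idler exists.
     * Without this, both siblings score the same "idleness" weight and the
     * busy thread would never be replaced as the pick. */
    if ( !(cpus & (1u << cpu)) && cpus != 0 )
        cpu = (cpus & 0x2) ? 1 : 0;  /* toy stand-in for cpumask_cycle() */

    printf("picked CPU %d\n", cpu);  /* prints 1 */
    return 0;
}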
Dario Faggioli
2013-Apr-17 10:57 UTC
[PATCH 04 of 11 v6] xen: sched_credit: let the scheduler know about node-affinity
As vcpu-affinity tells where VCPUs must run, node-affinity tells where they prefer to. While respecting vcpu-affinity remains mandatory, node-affinity is not as strict: it only expresses a preference, although honouring it will bring significant performance benefits (especially as compared to not having any affinity at all). This change modifies the VCPU load balancing algorithm (for the credit scheduler only), introducing a two-step logic. During the first step, we use both the vcpu-affinity and the node-affinity masks (by looking at their intersection). The aim is to give precedence to the PCPUs where the domain prefers to run, as expressed by its node-affinity (with the intersection with the vcpu-affinity being necessary in order to avoid running a VCPU where it never should). If that fails to find a valid PCPU, the node-affinity is just ignored and, in the second step, we fall back to using cpu-affinity only. Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com> Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com> Acked-by: George Dunlap <george.dunlap@eu.citrix.com> Acked-by: Keir Fraser <keir@xen.org> --- Changes from v3: * rephrased some bits of the changelog, as suggested during review; * renamed __vcpu_has_valuable_node_affinity() to __vcpu_has_node_affinity() and killed csched_balance_step_skippable(), in favour of directly putting the full if statement in place, as requested and suggested (respectively) during review; * added a comment about future possible optimization in csched_runq_steal(). Changes from v2: * for_each_csched_balance_step() is now defined as a regular, and easier to understand, 0...n for() loop, instead of a backwards-going one, as that wasn't really needed; * checking whether or not a CSCHED_BALANCE_NODE_AFFINITY balancing set is useful now happens outside of csched_balance_cpumask(), i.e., closer to the actual loop, making the logic a lot more evident and easy to understand, as requested during review; * while reworking __runq_tickle(), handling of idle pcpus has been brought outside of the balancing loop, as requested during review; * __csched_vcpu_is_migrateable() was just wrong, so it has been removed; * the suboptimal handling of SMT in _csched_cpu_pick() has been moved to a separate patch (i.e., the previous patch in the series); * moved the CPU mask needed for balancing within `csched_pcpu', as suggested during review. This way it is not only closer to other per-PCPU data (potential cache related benefits), but it is also only allocated for the PCPUs credit is in charge of. Changes from v1: * CPU mask variables moved off the stack, as requested during review. As per the comments in the code, having them in the private (per-scheduler instance) struct could have been enough, but it would be racy (again, see comments). For that reason, use a global bunch of them (via per_cpu()); * George suggested a different load balancing logic during v1's review. I think he was right, so I changed the old implementation in a way that resembles exactly that. I rewrote most of this patch to introduce a more sensible and effective node-affinity handling logic.
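Before the diff, a toy sketch of the two-step logic described above, in plain C (bitmasks stand in for cpumask_t; balance_mask() is a made-up stand-in for csched_balance_cpumask(), written only for illustration):

#include <stdio.h>

#define BALANCE_NODE_AFFINITY 0
#define BALANCE_CPU_AFFINITY  1

/* Step 0 considers only CPUs in both the vcpu-affinity and the
 * node-affinity; step 1 falls back to the vcpu-affinity alone. */
static unsigned int balance_mask(unsigned int cpu_aff,
                                 unsigned int node_aff, int step)
{
    return step == BALANCE_NODE_AFFINITY ? (cpu_aff & node_aff) : cpu_aff;
}

int main(void)
{
    unsigned int cpu_aff  = 0x0f;  /* may run on CPUs 0-3          */
    unsigned int node_aff = 0x30;  /* but prefers CPUs 4-5 (!)     */
    unsigned int idlers   = 0x0c;  /* CPUs 2-3 are currently idle  */
    int step;

    for ( step = BALANCE_NODE_AFFINITY; step <= BALANCE_CPU_AFFINITY; step++ )
    {
        unsigned int candidates = balance_mask(cpu_aff, node_aff, step) & idlers;

        if ( candidates )
        {
            /* Prints "step 1: candidates 0xc": the preferred (node-affine)
             * step found nothing, so we fell back to vcpu-affinity only. */
            printf("step %d: candidates %#x\n", step, candidates);
            break;
        }
    }
    return 0;
}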
diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c --- a/xen/common/sched_credit.c +++ b/xen/common/sched_credit.c @@ -112,6 +112,12 @@ /* + * Node Balancing + */ +#define CSCHED_BALANCE_NODE_AFFINITY 0 +#define CSCHED_BALANCE_CPU_AFFINITY 1 + +/* * Boot parameters */ static int __read_mostly sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS; @@ -126,9 +132,20 @@ struct csched_pcpu { struct timer ticker; unsigned int tick; unsigned int idle_bias; + /* Store this here to avoid having too many cpumask_var_t-s on stack */ + cpumask_var_t balance_mask; }; /* + * Convenience macro for accessing the per-PCPU cpumask we need for + * implementing the two steps (vcpu and node affinity) balancing logic. + * It is stored in csched_pcpu so that serialization is not an issue, + * as there is a csched_pcpu for each PCPU and we always hold the + * runqueue spin-lock when using this. + */ +#define csched_balance_mask (CSCHED_PCPU(smp_processor_id())->balance_mask) + +/* * Virtual CPU */ struct csched_vcpu { @@ -161,6 +178,9 @@ struct csched_dom { struct list_head active_vcpu; struct list_head active_sdom_elem; struct domain *dom; + /* cpumask translated from the domain''s node-affinity. + * Basically, the CPUs we prefer to be scheduled on. */ + cpumask_var_t node_affinity_cpumask; uint16_t active_vcpu_count; uint16_t weight; uint16_t cap; @@ -241,6 +261,35 @@ static inline void list_del_init(&svc->runq_elem); } +#define for_each_csched_balance_step(step) \ + for ( (step) = 0; (step) <= CSCHED_BALANCE_CPU_AFFINITY; (step)++ ) + + +/* + * vcpu-affinity balancing is always necessary and must never be skipped. + * OTOH, if a domain''s node-affinity spans all the nodes, we can safely + * avoid dealing with node-affinity entirely. + */ +#define __vcpu_has_node_affinity(vc) \ + ( !cpumask_full(CSCHED_DOM(vc->domain)->node_affinity_cpumask) ) + +/* + * Each csched-balance step uses its own cpumask. This function determines + * which one (given the step) and copies it in mask. For the node-affinity + * balancing step, the pcpus that are not part of vc''s vcpu-affinity are + * filtered out from the result, to avoid running a vcpu where it would + * like, but is not allowed to! + */ +static void +csched_balance_cpumask(const struct vcpu *vc, int step, cpumask_t *mask) +{ + if ( step == CSCHED_BALANCE_NODE_AFFINITY ) + cpumask_and(mask, CSCHED_DOM(vc->domain)->node_affinity_cpumask, + vc->cpu_affinity); + else /* step == CSCHED_BALANCE_CPU_AFFINITY */ + cpumask_copy(mask, vc->cpu_affinity); +} + static void burn_credits(struct csched_vcpu *svc, s_time_t now) { s_time_t delta; @@ -272,12 +321,12 @@ static inline void struct csched_vcpu * const cur = CSCHED_VCPU(curr_on_cpu(cpu)); struct csched_private *prv = CSCHED_PRIV(per_cpu(scheduler, cpu)); cpumask_t mask, idle_mask; - int idlers_empty; + int balance_step, idlers_empty; ASSERT(cur); cpumask_clear(&mask); + idlers_empty = cpumask_empty(prv->idlers); - idlers_empty = cpumask_empty(prv->idlers); /* * If the pcpu is idle, or there are no idlers and the new @@ -297,41 +346,70 @@ static inline void } else if ( !idlers_empty ) { - /* Check whether or not there are idlers that can run new */ - cpumask_and(&idle_mask, prv->idlers, new->vcpu->cpu_affinity); + /* + * Node and vcpu-affinity balancing loop. For vcpus without + * a useful node-affinity, consider vcpu-affinity only. 
+ */ + for_each_csched_balance_step( balance_step ) + { + int new_idlers_empty; - /* - * If there are no suitable idlers for new, and it''s higher - * priority than cur, ask the scheduler to migrate cur away. - * We have to act like this (instead of just waking some of - * the idlers suitable for cur) because cur is running. - * - * If there are suitable idlers for new, no matter priorities, - * leave cur alone (as it is running and is, likely, cache-hot) - * and wake some of them (which is waking up and so is, likely, - * cache cold anyway). - */ - if ( cpumask_empty(&idle_mask) && new->pri > cur->pri ) - { - SCHED_STAT_CRANK(tickle_idlers_none); - SCHED_VCPU_STAT_CRANK(cur, kicked_away); - SCHED_VCPU_STAT_CRANK(cur, migrate_r); - SCHED_STAT_CRANK(migrate_kicked_away); - set_bit(_VPF_migrating, &cur->vcpu->pause_flags); - cpumask_set_cpu(cpu, &mask); - } - else if ( !cpumask_empty(&idle_mask) ) - { - /* Which of the idlers suitable for new shall we wake up? */ - SCHED_STAT_CRANK(tickle_idlers_some); - if ( opt_tickle_one_idle ) + if ( balance_step == CSCHED_BALANCE_NODE_AFFINITY + && !__vcpu_has_node_affinity(new->vcpu) ) + continue; + + /* Are there idlers suitable for new (for this balance step)? */ + csched_balance_cpumask(new->vcpu, balance_step, + csched_balance_mask); + cpumask_and(&idle_mask, prv->idlers, csched_balance_mask); + new_idlers_empty = cpumask_empty(&idle_mask); + + /* + * Let''s not be too harsh! If there aren''t idlers suitable + * for new in its node-affinity mask, make sure we check its + * vcpu-affinity as well, before taking final decisions. + */ + if ( new_idlers_empty + && balance_step == CSCHED_BALANCE_NODE_AFFINITY ) + continue; + + /* + * If there are no suitable idlers for new, and it''s higher + * priority than cur, ask the scheduler to migrate cur away. + * We have to act like this (instead of just waking some of + * the idlers suitable for cur) because cur is running. + * + * If there are suitable idlers for new, no matter priorities, + * leave cur alone (as it is running and is, likely, cache-hot) + * and wake some of them (which is waking up and so is, likely, + * cache cold anyway). + */ + if ( new_idlers_empty && new->pri > cur->pri ) { - this_cpu(last_tickle_cpu) - cpumask_cycle(this_cpu(last_tickle_cpu), &idle_mask); - cpumask_set_cpu(this_cpu(last_tickle_cpu), &mask); + SCHED_STAT_CRANK(tickle_idlers_none); + SCHED_VCPU_STAT_CRANK(cur, kicked_away); + SCHED_VCPU_STAT_CRANK(cur, migrate_r); + SCHED_STAT_CRANK(migrate_kicked_away); + set_bit(_VPF_migrating, &cur->vcpu->pause_flags); + cpumask_set_cpu(cpu, &mask); } - else - cpumask_or(&mask, &mask, &idle_mask); + else if ( !new_idlers_empty ) + { + /* Which of the idlers suitable for new shall we wake up? */ + SCHED_STAT_CRANK(tickle_idlers_some); + if ( opt_tickle_one_idle ) + { + this_cpu(last_tickle_cpu) + cpumask_cycle(this_cpu(last_tickle_cpu), &idle_mask); + cpumask_set_cpu(this_cpu(last_tickle_cpu), &mask); + } + else + cpumask_or(&mask, &mask, &idle_mask); + } + + /* Did we find anyone? 
*/ + if ( !cpumask_empty(&mask) ) + break; } } @@ -376,6 +454,7 @@ csched_free_pdata(const struct scheduler spin_unlock_irqrestore(&prv->lock, flags); + free_cpumask_var(spc->balance_mask); xfree(spc); } @@ -391,6 +470,12 @@ csched_alloc_pdata(const struct schedule if ( spc == NULL ) return NULL; + if ( !alloc_cpumask_var(&spc->balance_mask) ) + { + xfree(spc); + return NULL; + } + spin_lock_irqsave(&prv->lock, flags); /* Initialize/update system-wide config */ @@ -481,15 +566,16 @@ static inline int } static inline int -__csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu) +__csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu, cpumask_t *mask) { /* * Don''t pick up work that''s in the peer''s scheduling tail or hot on - * peer PCPU. Only pick up work that''s allowed to run on our CPU. + * peer PCPU. Only pick up work that prefers and/or is allowed to run + * on our CPU. */ return !vc->is_running && !__csched_vcpu_is_cache_hot(vc) && - cpumask_test_cpu(dest_cpu, vc->cpu_affinity); + cpumask_test_cpu(dest_cpu, mask); } static int @@ -499,100 +585,114 @@ static int cpumask_t idlers; cpumask_t *online; struct csched_pcpu *spc = NULL; - int cpu; + int cpu = vc->processor; + int balance_step; - /* - * Pick from online CPUs in VCPU''s affinity mask, giving a - * preference to its current processor if it''s in there. - */ online = cpupool_scheduler_cpumask(vc->domain->cpupool); - cpumask_and(&cpus, online, vc->cpu_affinity); - cpu = cpumask_test_cpu(vc->processor, &cpus) - ? vc->processor - : cpumask_cycle(vc->processor, &cpus); - ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) ); + for_each_csched_balance_step( balance_step ) + { + if ( balance_step == CSCHED_BALANCE_NODE_AFFINITY + && !__vcpu_has_node_affinity(vc) ) + continue; - /* - * Try to find an idle processor within the above constraints. - * - * In multi-core and multi-threaded CPUs, not all idle execution - * vehicles are equal! - * - * We give preference to the idle execution vehicle with the most - * idling neighbours in its grouping. This distributes work across - * distinct cores first and guarantees we don''t do something stupid - * like run two VCPUs on co-hyperthreads while there are idle cores - * or sockets. - * - * Notice that, when computing the "idleness" of cpu, we may want to - * discount vc. That is, iff vc is the currently running and the only - * runnable vcpu on cpu, we add cpu to the idlers. - */ - cpumask_and(&idlers, &cpu_online_map, CSCHED_PRIV(ops)->idlers); - if ( vc->processor == cpu && IS_RUNQ_IDLE(cpu) ) - cpumask_set_cpu(cpu, &idlers); - cpumask_and(&cpus, &cpus, &idlers); + /* Pick an online CPU from the proper affinity mask */ + csched_balance_cpumask(vc, balance_step, &cpus); + cpumask_and(&cpus, &cpus, online); - /* - * It is important that cpu points to an idle processor, if a suitable - * one exists (and we can use cpus to check and, possibly, choose a new - * CPU, as we just &&-ed it with idlers). In fact, if we are on SMT, and - * cpu points to a busy thread with an idle sibling, both the threads - * will be considered the same, from the "idleness" calculation point - * of view", preventing vcpu from being moved to the thread that is - * actually idle. - * - * Notice that cpumask_test_cpu() is quicker than cpumask_empty(), so - * we check for it first. 
- */ - if ( !cpumask_test_cpu(cpu, &cpus) && !cpumask_empty(&cpus) ) - cpu = cpumask_cycle(cpu, &cpus); - cpumask_clear_cpu(cpu, &cpus); + /* If present, prefer vc''s current processor */ + cpu = cpumask_test_cpu(vc->processor, &cpus) + ? vc->processor + : cpumask_cycle(vc->processor, &cpus); + ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) ); - while ( !cpumask_empty(&cpus) ) - { - cpumask_t cpu_idlers; - cpumask_t nxt_idlers; - int nxt, weight_cpu, weight_nxt; - int migrate_factor; + /* + * Try to find an idle processor within the above constraints. + * + * In multi-core and multi-threaded CPUs, not all idle execution + * vehicles are equal! + * + * We give preference to the idle execution vehicle with the most + * idling neighbours in its grouping. This distributes work across + * distinct cores first and guarantees we don''t do something stupid + * like run two VCPUs on co-hyperthreads while there are idle cores + * or sockets. + * + * Notice that, when computing the "idleness" of cpu, we may want to + * discount vc. That is, iff vc is the currently running and the only + * runnable vcpu on cpu, we add cpu to the idlers. + */ + cpumask_and(&idlers, &cpu_online_map, CSCHED_PRIV(ops)->idlers); + if ( vc->processor == cpu && IS_RUNQ_IDLE(cpu) ) + cpumask_set_cpu(cpu, &idlers); + cpumask_and(&cpus, &cpus, &idlers); - nxt = cpumask_cycle(cpu, &cpus); + /* + * It is important that cpu points to an idle processor, if a suitable + * one exists (and we can use cpus to check and, possibly, choose a new + * CPU, as we just &&-ed it with idlers). In fact, if we are on SMT, and + * cpu points to a busy thread with an idle sibling, both the threads + * will be considered the same, from the "idleness" calculation point + * of view", preventing vcpu from being moved to the thread that is + * actually idle. + * + * Notice that cpumask_test_cpu() is quicker than cpumask_empty(), so + * we check for it first. + */ + if ( !cpumask_test_cpu(cpu, &cpus) && !cpumask_empty(&cpus) ) + cpu = cpumask_cycle(cpu, &cpus); + cpumask_clear_cpu(cpu, &cpus); - if ( cpumask_test_cpu(cpu, per_cpu(cpu_core_mask, nxt)) ) + while ( !cpumask_empty(&cpus) ) { - /* We''re on the same socket, so check the busy-ness of threads. - * Migrate if # of idlers is less at all */ - ASSERT( cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) ); - migrate_factor = 1; - cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_sibling_mask, cpu)); - cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_sibling_mask, nxt)); - } - else - { - /* We''re on different sockets, so check the busy-ness of cores. - * Migrate only if the other core is twice as idle */ - ASSERT( !cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) ); - migrate_factor = 2; - cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_core_mask, cpu)); - cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_core_mask, nxt)); + cpumask_t cpu_idlers; + cpumask_t nxt_idlers; + int nxt, weight_cpu, weight_nxt; + int migrate_factor; + + nxt = cpumask_cycle(cpu, &cpus); + + if ( cpumask_test_cpu(cpu, per_cpu(cpu_core_mask, nxt)) ) + { + /* We''re on the same socket, so check the busy-ness of threads. + * Migrate if # of idlers is less at all */ + ASSERT( cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) ); + migrate_factor = 1; + cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_sibling_mask, + cpu)); + cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_sibling_mask, + nxt)); + } + else + { + /* We''re on different sockets, so check the busy-ness of cores. 
+ * Migrate only if the other core is twice as idle */ + ASSERT( !cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) ); + migrate_factor = 2; + cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_core_mask, cpu)); + cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_core_mask, nxt)); + } + + weight_cpu = cpumask_weight(&cpu_idlers); + weight_nxt = cpumask_weight(&nxt_idlers); + /* smt_power_savings: consolidate work rather than spreading it */ + if ( sched_smt_power_savings ? + weight_cpu > weight_nxt : + weight_cpu * migrate_factor < weight_nxt ) + { + cpumask_and(&nxt_idlers, &cpus, &nxt_idlers); + spc = CSCHED_PCPU(nxt); + cpu = cpumask_cycle(spc->idle_bias, &nxt_idlers); + cpumask_andnot(&cpus, &cpus, per_cpu(cpu_sibling_mask, cpu)); + } + else + { + cpumask_andnot(&cpus, &cpus, &nxt_idlers); + } } - weight_cpu = cpumask_weight(&cpu_idlers); - weight_nxt = cpumask_weight(&nxt_idlers); - /* smt_power_savings: consolidate work rather than spreading it */ - if ( sched_smt_power_savings ? - weight_cpu > weight_nxt : - weight_cpu * migrate_factor < weight_nxt ) - { - cpumask_and(&nxt_idlers, &cpus, &nxt_idlers); - spc = CSCHED_PCPU(nxt); - cpu = cpumask_cycle(spc->idle_bias, &nxt_idlers); - cpumask_andnot(&cpus, &cpus, per_cpu(cpu_sibling_mask, cpu)); - } - else - { - cpumask_andnot(&cpus, &cpus, &nxt_idlers); - } + /* Stop if cpu is idle */ + if ( cpumask_test_cpu(cpu, &idlers) ) + break; } if ( commit && spc ) @@ -934,6 +1034,13 @@ csched_alloc_domdata(const struct schedu if ( sdom == NULL ) return NULL; + if ( !alloc_cpumask_var(&sdom->node_affinity_cpumask) ) + { + xfree(sdom); + return NULL; + } + cpumask_setall(sdom->node_affinity_cpumask); + /* Initialize credit and weight */ INIT_LIST_HEAD(&sdom->active_vcpu); sdom->active_vcpu_count = 0; @@ -965,6 +1072,9 @@ csched_dom_init(const struct scheduler * static void csched_free_domdata(const struct scheduler *ops, void *data) { + struct csched_dom *sdom = data; + + free_cpumask_var(sdom->node_affinity_cpumask); xfree(data); } @@ -1259,7 +1369,7 @@ csched_tick(void *_cpu) } static struct csched_vcpu * -csched_runq_steal(int peer_cpu, int cpu, int pri) +csched_runq_steal(int peer_cpu, int cpu, int pri, int balance_step) { const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu); const struct vcpu * const peer_vcpu = curr_on_cpu(peer_cpu); @@ -1284,11 +1394,28 @@ csched_runq_steal(int peer_cpu, int cpu, if ( speer->pri <= pri ) break; - /* Is this VCPU is runnable on our PCPU? */ + /* Is this VCPU runnable on our PCPU? */ vc = speer->vcpu; BUG_ON( is_idle_vcpu(vc) ); - if (__csched_vcpu_is_migrateable(vc, cpu)) + /* + * If the vcpu has no useful node-affinity, skip this vcpu. + * In fact, what we want is to check if we have any node-affine + * work to steal, before starting to look at vcpu-affine work. + * + * Notice that, if not even one vCPU on this runq has a useful + * node-affinity, we could have avoid considering this runq for + * a node balancing step in the first place. This, for instance, + * can be implemented by taking note of on what runq there are + * vCPUs with useful node-affinities in some sort of bitmap + * or counter. + */ + if ( balance_step == CSCHED_BALANCE_NODE_AFFINITY + && !__vcpu_has_node_affinity(vc) ) + continue; + + csched_balance_cpumask(vc, balance_step, csched_balance_mask); + if ( __csched_vcpu_is_migrateable(vc, cpu, csched_balance_mask) ) { /* We got a candidate. Grab it! 
*/ TRACE_3D(TRC_CSCHED_STOLEN_VCPU, peer_cpu, @@ -1314,7 +1441,8 @@ csched_load_balance(struct csched_privat struct csched_vcpu *speer; cpumask_t workers; cpumask_t *online; - int peer_cpu; + int peer_cpu, peer_node, bstep; + int node = cpu_to_node(cpu); BUG_ON( cpu != snext->vcpu->processor ); online = cpupool_scheduler_cpumask(per_cpu(cpupool, cpu)); @@ -1331,42 +1459,68 @@ csched_load_balance(struct csched_privat SCHED_STAT_CRANK(load_balance_other); /* - * Peek at non-idling CPUs in the system, starting with our - * immediate neighbour. + * Let''s look around for work to steal, taking both vcpu-affinity + * and node-affinity into account. More specifically, we check all + * the non-idle CPUs'' runq, looking for: + * 1. any node-affine work to steal first, + * 2. if not finding anything, any vcpu-affine work to steal. */ - cpumask_andnot(&workers, online, prv->idlers); - cpumask_clear_cpu(cpu, &workers); - peer_cpu = cpu; + for_each_csched_balance_step( bstep ) + { + /* + * We peek at the non-idling CPUs in a node-wise fashion. In fact, + * it is more likely that we find some node-affine work on our same + * node, not to mention that migrating vcpus within the same node + * could well expected to be cheaper than across-nodes (memory + * stays local, there might be some node-wide cache[s], etc.). + */ + peer_node = node; + do + { + /* Find out what the !idle are in this node */ + cpumask_andnot(&workers, online, prv->idlers); + cpumask_and(&workers, &workers, &node_to_cpumask(peer_node)); + cpumask_clear_cpu(cpu, &workers); - while ( !cpumask_empty(&workers) ) - { - peer_cpu = cpumask_cycle(peer_cpu, &workers); - cpumask_clear_cpu(peer_cpu, &workers); + if ( cpumask_empty(&workers) ) + goto next_node; - /* - * Get ahold of the scheduler lock for this peer CPU. - * - * Note: We don''t spin on this lock but simply try it. Spinning could - * cause a deadlock if the peer CPU is also load balancing and trying - * to lock this CPU. - */ - if ( !pcpu_schedule_trylock(peer_cpu) ) - { - SCHED_STAT_CRANK(steal_trylock_failed); - continue; - } + peer_cpu = cpumask_first(&workers); + do + { + /* + * Get ahold of the scheduler lock for this peer CPU. + * + * Note: We don''t spin on this lock but simply try it. Spinning + * could cause a deadlock if the peer CPU is also load + * balancing and trying to lock this CPU. + */ + if ( !pcpu_schedule_trylock(peer_cpu) ) + { + SCHED_STAT_CRANK(steal_trylock_failed); + peer_cpu = cpumask_cycle(peer_cpu, &workers); + continue; + } - /* - * Any work over there to steal? - */ - speer = cpumask_test_cpu(peer_cpu, online) ? - csched_runq_steal(peer_cpu, cpu, snext->pri) : NULL; - pcpu_schedule_unlock(peer_cpu); - if ( speer != NULL ) - { - *stolen = 1; - return speer; - } + /* Any work over there to steal? */ + speer = cpumask_test_cpu(peer_cpu, online) ? + csched_runq_steal(peer_cpu, cpu, snext->pri, bstep) : NULL; + pcpu_schedule_unlock(peer_cpu); + + /* As soon as one vcpu is found, balancing ends */ + if ( speer != NULL ) + { + *stolen = 1; + return speer; + } + + peer_cpu = cpumask_cycle(peer_cpu, &workers); + + } while( peer_cpu != cpumask_first(&workers) ); + + next_node: + peer_node = cycle_node(peer_node, node_online_map); + } while( peer_node != node ); } out: diff --git a/xen/include/xen/nodemask.h b/xen/include/xen/nodemask.h --- a/xen/include/xen/nodemask.h +++ b/xen/include/xen/nodemask.h @@ -41,6 +41,8 @@ * int last_node(mask) Number highest set bit, or MAX_NUMNODES * int first_unset_node(mask) First node not set in mask, or * MAX_NUMNODES. 
+ * int cycle_node(node, mask) Next node cycling from ''node'', or + * MAX_NUMNODES * * nodemask_t nodemask_of_node(node) Return nodemask with bit ''node'' set * NODE_MASK_ALL Initializer - all bits set @@ -254,6 +256,16 @@ static inline int __first_unset_node(con find_first_zero_bit(maskp->bits, MAX_NUMNODES)); } +#define cycle_node(n, src) __cycle_node((n), &(src), MAX_NUMNODES) +static inline int __cycle_node(int n, const nodemask_t *maskp, int nbits) +{ + int nxt = __next_node(n, maskp, nbits); + + if (nxt == nbits) + nxt = __first_node(maskp, nbits); + return nxt; +} + #define NODE_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(MAX_NUMNODES) #if MAX_NUMNODES <= BITS_PER_LONG
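To show the node-wise visiting order that the new cycle_node() enables in csched_load_balance(), here is a toy sketch in plain C (nodes as bits in an integer, not a real nodemask_t; next_node() is an illustrative stand-in for cycle_node()):

#include <stdio.h>

/* Start from the local node and cycle through the online nodes, wrapping
 * around, so node-affine work on nearby runqueues is considered first. */
static int next_node(unsigned int online, int node, int nr_nodes)
{
    int i;

    for ( i = 1; i <= nr_nodes; i++ )
    {
        int nxt = (node + i) % nr_nodes;
        if ( online & (1u << nxt) )
            return nxt;
    }
    return node;
}

int main(void)
{
    unsigned int online = 0x0b;  /* nodes 0, 1 and 3 are online */
    int local = 1, node = local;

    do {
        printf("visit node %d\n", node);  /* prints 1, 3, 0 */
        node = next_node(online, node, 4);
    } while ( node != local );

    return 0;
}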
Dario Faggioli
2013-Apr-17 10:57 UTC
[PATCH 05 of 11 v6] xen: allow for explicitly specifying node-affinity
Make it possible to pass the node-affinity of a domain to the hypervisor from the upper layers, instead of always being computed automatically. Note that this also required generalizing the Flask hooks for setting and getting the affinity, so that they now deal with both vcpu and node affinity. Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com> Acked-by: Daniel De Graaf <dgdegra@tycho.nsa.gov> Acked-by: George Dunlap <george.dunlap@eu.citrix.com> Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com> Acked-by: Keir Fraser <keir@xen.org> --- Changes from v4: * auto_node_affinity (in struct domain) is now bool_t, and in a spot where it should not add much extra padding, as requested during review; * flask: correctly updated the switch in hooks.c and the comments in access_vectors to reflect the new domctls, as requested during review. Changes from v3: * consider it possible for node-affinity and vcpu-affinity to have an empty intersection (as not doing so, when the full series is applied and vcpu-affinity is changed on-line, could result in failing an ASSERT); * handling of XEN_DOMCTL_{set,get}nodeaffinity now happens in two separate `case'' clauses, as suggested during review. Changes from v2: * reworked as needed after the merge of Daniel''s IS_PRIV work; * xen.te taken care of, as requested during review. Changes from v1: * added the missing dummy hook for nodeaffinity; * let the permission renaming affect flask policies too. diff --git a/tools/flask/policy/policy/mls b/tools/flask/policy/policy/mls --- a/tools/flask/policy/policy/mls +++ b/tools/flask/policy/policy/mls @@ -70,11 +70,11 @@ mlsconstrain domain transition (( h1 dom h2 ) and (( l1 eq l2 ) or (t1 == mls_priv))); # all the domain "read" ops -mlsconstrain domain { getvcpuaffinity getdomaininfo getvcpuinfo getvcpucontext getaddrsize getextvcpucontext } +mlsconstrain domain { getaffinity getdomaininfo getvcpuinfo getvcpucontext getaddrsize getextvcpucontext } ((l1 dom l2) or (t1 == mls_priv)); # all the domain "write" ops -mlsconstrain domain { setvcpucontext pause unpause resume create max_vcpus destroy setvcpuaffinity scheduler setdomainmaxmem setdomainhandle setdebugging hypercall settime set_target shutdown setaddrsize trigger setextvcpucontext } +mlsconstrain domain { setvcpucontext pause unpause resume create max_vcpus destroy setaffinity scheduler setdomainmaxmem setdomainhandle setdebugging hypercall settime set_target shutdown setaddrsize trigger setextvcpucontext } ((l1 eq l2) or (t1 == mls_priv)); # This is incomplete - similar constraints must be written for all classes diff --git a/tools/flask/policy/policy/modules/xen/xen.if b/tools/flask/policy/policy/modules/xen/xen.if --- a/tools/flask/policy/policy/modules/xen/xen.if +++ b/tools/flask/policy/policy/modules/xen/xen.if @@ -48,7 +48,7 @@ define(`create_domain_common'', ` allow $1 $2:domain { create max_vcpus setdomainmaxmem setaddrsize getdomaininfo hypercall setvcpucontext setextvcpucontext getscheduler getvcpuinfo getvcpuextstate getaddrsize - getvcpuaffinity setvcpuaffinity }; + getaffinity setaffinity }; allow $1 $2:domain2 { set_cpuid settsc setscheduler }; allow $1 $2:security check_context; allow $1 $2:shadow enable; @@ -77,9 +77,9 @@ define(`create_domain_build_label'', ` # manage_domain(priv, target) # Allow managing a running domain define(`manage_domain'', ` - allow $1 $2:domain { getdomaininfo getvcpuinfo getvcpuaffinity + allow $1 $2:domain { getdomaininfo getvcpuinfo getaffinity getaddrsize pause unpause trigger shutdown destroy - 
setvcpuaffinity setdomainmaxmem getscheduler }; + setaffinity setdomainmaxmem getscheduler }; '') # migrate_domain_out(priv, target) diff --git a/tools/flask/policy/policy/modules/xen/xen.te b/tools/flask/policy/policy/modules/xen/xen.te --- a/tools/flask/policy/policy/modules/xen/xen.te +++ b/tools/flask/policy/policy/modules/xen/xen.te @@ -69,7 +69,7 @@ allow dom0_t xen_t:mmu memorymap; # Allow dom0 to use these domctls on itself. For domctls acting on other # domains, see the definitions of create_domain and manage_domain. allow dom0_t dom0_t:domain { - setvcpucontext max_vcpus setvcpuaffinity getvcpuaffinity getscheduler + setvcpucontext max_vcpus setaffinity getaffinity getscheduler getdomaininfo getvcpuinfo getvcpucontext setdomainmaxmem setdomainhandle setdebugging hypercall settime setaddrsize getaddrsize trigger getextvcpucontext setextvcpucontext getvcpuextstate setvcpuextstate diff --git a/xen/common/domain.c b/xen/common/domain.c --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -224,6 +224,7 @@ struct domain *domain_create( spin_lock_init(&d->node_affinity_lock); d->node_affinity = NODE_MASK_ALL; + d->auto_node_affinity = 1; spin_lock_init(&d->shutdown_lock); d->shutdown_code = -1; @@ -364,11 +365,32 @@ void domain_update_node_affinity(struct cpumask_or(cpumask, cpumask, online_affinity); } - for_each_online_node ( node ) - if ( cpumask_intersects(&node_to_cpumask(node), cpumask) ) - node_set(node, nodemask); + if ( d->auto_node_affinity ) + { + /* Node-affinity is automaically computed from all vcpu-affinities */ + for_each_online_node ( node ) + if ( cpumask_intersects(&node_to_cpumask(node), cpumask) ) + node_set(node, nodemask); - d->node_affinity = nodemask; + d->node_affinity = nodemask; + } + else + { + /* Node-affinity is provided by someone else, just filter out cpus + * that are either offline or not in the affinity of any vcpus. */ + nodemask = d->node_affinity; + for_each_node_mask ( node, d->node_affinity ) + if ( !cpumask_intersects(&node_to_cpumask(node), cpumask) ) + node_clear(node, nodemask);//d->node_affinity); + + /* Avoid loosing track of node-affinity because of a bad + * vcpu-affinity has been specified. */ + if ( !nodes_empty(nodemask) ) + d->node_affinity = nodemask; + } + + sched_set_node_affinity(d, &d->node_affinity); + spin_unlock(&d->node_affinity_lock); free_cpumask_var(online_affinity); @@ -376,6 +398,36 @@ void domain_update_node_affinity(struct } +int domain_set_node_affinity(struct domain *d, const nodemask_t *affinity) +{ + /* Being affine with no nodes is just wrong */ + if ( nodes_empty(*affinity) ) + return -EINVAL; + + spin_lock(&d->node_affinity_lock); + + /* + * Being/becoming explicitly affine to all nodes is not particularly + * useful. Let''s take it as the `reset node affinity` command. 
+ */ + if ( nodes_full(*affinity) ) + { + d->auto_node_affinity = 1; + goto out; + } + + d->auto_node_affinity = 0; + d->node_affinity = *affinity; + +out: + spin_unlock(&d->node_affinity_lock); + + domain_update_node_affinity(d); + + return 0; +} + + struct domain *get_domain_by_id(domid_t dom) { struct domain *d; diff --git a/xen/common/domctl.c b/xen/common/domctl.c --- a/xen/common/domctl.c +++ b/xen/common/domctl.c @@ -560,6 +560,23 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe } break; + case XEN_DOMCTL_setnodeaffinity: + { + nodemask_t new_affinity; + + ret = xenctl_bitmap_to_nodemask(&new_affinity, + &op->u.nodeaffinity.nodemap); + if ( !ret ) + ret = domain_set_node_affinity(d, &new_affinity); + } + break; + case XEN_DOMCTL_getnodeaffinity: + { + ret = nodemask_to_xenctl_bitmap(&op->u.nodeaffinity.nodemap, + &d->node_affinity); + } + break; + case XEN_DOMCTL_setvcpuaffinity: case XEN_DOMCTL_getvcpuaffinity: { diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c --- a/xen/common/keyhandler.c +++ b/xen/common/keyhandler.c @@ -218,6 +218,14 @@ static void cpuset_print(char *set, int *set++ = ''\0''; } +static void nodeset_print(char *set, int size, const nodemask_t *mask) +{ + *set++ = ''[''; + set += nodelist_scnprintf(set, size-2, mask); + *set++ = '']''; + *set++ = ''\0''; +} + static void periodic_timer_print(char *str, int size, uint64_t period) { if ( period == 0 ) @@ -273,6 +281,9 @@ static void dump_domains(unsigned char k dump_pageframe_info(d); + nodeset_print(tmpstr, sizeof(tmpstr), &d->node_affinity); + printk("NODE affinity for domain %d: %s\n", d->domain_id, tmpstr); + printk("VCPU information and callbacks for domain %u:\n", d->domain_id); for_each_vcpu ( d, v ) diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c --- a/xen/common/sched_credit.c +++ b/xen/common/sched_credit.c @@ -261,17 +261,50 @@ static inline void list_del_init(&svc->runq_elem); } +/* + * Translates node-affinity mask into a cpumask, so that we can use it during + * actual scheduling. That of course will contain all the cpus from all the + * set nodes in the original node-affinity mask. + * + * Note that any serialization needed to access mask safely is complete + * responsibility of the caller of this function/hook. + */ +static void csched_set_node_affinity( + const struct scheduler *ops, + struct domain *d, + nodemask_t *mask) +{ + struct csched_dom *sdom; + int node; + + /* Skip idle domain since it doesn''t even have a node_affinity_cpumask */ + if ( unlikely(is_idle_domain(d)) ) + return; + + sdom = CSCHED_DOM(d); + cpumask_clear(sdom->node_affinity_cpumask); + for_each_node_mask( node, *mask ) + cpumask_or(sdom->node_affinity_cpumask, sdom->node_affinity_cpumask, + &node_to_cpumask(node)); +} + #define for_each_csched_balance_step(step) \ for ( (step) = 0; (step) <= CSCHED_BALANCE_CPU_AFFINITY; (step)++ ) /* * vcpu-affinity balancing is always necessary and must never be skipped. - * OTOH, if a domain''s node-affinity spans all the nodes, we can safely - * avoid dealing with node-affinity entirely. + * OTOH, if a domain''s node-affinity is said to be automatically computed + * (or if it just spans all the nodes), we can safely avoid dealing with + * node-affinity entirely. Ah, node-affinity is also deemed meaningless + * in case it has empty intersection with the vcpu''s vcpu-affinity, as it + * would mean trying to schedule it on _no_ pcpu! 
*/ -#define __vcpu_has_node_affinity(vc) \ - ( !cpumask_full(CSCHED_DOM(vc->domain)->node_affinity_cpumask) ) +#define __vcpu_has_node_affinity(vc) \ + ( !(cpumask_full(CSCHED_DOM(vc->domain)->node_affinity_cpumask) \ + || !cpumask_intersects(vc->cpu_affinity, \ + CSCHED_DOM(vc->domain)->node_affinity_cpumask) \ + || vc->domain->auto_node_affinity == 1) ) /* * Each csched-balance step uses its own cpumask. This function determines @@ -284,8 +317,13 @@ static void csched_balance_cpumask(const struct vcpu *vc, int step, cpumask_t *mask) { if ( step == CSCHED_BALANCE_NODE_AFFINITY ) + { cpumask_and(mask, CSCHED_DOM(vc->domain)->node_affinity_cpumask, vc->cpu_affinity); + + if ( unlikely(cpumask_empty(mask)) ) + cpumask_copy(mask, vc->cpu_affinity); + } else /* step == CSCHED_BALANCE_CPU_AFFINITY */ cpumask_copy(mask, vc->cpu_affinity); } @@ -1898,6 +1936,8 @@ const struct scheduler sched_credit_def .adjust = csched_dom_cntl, .adjust_global = csched_sys_cntl, + .set_node_affinity = csched_set_node_affinity, + .pick_cpu = csched_cpu_pick, .do_schedule = csched_schedule, diff --git a/xen/common/schedule.c b/xen/common/schedule.c --- a/xen/common/schedule.c +++ b/xen/common/schedule.c @@ -638,6 +638,11 @@ int cpu_disable_scheduler(unsigned int c return ret; } +void sched_set_node_affinity(struct domain *d, nodemask_t *mask) +{ + SCHED_OP(DOM2OP(d), set_node_affinity, d, mask); +} + int vcpu_set_affinity(struct vcpu *v, const cpumask_t *affinity) { cpumask_t online_affinity; diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h --- a/xen/include/public/domctl.h +++ b/xen/include/public/domctl.h @@ -280,6 +280,16 @@ typedef struct xen_domctl_getvcpuinfo xe DEFINE_XEN_GUEST_HANDLE(xen_domctl_getvcpuinfo_t); +/* Get/set the NUMA node(s) with which the guest has affinity with. */ +/* XEN_DOMCTL_setnodeaffinity */ +/* XEN_DOMCTL_getnodeaffinity */ +struct xen_domctl_nodeaffinity { + struct xenctl_bitmap nodemap;/* IN */ +}; +typedef struct xen_domctl_nodeaffinity xen_domctl_nodeaffinity_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_nodeaffinity_t); + + /* Get/set which physical cpus a vcpu can execute on. */ /* XEN_DOMCTL_setvcpuaffinity */ /* XEN_DOMCTL_getvcpuaffinity */ @@ -908,6 +918,8 @@ struct xen_domctl { #define XEN_DOMCTL_audit_p2m 65 #define XEN_DOMCTL_set_virq_handler 66 #define XEN_DOMCTL_set_broken_page_p2m 67 +#define XEN_DOMCTL_setnodeaffinity 68 +#define XEN_DOMCTL_getnodeaffinity 69 #define XEN_DOMCTL_gdbsx_guestmemio 1000 #define XEN_DOMCTL_gdbsx_pausevcpu 1001 #define XEN_DOMCTL_gdbsx_unpausevcpu 1002 @@ -921,6 +933,7 @@ struct xen_domctl { struct xen_domctl_getpageframeinfo getpageframeinfo; struct xen_domctl_getpageframeinfo2 getpageframeinfo2; struct xen_domctl_getpageframeinfo3 getpageframeinfo3; + struct xen_domctl_nodeaffinity nodeaffinity; struct xen_domctl_vcpuaffinity vcpuaffinity; struct xen_domctl_shadow_op shadow_op; struct xen_domctl_max_mem max_mem; diff --git a/xen/include/xen/nodemask.h b/xen/include/xen/nodemask.h --- a/xen/include/xen/nodemask.h +++ b/xen/include/xen/nodemask.h @@ -8,8 +8,9 @@ * See detailed comments in the file linux/bitmap.h describing the * data type on which these nodemasks are based. * - * For details of nodemask_scnprintf() and nodemask_parse(), - * see bitmap_scnprintf() and bitmap_parse() in lib/bitmap.c. + * For details of nodemask_scnprintf(), nodelist_scnpintf() and + * nodemask_parse(), see bitmap_scnprintf() and bitmap_parse() + * in lib/bitmap.c. 
* * The available nodemask operations are: * @@ -50,6 +51,7 @@ * unsigned long *nodes_addr(mask) Array of unsigned long''s in mask * * int nodemask_scnprintf(buf, len, mask) Format nodemask for printing + * int nodelist_scnprintf(buf, len, mask) Format nodemask as a list for printing * int nodemask_parse(ubuf, ulen, mask) Parse ascii string as nodemask * * for_each_node_mask(node, mask) for-loop node over mask @@ -292,6 +294,14 @@ static inline int __cycle_node(int n, co #define nodes_addr(src) ((src).bits) +#define nodelist_scnprintf(buf, len, src) \ + __nodelist_scnprintf((buf), (len), (src), MAX_NUMNODES) +static inline int __nodelist_scnprintf(char *buf, int len, + const nodemask_t *srcp, int nbits) +{ + return bitmap_scnlistprintf(buf, len, srcp->bits, nbits); +} + #if 0 #define nodemask_scnprintf(buf, len, src) \ __nodemask_scnprintf((buf), (len), &(src), MAX_NUMNODES) diff --git a/xen/include/xen/sched-if.h b/xen/include/xen/sched-if.h --- a/xen/include/xen/sched-if.h +++ b/xen/include/xen/sched-if.h @@ -184,6 +184,8 @@ struct scheduler { struct xen_domctl_scheduler_op *); int (*adjust_global) (const struct scheduler *, struct xen_sysctl_scheduler_op *); + void (*set_node_affinity) (const struct scheduler *, + struct domain *, nodemask_t *); void (*dump_settings) (const struct scheduler *); void (*dump_cpu_state) (const struct scheduler *, int); diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -288,6 +288,8 @@ struct domain /* Does this guest need iommu mappings? */ bool_t need_iommu; #endif + /* is node-affinity automatically computed? */ + bool_t auto_node_affinity; /* Is this guest fully privileged (aka dom0)? */ bool_t is_privileged; /* Which guest this guest has privileges on */ @@ -365,7 +367,10 @@ struct domain /* Various mem_events */ struct mem_event_per_domain *mem_event; - /* Currently computed from union of all vcpu cpu-affinity masks. */ + /* + * Can be specified by the user. If that is not the case, it is + * computed from the union of all the vcpu cpu-affinity masks. 
+ */ nodemask_t node_affinity; unsigned int last_alloc_node; spinlock_t node_affinity_lock; @@ -435,6 +440,7 @@ static inline void get_knownalive_domain ASSERT(!(atomic_read(&d->refcnt) & DOMAIN_DESTROYED)); } +int domain_set_node_affinity(struct domain *d, const nodemask_t *affinity); void domain_update_node_affinity(struct domain *d); struct domain *domain_create( @@ -555,6 +561,7 @@ void sched_destroy_domain(struct domain int sched_move_domain(struct domain *d, struct cpupool *c); long sched_adjust(struct domain *, struct xen_domctl_scheduler_op *); long sched_adjust_global(struct xen_sysctl_scheduler_op *); +void sched_set_node_affinity(struct domain *, nodemask_t *); int sched_id(void); void sched_tick_suspend(void); void sched_tick_resume(void); diff --git a/xen/xsm/flask/hooks.c b/xen/xsm/flask/hooks.c --- a/xen/xsm/flask/hooks.c +++ b/xen/xsm/flask/hooks.c @@ -611,10 +611,12 @@ static int flask_domctl(struct domain *d return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__UNPAUSE); case XEN_DOMCTL_setvcpuaffinity: - return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__SETVCPUAFFINITY); + case XEN_DOMCTL_setnodeaffinity: + return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__SETAFFINITY); case XEN_DOMCTL_getvcpuaffinity: - return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__GETVCPUAFFINITY); + case XEN_DOMCTL_getnodeaffinity: + return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__GETAFFINITY); case XEN_DOMCTL_resumedomain: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__RESUME); diff --git a/xen/xsm/flask/policy/access_vectors b/xen/xsm/flask/policy/access_vectors --- a/xen/xsm/flask/policy/access_vectors +++ b/xen/xsm/flask/policy/access_vectors @@ -104,9 +104,11 @@ class domain # XEN_DOMCTL_destroydomain destroy # XEN_DOMCTL_setvcpuaffinity - setvcpuaffinity +# XEN_DOMCTL_setnodeaffinity + setaffinity # XEN_DOMCTL_getvcpuaffinity - getvcpuaffinity +# XEN_DOMCTL_getnodeaffinity + getaffinity # XEN_DOMCTL_scheduler_op with XEN_DOMCTL_SCHEDOP_getinfo getscheduler # XEN_DOMCTL_getdomaininfo, XEN_SYSCTL_getdomaininfolist
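To make the two-step balancing logic introduced above easier to follow, here is a minimal standalone sketch of the idea; this is not the Xen code (which operates on cpumask_t and the csched data structures), and the use of plain 64-bit integers as toy bitmasks, as well as all the names, are illustrative assumptions.

#include <stdint.h>
#include <stdio.h>

/* Toy stand-ins for Xen's cpumask_t operations; the real code uses
 * cpumask_and(), cpumask_empty() and cpumask_copy(). */
enum balance_step { BALANCE_NODE_AFFINITY, BALANCE_CPU_AFFINITY };

/* Compute the mask used by one balancing step (illustrative only). */
static uint64_t balance_mask(enum balance_step step,
                             uint64_t cpu_affinity,
                             uint64_t node_affinity_cpus)
{
    if (step == BALANCE_NODE_AFFINITY) {
        uint64_t mask = cpu_affinity & node_affinity_cpus;
        /* Fall back to plain CPU affinity if the intersection is empty,
         * mirroring the cpumask_empty() check in the patch. */
        return mask ? mask : cpu_affinity;
    }
    return cpu_affinity; /* BALANCE_CPU_AFFINITY */
}

int main(void)
{
    uint64_t cpu_aff  = 0xf0;   /* vCPU may run on pCPUs 4-7      */
    uint64_t node_aff = 0x0f;   /* node affinity spans pCPUs 0-3  */

    printf("node step: %#llx\n",
           (unsigned long long)balance_mask(BALANCE_NODE_AFFINITY, cpu_aff, node_aff));
    printf("cpu  step: %#llx\n",
           (unsigned long long)balance_mask(BALANCE_CPU_AFFINITY, cpu_aff, node_aff));
    return 0;
}

The fallback branch corresponds to the cpumask_empty() check added to csched_balance_cpumask(): the node-affinity balancing step can never hand the scheduler an empty mask.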
Dario Faggioli
2013-Apr-17 10:57 UTC
[PATCH 06 of 11 v6] libxc: allow for explicitly specifying node-affinity
By providing the proper get/set interface and wiring them to the new domctl-s from the previous commit. Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com> Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com> Acked-by: George Dunlap <george.dunlap@eu.citrix.com> diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c --- a/tools/libxc/xc_domain.c +++ b/tools/libxc/xc_domain.c @@ -110,6 +110,83 @@ int xc_domain_shutdown(xc_interface *xch } +int xc_domain_node_setaffinity(xc_interface *xch, + uint32_t domid, + xc_nodemap_t nodemap) +{ + DECLARE_DOMCTL; + DECLARE_HYPERCALL_BUFFER(uint8_t, local); + int ret = -1; + int nodesize; + + nodesize = xc_get_nodemap_size(xch); + if (!nodesize) + { + PERROR("Could not get number of nodes"); + goto out; + } + + local = xc_hypercall_buffer_alloc(xch, local, nodesize); + if ( local == NULL ) + { + PERROR("Could not allocate memory for setnodeaffinity domctl hypercall"); + goto out; + } + + domctl.cmd = XEN_DOMCTL_setnodeaffinity; + domctl.domain = (domid_t)domid; + + memcpy(local, nodemap, nodesize); + set_xen_guest_handle(domctl.u.nodeaffinity.nodemap.bitmap, local); + domctl.u.nodeaffinity.nodemap.nr_bits = nodesize * 8; + + ret = do_domctl(xch, &domctl); + + xc_hypercall_buffer_free(xch, local); + + out: + return ret; +} + +int xc_domain_node_getaffinity(xc_interface *xch, + uint32_t domid, + xc_nodemap_t nodemap) +{ + DECLARE_DOMCTL; + DECLARE_HYPERCALL_BUFFER(uint8_t, local); + int ret = -1; + int nodesize; + + nodesize = xc_get_nodemap_size(xch); + if (!nodesize) + { + PERROR("Could not get number of nodes"); + goto out; + } + + local = xc_hypercall_buffer_alloc(xch, local, nodesize); + if ( local == NULL ) + { + PERROR("Could not allocate memory for getnodeaffinity domctl hypercall"); + goto out; + } + + domctl.cmd = XEN_DOMCTL_getnodeaffinity; + domctl.domain = (domid_t)domid; + + set_xen_guest_handle(domctl.u.nodeaffinity.nodemap.bitmap, local); + domctl.u.nodeaffinity.nodemap.nr_bits = nodesize * 8; + + ret = do_domctl(xch, &domctl); + + memcpy(nodemap, local, nodesize); + + xc_hypercall_buffer_free(xch, local); + + out: + return ret; +} + int xc_vcpu_setaffinity(xc_interface *xch, uint32_t domid, int vcpu, diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h --- a/tools/libxc/xenctrl.h +++ b/tools/libxc/xenctrl.h @@ -526,6 +526,32 @@ int xc_watchdog(xc_interface *xch, uint32_t id, uint32_t timeout); +/** + * This function explicitly sets the host NUMA nodes the domain will + * have affinity with. + * + * @parm xch a handle to an open hypervisor interface. + * @parm domid the domain id one wants to set the affinity of. + * @parm nodemap the map of the affine nodes. + * @return 0 on success, -1 on failure. + */ +int xc_domain_node_setaffinity(xc_interface *xch, + uint32_t domind, + xc_nodemap_t nodemap); + +/** + * This function retrieves the host NUMA nodes the domain has + * affinity with. + * + * @parm xch a handle to an open hypervisor interface. + * @parm domid the domain id one wants to get the node affinity of. + * @parm nodemap the map of the affine nodes. + * @return 0 on success, -1 on failure. + */ +int xc_domain_node_getaffinity(xc_interface *xch, + uint32_t domind, + xc_nodemap_t nodemap); + int xc_vcpu_setaffinity(xc_interface *xch, uint32_t domid, int vcpu,
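As a usage illustration (not part of the patch), a caller could restrict a domain to NUMA node 0 and then read the affinity back roughly as follows. The buffer sizing mirrors the xc_get_nodemap_size() query used inside the patch itself; the hard-coded domain id is an assumption and error handling is intentionally minimal.

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <xenctrl.h>

int main(int argc, char **argv)
{
    xc_interface *xch = xc_interface_open(NULL, NULL, 0);
    uint32_t domid = argc > 1 ? atoi(argv[1]) : 1;   /* assumed domid */
    xc_nodemap_t nodemap = NULL;
    int nodesize, rc = 1;

    if (!xch)
        return 1;

    nodesize = xc_get_nodemap_size(xch);
    nodemap = calloc(nodesize, 1);
    if (!nodesize || !nodemap)
        goto out;

    nodemap[0] = 0x01;                               /* bit 0 => NUMA node 0 */
    rc = xc_domain_node_setaffinity(xch, domid, nodemap);
    if (rc)
        goto out;

    memset(nodemap, 0, nodesize);
    rc = xc_domain_node_getaffinity(xch, domid, nodemap);
    if (!rc)
        printf("node affinity, first byte: 0x%02x\n", nodemap[0]);

 out:
    free(nodemap);
    xc_interface_close(xch);
    return rc;
}

As in the patch, the nodemap is a plain byte array with one bit per NUMA node (nr_bits = nodesize * 8).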
Dario Faggioli
2013-Apr-17 10:57 UTC
[PATCH 07 of 11 v6] libxl: allow for explicitly specifying node-affinity
By introducing a nodemap in libxl_domain_build_info and providing the get/set methods to deal with it. Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com> Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com> Acked-by: George Dunlap <george.dunlap@eu.citrix.com> Acked-by: Ian Campbell <ian.campbell@citrix.com> --- Changes from v4: * LIBXL_HAVE_DOMAIN_NODEAFFINITY moved, as suggested during review. diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c --- a/tools/libxl/libxl.c +++ b/tools/libxl/libxl.c @@ -4183,6 +4183,26 @@ int libxl_set_vcpuaffinity_all(libxl_ctx return rc; } +int libxl_domain_set_nodeaffinity(libxl_ctx *ctx, uint32_t domid, + libxl_bitmap *nodemap) +{ + if (xc_domain_node_setaffinity(ctx->xch, domid, nodemap->map)) { + LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "setting node affinity"); + return ERROR_FAIL; + } + return 0; +} + +int libxl_domain_get_nodeaffinity(libxl_ctx *ctx, uint32_t domid, + libxl_bitmap *nodemap) +{ + if (xc_domain_node_getaffinity(ctx->xch, domid, nodemap->map)) { + LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "getting node affinity"); + return ERROR_FAIL; + } + return 0; +} + int libxl_set_vcpuonline(libxl_ctx *ctx, uint32_t domid, libxl_bitmap *cpumap) { GC_INIT(ctx); diff --git a/tools/libxl/libxl.h b/tools/libxl/libxl.h --- a/tools/libxl/libxl.h +++ b/tools/libxl/libxl.h @@ -75,6 +75,13 @@ #define LIBXL_HAVE_FIRMWARE_PASSTHROUGH 1 /* + * LIBXL_HAVE_DOMAIN_NODEAFFINITY indicates that a ''nodemap'' field + * (of libxl_bitmap type) is present in libxl_domain_build_info, + * containing the node-affinity for the domain. + */ +#define LIBXL_HAVE_DOMAIN_NODEAFFINITY 1 + +/* * libxl ABI compatibility * * The only guarantee which libxl makes regarding ABI compatibility @@ -884,6 +891,10 @@ int libxl_set_vcpuaffinity(libxl_ctx *ct libxl_bitmap *cpumap); int libxl_set_vcpuaffinity_all(libxl_ctx *ctx, uint32_t domid, unsigned int max_vcpus, libxl_bitmap *cpumap); +int libxl_domain_set_nodeaffinity(libxl_ctx *ctx, uint32_t domid, + libxl_bitmap *nodemap); +int libxl_domain_get_nodeaffinity(libxl_ctx *ctx, uint32_t domid, + libxl_bitmap *nodemap); int libxl_set_vcpuonline(libxl_ctx *ctx, uint32_t domid, libxl_bitmap *cpumap); libxl_scheduler libxl_get_scheduler(libxl_ctx *ctx); diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c --- a/tools/libxl/libxl_create.c +++ b/tools/libxl/libxl_create.c @@ -191,6 +191,12 @@ int libxl__domain_build_info_setdefault( libxl_defbool_setdefault(&b_info->numa_placement, true); + if (!b_info->nodemap.size) { + if (libxl_node_bitmap_alloc(CTX, &b_info->nodemap, 0)) + return ERROR_FAIL; + libxl_bitmap_set_any(&b_info->nodemap); + } + if (b_info->max_memkb == LIBXL_MEMKB_DEFAULT) b_info->max_memkb = 32 * 1024; if (b_info->target_memkb == LIBXL_MEMKB_DEFAULT) diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c --- a/tools/libxl/libxl_dom.c +++ b/tools/libxl/libxl_dom.c @@ -230,6 +230,7 @@ int libxl__build_pre(libxl__gc *gc, uint if (rc) return rc; } + libxl_domain_set_nodeaffinity(ctx, domid, &info->nodemap); libxl_set_vcpuaffinity_all(ctx, domid, info->max_vcpus, &info->cpumap); xc_domain_setmaxmem(ctx->xch, domid, info->target_memkb + LIBXL_MAXMEM_CONSTANT); diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl --- a/tools/libxl/libxl_types.idl +++ b/tools/libxl/libxl_types.idl @@ -263,6 +263,7 @@ libxl_domain_build_info = Struct("domain ("max_vcpus", integer), ("avail_vcpus", libxl_bitmap), ("cpumap", libxl_bitmap), + ("nodemap", libxl_bitmap), ("numa_placement", libxl_defbool), 
("tsc_mode", libxl_tsc_mode), ("max_memkb", MemKB),
Dario Faggioli
2013-Apr-17 10:57 UTC
[PATCH 08 of 11 v6] libxl: optimize the calculation of how many VCPUs can run on a candidate
For choosing the best NUMA placement candidate, we need to figure out how many VCPUs are runnable on each of them. That requires going through all the VCPUs of all the domains and check their affinities. With this change, instead of doing the above for each candidate, we do it once for all, populating an array while counting. This way, when we later are evaluating candidates, all we need is summing up the right elements of the array itself. This reduces the complexity of the overall algorithm, as it moves a potentially expensive operation (for_each_vcpu_of_each_domain {}) outside from the core placement loop, so that it is performed only once instead of (potentially) tens or hundreds of times. More specifically, we go from a worst case computation time complaxity of: O(2^n_nodes) * O(n_domains*n_domain_vcpus) To, with this change: O(n_domains*n_domains_vcpus) + O(2^n_nodes) = O(2^n_nodes) (with n_nodes<=16, otherwise the algorithm suggests partitioning with cpupools and does not even start.) Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com> Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com> Acked-by: George Dunlap <george.dunlap@eu.citrix.com> Acked-by: Ian Campbell <ian.campbell@citrix.com> --- Changes from v2: * in nr_vcpus_on_nodes(), vcpu_nodemap renamed to something more sensible, as suggested during review. diff --git a/tools/libxl/libxl_numa.c b/tools/libxl/libxl_numa.c --- a/tools/libxl/libxl_numa.c +++ b/tools/libxl/libxl_numa.c @@ -165,22 +165,34 @@ static uint32_t nodemap_to_free_memkb(li return free_memkb; } -/* Retrieve the number of vcpus able to run on the cpus of the nodes - * that are part of the nodemap. */ -static int nodemap_to_nr_vcpus(libxl__gc *gc, libxl_cputopology *tinfo, +/* Retrieve the number of vcpus able to run on the nodes in nodemap */ +static int nodemap_to_nr_vcpus(libxl__gc *gc, int vcpus_on_node[], const libxl_bitmap *nodemap) { + int i, nr_vcpus = 0; + + libxl_for_each_set_bit(i, *nodemap) + nr_vcpus += vcpus_on_node[i]; + + return nr_vcpus; +} + +/* Number of vcpus able to run on the cpus of the various nodes + * (reported by filling the array vcpus_on_node[]). */ +static int nr_vcpus_on_nodes(libxl__gc *gc, libxl_cputopology *tinfo, + const libxl_bitmap *suitable_cpumap, + int vcpus_on_node[]) +{ libxl_dominfo *dinfo = NULL; - libxl_bitmap vcpu_nodemap; + libxl_bitmap nodes_counted; int nr_doms, nr_cpus; - int nr_vcpus = 0; int i, j, k; dinfo = libxl_list_domain(CTX, &nr_doms); if (dinfo == NULL) return ERROR_FAIL; - if (libxl_node_bitmap_alloc(CTX, &vcpu_nodemap, 0) < 0) { + if (libxl_node_bitmap_alloc(CTX, &nodes_counted, 0) < 0) { libxl_dominfo_list_free(dinfo, nr_doms); return ERROR_FAIL; } @@ -193,19 +205,17 @@ static int nodemap_to_nr_vcpus(libxl__gc if (vinfo == NULL) continue; - /* For each vcpu of each domain ... 
*/ for (j = 0; j < nr_dom_vcpus; j++) { + /* For each vcpu of each domain, increment the elements of + * the array corresponding to the nodes where the vcpu runs */ + libxl_bitmap_set_none(&nodes_counted); + libxl_for_each_set_bit(k, vinfo[j].cpumap) { + int node = tinfo[k].node; - /* Build up a map telling on which nodes the vcpu is runnable on */ - libxl_bitmap_set_none(&vcpu_nodemap); - libxl_for_each_set_bit(k, vinfo[j].cpumap) - libxl_bitmap_set(&vcpu_nodemap, tinfo[k].node); - - /* And check if that map has any intersection with our nodemap */ - libxl_for_each_set_bit(k, vcpu_nodemap) { - if (libxl_bitmap_test(nodemap, k)) { - nr_vcpus++; - break; + if (libxl_bitmap_test(suitable_cpumap, k) && + !libxl_bitmap_test(&nodes_counted, node)) { + libxl_bitmap_set(&nodes_counted, node); + vcpus_on_node[node]++; } } } @@ -213,9 +223,9 @@ static int nodemap_to_nr_vcpus(libxl__gc libxl_vcpuinfo_list_free(vinfo, nr_dom_vcpus); } - libxl_bitmap_dispose(&vcpu_nodemap); + libxl_bitmap_dispose(&nodes_counted); libxl_dominfo_list_free(dinfo, nr_doms); - return nr_vcpus; + return 0; } /* @@ -270,7 +280,7 @@ int libxl__get_numa_candidate(libxl__gc libxl_numainfo *ninfo = NULL; int nr_nodes = 0, nr_suit_nodes, nr_cpus = 0; libxl_bitmap suitable_nodemap, nodemap; - int rc = 0; + int *vcpus_on_node, rc = 0; libxl_bitmap_init(&nodemap); libxl_bitmap_init(&suitable_nodemap); @@ -281,6 +291,8 @@ int libxl__get_numa_candidate(libxl__gc if (ninfo == NULL) return ERROR_FAIL; + GCNEW_ARRAY(vcpus_on_node, nr_nodes); + /* * The good thing about this solution is that it is based on heuristics * (implemented in numa_cmpf() ), but we at least can evaluate it on @@ -330,6 +342,19 @@ int libxl__get_numa_candidate(libxl__gc goto out; /* + * Later on, we will try to figure out how many vcpus are runnable on + * each candidate (as a part of choosing the best one of them). That + * requires going through all the vcpus of all the domains and check + * their affinities. So, instead of doing that for each candidate, + * let''s count here the number of vcpus runnable on each node, so that + * all we have to do later is summing up the right elements of the + * vcpus_on_node array. + */ + rc = nr_vcpus_on_nodes(gc, tinfo, suitable_cpumap, vcpus_on_node); + if (rc) + goto out; + + /* * If the minimum number of NUMA nodes is not explicitly specified * (i.e., min_nodes == 0), we try to figure out a sensible number of nodes * from where to start generating candidates, if possible (or just start @@ -414,7 +439,8 @@ int libxl__get_numa_candidate(libxl__gc * current best one (if any). */ libxl__numa_candidate_put_nodemap(gc, &new_cndt, &nodemap); - new_cndt.nr_vcpus = nodemap_to_nr_vcpus(gc, tinfo, &nodemap); + new_cndt.nr_vcpus = nodemap_to_nr_vcpus(gc, vcpus_on_node, + &nodemap); new_cndt.free_memkb = nodes_free_memkb; new_cndt.nr_nodes = libxl_bitmap_count_set(&nodemap); new_cndt.nr_cpus = nodes_cpus;
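The complexity argument can be condensed into a toy model (illustrative only, with made-up per-node counts): the per-node vCPU totals are computed once, and evaluating any placement candidate then reduces to summing the entries selected by its nodemap.

#include <stdio.h>

#define NR_NODES 4

/* Phase 1 (done once): count runnable vcpus per node. This stands in for
 * nr_vcpus_on_nodes(); here the counts are just sample data. */
static void count_vcpus(int vcpus_on_node[NR_NODES])
{
    const int sample[NR_NODES] = { 8, 3, 5, 0 };
    for (int i = 0; i < NR_NODES; i++)
        vcpus_on_node[i] = sample[i];
}

/* Phase 2 (done per candidate): sum the precomputed counts for the nodes
 * in the candidate's nodemap. This stands in for nodemap_to_nr_vcpus(). */
static int candidate_nr_vcpus(const int vcpus_on_node[NR_NODES],
                              unsigned int nodemap)
{
    int nr = 0;
    for (int i = 0; i < NR_NODES; i++)
        if (nodemap & (1u << i))
            nr += vcpus_on_node[i];
    return nr;
}

int main(void)
{
    int vcpus_on_node[NR_NODES];

    count_vcpus(vcpus_on_node);
    /* Candidate spanning nodes 0 and 2. */
    printf("candidate {0,2}: %d vcpus\n",
           candidate_nr_vcpus(vcpus_on_node, 0x5));
    return 0;
}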
Dario Faggioli
2013-Apr-17 10:57 UTC
[PATCH 09 of 11 v6] libxl: automatic placement deals with node-affinity
Which basically means the following two things: 1) during domain creation, it is the node-affinity of the domain --rather than the vcpu-affinities of its VCPUs-- that is affected by automatic placement; 2) during automatic placement, when counting how many VCPUs are already "bound" to a placement candidate (as part of the process of choosing the best candidate), both vcpu-affinity and node-affinity are considered. Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com> Acked-by: George Dunlap <george.dunlap@eu.citrix.com> Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com> diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c --- a/tools/libxl/libxl_dom.c +++ b/tools/libxl/libxl_dom.c @@ -134,13 +134,13 @@ static int numa_place_domain(libxl__gc * { int found; libxl__numa_candidate candidate; - libxl_bitmap candidate_nodemap; + libxl_bitmap cpupool_nodemap; libxl_cpupoolinfo cpupool_info; int i, cpupool, rc = 0; uint32_t memkb; libxl__numa_candidate_init(&candidate); - libxl_bitmap_init(&candidate_nodemap); + libxl_bitmap_init(&cpupool_nodemap); /* * Extract the cpumap from the cpupool the domain belong to. In fact, @@ -157,7 +157,7 @@ static int numa_place_domain(libxl__gc * rc = libxl_domain_need_memory(CTX, info, &memkb); if (rc) goto out; - if (libxl_node_bitmap_alloc(CTX, &candidate_nodemap, 0)) { + if (libxl_node_bitmap_alloc(CTX, &cpupool_nodemap, 0)) { rc = ERROR_FAIL; goto out; } @@ -175,17 +175,19 @@ static int numa_place_domain(libxl__gc * if (found == 0) goto out; - /* Map the candidate''s node map to the domain''s info->cpumap */ - libxl__numa_candidate_get_nodemap(gc, &candidate, &candidate_nodemap); - rc = libxl_nodemap_to_cpumap(CTX, &candidate_nodemap, &info->cpumap); + /* Map the candidate''s node map to the domain''s info->nodemap */ + libxl__numa_candidate_get_nodemap(gc, &candidate, &info->nodemap); + + /* Avoid trying to set the affinity to nodes that might be in the + * candidate''s nodemap but out of our cpupool. */ + rc = libxl_cpumap_to_nodemap(CTX, &cpupool_info.cpumap, + &cpupool_nodemap); if (rc) goto out; - /* Avoid trying to set the affinity to cpus that might be in the - * nodemap but not in our cpupool. */ - libxl_for_each_set_bit(i, info->cpumap) { - if (!libxl_bitmap_test(&cpupool_info.cpumap, i)) - libxl_bitmap_reset(&info->cpumap, i); + libxl_for_each_set_bit(i, info->nodemap) { + if (!libxl_bitmap_test(&cpupool_nodemap, i)) + libxl_bitmap_reset(&info->nodemap, i); } LOG(DETAIL, "NUMA placement candidate with %d nodes, %d cpus and " @@ -194,7 +196,7 @@ static int numa_place_domain(libxl__gc * out: libxl__numa_candidate_dispose(&candidate); - libxl_bitmap_dispose(&candidate_nodemap); + libxl_bitmap_dispose(&cpupool_nodemap); libxl_cpupoolinfo_dispose(&cpupool_info); return rc; } @@ -212,10 +214,10 @@ int libxl__build_pre(libxl__gc *gc, uint /* * Check if the domain has any CPU affinity. If not, try to build * up one. In case numa_place_domain() find at least a suitable - * candidate, it will affect info->cpumap accordingly; if it + * candidate, it will affect info->nodemap accordingly; if it * does not, it just leaves it as it is. This means (unless * some weird error manifests) the subsequent call to - * libxl_set_vcpuaffinity_all() will do the actual placement, + * libxl_domain_set_nodeaffinity() will do the actual placement, * whatever that turns out to be. 
*/ if (libxl_defbool_val(info->numa_placement)) { diff --git a/tools/libxl/libxl_numa.c b/tools/libxl/libxl_numa.c --- a/tools/libxl/libxl_numa.c +++ b/tools/libxl/libxl_numa.c @@ -184,7 +184,7 @@ static int nr_vcpus_on_nodes(libxl__gc * int vcpus_on_node[]) { libxl_dominfo *dinfo = NULL; - libxl_bitmap nodes_counted; + libxl_bitmap dom_nodemap, nodes_counted; int nr_doms, nr_cpus; int i, j, k; @@ -197,6 +197,12 @@ static int nr_vcpus_on_nodes(libxl__gc * return ERROR_FAIL; } + if (libxl_node_bitmap_alloc(CTX, &dom_nodemap, 0) < 0) { + libxl_bitmap_dispose(&nodes_counted); + libxl_dominfo_list_free(dinfo, nr_doms); + return ERROR_FAIL; + } + for (i = 0; i < nr_doms; i++) { libxl_vcpuinfo *vinfo; int nr_dom_vcpus; @@ -205,14 +211,21 @@ static int nr_vcpus_on_nodes(libxl__gc * if (vinfo == NULL) continue; + /* Retrieve the domain''s node-affinity map */ + libxl_domain_get_nodeaffinity(CTX, dinfo[i].domid, &dom_nodemap); + for (j = 0; j < nr_dom_vcpus; j++) { - /* For each vcpu of each domain, increment the elements of - * the array corresponding to the nodes where the vcpu runs */ + /* + * For each vcpu of each domain, it must have both vcpu-affinity + * and node-affinity to (a pcpu belonging to) a certain node to + * cause an increment in the corresponding element of the array. + */ libxl_bitmap_set_none(&nodes_counted); libxl_for_each_set_bit(k, vinfo[j].cpumap) { int node = tinfo[k].node; if (libxl_bitmap_test(suitable_cpumap, k) && + libxl_bitmap_test(&dom_nodemap, node) && !libxl_bitmap_test(&nodes_counted, node)) { libxl_bitmap_set(&nodes_counted, node); vcpus_on_node[node]++; @@ -223,6 +236,7 @@ static int nr_vcpus_on_nodes(libxl__gc * libxl_vcpuinfo_list_free(vinfo, nr_dom_vcpus); } + libxl_bitmap_dispose(&dom_nodemap); libxl_bitmap_dispose(&nodes_counted); libxl_dominfo_list_free(dinfo, nr_doms); return 0;
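The accounting rule of this patch (a vCPU is attributed to a node only if both its vcpu-affinity and its domain's node-affinity reach that node, and at most once per node) can be seen in the following toy model; it is illustrative only, and the names and plain-int bitmasks are made up.

#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 2
#define NR_PCPUS 4

/* Account one vCPU: it contributes to node n only if it has vcpu-affinity
 * with a suitable pCPU of n and its domain has node-affinity with n. */
static void account_vcpu(unsigned int vcpu_cpumap, unsigned int dom_nodemap,
                         const int pcpu_to_node[NR_PCPUS],
                         int vcpus_on_node[NR_NODES])
{
    bool counted[NR_NODES] = { false };

    for (int cpu = 0; cpu < NR_PCPUS; cpu++) {
        int node = pcpu_to_node[cpu];
        if ((vcpu_cpumap & (1u << cpu)) &&
            (dom_nodemap & (1u << node)) &&
            !counted[node]) {
            counted[node] = true;
            vcpus_on_node[node]++;
        }
    }
}

int main(void)
{
    const int pcpu_to_node[NR_PCPUS] = { 0, 0, 1, 1 };
    int vcpus_on_node[NR_NODES] = { 0, 0 };

    /* vCPU runnable on pCPUs 1-2, domain node-affinity = node 1 only:
     * it is accounted to node 1 but not to node 0. */
    account_vcpu(0x6 /* pCPUs 1,2 */, 0x2 /* node 1 */,
                 pcpu_to_node, vcpus_on_node);

    printf("node0=%d node1=%d\n", vcpus_on_node[0], vcpus_on_node[1]);
    return 0;
}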
Dario Faggioli
2013-Apr-17 10:57 UTC
[PATCH 10 of 11 v6] xl: add node-affinity to the output of `xl list`
Node-affinity is now something that is under (some) control of the user, so show it upon request as part of the output of `xl list'' by the `-n'' option. Re the patch, the print_bitmap() related hunk is _mostly_ code motion, although there is a very minor change in the code, basically to allow using the function for printing both cpu and node bitmaps (as, in case all bits are sets, it used to print "any cpu", which doesn''t fit the nodemap case). Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com> Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com> Acked-by: Ian Campbell <ian.campbell@citrix.com> --- Changes from v4: * explicit spaces converted to " %16s" in list_domain(), as suggested during review. Changes from v3: * removed the pastebin.com link from the changelog. Will reply to this same mail with another one with the output of various invocations of `xl list'' with different options to show how it looks like. Changes from v1: * print_{cpu,node}map() functions added instead of ''state variable''-izing print_bitmap(). diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c --- a/tools/libxl/xl_cmdimpl.c +++ b/tools/libxl/xl_cmdimpl.c @@ -3061,16 +3061,97 @@ out: } } -static void list_domains(int verbose, int context, int claim, +/* If map is not full, prints it and returns 0. Returns 1 otherwise. */ +static int print_bitmap(uint8_t *map, int maplen, FILE *stream) +{ + int i; + uint8_t pmap = 0, bitmask = 0; + int firstset = 0, state = 0; + + for (i = 0; i < maplen; i++) { + if (i % 8 == 0) { + pmap = *map++; + bitmask = 1; + } else bitmask <<= 1; + + switch (state) { + case 0: + case 2: + if ((pmap & bitmask) != 0) { + firstset = i; + state++; + } + continue; + case 1: + case 3: + if ((pmap & bitmask) == 0) { + fprintf(stream, "%s%d", state > 1 ? "," : "", firstset); + if (i - 1 > firstset) + fprintf(stream, "-%d", i - 1); + state = 2; + } + continue; + } + } + switch (state) { + case 0: + fprintf(stream, "none"); + break; + case 2: + break; + case 1: + if (firstset == 0) + return 1; + case 3: + fprintf(stream, "%s%d", state > 1 ? 
"," : "", firstset); + if (i - 1 > firstset) + fprintf(stream, "-%d", i - 1); + break; + } + + return 0; +} + +static void print_cpumap(uint8_t *map, int maplen, FILE *stream) +{ + if (print_bitmap(map, maplen, stream)) + fprintf(stream, "any cpu"); +} + +static void print_nodemap(uint8_t *map, int maplen, FILE *stream) +{ + if (print_bitmap(map, maplen, stream)) + fprintf(stream, "any node"); +} + +static void list_domains(int verbose, int context, int claim, int numa, const libxl_dominfo *info, int nb_domain) { int i; static const char shutdown_reason_letters[]= "-rscw"; + libxl_bitmap nodemap; + libxl_physinfo physinfo; + + libxl_bitmap_init(&nodemap); + libxl_physinfo_init(&physinfo); printf("Name ID Mem VCPUs\tState\tTime(s)"); if (verbose) printf(" UUID Reason-Code\tSecurity Label"); if (context && !verbose) printf(" Security Label"); if (claim) printf(" Claimed"); + if (numa) { + if (libxl_node_bitmap_alloc(ctx, &nodemap, 0)) { + fprintf(stderr, "libxl_node_bitmap_alloc_failed.\n"); + exit(1); + } + if (libxl_get_physinfo(ctx, &physinfo) != 0) { + fprintf(stderr, "libxl_physinfo failed.\n"); + libxl_bitmap_dispose(&nodemap); + exit(1); + } + + printf(" NODE Affinity"); + } printf("\n"); for (i = 0; i < nb_domain; i++) { char *domname; @@ -3103,18 +3184,23 @@ static void list_domains(int verbose, in if (verbose || context) { int rc; size_t size; - char *buf; + char *buf = NULL; rc = libxl_flask_sid_to_context(ctx, info[i].ssidref, &buf, &size); - if (rc < 0) - printf(" -"); - else { - printf(" %s", buf); - free(buf); - } + printf(" %16s", rc < 0 ? "-" : buf); + free(buf); + } + if (numa) { + libxl_domain_get_nodeaffinity(ctx, info[i].domid, &nodemap); + + putchar('' ''); + print_nodemap(nodemap.map, physinfo.nr_nodes, stdout); } putchar(''\n''); } + + libxl_bitmap_dispose(&nodemap); + libxl_physinfo_dispose(&physinfo); } static void list_vm(void) @@ -3981,10 +4067,12 @@ int main_list(int argc, char **argv) int opt, verbose = 0; int context = 0; int details = 0; + int numa = 0; static struct option opts[] = { {"long", 0, 0, ''l''}, {"verbose", 0, 0, ''v''}, {"context", 0, 0, ''Z''}, + {"numa", 0, 0, ''n''}, COMMON_LONG_OPTS, {0, 0, 0, 0} }; @@ -3993,7 +4081,7 @@ int main_list(int argc, char **argv) libxl_dominfo *info, *info_free=0; int nb_domain, rc; - SWITCH_FOREACH_OPT(opt, "lvhZ", opts, "list", 0) { + SWITCH_FOREACH_OPT(opt, "lvhZn", opts, "list", 0) { case ''l'': details = 1; break; @@ -4003,6 +4091,9 @@ int main_list(int argc, char **argv) case ''Z'': context = 1; break; + case ''n'': + numa = 1; + break; } if (optind >= argc) { @@ -4034,7 +4125,7 @@ int main_list(int argc, char **argv) if (details) list_domains_details(info, nb_domain); else - list_domains(verbose, context, 0 /* claim */, info, nb_domain); + list_domains(verbose, context, 0 /* claim */, numa, info, nb_domain); if (info_free) libxl_dominfo_list_free(info, nb_domain); @@ -4283,56 +4374,6 @@ int main_button_press(int argc, char **a return 0; } -static void print_bitmap(uint8_t *map, int maplen, FILE *stream) -{ - int i; - uint8_t pmap = 0, bitmask = 0; - int firstset = 0, state = 0; - - for (i = 0; i < maplen; i++) { - if (i % 8 == 0) { - pmap = *map++; - bitmask = 1; - } else bitmask <<= 1; - - switch (state) { - case 0: - case 2: - if ((pmap & bitmask) != 0) { - firstset = i; - state++; - } - continue; - case 1: - case 3: - if ((pmap & bitmask) == 0) { - fprintf(stream, "%s%d", state > 1 ? 
"," : "", firstset); - if (i - 1 > firstset) - fprintf(stream, "-%d", i - 1); - state = 2; - } - continue; - } - } - switch (state) { - case 0: - fprintf(stream, "none"); - break; - case 2: - break; - case 1: - if (firstset == 0) { - fprintf(stream, "any cpu"); - break; - } - case 3: - fprintf(stream, "%s%d", state > 1 ? "," : "", firstset); - if (i - 1 > firstset) - fprintf(stream, "-%d", i - 1); - break; - } -} - static void print_vcpuinfo(uint32_t tdomid, const libxl_vcpuinfo *vcpuinfo, uint32_t nr_cpus) @@ -4356,7 +4397,7 @@ static void print_vcpuinfo(uint32_t tdom /* TIM */ printf("%9.1f ", ((float)vcpuinfo->vcpu_time / 1e9)); /* CPU AFFINITY */ - print_bitmap(vcpuinfo->cpumap.map, nr_cpus, stdout); + print_cpumap(vcpuinfo->cpumap.map, nr_cpus, stdout); printf("\n"); } @@ -5942,7 +5983,7 @@ int main_claims(int argc, char **argv) } list_domains(0 /* verbose */, 0 /* context */, 1 /* claim */, - info, nb_domain); + 0 /* numa */, info, nb_domain); libxl_dominfo_list_free(info, nb_domain); return 0; diff --git a/tools/libxl/xl_cmdtable.c b/tools/libxl/xl_cmdtable.c --- a/tools/libxl/xl_cmdtable.c +++ b/tools/libxl/xl_cmdtable.c @@ -50,7 +50,8 @@ struct cmd_spec cmd_table[] = { "[options] [Domain]\n", "-l, --long Output all VM details\n" "-v, --verbose Prints out UUIDs and security context\n" - "-Z, --context Prints out security context" + "-Z, --context Prints out security context\n" + "-n, --numa Prints out NUMA node affinity" }, { "destroy", &main_destroy, 0, 1,
Dario Faggioli
2013-Apr-17 10:57 UTC
[PATCH 11 of 11 v6] docs: rearrange and update NUMA placement documentation
To include the new concept of NUMA aware scheduling and describe its impact. Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com> Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com> Acked-by: George Dunlap <george.dunlap@eu.citrix.com> diff --git a/docs/misc/xl-numa-placement.markdown b/docs/misc/xl-numa-placement.markdown --- a/docs/misc/xl-numa-placement.markdown +++ b/docs/misc/xl-numa-placement.markdown @@ -14,22 +14,67 @@ the memory directly attached to the set The Xen hypervisor deals with NUMA machines by assigning to each domain a "node affinity", i.e., a set of NUMA nodes of the host from which they -get their memory allocated. +get their memory allocated. Also, even if the node affinity of a domain +is allowed to change on-line, it is very important to "place" the domain +correctly when it is fist created, as the most of its memory is allocated +at that time and can not (for now) be moved easily. NUMA awareness becomes very important as soon as many domains start running memory-intensive workloads on a shared host. In fact, the cost of accessing non node-local memory locations is very high, and the performance degradation is likely to be noticeable. -## Guest Placement in xl ## +For more information, have a look at the [Xen NUMA Introduction][numa_intro] +page on the Wiki. + +### Placing via pinning and cpupools ### + +The simplest way of placing a domain on a NUMA node is statically pinning +the domain''s vCPUs to the pCPUs of the node. This goes under the name of +CPU affinity and can be set through the "cpus=" option in the config file +(more about this below). Another option is to pool together the pCPUs +spanning the node and put the domain in such a cpupool with the "pool=" +config option (as documented in our [Wiki][cpupools_howto]). + +In both the above cases, the domain will not be able to execute outside +the specified set of pCPUs for any reasons, even if all those pCPUs are +busy doing something else while there are others, idle, pCPUs. + +So, when doing this, local memory accesses are 100% guaranteed, but that +may come at he cost of some load imbalances. + +### NUMA aware scheduling ### + +If the credit scheduler is in use, the concept of node affinity defined +above does not only apply to memory. In fact, starting from Xen 4.3, the +scheduler always tries to run the domain''s vCPUs on one of the nodes in +its node affinity. Only if that turns out to be impossible, it will just +pick any free pCPU. + +This is, therefore, something more flexible than CPU affinity, as a domain +can still run everywhere, it just prefers some nodes rather than others. +Locality of access is less guaranteed than in the pinning case, but that +comes along with better chances to exploit all the host resources (e.g., +the pCPUs). + +In fact, if all the pCPUs in a domain''s node affinity are busy, it is +possible for the domain to run outside of there, but it is very likely that +slower execution (due to remote memory accesses) is still better than no +execution at all, as it would happen with pinning. For this reason, NUMA +aware scheduling has the potential of bringing substantial performances +benefits, although this will depend on the workload. + +## Guest placement in xl ## If using xl for creating and managing guests, it is very easy to ask for both manual or automatic placement of them across the host''s NUMA nodes. -Note that xm/xend does the very same thing, the only differences residing -in the details of the heuristics adopted for the placement (see below). 
+Note that xm/xend does a very similar thing, the only differences being +the details of the heuristics adopted for automatic placement (see below), +and the lack of support (in both xm/xend and the Xen versions where that\ +was the default toolstack) for NUMA aware scheduling. -### Manual Guest Placement with xl ### +### Placing the guest manually ### Thanks to the "cpus=" option, it is possible to specify where a domain should be created and scheduled on, directly in its config file. This @@ -41,14 +86,19 @@ This is very simple and effective, but r administrator to explicitly specify affinities for each and every domain, or Xen won''t be able to guarantee the locality for their memory accesses. -It is also possible to deal with NUMA by partitioning the system using -cpupools. Again, this could be "The Right Answer" for many needs and -occasions, but has to be carefully considered and setup by hand. +Notice that this also pins the domain''s vCPUs to the specified set of +pCPUs, so it not only sets the domain''s node affinity (its memory will +come from the nodes to which the pCPUs belong), but at the same time +forces the vCPUs of the domain to be scheduled on those same pCPUs. -### Automatic Guest Placement with xl ### +### Placing the guest automatically ### If no "cpus=" option is specified in the config file, libxl tries to figure out on its own on which node(s) the domain could fit best. +If it finds one (some), the domain''s node affinity get set to there, +and both memory allocations and NUMA aware scheduling (for the credit +scheduler and starting from Xen 4.3) will comply with it. + It is worthwhile noting that optimally fitting a set of VMs on the NUMA nodes of an host is an incarnation of the Bin Packing Problem. In fact, the various VMs with different memory sizes are the items to be packed, @@ -81,7 +131,7 @@ largest amounts of free memory helps kee small, and maximizes the probability of being able to put more domains there. -## Guest Placement within libxl ## +## Guest placement in libxl ## xl achieves automatic NUMA placement because that is what libxl does by default. No API is provided (yet) for modifying the behaviour of @@ -93,15 +143,34 @@ any placement from happening: libxl_defbool_set(&domain_build_info->numa_placement, false); Also, if `numa_placement` is set to `true`, the domain must not -have any cpu affinity (i.e., `domain_build_info->cpumap` must +have any CPU affinity (i.e., `domain_build_info->cpumap` must have all its bits set, as it is by default), or domain creation will fail returning `ERROR_INVAL`. +Starting from Xen 4.3, in case automatic placement happens (and is +successful), it will affect the domain''s node affinity and _not_ its +CPU affinity. Namely, the domain''s vCPUs will not be pinned to any +pCPU on the host, but the memory from the domain will come from the +selected node(s) and the NUMA aware scheduling (if the credit scheduler +is in use) will try to keep the domain there as much as possible. + Besides than that, looking and/or tweaking the placement algorithm search "Automatic NUMA placement" in libxl\_internal.h. Note this may change in future versions of Xen/libxl. +## Xen < 4.3 ## + +As NUMA aware scheduling is a new feature of Xen 4.3, things are a little +bit different for earlier version of Xen. If no "cpus=" option is specified +and Xen 4.2 is in use, the automatic placement algorithm still runs, but +the results is used to _pin_ the vCPUs of the domain to the output node(s). 
+This is consistent with what was happening with xm/xend, which were also +affecting the domain's CPU affinity. + +On a version of Xen earlier than 4.2, there is no automatic placement at +all in xl or libxl, and hence no node or CPU affinity is affected. + ## Limitations ## Analyzing various possible placement solutions is what makes the @@ -109,3 +178,6 @@ algorithm flexible and quite effective. it won't scale well to systems with arbitrary number of nodes. For this reason, automatic placement is disabled (with a warning) if it is requested on a host with more than 16 NUMA nodes. + +[numa_intro]: http://wiki.xen.org/wiki/Xen_NUMA_Introduction +[cpupools_howto]: http://wiki.xen.org/wiki/Cpupools_Howto
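Purely for illustration, the two manual placement approaches described in the document correspond to guest config fragments like the ones below; the assumption that NUMA node 0 spans pCPUs 0-3, and the cpupool name, are made up.

# Pin the domain's vCPUs to the pCPUs of node 0 (this also determines
# the node affinity, hence where the memory comes from):
cpus = "0-3"

# Alternatively, put the domain in a (previously created) cpupool that
# spans node 0 only:
pool = "Pool-node0"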
On 17/04/13 11:57, Dario Faggioli wrote:
> Hi,
>
> Just reposting with, this time, _really_ all the Ack-s. :-)

From a release perspective:
 * There is a slight risk that this may introduce bugs which may not be detected until after the release; but I think, since this is going to be tested *all the time*, that is fairly low.
 * There is a slightly higher risk that there may be bugs in this which delay the release. However, it's a very important feature, so I think it's worth accepting, even if it does end up slipping the release a bit.

So, from a release perspective:

Acked-by: George Dunlap <george.dunlap@eu.citrix.com>
On Wed, 2013-04-17 at 12:13 +0100, George Dunlap wrote:
> On 17/04/13 11:57, Dario Faggioli wrote:
> > Hi,
> >
> > Just reposting with, this time, _really_ all the Ack-s. :-)
>
> From a release perspective:
> * There is a slight risk that this may introduce bugs which may not be
> detected until after the release; but I think since this is going to be
> tested *all the time*, that is fairly low.
> * There is a slightly higher risk that there may be bugs in this which
> delay the release. However, it's a very important feature, so I think
> it's worth accepting, even if it does end up slipping the release a bit.
>
> So, from a release perspective:
>
> Acked-by: George Dunlap <george.dunlap@eu.citrix.com>

I have applied this series.

Ian.