Hello,

Round 4 of the NUMA-aware scheduling series. Not much is going on this time,
actually, as the comments to v3 were not a big deal... Nevertheless, I, as
usual, appreciated them a lot, and I think I've addressed them all. The only
notable change is in patch 05/11, where I discovered a small but nasty bug
(see the specific changelog).

I couldn't rerun the full set of my usual benchmarks, but I really don't
think that is necessary, as practically no functional change is being
introduced wrt v3. However, I did run a couple of them, picked at random,
just to be sure, and the numbers do look almost identical to these:

 http://lists.xen.org/archives/html/xen-devel/2013-02/msg00009.html

What I'm now missing is a couple of Ack-s from the relevant people. In fact,
almost all the patches carry more than one Ack, but a few of them are still
missing Ack-s from the actual maintainers of the code they touch. In some
more detail, with '*' = 'properly Ack-ed' and '+' = 'Ack-ed, but _not_ by
the maintainer', here is the series:

 * 01/11 xen, libxc: rename xenctl_cpumap to xenctl_bitmap
 * 02/11 xen, libxc: introduce xc_nodemap_t
 * 03/11 xen: sched_credit: when picking, make sure we get an idle one, if any
 + 04/11 xen: sched_credit: let the scheduler know about node-affinity
 * 05/11 xen: allow for explicitly specifying node-affinity
 * 06/11 libxc: allow for explicitly specifying node-affinity
 + 07/11 libxl: allow for explicitly specifying node-affinity
 + 08/11 libxl: optimize the calculation of how many VCPUs can run on a candidate
 + 09/11 libxl: automatic placement deals with node-affinity
 + 10/11 xl: add node-affinity to the output of `xl list`
 * 11/11 docs: rearrange and update NUMA placement documentation

So, George, if, as you said, you're fine with it, could you explicitly ack
04/11? Also, IanC and IanJ, could you have a look and ack (if you feel like
it, of course) 07/11, 08/11, 09/11 and 10/11?

Thanks a lot and Regards,
Dario

--
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
Dario Faggioli
2013-Mar-15 02:30 UTC
[PATCH 01 of 11 v4] xen, libxc: rename xenctl_cpumap to xenctl_bitmap
More specifically, this:
 1. replaces xenctl_cpumap with xenctl_bitmap;
 2. provides bitmap_to_xenctl_bitmap and the reverse;
 3. re-implements cpumask_to_xenctl_bitmap with bitmap_to_xenctl_bitmap
    and the reverse.

Other than #3, no functional changes. The interface is only slightly
affected. This is in preparation for introducing NUMA node-affinity maps.

Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
Acked-by: George Dunlap <george.dunlap@eu.citrix.com>
Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com>
---
Changes since v3:
 * when renaming xenctl_cpumap to xenctl_bitmap, use nr_bits in place of
   nr_cpus (instead of using nr_elems, like in v3 and before), as
   requested during review.

Changes since v2:
 * took care of tools/tests/mce-test/tools/xen-mceinj.c, which had been
   forgotten, as requested during review.

diff --git a/tools/libxc/xc_cpupool.c b/tools/libxc/xc_cpupool.c
--- a/tools/libxc/xc_cpupool.c
+++ b/tools/libxc/xc_cpupool.c
@@ -90,7 +90,7 @@ xc_cpupoolinfo_t *xc_cpupool_getinfo(xc_
     sysctl.u.cpupool_op.op = XEN_SYSCTL_CPUPOOL_OP_INFO;
     sysctl.u.cpupool_op.cpupool_id = poolid;
     set_xen_guest_handle(sysctl.u.cpupool_op.cpumap.bitmap, local);
-    sysctl.u.cpupool_op.cpumap.nr_cpus = local_size * 8;
+    sysctl.u.cpupool_op.cpumap.nr_bits = local_size * 8;
 
     err = do_sysctl_save(xch, &sysctl);
 
@@ -184,7 +184,7 @@ xc_cpumap_t xc_cpupool_freeinfo(xc_inter
     sysctl.cmd = XEN_SYSCTL_cpupool_op;
     sysctl.u.cpupool_op.op = XEN_SYSCTL_CPUPOOL_OP_FREEINFO;
     set_xen_guest_handle(sysctl.u.cpupool_op.cpumap.bitmap, local);
-    sysctl.u.cpupool_op.cpumap.nr_cpus = mapsize * 8;
+    sysctl.u.cpupool_op.cpumap.nr_bits = mapsize * 8;
 
     err = do_sysctl_save(xch, &sysctl);
 
diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c
--- a/tools/libxc/xc_domain.c
+++ b/tools/libxc/xc_domain.c
@@ -142,7 +142,7 @@ int xc_vcpu_setaffinity(xc_interface *xc
 
     set_xen_guest_handle(domctl.u.vcpuaffinity.cpumap.bitmap, local);
 
-    domctl.u.vcpuaffinity.cpumap.nr_cpus = cpusize * 8;
+    domctl.u.vcpuaffinity.cpumap.nr_bits = cpusize * 8;
 
     ret = do_domctl(xch, &domctl);
 
@@ -182,7 +182,7 @@ int xc_vcpu_getaffinity(xc_interface *xc
     domctl.u.vcpuaffinity.vcpu = vcpu;
 
     set_xen_guest_handle(domctl.u.vcpuaffinity.cpumap.bitmap, local);
-    domctl.u.vcpuaffinity.cpumap.nr_cpus = cpusize * 8;
+    domctl.u.vcpuaffinity.cpumap.nr_bits = cpusize * 8;
 
     ret = do_domctl(xch, &domctl);
 
diff --git a/tools/libxc/xc_tbuf.c b/tools/libxc/xc_tbuf.c
--- a/tools/libxc/xc_tbuf.c
+++ b/tools/libxc/xc_tbuf.c
@@ -134,7 +134,7 @@ int xc_tbuf_set_cpu_mask(xc_interface *x
     bitmap_64_to_byte(bytemap, &mask64, sizeof (mask64) * 8);
 
     set_xen_guest_handle(sysctl.u.tbuf_op.cpu_mask.bitmap, bytemap);
-    sysctl.u.tbuf_op.cpu_mask.nr_cpus = sizeof(bytemap) * 8;
+    sysctl.u.tbuf_op.cpu_mask.nr_bits = sizeof(bytemap) * 8;
 
     ret = do_sysctl(xch, &sysctl);
 
diff --git a/tools/tests/mce-test/tools/xen-mceinj.c b/tools/tests/mce-test/tools/xen-mceinj.c
--- a/tools/tests/mce-test/tools/xen-mceinj.c
+++ b/tools/tests/mce-test/tools/xen-mceinj.c
@@ -161,7 +161,7 @@ static int inject_cmci(xc_interface *xc_
     mc.u.mc_inject_v2.flags |= XEN_MC_INJECT_CPU_BROADCAST;
     mc.u.mc_inject_v2.flags |= XEN_MC_INJECT_TYPE_CMCI;
-    mc.u.mc_inject_v2.cpumap.nr_cpus = nr_cpus;
+    mc.u.mc_inject_v2.cpumap.nr_bits = nr_cpus;
 
     return xc_mca_op(xc_handle, &mc);
 }
diff --git a/xen/arch/x86/cpu/mcheck/mce.c b/xen/arch/x86/cpu/mcheck/mce.c
--- a/xen/arch/x86/cpu/mcheck/mce.c
+++ b/xen/arch/x86/cpu/mcheck/mce.c
@@ -1449,8 +1449,7 @@ long do_mca(XEN_GUEST_HANDLE_PARAM(xen_m
             cpumap = &cpu_online_map;
         else
         {
-            ret = xenctl_cpumap_to_cpumask(&cmv,
-                                           &op->u.mc_inject_v2.cpumap);
+            ret = xenctl_bitmap_to_cpumask(&cmv, &op->u.mc_inject_v2.cpumap);
             if ( ret )
                 break;
             cpumap = cmv;
diff --git a/xen/arch/x86/platform_hypercall.c b/xen/arch/x86/platform_hypercall.c
--- a/xen/arch/x86/platform_hypercall.c
+++ b/xen/arch/x86/platform_hypercall.c
@@ -336,7 +336,7 @@ ret_t do_platform_op(XEN_GUEST_HANDLE_PA
     {
         uint32_t cpu;
         uint64_t idletime, now = NOW();
-        struct xenctl_cpumap ctlmap;
+        struct xenctl_bitmap ctlmap;
         cpumask_var_t cpumap;
         XEN_GUEST_HANDLE(uint8) cpumap_bitmap;
         XEN_GUEST_HANDLE(uint64) idletimes;
@@ -345,11 +345,11 @@ ret_t do_platform_op(XEN_GUEST_HANDLE_PA
         if ( cpufreq_controller != FREQCTL_dom0_kernel )
             break;
 
-        ctlmap.nr_cpus = op->u.getidletime.cpumap_nr_cpus;
+        ctlmap.nr_bits = op->u.getidletime.cpumap_nr_cpus;
         guest_from_compat_handle(cpumap_bitmap,
                                  op->u.getidletime.cpumap_bitmap);
         ctlmap.bitmap.p = cpumap_bitmap.p; /* handle -> handle_64 conversion */
-        if ( (ret = xenctl_cpumap_to_cpumask(&cpumap, &ctlmap)) != 0 )
+        if ( (ret = xenctl_bitmap_to_cpumask(&cpumap, &ctlmap)) != 0 )
             goto out;
         guest_from_compat_handle(idletimes, op->u.getidletime.idletime);
 
@@ -368,7 +368,7 @@ ret_t do_platform_op(XEN_GUEST_HANDLE_PA
         op->u.getidletime.now = now;
         if ( ret == 0 )
-            ret = cpumask_to_xenctl_cpumap(&ctlmap, cpumap);
+            ret = cpumask_to_xenctl_bitmap(&ctlmap, cpumap);
         free_cpumask_var(cpumap);
 
         if ( ret == 0 && __copy_field_to_guest(u_xenpf_op, op, u.getidletime) )
diff --git a/xen/common/cpupool.c b/xen/common/cpupool.c
--- a/xen/common/cpupool.c
+++ b/xen/common/cpupool.c
@@ -493,7 +493,7 @@ int cpupool_do_sysctl(struct xen_sysctl_
         op->cpupool_id = c->cpupool_id;
         op->sched_id = c->sched->sched_id;
         op->n_dom = c->n_dom;
-        ret = cpumask_to_xenctl_cpumap(&op->cpumap, c->cpu_valid);
+        ret = cpumask_to_xenctl_bitmap(&op->cpumap, c->cpu_valid);
         cpupool_put(c);
     }
     break;
@@ -588,7 +588,7 @@ int cpupool_do_sysctl(struct xen_sysctl_
 
     case XEN_SYSCTL_CPUPOOL_OP_FREEINFO:
     {
-        ret = cpumask_to_xenctl_cpumap(
+        ret = cpumask_to_xenctl_bitmap(
             &op->cpumap, &cpupool_free_cpus);
     }
     break;
diff --git a/xen/common/domctl.c b/xen/common/domctl.c
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -32,28 +32,29 @@ static DEFINE_SPINLOCK(domctl_lock);
 DEFINE_SPINLOCK(vcpu_alloc_lock);
 
-int cpumask_to_xenctl_cpumap(
-    struct xenctl_cpumap *xenctl_cpumap, const cpumask_t *cpumask)
+int bitmap_to_xenctl_bitmap(struct xenctl_bitmap *xenctl_bitmap,
+                            const unsigned long *bitmap,
+                            unsigned int nbits)
 {
     unsigned int guest_bytes, copy_bytes, i;
     uint8_t zero = 0;
     int err = 0;
-    uint8_t *bytemap = xmalloc_array(uint8_t, (nr_cpu_ids + 7) / 8);
+    uint8_t *bytemap = xmalloc_array(uint8_t, (nbits + 7) / 8);
 
     if ( !bytemap )
         return -ENOMEM;
 
-    guest_bytes = (xenctl_cpumap->nr_cpus + 7) / 8;
-    copy_bytes  = min_t(unsigned int, guest_bytes, (nr_cpu_ids + 7) / 8);
+    guest_bytes = (xenctl_bitmap->nr_bits + 7) / 8;
+    copy_bytes  = min_t(unsigned int, guest_bytes, (nbits + 7) / 8);
 
-    bitmap_long_to_byte(bytemap, cpumask_bits(cpumask), nr_cpu_ids);
+    bitmap_long_to_byte(bytemap, bitmap, nbits);
 
     if ( copy_bytes != 0 )
-        if ( copy_to_guest(xenctl_cpumap->bitmap, bytemap, copy_bytes) )
+        if ( copy_to_guest(xenctl_bitmap->bitmap, bytemap, copy_bytes) )
             err = -EFAULT;
 
     for ( i = copy_bytes; !err && i < guest_bytes; i++ )
-        if ( copy_to_guest_offset(xenctl_cpumap->bitmap, i, &zero, 1) )
+        if ( copy_to_guest_offset(xenctl_bitmap->bitmap, i, &zero, 1) )
             err = -EFAULT;
 
     xfree(bytemap);
@@ -61,36 +62,58 @@ int cpumask_to_xenctl_cpumap(
     return err;
 }
 
-int xenctl_cpumap_to_cpumask(
-    cpumask_var_t *cpumask, const struct xenctl_cpumap *xenctl_cpumap)
+int xenctl_bitmap_to_bitmap(unsigned long *bitmap,
+                            const struct xenctl_bitmap *xenctl_bitmap,
+                            unsigned int nbits)
 {
     unsigned int guest_bytes, copy_bytes;
     int err = 0;
-    uint8_t *bytemap = xzalloc_array(uint8_t, (nr_cpu_ids + 7) / 8);
+    uint8_t *bytemap = xzalloc_array(uint8_t, (nbits + 7) / 8);
 
     if ( !bytemap )
         return -ENOMEM;
 
-    guest_bytes = (xenctl_cpumap->nr_cpus + 7) / 8;
-    copy_bytes  = min_t(unsigned int, guest_bytes, (nr_cpu_ids + 7) / 8);
+    guest_bytes = (xenctl_bitmap->nr_bits + 7) / 8;
+    copy_bytes  = min_t(unsigned int, guest_bytes, (nbits + 7) / 8);
 
     if ( copy_bytes != 0 )
     {
-        if ( copy_from_guest(bytemap, xenctl_cpumap->bitmap, copy_bytes) )
+        if ( copy_from_guest(bytemap, xenctl_bitmap->bitmap, copy_bytes) )
             err = -EFAULT;
-        if ( (xenctl_cpumap->nr_cpus & 7) && (guest_bytes == copy_bytes) )
-            bytemap[guest_bytes-1] &= ~(0xff << (xenctl_cpumap->nr_cpus & 7));
+        if ( (xenctl_bitmap->nr_bits & 7) && (guest_bytes == copy_bytes) )
+            bytemap[guest_bytes-1] &= ~(0xff << (xenctl_bitmap->nr_bits & 7));
     }
 
-    if ( err )
-        /* nothing */;
-    else if ( alloc_cpumask_var(cpumask) )
-        bitmap_byte_to_long(cpumask_bits(*cpumask), bytemap, nr_cpu_ids);
+    if ( !err )
+        bitmap_byte_to_long(bitmap, bytemap, nbits);
+
+    xfree(bytemap);
+
+    return err;
+}
+
+int cpumask_to_xenctl_bitmap(struct xenctl_bitmap *xenctl_cpumap,
+                             const cpumask_t *cpumask)
+{
+    return bitmap_to_xenctl_bitmap(xenctl_cpumap, cpumask_bits(cpumask),
+                                   nr_cpu_ids);
+}
+
+int xenctl_bitmap_to_cpumask(cpumask_var_t *cpumask,
+                             const struct xenctl_bitmap *xenctl_cpumap)
+{
+    int err = 0;
+
+    if ( alloc_cpumask_var(cpumask) )
+    {
+        err = xenctl_bitmap_to_bitmap(cpumask_bits(*cpumask), xenctl_cpumap,
+                                      nr_cpu_ids);
+        /* In case of error, cleanup is up to us, as the caller won't care! */
+        if ( err )
+            free_cpumask_var(*cpumask);
+    }
+    else
         err = -ENOMEM;
 
-    xfree(bytemap);
-
     return err;
 }
 
@@ -540,7 +563,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
         {
             cpumask_var_t new_affinity;
 
-            ret = xenctl_cpumap_to_cpumask(
+            ret = xenctl_bitmap_to_cpumask(
                 &new_affinity, &op->u.vcpuaffinity.cpumap);
             if ( !ret )
             {
@@ -550,7 +573,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe
         }
         else
         {
-            ret = cpumask_to_xenctl_cpumap(
+            ret = cpumask_to_xenctl_bitmap(
                 &op->u.vcpuaffinity.cpumap, v->cpu_affinity);
         }
     }
diff --git a/xen/common/trace.c b/xen/common/trace.c
--- a/xen/common/trace.c
+++ b/xen/common/trace.c
@@ -384,7 +384,7 @@ int tb_control(xen_sysctl_tbuf_op_t *tbc
         {
             cpumask_var_t mask;
 
-            rc = xenctl_cpumap_to_cpumask(&mask, &tbc->cpu_mask);
+            rc = xenctl_bitmap_to_cpumask(&mask, &tbc->cpu_mask);
             if ( !rc )
             {
                 cpumask_copy(&tb_cpu_mask, mask);
diff --git a/xen/include/public/arch-x86/xen-mca.h b/xen/include/public/arch-x86/xen-mca.h
--- a/xen/include/public/arch-x86/xen-mca.h
+++ b/xen/include/public/arch-x86/xen-mca.h
@@ -414,7 +414,7 @@ struct xen_mc_mceinject {
 
 struct xen_mc_inject_v2 {
     uint32_t flags;
-    struct xenctl_cpumap cpumap;
+    struct xenctl_bitmap cpumap;
 };
 #endif
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -285,7 +285,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_domctl_getvc
 /* XEN_DOMCTL_getvcpuaffinity */
 struct xen_domctl_vcpuaffinity {
     uint32_t  vcpu;              /* IN */
-    struct xenctl_cpumap cpumap; /* IN/OUT */
+    struct xenctl_bitmap cpumap; /* IN/OUT */
 };
 typedef struct xen_domctl_vcpuaffinity xen_domctl_vcpuaffinity_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_vcpuaffinity_t);
diff --git a/xen/include/public/sysctl.h b/xen/include/public/sysctl.h
--- a/xen/include/public/sysctl.h
+++ b/xen/include/public/sysctl.h
@@ -71,7 +71,7 @@ struct xen_sysctl_tbuf_op {
 #define XEN_SYSCTL_TBUFOP_disable  5
     uint32_t cmd;
     /* IN/OUT variables */
-    struct xenctl_cpumap cpu_mask;
+    struct xenctl_bitmap cpu_mask;
     uint32_t             evt_mask;
     /* OUT variables */
     uint64_aligned_t buffer_mfn;
@@ -532,7 +532,7 @@ struct xen_sysctl_cpupool_op {
     uint32_t domid;       /* IN: M               */
     uint32_t cpu;         /* IN: AR              */
    uint32_t n_dom;       /*            OUT: I   */
-    struct xenctl_cpumap cpumap; /*     OUT: IF  */
+    struct xenctl_bitmap cpumap; /*     OUT: IF  */
 };
 typedef struct xen_sysctl_cpupool_op xen_sysctl_cpupool_op_t;
 DEFINE_XEN_GUEST_HANDLE(xen_sysctl_cpupool_op_t);
diff --git a/xen/include/public/xen.h b/xen/include/public/xen.h
--- a/xen/include/public/xen.h
+++ b/xen/include/public/xen.h
@@ -851,9 +851,9 @@ typedef uint8_t xen_domain_handle_t[16];
 #endif
 
 #ifndef __ASSEMBLY__
-struct xenctl_cpumap {
+struct xenctl_bitmap {
     XEN_GUEST_HANDLE_64(uint8) bitmap;
-    uint32_t nr_cpus;
+    uint32_t nr_bits;
 };
 #endif
diff --git a/xen/include/xen/cpumask.h b/xen/include/xen/cpumask.h
--- a/xen/include/xen/cpumask.h
+++ b/xen/include/xen/cpumask.h
@@ -424,8 +424,8 @@ extern cpumask_t cpu_present_map;
 #define for_each_present_cpu(cpu) for_each_cpu(cpu, &cpu_present_map)
 
 /* Copy to/from cpumap provided by control tools. */
-struct xenctl_cpumap;
-int cpumask_to_xenctl_cpumap(struct xenctl_cpumap *, const cpumask_t *);
-int xenctl_cpumap_to_cpumask(cpumask_var_t *, const struct xenctl_cpumap *);
+struct xenctl_bitmap;
+int cpumask_to_xenctl_bitmap(struct xenctl_bitmap *, const cpumask_t *);
+int xenctl_bitmap_to_cpumask(cpumask_var_t *, const struct xenctl_bitmap *);
 
 #endif /* __XEN_CPUMASK_H */
diff --git a/xen/include/xlat.lst b/xen/include/xlat.lst
--- a/xen/include/xlat.lst
+++ b/xen/include/xlat.lst
@@ -2,7 +2,7 @@
 # ! - needs translation
 # ? - needs checking
 ?	dom0_vga_console_info		xen.h
-?	xenctl_cpumap			xen.h
+?	xenctl_bitmap			xen.h
 ?	mmu_update			xen.h
 !	mmuext_op			xen.h
 !	start_info			xen.h
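
[For readers less familiar with the guest-handle plumbing above, here is a
standalone sketch (not Xen code: plain memcpy()/memset() stand in for
copy_to_guest() and friends, and bitmap_to_bytemap() is an invented name)
of the conversion contract that bitmap_to_xenctl_bitmap() implements: the
guest buffer holds (nr_bits + 7) / 8 bytes, only min(guest, local) bytes
carry payload, and any guest bytes beyond that are zeroed.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Model of the copy-out logic, with plain memory operations instead
 * of copy_to_guest()/copy_to_guest_offset(). */
static void bitmap_to_bytemap(uint8_t *guest, unsigned int guest_bits,
                              const uint8_t *local, unsigned int nbits)
{
    unsigned int guest_bytes = (guest_bits + 7) / 8;
    unsigned int local_bytes = (nbits + 7) / 8;
    unsigned int copy_bytes = guest_bytes < local_bytes ? guest_bytes
                                                        : local_bytes;

    memcpy(guest, local, copy_bytes);                        /* payload   */
    memset(guest + copy_bytes, 0, guest_bytes - copy_bytes); /* zero tail */
}

int main(void)
{
    uint8_t local[2] = { 0xff, 0x01 };  /* a 16-bit map with 9 bits set */
    uint8_t guest[4];

    /* The guest asked for 32 bits: bytes 2 and 3 must come back zeroed. */
    bitmap_to_bytemap(guest, 32, local, 16);
    for (int i = 0; i < 4; i++)
        printf("byte %d: %#x\n", i, guest[i]);
    return 0;
} ]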
Dario Faggioli
2013-Mar-15 02:30 UTC
[PATCH 02 of 11 v4] xen, libxc: introduce xc_nodemap_t
And its handling functions, following suit from xc_cpumap_t.

Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
Acked-by: George Dunlap <george.dunlap@eu.citrix.com>
Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com>
---
Changes from v2:
 * discriminating between statically allocated nodemask_t and dynamically
   allocated nodemask_var_t is not really necessary, given the maximum
   number of nodes won't ever grow that much, so killed that hunk, as
   suggested during review.

diff --git a/tools/libxc/xc_misc.c b/tools/libxc/xc_misc.c
--- a/tools/libxc/xc_misc.c
+++ b/tools/libxc/xc_misc.c
@@ -54,6 +54,11 @@ int xc_get_cpumap_size(xc_interface *xch
     return (xc_get_max_cpus(xch) + 7) / 8;
 }
 
+int xc_get_nodemap_size(xc_interface *xch)
+{
+    return (xc_get_max_nodes(xch) + 7) / 8;
+}
+
 xc_cpumap_t xc_cpumap_alloc(xc_interface *xch)
 {
     int sz;
@@ -64,6 +69,16 @@ xc_cpumap_t xc_cpumap_alloc(xc_interface
     return calloc(1, sz);
 }
 
+xc_nodemap_t xc_nodemap_alloc(xc_interface *xch)
+{
+    int sz;
+
+    sz = xc_get_nodemap_size(xch);
+    if (sz == 0)
+        return NULL;
+    return calloc(1, sz);
+}
+
 int xc_readconsolering(xc_interface *xch,
                        char *buffer,
                        unsigned int *pnr_chars,
diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h
+++ b/tools/libxc/xenctrl.h
@@ -330,12 +330,20 @@ int xc_get_cpumap_size(xc_interface *xch
 /* allocate a cpumap */
 xc_cpumap_t xc_cpumap_alloc(xc_interface *xch);
 
- /*
+/*
  * NODEMAP handling
  */
+typedef uint8_t *xc_nodemap_t;
+
 /* return maximum number of NUMA nodes the hypervisor supports */
 int xc_get_max_nodes(xc_interface *xch);
 
+/* return array size for nodemap */
+int xc_get_nodemap_size(xc_interface *xch);
+
+/* allocate a nodemap */
+xc_nodemap_t xc_nodemap_alloc(xc_interface *xch);
+
 /*
  * DOMAIN DEBUGGING FUNCTIONS
  */
diff --git a/xen/common/domctl.c b/xen/common/domctl.c
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -117,6 +117,20 @@ int xenctl_bitmap_to_cpumask(cpumask_var
     return err;
 }
 
+int nodemask_to_xenctl_bitmap(struct xenctl_bitmap *xenctl_nodemap,
+                              const nodemask_t *nodemask)
+{
+    return bitmap_to_xenctl_bitmap(xenctl_nodemap, nodes_addr(*nodemask),
+                                   MAX_NUMNODES);
+}
+
+int xenctl_bitmap_to_nodemask(nodemask_t *nodemask,
+                              const struct xenctl_bitmap *xenctl_nodemap)
+{
+    return xenctl_bitmap_to_bitmap(nodes_addr(*nodemask), xenctl_nodemap,
+                                   MAX_NUMNODES);
+}
+
 static inline int is_free_domid(domid_t dom)
 {
     struct domain *d;
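
[As a quick illustration of the new libxc interface, here is a hypothetical
caller (mark_node_one() is made up for the example, and error handling is
minimal) that allocates a nodemap, sets the bit for node 1 and releases it.
Since xc_nodemap_t is just a byte array sized by xc_get_nodemap_size(), bit
manipulation is done by hand.

#include <stdlib.h>
#include <xenctrl.h>

int mark_node_one(xc_interface *xch)
{
    xc_nodemap_t map = xc_nodemap_alloc(xch);

    if ( map == NULL )
        return -1;

    map[1 / 8] |= 1 << (1 % 8);  /* set node 1 in the byte array */

    /* ... hand map to a consumer, e.g. a node-affinity call ... */

    free(map);                   /* calloc()-backed, so plain free() */
    return 0;
} ]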
Dario Faggioli
2013-Mar-15 02:30 UTC
[PATCH 03 of 11 v4] xen: sched_credit: when picking, make sure we get an idle one, if any
The pcpu picking algorithm treats two threads of an SMT core as equal. More
specifically, if one is idle and the other one is busy, they both will be
assigned a weight of 1. Therefore, when picking begins, if the first target
pcpu is the busy thread (and if there are no idle pcpus other than its
sibling), that will never change.

This change fixes that by ensuring that, before entering the core of the
picking algorithm, the target pcpu is an idle one (if there is an idle pcpu
at all, of course).

Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com>
Acked-by: George Dunlap <george.dunlap@eu.citrix.com>

diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -532,6 +532,18 @@ static int
     if ( vc->processor == cpu && IS_RUNQ_IDLE(cpu) )
         cpumask_set_cpu(cpu, &idlers);
     cpumask_and(&cpus, &cpus, &idlers);
+
+    /*
+     * It is important that cpu points to an idle processor, if a suitable
+     * one exists (and we can use cpus to check and, possibly, choose a new
+     * CPU, as we just &&-ed it with idlers). In fact, if we are on SMT, and
+     * cpu points to a busy thread with an idle sibling, both the threads
+     * will be considered the same, from the "idleness" calculation point
+     * of view, preventing vcpu from being moved to the thread that is
+     * actually idle.
+     */
+    if ( !cpumask_empty(&cpus) && !cpumask_test_cpu(cpu, &cpus) )
+        cpu = cpumask_cycle(cpu, &cpus);
     cpumask_clear_cpu(cpu, &cpus);
 
     while ( !cpumask_empty(&cpus) )
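
[A toy model (plain bitmasks and an invented next_idle() helper, not Xen's
cpumask API) of what the added hunk does: if the initially picked cpu is not
in the idlers mask, advance to the next idle cpu, wrapping around, which is
what cpumask_cycle() provides in the real code.

#include <stdio.h>

static int next_idle(unsigned int idlers, int cpu, int nr_cpus)
{
    for ( int i = 1; i <= nr_cpus; i++ )
    {
        int c = (cpu + i) % nr_cpus;  /* scan forward, wrapping around */
        if ( idlers & (1u << c) )
            return c;
    }
    return cpu;  /* no idler at all: keep the original pick */
}

int main(void)
{
    unsigned int idlers = 1u << 3;  /* only cpu3, cpu2's sibling, is idle */
    int cpu = 2;                    /* initial pick: the busy thread      */

    if ( idlers && !(idlers & (1u << cpu)) )
        cpu = next_idle(idlers, cpu, 4);
    printf("picked cpu%d\n", cpu);  /* now cpu3 */
    return 0;
} ]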
Dario Faggioli
2013-Mar-15 02:30 UTC
[PATCH 04 of 11 v4] xen: sched_credit: let the scheduler know about node-affinity
As vcpu-affinity tells where VCPUs must run, node-affinity tells where they
prefer to. While respecting vcpu-affinity remains mandatory, node-affinity
is not that strict: it only expresses a preference, although honouring it
will bring significant performance benefits (especially as compared to not
having any affinity at all).

This change modifies the VCPUs' load balancing algorithm (for the credit
scheduler only), introducing a two-step logic. During the first step, we
use both the vcpu-affinity and the node-affinity masks (by looking at their
intersection). The aim is to give precedence to the PCPUs where the domain
prefers to run, as expressed by its node-affinity (with the intersection
with the vcpu-affinity being necessary in order to avoid running a VCPU
where it never should). If that fails to find a valid PCPU, the
node-affinity is just ignored and, in the second step, we fall back to
using vcpu-affinity only. (A compressed sketch of this two-step logic
follows the patch below.)

Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com>
---
Changes from v3:
 * rephrased some bits of the changelog, as suggested during review;
 * renamed __vcpu_has_valuable_node_affinity() to
   __vcpu_has_node_affinity() and killed csched_balance_step_skippable(),
   in favour of directly putting the full if statement in place, as
   requested and suggested (respectively) during review;
 * added a comment about a possible future optimization in
   csched_runq_steal().

Changes from v2:
 * for_each_csched_balance_step() now is defined as a regular, and easier
   to understand, 0...n for() loop, instead of one going backwards, as
   that wasn't really needed;
 * checking whether or not a CSCHED_BALANCE_NODE_AFFINITY balancing set is
   useful now happens outside of csched_balance_cpumask(), i.e., closer
   to the actual loop, making the logic a lot more evident and easy to
   understand, as requested during review;
 * while reworking __runq_tickle(), handling of idle pcpus has been
   brought outside the balancing loop, as requested during review;
 * __csched_vcpu_is_migrateable() was just wrong, so it has been removed;
 * the suboptimal handling of SMT in _csched_cpu_pick() has been moved to
   a separate patch (i.e., the previous patch in the series);
 * moved the CPU mask needed for balancing within `csched_pcpu', as
   suggested during review. This way it is not only closer to other
   per-PCPU data (potential cache related benefits), but it is also only
   allocated for the PCPUs credit is in charge of.

Changes from v1:
 * CPU mask variables moved off the stack, as requested during review.
   As per the comments in the code, having them in the private
   (per-scheduler instance) struct could have been enough, but it would
   be racy (again, see comments). For that reason, use a global bunch of
   them (via per_cpu());
 * George suggested a different load balancing logic during v1's review.
   I think he was right, and I changed the old implementation in a way
   that resembles exactly that. I rewrote most of this patch to introduce
   a more sensible and effective node-affinity handling logic.
diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c --- a/xen/common/sched_credit.c +++ b/xen/common/sched_credit.c @@ -112,6 +112,12 @@ /* + * Node Balancing + */ +#define CSCHED_BALANCE_NODE_AFFINITY 0 +#define CSCHED_BALANCE_CPU_AFFINITY 1 + +/* * Boot parameters */ static int __read_mostly sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS; @@ -126,9 +132,20 @@ struct csched_pcpu { struct timer ticker; unsigned int tick; unsigned int idle_bias; + /* Store this here to avoid having too many cpumask_var_t-s on stack */ + cpumask_var_t balance_mask; }; /* + * Convenience macro for accessing the per-PCPU cpumask we need for + * implementing the two steps (vcpu and node affinity) balancing logic. + * It is stored in csched_pcpu so that serialization is not an issue, + * as there is a csched_pcpu for each PCPU and we always hold the + * runqueue spin-lock when using this. + */ +#define csched_balance_mask (CSCHED_PCPU(smp_processor_id())->balance_mask) + +/* * Virtual CPU */ struct csched_vcpu { @@ -161,6 +178,9 @@ struct csched_dom { struct list_head active_vcpu; struct list_head active_sdom_elem; struct domain *dom; + /* cpumask translated from the domain''s node-affinity. + * Basically, the CPUs we prefer to be scheduled on. */ + cpumask_var_t node_affinity_cpumask; uint16_t active_vcpu_count; uint16_t weight; uint16_t cap; @@ -241,6 +261,35 @@ static inline void list_del_init(&svc->runq_elem); } +#define for_each_csched_balance_step(step) \ + for ( (step) = 0; (step) <= CSCHED_BALANCE_CPU_AFFINITY; (step)++ ) + + +/* + * vcpu-affinity balancing is always necessary and must never be skipped. + * OTOH, if a domain''s node-affinity spans all the nodes, we can safely + * avoid dealing with node-affinity entirely. + */ +#define __vcpu_has_node_affinity(vc) \ + ( !cpumask_full(CSCHED_DOM(vc->domain)->node_affinity_cpumask) ) + +/* + * Each csched-balance step uses its own cpumask. This function determines + * which one (given the step) and copies it in mask. For the node-affinity + * balancing step, the pcpus that are not part of vc''s vcpu-affinity are + * filtered out from the result, to avoid running a vcpu where it would + * like, but is not allowed to! + */ +static void +csched_balance_cpumask(const struct vcpu *vc, int step, cpumask_t *mask) +{ + if ( step == CSCHED_BALANCE_NODE_AFFINITY ) + cpumask_and(mask, CSCHED_DOM(vc->domain)->node_affinity_cpumask, + vc->cpu_affinity); + else /* step == CSCHED_BALANCE_CPU_AFFINITY */ + cpumask_copy(mask, vc->cpu_affinity); +} + static void burn_credits(struct csched_vcpu *svc, s_time_t now) { s_time_t delta; @@ -272,12 +321,12 @@ static inline void struct csched_vcpu * const cur = CSCHED_VCPU(curr_on_cpu(cpu)); struct csched_private *prv = CSCHED_PRIV(per_cpu(scheduler, cpu)); cpumask_t mask, idle_mask; - int idlers_empty; + int balance_step, idlers_empty; ASSERT(cur); cpumask_clear(&mask); + idlers_empty = cpumask_empty(prv->idlers); - idlers_empty = cpumask_empty(prv->idlers); /* * If the pcpu is idle, or there are no idlers and the new @@ -297,41 +346,70 @@ static inline void } else if ( !idlers_empty ) { - /* Check whether or not there are idlers that can run new */ - cpumask_and(&idle_mask, prv->idlers, new->vcpu->cpu_affinity); + /* + * Node and vcpu-affinity balancing loop. For vcpus without + * a useful node-affinity, consider vcpu-affinity only. 
+ */ + for_each_csched_balance_step( balance_step ) + { + int new_idlers_empty; - /* - * If there are no suitable idlers for new, and it''s higher - * priority than cur, ask the scheduler to migrate cur away. - * We have to act like this (instead of just waking some of - * the idlers suitable for cur) because cur is running. - * - * If there are suitable idlers for new, no matter priorities, - * leave cur alone (as it is running and is, likely, cache-hot) - * and wake some of them (which is waking up and so is, likely, - * cache cold anyway). - */ - if ( cpumask_empty(&idle_mask) && new->pri > cur->pri ) - { - SCHED_STAT_CRANK(tickle_idlers_none); - SCHED_VCPU_STAT_CRANK(cur, kicked_away); - SCHED_VCPU_STAT_CRANK(cur, migrate_r); - SCHED_STAT_CRANK(migrate_kicked_away); - set_bit(_VPF_migrating, &cur->vcpu->pause_flags); - cpumask_set_cpu(cpu, &mask); - } - else if ( !cpumask_empty(&idle_mask) ) - { - /* Which of the idlers suitable for new shall we wake up? */ - SCHED_STAT_CRANK(tickle_idlers_some); - if ( opt_tickle_one_idle ) + if ( balance_step == CSCHED_BALANCE_NODE_AFFINITY + && !__vcpu_has_node_affinity(new->vcpu) ) + continue; + + /* Are there idlers suitable for new (for this balance step)? */ + csched_balance_cpumask(new->vcpu, balance_step, + csched_balance_mask); + cpumask_and(&idle_mask, prv->idlers, csched_balance_mask); + new_idlers_empty = cpumask_empty(&idle_mask); + + /* + * Let''s not be too harsh! If there aren''t idlers suitable + * for new in its node-affinity mask, make sure we check its + * vcpu-affinity as well, before taking final decisions. + */ + if ( new_idlers_empty + && balance_step == CSCHED_BALANCE_NODE_AFFINITY ) + continue; + + /* + * If there are no suitable idlers for new, and it''s higher + * priority than cur, ask the scheduler to migrate cur away. + * We have to act like this (instead of just waking some of + * the idlers suitable for cur) because cur is running. + * + * If there are suitable idlers for new, no matter priorities, + * leave cur alone (as it is running and is, likely, cache-hot) + * and wake some of them (which is waking up and so is, likely, + * cache cold anyway). + */ + if ( new_idlers_empty && new->pri > cur->pri ) { - this_cpu(last_tickle_cpu) - cpumask_cycle(this_cpu(last_tickle_cpu), &idle_mask); - cpumask_set_cpu(this_cpu(last_tickle_cpu), &mask); + SCHED_STAT_CRANK(tickle_idlers_none); + SCHED_VCPU_STAT_CRANK(cur, kicked_away); + SCHED_VCPU_STAT_CRANK(cur, migrate_r); + SCHED_STAT_CRANK(migrate_kicked_away); + set_bit(_VPF_migrating, &cur->vcpu->pause_flags); + cpumask_set_cpu(cpu, &mask); } - else - cpumask_or(&mask, &mask, &idle_mask); + else if ( !new_idlers_empty ) + { + /* Which of the idlers suitable for new shall we wake up? */ + SCHED_STAT_CRANK(tickle_idlers_some); + if ( opt_tickle_one_idle ) + { + this_cpu(last_tickle_cpu) + cpumask_cycle(this_cpu(last_tickle_cpu), &idle_mask); + cpumask_set_cpu(this_cpu(last_tickle_cpu), &mask); + } + else + cpumask_or(&mask, &mask, &idle_mask); + } + + /* Did we find anyone? 
*/ + if ( !cpumask_empty(&mask) ) + break; } } @@ -376,6 +454,7 @@ csched_free_pdata(const struct scheduler spin_unlock_irqrestore(&prv->lock, flags); + free_cpumask_var(spc->balance_mask); xfree(spc); } @@ -391,6 +470,12 @@ csched_alloc_pdata(const struct schedule if ( spc == NULL ) return NULL; + if ( !alloc_cpumask_var(&spc->balance_mask) ) + { + xfree(spc); + return NULL; + } + spin_lock_irqsave(&prv->lock, flags); /* Initialize/update system-wide config */ @@ -481,15 +566,16 @@ static inline int } static inline int -__csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu) +__csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu, cpumask_t *mask) { /* * Don''t pick up work that''s in the peer''s scheduling tail or hot on - * peer PCPU. Only pick up work that''s allowed to run on our CPU. + * peer PCPU. Only pick up work that prefers and/or is allowed to run + * on our CPU. */ return !vc->is_running && !__csched_vcpu_is_cache_hot(vc) && - cpumask_test_cpu(dest_cpu, vc->cpu_affinity); + cpumask_test_cpu(dest_cpu, mask); } static int @@ -499,97 +585,111 @@ static int cpumask_t idlers; cpumask_t *online; struct csched_pcpu *spc = NULL; - int cpu; + int cpu = vc->processor; + int balance_step; - /* - * Pick from online CPUs in VCPU''s affinity mask, giving a - * preference to its current processor if it''s in there. - */ online = cpupool_scheduler_cpumask(vc->domain->cpupool); - cpumask_and(&cpus, online, vc->cpu_affinity); - cpu = cpumask_test_cpu(vc->processor, &cpus) - ? vc->processor - : cpumask_cycle(vc->processor, &cpus); - ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) ); + for_each_csched_balance_step( balance_step ) + { + if ( balance_step == CSCHED_BALANCE_NODE_AFFINITY + && !__vcpu_has_node_affinity(vc) ) + continue; - /* - * Try to find an idle processor within the above constraints. - * - * In multi-core and multi-threaded CPUs, not all idle execution - * vehicles are equal! - * - * We give preference to the idle execution vehicle with the most - * idling neighbours in its grouping. This distributes work across - * distinct cores first and guarantees we don''t do something stupid - * like run two VCPUs on co-hyperthreads while there are idle cores - * or sockets. - * - * Notice that, when computing the "idleness" of cpu, we may want to - * discount vc. That is, iff vc is the currently running and the only - * runnable vcpu on cpu, we add cpu to the idlers. - */ - cpumask_and(&idlers, &cpu_online_map, CSCHED_PRIV(ops)->idlers); - if ( vc->processor == cpu && IS_RUNQ_IDLE(cpu) ) - cpumask_set_cpu(cpu, &idlers); - cpumask_and(&cpus, &cpus, &idlers); + /* Pick an online CPU from the proper affinity mask */ + csched_balance_cpumask(vc, balance_step, &cpus); + cpumask_and(&cpus, &cpus, online); - /* - * It is important that cpu points to an idle processor, if a suitable - * one exists (and we can use cpus to check and, possibly, choose a new - * CPU, as we just &&-ed it with idlers). In fact, if we are on SMT, and - * cpu points to a busy thread with an idle sibling, both the threads - * will be considered the same, from the "idleness" calculation point - * of view", preventing vcpu from being moved to the thread that is - * actually idle. - */ - if ( !cpumask_empty(&cpus) && !cpumask_test_cpu(cpu, &cpus) ) - cpu = cpumask_cycle(cpu, &cpus); - cpumask_clear_cpu(cpu, &cpus); + /* If present, prefer vc''s current processor */ + cpu = cpumask_test_cpu(vc->processor, &cpus) + ? 
vc->processor + : cpumask_cycle(vc->processor, &cpus); + ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) ); - while ( !cpumask_empty(&cpus) ) - { - cpumask_t cpu_idlers; - cpumask_t nxt_idlers; - int nxt, weight_cpu, weight_nxt; - int migrate_factor; + /* + * Try to find an idle processor within the above constraints. + * + * In multi-core and multi-threaded CPUs, not all idle execution + * vehicles are equal! + * + * We give preference to the idle execution vehicle with the most + * idling neighbours in its grouping. This distributes work across + * distinct cores first and guarantees we don''t do something stupid + * like run two VCPUs on co-hyperthreads while there are idle cores + * or sockets. + * + * Notice that, when computing the "idleness" of cpu, we may want to + * discount vc. That is, iff vc is the currently running and the only + * runnable vcpu on cpu, we add cpu to the idlers. + */ + cpumask_and(&idlers, &cpu_online_map, CSCHED_PRIV(ops)->idlers); + if ( vc->processor == cpu && IS_RUNQ_IDLE(cpu) ) + cpumask_set_cpu(cpu, &idlers); + cpumask_and(&cpus, &cpus, &idlers); - nxt = cpumask_cycle(cpu, &cpus); + /* + * It is important that cpu points to an idle processor, if a suitable + * one exists (and we can use cpus to check and, possibly, choose a new + * CPU, as we just &&-ed it with idlers). In fact, if we are on SMT, and + * cpu points to a busy thread with an idle sibling, both the threads + * will be considered the same, from the "idleness" calculation point + * of view", preventing vcpu from being moved to the thread that is + * actually idle. + */ + if ( !cpumask_empty(&cpus) && !cpumask_test_cpu(cpu, &cpus) ) + cpu = cpumask_cycle(cpu, &cpus); + cpumask_clear_cpu(cpu, &cpus); - if ( cpumask_test_cpu(cpu, per_cpu(cpu_core_mask, nxt)) ) + while ( !cpumask_empty(&cpus) ) { - /* We''re on the same socket, so check the busy-ness of threads. - * Migrate if # of idlers is less at all */ - ASSERT( cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) ); - migrate_factor = 1; - cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_sibling_mask, cpu)); - cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_sibling_mask, nxt)); - } - else - { - /* We''re on different sockets, so check the busy-ness of cores. - * Migrate only if the other core is twice as idle */ - ASSERT( !cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) ); - migrate_factor = 2; - cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_core_mask, cpu)); - cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_core_mask, nxt)); + cpumask_t cpu_idlers; + cpumask_t nxt_idlers; + int nxt, weight_cpu, weight_nxt; + int migrate_factor; + + nxt = cpumask_cycle(cpu, &cpus); + + if ( cpumask_test_cpu(cpu, per_cpu(cpu_core_mask, nxt)) ) + { + /* We''re on the same socket, so check the busy-ness of threads. + * Migrate if # of idlers is less at all */ + ASSERT( cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) ); + migrate_factor = 1; + cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_sibling_mask, + cpu)); + cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_sibling_mask, + nxt)); + } + else + { + /* We''re on different sockets, so check the busy-ness of cores. 
+ * Migrate only if the other core is twice as idle */ + ASSERT( !cpumask_test_cpu(nxt, per_cpu(cpu_core_mask, cpu)) ); + migrate_factor = 2; + cpumask_and(&cpu_idlers, &idlers, per_cpu(cpu_core_mask, cpu)); + cpumask_and(&nxt_idlers, &idlers, per_cpu(cpu_core_mask, nxt)); + } + + weight_cpu = cpumask_weight(&cpu_idlers); + weight_nxt = cpumask_weight(&nxt_idlers); + /* smt_power_savings: consolidate work rather than spreading it */ + if ( sched_smt_power_savings ? + weight_cpu > weight_nxt : + weight_cpu * migrate_factor < weight_nxt ) + { + cpumask_and(&nxt_idlers, &cpus, &nxt_idlers); + spc = CSCHED_PCPU(nxt); + cpu = cpumask_cycle(spc->idle_bias, &nxt_idlers); + cpumask_andnot(&cpus, &cpus, per_cpu(cpu_sibling_mask, cpu)); + } + else + { + cpumask_andnot(&cpus, &cpus, &nxt_idlers); + } } - weight_cpu = cpumask_weight(&cpu_idlers); - weight_nxt = cpumask_weight(&nxt_idlers); - /* smt_power_savings: consolidate work rather than spreading it */ - if ( sched_smt_power_savings ? - weight_cpu > weight_nxt : - weight_cpu * migrate_factor < weight_nxt ) - { - cpumask_and(&nxt_idlers, &cpus, &nxt_idlers); - spc = CSCHED_PCPU(nxt); - cpu = cpumask_cycle(spc->idle_bias, &nxt_idlers); - cpumask_andnot(&cpus, &cpus, per_cpu(cpu_sibling_mask, cpu)); - } - else - { - cpumask_andnot(&cpus, &cpus, &nxt_idlers); - } + /* Stop if cpu is idle */ + if ( cpumask_test_cpu(cpu, &idlers) ) + break; } if ( commit && spc ) @@ -931,6 +1031,13 @@ csched_alloc_domdata(const struct schedu if ( sdom == NULL ) return NULL; + if ( !alloc_cpumask_var(&sdom->node_affinity_cpumask) ) + { + xfree(sdom); + return NULL; + } + cpumask_setall(sdom->node_affinity_cpumask); + /* Initialize credit and weight */ INIT_LIST_HEAD(&sdom->active_vcpu); sdom->active_vcpu_count = 0; @@ -962,6 +1069,9 @@ csched_dom_init(const struct scheduler * static void csched_free_domdata(const struct scheduler *ops, void *data) { + struct csched_dom *sdom = data; + + free_cpumask_var(sdom->node_affinity_cpumask); xfree(data); } @@ -1256,7 +1366,7 @@ csched_tick(void *_cpu) } static struct csched_vcpu * -csched_runq_steal(int peer_cpu, int cpu, int pri) +csched_runq_steal(int peer_cpu, int cpu, int pri, int balance_step) { const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu); const struct vcpu * const peer_vcpu = curr_on_cpu(peer_cpu); @@ -1281,11 +1391,28 @@ csched_runq_steal(int peer_cpu, int cpu, if ( speer->pri <= pri ) break; - /* Is this VCPU is runnable on our PCPU? */ + /* Is this VCPU runnable on our PCPU? */ vc = speer->vcpu; BUG_ON( is_idle_vcpu(vc) ); - if (__csched_vcpu_is_migrateable(vc, cpu)) + /* + * If the vcpu has no useful node-affinity, skip this vcpu. + * In fact, what we want is to check if we have any node-affine + * work to steal, before starting to look at vcpu-affine work. + * + * Notice that, if not even one vCPU on this runq has a useful + * node-affinity, we could have avoid considering this runq for + * a node balancing step in the first place. This, for instance, + * can be implemented by taking note of on what runq there are + * vCPUs with useful node-affinities in some sort of bitmap + * or counter. + */ + if ( balance_step == CSCHED_BALANCE_NODE_AFFINITY + && !__vcpu_has_node_affinity(vc) ) + continue; + + csched_balance_cpumask(vc, balance_step, csched_balance_mask); + if ( __csched_vcpu_is_migrateable(vc, cpu, csched_balance_mask) ) { /* We got a candidate. Grab it! 
*/ TRACE_3D(TRC_CSCHED_STOLEN_VCPU, peer_cpu, @@ -1311,7 +1438,8 @@ csched_load_balance(struct csched_privat struct csched_vcpu *speer; cpumask_t workers; cpumask_t *online; - int peer_cpu; + int peer_cpu, peer_node, bstep; + int node = cpu_to_node(cpu); BUG_ON( cpu != snext->vcpu->processor ); online = cpupool_scheduler_cpumask(per_cpu(cpupool, cpu)); @@ -1328,42 +1456,68 @@ csched_load_balance(struct csched_privat SCHED_STAT_CRANK(load_balance_other); /* - * Peek at non-idling CPUs in the system, starting with our - * immediate neighbour. + * Let''s look around for work to steal, taking both vcpu-affinity + * and node-affinity into account. More specifically, we check all + * the non-idle CPUs'' runq, looking for: + * 1. any node-affine work to steal first, + * 2. if not finding anything, any vcpu-affine work to steal. */ - cpumask_andnot(&workers, online, prv->idlers); - cpumask_clear_cpu(cpu, &workers); - peer_cpu = cpu; + for_each_csched_balance_step( bstep ) + { + /* + * We peek at the non-idling CPUs in a node-wise fashion. In fact, + * it is more likely that we find some node-affine work on our same + * node, not to mention that migrating vcpus within the same node + * could well expected to be cheaper than across-nodes (memory + * stays local, there might be some node-wide cache[s], etc.). + */ + peer_node = node; + do + { + /* Find out what the !idle are in this node */ + cpumask_andnot(&workers, online, prv->idlers); + cpumask_and(&workers, &workers, &node_to_cpumask(peer_node)); + cpumask_clear_cpu(cpu, &workers); - while ( !cpumask_empty(&workers) ) - { - peer_cpu = cpumask_cycle(peer_cpu, &workers); - cpumask_clear_cpu(peer_cpu, &workers); + if ( cpumask_empty(&workers) ) + goto next_node; - /* - * Get ahold of the scheduler lock for this peer CPU. - * - * Note: We don''t spin on this lock but simply try it. Spinning could - * cause a deadlock if the peer CPU is also load balancing and trying - * to lock this CPU. - */ - if ( !pcpu_schedule_trylock(peer_cpu) ) - { - SCHED_STAT_CRANK(steal_trylock_failed); - continue; - } + peer_cpu = cpumask_first(&workers); + do + { + /* + * Get ahold of the scheduler lock for this peer CPU. + * + * Note: We don''t spin on this lock but simply try it. Spinning + * could cause a deadlock if the peer CPU is also load + * balancing and trying to lock this CPU. + */ + if ( !pcpu_schedule_trylock(peer_cpu) ) + { + SCHED_STAT_CRANK(steal_trylock_failed); + peer_cpu = cpumask_cycle(peer_cpu, &workers); + continue; + } - /* - * Any work over there to steal? - */ - speer = cpumask_test_cpu(peer_cpu, online) ? - csched_runq_steal(peer_cpu, cpu, snext->pri) : NULL; - pcpu_schedule_unlock(peer_cpu); - if ( speer != NULL ) - { - *stolen = 1; - return speer; - } + /* Any work over there to steal? */ + speer = cpumask_test_cpu(peer_cpu, online) ? + csched_runq_steal(peer_cpu, cpu, snext->pri, bstep) : NULL; + pcpu_schedule_unlock(peer_cpu); + + /* As soon as one vcpu is found, balancing ends */ + if ( speer != NULL ) + { + *stolen = 1; + return speer; + } + + peer_cpu = cpumask_cycle(peer_cpu, &workers); + + } while( peer_cpu != cpumask_first(&workers) ); + + next_node: + peer_node = cycle_node(peer_node, node_online_map); + } while( peer_node != node ); } out: diff --git a/xen/include/xen/nodemask.h b/xen/include/xen/nodemask.h --- a/xen/include/xen/nodemask.h +++ b/xen/include/xen/nodemask.h @@ -41,6 +41,8 @@ * int last_node(mask) Number highest set bit, or MAX_NUMNODES * int first_unset_node(mask) First node not set in mask, or * MAX_NUMNODES. 
+ * int cycle_node(node, mask) Next node cycling from ''node'', or + * MAX_NUMNODES * * nodemask_t nodemask_of_node(node) Return nodemask with bit ''node'' set * NODE_MASK_ALL Initializer - all bits set @@ -254,6 +256,16 @@ static inline int __first_unset_node(con find_first_zero_bit(maskp->bits, MAX_NUMNODES)); } +#define cycle_node(n, src) __cycle_node((n), &(src), MAX_NUMNODES) +static inline int __cycle_node(int n, const nodemask_t *maskp, int nbits) +{ + int nxt = __next_node(n, maskp, nbits); + + if (nxt == nbits) + nxt = __first_node(maskp, nbits); + return nxt; +} + #define NODE_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(MAX_NUMNODES) #if MAX_NUMNODES <= BITS_PER_LONG
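
[Here is the compressed sketch promised above: a standalone model (plain
bitmasks and invented names like balance_mask(), not the actual scheduler
code) of the two-step balancing idea. Step 0 restricts the candidate set to
the intersection of node-affinity and vcpu-affinity; only if that yields
nothing does step 1 retry with vcpu-affinity alone.

#include <stdio.h>

#define STEP_NODE 0  /* models CSCHED_BALANCE_NODE_AFFINITY */
#define STEP_CPU  1  /* models CSCHED_BALANCE_CPU_AFFINITY  */

static unsigned int balance_mask(unsigned int node_aff,
                                 unsigned int cpu_aff, int step)
{
    /* First step: preferred AND allowed; second step: just allowed. */
    return (step == STEP_NODE) ? (node_aff & cpu_aff) : cpu_aff;
}

int main(void)
{
    unsigned int idlers   = 0x0c;  /* cpus 2,3 are idle       */
    unsigned int node_aff = 0x03;  /* domain prefers cpus 0,1 */
    unsigned int cpu_aff  = 0x0f;  /* may run on cpus 0-3     */

    for ( int step = STEP_NODE; step <= STEP_CPU; step++ )
    {
        unsigned int cand = idlers & balance_mask(node_aff, cpu_aff, step);

        if ( cand )
        {
            printf("step %d: candidates %#x\n", step, cand);
            break;  /* suitable pcpus found: balancing ends here */
        }
        /* no node-affine idlers: fall through to plain vcpu-affinity */
    }
    return 0;
} ]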
Dario Faggioli
2013-Mar-15 02:30 UTC
[PATCH 05 of 11 v4] xen: allow for explicitly specifying node-affinity
Make it possible to pass the node-affinity of a domain to the hypervisor from the upper layers, instead of always being computed automatically. Note that this also required generalizing the Flask hooks for setting and getting the affinity, so that they now deal with both vcpu and node affinity. Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com> Acked-by: Daniel De Graaf <dgdegra@tycho.nsa.gov> Acked-by: George Dunlap <george.dunlap@eu.citrix.com> Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com> --- Changes from v3: * consider it possible for node-affinity and vcpu-affinity to have an empty intersection (as not doing so, when the full series is applied and vcpu-affinity is changed on-line, could result in failing an ASSERT). Changes from v2: * reworked as needed after the merge of Daniel''s IS_PRIV work; * xen.te taken care of, as requested during review. Changes from v1: * added the missing dummy hook for nodeaffinity; * let the permission renaming affect flask policies too. diff --git a/tools/flask/policy/policy/mls b/tools/flask/policy/policy/mls --- a/tools/flask/policy/policy/mls +++ b/tools/flask/policy/policy/mls @@ -70,11 +70,11 @@ mlsconstrain domain transition (( h1 dom h2 ) and (( l1 eq l2 ) or (t1 == mls_priv))); # all the domain "read" ops -mlsconstrain domain { getvcpuaffinity getdomaininfo getvcpuinfo getvcpucontext getaddrsize getextvcpucontext } +mlsconstrain domain { getaffinity getdomaininfo getvcpuinfo getvcpucontext getaddrsize getextvcpucontext } ((l1 dom l2) or (t1 == mls_priv)); # all the domain "write" ops -mlsconstrain domain { setvcpucontext pause unpause resume create max_vcpus destroy setvcpuaffinity scheduler setdomainmaxmem setdomainhandle setdebugging hypercall settime set_target shutdown setaddrsize trigger setextvcpucontext } +mlsconstrain domain { setvcpucontext pause unpause resume create max_vcpus destroy setaffinity scheduler setdomainmaxmem setdomainhandle setdebugging hypercall settime set_target shutdown setaddrsize trigger setextvcpucontext } ((l1 eq l2) or (t1 == mls_priv)); # This is incomplete - similar constraints must be written for all classes diff --git a/tools/flask/policy/policy/modules/xen/xen.if b/tools/flask/policy/policy/modules/xen/xen.if --- a/tools/flask/policy/policy/modules/xen/xen.if +++ b/tools/flask/policy/policy/modules/xen/xen.if @@ -48,7 +48,7 @@ define(`create_domain_common'', ` allow $1 $2:domain { create max_vcpus setdomainmaxmem setaddrsize getdomaininfo hypercall setvcpucontext setextvcpucontext getscheduler getvcpuinfo getvcpuextstate getaddrsize - getvcpuaffinity setvcpuaffinity }; + getaffinity setaffinity }; allow $1 $2:domain2 { set_cpuid settsc setscheduler }; allow $1 $2:security check_context; allow $1 $2:shadow enable; @@ -77,9 +77,9 @@ define(`create_domain_build_label'', ` # manage_domain(priv, target) # Allow managing a running domain define(`manage_domain'', ` - allow $1 $2:domain { getdomaininfo getvcpuinfo getvcpuaffinity + allow $1 $2:domain { getdomaininfo getvcpuinfo getaffinity getaddrsize pause unpause trigger shutdown destroy - setvcpuaffinity setdomainmaxmem getscheduler }; + setaffinity setdomainmaxmem getscheduler }; '') # migrate_domain_out(priv, target) diff --git a/tools/flask/policy/policy/modules/xen/xen.te b/tools/flask/policy/policy/modules/xen/xen.te --- a/tools/flask/policy/policy/modules/xen/xen.te +++ b/tools/flask/policy/policy/modules/xen/xen.te @@ -69,7 +69,7 @@ allow dom0_t xen_t:mmu memorymap; # Allow dom0 to use these domctls on itself. 
For domctls acting on other # domains, see the definitions of create_domain and manage_domain. allow dom0_t dom0_t:domain { - setvcpucontext max_vcpus setvcpuaffinity getvcpuaffinity getscheduler + setvcpucontext max_vcpus setaffinity getaffinity getscheduler getdomaininfo getvcpuinfo getvcpucontext setdomainmaxmem setdomainhandle setdebugging hypercall settime setaddrsize getaddrsize trigger getextvcpucontext setextvcpucontext getvcpuextstate setvcpuextstate diff --git a/xen/common/domain.c b/xen/common/domain.c --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -222,6 +222,7 @@ struct domain *domain_create( spin_lock_init(&d->node_affinity_lock); d->node_affinity = NODE_MASK_ALL; + d->auto_node_affinity = 1; spin_lock_init(&d->shutdown_lock); d->shutdown_code = -1; @@ -362,11 +363,32 @@ void domain_update_node_affinity(struct cpumask_or(cpumask, cpumask, online_affinity); } - for_each_online_node ( node ) - if ( cpumask_intersects(&node_to_cpumask(node), cpumask) ) - node_set(node, nodemask); + if ( d->auto_node_affinity ) + { + /* Node-affinity is automatically computed from all vcpu-affinities */ + for_each_online_node ( node ) + if ( cpumask_intersects(&node_to_cpumask(node), cpumask) ) + node_set(node, nodemask); - d->node_affinity = nodemask; + d->node_affinity = nodemask; + } + else + { + /* Node-affinity is provided by someone else, just filter out cpus + * that are either offline or not in the affinity of any vcpus. */ + nodemask = d->node_affinity; + for_each_node_mask ( node, d->node_affinity ) + if ( !cpumask_intersects(&node_to_cpumask(node), cpumask) ) + node_clear(node, nodemask); + + /* Avoid losing track of node-affinity because a bad + * vcpu-affinity has been specified. */ + if ( !nodes_empty(nodemask) ) + d->node_affinity = nodemask; + } + + sched_set_node_affinity(d, &d->node_affinity); + spin_unlock(&d->node_affinity_lock); free_cpumask_var(online_affinity); @@ -374,6 +396,36 @@ void domain_update_node_affinity(struct } +int domain_set_node_affinity(struct domain *d, const nodemask_t *affinity) +{ + /* Being affine with no nodes is just wrong */ + if ( nodes_empty(*affinity) ) + return -EINVAL; + + spin_lock(&d->node_affinity_lock); + + /* + * Being/becoming explicitly affine to all nodes is not particularly + * useful. Let's take it as the `reset node affinity` command. 
+ */ + if ( nodes_full(*affinity) ) + { + d->auto_node_affinity = 1; + goto out; + } + + d->auto_node_affinity = 0; + d->node_affinity = *affinity; + +out: + spin_unlock(&d->node_affinity_lock); + + domain_update_node_affinity(d); + + return 0; +} + + struct domain *get_domain_by_id(domid_t dom) { struct domain *d; diff --git a/xen/common/domctl.c b/xen/common/domctl.c --- a/xen/common/domctl.c +++ b/xen/common/domctl.c @@ -560,6 +560,26 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe } break; + case XEN_DOMCTL_setnodeaffinity: + case XEN_DOMCTL_getnodeaffinity: + { + if ( op->cmd == XEN_DOMCTL_setnodeaffinity ) + { + nodemask_t new_affinity; + + ret = xenctl_bitmap_to_nodemask(&new_affinity, + &op->u.nodeaffinity.nodemap); + if ( !ret ) + ret = domain_set_node_affinity(d, &new_affinity); + } + else + { + ret = nodemask_to_xenctl_bitmap(&op->u.nodeaffinity.nodemap, + &d->node_affinity); + } + } + break; + case XEN_DOMCTL_setvcpuaffinity: case XEN_DOMCTL_getvcpuaffinity: { diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c --- a/xen/common/keyhandler.c +++ b/xen/common/keyhandler.c @@ -218,6 +218,14 @@ static void cpuset_print(char *set, int *set++ = ''\0''; } +static void nodeset_print(char *set, int size, const nodemask_t *mask) +{ + *set++ = ''[''; + set += nodelist_scnprintf(set, size-2, mask); + *set++ = '']''; + *set++ = ''\0''; +} + static void periodic_timer_print(char *str, int size, uint64_t period) { if ( period == 0 ) @@ -273,6 +281,9 @@ static void dump_domains(unsigned char k dump_pageframe_info(d); + nodeset_print(tmpstr, sizeof(tmpstr), &d->node_affinity); + printk("NODE affinity for domain %d: %s\n", d->domain_id, tmpstr); + printk("VCPU information and callbacks for domain %u:\n", d->domain_id); for_each_vcpu ( d, v ) diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c --- a/xen/common/sched_credit.c +++ b/xen/common/sched_credit.c @@ -261,17 +261,50 @@ static inline void list_del_init(&svc->runq_elem); } +/* + * Translates node-affinity mask into a cpumask, so that we can use it during + * actual scheduling. That of course will contain all the cpus from all the + * set nodes in the original node-affinity mask. + * + * Note that any serialization needed to access mask safely is complete + * responsibility of the caller of this function/hook. + */ +static void csched_set_node_affinity( + const struct scheduler *ops, + struct domain *d, + nodemask_t *mask) +{ + struct csched_dom *sdom; + int node; + + /* Skip idle domain since it doesn''t even have a node_affinity_cpumask */ + if ( unlikely(is_idle_domain(d)) ) + return; + + sdom = CSCHED_DOM(d); + cpumask_clear(sdom->node_affinity_cpumask); + for_each_node_mask( node, *mask ) + cpumask_or(sdom->node_affinity_cpumask, sdom->node_affinity_cpumask, + &node_to_cpumask(node)); +} + #define for_each_csched_balance_step(step) \ for ( (step) = 0; (step) <= CSCHED_BALANCE_CPU_AFFINITY; (step)++ ) /* * vcpu-affinity balancing is always necessary and must never be skipped. - * OTOH, if a domain''s node-affinity spans all the nodes, we can safely - * avoid dealing with node-affinity entirely. + * OTOH, if a domain''s node-affinity is said to be automatically computed + * (or if it just spans all the nodes), we can safely avoid dealing with + * node-affinity entirely. Ah, node-affinity is also deemed meaningless + * in case it has empty intersection with the vcpu''s vcpu-affinity, as it + * would mean trying to schedule it on _no_ pcpu! 
*/ -#define __vcpu_has_node_affinity(vc) \ - ( !cpumask_full(CSCHED_DOM(vc->domain)->node_affinity_cpumask) ) +#define __vcpu_has_node_affinity(vc) \ + ( !(cpumask_full(CSCHED_DOM(vc->domain)->node_affinity_cpumask) \ + || !cpumask_intersects(vc->cpu_affinity, \ + CSCHED_DOM(vc->domain)->node_affinity_cpumask) \ + || vc->domain->auto_node_affinity == 1) ) /* * Each csched-balance step uses its own cpumask. This function determines @@ -284,8 +317,13 @@ static void csched_balance_cpumask(const struct vcpu *vc, int step, cpumask_t *mask) { if ( step == CSCHED_BALANCE_NODE_AFFINITY ) + { cpumask_and(mask, CSCHED_DOM(vc->domain)->node_affinity_cpumask, vc->cpu_affinity); + + if ( unlikely(cpumask_empty(mask)) ) + cpumask_copy(mask, vc->cpu_affinity); + } else /* step == CSCHED_BALANCE_CPU_AFFINITY */ cpumask_copy(mask, vc->cpu_affinity); } @@ -1895,6 +1933,8 @@ const struct scheduler sched_credit_def .adjust = csched_dom_cntl, .adjust_global = csched_sys_cntl, + .set_node_affinity = csched_set_node_affinity, + .pick_cpu = csched_cpu_pick, .do_schedule = csched_schedule, diff --git a/xen/common/schedule.c b/xen/common/schedule.c --- a/xen/common/schedule.c +++ b/xen/common/schedule.c @@ -599,6 +599,11 @@ int cpu_disable_scheduler(unsigned int c return ret; } +void sched_set_node_affinity(struct domain *d, nodemask_t *mask) +{ + SCHED_OP(DOM2OP(d), set_node_affinity, d, mask); +} + int vcpu_set_affinity(struct vcpu *v, const cpumask_t *affinity) { cpumask_t online_affinity; diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h --- a/xen/include/public/domctl.h +++ b/xen/include/public/domctl.h @@ -280,6 +280,16 @@ typedef struct xen_domctl_getvcpuinfo xe DEFINE_XEN_GUEST_HANDLE(xen_domctl_getvcpuinfo_t); +/* Get/set the NUMA node(s) with which the guest has affinity with. */ +/* XEN_DOMCTL_setnodeaffinity */ +/* XEN_DOMCTL_getnodeaffinity */ +struct xen_domctl_nodeaffinity { + struct xenctl_bitmap nodemap;/* IN */ +}; +typedef struct xen_domctl_nodeaffinity xen_domctl_nodeaffinity_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_nodeaffinity_t); + + /* Get/set which physical cpus a vcpu can execute on. */ /* XEN_DOMCTL_setvcpuaffinity */ /* XEN_DOMCTL_getvcpuaffinity */ @@ -908,6 +918,8 @@ struct xen_domctl { #define XEN_DOMCTL_audit_p2m 65 #define XEN_DOMCTL_set_virq_handler 66 #define XEN_DOMCTL_set_broken_page_p2m 67 +#define XEN_DOMCTL_setnodeaffinity 68 +#define XEN_DOMCTL_getnodeaffinity 69 #define XEN_DOMCTL_gdbsx_guestmemio 1000 #define XEN_DOMCTL_gdbsx_pausevcpu 1001 #define XEN_DOMCTL_gdbsx_unpausevcpu 1002 @@ -921,6 +933,7 @@ struct xen_domctl { struct xen_domctl_getpageframeinfo getpageframeinfo; struct xen_domctl_getpageframeinfo2 getpageframeinfo2; struct xen_domctl_getpageframeinfo3 getpageframeinfo3; + struct xen_domctl_nodeaffinity nodeaffinity; struct xen_domctl_vcpuaffinity vcpuaffinity; struct xen_domctl_shadow_op shadow_op; struct xen_domctl_max_mem max_mem; diff --git a/xen/include/xen/nodemask.h b/xen/include/xen/nodemask.h --- a/xen/include/xen/nodemask.h +++ b/xen/include/xen/nodemask.h @@ -8,8 +8,9 @@ * See detailed comments in the file linux/bitmap.h describing the * data type on which these nodemasks are based. * - * For details of nodemask_scnprintf() and nodemask_parse(), - * see bitmap_scnprintf() and bitmap_parse() in lib/bitmap.c. + * For details of nodemask_scnprintf(), nodelist_scnpintf() and + * nodemask_parse(), see bitmap_scnprintf() and bitmap_parse() + * in lib/bitmap.c. 
* * The available nodemask operations are: * @@ -50,6 +51,7 @@ * unsigned long *nodes_addr(mask) Array of unsigned long''s in mask * * int nodemask_scnprintf(buf, len, mask) Format nodemask for printing + * int nodelist_scnprintf(buf, len, mask) Format nodemask as a list for printing * int nodemask_parse(ubuf, ulen, mask) Parse ascii string as nodemask * * for_each_node_mask(node, mask) for-loop node over mask @@ -292,6 +294,14 @@ static inline int __cycle_node(int n, co #define nodes_addr(src) ((src).bits) +#define nodelist_scnprintf(buf, len, src) \ + __nodelist_scnprintf((buf), (len), (src), MAX_NUMNODES) +static inline int __nodelist_scnprintf(char *buf, int len, + const nodemask_t *srcp, int nbits) +{ + return bitmap_scnlistprintf(buf, len, srcp->bits, nbits); +} + #if 0 #define nodemask_scnprintf(buf, len, src) \ __nodemask_scnprintf((buf), (len), &(src), MAX_NUMNODES) diff --git a/xen/include/xen/sched-if.h b/xen/include/xen/sched-if.h --- a/xen/include/xen/sched-if.h +++ b/xen/include/xen/sched-if.h @@ -184,6 +184,8 @@ struct scheduler { struct xen_domctl_scheduler_op *); int (*adjust_global) (const struct scheduler *, struct xen_sysctl_scheduler_op *); + void (*set_node_affinity) (const struct scheduler *, + struct domain *, nodemask_t *); void (*dump_settings) (const struct scheduler *); void (*dump_cpu_state) (const struct scheduler *, int); diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -360,8 +360,12 @@ struct domain /* Various mem_events */ struct mem_event_per_domain *mem_event; - /* Currently computed from union of all vcpu cpu-affinity masks. */ + /* + * Can be specified by the user. If that is not the case, it is + * computed from the union of all the vcpu cpu-affinity masks. 
+ */ nodemask_t node_affinity; + int auto_node_affinity; unsigned int last_alloc_node; spinlock_t node_affinity_lock; }; @@ -430,6 +434,7 @@ static inline void get_knownalive_domain ASSERT(!(atomic_read(&d->refcnt) & DOMAIN_DESTROYED)); } +int domain_set_node_affinity(struct domain *d, const nodemask_t *affinity); void domain_update_node_affinity(struct domain *d); struct domain *domain_create( @@ -550,6 +555,7 @@ void sched_destroy_domain(struct domain int sched_move_domain(struct domain *d, struct cpupool *c); long sched_adjust(struct domain *, struct xen_domctl_scheduler_op *); long sched_adjust_global(struct xen_sysctl_scheduler_op *); +void sched_set_node_affinity(struct domain *, nodemask_t *); int sched_id(void); void sched_tick_suspend(void); void sched_tick_resume(void); diff --git a/xen/xsm/flask/hooks.c b/xen/xsm/flask/hooks.c --- a/xen/xsm/flask/hooks.c +++ b/xen/xsm/flask/hooks.c @@ -611,10 +611,10 @@ static int flask_domctl(struct domain *d return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__UNPAUSE); case XEN_DOMCTL_setvcpuaffinity: - return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__SETVCPUAFFINITY); + return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__SETAFFINITY); case XEN_DOMCTL_getvcpuaffinity: - return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__GETVCPUAFFINITY); + return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__GETAFFINITY); case XEN_DOMCTL_resumedomain: return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__RESUME); diff --git a/xen/xsm/flask/policy/access_vectors b/xen/xsm/flask/policy/access_vectors --- a/xen/xsm/flask/policy/access_vectors +++ b/xen/xsm/flask/policy/access_vectors @@ -103,10 +103,10 @@ class domain max_vcpus # XEN_DOMCTL_destroydomain destroy -# XEN_DOMCTL_setvcpuaffinity - setvcpuaffinity -# XEN_DOMCTL_getvcpuaffinity - getvcpuaffinity +# XEN_DOMCTL_setaffinity + setaffinity +# XEN_DOMCTL_getaffinity + getaffinity # XEN_DOMCTL_scheduler_op with XEN_DOMCTL_SCHEDOP_getinfo getscheduler # XEN_DOMCTL_getdomaininfo, XEN_SYSCTL_getdomaininfolist
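For reference, the core of what domain_set_node_affinity() needs to do, given the auto_node_affinity flag and the sched_set_node_affinity() hook added above, boils down to something like the following. This is a minimal sketch: the exact locking scope and error codes here are assumptions, not necessarily the literal code of the patch.

    int domain_set_node_affinity(struct domain *d, const nodemask_t *affinity)
    {
        /* An empty mask would mean "affine with no node at all": reject it. */
        if ( nodes_empty(*affinity) )
            return -EINVAL;

        spin_lock(&d->node_affinity_lock);

        if ( nodes_full(*affinity) )
        {
            /* A full mask works as a "go back to automatic mode" command. */
            d->auto_node_affinity = 1;
        }
        else
        {
            d->auto_node_affinity = 0;
            d->node_affinity = *affinity;
        }

        spin_unlock(&d->node_affinity_lock);

        /* Let the scheduler know, so it can rebalance accordingly. */
        sched_set_node_affinity(d, &d->node_affinity);

        return 0;
    }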
Dario Faggioli
2013-Mar-15 02:30 UTC
[PATCH 06 of 11 v4] libxc: allow for explicitly specifying node-affinity
By providing the proper get/set interface and wiring them to the new domctl-s from the previous commit. Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com> Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com> Acked-by: George Dunlap <george.dunlap@eu.citrix.com> --- Changes from v3: * handling of XEN_DOMCTL_{set,get}nodeaffinity now happens in two separate `case'' clauses, as suggested during review. diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c --- a/tools/libxc/xc_domain.c +++ b/tools/libxc/xc_domain.c @@ -110,6 +110,83 @@ int xc_domain_shutdown(xc_interface *xch } +int xc_domain_node_setaffinity(xc_interface *xch, + uint32_t domid, + xc_nodemap_t nodemap) +{ + DECLARE_DOMCTL; + DECLARE_HYPERCALL_BUFFER(uint8_t, local); + int ret = -1; + int nodesize; + + nodesize = xc_get_nodemap_size(xch); + if (!nodesize) + { + PERROR("Could not get number of nodes"); + goto out; + } + + local = xc_hypercall_buffer_alloc(xch, local, nodesize); + if ( local == NULL ) + { + PERROR("Could not allocate memory for setnodeaffinity domctl hypercall"); + goto out; + } + + domctl.cmd = XEN_DOMCTL_setnodeaffinity; + domctl.domain = (domid_t)domid; + + memcpy(local, nodemap, nodesize); + set_xen_guest_handle(domctl.u.nodeaffinity.nodemap.bitmap, local); + domctl.u.nodeaffinity.nodemap.nr_bits = nodesize * 8; + + ret = do_domctl(xch, &domctl); + + xc_hypercall_buffer_free(xch, local); + + out: + return ret; +} + +int xc_domain_node_getaffinity(xc_interface *xch, + uint32_t domid, + xc_nodemap_t nodemap) +{ + DECLARE_DOMCTL; + DECLARE_HYPERCALL_BUFFER(uint8_t, local); + int ret = -1; + int nodesize; + + nodesize = xc_get_nodemap_size(xch); + if (!nodesize) + { + PERROR("Could not get number of nodes"); + goto out; + } + + local = xc_hypercall_buffer_alloc(xch, local, nodesize); + if ( local == NULL ) + { + PERROR("Could not allocate memory for getnodeaffinity domctl hypercall"); + goto out; + } + + domctl.cmd = XEN_DOMCTL_getnodeaffinity; + domctl.domain = (domid_t)domid; + + set_xen_guest_handle(domctl.u.nodeaffinity.nodemap.bitmap, local); + domctl.u.nodeaffinity.nodemap.nr_bits = nodesize * 8; + + ret = do_domctl(xch, &domctl); + + memcpy(nodemap, local, nodesize); + + xc_hypercall_buffer_free(xch, local); + + out: + return ret; +} + int xc_vcpu_setaffinity(xc_interface *xch, uint32_t domid, int vcpu, diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h --- a/tools/libxc/xenctrl.h +++ b/tools/libxc/xenctrl.h @@ -521,6 +521,32 @@ int xc_watchdog(xc_interface *xch, uint32_t id, uint32_t timeout); +/** + * This function explicitly sets the host NUMA nodes the domain will + * have affinity with. + * + * @parm xch a handle to an open hypervisor interface. + * @parm domid the domain id one wants to set the affinity of. + * @parm nodemap the map of the affine nodes. + * @return 0 on success, -1 on failure. + */ +int xc_domain_node_setaffinity(xc_interface *xch, + uint32_t domid, + xc_nodemap_t nodemap); + +/** + * This function retrieves the host NUMA nodes the domain has + * affinity with. + * + * @parm xch a handle to an open hypervisor interface. + * @parm domid the domain id one wants to get the node affinity of. + * @parm nodemap the map of the affine nodes. + * @return 0 on success, -1 on failure.
+ */ +int xc_domain_node_getaffinity(xc_interface *xch, + uint32_t domid, + xc_nodemap_t nodemap); + int xc_vcpu_setaffinity(xc_interface *xch, uint32_t domid, int vcpu, diff --git a/xen/common/domctl.c b/xen/common/domctl.c --- a/xen/common/domctl.c +++ b/xen/common/domctl.c @@ -561,22 +561,19 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xe break; case XEN_DOMCTL_setnodeaffinity: + { + nodemask_t new_affinity; + + ret = xenctl_bitmap_to_nodemask(&new_affinity, + &op->u.nodeaffinity.nodemap); + if ( !ret ) + ret = domain_set_node_affinity(d, &new_affinity); + } + break; case XEN_DOMCTL_getnodeaffinity: { - if ( op->cmd == XEN_DOMCTL_setnodeaffinity ) - { - nodemask_t new_affinity; - - ret = xenctl_bitmap_to_nodemask(&new_affinity, - &op->u.nodeaffinity.nodemap); - if ( !ret ) - ret = domain_set_node_affinity(d, &new_affinity); - } - else - { - ret = nodemask_to_xenctl_bitmap(&op->u.nodeaffinity.nodemap, - &d->node_affinity); - } + ret = nodemask_to_xenctl_bitmap(&op->u.nodeaffinity.nodemap, + &d->node_affinity); } break;
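To illustrate the new calls from a client, here is a minimal, hypothetical example; the domain id and node numbers are made up, and it assumes xc_nodemap_t is the plain byte bitmap introduced earlier in the series:

    #include <stdlib.h>
    #include <xenctrl.h>

    /* Bind domain 5 to NUMA nodes 0 and 2. */
    static int bind_to_nodes(void)
    {
        xc_interface *xch = xc_interface_open(NULL, NULL, 0);
        xc_nodemap_t nodemap;
        int nodesize, rc = -1;

        if ( !xch )
            return -1;

        nodesize = xc_get_nodemap_size(xch);
        nodemap = nodesize > 0 ? calloc(1, nodesize) : NULL;
        if ( nodemap )
        {
            nodemap[0 / 8] |= 1 << (0 % 8);   /* node 0 */
            nodemap[2 / 8] |= 1 << (2 % 8);   /* node 2 */
            rc = xc_domain_node_setaffinity(xch, 5, nodemap);
            free(nodemap);
        }
        xc_interface_close(xch);
        return rc;
    }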
Dario Faggioli
2013-Mar-15 02:30 UTC
[PATCH 07 of 11 v4] libxl: allow for explicitly specifying node-affinity
By introducing a nodemap in libxl_domain_build_info and providing the get/set methods to deal with it. Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com> Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com> Acked-by: George Dunlap <george.dunlap@eu.citrix.com> diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c --- a/tools/libxl/libxl.c +++ b/tools/libxl/libxl.c @@ -4164,6 +4164,26 @@ int libxl_set_vcpuaffinity_all(libxl_ctx return rc; } +int libxl_domain_set_nodeaffinity(libxl_ctx *ctx, uint32_t domid, + libxl_bitmap *nodemap) +{ + if (xc_domain_node_setaffinity(ctx->xch, domid, nodemap->map)) { + LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "setting node affinity"); + return ERROR_FAIL; + } + return 0; +} + +int libxl_domain_get_nodeaffinity(libxl_ctx *ctx, uint32_t domid, + libxl_bitmap *nodemap) +{ + if (xc_domain_node_getaffinity(ctx->xch, domid, nodemap->map)) { + LIBXL__LOG_ERRNO(ctx, LIBXL__LOG_ERROR, "getting node affinity"); + return ERROR_FAIL; + } + return 0; +} + int libxl_set_vcpuonline(libxl_ctx *ctx, uint32_t domid, libxl_bitmap *cpumap) { GC_INIT(ctx); diff --git a/tools/libxl/libxl.h b/tools/libxl/libxl.h --- a/tools/libxl/libxl.h +++ b/tools/libxl/libxl.h @@ -273,6 +273,10 @@ #endif #endif +/* For the ''nodemap'' field (of libxl_bitmap type) in + * libxl_domain_build_info, containing the node-affinity of the domain. */ +#define LIBXL_HAVE_DOMAIN_NODEAFFINITY 1 + /* Functions annotated with LIBXL_EXTERNAL_CALLERS_ONLY may not be * called from within libxl itself. Callers outside libxl, who * do not #include libxl_internal.h, are fine. */ @@ -868,6 +872,10 @@ int libxl_set_vcpuaffinity(libxl_ctx *ct libxl_bitmap *cpumap); int libxl_set_vcpuaffinity_all(libxl_ctx *ctx, uint32_t domid, unsigned int max_vcpus, libxl_bitmap *cpumap); +int libxl_domain_set_nodeaffinity(libxl_ctx *ctx, uint32_t domid, + libxl_bitmap *nodemap); +int libxl_domain_get_nodeaffinity(libxl_ctx *ctx, uint32_t domid, + libxl_bitmap *nodemap); int libxl_set_vcpuonline(libxl_ctx *ctx, uint32_t domid, libxl_bitmap *cpumap); libxl_scheduler libxl_get_scheduler(libxl_ctx *ctx); diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c --- a/tools/libxl/libxl_create.c +++ b/tools/libxl/libxl_create.c @@ -191,6 +191,12 @@ int libxl__domain_build_info_setdefault( libxl_defbool_setdefault(&b_info->numa_placement, true); + if (!b_info->nodemap.size) { + if (libxl_node_bitmap_alloc(CTX, &b_info->nodemap, 0)) + return ERROR_FAIL; + libxl_bitmap_set_any(&b_info->nodemap); + } + if (b_info->max_memkb == LIBXL_MEMKB_DEFAULT) b_info->max_memkb = 32 * 1024; if (b_info->target_memkb == LIBXL_MEMKB_DEFAULT) diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c --- a/tools/libxl/libxl_dom.c +++ b/tools/libxl/libxl_dom.c @@ -228,6 +228,7 @@ int libxl__build_pre(libxl__gc *gc, uint if (rc) return rc; } + libxl_domain_set_nodeaffinity(ctx, domid, &info->nodemap); libxl_set_vcpuaffinity_all(ctx, domid, info->max_vcpus, &info->cpumap); xc_domain_setmaxmem(ctx->xch, domid, info->target_memkb + LIBXL_MAXMEM_CONSTANT); diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl --- a/tools/libxl/libxl_types.idl +++ b/tools/libxl/libxl_types.idl @@ -262,6 +262,7 @@ libxl_domain_build_info = Struct("domain ("max_vcpus", integer), ("avail_vcpus", libxl_bitmap), ("cpumap", libxl_bitmap), + ("nodemap", libxl_bitmap), ("numa_placement", libxl_defbool), ("tsc_mode", libxl_tsc_mode), ("max_memkb", MemKB),
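A short usage sketch of the new setter on a running domain (ctx, domid and the node number below are assumptions for illustration):

    #include <libxl.h>
    #include <libxl_utils.h>

    /* Restrict the node-affinity of a running domain to node 1 only. */
    static int set_node1_affinity(libxl_ctx *ctx, uint32_t domid)
    {
        libxl_bitmap nodemap;
        int rc;

        libxl_bitmap_init(&nodemap);
        rc = libxl_node_bitmap_alloc(ctx, &nodemap, 0);
        if (rc) return rc;

        libxl_bitmap_set_none(&nodemap);
        libxl_bitmap_set(&nodemap, 1);
        rc = libxl_domain_set_nodeaffinity(ctx, domid, &nodemap);

        libxl_bitmap_dispose(&nodemap);
        return rc;
    }

At build time, the same effect can presumably be had by filling b_info->nodemap before creating the domain (turning numa_placement off, so automatic placement does not override it).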
Dario Faggioli
2013-Mar-15 02:30 UTC
[PATCH 08 of 11 v4] libxl: optimize the calculation of how many VCPUs can run on a candidate
For choosing the best NUMA placement candidate, we need to figure out how many VCPUs are runnable on each of them. That requires going through all the VCPUs of all the domains and checking their affinities. With this change, instead of doing the above for each candidate, we do it once and for all, populating an array while counting. This way, when we later evaluate candidates, all we need to do is sum up the right elements of the array. This reduces the complexity of the overall algorithm, as it moves a potentially expensive operation (for_each_vcpu_of_each_domain {}) out of the core placement loop, so that it is performed only once instead of (potentially) tens or hundreds of times. More specifically, we go from a worst-case time complexity of: O(2^n_nodes) * O(n_domains*n_domain_vcpus) To, with this change: O(n_domains*n_domain_vcpus) + O(2^n_nodes) = O(2^n_nodes) (with n_nodes<=16, otherwise the algorithm suggests partitioning with cpupools and does not even start.) Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com> Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com> Acked-by: George Dunlap <george.dunlap@eu.citrix.com> --- Changes from v2: * in nr_vcpus_on_nodes(), vcpu_nodemap renamed to something more sensible, as suggested during review. diff --git a/tools/libxl/libxl_numa.c b/tools/libxl/libxl_numa.c --- a/tools/libxl/libxl_numa.c +++ b/tools/libxl/libxl_numa.c @@ -165,22 +165,34 @@ static uint32_t nodemap_to_free_memkb(li return free_memkb; } -/* Retrieve the number of vcpus able to run on the cpus of the nodes - * that are part of the nodemap. */ -static int nodemap_to_nr_vcpus(libxl__gc *gc, libxl_cputopology *tinfo, +/* Retrieve the number of vcpus able to run on the nodes in nodemap */ +static int nodemap_to_nr_vcpus(libxl__gc *gc, int vcpus_on_node[], const libxl_bitmap *nodemap) { + int i, nr_vcpus = 0; + + libxl_for_each_set_bit(i, *nodemap) + nr_vcpus += vcpus_on_node[i]; + + return nr_vcpus; +} + +/* Number of vcpus able to run on the cpus of the various nodes + * (reported by filling the array vcpus_on_node[]). */ +static int nr_vcpus_on_nodes(libxl__gc *gc, libxl_cputopology *tinfo, + const libxl_bitmap *suitable_cpumap, + int vcpus_on_node[]) +{ libxl_dominfo *dinfo = NULL; - libxl_bitmap vcpu_nodemap; + libxl_bitmap nodes_counted; int nr_doms, nr_cpus; - int nr_vcpus = 0; int i, j, k; dinfo = libxl_list_domain(CTX, &nr_doms); if (dinfo == NULL) return ERROR_FAIL; - if (libxl_node_bitmap_alloc(CTX, &vcpu_nodemap, 0) < 0) { + if (libxl_node_bitmap_alloc(CTX, &nodes_counted, 0) < 0) { libxl_dominfo_list_free(dinfo, nr_doms); return ERROR_FAIL; } @@ -193,19 +205,17 @@ static int nodemap_to_nr_vcpus(libxl__gc if (vinfo == NULL) continue; - /* For each vcpu of each domain ...
*/ for (j = 0; j < nr_dom_vcpus; j++) { + /* For each vcpu of each domain, increment the elements of + * the array corresponding to the nodes where the vcpu runs */ + libxl_bitmap_set_none(&nodes_counted); + libxl_for_each_set_bit(k, vinfo[j].cpumap) { + int node = tinfo[k].node; - /* Build up a map telling on which nodes the vcpu is runnable on */ - libxl_bitmap_set_none(&vcpu_nodemap); - libxl_for_each_set_bit(k, vinfo[j].cpumap) - libxl_bitmap_set(&vcpu_nodemap, tinfo[k].node); - - /* And check if that map has any intersection with our nodemap */ - libxl_for_each_set_bit(k, vcpu_nodemap) { - if (libxl_bitmap_test(nodemap, k)) { - nr_vcpus++; - break; + if (libxl_bitmap_test(suitable_cpumap, k) && + !libxl_bitmap_test(&nodes_counted, node)) { + libxl_bitmap_set(&nodes_counted, node); + vcpus_on_node[node]++; } } } @@ -213,9 +223,9 @@ static int nodemap_to_nr_vcpus(libxl__gc libxl_vcpuinfo_list_free(vinfo, nr_dom_vcpus); } - libxl_bitmap_dispose(&vcpu_nodemap); + libxl_bitmap_dispose(&nodes_counted); libxl_dominfo_list_free(dinfo, nr_doms); - return nr_vcpus; + return 0; } /* @@ -270,7 +280,7 @@ int libxl__get_numa_candidate(libxl__gc libxl_numainfo *ninfo = NULL; int nr_nodes = 0, nr_suit_nodes, nr_cpus = 0; libxl_bitmap suitable_nodemap, nodemap; - int rc = 0; + int *vcpus_on_node, rc = 0; libxl_bitmap_init(&nodemap); libxl_bitmap_init(&suitable_nodemap); @@ -281,6 +291,8 @@ int libxl__get_numa_candidate(libxl__gc if (ninfo == NULL) return ERROR_FAIL; + GCNEW_ARRAY(vcpus_on_node, nr_nodes); + /* * The good thing about this solution is that it is based on heuristics * (implemented in numa_cmpf() ), but we at least can evaluate it on @@ -330,6 +342,19 @@ int libxl__get_numa_candidate(libxl__gc goto out; /* + * Later on, we will try to figure out how many vcpus are runnable on + * each candidate (as a part of choosing the best one of them). That + * requires going through all the vcpus of all the domains and check + * their affinities. So, instead of doing that for each candidate, + * let''s count here the number of vcpus runnable on each node, so that + * all we have to do later is summing up the right elements of the + * vcpus_on_node array. + */ + rc = nr_vcpus_on_nodes(gc, tinfo, suitable_cpumap, vcpus_on_node); + if (rc) + goto out; + + /* * If the minimum number of NUMA nodes is not explicitly specified * (i.e., min_nodes == 0), we try to figure out a sensible number of nodes * from where to start generating candidates, if possible (or just start @@ -414,7 +439,8 @@ int libxl__get_numa_candidate(libxl__gc * current best one (if any). */ libxl__numa_candidate_put_nodemap(gc, &new_cndt, &nodemap); - new_cndt.nr_vcpus = nodemap_to_nr_vcpus(gc, tinfo, &nodemap); + new_cndt.nr_vcpus = nodemap_to_nr_vcpus(gc, vcpus_on_node, + &nodemap); new_cndt.free_memkb = nodes_free_memkb; new_cndt.nr_nodes = libxl_bitmap_count_set(&nodemap); new_cndt.nr_cpus = nodes_cpus;
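The shape of the optimization, boiled down to a toy model detached from libxl (names here are made up): the expensive per-vcpu scan runs once and fills a per-node counter array, after which scoring any of the up-to-2^n_nodes candidates is a cheap sum:

    /* Phase 1 (done once): for each vcpu of each domain, bump the counter
     * of every node it can run on -- the O(n_domains*n_domain_vcpus) part.
     * Phase 2 (done per candidate): just sum the counters of the nodes
     * that are part of the candidate. */
    static int score_candidate(const int vcpus_on_node[], int nr_nodes,
                               unsigned long candidate_nodes /* bitmask */)
    {
        int node, score = 0;

        for (node = 0; node < nr_nodes; node++)
            if (candidate_nodes & (1UL << node))
                score += vcpus_on_node[node];
        return score;
    }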
Dario Faggioli
2013-Mar-15 02:30 UTC
[PATCH 09 of 11 v4] libxl: automatic placement deals with node-affinity
Which basically means the following two things: 1) during domain creation, it is the node-affinity of the domain --rather than the vcpu-affinities of its VCPUs-- that is affected by automatic placement; 2) during automatic placement, when counting how many VCPUs are already "bound" to a placement candidate (as part of the process of choosing the best candidate), both vcpu-affinity and node-affinity are considered. Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com> Acked-by: George Dunlap <george.dunlap@eu.citrix.com> Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com> diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c --- a/tools/libxl/libxl_dom.c +++ b/tools/libxl/libxl_dom.c @@ -133,13 +133,13 @@ static int numa_place_domain(libxl__gc * { int found; libxl__numa_candidate candidate; - libxl_bitmap candidate_nodemap; + libxl_bitmap cpupool_nodemap; libxl_cpupoolinfo cpupool_info; int i, cpupool, rc = 0; uint32_t memkb; libxl__numa_candidate_init(&candidate); - libxl_bitmap_init(&candidate_nodemap); + libxl_bitmap_init(&cpupool_nodemap); /* * Extract the cpumap from the cpupool the domain belong to. In fact, @@ -156,7 +156,7 @@ static int numa_place_domain(libxl__gc * rc = libxl_domain_need_memory(CTX, info, &memkb); if (rc) goto out; - if (libxl_node_bitmap_alloc(CTX, &candidate_nodemap, 0)) { + if (libxl_node_bitmap_alloc(CTX, &cpupool_nodemap, 0)) { rc = ERROR_FAIL; goto out; } @@ -174,17 +174,19 @@ static int numa_place_domain(libxl__gc * if (found == 0) goto out; - /* Map the candidate''s node map to the domain''s info->cpumap */ - libxl__numa_candidate_get_nodemap(gc, &candidate, &candidate_nodemap); - rc = libxl_nodemap_to_cpumap(CTX, &candidate_nodemap, &info->cpumap); + /* Map the candidate''s node map to the domain''s info->nodemap */ + libxl__numa_candidate_get_nodemap(gc, &candidate, &info->nodemap); + + /* Avoid trying to set the affinity to nodes that might be in the + * candidate''s nodemap but out of our cpupool. */ + rc = libxl_cpumap_to_nodemap(CTX, &cpupool_info.cpumap, + &cpupool_nodemap); if (rc) goto out; - /* Avoid trying to set the affinity to cpus that might be in the - * nodemap but not in our cpupool. */ - libxl_for_each_set_bit(i, info->cpumap) { - if (!libxl_bitmap_test(&cpupool_info.cpumap, i)) - libxl_bitmap_reset(&info->cpumap, i); + libxl_for_each_set_bit(i, info->nodemap) { + if (!libxl_bitmap_test(&cpupool_nodemap, i)) + libxl_bitmap_reset(&info->nodemap, i); } LOG(DETAIL, "NUMA placement candidate with %d nodes, %d cpus and " @@ -193,7 +195,7 @@ static int numa_place_domain(libxl__gc * out: libxl__numa_candidate_dispose(&candidate); - libxl_bitmap_dispose(&candidate_nodemap); + libxl_bitmap_dispose(&cpupool_nodemap); libxl_cpupoolinfo_dispose(&cpupool_info); return rc; } @@ -209,10 +211,10 @@ int libxl__build_pre(libxl__gc *gc, uint /* * Check if the domain has any CPU affinity. If not, try to build * up one. In case numa_place_domain() find at least a suitable - * candidate, it will affect info->cpumap accordingly; if it + * candidate, it will affect info->nodemap accordingly; if it * does not, it just leaves it as it is. This means (unless * some weird error manifests) the subsequent call to - * libxl_set_vcpuaffinity_all() will do the actual placement, + * libxl_domain_set_nodeaffinity() will do the actual placement, * whatever that turns out to be. 
*/ if (libxl_defbool_val(info->numa_placement)) { diff --git a/tools/libxl/libxl_numa.c b/tools/libxl/libxl_numa.c --- a/tools/libxl/libxl_numa.c +++ b/tools/libxl/libxl_numa.c @@ -184,7 +184,7 @@ static int nr_vcpus_on_nodes(libxl__gc * int vcpus_on_node[]) { libxl_dominfo *dinfo = NULL; - libxl_bitmap nodes_counted; + libxl_bitmap dom_nodemap, nodes_counted; int nr_doms, nr_cpus; int i, j, k; @@ -197,6 +197,12 @@ static int nr_vcpus_on_nodes(libxl__gc * return ERROR_FAIL; } + if (libxl_node_bitmap_alloc(CTX, &dom_nodemap, 0) < 0) { + libxl_bitmap_dispose(&nodes_counted); + libxl_dominfo_list_free(dinfo, nr_doms); + return ERROR_FAIL; + } + for (i = 0; i < nr_doms; i++) { libxl_vcpuinfo *vinfo; int nr_dom_vcpus; @@ -205,14 +211,21 @@ static int nr_vcpus_on_nodes(libxl__gc * if (vinfo == NULL) continue; + /* Retrieve the domain''s node-affinity map */ + libxl_domain_get_nodeaffinity(CTX, dinfo[i].domid, &dom_nodemap); + for (j = 0; j < nr_dom_vcpus; j++) { - /* For each vcpu of each domain, increment the elements of - * the array corresponding to the nodes where the vcpu runs */ + /* + * For each vcpu of each domain, it must have both vcpu-affinity + * and node-affinity to (a pcpu belonging to) a certain node to + * cause an increment in the corresponding element of the array. + */ libxl_bitmap_set_none(&nodes_counted); libxl_for_each_set_bit(k, vinfo[j].cpumap) { int node = tinfo[k].node; if (libxl_bitmap_test(suitable_cpumap, k) && + libxl_bitmap_test(&dom_nodemap, node) && !libxl_bitmap_test(&nodes_counted, node)) { libxl_bitmap_set(&nodes_counted, node); vcpus_on_node[node]++; @@ -223,6 +236,7 @@ static int nr_vcpus_on_nodes(libxl__gc * libxl_vcpuinfo_list_free(vinfo, nr_dom_vcpus); } + libxl_bitmap_dispose(&dom_nodemap); libxl_bitmap_dispose(&nodes_counted); libxl_dominfo_list_free(dinfo, nr_doms); return 0;
Dario Faggioli
2013-Mar-15 02:30 UTC
[PATCH 10 of 11 v4] xl: add node-affinity to the output of `xl list`
Node-affinity is now something that is under (some) control of the user, so show it upon request as part of the output of `xl list'' by the `-n'' option. Re the patch, the print_bitmap() related hunk is _mostly_ code motion, although there is a very minor change in the code, basically to allow using the function for printing both cpu and node bitmaps (as, in case all bits are sets, it used to print "any cpu", which doesn''t fit the nodemap case). Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com> Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com> --- Changes from v3: * removed the pastebin.com link from the changelog. Will reply to this same mail with another one with the output of various invocations of `xl list'' with different options to show how it looks like. Changes from v1: * print_{cpu,node}map() functions added instead of ''state variable''-izing print_bitmap(). diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c --- a/tools/libxl/xl_cmdimpl.c +++ b/tools/libxl/xl_cmdimpl.c @@ -3030,14 +3030,95 @@ out: } } -static void list_domains(int verbose, int context, const libxl_dominfo *info, int nb_domain) +/* If map is not full, prints it and returns 0. Returns 1 otherwise. */ +static int print_bitmap(uint8_t *map, int maplen, FILE *stream) +{ + int i; + uint8_t pmap = 0, bitmask = 0; + int firstset = 0, state = 0; + + for (i = 0; i < maplen; i++) { + if (i % 8 == 0) { + pmap = *map++; + bitmask = 1; + } else bitmask <<= 1; + + switch (state) { + case 0: + case 2: + if ((pmap & bitmask) != 0) { + firstset = i; + state++; + } + continue; + case 1: + case 3: + if ((pmap & bitmask) == 0) { + fprintf(stream, "%s%d", state > 1 ? "," : "", firstset); + if (i - 1 > firstset) + fprintf(stream, "-%d", i - 1); + state = 2; + } + continue; + } + } + switch (state) { + case 0: + fprintf(stream, "none"); + break; + case 2: + break; + case 1: + if (firstset == 0) + return 1; + case 3: + fprintf(stream, "%s%d", state > 1 ? 
"," : "", firstset); + if (i - 1 > firstset) + fprintf(stream, "-%d", i - 1); + break; + } + + return 0; +} + +static void print_cpumap(uint8_t *map, int maplen, FILE *stream) +{ + if (print_bitmap(map, maplen, stream)) + fprintf(stream, "any cpu"); +} + +static void print_nodemap(uint8_t *map, int maplen, FILE *stream) +{ + if (print_bitmap(map, maplen, stream)) + fprintf(stream, "any node"); +} + +static void list_domains(int verbose, int context, int numa, const libxl_dominfo *info, int nb_domain) { int i; static const char shutdown_reason_letters[]= "-rscw"; + libxl_bitmap nodemap; + libxl_physinfo physinfo; + + libxl_bitmap_init(&nodemap); + libxl_physinfo_init(&physinfo); printf("Name ID Mem VCPUs\tState\tTime(s)"); if (verbose) printf(" UUID Reason-Code\tSecurity Label"); if (context && !verbose) printf(" Security Label"); + if (numa) { + if (libxl_node_bitmap_alloc(ctx, &nodemap, 0)) { + fprintf(stderr, "libxl_node_bitmap_alloc_failed.\n"); + exit(1); + } + if (libxl_get_physinfo(ctx, &physinfo) != 0) { + fprintf(stderr, "libxl_physinfo failed.\n"); + libxl_bitmap_dispose(&nodemap); + exit(1); + } + + printf(" NODE Affinity"); + } printf("\n"); for (i = 0; i < nb_domain; i++) { char *domname; @@ -3071,14 +3152,23 @@ static void list_domains(int verbose, in rc = libxl_flask_sid_to_context(ctx, info[i].ssidref, &buf, &size); if (rc < 0) - printf(" -"); + printf(" -"); else { - printf(" %s", buf); + printf(" %16s", buf); free(buf); } } + if (numa) { + libxl_domain_get_nodeaffinity(ctx, info[i].domid, &nodemap); + + putchar('' ''); + print_nodemap(nodemap.map, physinfo.nr_nodes, stdout); + } putchar(''\n''); } + + libxl_bitmap_dispose(&nodemap); + libxl_physinfo_dispose(&physinfo); } static void list_vm(void) @@ -3945,10 +4035,12 @@ int main_list(int argc, char **argv) int opt, verbose = 0; int context = 0; int details = 0; + int numa = 0; static struct option opts[] = { {"long", 0, 0, ''l''}, {"verbose", 0, 0, ''v''}, {"context", 0, 0, ''Z''}, + {"numa", 0, 0, ''n''}, COMMON_LONG_OPTS, {0, 0, 0, 0} }; @@ -3957,7 +4049,7 @@ int main_list(int argc, char **argv) libxl_dominfo *info, *info_free=0; int nb_domain, rc; - SWITCH_FOREACH_OPT(opt, "lvhZ", opts, "list", 0) { + SWITCH_FOREACH_OPT(opt, "lvhZn", opts, "list", 0) { case ''l'': details = 1; break; @@ -3967,6 +4059,9 @@ int main_list(int argc, char **argv) case ''Z'': context = 1; break; + case ''n'': + numa = 1; + break; } if (optind >= argc) { @@ -3998,7 +4093,7 @@ int main_list(int argc, char **argv) if (details) list_domains_details(info, nb_domain); else - list_domains(verbose, context, info, nb_domain); + list_domains(verbose, context, numa, info, nb_domain); if (info_free) libxl_dominfo_list_free(info, nb_domain); @@ -4247,56 +4342,6 @@ int main_button_press(int argc, char **a return 0; } -static void print_bitmap(uint8_t *map, int maplen, FILE *stream) -{ - int i; - uint8_t pmap = 0, bitmask = 0; - int firstset = 0, state = 0; - - for (i = 0; i < maplen; i++) { - if (i % 8 == 0) { - pmap = *map++; - bitmask = 1; - } else bitmask <<= 1; - - switch (state) { - case 0: - case 2: - if ((pmap & bitmask) != 0) { - firstset = i; - state++; - } - continue; - case 1: - case 3: - if ((pmap & bitmask) == 0) { - fprintf(stream, "%s%d", state > 1 ? 
"," : "", firstset); - if (i - 1 > firstset) - fprintf(stream, "-%d", i - 1); - state = 2; - } - continue; - } - } - switch (state) { - case 0: - fprintf(stream, "none"); - break; - case 2: - break; - case 1: - if (firstset == 0) { - fprintf(stream, "any cpu"); - break; - } - case 3: - fprintf(stream, "%s%d", state > 1 ? "," : "", firstset); - if (i - 1 > firstset) - fprintf(stream, "-%d", i - 1); - break; - } -} - static void print_vcpuinfo(uint32_t tdomid, const libxl_vcpuinfo *vcpuinfo, uint32_t nr_cpus) @@ -4320,7 +4365,7 @@ static void print_vcpuinfo(uint32_t tdom /* TIM */ printf("%9.1f ", ((float)vcpuinfo->vcpu_time / 1e9)); /* CPU AFFINITY */ - print_bitmap(vcpuinfo->cpumap.map, nr_cpus, stdout); + print_cpumap(vcpuinfo->cpumap.map, nr_cpus, stdout); printf("\n"); } diff --git a/tools/libxl/xl_cmdtable.c b/tools/libxl/xl_cmdtable.c --- a/tools/libxl/xl_cmdtable.c +++ b/tools/libxl/xl_cmdtable.c @@ -50,7 +50,8 @@ struct cmd_spec cmd_table[] = { "[options] [Domain]\n", "-l, --long Output all VM details\n" "-v, --verbose Prints out UUIDs and security context\n" - "-Z, --context Prints out security context" + "-Z, --context Prints out security context\n" + "-n, --numa Prints out NUMA node affinity" }, { "destroy", &main_destroy, 0, 1,
Dario Faggioli
2013-Mar-15 02:30 UTC
[PATCH 11 of 11 v4] docs: rearrange and update NUMA placement documentation
To include the new concept of NUMA aware scheduling and describe its impact. Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com> Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com> Acked-by: George Dunlap <george.dunlap@eu.citrix.com> diff --git a/docs/misc/xl-numa-placement.markdown b/docs/misc/xl-numa-placement.markdown --- a/docs/misc/xl-numa-placement.markdown +++ b/docs/misc/xl-numa-placement.markdown @@ -14,22 +14,67 @@ the memory directly attached to the set The Xen hypervisor deals with NUMA machines by assigning to each domain a "node affinity", i.e., a set of NUMA nodes of the host from which they -get their memory allocated. +get their memory allocated. Also, even if the node affinity of a domain +is allowed to change on-line, it is very important to "place" the domain +correctly when it is first created, as most of its memory is allocated +at that time and cannot (for now) be moved easily. NUMA awareness becomes very important as soon as many domains start running memory-intensive workloads on a shared host. In fact, the cost of accessing non node-local memory locations is very high, and the performance degradation is likely to be noticeable. -## Guest Placement in xl ## +For more information, have a look at the [Xen NUMA Introduction][numa_intro] +page on the Wiki. + +### Placing via pinning and cpupools ### + +The simplest way of placing a domain on a NUMA node is statically pinning +the domain''s vCPUs to the pCPUs of the node. This goes under the name of +CPU affinity and can be set through the "cpus=" option in the config file +(more about this below). Another option is to pool together the pCPUs +spanning the node and put the domain in such a cpupool with the "pool=" +config option (as documented in our [Wiki][cpupools_howto]). + +In both the above cases, the domain will not be able to execute outside +the specified set of pCPUs for any reason, even if all those pCPUs are +busy doing something else while there are other, idle, pCPUs. + +So, when doing this, local memory accesses are 100% guaranteed, but that +may come at the cost of some load imbalances. + +### NUMA aware scheduling ### + +If the credit scheduler is in use, the concept of node affinity defined +above does not only apply to memory. In fact, starting from Xen 4.3, the +scheduler always tries to run the domain''s vCPUs on one of the nodes in +its node affinity. Only if that turns out to be impossible will it just +pick any free pCPU. + +This is, therefore, something more flexible than CPU affinity, as a domain +can still run everywhere; it just prefers some nodes over others. +Locality of access is less guaranteed than in the pinning case, but that +comes along with better chances to exploit all the host resources (e.g., +the pCPUs). + +In fact, if all the pCPUs in a domain''s node affinity are busy, it is +possible for the domain to run elsewhere, but it is very likely that +slower execution (due to remote memory accesses) is still better than no +execution at all, as would happen with pinning. For this reason, NUMA +aware scheduling has the potential of bringing substantial performance +benefits, although this will depend on the workload. + +## Guest placement in xl ## If using xl for creating and managing guests, it is very easy to ask for both manual or automatic placement of them across the host''s NUMA nodes. -Note that xm/xend does the very same thing, the only differences residing -in the details of the heuristics adopted for the placement (see below).
+Note that xm/xend does a very similar thing, the only differences being +the details of the heuristics adopted for automatic placement (see below), +and the lack of support (in both xm/xend and the Xen versions where that +was the default toolstack) for NUMA aware scheduling. -### Manual Guest Placement with xl ### +### Placing the guest manually ### Thanks to the "cpus=" option, it is possible to specify where a domain should be created and scheduled on, directly in its config file. This @@ -41,14 +86,19 @@ This is very simple and effective, but r administrator to explicitly specify affinities for each and every domain, or Xen won''t be able to guarantee the locality for their memory accesses. -It is also possible to deal with NUMA by partitioning the system using -cpupools. Again, this could be "The Right Answer" for many needs and -occasions, but has to be carefully considered and setup by hand. +Notice that this also pins the domain''s vCPUs to the specified set of +pCPUs, so it not only sets the domain''s node affinity (its memory will +come from the nodes to which the pCPUs belong), but at the same time +forces the vCPUs of the domain to be scheduled on those same pCPUs. -### Automatic Guest Placement with xl ### +### Placing the guest automatically ### If no "cpus=" option is specified in the config file, libxl tries to figure out on its own on which node(s) the domain could fit best. +If it finds one (some), the domain''s node affinity gets set there, +and both memory allocations and NUMA aware scheduling (for the credit +scheduler and starting from Xen 4.3) will comply with it. + It is worthwhile noting that optimally fitting a set of VMs on the NUMA nodes of an host is an incarnation of the Bin Packing Problem. In fact, the various VMs with different memory sizes are the items to be packed, @@ -81,7 +131,7 @@ largest amounts of free memory helps kee small, and maximizes the probability of being able to put more domains there. -## Guest Placement within libxl ## +## Guest placement in libxl ## xl achieves automatic NUMA placement because that is what libxl does by default. No API is provided (yet) for modifying the behaviour of @@ -93,15 +143,34 @@ any placement from happening: libxl_defbool_set(&domain_build_info->numa_placement, false); Also, if `numa_placement` is set to `true`, the domain must not -have any cpu affinity (i.e., `domain_build_info->cpumap` must +have any CPU affinity (i.e., `domain_build_info->cpumap` must have all its bits set, as it is by default), or domain creation will fail returning `ERROR_INVAL`. +Starting from Xen 4.3, in case automatic placement happens (and is +successful), it will affect the domain''s node affinity and _not_ its +CPU affinity. Namely, the domain''s vCPUs will not be pinned to any +pCPU on the host, but the memory for the domain will come from the +selected node(s) and the NUMA aware scheduling (if the credit scheduler +is in use) will try to keep the domain there as much as possible. + Besides than that, looking and/or tweaking the placement algorithm search "Automatic NUMA placement" in libxl\_internal.h. Note this may change in future versions of Xen/libxl. +## Xen < 4.3 ## + +As NUMA aware scheduling is a new feature of Xen 4.3, things are a little +bit different for earlier versions of Xen. If no "cpus=" option is specified +and Xen 4.2 is in use, the automatic placement algorithm still runs, but +the result is used to _pin_ the vCPUs of the domain to the output node(s).
+This is consistent with what was happening with xm/xend, which also +affected the domain''s CPU affinity. + +On a version of Xen earlier than 4.2, there is no automatic placement at +all in xl or libxl, and hence neither node nor CPU affinity is affected. + ## Limitations ## Analyzing various possible placement solutions is what makes the @@ -109,3 +178,6 @@ algorithm flexible and quite effective. it won''t scale well to systems with arbitrary number of nodes. For this reason, automatic placement is disabled (with a warning) if it is requested on a host with more than 16 NUMA nodes. + +[numa_intro]: http://wiki.xen.org/wiki/Xen_NUMA_Introduction +[cpupools_howto]: http://wiki.xen.org/wiki/Cpupools_Howto
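As a concrete illustration of the two manual placement approaches the document describes, a domain config file could contain either of the following (the pCPU range and pool name are made-up examples):

    # Pin the vCPUs to the pCPUs of one node: this sets vcpu-affinity
    # and, through it, node-affinity.
    cpus = "4-7"

    # Or put the domain in a cpupool spanning just that node (the pool
    # must exist already, e.g. created with `xl cpupool-create'').
    pool = "pool-node1"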
Dario Faggioli
2013-Mar-15 03:03 UTC
Re: [PATCH 10 of 11 v4] xl: add node-affinity to the output of `xl list`
On ven, 2013-03-15 at 03:30 +0100, Dario Faggioli wrote:> Node-affinity is now something that is under (some) control of the > user, so show it upon request as part of the output of `xl list'' > by the `-n'' option. > > Re the patch, the print_bitmap() related hunk is _mostly_ code motion, > although there is a very minor change in the code, basically to allow > using the function for printing both cpu and node bitmaps (as, in case > all bits are sets, it used to print "any cpu", which doesn''t fit the > nodemap case). >For the records, this is how the output of various invocations of `xl list ...'', with different set of parameters looks like after this change: root@Zhaman:~# xl list Name ID Mem VCPUs State Time(s) Domain-0 0 375 16 r----- 225.1 vm1 1 960 2 -b---- 34.0 root@Zhaman:~# xl list -Z Name ID Mem VCPUs State Time(s) Security Label Domain-0 0 375 16 r----- 225.4 - vm1 1 960 2 -b---- 34.0 - root@Zhaman:~# xl list -v Name ID Mem VCPUs State Time(s) UUID Reason-Code Security Label Domain-0 0 375 16 r----- 226.6 00000000-0000-0000-0000-000000000000 - - vm1 1 960 2 -b---- 34.2 e36429cc-d2a2-4da7-b21d-b053f725e7a7 - - root@Zhaman:~# xl list -n Name ID Mem VCPUs State Time(s) NODE Affinity Domain-0 0 375 16 r----- 226.8 any node vm1 1 960 2 -b---- 34.2 0 root@Zhaman:~# xl list -nZ Name ID Mem VCPUs State Time(s) Security Label NODE Affinity Domain-0 0 375 16 r----- 226.9 - any node vm1 1 960 2 -b---- 34.2 - 0 root@Zhaman:~# xl list -nv Name ID Mem VCPUs State Time(s) UUID Reason-Code Security Label NODE Affinity Domain-0 0 375 16 r----- 227.0 00000000-0000-0000-0000-000000000000 - - any node vm1 1 960 2 -b---- 34.2 e36429cc-d2a2-4da7-b21d-b053f725e7a7 - - 0 (the same thing should be available here: http://pastebin.com/68kUuwyE ) Regards, Dario -- <<This happens because I choose it to happen!>> (Raistlin Majere) ----------------------------------------------------------------- Dario Faggioli, Ph.D, http://about.me/dario.faggioli Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK) _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Jan Beulich
2013-Mar-15 08:14 UTC
Re: [PATCH 03 of 11 v4] xen: sched_credit: when picking, make sure we get an idle one, if any
>>> On 15.03.13 at 03:30, Dario Faggioli <dario.faggioli@citrix.com> wrote: > The pcpu picking algorithm treats two threads of a SMT core the same. > More specifically, if one is idle and the other one is busy, they both > will be assigned a weight of 1. Therefore, when picking begins, if the > first target pcpu is the busy thread (and if there are no other idle > pcpu than its sibling), that will never change. > > This change fixes this by ensuring that, before entering the core of > the picking algorithm, the target pcpu is an idle one (if there is an > idle pcpu at all, of course). > > Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com> > Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com> > Acked-by: George Dunlap <george.dunlap@eu.citrix.com> > > diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c > --- a/xen/common/sched_credit.c > +++ b/xen/common/sched_credit.c > @@ -532,6 +532,18 @@ static int > if ( vc->processor == cpu && IS_RUNQ_IDLE(cpu) ) > cpumask_set_cpu(cpu, &idlers); > cpumask_and(&cpus, &cpus, &idlers); > + > + /* > + * It is important that cpu points to an idle processor, if a suitable > + * one exists (and we can use cpus to check and, possibly, choose a new > + * CPU, as we just &&-ed it with idlers). In fact, if we are on SMT, and > + * cpu points to a busy thread with an idle sibling, both the threads > + * will be considered the same, from the "idleness" calculation point > + * of view", preventing vcpu from being moved to the thread that is > + * actually idle. > + */ > + if ( !cpumask_empty(&cpus) && !cpumask_test_cpu(cpu, &cpus) )I think I had asked about this before - what''s the point of the left hand side of the &&? If the mask is empty, the right hand side will cover that quite well, at much less a price for high NR_CPUS (or nr_cpu_ids). The only thing to check is whether "cpu" is out of range (because of "cpus" having been empty already before the cpumask_and() right before your addition). The ASSERT() a few lines earlier could be simplified in similar ways, btw. Jan> + cpu = cpumask_cycle(cpu, &cpus); > cpumask_clear_cpu(cpu, &cpus); > > while ( !cpumask_empty(&cpus) )
Jan Beulich
2013-Mar-15 08:17 UTC
Re: [PATCH 05 of 11 v4] xen: allow for explicitly specifying node-affinity
>>> On 15.03.13 at 03:30, Dario Faggioli <dario.faggioli@citrix.com> wrote: > --- a/xen/include/xen/sched.h > +++ b/xen/include/xen/sched.h > @@ -360,8 +360,12 @@ struct domain > /* Various mem_events */ > struct mem_event_per_domain *mem_event; > > - /* Currently computed from union of all vcpu cpu-affinity masks. */ > + /* > + * Can be specified by the user. If that is not the case, it is > + * computed from the union of all the vcpu cpu-affinity masks. > + */ > nodemask_t node_affinity; > + int auto_node_affinity;bool_t, and put in a suitable slot without adding much extra padding (if available). Jan> unsigned int last_alloc_node; > spinlock_t node_affinity_lock; > };
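I.e., presumably something along these lines, with the exact slot to be chosen by looking at the real structure layout:

    nodemask_t node_affinity;
    /* bool_t: 1 if node_affinity is to be computed automatically */
    bool_t auto_node_affinity;
    unsigned int last_alloc_node;
    spinlock_t node_affinity_lock;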
Dario Faggioli
2013-Mar-15 10:37 UTC
Re: [PATCH 03 of 11 v4] xen: sched_credit: when picking, make sure we get an idle one, if any
On ven, 2013-03-15 at 08:14 +0000, Jan Beulich wrote:> >>> On 15.03.13 at 03:30, Dario Faggioli <dario.faggioli@citrix.com> wrote: > > diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c > > --- a/xen/common/sched_credit.c > > +++ b/xen/common/sched_credit.c > > @@ -532,6 +532,18 @@ static int > > if ( vc->processor == cpu && IS_RUNQ_IDLE(cpu) ) > > cpumask_set_cpu(cpu, &idlers); > > cpumask_and(&cpus, &cpus, &idlers); > > + > > + /* > > + * It is important that cpu points to an idle processor, if a suitable > > + * one exists (and we can use cpus to check and, possibly, choose a new > > + * CPU, as we just &&-ed it with idlers). In fact, if we are on SMT, and > > + * cpu points to a busy thread with an idle sibling, both the threads > > + * will be considered the same, from the "idleness" calculation point > > + * of view", preventing vcpu from being moved to the thread that is > > + * actually idle. > > + */ > > + if ( !cpumask_empty(&cpus) && !cpumask_test_cpu(cpu, &cpus) ) > > I think I had asked about this before - >Did you? I don''t remember anything like that but, if you did and I did not answer, sorry for that! :-P> what''s the point of the > left hand side of the &&? If the mask is empty, the right hand > side will cover that quite well, at much less a price for high > NR_CPUS (or nr_cpu_ids). >And in fact it was not there, but ISTR having to add it because not having it was leading to some very bad Xen crashes... If I remember correctly, this is what happens without it. The code looks like this: if ( !cpumask_empty(&cpus) && !cpumask_test_cpu(cpu, &cpus) ) cpu = cpumask_cycle(cpu, &cpus); cpumask_clear_cpu(cpu, &cpus); while ( !cpumask_empty(&cpus) ) { ... } ... return cpu; So, what happens if cpus is actually empty? As you say cpumask_test_cpu(cpu,&cpus) will be false, which means cpu is updated with the result of cpumask_cycle(cpu,&cpus). If I''m reading the code correctly, a cpumask_cycle() on an empty cpumask will give me nr_cpu_ids, which is then what is returned (the while loop is not entered, so nothing more happens to cpu), which makes things explode... Does that make sense? Time has passed since I saw that bugtrace, so it is possible that my memories are not accurate... I surely can try to reproduce it, if you want to see the "smoking gun" :-) Perhaps I can turn the condition into something like this: if ( !cpumask_test_cpu(cpu, &cpus) ) cpu = cpumask_empty(&cpus) ? cpu : cpumask_cycle(cpu, &cpus); So that we pay the price less frequently?> The ASSERT() a few lines earlier > could be simplified in similar ways, btw. >You mean this, right? online = cpupool_scheduler_cpumask(vc->domain->cpupool); cpumask_and(&cpus, online, vc->cpu_affinity); cpu = cpumask_test_cpu(vc->processor, &cpus) ? vc->processor : cpumask_cycle(vc->processor, &cpus); ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) ); Not sure. AFAIU the code, the ASSERT() is indeed meant to make sure that cpus did not end up being empty as a consequence of the cpumask_and(), and that is done together with the cpumask_test_cpu() just to have only one ASSERT() instead of two, but again, I might well be wrong.
Regards, Dario -- <<This happens because I choose it to happen!>> (Raistlin Majere) ----------------------------------------------------------------- Dario Faggioli, Ph.D, http://about.me/dario.faggioli Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK) _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Jan Beulich
2013-Mar-15 10:55 UTC
Re: [PATCH 03 of 11 v4] xen: sched_credit: when picking, make sure we get an idle one, if any
>>> On 15.03.13 at 11:37, Dario Faggioli <dario.faggioli@citrix.com> wrote: > On ven, 2013-03-15 at 08:14 +0000, Jan Beulich wrote: > Perhaps I can turn the condition into something like this: > > if ( !cpumask_test_cpu(cpu, &cpus) ) > cpu = cpumask_empty(&cpus) ? cpu : cpumask_cycle(cpu, &cpus); > > So that we pay the price less frequently?Given cpu < nr_cpu_ids before this, yes, that sounds right. Or you could simply switch the operands of the && in your original if(). Yet less expensive would be if you stored the result of cpumask_cycle() in another variable and copied it into "cpu" only if less than nr_cpu_ids. That would eliminate the need for cpumask_empty() altogether.>> The ASSERT() a few lines earlier >> could be simplified in similar ways, btw. >> > You mean this, right? > > online = cpupool_scheduler_cpumask(vc->domain->cpupool); > cpumask_and(&cpus, online, vc->cpu_affinity); > cpu = cpumask_test_cpu(vc->processor, &cpus) > ? vc->processor > : cpumask_cycle(vc->processor, &cpus); > ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) ); > > Not sure. AFAIU the code, the ASSERT() is indeed willing to make sure > that cpus did not ended up being empty as a consequence of the > cpumask_and(), and that is done together with the cpumask_test_cpu() > just to have only one ASSERT() instead of two, but again, I might well > be wrong.If I''m not mistaken, ASSERT(cpu < nr_cpu_ids && cpumask_test_cpu(cpu, &cpus)); would have the exact same effect. Jan
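Rendered as code, the variant suggested here would look something like this (new_cpu is a made-up name):

    if ( !cpumask_test_cpu(cpu, &cpus) )
    {
        unsigned int new_cpu = cpumask_cycle(cpu, &cpus);

        /* cpumask_cycle() returns nr_cpu_ids on an empty mask, so this
         * both picks an idler and handles the empty case, with no need
         * for cpumask_empty() at all. */
        if ( new_cpu < nr_cpu_ids )
            cpu = new_cpu;
    }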
Daniel De Graaf
2013-Mar-15 14:20 UTC
Re: [PATCH 05 of 11 v4] xen: allow for explicitly specifying node-affinity
On 03/14/2013 10:30 PM, Dario Faggioli wrote: [...]> diff --git a/xen/xsm/flask/hooks.c b/xen/xsm/flask/hooks.c > --- a/xen/xsm/flask/hooks.c > +++ b/xen/xsm/flask/hooks.c > @@ -611,10 +611,10 @@ static int flask_domctl(struct domain *d > return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__UNPAUSE); > > case XEN_DOMCTL_setvcpuaffinity: > - return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__SETVCPUAFFINITY); > + return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__SETAFFINITY); > > case XEN_DOMCTL_getvcpuaffinity: > - return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__GETVCPUAFFINITY); > + return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__GETAFFINITY);You need to add XEN_DOMCTL_{get,set}nodeaffinity to the switch statement in addition to changing the permission name for the existing domctls.> case XEN_DOMCTL_resumedomain: > return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__RESUME); > diff --git a/xen/xsm/flask/policy/access_vectors b/xen/xsm/flask/policy/access_vectors > --- a/xen/xsm/flask/policy/access_vectors > +++ b/xen/xsm/flask/policy/access_vectors > @@ -103,10 +103,10 @@ class domain > max_vcpus > # XEN_DOMCTL_destroydomain > destroy > -# XEN_DOMCTL_setvcpuaffinity > - setvcpuaffinity > -# XEN_DOMCTL_getvcpuaffinity > - getvcpuaffinity > +# XEN_DOMCTL_setaffinity > + setaffinity > +# XEN_DOMCTL_getaffinity > + getaffinity > # XEN_DOMCTL_scheduler_op with XEN_DOMCTL_SCHEDOP_getinfo > getscheduler > # XEN_DOMCTL_getdomaininfo, XEN_SYSCTL_getdomaininfolist >The comments here are now incorrect, and should reflect the domctls controlled by the listed permission. -- Daniel De Graaf National Security Agency
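In concrete terms, what is being asked for is presumably along these lines: two new cases in flask_domctl(),

    case XEN_DOMCTL_setnodeaffinity:
        return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__SETAFFINITY);

    case XEN_DOMCTL_getnodeaffinity:
        return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__GETAFFINITY);

plus access_vectors comments that list all the domctls each permission now covers, e.g.:

    # XEN_DOMCTL_setvcpuaffinity, XEN_DOMCTL_setnodeaffinity
        setaffinity
    # XEN_DOMCTL_getvcpuaffinity, XEN_DOMCTL_getnodeaffinity
        getaffinity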
Dario Faggioli
2013-Mar-16 07:11 UTC
Re: [PATCH 05 of 11 v4] xen: allow for explicitly specifying node-affinity
On ven, 2013-03-15 at 14:20 +0000, Daniel De Graaf wrote:> On 03/14/2013 10:30 PM, Dario Faggioli wrote: > [...] > > diff --git a/xen/xsm/flask/hooks.c b/xen/xsm/flask/hooks.c > > --- a/xen/xsm/flask/hooks.c > > +++ b/xen/xsm/flask/hooks.c > > @@ -611,10 +611,10 @@ static int flask_domctl(struct domain *d > > return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__UNPAUSE); > > > > case XEN_DOMCTL_setvcpuaffinity: > > - return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__SETVCPUAFFINITY); > > + return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__SETAFFINITY); > > > > case XEN_DOMCTL_getvcpuaffinity: > > - return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__GETVCPUAFFINITY); > > + return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__GETAFFINITY); > > You need to add XEN_DOMCTL_{get,set}nodeaffinity to the switch statement > in addition to changing the permission name for the existing domctls. >Ok.> > case XEN_DOMCTL_resumedomain: > > return current_has_perm(d, SECCLASS_DOMAIN, DOMAIN__RESUME); > > diff --git a/xen/xsm/flask/policy/access_vectors b/xen/xsm/flask/policy/access_vectors > > --- a/xen/xsm/flask/policy/access_vectors > > +++ b/xen/xsm/flask/policy/access_vectors > > @@ -103,10 +103,10 @@ class domain > > max_vcpus > > # XEN_DOMCTL_destroydomain > > destroy > > -# XEN_DOMCTL_setvcpuaffinity > > - setvcpuaffinity > > -# XEN_DOMCTL_getvcpuaffinity > > - getvcpuaffinity > > +# XEN_DOMCTL_setaffinity > > + setaffinity > > +# XEN_DOMCTL_getaffinity > > + getaffinity > > # XEN_DOMCTL_scheduler_op with XEN_DOMCTL_SCHEDOP_getinfo > > getscheduler > > # XEN_DOMCTL_getdomaininfo, XEN_SYSCTL_getdomaininfolist > > > > The comments here are now incorrect, and should reflect the domctls > controlled by the listed permission. >I see. I tried to update this patch to cope with the changes introduced by your new IS_PRIV series, but evidently I missed this couple of spots. Thanks for pointing them out, will do what you ask. Regards, Dario -- <<This happens because I choose it to happen!>> (Raistlin Majere) ----------------------------------------------------------------- Dario Faggioli, Ph.D, http://about.me/dario.faggioli Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK) _______________________________________________ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
George Dunlap
2013-Mar-18 13:58 UTC
Re: [PATCH 04 of 11 v4] xen: sched_credit: let the scheduler know about node-affinity
On 15/03/13 02:30, Dario Faggioli wrote:> As vcpu-affinity tells where VCPUs must run, node-affinity tells > where they prefer to. While respecting vcpu-affinity remains mandatory, > node-affinity is not that strict, it only expresses a preference, > although honouring it will bring significant performance benefits > (especially as compared to not having any affinity at all). > > This change modifies the VCPUs load balancing algorithm (for the > credit scheduler only), introducing a two-step logic. During the > first step, we use both the vcpu-affinity and the node-affinity > masks (by looking at their intersection). The aim is giving precedence > to the PCPUs where the domain prefers to run, as expressed by its > node-affinity (with the intersection with the vcpu-affinity being > necessary in order to avoid running a VCPU where it never should). > If that fails in finding a valid PCPU, the node-affinity is just > ignored and, in the second step, we fall back to using cpu-affinity > only. > > Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com> > Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com>Acked-by: George Dunlap <george.dunlap@eu.citrix.com>
George Dunlap
2013-Mar-18 14:02 UTC
Re: [PATCH 03 of 11 v4] xen: sched_credit: when picking, make sure we get an idle one, if any
On 15/03/13 10:55, Jan Beulich wrote:>>>> On 15.03.13 at 11:37, Dario Faggioli <dario.faggioli@citrix.com> wrote: >> On ven, 2013-03-15 at 08:14 +0000, Jan Beulich wrote: >> Perhaps I can turn the condition into something like this: >> >> if ( !cpumask_test_cpu(cpu, &cpus) ) >> cpu = cpumask_empty(&cpus) ? cpu : cpumask_cycle(cpu, &cpus); >> >> So that we pay the price less frequently? > Given cpu < nr_cpu_ids before this, yes, that sounds right. Or > you could simply switch the operands of the && in your original > if(). Yet less expensive would be if you stored the result of > cpumask_cycle() in another variable and copied it into "cpu" > only if less than nr_cpu_ids. That would eliminate the need for > cpumask_empty() altogether.It seems to me like just switching the order would result in more readable code. Wouldn''t hurt to add "Do the quick test first" to the comment so no one switches it back. :-) -George
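I.e., roughly:

    /* Do the quick test first: only check for emptiness (which can be
     * expensive with large NR_CPUS) if cpu is not in the mask already. */
    if ( !cpumask_test_cpu(cpu, &cpus) && !cpumask_empty(&cpus) )
        cpu = cpumask_cycle(cpu, &cpus);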
George Dunlap
2013-Mar-18 14:06 UTC
Re: [PATCH 10 of 11 v4] xl: add node-affinity to the output of `xl list`
On 15/03/13 02:30, Dario Faggioli wrote:
> Node-affinity is now something that is under (some) control of the
> user, so show it upon request as part of the output of `xl list'
> by the `-n' option.
>
> Re the patch, the print_bitmap() related hunk is _mostly_ code motion,
> although there is a very minor change in the code, basically to allow
> using the function for printing both cpu and node bitmaps (as, in case
> all bits are set, it used to print "any cpu", which doesn't fit the
> nodemap case).
>
> Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
> Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com>

Can we get a tools maintainer comment (either ack / recommendation for
improvement) sometime in the next few days, so that the next time Dario
submits this series it can be taken wholesale?

 -George

> ---
> Changes from v3:
>  * removed the pastebin.com link from the changelog. Will reply to this
>    same mail with another one with the output of various invocations
>    of `xl list' with different options to show how it looks.
>
> Changes from v1:
>  * print_{cpu,node}map() functions added instead of 'state variable'-izing
>    print_bitmap().
>
> diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
> --- a/tools/libxl/xl_cmdimpl.c
> +++ b/tools/libxl/xl_cmdimpl.c
> @@ -3030,14 +3030,95 @@ out:
>      }
>  }
>
> -static void list_domains(int verbose, int context, const libxl_dominfo *info, int nb_domain)
> +/* If map is not full, prints it and returns 0. Returns 1 otherwise. */
> +static int print_bitmap(uint8_t *map, int maplen, FILE *stream)
> +{
> +    int i;
> +    uint8_t pmap = 0, bitmask = 0;
> +    int firstset = 0, state = 0;
> +
> +    for (i = 0; i < maplen; i++) {
> +        if (i % 8 == 0) {
> +            pmap = *map++;
> +            bitmask = 1;
> +        } else bitmask <<= 1;
> +
> +        switch (state) {
> +        case 0:
> +        case 2:
> +            if ((pmap & bitmask) != 0) {
> +                firstset = i;
> +                state++;
> +            }
> +            continue;
> +        case 1:
> +        case 3:
> +            if ((pmap & bitmask) == 0) {
> +                fprintf(stream, "%s%d", state > 1 ? "," : "", firstset);
> +                if (i - 1 > firstset)
> +                    fprintf(stream, "-%d", i - 1);
> +                state = 2;
> +            }
> +            continue;
> +        }
> +    }
> +    switch (state) {
> +    case 0:
> +        fprintf(stream, "none");
> +        break;
> +    case 2:
> +        break;
> +    case 1:
> +        if (firstset == 0)
> +            return 1;
> +    case 3:
> +        fprintf(stream, "%s%d", state > 1 ? "," : "", firstset);
> +        if (i - 1 > firstset)
> +            fprintf(stream, "-%d", i - 1);
> +        break;
> +    }
> +
> +    return 0;
> +}
> +
> +static void print_cpumap(uint8_t *map, int maplen, FILE *stream)
> +{
> +    if (print_bitmap(map, maplen, stream))
> +        fprintf(stream, "any cpu");
> +}
> +
> +static void print_nodemap(uint8_t *map, int maplen, FILE *stream)
> +{
> +    if (print_bitmap(map, maplen, stream))
> +        fprintf(stream, "any node");
> +}
> +
> +static void list_domains(int verbose, int context, int numa, const libxl_dominfo *info, int nb_domain)
>  {
>      int i;
>      static const char shutdown_reason_letters[]= "-rscw";
> +    libxl_bitmap nodemap;
> +    libxl_physinfo physinfo;
> +
> +    libxl_bitmap_init(&nodemap);
> +    libxl_physinfo_init(&physinfo);
>
>      printf("Name ID Mem VCPUs\tState\tTime(s)");
>      if (verbose) printf(" UUID Reason-Code\tSecurity Label");
>      if (context && !verbose) printf(" Security Label");
> +    if (numa) {
> +        if (libxl_node_bitmap_alloc(ctx, &nodemap, 0)) {
> +            fprintf(stderr, "libxl_node_bitmap_alloc_failed.\n");
> +            exit(1);
> +        }
> +        if (libxl_get_physinfo(ctx, &physinfo) != 0) {
> +            fprintf(stderr, "libxl_physinfo failed.\n");
> +            libxl_bitmap_dispose(&nodemap);
> +            exit(1);
> +        }
> +
> +        printf(" NODE Affinity");
> +    }
>      printf("\n");
>      for (i = 0; i < nb_domain; i++) {
>          char *domname;
> @@ -3071,14 +3152,23 @@ static void list_domains(int verbose, in
>              rc = libxl_flask_sid_to_context(ctx, info[i].ssidref, &buf,
>                                              &size);
>              if (rc < 0)
> -                printf(" -");
> +                printf(" -");
>              else {
> -                printf(" %s", buf);
> +                printf(" %16s", buf);
>                  free(buf);
>              }
>          }
> +        if (numa) {
> +            libxl_domain_get_nodeaffinity(ctx, info[i].domid, &nodemap);
> +
> +            putchar(' ');
> +            print_nodemap(nodemap.map, physinfo.nr_nodes, stdout);
> +        }
>          putchar('\n');
>      }
> +
> +    libxl_bitmap_dispose(&nodemap);
> +    libxl_physinfo_dispose(&physinfo);
>  }
>
>  static void list_vm(void)
> @@ -3945,10 +4035,12 @@ int main_list(int argc, char **argv)
>      int opt, verbose = 0;
>      int context = 0;
>      int details = 0;
> +    int numa = 0;
>      static struct option opts[] = {
>          {"long", 0, 0, 'l'},
>          {"verbose", 0, 0, 'v'},
>          {"context", 0, 0, 'Z'},
> +        {"numa", 0, 0, 'n'},
>          COMMON_LONG_OPTS,
>          {0, 0, 0, 0}
>      };
> @@ -3957,7 +4049,7 @@ int main_list(int argc, char **argv)
>      libxl_dominfo *info, *info_free=0;
>      int nb_domain, rc;
>
> -    SWITCH_FOREACH_OPT(opt, "lvhZ", opts, "list", 0) {
> +    SWITCH_FOREACH_OPT(opt, "lvhZn", opts, "list", 0) {
>      case 'l':
>          details = 1;
>          break;
> @@ -3967,6 +4059,9 @@ int main_list(int argc, char **argv)
>      case 'Z':
>          context = 1;
>          break;
> +    case 'n':
> +        numa = 1;
> +        break;
>      }
>
>      if (optind >= argc) {
> @@ -3998,7 +4093,7 @@ int main_list(int argc, char **argv)
>      if (details)
>          list_domains_details(info, nb_domain);
>      else
> -        list_domains(verbose, context, info, nb_domain);
> +        list_domains(verbose, context, numa, info, nb_domain);
>
>      if (info_free)
>          libxl_dominfo_list_free(info, nb_domain);
> @@ -4247,56 +4342,6 @@ int main_button_press(int argc, char **a
>      return 0;
>  }
>
> -static void print_bitmap(uint8_t *map, int maplen, FILE *stream)
> -{
> -    int i;
> -    uint8_t pmap = 0, bitmask = 0;
> -    int firstset = 0, state = 0;
> -
> -    for (i = 0; i < maplen; i++) {
> -        if (i % 8 == 0) {
> -            pmap = *map++;
> -            bitmask = 1;
> -        } else bitmask <<= 1;
> -
> -        switch (state) {
> -        case 0:
> -        case 2:
> -            if ((pmap & bitmask) != 0) {
> -                firstset = i;
> -                state++;
> -            }
> -            continue;
> -        case 1:
> -        case 3:
> -            if ((pmap & bitmask) == 0) {
> -                fprintf(stream, "%s%d", state > 1 ? "," : "", firstset);
> -                if (i - 1 > firstset)
> -                    fprintf(stream, "-%d", i - 1);
> -                state = 2;
> -            }
> -            continue;
> -        }
> -    }
> -    switch (state) {
> -    case 0:
> -        fprintf(stream, "none");
> -        break;
> -    case 2:
> -        break;
> -    case 1:
> -        if (firstset == 0) {
> -            fprintf(stream, "any cpu");
> -            break;
> -        }
> -    case 3:
> -        fprintf(stream, "%s%d", state > 1 ? "," : "", firstset);
> -        if (i - 1 > firstset)
> -            fprintf(stream, "-%d", i - 1);
> -        break;
> -    }
> -}
> -
>  static void print_vcpuinfo(uint32_t tdomid,
>                             const libxl_vcpuinfo *vcpuinfo,
>                             uint32_t nr_cpus)
> @@ -4320,7 +4365,7 @@ static void print_vcpuinfo(uint32_t tdom
>      /* TIM */
>      printf("%9.1f ", ((float)vcpuinfo->vcpu_time / 1e9));
>      /* CPU AFFINITY */
> -    print_bitmap(vcpuinfo->cpumap.map, nr_cpus, stdout);
> +    print_cpumap(vcpuinfo->cpumap.map, nr_cpus, stdout);
>      printf("\n");
>  }
>
> diff --git a/tools/libxl/xl_cmdtable.c b/tools/libxl/xl_cmdtable.c
> --- a/tools/libxl/xl_cmdtable.c
> +++ b/tools/libxl/xl_cmdtable.c
> @@ -50,7 +50,8 @@ struct cmd_spec cmd_table[] = {
>        "[options] [Domain]\n",
>        "-l, --long         Output all VM details\n"
>        "-v, --verbose      Prints out UUIDs and security context\n"
> -      "-Z, --context      Prints out security context"
> +      "-Z, --context      Prints out security context\n"
> +      "-n, --numa         Prints out NUMA node affinity"
>      },
>      { "destroy",
>        &main_destroy, 0, 1,
Ian Campbell
2013-Mar-18 14:13 UTC
Re: [PATCH 10 of 11 v4] xl: add node-affinity to the output of `xl list`
On Fri, 2013-03-15 at 02:30 +0000, Dario Faggioli wrote:
> Node-affinity is now something that is under (some) control of the
> user, so show it upon request as part of the output of `xl list'
> by the `-n' option.
>
> Re the patch, the print_bitmap() related hunk is _mostly_ code motion,
> although there is a very minor change in the code, basically to allow
> using the function for printing both cpu and node bitmaps (as, in case
> all bits are set, it used to print "any cpu", which doesn't fit the
> nodemap case).
>
> Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
> Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com>

Acked-by: Ian Campbell <ian.campbell@citrix.com>

[...]

>              if (rc < 0)
> -                printf(" -");
> +                printf(" -");
>              else {
> -                printf(" %s", buf);
> +                printf(" %16s", buf);
>                  free(buf);

If you do end up respinning for any reason then this could be made less
manual/error prone by using "%16s" even in the "-" case. Might even be
possible to combine the whole lot, since free(NULL) is safe:

    printf(" %16s", rc < 0 ? "-" : buf);
    free(buf);

(assuming buf == NULL whenever rc < 0, I didn't check)
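For the record, a minimal standalone sketch of the suggested combination,
with the ternary spelled out; lookup_label() is a made-up stand-in for
libxl_flask_sid_to_context(), assumed to leave buf NULL on failure:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Invented helper: fails like the flask lookup, leaving *buf NULL. */
    static int lookup_label(int fail, char **buf)
    {
        *buf = NULL;
        if (fail)
            return -1;
        *buf = strdup("system_u:system_r:domU_t");
        return 0;
    }

    int main(void)
    {
        for (int fail = 0; fail <= 1; fail++) {
            char *buf;
            int rc = lookup_label(fail, &buf);

            /* One code path for both cases: "%16s" right-pads the "-"
             * just like the label, and free(NULL) is a no-op. */
            printf(" %16s\n", rc < 0 ? "-" : buf);
            free(buf);
        }
        return 0;
    }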
Dario Faggioli
2013-Mar-18 14:21 UTC
Re: [PATCH 10 of 11 v4] xl: add node-affinity to the output of `xl list`
On lun, 2013-03-18 at 14:06 +0000, George Dunlap wrote:
> On 15/03/13 02:30, Dario Faggioli wrote:
> > Node-affinity is now something that is under (some) control of the
> > user, so show it upon request as part of the output of `xl list'
> > by the `-n' option.
> >
> > Re the patch, the print_bitmap() related hunk is _mostly_ code motion,
> > although there is a very minor change in the code, basically to allow
> > using the function for printing both cpu and node bitmaps (as, in case
> > all bits are set, it used to print "any cpu", which doesn't fit the
> > nodemap case).
> >
> > Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
> > Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com>
>
> Can we get a tools maintainer comment (either ack / recommendation for
> improvement) sometime in the next few days, so that the next time Dario
> submits this series it can be taken wholesale?
>
Yep... And if the same could be done for patches 7, 8 and 9 as well,
that would be great!

Thanks and Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
Dario Faggioli
2013-Mar-18 14:22 UTC
Re: [PATCH 10 of 11 v4] xl: add node-affinity to the output of `xl list`
On lun, 2013-03-18 at 14:13 +0000, Ian Campbell wrote:
> On Fri, 2013-03-15 at 02:30 +0000, Dario Faggioli wrote:
> > Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
> > Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com>
>
> Acked-by: Ian Campbell <ian.campbell@citrix.com>
>
Thanks!

> [...]
> >              if (rc < 0)
> > -                printf(" -");
> > +                printf(" -");
> >              else {
> > -                printf(" %s", buf);
> > +                printf(" %16s", buf);
> >                  free(buf);
>
> If you do end up respinning for any reason then this could be made less
> manual/error prone by using "%16s" even in the "-" case. Might even be
> possible to combine the whole lot, since free(NULL) is safe:
>     printf(" %16s", rc < 0 ? "-" : buf);
>     free(buf);
> (assuming buf == NULL whenever rc < 0, I didn't check)
>
I am respinning, and I'll take a look at this.

Thanks again,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
Dario Faggioli
2013-Mar-18 14:23 UTC
Re: [PATCH 03 of 11 v4] xen: sched_credit: when picking, make sure we get an idle one, if any
On lun, 2013-03-18 at 14:02 +0000, George Dunlap wrote:
> On 15/03/13 10:55, Jan Beulich wrote:
> >>>> On 15.03.13 at 11:37, Dario Faggioli <dario.faggioli@citrix.com> wrote:
> >> On ven, 2013-03-15 at 08:14 +0000, Jan Beulich wrote:
> >> Perhaps I can turn the condition into something like this:
> >>
> >>     if ( !cpumask_test_cpu(cpu, &cpus) )
> >>         cpu = cpumask_empty(&cpus) ? cpu : cpumask_cycle(cpu, &cpus);
> >>
> >> So that we pay the price less frequently?
> > Given cpu < nr_cpu_ids before this, yes, that sounds right. Or
> > you could simply switch the operands of the && in your original
> > if(). Yet less expensive would be if you stored the result of
> > cpumask_cycle() in another variable and copied it into "cpu"
> > only if less than nr_cpu_ids. That would eliminate the need for
> > cpumask_empty() altogether.
>
> It seems to me like just switching the order would result in more
> readable code.
>
> Wouldn't hurt to add "Do the quick test first" to the comment so no one
> switches it back. :-)
>
Ok, will do.

Thanks,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
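As an aside, the shape being settled on can be sketched outside of Xen;
mask_test() and mask_cycle() below are simplified stand-ins for
cpumask_test_cpu() and cpumask_cycle(), and pick_cpu() is an invented
wrapper, not the sched_credit code:

    #include <stdbool.h>
    #include <stdio.h>

    /* mask_test() is O(1); mask_cycle() scans the whole mask, so it is
     * the expensive call we want to avoid paying for when possible. */
    static bool mask_test(unsigned m, int b) { return (m >> b) & 1u; }

    static int mask_cycle(unsigned m, int b, int nbits)
    {
        for (int i = 1; i <= nbits; i++) {
            int c = (b + i) % nbits;
            if (mask_test(m, c))
                return c;
        }
        return nbits;        /* plays the role of nr_cpu_ids: mask empty */
    }

    static int pick_cpu(unsigned cpus, int cpu, int nbits)
    {
        /* Do the quick test first; only pay for the scan when cpu is
         * not already usable. Storing the scan result and committing it
         * only when valid removes the separate emptiness check. */
        if (!mask_test(cpus, cpu)) {
            int next = mask_cycle(cpus, cpu, nbits);
            if (next < nbits)
                cpu = next;
        }
        return cpu;
    }

    int main(void)
    {
        printf("%d\n", pick_cpu(0x0c, 0, 8)); /* cpus {2,3}: -> 2  */
        printf("%d\n", pick_cpu(0x0c, 3, 8)); /* already ok: -> 3  */
        printf("%d\n", pick_cpu(0x00, 5, 8)); /* empty mask: stays 5 */
        return 0;
    }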
Ian Campbell
2013-Mar-18 14:33 UTC
Re: [PATCH 07 of 11 v4] libxl: allow for explicitly specifying node-affinity
On Fri, 2013-03-15 at 02:30 +0000, Dario Faggioli wrote:
>
> +/* For the 'nodemap' field (of libxl_bitmap type) in
> + * libxl_domain_build_info, containing the node-affinity of the domain. */
> +#define LIBXL_HAVE_DOMAIN_NODEAFFINITY 1

Please can you put this next to the existing
LIBXL_HAVE_FIRMWARE_PASSTHROUGH in the interests of keeping them all
together. With that change:

Acked-by: Ian Campbell <ian.campbell@citrix.com>
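(The point of such LIBXL_HAVE_* macros is compile-time API probing; a
minimal consumer sketch, assuming only that a libxl.h shipping the macro is
on the include path:)

    #include <stdio.h>
    #include <libxl.h>

    int main(void)
    {
    #ifdef LIBXL_HAVE_DOMAIN_NODEAFFINITY
        puts("libxl_domain_build_info has a nodemap field");
    #else
        puts("old libxl: no explicit node-affinity, fall back to pinning");
    #endif
        return 0;
    }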
Ian Campbell
2013-Mar-18 14:34 UTC
Re: [PATCH 08 of 11 v4] libxl: optimize the calculation of how many VCPUs can run on a candidate
On Fri, 2013-03-15 at 02:30 +0000, Dario Faggioli wrote:
> For choosing the best NUMA placement candidate, we need to figure out
> how many VCPUs are runnable on each of them. That requires going through
> all the VCPUs of all the domains and checking their affinities.
>
> With this change, instead of doing the above for each candidate, we
> do it once for all, populating an array while counting. This way, when
> we later are evaluating candidates, all we need is summing up the right
> elements of the array itself.
>
> This reduces the complexity of the overall algorithm, as it moves a
> potentially expensive operation (for_each_vcpu_of_each_domain {})
> outside from the core placement loop, so that it is performed only
> once instead of (potentially) tens or hundreds of times. More
> specifically, we go from a worst case computation time complexity of:
>
>   O(2^n_nodes) * O(n_domains*n_domain_vcpus)
>
> To, with this change:
>
>   O(n_domains*n_domain_vcpus) + O(2^n_nodes) = O(2^n_nodes)
>
> (with n_nodes<=16, otherwise the algorithm suggests partitioning with
> cpupools and does not even start.)
>
> Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
> Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com>
> Acked-by: George Dunlap <george.dunlap@eu.citrix.com>

Acked-by: Ian Campbell <ian.campbell@citrix.com>

> ---
> Changes from v2:
>  * in nr_vcpus_on_nodes(), vcpu_nodemap renamed to something more
>    sensible, as suggested during review.
>
> diff --git a/tools/libxl/libxl_numa.c b/tools/libxl/libxl_numa.c
> --- a/tools/libxl/libxl_numa.c
> +++ b/tools/libxl/libxl_numa.c
> @@ -165,22 +165,34 @@ static uint32_t nodemap_to_free_memkb(li
>      return free_memkb;
>  }
>
> -/* Retrieve the number of vcpus able to run on the cpus of the nodes
> - * that are part of the nodemap. */
> -static int nodemap_to_nr_vcpus(libxl__gc *gc, libxl_cputopology *tinfo,
> +/* Retrieve the number of vcpus able to run on the nodes in nodemap */
> +static int nodemap_to_nr_vcpus(libxl__gc *gc, int vcpus_on_node[],
>                                 const libxl_bitmap *nodemap)
>  {
> +    int i, nr_vcpus = 0;
> +
> +    libxl_for_each_set_bit(i, *nodemap)
> +        nr_vcpus += vcpus_on_node[i];
> +
> +    return nr_vcpus;
> +}
> +
> +/* Number of vcpus able to run on the cpus of the various nodes
> + * (reported by filling the array vcpus_on_node[]). */
> +static int nr_vcpus_on_nodes(libxl__gc *gc, libxl_cputopology *tinfo,
> +                             const libxl_bitmap *suitable_cpumap,
> +                             int vcpus_on_node[])
> +{
>      libxl_dominfo *dinfo = NULL;
> -    libxl_bitmap vcpu_nodemap;
> +    libxl_bitmap nodes_counted;
>      int nr_doms, nr_cpus;
> -    int nr_vcpus = 0;
>      int i, j, k;
>
>      dinfo = libxl_list_domain(CTX, &nr_doms);
>      if (dinfo == NULL)
>          return ERROR_FAIL;
>
> -    if (libxl_node_bitmap_alloc(CTX, &vcpu_nodemap, 0) < 0) {
> +    if (libxl_node_bitmap_alloc(CTX, &nodes_counted, 0) < 0) {
>          libxl_dominfo_list_free(dinfo, nr_doms);
>          return ERROR_FAIL;
>      }
> @@ -193,19 +205,17 @@ static int nodemap_to_nr_vcpus(libxl__gc
>          if (vinfo == NULL)
>              continue;
>
> -        /* For each vcpu of each domain ... */
>          for (j = 0; j < nr_dom_vcpus; j++) {
> +            /* For each vcpu of each domain, increment the elements of
> +             * the array corresponding to the nodes where the vcpu runs */
> +            libxl_bitmap_set_none(&nodes_counted);
> +            libxl_for_each_set_bit(k, vinfo[j].cpumap) {
> +                int node = tinfo[k].node;
>
> -            /* Build up a map telling on which nodes the vcpu is runnable on */
> -            libxl_bitmap_set_none(&vcpu_nodemap);
> -            libxl_for_each_set_bit(k, vinfo[j].cpumap)
> -                libxl_bitmap_set(&vcpu_nodemap, tinfo[k].node);
> -
> -            /* And check if that map has any intersection with our nodemap */
> -            libxl_for_each_set_bit(k, vcpu_nodemap) {
> -                if (libxl_bitmap_test(nodemap, k)) {
> -                    nr_vcpus++;
> -                    break;
> +                if (libxl_bitmap_test(suitable_cpumap, k) &&
> +                    !libxl_bitmap_test(&nodes_counted, node)) {
> +                    libxl_bitmap_set(&nodes_counted, node);
> +                    vcpus_on_node[node]++;
>                  }
>              }
>          }
> @@ -213,9 +223,9 @@ static int nodemap_to_nr_vcpus(libxl__gc
>          libxl_vcpuinfo_list_free(vinfo, nr_dom_vcpus);
>      }
>
> -    libxl_bitmap_dispose(&vcpu_nodemap);
> +    libxl_bitmap_dispose(&nodes_counted);
>      libxl_dominfo_list_free(dinfo, nr_doms);
> -    return nr_vcpus;
> +    return 0;
>  }
>
>  /*
> @@ -270,7 +280,7 @@ int libxl__get_numa_candidate(libxl__gc
>      libxl_numainfo *ninfo = NULL;
>      int nr_nodes = 0, nr_suit_nodes, nr_cpus = 0;
>      libxl_bitmap suitable_nodemap, nodemap;
> -    int rc = 0;
> +    int *vcpus_on_node, rc = 0;
>
>      libxl_bitmap_init(&nodemap);
>      libxl_bitmap_init(&suitable_nodemap);
> @@ -281,6 +291,8 @@ int libxl__get_numa_candidate(libxl__gc
>      if (ninfo == NULL)
>          return ERROR_FAIL;
>
> +    GCNEW_ARRAY(vcpus_on_node, nr_nodes);
> +
>      /*
>       * The good thing about this solution is that it is based on heuristics
>       * (implemented in numa_cmpf() ), but we at least can evaluate it on
> @@ -330,6 +342,19 @@ int libxl__get_numa_candidate(libxl__gc
>          goto out;
>
>      /*
> +     * Later on, we will try to figure out how many vcpus are runnable on
> +     * each candidate (as a part of choosing the best one of them). That
> +     * requires going through all the vcpus of all the domains and check
> +     * their affinities. So, instead of doing that for each candidate,
> +     * let's count here the number of vcpus runnable on each node, so that
> +     * all we have to do later is summing up the right elements of the
> +     * vcpus_on_node array.
> +     */
> +    rc = nr_vcpus_on_nodes(gc, tinfo, suitable_cpumap, vcpus_on_node);
> +    if (rc)
> +        goto out;
> +
> +    /*
>       * If the minimum number of NUMA nodes is not explicitly specified
>       * (i.e., min_nodes == 0), we try to figure out a sensible number of nodes
>       * from where to start generating candidates, if possible (or just start
> @@ -414,7 +439,8 @@ int libxl__get_numa_candidate(libxl__gc
>           * current best one (if any).
>           */
>          libxl__numa_candidate_put_nodemap(gc, &new_cndt, &nodemap);
> -        new_cndt.nr_vcpus = nodemap_to_nr_vcpus(gc, tinfo, &nodemap);
> +        new_cndt.nr_vcpus = nodemap_to_nr_vcpus(gc, vcpus_on_node,
> +                                                &nodemap);
>          new_cndt.free_memkb = nodes_free_memkb;
>          new_cndt.nr_nodes = libxl_bitmap_count_set(&nodemap);
>          new_cndt.nr_cpus = nodes_cpus;
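To make the complexity argument concrete, here is a toy model of the
pre-pass plus per-candidate summing (invented numbers, a plain bitmask in
place of libxl_bitmap):

    #include <stdio.h>

    #define NR_NODES 4

    int main(void)
    {
        /* Pretend result of the one-off pre-pass: how many vcpus are
         * runnable on each node. */
        int vcpus_on_node[NR_NODES] = { 3, 1, 4, 0 };

        /* Evaluating any of the 2^NR_NODES - 1 candidates is now just a
         * sum over the candidate's set bits, not a walk of all vcpus. */
        for (unsigned cand = 1; cand < (1u << NR_NODES); cand++) {
            int nr_vcpus = 0;
            for (int n = 0; n < NR_NODES; n++)
                if (cand & (1u << n))
                    nr_vcpus += vcpus_on_node[n];
            printf("candidate 0x%x -> %d vcpus\n", cand, nr_vcpus);
        }
        return 0;
    }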
Dario Faggioli
2013-Mar-18 14:35 UTC
Re: [PATCH 07 of 11 v4] libxl: allow for explicitly specifying node-affinity
On lun, 2013-03-18 at 14:33 +0000, Ian Campbell wrote:
> On Fri, 2013-03-15 at 02:30 +0000, Dario Faggioli wrote:
> >
> > +/* For the 'nodemap' field (of libxl_bitmap type) in
> > + * libxl_domain_build_info, containing the node-affinity of the domain. */
> > +#define LIBXL_HAVE_DOMAIN_NODEAFFINITY 1
>
> Please can you put this next to the existing
> LIBXL_HAVE_FIRMWARE_PASSTHROUGH in the interests of keeping them all
> together.
>
Gah, sure... When I put it there, I looked for other similar things and
found none, but I guess I forgot to re-do this when re-sending the
series the other day, now that another LIBXL_HAVE_* appeared.

Thanks for pointing that out, will do.

> With that change:
>
> Acked-by: Ian Campbell <ian.campbell@citrix.com>
>
And thanks for this too.

Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
Ian Campbell
2013-Mar-18 14:36 UTC
Re: [PATCH 09 of 11 v4] libxl: automatic placement deals with node-affinity
On Fri, 2013-03-15 at 02:30 +0000, Dario Faggioli wrote:
> Which basically means the following two things:
>  1) during domain creation, it is the node-affinity of
>     the domain --rather than the vcpu-affinities of its
>     VCPUs-- that is affected by automatic placement;
>  2) during automatic placement, when counting how many
>     VCPUs are already "bound" to a placement candidate
>     (as part of the process of choosing the best
>     candidate), both vcpu-affinity and node-affinity
>     are considered.
>
> Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
> Acked-by: George Dunlap <george.dunlap@eu.citrix.com>
> Acked-by: Juergen Gross <juergen.gross@ts.fujitsu.com>

I skimmed this and it seems OK, but mostly: if George and Juergen think
this is OK then I'm happy with it.

Ian.

>
> diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
> --- a/tools/libxl/libxl_dom.c
> +++ b/tools/libxl/libxl_dom.c
> @@ -133,13 +133,13 @@ static int numa_place_domain(libxl__gc *
>  {
>      int found;
>      libxl__numa_candidate candidate;
> -    libxl_bitmap candidate_nodemap;
> +    libxl_bitmap cpupool_nodemap;
>      libxl_cpupoolinfo cpupool_info;
>      int i, cpupool, rc = 0;
>      uint32_t memkb;
>
>      libxl__numa_candidate_init(&candidate);
> -    libxl_bitmap_init(&candidate_nodemap);
> +    libxl_bitmap_init(&cpupool_nodemap);
>
>      /*
>       * Extract the cpumap from the cpupool the domain belong to. In fact,
> @@ -156,7 +156,7 @@ static int numa_place_domain(libxl__gc *
>      rc = libxl_domain_need_memory(CTX, info, &memkb);
>      if (rc)
>          goto out;
> -    if (libxl_node_bitmap_alloc(CTX, &candidate_nodemap, 0)) {
> +    if (libxl_node_bitmap_alloc(CTX, &cpupool_nodemap, 0)) {
>          rc = ERROR_FAIL;
>          goto out;
>      }
> @@ -174,17 +174,19 @@ static int numa_place_domain(libxl__gc *
>      if (found == 0)
>          goto out;
>
> -    /* Map the candidate's node map to the domain's info->cpumap */
> -    libxl__numa_candidate_get_nodemap(gc, &candidate, &candidate_nodemap);
> -    rc = libxl_nodemap_to_cpumap(CTX, &candidate_nodemap, &info->cpumap);
> +    /* Map the candidate's node map to the domain's info->nodemap */
> +    libxl__numa_candidate_get_nodemap(gc, &candidate, &info->nodemap);
> +
> +    /* Avoid trying to set the affinity to nodes that might be in the
> +     * candidate's nodemap but out of our cpupool. */
> +    rc = libxl_cpumap_to_nodemap(CTX, &cpupool_info.cpumap,
> +                                 &cpupool_nodemap);
>      if (rc)
>          goto out;
>
> -    /* Avoid trying to set the affinity to cpus that might be in the
> -     * nodemap but not in our cpupool. */
> -    libxl_for_each_set_bit(i, info->cpumap) {
> -        if (!libxl_bitmap_test(&cpupool_info.cpumap, i))
> -            libxl_bitmap_reset(&info->cpumap, i);
> +    libxl_for_each_set_bit(i, info->nodemap) {
> +        if (!libxl_bitmap_test(&cpupool_nodemap, i))
> +            libxl_bitmap_reset(&info->nodemap, i);
>      }
>
>      LOG(DETAIL, "NUMA placement candidate with %d nodes, %d cpus and "
> @@ -193,7 +195,7 @@ static int numa_place_domain(libxl__gc *
>
>  out:
>      libxl__numa_candidate_dispose(&candidate);
> -    libxl_bitmap_dispose(&candidate_nodemap);
> +    libxl_bitmap_dispose(&cpupool_nodemap);
>      libxl_cpupoolinfo_dispose(&cpupool_info);
>      return rc;
>  }
> @@ -209,10 +211,10 @@ int libxl__build_pre(libxl__gc *gc, uint
>      /*
>       * Check if the domain has any CPU affinity. If not, try to build
>       * up one. In case numa_place_domain() find at least a suitable
> -     * candidate, it will affect info->cpumap accordingly; if it
> +     * candidate, it will affect info->nodemap accordingly; if it
>       * does not, it just leaves it as it is. This means (unless
>       * some weird error manifests) the subsequent call to
> -     * libxl_set_vcpuaffinity_all() will do the actual placement,
> +     * libxl_domain_set_nodeaffinity() will do the actual placement,
>       * whatever that turns out to be.
>       */
>      if (libxl_defbool_val(info->numa_placement)) {
> diff --git a/tools/libxl/libxl_numa.c b/tools/libxl/libxl_numa.c
> --- a/tools/libxl/libxl_numa.c
> +++ b/tools/libxl/libxl_numa.c
> @@ -184,7 +184,7 @@ static int nr_vcpus_on_nodes(libxl__gc *
>                               int vcpus_on_node[])
>  {
>      libxl_dominfo *dinfo = NULL;
> -    libxl_bitmap nodes_counted;
> +    libxl_bitmap dom_nodemap, nodes_counted;
>      int nr_doms, nr_cpus;
>      int i, j, k;
>
> @@ -197,6 +197,12 @@ static int nr_vcpus_on_nodes(libxl__gc *
>          return ERROR_FAIL;
>      }
>
> +    if (libxl_node_bitmap_alloc(CTX, &dom_nodemap, 0) < 0) {
> +        libxl_bitmap_dispose(&nodes_counted);
> +        libxl_dominfo_list_free(dinfo, nr_doms);
> +        return ERROR_FAIL;
> +    }
> +
>      for (i = 0; i < nr_doms; i++) {
>          libxl_vcpuinfo *vinfo;
>          int nr_dom_vcpus;
>
> @@ -205,14 +211,21 @@ static int nr_vcpus_on_nodes(libxl__gc *
>          if (vinfo == NULL)
>              continue;
>
> +        /* Retrieve the domain's node-affinity map */
> +        libxl_domain_get_nodeaffinity(CTX, dinfo[i].domid, &dom_nodemap);
> +
>          for (j = 0; j < nr_dom_vcpus; j++) {
> -            /* For each vcpu of each domain, increment the elements of
> -             * the array corresponding to the nodes where the vcpu runs */
> +            /*
> +             * For each vcpu of each domain, it must have both vcpu-affinity
> +             * and node-affinity to (a pcpu belonging to) a certain node to
> +             * cause an increment in the corresponding element of the array.
> +             */
>              libxl_bitmap_set_none(&nodes_counted);
>              libxl_for_each_set_bit(k, vinfo[j].cpumap) {
>                  int node = tinfo[k].node;
>
>                  if (libxl_bitmap_test(suitable_cpumap, k) &&
> +                    libxl_bitmap_test(&dom_nodemap, node) &&
>                      !libxl_bitmap_test(&nodes_counted, node)) {
>                      libxl_bitmap_set(&nodes_counted, node);
>                      vcpus_on_node[node]++;
> @@ -223,6 +236,7 @@ static int nr_vcpus_on_nodes(libxl__gc *
>          libxl_vcpuinfo_list_free(vinfo, nr_dom_vcpus);
>      }
>
> +    libxl_bitmap_dispose(&dom_nodemap);
>      libxl_bitmap_dispose(&nodes_counted);
>      libxl_dominfo_list_free(dinfo, nr_doms);
>      return 0;
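A toy rendering of the stricter counting rule this patch introduces
(invented topology and maps, plain bitmasks in place of libxl_bitmap): a
vcpu is counted for a node only when the node holds a suitable pcpu from
the vcpu's cpumap and is also in the domain's node-affinity:

    #include <stdbool.h>
    #include <stdio.h>

    #define NR_CPUS  8
    #define NR_NODES 2

    int main(void)
    {
        int cpu_to_node[NR_CPUS] = { 0, 0, 0, 0, 1, 1, 1, 1 };
        unsigned suitable_cpus = 0xff; /* all pcpus usable */
        unsigned vcpu_cpumap   = 0x3c; /* vcpu-affinity: pcpus 2-5 */
        unsigned dom_nodemap   = 0x02; /* node-affinity: node 1 only */

        int vcpus_on_node[NR_NODES] = { 0, 0 };
        bool counted[NR_NODES] = { false, false };

        for (int k = 0; k < NR_CPUS; k++) {
            int node = cpu_to_node[k];
            if ((vcpu_cpumap & (1u << k)) && (suitable_cpus & (1u << k)) &&
                (dom_nodemap & (1u << node)) && !counted[node]) {
                counted[node] = true;  /* count each node at most once */
                vcpus_on_node[node]++;
            }
        }
        /* pcpus 2,3 are on node 0, but node 0 is outside the domain's
         * node-affinity, so only node 1 gets the increment. */
        printf("node0=%d node1=%d\n", vcpus_on_node[0], vcpus_on_node[1]);
        return 0;
    }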