Defines XENMEM subop hypercall for PV vNUMA enabled guests and data
structures that provide vNUMA topology information from per-domain
vnuma topology build info.

Signed-off-by: Elena Ufimtseva <ufimtseva@gmail.com>
---
Changes since RFC v2:
- fixed code style;
- the memory copying in hypercall happens in one go for arrays;
- fixed error codes logic;
---
 xen/common/domain.c         |   10 ++++++
 xen/common/domctl.c         |   72 +++++++++++++++++++++++++++++++++++++++++++
 xen/common/memory.c         |   41 ++++++++++++++++++++++++
 xen/include/public/domctl.h |   14 +++++++++
 xen/include/public/memory.h |    8 +++++
 xen/include/xen/domain.h    |   11 +++++++
 xen/include/xen/sched.h     |    1 +
 xen/include/xen/vnuma.h     |   18 +++++++++++
 8 files changed, 175 insertions(+)
 create mode 100644 xen/include/xen/vnuma.h

diff --git a/xen/common/domain.c b/xen/common/domain.c
index 5999779..d084292 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -539,6 +539,7 @@ int domain_kill(struct domain *d)
         tmem_destroy(d->tmem);
         domain_set_outstanding_pages(d, 0);
         d->tmem = NULL;
+        domain_vnuma_destroy(&d->vnuma);
         /* fallthrough */
     case DOMDYING_dying:
         rc = domain_relinquish_resources(d);
@@ -1287,6 +1288,15 @@ int continue_hypercall_on_cpu(
     return 0;
 }
 
+void domain_vnuma_destroy(struct domain_vnuma_info *v)
+{
+    v->nr_vnodes = 0;
+    xfree(v->vnuma_memblks);
+    xfree(v->vcpu_to_vnode);
+    xfree(v->vdistance);
+    xfree(v->vnode_to_pnode);
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/common/domctl.c b/xen/common/domctl.c
index 870eef1..042e2d2 100644
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -29,6 +29,7 @@
 #include <asm/page.h>
 #include <public/domctl.h>
 #include <xsm/xsm.h>
+#include <xen/vnuma.h>
 
 static DEFINE_SPINLOCK(domctl_lock);
 DEFINE_SPINLOCK(vcpu_alloc_lock);
@@ -871,6 +872,77 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
     }
     break;
 
+    case XEN_DOMCTL_setvnumainfo:
+    {
+        unsigned int i, j, nr_vnodes, dist_size;
+        unsigned int dist;
+        
+        ret = -EFAULT;
+        dist = i = j = 0;
+
+        nr_vnodes = op->u.vnuma.nr_vnodes;
+        if ( nr_vnodes == 0 )
+            goto err_dom;
+        dist_size = nr_vnodes * nr_vnodes;
+
+        d->vnuma.vdistance = xmalloc_bytes(
+            sizeof(*d->vnuma.vdistance) * dist_size);
+        d->vnuma.vnuma_memblks = xmalloc_bytes(
+            sizeof(*d->vnuma.vnuma_memblks) * nr_vnodes);
+        d->vnuma.vcpu_to_vnode = xmalloc_bytes(
+            sizeof(*d->vnuma.vcpu_to_vnode) * d->max_vcpus);
+        d->vnuma.vnode_to_pnode = xmalloc_bytes(
+            sizeof(*d->vnuma.vnode_to_pnode) * nr_vnodes);
+
+        if ( d->vnuma.vdistance == NULL || d->vnuma.vnuma_memblks == NULL ||
+             d->vnuma.vcpu_to_vnode == NULL ||
+             d->vnuma.vnode_to_pnode == NULL )
+        {
+            ret = -ENOMEM;
+            goto err_dom;
+        }
+
+        d->vnuma.nr_vnodes = nr_vnodes; 
+        if ( !guest_handle_is_null(op->u.vnuma.vdistance) )
+            if ( unlikely(copy_from_guest(d->vnuma.vdistance,
+                                          op->u.vnuma.vdistance,
+                                          dist_size)) )
+                goto err_dom;
+
+        if ( !guest_handle_is_null(op->u.vnuma.vnuma_memblks) )
+            if ( unlikely(copy_from_guest(d->vnuma.vnuma_memblks,
+                                          op->u.vnuma.vnuma_memblks,
+                                          nr_vnodes)))
+                goto err_dom;
+
+        if ( !guest_handle_is_null(op->u.vnuma.vcpu_to_vnode) )
+            if ( unlikely(copy_from_guest(d->vnuma.vcpu_to_vnode,
+                                          op->u.vnuma.vcpu_to_vnode,
+                                          d->max_vcpus)) )
+                goto err_dom;
+
+        if ( !guest_handle_is_null(op->u.vnuma.vnode_to_pnode) )
+        {
+            if ( unlikely(copy_from_guest(d->vnuma.vnode_to_pnode,
+                                          op->u.vnuma.vnode_to_pnode,
+                                          nr_vnodes)) )
+                goto err_dom;
+
+        }
+        else
+            for( i = 0; i < nr_vnodes; i++ )
+                d->vnuma.vnode_to_pnode[i] = NUMA_NO_NODE;
+        ret = 0;
+err_dom:
+        if (ret != 0) {
+            d->vnuma.nr_vnodes = 0;
+            xfree( d->vnuma.vdistance );
+            xfree( d->vnuma.vnuma_memblks );
+            xfree( d->vnuma.vnode_to_pnode );
+        }
+    }
+    break;
+
     default:
         ret = arch_do_domctl(op, d, u_domctl);
         break;
diff --git a/xen/common/memory.c b/xen/common/memory.c
index 50b740f..e8915e4 100644
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -28,6 +28,7 @@
 #include <public/memory.h>
 #include <xsm/xsm.h>
 #include <xen/trace.h>
+#include <xen/vnuma.h>
 
 struct memop_args {
     /* INPUT */
@@ -732,7 +733,47 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
         rcu_unlock_domain(d);
 
         break;
+    case XENMEM_get_vnuma_info:
+    {
+        struct vnuma_topology_info mtopology;
+        struct domain *d;
+        unsigned int max_vcpus, nr_vnodes = 0;
 
+        rc = -EINVAL;
+
+        if( copy_from_guest(&mtopology, arg, 1) )
+            return -EFAULT;
+        if ( (d = rcu_lock_domain_by_any_id(mtopology.domid)) == NULL )
+            return -ESRCH;
+
+        nr_vnodes = d->vnuma.nr_vnodes;
+        max_vcpus = d->max_vcpus;
+
+        if ((nr_vnodes == 0) || (nr_vnodes > max_vcpus))
+            goto vnumaout;
+
+        mtopology.nr_vnodes = nr_vnodes;
+
+        if (copy_to_guest(arg , &mtopology, 1) != 0)
+            goto vnumaout;
+
+        if (copy_to_guest(mtopology.vnuma_memblks,
+                          d->vnuma.vnuma_memblks,
+                          nr_vnodes) != 0)
+            goto vnumaout;
+        if (copy_to_guest(mtopology.vdistance,
+                          d->vnuma.vdistance,
+                          nr_vnodes * nr_vnodes) != 0)
+            goto vnumaout;
+        if (copy_to_guest(mtopology.vcpu_to_vnode,
+                          d->vnuma.vcpu_to_vnode,
+                          max_vcpus) != 0)
+            goto vnumaout;
+        rc = 0;
+vnumaout:
+        rcu_unlock_domain(d);
+        break;
+    }
     default:
         rc = arch_memory_op(op, arg);
         break;
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index d4e479f..1adab25 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -35,6 +35,7 @@
 #include "xen.h"
 #include "grant_table.h"
 #include "hvm/save.h"
+#include "memory.h"
 
 #define XEN_DOMCTL_INTERFACE_VERSION 0x00000009
 
@@ -863,6 +864,17 @@ struct xen_domctl_set_max_evtchn {
 typedef struct xen_domctl_set_max_evtchn xen_domctl_set_max_evtchn_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_max_evtchn_t);
 
+struct xen_domctl_vnuma {
+    uint16_t nr_vnodes;
+    XEN_GUEST_HANDLE_64(uint) vdistance;
+    XEN_GUEST_HANDLE_64(uint) vcpu_to_vnode;
+    XEN_GUEST_HANDLE_64(uint) vnode_to_pnode;
+    XEN_GUEST_HANDLE_64(vnuma_memblk_t) vnuma_memblks;
+};
+
+typedef struct xen_domctl_vnuma xen_domctl_vnuma_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_vnuma_t);
+
 struct xen_domctl {
     uint32_t cmd;
 #define XEN_DOMCTL_createdomain                   1
@@ -932,6 +944,7 @@ struct xen_domctl {
 #define XEN_DOMCTL_setnodeaffinity               68
 #define XEN_DOMCTL_getnodeaffinity               69
 #define XEN_DOMCTL_set_max_evtchn                70
+#define XEN_DOMCTL_setvnumainfo                  71
 #define XEN_DOMCTL_gdbsx_guestmemio            1000
 #define XEN_DOMCTL_gdbsx_pausevcpu             1001
 #define XEN_DOMCTL_gdbsx_unpausevcpu           1002
@@ -992,6 +1005,7 @@ struct xen_domctl {
         struct xen_domctl_set_broken_page_p2m set_broken_page_p2m;
         struct xen_domctl_gdbsx_pauseunp_vcpu gdbsx_pauseunp_vcpu;
         struct xen_domctl_gdbsx_domstatus   gdbsx_domstatus;
+        struct xen_domctl_vnuma             vnuma;
         uint8_t                             pad[128];
     } u;
 };
diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
index 7a26dee..93a17e9 100644
--- a/xen/include/public/memory.h
+++ b/xen/include/public/memory.h
@@ -459,6 +459,14 @@ DEFINE_XEN_GUEST_HANDLE(xen_mem_sharing_op_t);
 * The zero value is appropiate.
 */
 
+struct vnuma_memblk {
+              uint64_t start, end;
+};
+typedef struct vnuma_memblk vnuma_memblk_t;
+DEFINE_XEN_GUEST_HANDLE(vnuma_memblk_t);
+
+#define XENMEM_get_vnuma_info 25
+
 #endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
 
 #endif /* __XEN_PUBLIC_MEMORY_H__ */
diff --git a/xen/include/xen/domain.h b/xen/include/xen/domain.h
index a057069..c9d53e3 100644
--- a/xen/include/xen/domain.h
+++ b/xen/include/xen/domain.h
@@ -4,6 +4,7 @@
 
 #include <public/xen.h>
 #include <asm/domain.h>
+#include <public/memory.h>
 
 typedef union {
     struct vcpu_guest_context *nat;
@@ -89,4 +90,14 @@ extern unsigned int xen_processor_pmbits;
 
 extern bool_t opt_dom0_vcpus_pin;
 
+struct domain_vnuma_info {
+    uint16_t nr_vnodes;
+    uint *vdistance;
+    uint *vcpu_to_vnode;
+    uint *vnode_to_pnode;
+    vnuma_memblk_t *vnuma_memblks;
+};
+
+void domain_vnuma_destroy(struct domain_vnuma_info *v);
+
 #endif /* __XEN_DOMAIN_H__ */
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 25bf637..6693b35 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -408,6 +408,7 @@ struct domain
     nodemask_t node_affinity;
     unsigned int last_alloc_node;
    spinlock_t node_affinity_lock;
+    struct domain_vnuma_info vnuma;
 };
 
 struct domain_setup_info
diff --git a/xen/include/xen/vnuma.h b/xen/include/xen/vnuma.h
new file mode 100644
index 0000000..2637476
--- /dev/null
+++ b/xen/include/xen/vnuma.h
@@ -0,0 +1,18 @@
+#ifndef _VNUMA_H
+#define _VNUMA_H
+#include <public/memory.h>
+
+/* DEFINE_XEN_GUEST_HANDLE(vnuma_memblk_t); */
+
+struct vnuma_topology_info {
+    domid_t domid;
+    uint16_t nr_vnodes;
+    uint32_t _pad;
+    XEN_GUEST_HANDLE_64(uint) vdistance;
+    XEN_GUEST_HANDLE_64(uint) vcpu_to_vnode;
+    XEN_GUEST_HANDLE_64(vnuma_memblk_t) vnuma_memblks;
+};
+typedef struct vnuma_topology_info vnuma_topology_info_t;
+DEFINE_XEN_GUEST_HANDLE(vnuma_topology_info_t);
+
+#endif
-- 
1.7.10.4
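
[For reference, a guest-side caller of the new XENMEM_get_vnuma_info subop
could look roughly like the sketch below. This is not part of the patch: it
assumes Linux-style plumbing (HYPERVISOR_memory_op(), set_xen_guest_handle(),
kcalloc()), assumes the caller already knows, or over-allocates for, the
number of vnodes and vcpus (the subop copies all arrays out in one go), and
assumes struct vnuma_topology_info is visible to the guest, which, as the
review below points out, requires moving it to a public header. All names
other than the hypercall interface itself are made up for the example.]

    /* Assumed includes: <linux/slab.h>, <asm/xen/hypercall.h>,
     * <xen/interface/xen.h>, <xen/interface/memory.h>. */

    /* Illustrative guest-side use of XENMEM_get_vnuma_info (a sketch). */
    static int xen_get_vnuma_topology(unsigned int nr_vnodes,
                                      unsigned int nr_vcpus)
    {
        struct vnuma_topology_info topo;
        struct vnuma_memblk *memblks;
        unsigned int *vdistance, *vcpu_to_vnode;
        int rc = -ENOMEM;

        /* Buffers must be large enough for what the hypervisor copies out. */
        memblks = kcalloc(nr_vnodes, sizeof(*memblks), GFP_KERNEL);
        vdistance = kcalloc(nr_vnodes * nr_vnodes, sizeof(*vdistance),
                            GFP_KERNEL);
        vcpu_to_vnode = kcalloc(nr_vcpus, sizeof(*vcpu_to_vnode), GFP_KERNEL);
        if (!memblks || !vdistance || !vcpu_to_vnode)
            goto out;

        memset(&topo, 0, sizeof(topo));
        topo.domid = DOMID_SELF;
        set_xen_guest_handle(topo.vnuma_memblks, memblks);
        set_xen_guest_handle(topo.vdistance, vdistance);
        set_xen_guest_handle(topo.vcpu_to_vnode, vcpu_to_vnode);

        rc = HYPERVISOR_memory_op(XENMEM_get_vnuma_info, &topo);
        if (rc == 0)
            pr_info("vNUMA: hypervisor reports %u virtual nodes\n",
                    topo.nr_vnodes);

     out:
        kfree(memblks);
        kfree(vdistance);
        kfree(vcpu_to_vnode);
        return rc;
    }

Since the hypervisor side rejects topologies with more vnodes than vcpus, a
guest that does not yet know nr_vnodes can safely size the buffers by its
vcpu count.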
>>> On 17.10.13 at 00:37, Elena Ufimtseva <ufimtseva@gmail.com> wrote:
> Changes since RFC v2:
> - fixed code style;

Sadly not enough yet.

> @@ -871,6 +872,77 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
>      }
>      break;
> 
> +    case XEN_DOMCTL_setvnumainfo:
> +    {
> +        unsigned int i, j, nr_vnodes, dist_size;
> +        unsigned int dist;

Why can't these all be together?

> +        

Line with only blanks.

> +        ret = -EFAULT;
> +        dist = i = j = 0;

Why can't these be initializers to their declarators?

> +
> +        nr_vnodes = op->u.vnuma.nr_vnodes;
> +        if ( nr_vnodes == 0 )
> +            goto err_dom;
> +        dist_size = nr_vnodes * nr_vnodes;

Potential for overflow?

> +
> +        d->vnuma.vdistance = xmalloc_bytes(
> +            sizeof(*d->vnuma.vdistance) * dist_size);
> +        d->vnuma.vnuma_memblks = xmalloc_bytes(
> +            sizeof(*d->vnuma.vnuma_memblks) * nr_vnodes);
> +        d->vnuma.vcpu_to_vnode = xmalloc_bytes(
> +            sizeof(*d->vnuma.vcpu_to_vnode) * d->max_vcpus);
> +        d->vnuma.vnode_to_pnode = xmalloc_bytes(
> +            sizeof(*d->vnuma.vnode_to_pnode) * nr_vnodes);

Is there any reason why any of these really has to use xmalloc_bytes()
instead of xmalloc_array() (which takes care of overflow conditions as
well as type correctness for you)?

Further there's nothing preventing this call from being issued twice
for a given domain, yet you blindly overwrite the old values (and thus
leak them).

> +
> +        if ( d->vnuma.vdistance == NULL || d->vnuma.vnuma_memblks == NULL ||
> +             d->vnuma.vcpu_to_vnode == NULL ||
> +             d->vnuma.vnode_to_pnode == NULL )
> +        {
> +            ret = -ENOMEM;
> +            goto err_dom;
> +        }
> +
> +        d->vnuma.nr_vnodes = nr_vnodes; 

Trailing blank. Also I'd strongly recommend setting this field last, so
that other code won't risk getting confused if some of the copying
below fails - until all copying was successfully done, the domain
should look like not having any vNUMA info.

> +        if ( !guest_handle_is_null(op->u.vnuma.vdistance) )
> +            if ( unlikely(copy_from_guest(d->vnuma.vdistance,

Two if()-s like here should be combined into a single one.

> +                                          op->u.vnuma.vdistance,
> +                                          dist_size)) )
> +                goto err_dom;

And if guest_handle_is_null(op->u.vnuma.vdistance) d->vnuma.vdistance
will remain uninitialized - is that not a problem? (Both this and the
preceding comment apply further down again.)

> +
> +        if ( !guest_handle_is_null(op->u.vnuma.vnuma_memblks) )
> +            if ( unlikely(copy_from_guest(d->vnuma.vnuma_memblks,
> +                                          op->u.vnuma.vnuma_memblks,
> +                                          nr_vnodes)))
> +                goto err_dom;
> +
> +        if ( !guest_handle_is_null(op->u.vnuma.vcpu_to_vnode) )
> +            if ( unlikely(copy_from_guest(d->vnuma.vcpu_to_vnode,
> +                                          op->u.vnuma.vcpu_to_vnode,
> +                                          d->max_vcpus)) )
> +                goto err_dom;
> +
> +        if ( !guest_handle_is_null(op->u.vnuma.vnode_to_pnode) )
> +        {
> +            if ( unlikely(copy_from_guest(d->vnuma.vnode_to_pnode,
> +                                          op->u.vnuma.vnode_to_pnode,
> +                                          nr_vnodes)) )
> +                goto err_dom;
> +
> +        }
> +        else
> +            for( i = 0; i < nr_vnodes; i++ )

Missing blank after "for".

> +                d->vnuma.vnode_to_pnode[i] = NUMA_NO_NODE;
> +        ret = 0;
> +err_dom:

This should be indented by one space. I also don't see why it's being
named the way it is (it's for one too unspecific and also completely
unrelated to anything "dom"ish).

> +        if (ret != 0) {

if ( ret != 0 ) {

> +            d->vnuma.nr_vnodes = 0;
> +            xfree( d->vnuma.vdistance );
> +            xfree( d->vnuma.vnuma_memblks );
> +            xfree( d->vnuma.vnode_to_pnode );

Bogus blanks around function call arguments.

> @@ -732,7 +733,47 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
>          rcu_unlock_domain(d);
> 
>          break;
> +    case XENMEM_get_vnuma_info:
> +    {
> +        struct vnuma_topology_info mtopology;
> +        struct domain *d;
> +        unsigned int max_vcpus, nr_vnodes = 0;

Pretty pointless variables, and definitely a pointless initializer.

> 

The blank line here should actually be retained above the case label
(and similarly a blank line should be there before the next [default]
one).

> +        rc = -EINVAL;
> +
> +        if( copy_from_guest(&mtopology, arg, 1) )
> +            return -EFAULT;
> +        if ( (d = rcu_lock_domain_by_any_id(mtopology.domid)) == NULL )
> +            return -ESRCH;
> +
> +        nr_vnodes = d->vnuma.nr_vnodes;
> +        max_vcpus = d->max_vcpus;
> +
> +        if ((nr_vnodes == 0) || (nr_vnodes > max_vcpus))

Starting here and continuing all the way to the end of the function
you're again lacking blanks inside the outermost parentheses. Also, is
the right side of the || really possible?

> +            goto vnumaout;

This yields a -EINVAL return for something that isn't really a bad
function argument. Please use a more appropriate error code (like
-EOPNOTSUPP).

> +
> +        mtopology.nr_vnodes = nr_vnodes;

So this is the only field you changed.

> +
> +        if (copy_to_guest(arg , &mtopology, 1) != 0)

Hence you would be better off copying back just that one field. And
having used copy_from_guest() on the same guest memory block above,
using the __-prefixed variant here is correct and (for consistency
with other similar code) recommended.

> +            goto vnumaout;

And from here on the error return value ought to be -EFAULT.

> +
> +        if (copy_to_guest(mtopology.vnuma_memblks,
> +                          d->vnuma.vnuma_memblks,
> +                          nr_vnodes) != 0)

And here we see that the lack of initialization above _is_ a (security)
problem - you're possibly leaking hypervisor memory contents to the
guest here.

> +            goto vnumaout;
> +        if (copy_to_guest(mtopology.vdistance,
> +                          d->vnuma.vdistance,
> +                          nr_vnodes * nr_vnodes) != 0)
> +            goto vnumaout;
> +        if (copy_to_guest(mtopology.vcpu_to_vnode,
> +                          d->vnuma.vcpu_to_vnode,
> +                          max_vcpus) != 0)
> +            goto vnumaout;
> +        rc = 0;
> +vnumaout:

Again needs to be indented by one space.

> @@ -863,6 +864,17 @@ struct xen_domctl_set_max_evtchn {
>  typedef struct xen_domctl_set_max_evtchn xen_domctl_set_max_evtchn_t;
>  DEFINE_XEN_GUEST_HANDLE(xen_domctl_set_max_evtchn_t);
> 
> +struct xen_domctl_vnuma {
> +    uint16_t nr_vnodes;

Please insert explicit padding here. And if you want the structure to
be extensible without having to increment the domctl interface version,
you'd also require (and check!) the padding space to be
zero-initialized.

> +struct vnuma_memblk {
> +              uint64_t start, end;

Too deep indentation.

> --- a/xen/include/xen/domain.h
> +++ b/xen/include/xen/domain.h
> @@ -4,6 +4,7 @@
> 
>  #include <public/xen.h>
>  #include <asm/domain.h>
> +#include <public/memory.h>

Avoid such unnecessary dependencies if you can. And you can here ...

> @@ -89,4 +90,14 @@ extern unsigned int xen_processor_pmbits;
> 
>  extern bool_t opt_dom0_vcpus_pin;
> 
> +struct domain_vnuma_info {
> +    uint16_t nr_vnodes;
> +    uint *vdistance;
> +    uint *vcpu_to_vnode;
> +    uint *vnode_to_pnode;
> +    vnuma_memblk_t *vnuma_memblks;

... by using struct vnuma_memblk here.

> --- /dev/null
> +++ b/xen/include/xen/vnuma.h
> @@ -0,0 +1,18 @@
> +#ifndef _VNUMA_H
> +#define _VNUMA_H
> +#include <public/memory.h>
> +
> +/* DEFINE_XEN_GUEST_HANDLE(vnuma_memblk_t); */
> +
> +struct vnuma_topology_info {
> +    domid_t domid;
> +    uint16_t nr_vnodes;
> +    uint32_t _pad;
> +    XEN_GUEST_HANDLE_64(uint) vdistance;
> +    XEN_GUEST_HANDLE_64(uint) vcpu_to_vnode;
> +    XEN_GUEST_HANDLE_64(vnuma_memblk_t) vnuma_memblks;
> +};
> +typedef struct vnuma_topology_info vnuma_topology_info_t;
> +DEFINE_XEN_GUEST_HANDLE(vnuma_topology_info_t);

This being the argument of XENMEM_get_vnuma_info, how can this live in
a non-public header? (I'm sure I had pointed this out before.)

Jan
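
[Pulling the allocation/copy comments above together, the set side could be
restructured roughly as below. This is only a sketch of one possible rework,
not the next revision of the patch: it uses xmalloc_array() instead of
xmalloc_bytes(), stages into local pointers so a repeated domctl does not
leak the previous arrays, publishes nr_vnodes last, and, for simplicity,
requires all four guest handles, which also avoids handing uninitialised
memory back out later. The helper name and the decision to reject null
handles are assumptions made for illustration.]

    static int vnuma_set_info(struct domain *d,
                              const struct xen_domctl_vnuma *u)
    {
        unsigned int nr_vnodes = u->nr_vnodes;
        unsigned int *vdistance = NULL, *vcpu_to_vnode = NULL;
        unsigned int *vnode_to_pnode = NULL;
        struct vnuma_memblk *memblks = NULL;
        int ret = -ENOMEM;

        if ( nr_vnodes == 0 )
            return -EINVAL;

        /* xmalloc_array() refuses allocations whose size would overflow. */
        vdistance      = xmalloc_array(unsigned int, nr_vnodes * nr_vnodes);
        vcpu_to_vnode  = xmalloc_array(unsigned int, d->max_vcpus);
        vnode_to_pnode = xmalloc_array(unsigned int, nr_vnodes);
        memblks        = xmalloc_array(struct vnuma_memblk, nr_vnodes);
        if ( !vdistance || !vcpu_to_vnode || !vnode_to_pnode || !memblks )
            goto fail;

        ret = -EFAULT;
        if ( copy_from_guest(vdistance, u->vdistance,
                             nr_vnodes * nr_vnodes) ||
             copy_from_guest(vcpu_to_vnode, u->vcpu_to_vnode,
                             d->max_vcpus) ||
             copy_from_guest(vnode_to_pnode, u->vnode_to_pnode,
                             nr_vnodes) ||
             copy_from_guest(memblks, u->vnuma_memblks, nr_vnodes) )
            goto fail;

        /* Commit: drop any previous topology, then publish the new one. */
        domain_vnuma_destroy(&d->vnuma);
        d->vnuma.vdistance = vdistance;
        d->vnuma.vcpu_to_vnode = vcpu_to_vnode;
        d->vnuma.vnode_to_pnode = vnode_to_pnode;
        d->vnuma.vnuma_memblks = memblks;
        d->vnuma.nr_vnodes = nr_vnodes;  /* set last, per the review */

        return 0;

     fail:
        xfree(vdistance);
        xfree(vcpu_to_vnode);
        xfree(vnode_to_pnode);
        xfree(memblks);
        return ret;
    }

With a helper along these lines, the XEN_DOMCTL_setvnumainfo case would
collapse to little more than ret = vnuma_set_info(d, &op->u.vnuma);.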
Hi Elena,
Congratulations on your work again!

Have you considered the other memory operations in xen/common/memory.c?
There are two important functions: decrease_reservation(&args) and
populate_physmap(&args).

decrease_reservation(&args) removes pages from a domain.
populate_physmap(&args) allocates pages for a domain.

Both of them have NUMA operations. (There is also a function named
increase_reservation(&args); I don't know why it's here, since a guest
domain wouldn't use it as far as I know...)
The guest domain passes the mask of nodes to Xen through these two
hypercalls.

For decrease_reservation, Xen will also receive a number of pages. We
just free them from the domain. Here, we should update the memory size
of the vnodes and pnodes (I think you keep a counter for the page
numbers of each vnode and pnode, something like vnuma_memszs, but
please forgive me that you have submitted such a huge patch that I
could not understand everything in time : - | ).

For populate_physmap, Xen will allocate blank pages from its heap for
the guest domain, from specific nodes, according to the nodemask. Here
we should update your counters too!

And as I see, we don't have a protocol here on whether the nodemask in
(&args) holds pnode or vnode IDs.

I think it should be vnode, since the guest domain knows nothing about
the node affinity.

So my idea could be: we communicate with the guest domain using vnode
IDs. If we need to change the memory size of the guest domain, for
example a memory increase/decrease on pnode[0], we use your node
affinity to change pnode[0] into a vnode mask and pass it to the guest
domain. And in the two functions of memory.c mentioned above, we
receive the vnode mask and translate it back into a pnode mask, so it
will work perfectly! And we don't need an extra hypercall for the guest
domain any more!

Elena and Dario, what are your opinions?



-- 

Yechen Li

Team of System Virtualization and Cloud Computing
School of Electronic Engineering and Computer Science,
Peking University, China

Nothing is impossible because impossible itself says: "I'm possible"
lccycc From PKU
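
[To make the vnode/pnode translation discussed above concrete: if the memory
ops were defined to carry vnode IDs, the hypervisor could convert a
guest-supplied vnode mask into a physical node mask with a helper along the
lines below, using the vnode_to_pnode map from Elena's series. This is purely
illustrative; neither the helper nor its hook into populate_physmap() /
decrease_reservation() exists in the patch above.]

    /*
     * Illustrative only: map a vnode mask (as a guest would express it)
     * to the corresponding pnode mask, using the per-domain
     * vnode_to_pnode table introduced by this series.
     */
    static void vnodemask_to_pnodemask(const struct domain *d,
                                       const nodemask_t *vnodes,
                                       nodemask_t *pnodes)
    {
        unsigned int vnode;

        nodes_clear(*pnodes);

        for ( vnode = 0; vnode < d->vnuma.nr_vnodes; vnode++ )
        {
            unsigned int pnode;

            if ( !node_isset(vnode, *vnodes) )
                continue;

            pnode = d->vnuma.vnode_to_pnode[vnode];
            if ( pnode != NUMA_NO_NODE )
                node_set(pnode, *pnodes);
        }
    }

The per-vnode memory counters mentioned above would still need to be updated
in those two paths; that is a separate question from the mask translation.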
Dario,
I am just replying to this email again in case you haven't seen it :)

On Tue, Oct 22, 2013 at 10:31 PM, Li Yechen <lccycc123@gmail.com> wrote:
> Hi Elena,
> Congratulations on your work again!
>
> Have you considered the other memory operations in xen/common/memory.c?
> There are two important functions: decrease_reservation(&args) and
> populate_physmap(&args).
>
> decrease_reservation(&args) removes pages from a domain.
> populate_physmap(&args) allocates pages for a domain.
>
> Both of them have NUMA operations. (There is also a function named
> increase_reservation(&args); I don't know why it's here, since a guest
> domain wouldn't use it as far as I know...)
> The guest domain passes the mask of nodes to Xen through these two
> hypercalls.
>
> For decrease_reservation, Xen will also receive a number of pages. We
> just free them from the domain. Here, we should update the memory size
> of the vnodes and pnodes (I think you keep a counter for the page
> numbers of each vnode and pnode, something like vnuma_memszs, but
> please forgive me that you have submitted such a huge patch that I
> could not understand everything in time : - | ).
>
> For populate_physmap, Xen will allocate blank pages from its heap for
> the guest domain, from specific nodes, according to the nodemask. Here
> we should update your counters too!
>
> And as I see, we don't have a protocol here on whether the nodemask in
> (&args) holds pnode or vnode IDs.
>
> I think it should be vnode, since the guest domain knows nothing about
> the node affinity.
>
> So my idea could be: we communicate with the guest domain using vnode
> IDs. If we need to change the memory size of the guest domain, for
> example a memory increase/decrease on pnode[0], we use your node
> affinity to change pnode[0] into a vnode mask and pass it to the guest
> domain. And in the two functions of memory.c mentioned above, we
> receive the vnode mask and translate it back into a pnode mask, so it
> will work perfectly! And we don't need an extra hypercall for the guest
> domain any more!
>
> Elena and Dario, what are your opinions?
>
>
> --
>
> Yechen Li
>
> Team of System Virtualization and Cloud Computing
> School of Electronic Engineering and Computer Science,
> Peking University, China
>
> Nothing is impossible because impossible itself says: "I'm possible"
> lccycc From PKU
>

-- 

Yechen Li

Team of System Virtualization and Cloud Computing
School of Electronic Engineering and Computer Science,
Peking University, China

Nothing is impossible because impossible itself says: "I'm possible"
lccycc From PKU
On gio, 2013-11-21 at 18:00 +0800, Li Yechen wrote:
> Dario,
> I am just replying to this email again in case you haven't seen it :)
>
No, don't worry, I haven't forgotten. :-)

> On Tue, Oct 22, 2013 at 10:31 PM, Li Yechen <lccycc123@gmail.com>
> wrote:
>         Hi Elena,
>
>         Congratulations on your work again!
>
>         Have you considered the other memory operations in
>         xen/common/memory.c?
>
>         There are two important functions: decrease_reservation(&args)
>         and populate_physmap(&args).
>
>         decrease_reservation(&args) removes pages from a domain.
>         populate_physmap(&args) allocates pages for a domain.
>
Yes, that's definitely something we need to address, probably in this
patch series, even before thinking about NUMA aware ballooning.

>         The guest domain passes the mask of nodes to Xen through these
>         two hypercalls.
>
>         For decrease_reservation, Xen will also receive a number of
>         pages. We just free them from the domain. Here, we should
>         update the memory size of the vnodes and pnodes (I think you
>         keep a counter for the page numbers of each vnode and pnode,
>         something like vnuma_memszs, but please forgive me that you
>         have submitted such a huge patch that I could not understand
>         everything in time : - | ).
>
>         For populate_physmap, Xen will allocate blank pages from its
>         heap for the guest domain, from specific nodes, according to
>         the nodemask. Here we should update your counters too!
>
Well, I haven't gone and re-checked the code, but that does make sense.
In Edinburgh, Elena told me that she did some tests of ballooning with
her series applied, and nothing exploded (which is already something.
:-D). We definitely should double check what happens, where the pages
come from / are taken from, and ensure the accounting is done right.

>         And as I see, we don't have a protocol here on whether the
>         nodemask in (&args) holds pnode or vnode IDs.
>
>         I think it should be vnode, since the guest domain knows
>         nothing about the node affinity.
>
>         So my idea could be: we communicate with the guest domain
>         using vnode IDs. If we need to change the memory size of the
>         guest domain, for example a memory increase/decrease on
>         pnode[0], we use your node affinity to change pnode[0] into a
>         vnode mask and pass it to the guest domain. And in the two
>         functions of memory.c mentioned above, we receive the vnode
>         mask and translate it back into a pnode mask, so it will work
>         perfectly! And we don't need an extra hypercall for the guest
>         domain any more!
>
Mmm... it may be me, but I'm not sure I follow. I agree that the guest
should speak vnode-s, but I'm not sure I get what you mean when you say
"use your node affinity to change pnode[0] into a vnode mask and pass
it to the guest domain".

Anyway, I was thinking: you did a pretty nice job of hacking something
together for NUMA aware ballooning when vNUMA wasn't even released. Now
that Elena's patchset is out, how about you try to adapt what you had
at the time, plus the outcome of all that nice discussion we also had,
on top of it, and show us what happens? :-)

Elena's patches are not in their final form, but they should constitute
a fairly decent basis for another, and this time easier to understand
and to review, proof-of-concept implementation, isn't that so?

Of course, there's no hurry; this will definitely be something we'll
consider for the next release of Xen (so 4.5, not 4.4, which will
hopefully be released in January), i.e., there should be plenty of
time. :-D

What do you think?
Thanks and Regards,
Dario

-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)