Yoshiaki Tamura
2009-Mar-12 01:14 UTC
[Xen-devel] [RFC][PATCH 00/13] Kemari: updated to the 3.4 unstable tree
Hi, This patch series is an updated version of Kemari that we posted last week. http://lists.xensource.com/archives/html/xen-devel/2009-03/msg00371.html Thanks for the comments; we're working to improve and clean up the code appropriately. The following patches can be applied to the 3.4 unstable tree. xen, xend: xen-unstable.hg c/s 19303:b249f3e979a5 linux: linux-2.6.18-xen.hg c/s 817:0430b1dbfb3a ioemu: qemu-xen-unstable.git c/s 82787c6f689d869ad349df83ec3f58702afe00fe If you can take a look, it would be very helpful. Any comments are appreciated. Thanks, Yoshi Overview: Kemari in the VMM taps the event channel, pauses the guest (without suspending it), prepares for transfer, and Kemari in userland transfers the guest. On failover, Kemari on the secondary restores the guest, and the backend drivers in dom0 set up the backend rings from the state of the shared rings in the guest. [01] xen: add ECS_TAP state to event channel [02] xen: core Kemari code [03] xend: change parameter type of xc_{set,get}_hvm_param [04] xend: Kemari controller interface in libxc [05] xend: Kemari sender in libxc [06] xend: Kemari receiver libxc [07] xend: add Kemari support to python [08] xend: add dev state "Attached" to python [09] linux: add XenbusStateAttached to xenbus [10] linux: XenbusStateAttached handler for blkback [11] linux: XenbusStateAttached handler for netback [12] ioemu: use signal to save qemu state for Kemari [13] ioemu: use shared region with to flip logdirty_bitmap _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Yoshiaki Tamura
2009-Mar-12 01:15 UTC
[Xen-devel] [RFC][PATCH 01/13] Kemari: add ECS_TAP state to event channel
This is an updated version of the following patch. No major changes. http://lists.xensource.com/archives/html/xen-devel/2009-03/msg00369.html Signed-off-by: Yoshi Tamura <tamura.yoshiaki@lab.ntt.co.jp> Signed-off-by: Yoshisato Yanagisawa <yanagisawa.yoshisato@lab.ntt.co.jp> --- xen/common/event_channel.c | 150 ++++++++++++++++++++++++++++++++++++++++++++- xen/include/xen/event.h | 14 ++++ xen/include/xen/sched.h | 10 +++ 3 files changed, 173 insertions(+), 1 deletion(-) diff -r b249f3e979a5 -r cf6a910e3663 xen/include/xen/sched.h --- a/xen/include/xen/sched.h Mon Mar 09 10:32:24 2009 +0000 +++ b/xen/include/xen/sched.h Wed Mar 11 18:03:47 2009 +0900 @@ -20,6 +20,7 @@ #include <xen/rcupdate.h> #include <xen/irq.h> #include <xen/mm.h> +#include <xen/kemari.h> #ifdef CONFIG_COMPAT #include <compat/vcpu.h> @@ -47,6 +48,7 @@ #define ECS_PIRQ 4 /* Channel is bound to a physical IRQ line. */ #define ECS_VIRQ 5 /* Channel is bound to a virtual IRQ line. */ #define ECS_IPI 6 /* Channel is bound to a virtual IPI line. */ +#define ECS_TAP 7 /* Channel is bound and tapped. */ u8 state; /* ECS_* */ u8 consumer_is_xen; /* Consumed by Xen or by guest? */ u16 notify_vcpu_id; /* VCPU for local delivery notification */ @@ -61,6 +63,11 @@ u16 pirq; /* state == ECS_PIRQ */ u16 virq; /* state == ECS_VIRQ */ } u; + struct { + u8 mode; /* Tap IN, OUT or both. */ + /* Fucntion to call when an event is detected. */ + long (*redirect) (struct evtchn *lchn, struct evtchn *rchn); + } tap; #ifdef FLASK_ENABLE void *ssid; #endif @@ -255,6 +262,9 @@ /* OProfile support. */ struct xenoprof *xenoprof; int32_t time_offset_seconds; + + /* Kemari support. 
*/ + struct kemari *kemari; struct rcu_head rcu; diff -r b249f3e979a5 -r cf6a910e3663 xen/include/xen/event.h --- a/xen/include/xen/event.h Mon Mar 09 10:32:24 2009 +0000 +++ b/xen/include/xen/event.h Wed Mar 11 18:03:47 2009 +0900 @@ -82,4 +82,18 @@ mb(); /* set blocked status /then/ caller does his work */ \ } while ( 0 ) +struct evtchn_bind_tap { + /* IN parameters. */ + domid_t tap_dom; + uint32_t tap_port; + uint8_t mode; + long (*redirect) (struct evtchn *lchn, struct evtchn *rchn); +}; + +void notify_via_xen_evtchn_tap(struct domain *ld, int lport); + +long evtchn_bind_tap(struct evtchn_bind_tap *bind_tap); + +long evtchn_unbind_tap(struct evtchn_bind_tap *bind_tap); + #endif /* __XEN_EVENT_H__ */ diff -r b249f3e979a5 -r cf6a910e3663 xen/common/event_channel.c --- a/xen/common/event_channel.c Mon Mar 09 10:32:24 2009 +0000 +++ b/xen/common/event_channel.c Wed Mar 11 18:03:47 2009 +0900 @@ -191,7 +191,8 @@ if ( !port_is_valid(rd, rport) ) ERROR_EXIT_DOM(-EINVAL, rd); rchn = evtchn_from_port(rd, rport); - if ( (rchn->state != ECS_UNBOUND) || + /* kemari needs to reuse rchn information */ + if ( (rchn->state != ECS_UNBOUND) && (rchn->u.unbound.remote_domid != ld->domain_id) ) ERROR_EXIT_DOM(-EINVAL, rd); @@ -338,6 +339,113 @@ return rc; } +long evtchn_bind_tap(struct evtchn_bind_tap *bind_tap) +{ + struct evtchn *lchn, *rchn; + struct domain *ld, *rd; + int lport = bind_tap->tap_port, rport; + domid_t ldom = bind_tap->tap_dom; + long ret; + + if ( (ld = rcu_lock_domain_by_id(ldom)) == NULL ) + return -ESRCH; + + spin_lock(&ld->event_lock); + + ret = -EINVAL; + if ( !port_is_valid(ld, lport) ) + goto lchn_out; + lchn = evtchn_from_port(ld, lport); + if ( lchn->state != ECS_INTERDOMAIN ) + goto lchn_out; + + ret = -ESRCH; + rd = lchn->u.interdomain.remote_dom; + if ( rd == NULL ) + goto lchn_out; + + spin_lock(&rd->event_lock); + + rport = lchn->u.interdomain.remote_port; + if ( !port_is_valid(rd, rport) ) + goto rchn_out; + rchn = evtchn_from_port(rd, rport); + 
if ( rchn->state != ECS_INTERDOMAIN ) + goto rchn_out; + + lchn->state = ECS_TAP; + lchn->tap.mode = bind_tap->mode; + lchn->tap.redirect = bind_tap->redirect; + + rchn->state = ECS_TAP; + rchn->tap.redirect = bind_tap->redirect; + + ret = 0; + + rchn_out: + spin_unlock(&rd->event_lock); + + lchn_out: + spin_unlock(&ld->event_lock); + + rcu_unlock_domain(ld); + + return ret; +} + +long evtchn_unbind_tap(struct evtchn_bind_tap *bind_tap) +{ + struct evtchn *lchn, *rchn; + struct domain *ld, *rd; + int lport = bind_tap->tap_port, rport; + domid_t ldom = bind_tap->tap_dom; + long ret; + + if ( (ld = rcu_lock_domain_by_id(ldom)) == NULL ) + return -ESRCH; + + spin_lock(&ld->event_lock); + + ret = -EINVAL; + if ( !port_is_valid(ld, lport) ) + goto lchn_out; + lchn = evtchn_from_port(ld, lport); + if ( lchn->state != ECS_TAP ) + goto lchn_out; + + ret = -ESRCH; + rd = lchn->u.interdomain.remote_dom; + if ( rd == NULL ) + goto lchn_out; + + spin_lock(&rd->event_lock); + + rport = lchn->u.interdomain.remote_port; + if ( !port_is_valid(rd, rport) ) + goto rchn_out; + rchn = evtchn_from_port(rd, rport); + if ( rchn->state != ECS_TAP ) + goto rchn_out; + + lchn->state = ECS_INTERDOMAIN; + lchn->tap.mode = bind_tap->mode; + lchn->tap.redirect = NULL; + + rchn->state = ECS_INTERDOMAIN; + rchn->tap.redirect = NULL; + + ret = 0; + + rchn_out: + spin_unlock(&rd->event_lock); + + lchn_out: + spin_unlock(&ld->event_lock); + + rcu_unlock_domain(ld); + + return ret; +} static long __evtchn_close(struct domain *d1, int port1) { @@ -393,6 +501,7 @@ case ECS_IPI: break; + case ECS_TAP: case ECS_INTERDOMAIN: if ( d2 == NULL ) { @@ -430,6 +539,14 @@ BUG_ON(!port_is_valid(d2, port2)); chn2 = evtchn_from_port(d2, port2); + + if ( chn1->state == ECS_TAP ) + { + chn1->tap.redirect = NULL; + chn2->tap.redirect = NULL; + chn2->state = ECS_INTERDOMAIN; + } + BUG_ON(chn2->state != ECS_INTERDOMAIN); BUG_ON(chn2->u.interdomain.remote_dom != d1); @@ -499,6 +616,13 @@ switch ( lchn->state ) { + case 
ECS_TAP: + rd = lchn->u.interdomain.remote_dom; + rport = lchn->u.interdomain.remote_port; + rchn = evtchn_from_port(rd, rport); + + lchn->tap.redirect(lchn, rchn); + case ECS_INTERDOMAIN: rd = lchn->u.interdomain.remote_dom; rport = lchn->u.interdomain.remote_port; @@ -1009,6 +1133,30 @@ spin_unlock(&ld->event_lock); } +void notify_via_xen_evtchn_tap(struct domain *ld, int lport) +{ + struct evtchn *lchn, *rchn; + struct domain *rd; + int rport; + + if (ld != current->domain) + spin_lock(&ld->event_lock); + + ASSERT(port_is_valid(ld, lport)); + lchn = evtchn_from_port(ld, lport); + ASSERT(lchn->consumer_is_xen); + + if ( likely(lchn->state == ECS_INTERDOMAIN) ) + { + rd = lchn->u.interdomain.remote_dom; + rport = lchn->u.interdomain.remote_port; + rchn = evtchn_from_port(rd, rport); + evtchn_set_pending(rd->vcpu[rchn->notify_vcpu_id], rport); + } + + if (ld != current->domain) + spin_unlock(&ld->event_lock); +} int evtchn_init(struct domain *d) { _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Yoshiaki Tamura
2009-Mar-12 01:16 UTC
[Xen-devel] [RFC][PATCH 02/13] Kemari: core kemari code
This is an updated version of the following patch. No major changes. http://lists.xensource.com/archives/html/xen-devel/2009-03/msg00373.html Signed-off-by: Yoshi Tamura <tamura.yoshiaki@lab.ntt.co.jp> Signed-off-by: Yoshisato Yanagisawa <yanagisawa.yoshisato@lab.ntt.co.jp> --- xen/arch/x86/Makefile | 1 xen/arch/x86/domain.c | 4 xen/arch/x86/domctl.c | 16 xen/arch/x86/kemari/Makefile | 1 xen/arch/x86/kemari/kemari.c | 670 +++++++++++++++++++++++++++++++++++++++++ xen/include/public/domctl.h | 33 ++ xen/include/public/io/xenbus.h | 4 xen/include/public/kemari.h | 97 +++++ xen/include/xen/kemari.h | 75 ++++ 9 files changed, 900 insertions(+), 1 deletion(-) diff -r b249f3e979a5 -r cf6a910e3663 xen/include/public/kemari.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/include/public/kemari.h Wed Mar 11 18:03:47 2009 +0900 @@ -0,0 +1,97 @@ +/****************************************************************************** + * kemari.h + * + * Tools interface to Kemari. + * + * Copyright (c) 2008 Nippon Telegraph and Telephone Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef __XEN_PUBLIC_KEMARI_H__ +#define __XEN_PUBLIC_KEMARI_H__ + +#define KEMARI_TAP_OFF 0 +#define KEMARI_TAP_IN 1 +#define KEMARI_TAP_OUT 2 + +struct kemari_ring { + uint32_t cons; + uint32_t prod; + uint32_t num_ents; + unsigned int dirty_bitmap_size; /* num of ditry bits */ + struct { + uint32_t buf_size; + uint32_t rec_size; + uint32_t buf_offset; + } hvm_ctxt; + char data[1]; +}; + +struct kemari_ent { + union { + struct { + uint16_t pages; + uint16_t port; + } header; + struct { + uint16_t start; + uint16_t end; + } index; + unsigned long dirty_bitmap; + } u; +}; + +#define KEMARI_RING_GET_PROD(_ring) \ + (&((struct kemari_ent *)(_ring)->data)[(_ring)->prod % (_ring)->num_ents]) + +#define KEMARI_RING_GET_CONS(_ring) \ + (&((struct kemari_ent *)(_ring)->data)[(_ring)->cons % (_ring)->num_ents]) + +static inline void kemari_ring_read(struct kemari_ring *ring, + struct kemari_ent **buf) +{ + *buf = KEMARI_RING_GET_CONS(ring); +#ifdef __XEN__ + wmb(); +#elif __XEN_TOOLS__ + xen_wmb(); +#endif + ring->cons++; +} + +static inline void kemari_ring_write(struct kemari_ring *ring, + struct kemari_ent *buf) +{ + memcpy(KEMARI_RING_GET_PROD(ring), buf, sizeof(struct kemari_ent)); +#ifdef __XEN__ + wmb(); +#elif __XEN_TOOLS__ + xen_wmb(); +#endif + ring->prod++; +} + +#endif /* __XEN_PUBLIC_KEMARI_H__ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r b249f3e979a5 -r cf6a910e3663 xen/include/public/domctl.h --- a/xen/include/public/domctl.h Mon Mar 09 10:32:24 2009 +0000 +++ b/xen/include/public/domctl.h Wed Mar 11 18:03:47 2009 +0900 @@ -645,6 +645,38 @@ } xen_domctl_hvmcontext_partial_t; DEFINE_XEN_GUEST_HANDLE(xen_domctl_hvmcontext_partial_t); +/* 
Kemari interface */ +#define XEN_DOMCTL_kemari_op 56 + +#define _XEN_KEMARI_OP_enable 0 +#define XEN_KEMARI_OP_enable (1UL<<_XEN_KEMARI_OP_enable) +#define _XEN_KEMARI_OP_off 1 +#define XEN_KEMARI_OP_off (1UL<<_XEN_KEMARI_OP_off) +#define _XEN_KEMARI_OP_attach 2 +#define XEN_KEMARI_OP_attach (1UL<<_XEN_KEMARI_OP_attach) +#define _XEN_KEMARI_OP_detach 3 +#define XEN_KEMARI_OP_detach (1UL<<_XEN_KEMARI_OP_detach) + +struct xen_domctl_kemari_op { + uint32_t cmd; + + union { + struct { + uint32_t port; + uint32_t num_pages; + uint64_t mfn; + } enable; /* XEN_KEMARI_OP_enable */ + struct { + uint32_t port; + uint16_t evtchn_tap_mode; + } attach; /* XEN_KEMARI_OP_attach */ + struct { + uint32_t port; + } detach; /* XEN_KEMARI_OP_detach */ + } u; +}; +typedef struct xen_domctl_kemari_op xen_domctl_kemari_op_t; +DEFINE_XEN_GUEST_HANDLE(xen_domctl_kemari_op_t); struct xen_domctl { uint32_t cmd; @@ -687,6 +719,7 @@ struct xen_domctl_set_target set_target; struct xen_domctl_subscribe subscribe; struct xen_domctl_debug_op debug_op; + struct xen_domctl_kemari_op kemari_op; #if defined(__i386__) || defined(__x86_64__) struct xen_domctl_cpuid cpuid; #endif diff -r b249f3e979a5 -r cf6a910e3663 xen/include/public/io/xenbus.h --- a/xen/include/public/io/xenbus.h Mon Mar 09 10:32:24 2009 +0000 +++ b/xen/include/public/io/xenbus.h Wed Mar 11 18:03:47 2009 +0900 @@ -63,7 +63,9 @@ */ XenbusStateReconfiguring = 7, - XenbusStateReconfigured = 8 + XenbusStateReconfigured = 8, + + XenbusStateAttached = 9 }; typedef enum xenbus_state XenbusState; diff -r b249f3e979a5 -r cf6a910e3663 xen/include/xen/kemari.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/include/xen/kemari.h Wed Mar 11 18:03:47 2009 +0900 @@ -0,0 +1,75 @@ +/****************************************************************************** + * kemari.h + * + * Kemari header file. 
+ * + * Copyright (C) 2008 Nippon Telegraph and Telephone Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef __XEN_KEMARI_H__ +#define __XEN_KEMARI_H__ + +#include <public/domctl.h> + +#define NUM_KEMARI_TAPS 32 + +#define _KEMARI_TAP_ATTACHED 0 +#define KEMARI_TAP_ATTACHED (1UL<<_KEMARI_TAP_ATTACHED) +#define _KEMARI_TAP_DETACHED 1 +#define KEMARI_TAP_DETACHED (1UL<<_KEMARI_TAP_DETACHED) + +struct kemari_tap { + uint64_t status; + uint64_t in_events; + uint64_t out_events; +}; + +/* Main data structure of Kemari */ +struct kemari { + struct domain *domain; + + struct kemari_ring *ring; + + uint32_t port; + + uint32_t num_pages; + + uint64_t mfn; + + uint64_t num_events; + + uint64_t priv_dirty_pages; + + struct kemari_tap taps[NUM_KEMARI_TAPS]; +}; + +long kemari_off(struct domain *d); + +/* Entry point to Kemari */ +long do_kemari_op(struct domain *d, struct xen_domctl_kemari_op *kemari_op); + +#endif + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r b249f3e979a5 -r cf6a910e3663 xen/arch/x86/Makefile --- a/xen/arch/x86/Makefile Mon Mar 09 10:32:24 2009 +0000 +++ b/xen/arch/x86/Makefile Wed Mar 11 18:03:47 2009 +0900 @@ -4,6 +4,7 @@ subdir-y += hvm subdir-y += mm subdir-y += 
oprofile +subdir-y += kemari subdir-$(x86_32) += x86_32 subdir-$(x86_64) += x86_64 diff -r b249f3e979a5 -r cf6a910e3663 xen/arch/x86/domain.c --- a/xen/arch/x86/domain.c Mon Mar 09 10:32:24 2009 +0000 +++ b/xen/arch/x86/domain.c Wed Mar 11 18:03:47 2009 +0900 @@ -1912,6 +1912,10 @@ BUG(); } + /* Turn off Kemari. */ + if ( d->kemari ) + kemari_off(d); + if ( is_hvm_domain(d) ) hvm_domain_relinquish_resources(d); diff -r b249f3e979a5 -r cf6a910e3663 xen/arch/x86/domctl.c --- a/xen/arch/x86/domctl.c Mon Mar 09 10:32:24 2009 +0000 +++ b/xen/arch/x86/domctl.c Wed Mar 11 18:03:47 2009 +0900 @@ -20,6 +20,7 @@ #include <xen/trace.h> #include <xen/console.h> #include <xen/iocap.h> +#include <xen/kemari.h> #include <xen/paging.h> #include <asm/irq.h> #include <asm/hvm/hvm.h> @@ -1079,6 +1080,21 @@ } break; + case XEN_DOMCTL_kemari_op: + { + struct domain *d = rcu_lock_domain_by_id(domctl->domain); + + ret = -ESRCH; + if ( unlikely(d == NULL) ) + break; + + ret = do_kemari_op(d, &domctl->u.kemari_op); + + copy_to_guest(u_domctl, domctl, 1); + rcu_unlock_domain(d); + } + break; + default: ret = -ENOSYS; break; diff -r b249f3e979a5 -r cf6a910e3663 xen/arch/x86/kemari/Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/arch/x86/kemari/Makefile Wed Mar 11 18:03:47 2009 +0900 @@ -0,0 +1,1 @@ +obj-y += kemari.o diff -r b249f3e979a5 -r cf6a910e3663 xen/arch/x86/kemari/kemari.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/arch/x86/kemari/kemari.c Wed Mar 11 18:03:47 2009 +0900 @@ -0,0 +1,670 @@ +/****************************************************************************** + * kemari.c + * + * The hypervisor part of VM synchronization mechanism (Kemari). 
+ * + * Copyright (c) 2008 Nippon Telegraph and Telephone Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copied log_dirty_lock(_d), log_dirty_unlock(_d) and paging_log_dirty_op() + * from arch/x86/paging.c. + * + * x86 specific paging support + * Copyright (c) 2007 Advanced Micro Devices (Wei Huang) + * Copyright (c) 2007 XenSource Inc. 
+ */ + +#include <xen/config.h> +#include <xen/errno.h> +#include <xen/sched.h> +#include <xen/event.h> +#include <xen/kemari.h> +#include <xen/mm.h> +#include <xen/domain.h> + +#include <public/kemari.h> +#include <asm/domain.h> +#include <asm/hvm/support.h> +#include <asm/page.h> +#include <asm/paging.h> +#include <asm/shadow.h> +#include <asm/types.h> + +/* Override macros from asm/page.h to make them work with mfn_t */ +#undef mfn_valid +#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn)) + +#define log_dirty_lock(_d) \ + do { \ + if (unlikely((_d)->arch.paging.log_dirty.locker==current->processor))\ + { \ + printk("Error: paging log dirty lock held by %s\n", \ + (_d)->arch.paging.log_dirty.locker_function); \ + BUG(); \ + } \ + spin_lock(&(_d)->arch.paging.log_dirty.lock); \ + ASSERT((_d)->arch.paging.log_dirty.locker == -1); \ + (_d)->arch.paging.log_dirty.locker = current->processor; \ + (_d)->arch.paging.log_dirty.locker_function = __func__; \ + } while (0) + +#define log_dirty_unlock(_d) \ + do { \ + ASSERT((_d)->arch.paging.log_dirty.locker == current->processor); \ + (_d)->arch.paging.log_dirty.locker = -1; \ + (_d)->arch.paging.log_dirty.locker_function = "nobody"; \ + spin_unlock(&(_d)->arch.paging.log_dirty.lock); \ + } while (0) + +#define bucket_from_port(d,p) \ + ((d)->evtchn[(p)/EVTCHNS_PER_BUCKET]) +#define port_is_valid(d,p) \ + (((p) >= 0) && ((p) < MAX_EVTCHNS(d)) && \ + (bucket_from_port(d,p) != NULL)) +#define evtchn_from_port(d,p) \ + (&(bucket_from_port(d,p))[(p)&(EVTCHNS_PER_BUCKET-1)]) + +static void kemari_send_domaininfo_ctxt(struct kemari_ring *ring, + struct domain *d) +{ + struct hvm_domain_context ctxt; + + if ( !d->is_paused_by_controller ) + { + dprintk(XENLOG_ERR, "Domain isn''t paused\n"); + return; + } + + ctxt.cur = 0; + ctxt.size = ring->hvm_ctxt.buf_size; + ctxt.data = (uint8_t *)ring + ring->hvm_ctxt.buf_offset; + hvm_save(d, &ctxt); + ring->hvm_ctxt.rec_size = ctxt.cur; +} + +static long kemari_send_dirty_bitmap_page(struct 
kemari_ring *ring, + struct domain *d, + unsigned long *dirty_bitmap, + uint16_t index, unsigned int bytes) +{ + uint16_t i, j; + struct kemari_ent *buf; + + for ( i = 0; i < bytes / BYTES_PER_LONG; i++ ) + { + j = i; + + while ( (j < bytes / BYTES_PER_LONG) && (dirty_bitmap[j] != 0) ) + j++; + + if ( i == j ) + continue; + + buf = KEMARI_RING_GET_PROD(ring); + buf->u.index.start = i + index; + buf->u.index.end = j + index; + wmb(); + ring->prod++; + + while( i < j ) + { + buf = (struct kemari_ent *)&dirty_bitmap[i]; + kemari_ring_write(ring, buf); + i++; + } + } + return i; +} + +/* Based on paging_log_dirty_op() in xen/arch/x86/mm/paging.c. */ +static long kemari_send_dirty_bitmap(struct kemari_ring *ring, + struct domain *d) +{ + long ret = 0, clean = 1, peek = 1; + unsigned long pages = 0; + unsigned long p2m_size; + mfn_t *l4, *l3, *l2; + unsigned long *l1; + int i4, i3, i2; + uint16_t index = 0; + + log_dirty_lock(d); + + if ( clean ) + { + d->arch.paging.log_dirty.fault_count = 0; + d->arch.paging.log_dirty.dirty_count = 0; + } + + if ( !mfn_valid(d->arch.paging.log_dirty.top) ) + { + ret = -EINVAL; /* perhaps should be ENOMEM? */ + goto out; + } + + if ( unlikely(d->arch.paging.log_dirty.failed_allocs) ) { + printk("%s: %d failed page allocs while logging dirty pages\n", + __FUNCTION__, d->arch.paging.log_dirty.failed_allocs); + ret = -ENOMEM; + goto out; + } + + pages = 0; + l4 = map_domain_page(mfn_x(d->arch.paging.log_dirty.top)); + + p2m_size = domain_get_maximum_gpfn(d) + 1; + + for ( i4 = 0; + (pages < p2m_size) && (i4 < LOGDIRTY_NODE_ENTRIES); + i4++ ) + { + l3 = mfn_valid(l4[i4]) ? map_domain_page(mfn_x(l4[i4])) : NULL; + for ( i3 = 0; + (pages < p2m_size) && (i3 < LOGDIRTY_NODE_ENTRIES); + i3++ ) + { + l2 = ((l3 && mfn_valid(l3[i3])) ? + map_domain_page(mfn_x(l3[i3])) : NULL); + for ( i2 = 0; + (pages < p2m_size) && (i2 < LOGDIRTY_NODE_ENTRIES); + i2++ ) + { + unsigned int bytes = PAGE_SIZE; + l1 = ((l2 && mfn_valid(l2[i2])) ? 
+ map_domain_page(mfn_x(l2[i2])) : NULL); + if ( unlikely(((p2m_size - pages + 7) >> 3) < bytes) ) + bytes = (unsigned int)((p2m_size - pages + + BITS_PER_LONG - 1) >> 3); + if ( likely(peek) ) + { + if ( l1 != NULL && + kemari_send_dirty_bitmap_page(ring, d, l1, + index, bytes) < 0 ) + { + ret = -EFAULT; + dprintk(XENLOG_ERR, + "%s: kemari_send_dirty_bitmap_page\n", + __FUNCTION__); + goto out; + } + } + index += PAGE_SIZE / BYTES_PER_LONG; + + if ( clean && l1 != NULL ) + clear_page(l1); + pages += bytes << 3; + if ( l1 != NULL ) + unmap_domain_page(l1); + } + if ( l2 ) + unmap_domain_page(l2); + } + if ( l3 ) + unmap_domain_page(l3); + } + unmap_domain_page(l4); + + log_dirty_unlock(d); + + if ( clean ) + { + /* We need to further call clean_dirty_bitmap() functions of specific + * paging modes (shadow or hap). Safe because the domain is paused. */ + d->arch.paging.log_dirty.clean_dirty_bitmap(d); + } + + return ret; + + out: + log_dirty_unlock(d); + + return ret; +} + +static void kemari_guest_notify(struct kemari *kemari) +{ + if ( likely(kemari != NULL) ) + notify_via_xen_evtchn_tap(kemari->domain, kemari->port); +} + +/* VM synchronization entry point. 
*/ +static long run_kemari(struct evtchn *lchn, struct evtchn *rchn) +{ + long ret; + uint32_t port; + uint64_t *events; + struct domain *d, *rd = lchn->u.interdomain.remote_dom; + struct kemari *kemari; + struct kemari_ring *ring; + struct evtchn *kemari_evtchn; + + if (lchn->tap.mode & KEMARI_TAP_OUT) + { + domain_pause_for_debugger(); + d = current->domain; + kemari = d->kemari; + port = rchn->u.interdomain.remote_port; + events = &kemari->taps[port].out_events; + } + else if (rchn->tap.mode & KEMARI_TAP_IN) + { + domain_pause_by_systemcontroller(rd); + d = rd; + kemari = rd->kemari; + port = lchn->u.interdomain.remote_port; + events = &kemari->taps[port].in_events; + } + else + { + ret = 0; + goto out; + } + + spin_lock(&d->grant_table->lock); + + ++*events; + + kemari_evtchn = evtchn_from_port(d, kemari->port); + if (kemari_evtchn->notify_vcpu_id != current->vcpu_id) + kemari_evtchn->notify_vcpu_id = current->vcpu_id; + + ring = kemari->ring; + + ret = kemari_send_dirty_bitmap(ring, d); + if ( ret < 0 ) + goto unlock_out; + + kemari_guest_notify(kemari); + + prepare_wait_on_xen_event_channel(kemari->port); + + test_and_clear_bit(_VPF_blocked_in_xen, ¤t->pause_flags); + + ret = 0; + + unlock_out: + spin_unlock(&d->grant_table->lock); + + out: + return ret; +} + +static long kemari_bind_tap(struct domain *d, + struct xen_domctl_kemari_op *kemari_op) +{ + long ret; + struct evtchn_bind_tap bind_tap; + + bind_tap.tap_dom = d->domain_id; + bind_tap.tap_port = kemari_op->u.attach.port; + bind_tap.mode = kemari_op->u.attach.evtchn_tap_mode; + bind_tap.redirect = run_kemari; + + ret = evtchn_bind_tap(&bind_tap); + + return ret; +} + +static long kemari_unbind_tap(struct domain *d, + struct xen_domctl_kemari_op *kemari_op) +{ + long ret; + struct evtchn_bind_tap unbind_tap; + + unbind_tap.tap_dom = d->domain_id; + unbind_tap.tap_port = kemari_op->u.detach.port; + unbind_tap.mode = KEMARI_TAP_OFF; + + ret = evtchn_unbind_tap(&unbind_tap); + + return ret; +} + +static 
long kemari_attach(struct domain *d, + struct xen_domctl_kemari_op *kemari_op) +{ + long ret; + uint32_t port = kemari_op->u.attach.port; + struct kemari *kemari = d->kemari; + struct kemari_tap *tap; + + dprintk(XENLOG_DEBUG, "%s: in\n", __FUNCTION__); + + ret = -EINVAL; + if ( unlikely(kemari == NULL) ) + { + dprintk(XENLOG_ERR, "kemari is off\n"); + goto out; + } + dprintk(XENLOG_DEBUG, "%s: kemari_bind_tap\n", __FUNCTION__); + ret = kemari_bind_tap(d, kemari_op); + if (ret < 0) + { + dprintk(XENLOG_ERR, + "couldn''t bind evtchn tap port=%u\n", port); + goto out; + } + + tap = &kemari->taps[port]; + + tap->status = KEMARI_TAP_ATTACHED; + + out: + dprintk(XENLOG_DEBUG, "%s: out\n", __FUNCTION__); + return ret; +} + +static long kemari_detach(struct domain *d, + struct xen_domctl_kemari_op *kemari_op) +{ + long ret; + uint32_t port = kemari_op->u.detach.port; + struct kemari *kemari = d->kemari; + struct kemari_tap *tap = &kemari->taps[port]; + + ret = -EINVAL; + if ( unlikely(kemari == NULL) ) + { + dprintk(XENLOG_ERR, "kemari is off\n"); + goto out; + } + + ret = -EINVAL; + if ( unlikely(tap->status != KEMARI_TAP_ATTACHED) ) + goto out; + + ret = kemari_unbind_tap(d, kemari_op); + if (ret < 0) + goto out; + + tap->status = KEMARI_TAP_DETACHED; + + out: + return ret; +} + +static void share_kemari_page_with_privileged_guests(struct kemari *kemari) +{ + int i; + struct kemari_ring *ring = kemari->ring; + + for ( i = 0; i < kemari->num_pages; i++ ) + share_xen_page_with_privileged_guests(virt_to_page(ring) + i, + XENSHARE_writable); +} + +static void unshare_kemari_page_with_privileged_guests(struct kemari *kemari) +{ + int i; + + for ( i = 0; i < kemari->num_pages; i++ ) + { + struct page_info *page = mfn_to_page(kemari->mfn + i); + BUG_ON(page_get_owner(page) != dom_xen); + if ( test_and_clear_bit(_PGC_allocated, &page->count_info) ) + put_page(page); + } +} + +static void kemari_free_ring(struct domain *d) +{ + int order; + struct vcpu *v = d->vcpu[0]; + struct 
kemari *kemari = d->kemari; + + if ( kemari->ring == NULL || + kemari->num_pages == 0 || + kemari->port == 0 ) + return; + + free_xen_event_channel(v, kemari->port); + + unshare_kemari_page_with_privileged_guests(kemari); + + order = get_order_from_pages(kemari->num_pages); + free_xenheap_pages(kemari->ring, order); + + kemari->mfn = 0; + kemari->ring = NULL; + kemari->num_pages = 0; + kemari->port = 0; +} + +static long kemari_alloc_ring(struct domain *d, struct kemari *kemari) +{ + long ret; + unsigned int order; + unsigned long num_pages; + domid_t current_domid = current->domain->domain_id; + struct vcpu *v = d->vcpu[0]; + struct kemari_ring *ring; + unsigned long dirty_bitmap_size; + uint32_t hvm_buf_size; + + ret = alloc_unbound_xen_event_channel(v, current_domid); + if ( ret < 0 ) + { + dprintk(XENLOG_ERR, "couldn''t alloc xen_event_channel\n"); + goto out; + } + kemari->port = ret; + + dirty_bitmap_size = (BITS_TO_LONGS(domain_get_maximum_gpfn(d) + 1) + * sizeof(unsigned long)); + + ret = -EINVAL; + if ( dirty_bitmap_size == 0 || !mfn_valid(d->arch.paging.log_dirty.top) ) + { + dprintk(XENLOG_ERR, "dirty_bitmap is EMPTY\n"); + goto out_evtchn; + } + + hvm_buf_size = hvm_save_size(d); + num_pages = (sizeof(struct kemari_ring) + + hvm_buf_size + + (dirty_bitmap_size >> 3) + + PAGE_SIZE - 1) / PAGE_SIZE; + order = get_order_from_pages(num_pages); + num_pages = (1UL << order); + + dprintk(XENLOG_DEBUG, "ring=%u, bitmap=%lu, ctxt=%u, PAGE=%ld\n", + sizeof(struct kemari_ring), dirty_bitmap_size / 8, + hvm_buf_size, PAGE_SIZE); + + ret = -ENOMEM; + ring = alloc_xenheap_pages(order, 0); + if ( ring == NULL ) + { + dprintk(XENLOG_ERR, "couldn''t alloc xenheap_pages\n"); + goto out_evtchn; + } + memset(ring, 0, PAGE_SIZE * num_pages); + + ring->num_ents + (PAGE_SIZE * num_pages - hvm_buf_size + (long)ring - (long)ring->data) + / sizeof(struct kemari_ent); + ring->hvm_ctxt.buf_size = hvm_buf_size; + ring->hvm_ctxt.buf_offset = PAGE_SIZE * num_pages - hvm_buf_size; + + 
kemari->num_pages = num_pages; + kemari->mfn = virt_to_mfn(ring); + kemari->ring = ring; + + share_kemari_page_with_privileged_guests(kemari); + + dprintk(XENLOG_DEBUG, "num_ents=%u, num_pages=%u\n", + ring->num_ents, kemari->num_pages); + + return 0; + + out_evtchn: + free_xen_event_channel(v, kemari->port); + out: + return ret; +} + +static long kemari_enable(struct domain *d, + struct xen_domctl_kemari_op *kemari_op) +{ + long ret; + struct kemari *kemari; + + ret = -EBUSY; + if ( unlikely(d->kemari != NULL) ) + { + dprintk(XENLOG_ERR, "kemari already enabled\n"); + goto out; + } + + ret = -ENOMEM; + kemari = xmalloc_bytes(sizeof(struct kemari)); + if ( kemari == NULL ) + { + dprintk(XENLOG_ERR, "couldn''t alloc kemari\n"); + goto out; + } + + memset(kemari, 0, sizeof(struct kemari) ); + + domain_pause_by_systemcontroller(d); + + ret = kemari_alloc_ring(d, kemari); + if ( ret < 0 ) + goto kemari_free; + + kemari_op->u.enable.port = kemari->port; + kemari_op->u.enable.mfn = kemari->mfn; + kemari_op->u.enable.num_pages = kemari->num_pages; + + dprintk(XENLOG_DEBUG, "port=%u, mfn=%llu\n", kemari->port, kemari->mfn); + + kemari->domain = d; + + d->kemari = kemari; + + kemari_send_domaininfo_ctxt(kemari->ring, d); + + domain_unpause_by_systemcontroller(d); + + dprintk(XENLOG_DEBUG, "kemari enabled\n"); + return 0; + + kemari_free: + xfree(kemari); + domain_unpause_by_systemcontroller(d); + out: + return ret; +} + +long kemari_off(struct domain *d) +{ + long ret; + uint32_t port; + struct kemari *kemari = d->kemari; + struct kemari_tap *tap; + struct evtchn_bind_tap kemari_unbind_tap; + + ret = -EINVAL; + if ( unlikely(kemari == NULL) ) + { + dprintk(XENLOG_ERR, "kemari already off\n"); + goto out; + } + + domain_pause_by_systemcontroller(d); + + kemari_unbind_tap.tap_dom = d->domain_id; + + for ( port = 0; port < NUM_KEMARI_TAPS; port++ ) { + tap = &kemari->taps[port]; + + if ( (tap->status != KEMARI_TAP_ATTACHED) || + (!port_is_valid(d, port)) ) + continue; + + 
kemari_unbind_tap.tap_port = port; + + if ( evtchn_unbind_tap(&kemari_unbind_tap) < 0 ) + dprintk(XENLOG_ERR, + "couldn''t unbind evtchn tap port=%u\n", port); + } + + if ( kemari->ring ) + kemari_free_ring(d); + + xfree(kemari); + + d->kemari = NULL; + + domain_unpause_by_systemcontroller(d); + + return 0; + + out: + return ret; +} + +long do_kemari_op(struct domain *d, struct xen_domctl_kemari_op *kemari_op) +{ + static DEFINE_SPINLOCK(lock); + long ret; + + /* We don''t support calling kemari by itself or dom0. */ + if ( d == current->domain || d == dom0 ) + { + dprintk(XENLOG_ERR, "can''t attach kemari by itself or to dom0"); + return -EINVAL; + } + + spin_lock(&lock); + + switch ( kemari_op->cmd ) + { + case XEN_KEMARI_OP_enable: + ret = kemari_enable(d, kemari_op); + break; + + case XEN_KEMARI_OP_off: + ret = kemari_off(d); + break; + + case XEN_KEMARI_OP_attach: + ret = kemari_attach(d, kemari_op); + break; + + case XEN_KEMARI_OP_detach: + ret = kemari_detach(d, kemari_op); + break; + + default: + ret = -EINVAL; + break; + } + + spin_unlock(&lock); + + return ret; +} + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Yoshiaki Tamura
2009-Mar-12 01:16 UTC
[Xen-devel] [RFC][PATCH 03/13] Kemari: change parameter type of xc_{set,get}_hvm_param
This is an updated version of the following patch. Modifies files which use xc_{set,get}_hvm_param. http://lists.xensource.com/archives/html/xen-devel/2009-03/msg00370.html Signed-off-by: Yoshisato Yanagisawa <yanagisawa.yoshisato@lab.ntt.co.jp> Signed-off-by: Yoshi Tamura <tamura.yoshiaki@lab.ntt.co.jp> --- tools/libxc/xc_cpuid_x86.c | 2 +- tools/libxc/xc_domain.c | 4 ++-- tools/libxc/xc_domain_save.c | 10 +++++----- tools/libxc/xc_resume.c | 2 +- tools/python/xen/lowlevel/xc/xc.c | 2 +- tools/xcutils/xc_save.c | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) diff -r b249f3e979a5 -r cf6a910e3663 tools/libxc/xc_domain.c --- a/tools/libxc/xc_domain.c Mon Mar 09 10:32:24 2009 +0000 +++ b/tools/libxc/xc_domain.c Wed Mar 11 18:03:47 2009 +0900 @@ -792,7 +792,7 @@ return do_domctl(xc_handle, &domctl); } -int xc_set_hvm_param(int handle, domid_t dom, int param, unsigned long value) +int xc_set_hvm_param(int handle, domid_t dom, int param, uint64_t value) { DECLARE_HYPERCALL; xen_hvm_param_t arg; @@ -811,7 +811,7 @@ return rc; } -int xc_get_hvm_param(int handle, domid_t dom, int param, unsigned long *value) +int xc_get_hvm_param(int handle, domid_t dom, int param, uint64_t *value) { DECLARE_HYPERCALL; xen_hvm_param_t arg; diff -r b249f3e979a5 -r cf6a910e3663 tools/libxc/xc_cpuid_x86.c --- a/tools/libxc/xc_cpuid_x86.c Mon Mar 09 10:32:24 2009 +0000 +++ b/tools/libxc/xc_cpuid_x86.c Wed Mar 11 18:03:47 2009 +0900 @@ -167,7 +167,7 @@ int xc, domid_t domid, const unsigned int *input, unsigned int *regs) { char brand[13]; - unsigned long pae; + uint64_t pae; int is_pae; xc_get_hvm_param(xc, domid, HVM_PARAM_PAE_ENABLED, &pae); diff -r b249f3e979a5 -r cf6a910e3663 tools/xcutils/xc_save.c --- a/tools/xcutils/xc_save.c Mon Mar 09 10:32:24 2009 +0000 +++ b/tools/xcutils/xc_save.c Wed Mar 11 18:03:47 2009 +0900 @@ -164,7 +164,7 @@ static int suspend(void) { - unsigned long sx_state = 0; + uint64_t sx_state = 0; /* Cannot notify guest to shut itself down if it''s in ACPI 
sleep state. */ if (si.flags & XCFLAGS_HVM) diff -r b249f3e979a5 -r cf6a910e3663 tools/libxc/xc_domain_save.c --- a/tools/libxc/xc_domain_save.c Mon Mar 09 10:32:24 2009 +0000 +++ b/tools/libxc/xc_domain_save.c Wed Mar 11 18:03:47 2009 +0900 @@ -1395,7 +1395,7 @@ chunk.id = -3; xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT, - (unsigned long *)&chunk.data); + &chunk.data); if ( (chunk.data != 0) && write_exact(io_fd, &chunk, sizeof(chunk)) ) @@ -1406,7 +1406,7 @@ chunk.id = -4; xc_get_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS, - (unsigned long *)&chunk.data); + &chunk.data); if ( (chunk.data != 0) && write_exact(io_fd, &chunk, sizeof(chunk)) ) @@ -1431,11 +1431,11 @@ /* Save magic-page locations. */ memset(magic_pfns, 0, sizeof(magic_pfns)); xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN, - (unsigned long *)&magic_pfns[0]); + &magic_pfns[0]); xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN, - (unsigned long *)&magic_pfns[1]); + &magic_pfns[1]); xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN, - (unsigned long *)&magic_pfns[2]); + &magic_pfns[2]); if ( write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) ) { PERROR("Error when writing to state file (7)"); diff -r b249f3e979a5 -r cf6a910e3663 tools/libxc/xc_resume.c --- a/tools/libxc/xc_resume.c Mon Mar 09 10:32:24 2009 +0000 +++ b/tools/libxc/xc_resume.c Wed Mar 11 18:03:47 2009 +0900 @@ -27,7 +27,7 @@ /* HVM guests without PV drivers do not have a return code to modify. 
*/ if ( info.hvm ) { - unsigned long irq = 0; + uint64_t irq = 0; xc_get_hvm_param(xc_handle, domid, HVM_PARAM_CALLBACK_IRQ, &irq); if ( !irq ) return 0; diff -r b249f3e979a5 -r cf6a910e3663 tools/python/xen/lowlevel/xc/xc.c --- a/tools/python/xen/lowlevel/xc/xc.c Mon Mar 09 10:32:24 2009 +0000 +++ b/tools/python/xen/lowlevel/xc/xc.c Wed Mar 11 18:03:47 2009 +0900 @@ -490,7 +490,7 @@ { uint32_t dom; int param; - unsigned long value; + uint64_t value; static char *kwd_list[] = { "domid", "param", NULL }; if ( !PyArg_ParseTupleAndKeywords(args, kwds, "ii", kwd_list, _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Yoshiaki Tamura
2009-Mar-12 01:17 UTC
[Xen-devel] [RFC][PATCH 04/13] Kemari: Kemari controller interface in libxc
This is an updated version of the following patch. No major changes. http://lists.xensource.com/archives/html/xen-devel/2009-03/msg00372.html Signed-off-by: Yoshisato Yanagisawa <yanagisawa.yoshisato@lab.ntt.co.jp> Signed-off-by: Yoshi Tamura <tamura.yoshiaki@lab.ntt.co.jp> --- tools/libxc/Makefile | 2 + tools/libxc/xc_dom_kemari.c | 79 ++++++++++++++++++++++++++++++++++++++++++ tools/libxc/xenctrl.h | 13 +++++- tools/libxc/xenguest.h | 45 +++++++++++++++++++++++ tools/libxc/xg_save_restore.h | 1 tools/xcutils/Makefile | 1 6 files changed, 139 insertions(+), 2 deletions(-) diff -r b249f3e979a5 -r cf6a910e3663 tools/xcutils/Makefile --- a/tools/xcutils/Makefile Mon Mar 09 10:32:24 2009 +0000 +++ b/tools/xcutils/Makefile Wed Mar 11 18:03:47 2009 +0900 @@ -15,6 +15,7 @@ CFLAGS += $(CFLAGS_libxenctrl) $(CFLAGS_libxenguest) $(CFLAGS_libxenstore) PROGRAMS = xc_restore xc_save readnotes lsevtchn +PROGRAMS += xc_kemari_restore xc_kemari_save LDLIBS = $(LDFLAGS_libxenctrl) $(LDFLAGS_libxenguest) $(LDFLAGS_libxenstore) diff -r b249f3e979a5 -r cf6a910e3663 tools/libxc/Makefile --- a/tools/libxc/Makefile Mon Mar 09 10:32:24 2009 +0000 +++ b/tools/libxc/Makefile Wed Mar 11 18:03:47 2009 +0900 @@ -31,6 +31,8 @@ GUEST_SRCS-y : GUEST_SRCS-y += xg_private.c GUEST_SRCS-$(CONFIG_MIGRATE) += xc_domain_restore.c xc_domain_save.c +GUEST_SRCS-$(CONFIG_MIGRATE) += xc_dom_kemari_restore.c xc_dom_kemari_save.c +GUEST_SRCS-$(CONFIG_MIGRATE) += xc_dom_kemari.c GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c vpath %.c ../../xen/common/libelf diff -r b249f3e979a5 -r cf6a910e3663 tools/libxc/xenctrl.h --- a/tools/libxc/xenctrl.h Mon Mar 09 10:32:24 2009 +0000 +++ b/tools/libxc/xenctrl.h Wed Mar 11 18:03:47 2009 +0900 @@ -1041,8 +1041,8 @@ */ xc_error_handler xc_set_error_handler(xc_error_handler handler); -int xc_set_hvm_param(int handle, domid_t dom, int param, unsigned long value); -int xc_get_hvm_param(int handle, domid_t dom, int param, unsigned long *value); +int xc_set_hvm_param(int handle, 
domid_t dom, int param, uint64_t value); +int xc_get_hvm_param(int handle, domid_t dom, int param, uint64_t *value); /* IA64 specific, nvram save */ int xc_ia64_save_to_nvram(int xc_handle, uint32_t dom); @@ -1242,4 +1242,13 @@ int xc_set_cpufreq_gov(int xc_handle, int cpuid, char *govname); int xc_set_cpufreq_para(int xc_handle, int cpuid, int ctrl_type, int ctrl_value); + +/* kemari control interface */ +int xc_kemari_control(int xc_handle, + uint32_t domid, + uint32_t cmd, + evtchn_port_t *port, + uint32_t *num_pages, + uint64_t *mfn, + uint16_t tap_mode); #endif /* XENCTRL_H */ diff -r b249f3e979a5 -r cf6a910e3663 tools/libxc/xenguest.h --- a/tools/libxc/xenguest.h Mon Mar 09 10:32:24 2009 +0000 +++ b/tools/libxc/xenguest.h Wed Mar 11 18:03:47 2009 +0900 @@ -43,6 +43,51 @@ * @return 0 on success, -1 on failure */ int xc_domain_restore(int xc_handle, int io_fd, uint32_t dom, + unsigned int store_evtchn, unsigned long *store_mfn, + unsigned int console_evtchn, unsigned long *console_mfn, + unsigned int hvm, unsigned int pae); + +/** + * This function will save a running domain for Kemari. + * + * @parm xc_handle a handle to an open hypervisor interface + * @parm fd the file descriptor to save a domain to + * @parm dom the id of the domain + * @return 0 on success, -1 on failure + */ +int xc_kemari_save(int xc_handle, int io_fd, uint32_t dom, + void *kemari_ring, uint32_t flags /* XCFLAGS_xxx */, + int hvm, + void *(*init_qemu_maps)(int, unsigned)); + +/** + * This function will update a domain for Kemari. 
+ * + * @parm xc_handle a handle to an open hypervisor interface + * @parm fd the file descriptor to save a domain to + * @parm dom the id of the domain + * @return 0 on success, -1 on failure + */ +int xc_kemari_update(int xc_handle, int io_fd, uint32_t dom, + void *kemari_ring, uint32_t flags, + void (*qemu_save_image)(int), + void (*qemu_end_flip)(void), + void (*qemu_end_save)(void), + void (*qemu_image_sent)(void)); + +/** + * This function will restore a saved domain for Kemari. + * + * @parm xc_handle a handle to an open hypervisor interface + * @parm fd the file descriptor to restore a domain from + * @parm dom the id of the domain + * @parm store_evtchn the store event channel for this domain to use + * @parm store_mfn returned with the mfn of the store page + * @parm hvm non-zero if this is a HVM restore + * @parm pae non-zero if this HVM domain has PAE support enabled + * @return 0 on success, -1 on failure + */ +int xc_kemari_restore(int xc_handle, int io_fd, uint32_t dom, unsigned int store_evtchn, unsigned long *store_mfn, unsigned int console_evtchn, unsigned long *console_mfn, unsigned int hvm, unsigned int pae); diff -r b249f3e979a5 -r cf6a910e3663 tools/libxc/xg_save_restore.h --- a/tools/libxc/xg_save_restore.h Mon Mar 09 10:32:24 2009 +0000 +++ b/tools/libxc/xg_save_restore.h Wed Mar 11 18:03:47 2009 +0900 @@ -8,6 +8,7 @@ #include <xen/foreign/x86_32.h> #include <xen/foreign/x86_64.h> +#include <xen/kemari.h> /* ** We process save/restore/migrate in batches of pages; the below diff -r b249f3e979a5 -r cf6a910e3663 tools/libxc/xc_dom_kemari.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/libxc/xc_dom_kemari.c Wed Mar 11 18:03:47 2009 +0900 @@ -0,0 +1,79 @@ +/* + * xc_dom_kemari.c + * + * The API for manipulating and obtaining information on kemari-domains. 
+ * + * Copyright (C) 2008 Nippon Telegraph and Telephone Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include "xc_private.h" + +/* + * Kemari controller interface. + */ +int xc_kemari_control(int xc_handle, + uint32_t domid, + uint32_t cmd, + evtchn_port_t *port, + uint32_t *num_pages, + uint64_t *mfn, + uint16_t tap_mode) +{ + int rc; + struct xen_domctl_kemari_op *kemari_op; + DECLARE_DOMCTL; + + domctl.cmd = XEN_DOMCTL_kemari_op; + domctl.domain = (domid_t)domid; + + kemari_op = &domctl.u.kemari_op; + kemari_op->cmd = cmd; + + if ( cmd == XEN_KEMARI_OP_attach ) + { + kemari_op->u.attach.port = *port; + kemari_op->u.attach.evtchn_tap_mode = tap_mode; + } + + if ( cmd /* == */ & XEN_KEMARI_OP_detach ) + kemari_op->u.detach.port = *port; + + DPRINTF("xc_kemari_control: cmd=%d\n", cmd); + + rc = do_domctl(xc_handle, &domctl); + + if ( cmd == XEN_KEMARI_OP_enable ) + { + *port = kemari_op->u.enable.port; + *mfn = kemari_op->u.enable.mfn; + *num_pages = kemari_op->u.enable.num_pages; + } + + return rc; +} + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ + _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com 
http://lists.xensource.com/xen-devel
Yoshiaki Tamura
2009-Mar-12 01:17 UTC
[Xen-devel] [RFC][PATCH 05/13] Kemari: Kemari sender
This is an updated version of the following patch. Followed the changes in live migration code. http://lists.xensource.com/archives/html/xen-devel/2009-03/msg00374.html Signed-off-by: Yoshisato Yanagisawa <yanagisawa.yoshisato@lab.ntt.co.jp> Signed-off-by: Yoshi Tamura <tamura.yoshiaki@lab.ntt.co.jp> --- tools/libxc/xc_dom_kemari_save.c | 1139 +++++++++++++++++++++++++++++++++++++++ tools/xcutils/xc_kemari_save.c | 518 +++++++++++++++++ 2 files changed, 1657 insertions(+) diff -r b249f3e979a5 -r cf6a910e3663 tools/xcutils/xc_kemari_save.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/xcutils/xc_kemari_save.c Wed Mar 11 18:03:47 2009 +0900 @@ -0,0 +1,518 @@ +/* + * xc_kemari_save.c + * + * Save the state of a running Linux session. + * + * Copyright (c) 2008 Nippon Telegraph and Telephone Corporation. + * + * This file is subject to the terms and conditions of the GNU General + * Public License. See the file "COPYING" in the main directory of + * this archive for more details. + * + * This source code is based on xc_save.c. + * Copied qemu_destroy_buffer and init_qemu_maps from xc_save.c. + * + * Copyright (C) 2005 by Christian Limpach + * + */ + + +#include <err.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <stdio.h> +#include <sys/ipc.h> +#include <sys/shm.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <signal.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <netinet/tcp.h> + +#include <xs.h> +#include <xenctrl.h> +#include <xenguest.h> +#include <xc_private.h> +#include <xen/kemari.h> + +static volatile sig_atomic_t run = 1; +static int xc_handle, xce_handle, io_fd; +static struct kemari_ring *ring = NULL; +static uint32_t kemari_ring_size = 0; +static pid_t qemu_pid; +static int is_finalized = 0; +static int domid; + +/* For HVM guests, there are two sources of dirty pages: the Xen shadow + * log-dirty bitmap, which we get with a hypercall, and qemu''s version. 
+ * The protocol for getting page-dirtying data from qemu uses a + * double-buffered shared memory interface directly between xc_save and + * qemu-dm. + * + * xc_save calculates the size of the bitmaps and notifies qemu-dm + * through the store that it wants to share the bitmaps. qemu-dm then + * starts filling in the ''active'' buffer. + * + * To change the buffers over, xc_save writes the other buffer number to + * the store and waits for qemu to acknowledge that it is now writing to + * the new active buffer. xc_save can then process and clear the old + * active buffer. */ + +static char *qemu_active_path; +static char *qemu_next_active_path; +static int qemu_shmid = -1; +static struct xs_handle *xs; + + +/* Mark the shared-memory segment for destruction */ +static void qemu_destroy_buffer(void) +{ + if (qemu_shmid != -1) + shmctl(qemu_shmid, IPC_RMID, NULL); + qemu_shmid = -1; +} + +static char *kemari_qemu_info = NULL; +static void qemu_save_image(int next_active) +{ + kemari_qemu_info[0] = next_active; + kemari_qemu_info[1] = 0; + xen_wmb(); + kill(qemu_pid, SIGUSR1); +} + +static void qemu_end_flip(void) +{ + while (kemari_qemu_info[1] == 0) + xen_rmb(); +} + +static void qemu_end_save(void) +{ + while (kemari_qemu_info[2] == 0) + xen_rmb(); +} + +static void qemu_image_sent(void) +{ + /* after QEMU image sent */ + kemari_qemu_info[2] = 0; + xen_wmb(); +} + +static void *init_qemu_maps(int domid, unsigned int bitmap_size) +{ + key_t key; + char key_ascii[17] = {0,}; + void *seg; + char *path, *p; + + /* Make a shared-memory segment */ + do { + key = rand(); /* No security, just a sequence of numbers */ + qemu_shmid = shmget(key, 2 * bitmap_size + PAGE_SIZE, + IPC_CREAT|IPC_EXCL|S_IRUSR|S_IWUSR); + if (qemu_shmid == -1 && errno != EEXIST) + errx(1, "can''t get shmem to talk to qemu-dm"); + } while (qemu_shmid == -1); + + /* Remember to tidy up after ourselves */ + atexit(qemu_destroy_buffer); + + /* Map it into our address space */ + seg = shmat(qemu_shmid, 
NULL, 0); + if (seg == (void *) -1) + errx(1, "can''t map shmem to talk to qemu-dm"); + memset(seg, 0, 2 * bitmap_size + PAGE_SIZE); + + /* Write the size of it into the first 32 bits */ + *(uint32_t *)seg = bitmap_size; + + /* Tell qemu about it */ + if ((xs = xs_daemon_open()) == NULL) + errx(1, "Couldn''t contact xenstore"); + if (!(path = strdup("/local/domain/0/device-model/"))) + errx(1, "can''t get domain path in store"); + if (!(path = realloc(path, strlen(path) + + 10 + + strlen("/logdirty/next-active") + 1))) + errx(1, "no memory for constructing xenstore path"); + snprintf(path + strlen(path), 11, "%i", domid); + strcat(path, "/logdirty/"); + p = path + strlen(path); + + strcpy(p, "key"); + snprintf(key_ascii, 17, "%16.16llx", (unsigned long long) key); + if (!xs_write(xs, XBT_NULL, path, key_ascii, 16)) + errx(1, "can''t write key (%s) to store path (%s)\n", key_ascii, path); + + /* Watch for qemu''s indication of the active buffer, and request it + * to start writing to buffer 0 */ + strcpy(p, "active"); + if (!xs_watch(xs, path, "qemu-active-buffer")) + errx(1, "can''t set watch in store (%s)\n", path); + if (!(qemu_active_path = strdup(path))) + errx(1, "no memory for copying xenstore path"); + + strcpy(p, "next-active"); + if (!(qemu_next_active_path = strdup(path))) + errx(1, "no memory for copying xenstore path"); + + kemari_qemu_info = seg + 2 * bitmap_size; + xen_wmb(); + qemu_save_image(0); + + free(path); + return seg; +} + +static void close_handler(int sig_type) +{ + run = 0; +} + +static int handle_event(int domid, unsigned int flags) +{ + int ret = 1, rcv_port; + + if ((rcv_port = xc_evtchn_pending(xce_handle)) < 0) { + ERROR("Failed to read from event fd"); + goto out; + } + + if (xc_kemari_update(xc_handle, io_fd, domid, ring, flags, + qemu_save_image, qemu_end_flip, qemu_end_save, qemu_image_sent) != 0) { + xc_domain_pause(xc_handle, domid); + kill(qemu_pid, SIGSTOP); + ERROR("xc_kemari_update failed"); + goto out; + } + + if 
(xc_evtchn_unmask(xce_handle, rcv_port) < 0) { + ERROR("Failed to write to event fd"); + goto out; + } + + ret = 0; +out: + return ret; +} + +static void set_signal_handler(void (*handler)(int)) +{ + struct sigaction act; + + act.sa_handler = handler; + sigemptyset(&act.sa_mask); + act.sa_flags = 0; + sigaction(SIGQUIT, &act, 0); + sigaction(SIGINT, &act, 0); + sigaction(SIGHUP, &act, 0); + sigaction(SIGTERM, &act, 0); +} + +static int attach_ports(int domid) +{ + struct xs_handle *xs_handle; + char **list, *data; + unsigned int list_size, data_size; + char path[128]; + uint32_t port; + int i, ret = 1; + + if ((xs_handle = xs_daemon_open()) == NULL) + errx(1, "Couldn''t contact xenstore"); + + /* + * attach block port. + */ + snprintf(path, sizeof(path), "/local/domain/%d/device/vbd", domid); + list = xs_directory(xs_handle, XBT_NULL, path, &list_size); + if (list == NULL) + errx(1, "xs_directory (%s) failed", path); + + for (i = 0; i < list_size; i++) { + snprintf(path, sizeof(path), + "/local/domain/%d/device/vbd/%s/event-channel", domid, list[i]); + data = xs_read(xs_handle, XBT_NULL, path, &data_size); + if (data == NULL) + continue; + port = strtoul(data, NULL, 10); + if ((ret = xc_kemari_control(xc_handle, domid, XEN_KEMARI_OP_attach, + &port, NULL, + NULL, KEMARI_TAP_OUT)) != 0) { + ERROR("Error when attaching blk_port (%d) on kemari", port); + goto out; + } + free(data); + DPRINTF("blk_port %d attached\n", port); + } + free(list); + + /* + * attach net port. 
+ */ + snprintf(path, sizeof(path), "/local/domain/%d/device/vif", domid); + list = xs_directory(xs_handle, XBT_NULL, path, &list_size); + if (list == NULL) + errx(1, "xs_directory (%s) failed", path); + + for (i = 0; i < list_size; i++) { + snprintf(path, sizeof(path), + "/local/domain/%d/device/vif/%s/event-channel", domid, list[i]); + data = xs_read(xs_handle, XBT_NULL, path, &data_size); + if (data == NULL) + continue; + port = strtoul(data, NULL, 10); + if ((ret = xc_kemari_control(xc_handle, domid, XEN_KEMARI_OP_attach, + &port, NULL, + NULL, KEMARI_TAP_OUT)) != 0) { + ERROR("Error when attaching net_port (%d) on kemari", port); + goto out; + } + free(data); + DPRINTF("net_port %d attached\n", port); + } + free(list); + + /* attach success */ + ret = 0; + +out: + xs_daemon_close(xs_handle); + + return ret; +} + +static pid_t get_qemu_pid(int domid) +{ + struct xs_handle *xs_handle; + char path[128]; + char *data; + unsigned int data_size; + pid_t pid = 0; + + if ((xs_handle = xs_daemon_open()) == NULL) + errx(1, "Couldn''t contact xenstore"); + + snprintf(path, sizeof(path), + "/local/domain/%d/image/device-model-pid", domid); + data = xs_read(xs_handle, XBT_NULL, path, &data_size); + if (data == NULL) { + ERROR("Could not find QEMU pid for domid %d", domid); + goto out; + } + pid = strtoul(data, NULL, 10); + free(data); + +out: + xs_daemon_close(xs_handle); + + return pid; +} + +static void finalize(void) +{ + int ret; + + if (is_finalized) + return; + + set_signal_handler(SIG_IGN); + if (ring != NULL) + munmap(ring, kemari_ring_size * PAGE_SIZE); + + if ((ret = xc_kemari_control(xc_handle, domid, XEN_KEMARI_OP_off, + NULL, NULL, NULL, 0)) != 0) { + ERROR("Error when turning off kemari"); + } else { + DPRINTF("successufully execute KEMARI_OP_off\n"); + } + + if ( xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_OFF, + NULL, 0, NULL, 0, NULL) < 0 ) { + ERROR("Warning - couldn''t disable shadow mode"); + } + + if (!run) + xc_domain_destroy(xc_handle, 
domid); + + xc_interface_close(xc_handle); + + is_finalized = 1; +} + +int +main(int argc, char **argv) +{ + unsigned int maxit, max_f, flags; + int ret; + int evtchn_fd; + uint32_t port, kemari_port; + uint64_t kemari_mfn; + fd_set inset; + + if (argc != 6) + errx(1, "usage: %s iofd domid maxit maxf flags", argv[0]); + + xc_handle = xc_interface_open(); + if (xc_handle < 0) + errx(1, "failed to open control interface"); + + io_fd = atoi(argv[1]); + domid = atoi(argv[2]); + maxit = atoi(argv[3]); + max_f = atoi(argv[4]); + flags = atoi(argv[5]); + + set_signal_handler(close_handler); + if ((qemu_pid = get_qemu_pid(domid)) == 0) + errx(1, "failed to get qemu pid"); + atexit(finalize); + + if (io_fd == -1) /* means test mode */ + { + io_fd = open("/dev/null", O_RDWR); + flags |= XCFLAGS_DEBUG; + } + else + { + int one = 1; + if (setsockopt(io_fd, IPPROTO_TCP, TCP_NODELAY, + &one, sizeof(one)) < 0) { + ERROR("failed to set TCP_NODELAY"); + } + } + + if ((xce_handle = xc_evtchn_open()) < 0) { + errx(1, "failed to open control interface"); + } + + evtchn_fd = xc_evtchn_fd(xce_handle); + + if ( xc_shadow_control(xc_handle, domid, + XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY, + NULL, 0, NULL, 0, NULL) < 0 ) + { + int frc; + /* log-dirty already enabled? 
There''s no test op, + so attempt to disable then reenable it */ + frc = xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_OFF, + NULL, 0, NULL, 0, NULL); + if ( frc >= 0 ) + { + frc = xc_shadow_control(xc_handle, domid, + XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY, + NULL, 0, NULL, 0, NULL); + } + + if ( frc < 0 ) + { + err(errno, "Couldn''t enable shadow mode (rc %d)", frc); + } + } + + if ((ret = xc_kemari_control(xc_handle, domid, XEN_KEMARI_OP_enable, + &kemari_port, &kemari_ring_size, + &kemari_mfn, 0) != 0)) { + errx(1, "Error when turning on kemari"); + } + + DPRINTF("kemari_port=%u, kemari_mfn=%llu, kemari_ring_size=%u\n", + kemari_port, kemari_mfn, kemari_ring_size); + + if (attach_ports(domid) != 0) { + ERROR("attaching port failed "); + goto out; + } + + if ((port = xc_evtchn_bind_interdomain(xce_handle, domid, + kemari_port)) < 0) { + ERROR("xc_evtchn_bind_interdomain failed "); + goto out; + } + + if ((ring = xc_map_foreign_range(xc_handle, DOMID_XEN, + kemari_ring_size * PAGE_SIZE, PROT_READ | PROT_WRITE, + kemari_mfn)) == 0) { + ERROR("xc_map_foreign_range failed"); + goto out; + } + + if (xc_domain_pause(xc_handle, domid) < 0) { + ERROR("Domain appears not to have paused"); + goto out; + } + + ret = xc_kemari_save(xc_handle, io_fd, domid, ring, flags, + !!(flags & XCFLAGS_HVM), + &init_qemu_maps); + if (ret != 0) { + ERROR("xc_kemari_save failed"); + goto out; + } + + FD_ZERO(&inset); + FD_SET(evtchn_fd, &inset); + + if (xc_domain_unpause(xc_handle, domid) < 0) { + ERROR("Domain appears not to have unpaused"); + goto out; + } + + DPRINTF("running start"); + + while (run) { + + if (select(evtchn_fd + 1, &inset, NULL, NULL, NULL) < 0) { + if (errno == EINTR) + continue; + ERROR("Error when waiting events by select()"); + break; + } + + if (evtchn_fd != -1 && FD_ISSET(evtchn_fd, &inset)) { + + if ((ret = handle_event(domid, flags)) != 0) { + ERROR("Error when handling events"); + break; + } + + /* usleep(10000); */ + + if 
(xc_evtchn_notify(xce_handle, port) < 0) { + ERROR("xc_evtchn_notify failed"); + /* goto out; */ + break; + } + + if(xc_domain_unpause(xc_handle, domid) < 0) { + ERROR("xc_domain_unpause"); + /* goto out; */ + break; + } + + } + } + + out: + close(io_fd); + finalize(); + + return ret; +} + + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ + diff -r b249f3e979a5 -r cf6a910e3663 tools/libxc/xc_dom_kemari_save.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/libxc/xc_dom_kemari_save.c Wed Mar 11 18:03:47 2009 +0900 @@ -0,0 +1,1139 @@ +/****************************************************************************** + * xc_dom_kemari_save.c + * + * Save the state of a running Linux session. + * + * Copyright (c) 2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * This source code is based on xc_domain_save.c. + * Copied BITS_PER_LONG, BITS_TO_LONGS, BITMAP_SIZE, BITMAP_SHIFT, + * RATE_IS_MAX, test_bit, clear_bit, set_bit, tv_delta, noncached_write, + * initialize_mbit_rate, and ratewrite from xc_domain_save.c + * + * Copyright (c) 2003, K A Fraser. 
+ */ + +#include <inttypes.h> +#include <time.h> +#include <signal.h> +#include <stdlib.h> +#include <unistd.h> +#include <limits.h> +#include <sys/types.h> +#include <sys/time.h> + +#include "xc_private.h" +#include "xc_dom.h" +#include "xg_private.h" +#include "xg_save_restore.h" + +#include <xen/hvm/params.h> +#include "xc_e820.h" + +#ifdef __MINIOS__ +/* + * Caution: atomicity of following alternative libc functions are broken. + */ +static ssize_t sendfile(int out_fd, int in_fd, off_t *offset, size_t count) +{ + char buf[1024]; + int len, wrote_len = 0; + + if (offset != NULL) { + ERROR("Sorry sendfile for stubdomain should not have offset"); + errno = EIO; + return -1; + } + + while (count > 0) { + len = (count < sizeof(buf))?count:sizeof(buf); + len = read(in_fd, buf, len); + if (len < 0) + return -1; + if (write_exact(out_fd, buf, len)) + return -1; + wrote_len += len; + count -= len; + } + return wrote_len; +} + +#define IOV_MAX 1024 +struct iovec { + void *iov_base; /* Base address. */ + size_t iov_len; /* Length. */ +}; +static ssize_t writev(int d, const struct iovec *iov, int iovcnt) +{ + int i; + int len, wrote_len; + + if (iovcnt < 0 || iovcnt > IOV_MAX) { + errno = EINVAL; + return -1; + } + + for (i = 0, wrote_len = 0; i < iovcnt; i++) { + len = write(d, iov[i].iov_base, iov[i].iov_len); + if (len < 0) + return -1; + + wrote_len += len; + if (wrote_len < 0) { /* integer overflow */ + errno = EINVAL; + return -1; + } + + if (len != iov[i].iov_len) + return wrote_len; + } + + return wrote_len; +} +#else /* !__MINIOS__ */ +#include <sys/sendfile.h> +#include <sys/uio.h> +#endif /* __MINIOS__ */ + +/* HVM: shared-memory bitmaps for getting log-dirty bits from qemu-dm */ +static unsigned long *qemu_bitmaps[2]; +static int qemu_active; +static int qemu_non_active; + +/* number of pfns this guest has (i.e. 
number of entries in the P2M) */ +static unsigned long p2m_size; + +/* page frame numbers */ +static unsigned long *pfn_type = NULL; + +/* The new domain''s shared-info frame number. */ +static unsigned long shared_info_frame; + +/* + * guest memory + */ +#define GUEST_MEM_ENTRY_SIZE 1024 /* up to 4MB at a time. */ +static unsigned char ** guest_memory = NULL; +static unsigned long ** guest_memory_status = NULL; +static unsigned long guest_memory_size = 0; + +static inline int map_guest_mem(int xc_handle, uint32_t domid, + unsigned long base) +{ + int j; + unsigned char * region_base; + unsigned long * pfn_base; + + pfn_base = guest_memory_status[base]; + + memset(pfn_base, 0, GUEST_MEM_ENTRY_SIZE); + for (j = 0; j < GUEST_MEM_ENTRY_SIZE; j++) { + pfn_base[j] = base * GUEST_MEM_ENTRY_SIZE + j; + } + region_base = xc_map_foreign_batch( + xc_handle, domid, PROT_READ, pfn_base, GUEST_MEM_ENTRY_SIZE); + if ( region_base == NULL ) + { + PERROR("map failed at guest memory frame 0x%lx - 0x%lx (%lu)", + base * GUEST_MEM_ENTRY_SIZE, (base + 1)* GUEST_MEM_ENTRY_SIZE - 1, + base); + return -1; + } + + /* Look for and skip completely empty batches. 
*/ + for ( j = 0; j < GUEST_MEM_ENTRY_SIZE; j++ ) + pfn_base[j] &= XEN_DOMCTL_PFINFO_LTAB_MASK; + for ( j = 0; j < GUEST_MEM_ENTRY_SIZE; j++ ) + if ( pfn_base[j] != XEN_DOMCTL_PFINFO_XTAB ) + break; + if ( j == GUEST_MEM_ENTRY_SIZE ) + { + munmap(region_base, GUEST_MEM_ENTRY_SIZE*PAGE_SIZE); + guest_memory[base] = NULL; + return 1; + } + + guest_memory[base] = region_base; + + return 0; +} + +static inline unsigned char * search_guest_mem(int xc_handle, uint32_t domid, + unsigned long mfn) +{ + unsigned long base = mfn / GUEST_MEM_ENTRY_SIZE; + unsigned long offset = mfn % GUEST_MEM_ENTRY_SIZE; + + if (base >= guest_memory_size) { + ERROR("Error base(%lu) is greater than guest_memory_size(%lu)\n", + base, guest_memory_size); + return NULL; + } + + if ( guest_memory_status[base][offset] == XEN_DOMCTL_PFINFO_XTAB ) { + /* reload XTAB place */ + munmap(guest_memory[base], GUEST_MEM_ENTRY_SIZE*PAGE_SIZE); + guest_memory[base] = NULL; + DPRINTF("guest_memory[%lu] (frame 0x%lx - 0x%lx) will be remapped\n", + base, base * GUEST_MEM_ENTRY_SIZE, + (base + 1) * GUEST_MEM_ENTRY_SIZE - 1); + } + + if (guest_memory[base] == NULL) + if (map_guest_mem(xc_handle, domid, offset)) + return NULL; + + return guest_memory[base] + offset * PAGE_SIZE; + /* Since I don''t care of XEN_DOMCTL_PFINFO_LTAB_MASK, + this program may cause some accidents. 
*/ +} + +static inline int init_guest_mem(int xc_handle, uint32_t dom) +{ + int i; + + guest_memory_size = p2m_size / GUEST_MEM_ENTRY_SIZE + 1; + DPRINTF("guest_memory_size: %lu\n", guest_memory_size); + + /* mapped memory */ + guest_memory = xg_memalign(PAGE_SIZE, + guest_memory_size * sizeof(guest_memory[0])); + if (guest_memory == NULL) + { + PERROR("failed to allocate guest_memory"); + return -1; + } + if ( lock_pages(guest_memory, guest_memory_size * sizeof(guest_memory[0]))) + { + ERROR("Unable to lock guest_memory array"); + return -1; + } + + /* memory status */ + guest_memory_status = xg_memalign(PAGE_SIZE, + guest_memory_size * sizeof(guest_memory_status[0])); + if ( guest_memory_status == NULL ) + { + ERROR("failed to alloc memory for guest_memory_status"); + errno = ENOMEM; + return -1; + } + if ( lock_pages(guest_memory_status, + guest_memory_size * sizeof(guest_memory_status[0]))) + { + ERROR("Unable to lock guest_memory_status array"); + return -1; + } + + for (i = 0; i < guest_memory_size; i++) { + guest_memory_status[i] = xg_memalign(PAGE_SIZE, + GUEST_MEM_ENTRY_SIZE * sizeof(guest_memory_status[0][0])); + if (guest_memory_status[i] == NULL) { + ERROR("failed to alloc memory for guest_memory_status[%d]", i); + errno = ENOMEM; + return -1; + } + if ( lock_pages(guest_memory_status, + guest_memory_size * sizeof(guest_memory_status[0][0]))) + { + ERROR("Unable to lock guest_memory_status[%d]", i); + return -1; + } + } + + for (i = 0; i < guest_memory_size; i++) + if (map_guest_mem(xc_handle, dom, i) < 0) + return -1; + + return 0; +} + +static int writev_exact(int fd, const struct iovec *iov, size_t count) +{ + int i; + size_t sum; + for (i = 0, sum = 0; i < count; i++) + sum += iov[i].iov_len; + + if (writev(fd, iov, count) != sum) + return -1; + else + return 0; +} + +/* grep fodder: machine_to_phys */ + + +/* +** During (live) save/migrate, we maintain a number of bitmaps to track +** which pages we have to send, to fixup, and to skip. 
+*/ + +#define BITS_PER_LONG (sizeof(unsigned long) * 8) +#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG) +#define BITMAP_SIZE (BITS_TO_LONGS(p2m_size) * sizeof(unsigned long)) + +#define BITMAP_ENTRY(_nr,_bmap) \ + ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG] + +#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG) + +static inline int test_bit (int nr, volatile void * addr) +{ + return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1; +} + +static inline void clear_bit (int nr, volatile void * addr) +{ + BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr)); +} + +static inline void set_bit ( int nr, volatile void * addr) +{ + BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr)); +} + +static uint64_t tv_delta(struct timeval *new, struct timeval *old) +{ + return (((new->tv_sec - old->tv_sec)*1000000) + + (new->tv_usec - old->tv_usec)); +} + +static int noncached_write(int fd, void *buffer, int len) +{ + static int write_count = 0; + int rc = (write_exact(fd, buffer, len) == 0) ? len : -1; + + write_count += len; + if ( write_count >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) ) + { + /* Time to discard cache - dont care if this fails */ + discard_file_cache(fd, 0 /* no flush */); + write_count = 0; + } + + return rc; +} + +#ifdef ADAPTIVE_SAVE + +/* +** We control the rate at which we transmit (or save) to minimize impact +** on running domains (including the target if we''re doing live migrate). +*/ + +#define MAX_MBIT_RATE 500 /* maximum transmit rate for migrate */ +#define START_MBIT_RATE 100 /* initial transmit rate for migrate */ + +/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */ +#define RATE_TO_BTU 781250 + +/* Amount in bytes we allow ourselves to send in a burst */ +#define BURST_BUDGET (100*1024) + +/* We keep track of the current and previous transmission rate */ +static int mbit_rate, ombit_rate = 0; + +/* Have we reached the maximum transmission rate? 
*/ +#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE) + +static inline void initialize_mbit_rate() +{ + mbit_rate = START_MBIT_RATE; +} + +static int ratewrite(int io_fd, void *buf, int n) +{ + static int budget = 0; + static int burst_time_us = -1; + static struct timeval last_put = { 0 }; + struct timeval now; + struct timespec delay; + long long delta; + + if ( START_MBIT_RATE == 0 ) + return noncached_write(io_fd, buf, n); + + budget -= n; + if ( budget < 0 ) + { + if ( mbit_rate != ombit_rate ) + { + burst_time_us = RATE_TO_BTU / mbit_rate; + ombit_rate = mbit_rate; + DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n", + mbit_rate, BURST_BUDGET, burst_time_us); + } + if ( last_put.tv_sec == 0 ) + { + budget += BURST_BUDGET; + gettimeofday(&last_put, NULL); + } + else + { + while ( budget < 0 ) + { + gettimeofday(&now, NULL); + delta = tv_delta(&now, &last_put); + while ( delta > burst_time_us ) + { + budget += BURST_BUDGET; + last_put.tv_usec += burst_time_us; + if ( last_put.tv_usec > 1000000 ) + { + last_put.tv_usec -= 1000000; + last_put.tv_sec++; + } + delta -= burst_time_us; + } + if ( budget > 0 ) + break; + delay.tv_sec = 0; + delay.tv_nsec = 1000 * (burst_time_us - delta); + while ( delay.tv_nsec > 0 ) + if ( nanosleep(&delay, &delay) == 0 ) + break; + } + } + } + return noncached_write(io_fd, buf, n); +} + +#else /* ! 
ADAPTIVE SAVE */ + +#define RATE_IS_MAX() (0) +#define ratewrite(_io_fd, _buf, _n) noncached_write((_io_fd), (_buf), (_n)) +#define initialize_mbit_rate() + +#endif + +static int print_stats(int xc_handle, uint32_t domid, int pages_sent, + xc_shadow_op_stats_t *stats, int print) +{ + static struct timeval wall_last; + static long long d0_cpu_last; + static long long d1_cpu_last; + + struct timeval wall_now; + long long wall_delta; + long long d0_cpu_now, d0_cpu_delta; + long long d1_cpu_now, d1_cpu_delta; + + gettimeofday(&wall_now, NULL); + + d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000; + d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000; + + if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) ) + DPRINTF("ARRHHH!!\n"); + + wall_delta = tv_delta(&wall_now,&wall_last)/1000; + if ( wall_delta == 0 ) + wall_delta = 1; + + d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000; + d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000; + + if ( print ) + DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, " + "dirtied %dMb/s %" PRId32 " pages\n", + wall_delta, + (int)((d0_cpu_delta*100)/wall_delta), + (int)((d1_cpu_delta*100)/wall_delta), + (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))), + (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))), + stats->dirty_count); + +#ifdef ADAPTIVE_SAVE + if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate ) + { + mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) + + 50; + if ( mbit_rate > MAX_MBIT_RATE ) + mbit_rate = MAX_MBIT_RATE; + } +#endif + + d0_cpu_last = d0_cpu_now; + d1_cpu_last = d1_cpu_now; + wall_last = wall_now; + + return 0; +} + +static int send_qemu_image(int xc_handle, int io_fd, uint32_t dom) +{ + char path[128]; + struct stat st; + struct { + int minusfour; + uint32_t image_size; + } chunk = { -1, 0 }; + int qemu_fd; + int rc = -1; + + snprintf(path, sizeof(path), "/dev/shm/qemu-save.%d", dom); + if ((qemu_fd = open(path, 
O_RDONLY)) == -1) + { + PERROR("Error when opening qemu image %s", path); + goto out; + } + + if (fstat(qemu_fd, &st) == -1) + { + PERROR("Error fstat qemu file %s", path); + goto out; + } + chunk.image_size = st.st_size; + + if ( write_exact(io_fd, &chunk, sizeof(chunk)) ) + { + PERROR("Error when writing header for qemu image"); + goto out; + } + + if ( sendfile(io_fd, qemu_fd, NULL, chunk.image_size) != + chunk.image_size) + { + PERROR("Error when writing qemu image"); + goto out; + } + close(qemu_fd); + + rc = 0; +out: + return rc; +} + +static int send_hvm_params(int xc_handle, int io_fd, uint32_t dom) +{ + struct { + int id; + uint32_t pad; + uint64_t data; + } chunk = { 0, 0 }; + + chunk.id = -3; + xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT, + &chunk.data); + + if ( (chunk.data != 0) && + write_exact(io_fd, &chunk, sizeof(chunk)) ) + { + PERROR("Error when writing the ident_pt for EPT guest"); + return -1; + } + + chunk.id = -4; + xc_get_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS, + &chunk.data); + + if ( (chunk.data != 0) && + write_exact(io_fd, &chunk, sizeof(chunk)) ) + { + PERROR("Error when writing the vm86 TSS for guest"); + return -1; + } + + return 0; +} + +static int send_hvm_context(int xc_handle, int io_fd, + struct kemari_ring *ring, uint32_t dom) +{ + uint32_t buf_size = ring->hvm_ctxt.buf_size; + uint32_t rec_size = ring->hvm_ctxt.rec_size; + uint8_t *hvm_buf = (uint8_t *)ring + ring->hvm_ctxt.buf_offset; + int rc = -1; + + /* Get HVM context from Xen and save it too */ + if ( (rec_size = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf, + buf_size)) == -1 ) + { + ERROR("HVM:Could not get hvm buffer"); + goto out; + } + + if ( write_exact(io_fd, &rec_size, sizeof(uint32_t)) ) + { + PERROR("error write hvm buffer size"); + goto out; + } + + if ( write_exact(io_fd, hvm_buf, rec_size) ) + { + PERROR("write HVM info failed!\n"); + goto out; + } + rc = 0; + +out: + return rc; +} + +int xc_kemari_save(int xc_handle, int io_fd, uint32_t dom, 
+ void *kemari_ring, uint32_t flags, + int hvm, void *(*init_qemu_maps)(int, unsigned)) +{ + int rc = 1, i, j, iter = 0; + int debug = (flags & XCFLAGS_DEBUG); + int sent_last_iter, skip_this_iter; + xc_dominfo_t info; + struct kemari_ring *ring = (struct kemari_ring *)kemari_ring; + + /* base of the region in which domain memory is mapped */ + unsigned char *region_base = NULL; + + /* bitmap of pages: + - that should be sent this iteration (unless later marked as skip); + - to skip this iteration because already dirty; + - to fixup by sending at the end if not already resent; */ + unsigned long *to_send = NULL, *to_fix = NULL; + + xc_shadow_op_stats_t stats; + + unsigned long needed_to_fix = 0; + unsigned long total_sent = 0; + + /* HVM: magic frames for ioreqs and xenstore comms. */ + uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */ + + /* callback irq */ + uint64_t callback_irq = 0; + + if ( !hvm ) + { + ERROR("HVM domain is required for the kemari migration."); + return 1; + } + + initialize_mbit_rate(); + + if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 ) + { + ERROR("Could not get domain info"); + return 1; + } + + shared_info_frame = info.shared_info_frame; + DPRINTF("xc_kemari_save: shared_info_frame: %lu\n", shared_info_frame); + + /* Get the size of the P2M table */ + p2m_size = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &dom) + 1; + DPRINTF("xc_kemari_save: p2m_size: %lu\n", p2m_size); + + /* Domain is still running at this point */ + { + /* Get qemu-dm logging dirty pages too */ + void *seg = init_qemu_maps(dom, BITMAP_SIZE); + qemu_bitmaps[0] = seg; + qemu_bitmaps[1] = seg + BITMAP_SIZE; + qemu_active = 0; + qemu_non_active = 1; + } + + /* pretend we sent all the pages last iteration */ + sent_last_iter = p2m_size; + + /* Setup to_send / to_fix bitmaps */ + to_send = xg_memalign(PAGE_SIZE, ROUNDUP(BITMAP_SIZE, PAGE_SHIFT)); + to_fix = calloc(1, BITMAP_SIZE); + + if ( !to_send || !to_fix ) + { + ERROR("Couldn''t allocate to_send 
array"); + goto out; + } + + memset(to_send, 0xff, BITMAP_SIZE); + + if ( lock_pages(to_send, BITMAP_SIZE) ) + { + ERROR("Unable to lock to_send"); + return 1; + } + + pfn_type = xg_memalign(PAGE_SIZE, ROUNDUP( + MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT)); + if ( pfn_type == NULL ) + { + ERROR("failed to alloc memory for pfn_type arrays"); + errno = ENOMEM; + goto out; + } + memset(pfn_type, 0, + ROUNDUP(MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT)); + + if ( lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type)) ) + { + ERROR("Unable to lock pfn_type array"); + goto out; + } + + /* Start writing out the saved-domain record. */ + if ( write_exact(io_fd, &p2m_size, sizeof(unsigned long)) ) + { + PERROR("write: p2m_size"); + goto out; + } + + /* send shared_info_frame */ + if ( write_exact(io_fd, &shared_info_frame, sizeof(unsigned long)) ) + { + PERROR("write: shared_info_frame"); + goto out; + } + + /* Save magic-page locations. */ + memset(magic_pfns, 0, sizeof(magic_pfns)); + xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN, + &magic_pfns[0]); + xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN, + &magic_pfns[1]); + xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN, + &magic_pfns[2]); + DPRINTF("kemari_restore: magic_pfns 0: %lld, 1: %lld, 2: %lld\n", + magic_pfns[0], magic_pfns[1], magic_pfns[2]); + if ( write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) ) + { + PERROR("Error when writing to state file (7)"); + goto out; + } + + xc_get_hvm_param(xc_handle, dom, HVM_PARAM_CALLBACK_IRQ, + &callback_irq); + DPRINTF("kemari_restore: callback irq %llx", callback_irq); + if ( write_exact(io_fd, &callback_irq, sizeof(callback_irq)) ) + { + PERROR("Error when writing to state file (8)"); + goto out; + } + + print_stats(xc_handle, dom, 0, &stats, 0); + + /* Now write out each data page, canonicalising page tables as we go... 
*/ + { + unsigned int prev_pc, sent_this_iter, N, batch, run; + + iter++; + sent_this_iter = 0; + skip_this_iter = 0; + prev_pc = 0; + N = 0; + + DPRINTF("Saving memory pages: iter %d 0%%", iter); + + while ( N < p2m_size ) + { + unsigned int this_pc = (N * 100) / p2m_size; + + if ( (this_pc - prev_pc) >= 5 ) + { + DPRINTF("\b\b\b\b%3d%%", this_pc); + prev_pc = this_pc; + } + + /* load pfn_type[] with the mfn of all the pages we''re doing in + this batch. */ + for ( batch = 0; + (batch < MAX_BATCH_SIZE) && (N < p2m_size); + N++ ) + { + int n = N; + + if ( debug ) + { + DPRINTF("%d pfn= %08lx mfn= %08lx %d", + iter, (unsigned long)n, + (long unsigned int)0, + test_bit(n, to_send)); + DPRINTF("\n"); + } + + if ( !( (test_bit(n, to_send)) || (test_bit(n, to_fix))) ) + continue; + +#if 0 + /* Skip PFNs that aren''t really there */ + if (((n >= 0xa0 && n < 0xc0) /* VGA hole */ + || (n >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT) + && n < (1ULL<<32) >> PAGE_SHIFT)) /* MMIO */ ) { + if (n >= shared_info_frame && n <= shared_info_frame + 32) { + /* DPRINTF("shared_info_frame or grant: %d\n", n); */ + } else { + continue; + } + } +#endif + + /* + ** we get here if: + ** 1. page is marked to_send & hasn''t already been re-dirtied + ** 2. add in pages that still need fixup (net bufs) + */ + + /* Hypercall interfaces operate in PFNs for HVM guests + * and MFNs for PV guests */ + pfn_type[batch] = n; + + if ( !is_mapped(pfn_type[batch]) ) + { + /* + ** not currently in psuedo-physical map -- set bit + ** in to_fix since we must send this page in last_iter + ** unless its sent sooner anyhow, or it never enters + ** pseudo-physical map (e.g. for ballooned down doms) + */ + set_bit(n, to_fix); + continue; + } + + if ( test_bit(n, to_fix) && + !test_bit(n, to_send) ) + { + needed_to_fix++; + DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n", + iter, n, pfn_type[batch]); + } + + clear_bit(n, to_fix); + + batch++; + } + + if ( batch == 0 ) + goto skip; /* vanishingly unlikely... 
*/ + + region_base = xc_map_foreign_batch( + xc_handle, dom, PROT_READ, pfn_type, batch); + if ( region_base == NULL ) + { + ERROR("map batch failed"); + goto out; + } + + { + /* Look for and skip completely empty batches. */ + for ( j = 0; j < batch; j++ ) + if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) != + XEN_DOMCTL_PFINFO_XTAB ) + break; + if ( j == batch ) + { + munmap(region_base, batch*PAGE_SIZE); + continue; /* bail on this batch: no valid pages */ + } + } + + if ( write_exact(io_fd, &batch, sizeof(unsigned int)) ) + { + PERROR("Error when writing to state file (2)"); + goto out; + } + + if ( write_exact(io_fd, pfn_type, sizeof(unsigned long)*batch) ) + { + PERROR("Error when writing to state file (3)"); + goto out; + } + + /* entering this loop, pfn_type is now in pfns (Not mfns) */ + run = 0; + for ( j = 0; j < batch; j++ ) + { + unsigned long pfn, pagetype; + + pfn = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK; + pagetype = pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK; + + if ( pagetype != 0 ) + { + /* If the page is not a normal data page, write out any + run of pages we may have previously accumulated */ + if ( run ) + { + if ( ratewrite(io_fd, + (char*)region_base+(PAGE_SIZE*(j-run)), + PAGE_SIZE*run) != PAGE_SIZE*run ) + { + ERROR("Error when writing to state file (4a)" + " (errno %d)", errno); + goto out; + } + run = 0; + } + } + + /* skip pages that aren''t present */ + if ( pagetype == XEN_DOMCTL_PFINFO_XTAB ) + continue; + + pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK; + + if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) && + (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) ) + { + DPRINTF("canonicalize_pagetable pagetype = %lx pfn = %lu\n", pagetype, pfn); + } + else + { + /* We have a normal page: accumulate it for writing. 
*/ + run++; + } + } /* end of the write out for this batch */ + + if ( run ) + { + /* write out the last accumulated run of pages */ + if ( ratewrite(io_fd, + (char*)region_base+(PAGE_SIZE*(j-run)), + PAGE_SIZE*run) != PAGE_SIZE*run ) + { + ERROR("Error when writing to state file (4c)" + " (errno %d)", errno); + goto out; + } + } + + sent_this_iter += batch; + + munmap(region_base, batch*PAGE_SIZE); + + } /* end of this while loop for this iteration */ + + skip: + + total_sent += sent_this_iter; + + DPRINTF("\r %d: sent %d, skipped %d, ", + iter, sent_this_iter, skip_this_iter ); + + { + print_stats( xc_handle, dom, sent_this_iter, &stats, 1); + + DPRINTF("Total pages sent= %ld (%.2fx)\n", + total_sent, ((float)total_sent)/p2m_size ); + DPRINTF("(of which %ld were fixups)\n", needed_to_fix ); + } + } /* end of infinite for loop */ + + DPRINTF("All memory is saved\n"); + + if (send_hvm_params(xc_handle, io_fd, dom) < 0) + goto out; + + /* Zero terminate */ + i = 0; + if ( write_exact(io_fd, &i, sizeof(int)) ) + { + PERROR("Error when writing to state file (6'')"); + goto out; + } + + if (send_hvm_context(xc_handle, io_fd, ring, dom) < 0) + goto out; + + if (!debug) + { + int rcv_status; + if ( read_exact(io_fd, &rcv_status, sizeof(int))) { + ERROR("Error when reading receiver status"); + goto out; + } + DPRINTF("status received: %d\n", rcv_status); + } + + if (init_guest_mem(xc_handle, dom) < 0) + goto out; + + /* HVM guests are done now */ + rc = 0; + + out: + + /* Flush last write and discard cache for file. 
*/ + discard_file_cache(io_fd, 1 /* flush */); + + free(to_send); + free(to_fix); + + DPRINTF("Save exit rc=%d\n",rc); + + return !!rc; +} + + +int xc_kemari_update(int xc_handle, int io_fd, uint32_t dom, + void *kemari_ring, uint32_t flags, + void (*qemu_save_image)(int), + void (*qemu_end_flip)(void), + void (*qemu_end_save)(void), + void (*qemu_image_sent)(void)) +{ + int rc = 1, k; + int debug = (flags & XCFLAGS_DEBUG); + uint32_t i, j, index = 0; + unsigned int batch = 0; + struct kemari_ring *ring = (struct kemari_ring *)kemari_ring; + struct kemari_ent *buf; + struct iovec iov[MAX_BATCH_SIZE + 2]; /* 2 for batch and pfn_type */ + int iovcnt = 2; + +#define ADD_IOV(base, len) do { \ + iov[iovcnt].iov_base = base; \ + iov[iovcnt].iov_len = len; \ + iovcnt++; \ +} while (0) + + + + /* flip active qemu */ + qemu_active = qemu_non_active; + qemu_non_active = qemu_active ? 0 : 1; + qemu_save_image(qemu_active); + + /* + * main iteration starts from here + */ + while (ring->cons < ring->prod) { + + kemari_ring_read(ring, &buf); + + for (i = buf->u.index.start, j = buf->u.index.end; i < j; i++) { + + int next, offset = 0; + + index = i * BITS_PER_LONG; + + kemari_ring_read(ring, &buf); + + while (buf->u.dirty_bitmap && offset < BITS_PER_LONG) { + int n; + next = ffs(buf->u.dirty_bitmap); + buf->u.dirty_bitmap >>= next; + offset += next; + n = offset + index - 1; +#if 0 + if (((n >= 0xa0 && n < 0xc0) /* VGA hole */ + || (n >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT) + && n < (1ULL<<32) >> PAGE_SHIFT)) /* MMIO */ ) { + if (n >= shared_info_frame && n <= shared_info_frame + 32) { + ; + } else { + continue; + } + } +#endif + ADD_IOV(search_guest_mem(xc_handle, dom, n), PAGE_SIZE); + pfn_type[batch] = n; + batch++; + } + + if ((batch + BITS_PER_LONG - 1 < MAX_BATCH_SIZE) && + !(ring->cons == ring->prod)) + continue; + + /* Pull in the dirty bits from qemu-dm too */ + qemu_end_flip(); + for ( k = 0; k < BITMAP_SIZE / BITS_PER_LONG; k++) { + if 
(qemu_bitmaps[qemu_non_active][k] != 0) { + unsigned int bmp = qemu_bitmaps[qemu_non_active][k]; + + index = k * BITS_PER_LONG; + while (bmp && offset < BITS_PER_LONG) { + int n, next, offset = 0; + next = ffs(bmp); + bmp >>= next; + offset += next; + n = offset + index - 1; + + ADD_IOV(search_guest_mem(xc_handle, dom, n), PAGE_SIZE); + pfn_type[batch] = n; + batch++; + } + qemu_bitmaps[qemu_non_active][k] = 0; + } + if (batch >= MAX_BATCH_SIZE) { + ERROR("Sorry, reached MAX_BATCH_SIZE. " + "We will fix this lator."); + goto out; + } + } + + PPRINTF("batch %d\n", batch); + + /* send pages */ + iov[0].iov_base = &batch; + iov[0].iov_len = sizeof(batch); + + iov[1].iov_base = pfn_type; + iov[1].iov_len = sizeof(pfn_type[0]) * batch; + + for (k = 0; k < iovcnt / IOV_MAX + 1; k++) { + int count = (iovcnt<IOV_MAX*(k+1))?(iovcnt-IOV_MAX*k):IOV_MAX; + if (writev_exact(io_fd, &iov[IOV_MAX * k], count)) { + ERROR("Error when writing pages state file (2--4)" + " (errno %d)", errno); + goto out; + } + } + + batch = 0; + } + } + + if (send_hvm_params(xc_handle, io_fd, dom) < 0) + goto out; + qemu_end_save(); + if (!debug && send_qemu_image(xc_handle, io_fd, dom) < 0) + goto out; + qemu_image_sent(); + + /* Zero terminate */ + i = 0; + if ( write_exact(io_fd, &i, sizeof(int)) ) + { + PERROR("Error when writing to state file (6'')"); + goto out; + } + + if (send_hvm_context(xc_handle, io_fd, ring, dom) < 0) + goto out; + + if (!debug) + { + int rcv_status; + if ( read_exact(io_fd, &rcv_status, sizeof(int))) { + ERROR("Error when reading receiver status"); + goto out; + } + } + + rc = 0; +out: + + return rc; +} + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Yoshiaki Tamura
2009-Mar-12 01:18 UTC
[Xen-devel] [RFC][PATCH 06/13] Kemari: Kemari receiver
This is an updated version of the following patch. Followed the changes in live migration code. http://lists.xensource.com/archives/html/xen-devel/2009-03/msg00375.html Signed-off-by: Yoshisato Yanagisawa <yanagisawa.yoshisato@lab.ntt.co.jp> Signed-off-by: Yoshi Tamura <tamura.yoshiaki@lab.ntt.co.jp> --- tools/libxc/xc_dom_kemari_restore.c | 727 ++++++++++++++++++++++++++++++++++++ tools/xcutils/xc_kemari_restore.c | 88 ++++ 2 files changed, 815 insertions(+) diff -r b249f3e979a5 -r cf6a910e3663 tools/xcutils/xc_kemari_restore.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/xcutils/xc_kemari_restore.c Wed Mar 11 18:03:47 2009 +0900 @@ -0,0 +1,88 @@ +/* + * xc_kemari_restore.c + * + * Restore the state of a running Linux session. + * + * Copyright (c) 2008 Nippon Telegraph and Telephone Corporation. + * + * This file is subject to the terms and conditions of the GNU General + * Public License. See the file "COPYING" in the main directory of + * this archive for more details. + * + * This source code is based on xc_restore.c. 
+ * + * Copyright (C) 2005 by Christian Limpach + * + */ +#include <sys/socket.h> +#include <netinet/in.h> +#include <netinet/tcp.h> + +#include <err.h> +#include <stdlib.h> +#include <stdint.h> +#include <stdio.h> +#include <signal.h> +#include <unistd.h> + +#include <xenctrl.h> +#include <xenguest.h> +#include <xc_private.h> + +static int io_fd; + +static void close_handler(int sig_type) +{ + /* let xc_kemari_restore move build process */ + close(io_fd); +} + +int +main(int argc, char **argv) +{ + unsigned int domid, store_evtchn, console_evtchn; + unsigned int hvm, pae, apic; + int xc_fd, ret, one = 1; + unsigned long store_mfn, console_mfn; + struct sigaction act; + + if ( argc != 8 ) + errx(1, "usage: %s iofd domid store_evtchn " + "console_evtchn hvm pae apic", argv[0]); + + xc_fd = xc_interface_open(); + if ( xc_fd < 0 ) + errx(1, "failed to open control interface"); + + io_fd = atoi(argv[1]); + domid = atoi(argv[2]); + store_evtchn = atoi(argv[3]); + console_evtchn = atoi(argv[4]); + hvm = atoi(argv[5]); + pae = atoi(argv[6]); + apic = atoi(argv[7]); + + act.sa_handler = close_handler; + sigemptyset(&act.sa_mask); + act.sa_flags = 0; + sigaction(SIGHUP, &act, 0); + sigaction(SIGINT, &act, 0); + + if ( setsockopt(io_fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one)) < 0 ) + DPRINTF("failed to set TCP_NODELAY"); + + ret = xc_kemari_restore(xc_fd, io_fd, domid, store_evtchn, &store_mfn, + console_evtchn, &console_mfn, hvm, pae); + + if ( ret == 0 ) + { + printf("store-mfn %li\n", store_mfn); + if ( !hvm ) + printf("console-mfn %li\n", console_mfn); + fflush(stdout); + } + + xc_interface_close(xc_fd); + + return ret; +} diff -r b249f3e979a5 -r cf6a910e3663 tools/libxc/xc_dom_kemari_restore.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/libxc/xc_dom_kemari_restore.c Wed Mar 11 18:03:47 2009 +0900 @@ -0,0 +1,727 @@ +/****************************************************************************** + * xc_dom_kemari_restore.c + * + * Restore the state of a 
guest session for kemari. + * + * Copyright (c) 2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * This source code is based on xc_domain_restore.c. + * + * Copyright (c) 2003, K A Fraser. + * Copyright (c) 2006, Intel Corporation + * Copyright (c) 2007, XenSource Inc. + */ + +#include <stdlib.h> +#include <unistd.h> + +#include "xg_private.h" +#include "xg_save_restore.h" +#include "xc_dom.h" + +#include <xen/hvm/ioreq.h> +#include <xen/hvm/params.h> + +/* number of pfns this guest has (i.e. number of entries in the P2M) */ +static unsigned long p2m_size; + +/* number of ''in use'' pfns in the guest (i.e. #P2M entries with a valid mfn) */ +static unsigned long nr_pfns; + +/* A table mapping each PFN to its new MFN. */ +static xen_pfn_t *p2m = NULL; + +/* A table of P2M mappings in the current region */ +static xen_pfn_t *p2m_batch = NULL; + +int xc_kemari_restore(int xc_handle, int io_fd, uint32_t dom, + unsigned int store_evtchn, unsigned long *store_mfn, + unsigned int console_evtchn, unsigned long *console_mfn, + unsigned int hvm, unsigned int pae) +{ + int rc = 1, frc, i, n, m; + unsigned long mfn, pfn; + unsigned int prev_pc, this_pc; + + /* The new domain''s shared-info frame number. */ + unsigned long shared_info_frame; + + /* A table containing the type of each PFN (/not/ MFN!). 
*/ + unsigned long *pfn_type = NULL; + + /* A table of MFNs to map in the current region */ + xen_pfn_t *region_mfn = NULL; + + /* Types of the pfns in the current region */ + unsigned long region_pfn_type[MAX_BATCH_SIZE]; + + /* Our mapping of the current region (batch) */ + char *region_base; + + /* Magic frames in HVM guests: ioreqs and xenstore comms. */ + uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */ + + /* Temporary buffered memory space until all pages are read. */ + char *tmp_region = NULL; + + /* if true, go into transaction mode */ + int kemari_transaction_mode = 0; + + /* index for grant table */ + int grant_idx = 0; + + /* Callback IRQ */ + uint64_t callback_irq = 0; + + /* active and non-active id of flip buffer */ + int info_active = 0, info_non_active = 1; + + /* Buffer for holding HVM context */ + uint8_t *hvm_buf[2] = {NULL,NULL}; + uint32_t hvm_buf_size = 0; + + /* Buffer for qemu image */ + uint8_t *qemu_image[2] = {NULL,NULL}; + uint32_t qemu_image_size[2] = {0,0}; + uint32_t qemu_buff_size = 0; + + /* Buffer for the EPT identity PT location. */ + uint64_t ident_pt[2] = {0,0}; + /* Buffer for the VM86 TSS. */ + uint64_t vm86_tss[2] = {0,0}; + + if ( !hvm ) { + ERROR("Kemari only works on HVM domain."); + goto out; + } + + /* For info only */ + nr_pfns = 0; + + if ( read_exact(io_fd, &p2m_size, sizeof(unsigned long)) ) + { + ERROR("read: p2m_size"); + goto out; + } + DPRINTF("xc_kemari_restore start: p2m_size = %lx\n", p2m_size); + + /* We want zeroed memory so use calloc rather than malloc. 
*/ + p2m = calloc(p2m_size, sizeof(xen_pfn_t)); + pfn_type = calloc(p2m_size, sizeof(unsigned long)); + + region_mfn = xg_memalign(PAGE_SIZE, ROUNDUP( + MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT)); + p2m_batch = xg_memalign(PAGE_SIZE, ROUNDUP( + MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT)); + + /* use aligned page for speed up memmove(3) */ + tmp_region = xg_memalign(PAGE_SIZE, PAGE_SIZE * MAX_BATCH_SIZE); + + if ( (p2m == NULL) || (pfn_type == NULL) || + (region_mfn == NULL) || (p2m_batch == NULL) || + (tmp_region == NULL) ) + { + ERROR("memory alloc failed"); + errno = ENOMEM; + goto out; + } + + memset(region_mfn, 0, + ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT)); + memset(p2m_batch, 0, + ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT)); + memset(tmp_region, 0, PAGE_SIZE * MAX_BATCH_SIZE); + + if ( lock_pages(region_mfn, sizeof(xen_pfn_t) * MAX_BATCH_SIZE) ) + { + ERROR("Could not lock region_mfn"); + goto out; + } + + if ( lock_pages(p2m_batch, sizeof(xen_pfn_t) * MAX_BATCH_SIZE) ) + { + ERROR("Could not lock p2m_batch"); + goto out; + } + + if ( lock_pages(tmp_region, sizeof(xen_pfn_t) * MAX_BATCH_SIZE) ) + { + ERROR("Could not lock region_mfn"); + goto out; + } + + /* Get the domain''s shared-info frame. */ + if ( read_exact(io_fd, &shared_info_frame, sizeof(unsigned long))) + { + ERROR("Error when reading shared_info_frame"); + goto out; + } + DPRINTF("xc_kemari_restore: shared_info_frame: %lx\n", shared_info_frame); + + /* read HVM-specific parameters */ + if ( read_exact(io_fd, magic_pfns, sizeof(magic_pfns)) ) + { + ERROR("error reading magic page addresses"); + goto out; + } + + if (read_exact(io_fd, &callback_irq, sizeof(callback_irq))) + { + ERROR("error reading magic page addresses"); + goto out; + } + + /* Mark all PFNs as invalid; we allocate on demand */ + for ( pfn = 0; pfn < p2m_size; pfn++ ) + p2m[pfn] = INVALID_P2M_ENTRY; + + /* + * Now simply read each saved frame into its new machine frame. 
+ * We uncanonicalise page tables as we go. + */ + prev_pc = 0; + + n = m = 0; + for ( ; ; ) + { + int num_pages; + int nr_mfns; + + num_pages = 0; + for ( ; ; ) { + int j; + + this_pc = (n * 100) / p2m_size; + if ( (this_pc - prev_pc) >= 5 ) + { + PPRINTF("\b\b\b\b%3d%%", this_pc); + prev_pc = this_pc; + } + + if ( read_exact(io_fd, &j, sizeof(int)) ) + { + ERROR("Error when reading batch size"); + goto build; + } + + PPRINTF("batch %d\n",j); + + if (j == -1) + { + uint32_t rec_size; + if ( read_exact(io_fd, &rec_size, sizeof(uint32_t)) ) + { + ERROR("error read the qemu file size"); + goto build; + } + + if (qemu_buff_size < rec_size) + { + qemu_buff_size = rec_size; + qemu_image[0] = realloc(qemu_image[0], qemu_buff_size); + qemu_image[1] = realloc(qemu_image[1], qemu_buff_size); + if ((qemu_image[0] == NULL) || (qemu_image[1] == NULL)) + { + ERROR("error allocate memory"); + goto out; + } + } + + qemu_image_size[info_non_active] = rec_size; + if ( read_exact(io_fd, qemu_image[info_non_active], + qemu_image_size[info_non_active]) ) + { + ERROR("error read the qemu image file"); + goto build; + } + + continue; + } + + if ( j == -3 ) + { + /* Skip padding 4 bytes then read the EPT identity PT location. */ + if ( read_exact(io_fd, &ident_pt[info_non_active], + sizeof(uint32_t)) || + read_exact(io_fd, &ident_pt[info_non_active], + sizeof(uint64_t)) ) + { + ERROR("error read the address of the EPT identity map"); + goto build; + } + + continue; + } + + if ( j == -4 ) + { + /* Skip padding 4 bytes then read the vm86 TSS location. */ + if ( read_exact(io_fd, &vm86_tss[info_non_active], + sizeof(uint32_t)) || + read_exact(io_fd, &vm86_tss[info_non_active], + sizeof(uint64_t)) ) + { + ERROR("error read the address of the vm86 TSS"); + goto out; + } + + continue; + } + + if ( j == 0 ) + break; /* our work here is done */ + + /* j > 0: Read pages here */ + if ( (j > MAX_BATCH_SIZE) || (j < 0) ) + { + ERROR("Max batch size exceeded. Giving up. 
%d", j); + goto out; + } + + if ( read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long)) ) + { + ERROR("Error when reading region pfn types"); + goto build; + } + + if (kemari_transaction_mode) { + if (num_pages != 0) + { + ERROR("Sorry! You cannot execute page-send-phase " + "twice. We will fix this bug in the future."); + DPRINTF("Sorry\n"); + goto out; + } + num_pages = j; + + /* Since there are not invalid pages, we don''t need to skip */ + if ( read_exact(io_fd, tmp_region, PAGE_SIZE * num_pages) ) + { + ERROR("Error when reading page at kemari transaction mode"); + goto build; + } + + continue; + } + + /* Normal mode */ + /* First pass for this batch: work out how much memory to alloc */ + nr_mfns = 0; + for ( i = 0; i < j; i++ ) + { + unsigned long pfn, pagetype; + pfn = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK; + pagetype = region_pfn_type[i] & XEN_DOMCTL_PFINFO_LTAB_MASK; + + if ( (pagetype != XEN_DOMCTL_PFINFO_XTAB) && + (p2m[pfn] == INVALID_P2M_ENTRY) ) + { + /* Have a live PFN which hasn''t had an MFN allocated */ + p2m_batch[nr_mfns++] = pfn; + p2m[pfn]--; + } + } + + /* Now allocate a bunch of mfns for this batch */ + if ( nr_mfns && + (xc_domain_memory_populate_physmap(xc_handle, dom, nr_mfns, 0, + 0, p2m_batch) != 0) ) + { + ERROR("Failed to allocate memory for batch.! 
%d\n", nr_mfns); + for (i = 0; i < nr_mfns; i++) + DPRINTF("p2m_batch[%d] = %lx\n", i, p2m_batch[i]); + errno = ENOMEM; + goto out; + } + + /* set special pages */ + { + struct xen_add_to_physmap xatp; + for (i = 0; i < nr_mfns; i++) + if (p2m_batch[i] == shared_info_frame) { + xatp.domid = dom; + xatp.space = XENMAPSPACE_shared_info; + xatp.idx = 0; + xatp.gpfn = shared_info_frame; + DPRINTF("setting up shared_info_frame: %lu\n", + shared_info_frame); + if (xc_memory_op(xc_handle, XENMEM_add_to_physmap, &xatp) + != 0) + { + ERROR("Error setting shared_info_frame"); + goto out; + } + } else if ((p2m_batch[i] > shared_info_frame) + && (p2m_batch[i] <= shared_info_frame + 32)) { + xatp.domid = dom; + xatp.space = XENMAPSPACE_grant_table; + xatp.idx = grant_idx; + xatp.gpfn = p2m_batch[i]; + DPRINTF("grant[%d]: %lu\n", grant_idx, xatp.gpfn); + if (xc_memory_op(xc_handle, XENMEM_add_to_physmap, + &xatp) != 0) + { + PERROR("Cannot map grant table pfn: %lu", xatp.gpfn); + goto out; + } + grant_idx++; + } + } + + /* Second pass for this batch: update p2m[] and region_mfn[] */ + nr_mfns = 0; + for ( i = 0; i < j; i++ ) + { + unsigned long pfn, pagetype; + pfn = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK; + pagetype = region_pfn_type[i] & XEN_DOMCTL_PFINFO_LTAB_MASK; + + if ( pagetype == XEN_DOMCTL_PFINFO_XTAB ) + region_mfn[i] = ~0UL; /* map will fail but we don''t care */ + else + { + if ( p2m[pfn] == (INVALID_P2M_ENTRY-1) ) + { + /* We just allocated a new mfn above; update p2m */ + p2m[pfn] = p2m_batch[nr_mfns++]; + nr_pfns++; + } + + /* setup region_mfn[] for batch map. 
+ * For HVM guests, this interface takes PFNs, not MFNs */ + region_mfn[i] = pfn; + } + } + + /* Map relevant mfns */ + region_base = xc_map_foreign_batch( + xc_handle, dom, PROT_WRITE, region_mfn, j); + + if ( region_base == NULL ) + { + ERROR("map batch failed"); + goto out; + } + + for ( i = 0; i < j; i++ ) + { + void *page; + unsigned long pagetype; + pfn = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK; + pagetype = region_pfn_type[i] & XEN_DOMCTL_PFINFO_LTAB_MASK; + + if ( pagetype == XEN_DOMCTL_PFINFO_XTAB ) + /* a bogus/unmapped page: skip it */ + continue; + + if ( pfn > p2m_size ) + { + ERROR("pfn out of range"); + goto out; + } + + pfn_type[pfn] = pagetype; + + mfn = p2m[pfn]; + + page = region_base + i*PAGE_SIZE; + + if ( read_exact(io_fd, page, PAGE_SIZE) ) + { + ERROR("Error when reading page (type was %lx)", pagetype); + goto out; + } + + pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK; + + if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) && + (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) ) + { + DPRINTF("uncanonicalize_pagetable pagetype = %lx pfn = %lu\n", pagetype, pfn); + } + else if ( pagetype != XEN_DOMCTL_PFINFO_NOTAB ) + { + ERROR("Bogus page type %lx page table is out of range: " + "i=%d p2m_size=%lu", pagetype, i, p2m_size); + goto out; + + } + } /* end of ''batch'' for loop */ + + munmap(region_base, j*PAGE_SIZE); + n+= j; /* crude stats */ + } + + /* HVM specific */ + { + uint32_t rec_len; + + /* Read HVM context */ + if ( read_exact(io_fd, &rec_len, sizeof(uint32_t)) ) + { + ERROR("error read hvm context size!\n"); + goto build; + } + + if (rec_len != hvm_buf_size) + { + if (hvm_buf[info_non_active] == NULL) + { /* hvm_buf will be reused. 
*/ + hvm_buf_size = rec_len; + hvm_buf[0] = malloc(hvm_buf_size); + hvm_buf[1] = malloc(hvm_buf_size); + if ( hvm_buf[0] == NULL || hvm_buf[1] == NULL) + { + ERROR("memory alloc for hvm context buffer failed"); + errno = ENOMEM; + goto out; + } + } else { + ERROR("Sorry, we did not thought about HVM image size " + "change."); + goto out; + } + } + + if ( read_exact(io_fd, hvm_buf[info_non_active], hvm_buf_size) ) + { + ERROR("error loading the HVM context"); + goto build; + } + } + + /* + * Commit! + */ + { + int zero = 0; + + if ( write_exact(io_fd, &zero, sizeof(int))) { + ERROR("Error when replying to sender (errno %d)", errno); + goto out; + } + } + + /* commit pages */ + if (kemari_transaction_mode && num_pages > 0) + { + int nr_mfns; + /* First pass for this batch: work out how much memory to alloc */ + nr_mfns = 0; + for ( i = 0; i < num_pages; i++ ) + { + unsigned long pfn, pagetype; + pfn = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK; + pagetype = region_pfn_type[i] & XEN_DOMCTL_PFINFO_LTAB_MASK; + + if ( (pagetype != XEN_DOMCTL_PFINFO_XTAB) && + (p2m[pfn] == INVALID_P2M_ENTRY) ) + { + /* Have a live PFN which hasn''t had an MFN allocated */ + p2m_batch[nr_mfns++] = pfn; + p2m[pfn]--; + DPRINTF("Cannot be occur!!! 
no map for pfn: %lu\n", pfn); + } + } + + /* Now allocate a bunch of mfns for this batch */ + if ( nr_mfns && + (xc_domain_memory_populate_physmap(xc_handle, dom, nr_mfns, 0, + 0, p2m_batch) != 0) ) + { + ERROR("Failed to allocate memory for batch.!\n"); + errno = ENOMEM; + goto out; + } + + /* Second pass for this batch: update p2m[] and region_mfn[] */ + nr_mfns = 0; + for ( i = 0; i < num_pages; i++ ) + { + unsigned long pfn, pagetype; + pfn = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK; + pagetype = region_pfn_type[i] & XEN_DOMCTL_PFINFO_LTAB_MASK; + + if ( pagetype == XEN_DOMCTL_PFINFO_XTAB ) { + DPRINTF("pfn %lu = XEN_DOMCTL_PFINFO_XTAB\n", pfn); + region_mfn[i] = ~0UL; /* map will fail but we don''t care */ + } + else + { + if ( p2m[pfn] == (INVALID_P2M_ENTRY-1) ) + { + /* We just allocated a new mfn above; update p2m */ + p2m[pfn] = p2m_batch[nr_mfns++]; + nr_pfns++; + } + + /* setup region_mfn[] for batch map. + * For HVM guests, this interface takes PFNs, not MFNs */ + region_mfn[i] = pfn; + } + } + + /* Map relevant mfns */ + region_base = xc_map_foreign_batch( + xc_handle, dom, PROT_WRITE, region_mfn, num_pages); + + if ( region_base == NULL ) + { + ERROR("map batch failed"); + goto out; + } + + for ( i = 0; i < num_pages; i++ ) + { + void *page, *spage; + unsigned long pagetype; + + pfn = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK; + pagetype = region_pfn_type[i] & XEN_DOMCTL_PFINFO_LTAB_MASK; + + if ( pfn > p2m_size ) + { + ERROR("pfn out of range"); + goto out; + } + + pfn_type[pfn] = pagetype; + + mfn = p2m[pfn]; + + page = region_base + i*PAGE_SIZE; + spage = tmp_region + i*PAGE_SIZE; + + if ( !memmove(page, spage, PAGE_SIZE) ) + { + ERROR("Error when reading page (type was %lx)", pagetype); + goto out; + } + + } /* end of ''batch'' for loop */ + + munmap(region_base, num_pages*PAGE_SIZE); + num_pages = 0; /* clear num_pages for refill */ + } + + /* commit HVM specific status */ + info_active = info_non_active; + info_non_active = 
info_active ? 0 : 1; + + /* HVM success! */ + rc = 0; + kemari_transaction_mode = 1; + } + + build: /* building HVM context */ + DPRINTF("building status %d\n", rc); + if (rc == 0) + { + FILE *qemu_fp; + char path[128]; + + /* set the EPT identity PT location */ + xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT, + ident_pt[info_active]); + xc_set_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS, + vm86_tss[info_active]); + + if ( (frc = xc_set_hvm_param(xc_handle, dom, + HVM_PARAM_IOREQ_PFN, magic_pfns[0])) + || (frc = xc_set_hvm_param(xc_handle, dom, + HVM_PARAM_BUFIOREQ_PFN, magic_pfns[1])) + || (frc = xc_set_hvm_param(xc_handle, dom, + HVM_PARAM_STORE_PFN, magic_pfns[2])) + || (frc = xc_set_hvm_param(xc_handle, dom, + HVM_PARAM_PAE_ENABLED, pae)) + || (frc = xc_set_hvm_param(xc_handle, dom, + HVM_PARAM_STORE_EVTCHN, + store_evtchn)) + || (frc = xc_set_hvm_param(xc_handle, dom, + HVM_PARAM_CALLBACK_IRQ, + callback_irq)) ) + { + ERROR("error setting HVM params: %i", frc); + rc = 3; + goto out; + } + *store_mfn = magic_pfns[2]; + DPRINTF("kemari_restore: magic_pfns 0: %lld, 1: %lld, 2: %lld\n", + magic_pfns[0], magic_pfns[1], magic_pfns[2]); + + frc = xc_domain_hvm_setcontext(xc_handle, dom, hvm_buf[info_active], + hvm_buf_size); + if ( frc ) + { + ERROR("error setting the HVM context"); + rc = 4; + goto out; + } + + if (qemu_image_size[info_active] == 0) + { + ERROR("Did not received QEMU image"); + rc = 5; + goto out; + } + snprintf(path, sizeof(path), "/var/lib/xen/qemu-save.%d", dom); + if ((qemu_fp = fopen(path, "w")) == NULL) + { + ERROR("error opening QEMU image"); + rc = 5; + goto out; + } + if (fwrite(qemu_image[info_active], qemu_image_size[info_active], + 1, qemu_fp) != 1) + { + ERROR("error writing QEMU image"); + rc = 5; + goto out; + } + fclose(qemu_fp); + } + + out: + if ( (rc != 0) && (dom != 0) ) + xc_domain_destroy(xc_handle, dom); + free(p2m); + free(pfn_type); + free(region_mfn); + free(p2m_batch); + free(tmp_region); + free(hvm_buf[0]); + 
free(hvm_buf[1]); + free(qemu_image[0]); + free(qemu_image[1]); + + /* discard cache for save file */ + discard_file_cache(io_fd, 1 /*flush*/); + + DPRINTF("Restore exit with rc=%d\n", rc); + + return rc; +} + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ + _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Yoshiaki Tamura
2009-Mar-12 01:19 UTC
[Xen-devel] [RFC][PATCH 07/13] Kemari: add Kemari support to python
This is an updated version of the following patch. Followed the changes in live migration code. http://lists.xensource.com/archives/html/xen-devel/2009-03/msg00376.html Signed-off-by: Yoshisato Yanagisawa <yanagisawa.yoshisato@lab.ntt.co.jp> Signed-off-by: Yoshi Tamura <tamura.yoshiaki@lab.ntt.co.jp> --- tools/python/xen/xend/XendAPI.py | 3 - tools/python/xen/xend/XendCheckpoint.py | 86 +++++++++++++++++++++++++++----- tools/python/xen/xend/XendDomain.py | 6 +- tools/python/xen/xm/migrate.py | 10 ++- 4 files changed, 88 insertions(+), 17 deletions(-) diff -r b249f3e979a5 -r cf6a910e3663 tools/python/xen/xm/migrate.py --- a/tools/python/xen/xm/migrate.py Mon Mar 09 10:32:24 2009 +0000 +++ b/tools/python/xen/xm/migrate.py Wed Mar 11 18:03:47 2009 +0900 @@ -51,6 +51,10 @@ fn=set_true, default=None, use="Use ssl connection for migration.") +gopts.opt(''kemari'', short=''k'', + fn=set_true, default=None, + use="Use the Kemari fault tolerant migration.") + def help(): return str(gopts) @@ -70,7 +74,8 @@ other_config = { "port": opts.vals.port, "node": opts.vals.node, - "ssl": opts.vals.ssl + "ssl": opts.vals.ssl, + "kemari": opts.vals.kemari } server.xenapi.VM.migrate(vm_ref, dst, bool(opts.vals.live), other_config) @@ -78,4 +83,5 @@ server.xend.domain.migrate(dom, dst, opts.vals.live, opts.vals.port, opts.vals.node, - opts.vals.ssl) + opts.vals.ssl, + opts.vals.kemari) diff -r b249f3e979a5 -r cf6a910e3663 tools/python/xen/xend/XendAPI.py --- a/tools/python/xen/xend/XendAPI.py Mon Mar 09 10:32:24 2009 +0000 +++ b/tools/python/xen/xend/XendAPI.py Wed Mar 11 18:03:47 2009 +0900 @@ -1797,9 +1797,10 @@ port = other_config.get("port", 0) node = other_config.get("node", -1) ssl = other_config.get("ssl", None) + kemari = other_config.get("kemari", None) xendom.domain_migrate(xeninfo.getDomid(), destination_url, - bool(live), port, node, ssl) + bool(live), port, node, ssl, kemari) return xen_api_success_void() def VM_save(self, _, vm_ref, dest, checkpoint): diff -r b249f3e979a5 
-r cf6a910e3663 tools/python/xen/xend/XendCheckpoint.py --- a/tools/python/xen/xend/XendCheckpoint.py Mon Mar 09 10:32:24 2009 +0000 +++ b/tools/python/xen/xend/XendCheckpoint.py Wed Mar 11 18:03:47 2009 +0900 @@ -29,6 +29,8 @@ dm_batch = 512 XC_SAVE = "xc_save" XC_RESTORE = "xc_restore" +XC_KEMARI_SAVE = "xc_kemari_save" +XC_KEMARI_RESTORE = "xc_kemari_restore" sizeof_int = calcsize("i") @@ -64,8 +66,15 @@ list.insert (i+1, value) return +def get_dev_info(info, n): + i = 0 + while i < len(info): + if (info[i][0] == n): + return [n, info[i][1]] + i = i + 1 + return [n, ''''] -def save(fd, dominfo, network, live, dst, checkpoint=False, node=-1): +def save(fd, dominfo, network, live, dst, checkpoint=False, node=-1, kemari=False): try: if not os.path.isdir("/var/lib/xen"): os.makedirs("/var/lib/xen") @@ -76,6 +85,30 @@ write_exact(fd, SIGNATURE, "could not write guest state file: signature") sxprep = dominfo.sxpr() + + # Add kemari option if enabled. + if kemari: + sxprep.append([''kemari'', kemari]) + pv_devlist = [] + pv_devs = dominfo.getDeviceSxprs(''vbd'') + for x in pv_devs: + devinfo = [] + for n in [''event-channel'', ''ring-ref'']: + devinfo.append(get_dev_info(x[1], n)) + pv_devlist.append([x[0], devinfo]) + pv_devs = dominfo.getDeviceSxprs(''vif'') + for x in pv_devs: + devinfo = [] + for n in [''event-channel'', ''tx-ring-ref'', ''rx-ring-ref'', + ''request-rx-copy'', ''feature-rx-notify'', ''feature-sg'', + ''feature-gso-tcpv4'']: + devinfo.append(get_dev_info(x[1], n)) + pv_devlist.append([x[0], devinfo]) + sxprep.append([''kemari-device-info'', pv_devlist]) + + # Add kemari option if enabled. + if kemari: + sxprep.append([''kemari'', kemari]) if node > -1: insert_after(sxprep,''vcpus'',[''node'', str(node)]) @@ -104,9 +137,17 @@ # enabled. Passing "0" simply uses the defaults compiled into # libxenguest; see the comments and/or code in xc_linux_save() for # more information. 
- cmd = [xen.util.auxbin.pathTo(XC_SAVE), str(fd), - str(dominfo.getDomid()), "0", "0", - str(int(live) | (int(hvm) << 2)) ] + if kemari: + if not hvm: + raise XendError("You can only use kemari on HVM domain.") + + cmd = [xen.util.auxbin.pathTo(XC_KEMARI_SAVE), str(fd), + str(dominfo.getDomid()), "0", "0", + str(int(live) | (int(hvm) << 2)) ] + else: + cmd = [xen.util.auxbin.pathTo(XC_SAVE), str(fd), + str(dominfo.getDomid()), "0", "0", + str(int(live) | (int(hvm) << 2)) ] log.debug("[xc_save]: %s", string.join(cmd)) def saveInputHandler(line, tochild): @@ -132,7 +173,7 @@ forkHelper(cmd, fd, saveInputHandler, False) # put qemu device model state - if os.path.exists("/var/lib/xen/qemu-save.%d" % dominfo.getDomid()): + if not kemari and os.path.exists("/var/lib/xen/qemu-save.%d" % dominfo.getDomid()): write_exact(fd, QEMU_SIGNATURE, "could not write qemu signature") qemu_fd = os.open("/var/lib/xen/qemu-save.%d" % dominfo.getDomid(), os.O_RDONLY) @@ -198,6 +239,16 @@ raise XendError("not a valid guest state file: config parse") vmconfig = p.get_val() + + # Checks if kemari is enabled or not. + # Since Xen do not know kemari option, this option will not be migrated. 
+ is_kemari = False + kemari_device_info = [] + for v in vmconfig: + if v[0] == ''kemari'' and v[1]: + is_kemari = True + if v[0] == ''kemari-device-info'' and v[1]: + kemari_device_info = v[1] if not relocating: domconfig = XendConfig(sxp_obj = vmconfig) @@ -272,14 +323,21 @@ shadow_cur = xc.shadow_mem_control(dominfo.getDomid(), shadow / 1024) dominfo.info[''shadow_memory''] = shadow_cur - cmd = map(str, [xen.util.auxbin.pathTo(XC_RESTORE), - fd, dominfo.getDomid(), - store_port, console_port, int(is_hvm), pae, apic]) + if is_kemari: + cmd = map(str, [xen.util.auxbin.pathTo(XC_KEMARI_RESTORE), + fd, dominfo.getDomid(), + store_port, console_port, int(is_hvm), pae, apic]) + else: + cmd = map(str, [xen.util.auxbin.pathTo(XC_RESTORE), + fd, dominfo.getDomid(), + store_port, console_port, int(is_hvm), pae, apic]) log.debug("[xc_restore]: %s", string.join(cmd)) handler = RestoreInputHandler() forkHelper(cmd, fd, handler.handler, True) + if is_kemari: + os.close(fd) # We don''t want to pass this fd to any other children -- we # might need to recover the disk space that backs it. @@ -299,7 +357,7 @@ # get qemu state and create a tmp file for dm restore # Even PV guests may have QEMU stat, but its not currently # used so only bother with HVM currently. 
- if is_hvm: + if is_hvm and not is_kemari: qemu_signature = read_exact(fd, len(QEMU_SIGNATURE), "invalid device model signature read") if qemu_signature != QEMU_SIGNATURE: @@ -318,7 +376,10 @@ restore_image.setCpuid() - os.read(fd, 1) # Wait for source to close connection + if is_kemari: + restore_image.setCpuid() + else: + os.read(fd, 1) # Wait for source to close connection dominfo.completeRestore(handler.store_mfn, handler.console_mfn) @@ -336,7 +397,10 @@ lock = False; try: - dominfo.waitForDevices() # Wait for backends to set up + if is_kemari: + dominfo.waitForAttachedDevices(kemari_device_info) + else: + dominfo.waitForDevices() # Wait for backends to set up except Exception, exn: log.exception(exn) diff -r b249f3e979a5 -r cf6a910e3663 tools/python/xen/xend/XendDomain.py --- a/tools/python/xen/xend/XendDomain.py Mon Mar 09 10:32:24 2009 +0000 +++ b/tools/python/xen/xend/XendDomain.py Wed Mar 11 18:03:47 2009 +0900 @@ -1273,7 +1273,7 @@ return val - def domain_migrate(self, domid, dst, live=False, port=0, node=-1, ssl=None): + def domain_migrate(self, domid, dst, live=False, port=0, node=-1, ssl=None, kemari=None): """Start domain migration. @param domid: Domain ID or Name @@ -1338,7 +1338,7 @@ try: XendCheckpoint.save(p2cwrite, dominfo, True, live, dst, - node=node) + node=node, kemari=kemari) finally: sock.shutdown() sock.close() @@ -1364,7 +1364,7 @@ try: XendCheckpoint.save(sock.fileno(), dominfo, True, live, - dst, node=node) + dst, node=node, kemari=kemari) finally: sock.close() _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Yoshiaki Tamura
2009-Mar-12 01:19 UTC
[Xen-devel] [RFC][PATCH 08/13] Kemari: add dev state "Attached" to python
This is an updated version of the following patch. No major changes. http://lists.xensource.com/archives/html/xen-devel/2009-03/msg00377.html Signed-off-by: Yoshisato Yanagisawa <yanagisawa.yoshisato@lab.ntt.co.jp> Signed-off-by: Yoshi Tamura <tamura.yoshiaki@lab.ntt.co.jp> --- tools/python/xen/xend/XendDomainInfo.py | 8 +++ tools/python/xen/xend/server/DevConstants.py | 1 tools/python/xen/xend/server/DevController.py | 60 ++++++++++++++++++++++++++ tools/python/xen/xend/server/vfbif.py | 4 + 4 files changed, 73 insertions(+) diff -r b249f3e979a5 -r cf6a910e3663 tools/python/xen/xend/server/DevConstants.py --- a/tools/python/xen/xend/server/DevConstants.py Mon Mar 09 10:32:24 2009 +0000 +++ b/tools/python/xen/xend/server/DevConstants.py Wed Mar 11 18:03:47 2009 +0900 @@ -40,6 +40,7 @@ ''Closed'' : 6, ''Reconfiguring'' : 7, ''Reconfigured'' : 8, + ''Attached'' : 9, } xenbusState.update(dict(zip(xenbusState.values(), xenbusState.keys()))) diff -r b249f3e979a5 -r cf6a910e3663 tools/python/xen/xend/server/DevController.py --- a/tools/python/xen/xend/server/DevController.py Mon Mar 09 10:32:24 2009 +0000 +++ b/tools/python/xen/xend/server/DevController.py Wed Mar 11 18:03:47 2009 +0900 @@ -176,6 +176,59 @@ (devid, self.deviceClass, err)) + def waitForAttachedDevices(self, devinfo): + log.debug("Waiting for attached devices %s.", self.deviceClass) + seq = self.deviceIDs() + return [self.waitForAttachedDevice(item, devinfo) for item in seq] + + + def waitForAttachedDevice(self, devid, devinfo): + log.debug("Waiting for attached %s.", devid) + + if not self.hotplug: + return + + (status, err) = self.waitForBackend(devid) + + if status == Timeout: + self.destroyDevice(devid, False) + raise VmError("Device %s (%s) could not be connected. " + "Hotplug scripts not working." % + (devid, self.deviceClass)) + + elif status == Error: + self.destroyDevice(devid, False) + raise VmError("Device %s (%s) could not be connected. " + "Backend device not found." 
% + (devid, self.deviceClass)) + + elif status == Missing: + # Don''t try to destroy the device; it''s already gone away. + raise VmError("Device %s (%s) could not be connected. " + "Device not found." % (devid, self.deviceClass)) + + elif status == Busy: + err = None + frontpath = self.frontendPath(devid) + backpath = xstransact.Read(frontpath, "backend") + if backpath: + err = xstransact.Read(backpath, HOTPLUG_ERROR_NODE) + if not err: + err = "Busy." + + self.destroyDevice(devid, False) + raise VmError("Device %s (%s) could not be connected.\n%s" % + (devid, self.deviceClass, err)) + + for x in devinfo: + if x[0] == str(devid): # x[0] was changed to string for transfer. + for y in x[1]: + if y[0] and y[1]: + self.writeFrontend(devid, y[0], str(y[1])) + log.debug("%s %s set for %s.", y[0], y[1], devid) + self.writeFrontend(devid, ''state'', str(xenbusState[''Attached''])) + + def waitForDevice_destroy(self, devid, backpath): log.debug("Waiting for %s - destroyDevice.", devid) @@ -473,6 +526,13 @@ else: raise VmError("Device %s not connected" % devid) + def writeFrontend(self, devid, *args): + frontpath = self.frontendPath(devid) + + if frontpath: + xstransact.Write(frontpath, *args) + else: + raise VmError("Device %s not connected" % devid) ## private: diff -r b249f3e979a5 -r cf6a910e3663 tools/python/xen/xend/server/vfbif.py --- a/tools/python/xen/xend/server/vfbif.py Mon Mar 09 10:32:24 2009 +0000 +++ b/tools/python/xen/xend/server/vfbif.py Wed Mar 11 18:03:47 2009 +0900 @@ -39,6 +39,10 @@ if devinfo[i] is not None]) def waitForDevice(self, devid): + # is a qemu-dm managed device, don''t wait for hotplug for these. + return + + def waitForAttachedDevice(self, devid, devinfo): # is a qemu-dm managed device, don''t wait for hotplug for these. 
return diff -r b249f3e979a5 -r cf6a910e3663 tools/python/xen/xend/XendDomainInfo.py --- a/tools/python/xen/xend/XendDomainInfo.py Mon Mar 09 10:32:24 2009 +0000 +++ b/tools/python/xen/xend/XendDomainInfo.py Wed Mar 11 18:03:47 2009 +0900 @@ -1018,6 +1018,14 @@ """ for devclass in XendDevices.valid_devices(): self.getDeviceController(devclass).waitForDevices() + + def waitForAttachedDevices(self, devinfo): + """Wait for this domain''s configured devices to connect. + + @raise VmError: if any device fails to initialise. + """ + for devclass in XendDevices.valid_devices(): + self.getDeviceController(devclass).waitForAttachedDevices(devinfo) def hvm_destroyPCIDevice(self, vslot): log.debug("hvm_destroyPCIDevice called %s", vslot) _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Yoshiaki Tamura
2009-Mar-12 01:20 UTC
[Xen-devel] [RFC][PATCH 09/13] Kemari: add XenbusStateAttached to xenbus
This is an updated version of the following patch. No major changes. http://lists.xensource.com/archives/html/xen-devel/2009-03/msg00378.html Signed-off-by: Yoshi Tamura <tamura.yoshiaki@lab.ntt.co.jp> Signed-off-by: Yoshisato Yanagisawa <yanagisawa.yoshisato@lab.ntt.co.jp> --- drivers/xen/xenbus/xenbus_client.c | 3 ++- include/xen/interface/io/xenbus.h | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff -r 0430b1dbfb3a -r e183d2114ea1 include/xen/interface/io/xenbus.h --- a/include/xen/interface/io/xenbus.h Fri Mar 06 12:51:33 2009 +0000 +++ b/include/xen/interface/io/xenbus.h Tue Mar 10 15:40:44 2009 +0900 @@ -63,6 +63,8 @@ enum xenbus_state { */ XenbusStateReconfiguring = 7, - XenbusStateReconfigured = 8 + XenbusStateReconfigured = 8, + + XenbusStateAttached = 9 }; typedef enum xenbus_state XenbusState; diff -r 0430b1dbfb3a -r e183d2114ea1 drivers/xen/xenbus/xenbus_client.c --- a/drivers/xen/xenbus/xenbus_client.c Fri Mar 06 12:51:33 2009 +0000 +++ b/drivers/xen/xenbus/xenbus_client.c Tue Mar 10 15:40:44 2009 +0900 @@ -52,8 +52,9 @@ const char *xenbus_strstate(enum xenbus_ [ XenbusStateInitialised ] = "Initialised", [ XenbusStateConnected ] = "Connected", [ XenbusStateClosing ] = "Closing", - [ XenbusStateClosed ] = "Closed", + [ XenbusStateClosed ] = "Closed", + [ XenbusStateAttached ] = "Attached", }; return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID"; } _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Yoshiaki Tamura
2009-Mar-12 01:23 UTC
[Xen-devel] [RFC][PATCH 13/13] Kemari: use shared region to flip logdirty_bitmap
This is an updated version of the following patch. No major changes. http://lists.xensource.com/archives/html/xen-devel/2009-03/msg00382.html Signed-off-by: Yoshisato Yanagisawa <yanagisawa.yoshisato@lab.ntt.co.jp> Signed-off-by: Yoshi Tamura <tamura.yoshiaki@lab.ntt.co.jp> --- xenstore.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/xenstore.c b/xenstore.c index 928e950..4333c79 100644 --- a/xenstore.c +++ b/xenstore.c @@ -639,6 +639,8 @@ void xenstore_process_logdirty_event(void) static char *active_path = NULL; static char *next_active_path = NULL; static char *seg = NULL; + static char *kemari_qemu_info = NULL; + static char *qemu_file = NULL; unsigned int len; int i; @@ -705,6 +707,8 @@ void xenstore_process_logdirty_event(void) seg = NULL; return; } + kemari_qemu_info = seg + logdirty_bitmap_size * 2; + asprintf(&qemu_file, "/dev/shm/qemu-save.%d", domid); /* use tmpfs */ #endif /* Remember the paths for the next-active and active entries */ @@ -722,8 +726,32 @@ void xenstore_process_logdirty_event(void) } } +#ifndef CONFIG_STUBDOM + if (kemari_enabled) { + while (kemari_qemu_info[1]) + xen_rmb(); + + /* Switch buffers */ + i = kemari_qemu_info[0]; + if (i != 0 && i != 1) { + fprintf(logfile, "Log-dirty: bad next-active entry: %s\n", act); + exit(1); + } + logdirty_bitmap = (unsigned long *)(seg + i * logdirty_bitmap_size); + kemari_qemu_info[1] = 1; + xen_wmb(); + + /* Save QEMU status */ + while (kemari_qemu_info[2]) + xen_rmb(); + do_savevm(qemu_file); + kemari_qemu_info[2] = 1; + xen_wmb(); + return; + } +#endif /* !CONFIG_STUBDOM */ fprintf(logfile, "Triggered log-dirty buffer switch\n"); - + /* Read the required active buffer from the store */ act = xs_read(xsh, XBT_NULL, next_active_path, &len); if (!act) { _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Yoshiaki Tamura
2009-Mar-24 06:59 UTC
Re: [Xen-devel] [RFC][PATCH 05/13] Kemari: Kemari sender
This is an updated version of the following patch. It uses an event channel instead of a signal to notify buffer flip and order save of the QEMU status. http://lists.xensource.com/archives/html/xen-devel/2009-03/msg00749.html Signed-off-by: Yoshisato Yanagisawa <yanagisawa.yoshisato@lab.ntt.co.jp> Signed-off-by: Yoshi Tamura <tamura.yoshiaki@lab.ntt.co.jp> --- tools/libxc/xc_dom_kemari_save.c | 1114 +++++++++++++++++++++++++++++++++++++++ tools/xcutils/xc_kemari_save.c | 525 ++++++++++++++++++ 2 files changed, 1639 insertions(+) diff -r b249f3e979a5 -r 06b950859c92 tools/libxc/xc_dom_kemari_save.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/libxc/xc_dom_kemari_save.c Tue Mar 24 15:11:38 2009 +0900 @@ -0,0 +1,1114 @@ +/****************************************************************************** + * xc_dom_kemari_save.c + * + * Save the state of a running Linux session. + * + * Copyright (c) 2008 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * This source code is based on xc_domain_save.c. + * Copied BITS_PER_LONG, BITS_TO_LONGS, BITMAP_SIZE, BITMAP_SHIFT, + * RATE_IS_MAX, test_bit, clear_bit, set_bit, tv_delta, noncached_write, + * initialize_mbit_rate, and ratewrite from xc_domain_save.c + * + * Copyright (c) 2003, K A Fraser. 
+ */
+
+#include <inttypes.h>
+#include <time.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/time.h>
+
+#include "xc_private.h"
+#include "xc_dom.h"
+#include "xg_private.h"
+#include "xg_save_restore.h"
+
+#include <xen/hvm/params.h>
+#include "xc_e820.h"
+
+#ifdef __MINIOS__
+/*
+ * Replacements used when building with __MINIOS__ defined, in place of
+ * <sys/sendfile.h> and <sys/uio.h>.
+ * Caution: unlike the libc functions they stand in for, these
+ * replacements are not atomic.
+ */
+
+/*
+ * Copy up to count bytes from in_fd to out_fd through a small bounce
+ * buffer.  offset must be NULL; positioned reads are not supported.
+ *
+ * Returns the number of bytes copied (short if in_fd hits end-of-file
+ * first), or -1 on error.
+ */
+static ssize_t sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
+{
+    char buf[1024];
+    ssize_t len, wrote_len = 0;
+
+    if (offset != NULL) {
+        ERROR("Sorry sendfile for stubdomain should not have offset");
+        errno = EIO;
+        return -1;
+    }
+
+    while (count > 0) {
+        len = read(in_fd, buf, (count < sizeof(buf)) ? count : sizeof(buf));
+        if (len < 0)
+            return -1;
+        if (len == 0)
+            break; /* EOF: count would never shrink, so stop here */
+        if (write_exact(out_fd, buf, len))
+            return -1;
+        wrote_len += len;
+        count -= len;
+    }
+    return wrote_len;
+}
+
+#define IOV_MAX 1024
+struct iovec {
+    void *iov_base; /* Base address. */
+    size_t iov_len; /* Length. */
+};
+
+/*
+ * Write the iovcnt buffers described by iov to d, one write() per buffer.
+ *
+ * Returns the total number of bytes written; after a short write the
+ * partial total is returned immediately.  Returns -1 if iovcnt is out of
+ * range, if a write() fails, or if the total would overflow ssize_t.
+ */
+static ssize_t writev(int d, const struct iovec *iov, int iovcnt)
+{
+    int i;
+    ssize_t len, wrote_len = 0;
+
+    if (iovcnt < 0 || iovcnt > IOV_MAX) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    for (i = 0; i < iovcnt; i++) {
+        len = write(d, iov[i].iov_base, iov[i].iov_len);
+        if (len < 0)
+            return -1;
+
+        /* Check before adding: signed overflow is undefined behaviour. */
+        if (len > SSIZE_MAX - wrote_len) {
+            errno = EINVAL;
+            return -1;
+        }
+        wrote_len += len;
+
+        if ((size_t)len != iov[i].iov_len)
+            return wrote_len;
+    }
+
+    return wrote_len;
+}
+#else /* !__MINIOS__ */
+#include <sys/sendfile.h>
+#include <sys/uio.h>
+#endif /* __MINIOS__ */
+
+/* HVM: shared-memory bitmaps for getting log-dirty bits from qemu-dm */
+static unsigned long *qemu_bitmaps[2];
+static int qemu_active;
+static int qemu_non_active;
+
+/* number of pfns this guest has (i.e.
number of entries in the P2M) */
+static unsigned long p2m_size;
+
+/* page frame numbers */
+static unsigned long *pfn_type = NULL;
+
+/* The new domain's shared-info frame number. */
+static unsigned long shared_info_frame;
+
+/*
+ * guest memory
+ *
+ * The guest is mapped in regions of GUEST_MEM_ENTRY_SIZE frames:
+ *   guest_memory[r]        - base of the mapping of region r, or NULL
+ *   guest_memory_status[r] - per-frame array passed to
+ *                            xc_map_foreign_batch() and afterwards masked
+ *                            with XEN_DOMCTL_PFINFO_LTAB_MASK
+ *   guest_memory_size      - number of regions
+ */
+#define GUEST_MEM_ENTRY_SIZE 1024 /* up to 4MB at a time. */
+static unsigned char ** guest_memory = NULL;
+static unsigned long ** guest_memory_status = NULL;
+static unsigned long guest_memory_size = 0;
+
+/*
+ * Map region `base` of the guest (frames base * GUEST_MEM_ENTRY_SIZE up to
+ * (base + 1) * GUEST_MEM_ENTRY_SIZE - 1) read-only and record the mapping
+ * in guest_memory[base].
+ *
+ * Returns 0 on success, 1 if every frame of the region was reported as
+ * XEN_DOMCTL_PFINFO_XTAB (the mapping is dropped and guest_memory[base]
+ * is set to NULL), or -1 if xc_map_foreign_batch() failed.
+ */
+static inline int map_guest_mem(int xc_handle, uint32_t domid,
+                                unsigned long base)
+{
+    int j;
+    unsigned char * region_base;
+    unsigned long * pfn_base;
+
+    pfn_base = guest_memory_status[base];
+
+    /* Every entry is assigned here, so no separate clearing is needed. */
+    for (j = 0; j < GUEST_MEM_ENTRY_SIZE; j++) {
+        pfn_base[j] = base * GUEST_MEM_ENTRY_SIZE + j;
+    }
+    region_base = xc_map_foreign_batch(
+        xc_handle, domid, PROT_READ, pfn_base, GUEST_MEM_ENTRY_SIZE);
+    if ( region_base == NULL )
+    {
+        PERROR("map failed at guest memory frame 0x%lx - 0x%lx (%lu)",
+               base * GUEST_MEM_ENTRY_SIZE, (base + 1)* GUEST_MEM_ENTRY_SIZE - 1,
+               base);
+        return -1;
+    }
+
+    /* Look for and skip completely empty batches. */
+    for ( j = 0; j < GUEST_MEM_ENTRY_SIZE; j++ )
+        pfn_base[j] &= XEN_DOMCTL_PFINFO_LTAB_MASK;
+    for ( j = 0; j < GUEST_MEM_ENTRY_SIZE; j++ )
+        if ( pfn_base[j] != XEN_DOMCTL_PFINFO_XTAB )
+            break;
+    if ( j == GUEST_MEM_ENTRY_SIZE )
+    {
+        munmap(region_base, GUEST_MEM_ENTRY_SIZE*PAGE_SIZE);
+        guest_memory[base] = NULL;
+        return 1;
+    }
+
+    guest_memory[base] = region_base;
+
+    return 0;
+}
+
+/*
+ * Return a pointer to the mapped page for frame number `mfn`, (re)mapping
+ * the region that contains it when the region is not mapped or the frame
+ * was marked XEN_DOMCTL_PFINFO_XTAB by the previous mapping attempt.
+ *
+ * Returns NULL if the frame is outside the tracked range or the region
+ * cannot be mapped (including the case where the whole region is XTAB).
+ *
+ * NOTE: other XEN_DOMCTL_PFINFO_LTAB_MASK type bits are not examined
+ * here, which may cause problems for frames with special types.
+ */
+static inline unsigned char * search_guest_mem(int xc_handle, uint32_t domid,
+                                               unsigned long mfn)
+{
+    unsigned long base = mfn / GUEST_MEM_ENTRY_SIZE;
+    unsigned long offset = mfn % GUEST_MEM_ENTRY_SIZE;
+
+    if (base >= guest_memory_size) {
+        ERROR("Error base(%lu) is greater than guest_memory_size(%lu)\n",
+              base, guest_memory_size);
+        return NULL;
+    }
+
+    if ( guest_memory_status[base][offset] == XEN_DOMCTL_PFINFO_XTAB ) {
+        /* reload XTAB place */
+        if (guest_memory[base] != NULL) {
+            munmap(guest_memory[base], GUEST_MEM_ENTRY_SIZE*PAGE_SIZE);
+            guest_memory[base] = NULL;
+        }
+        DPRINTF("guest_memory[%lu] (frame 0x%lx - 0x%lx) will be remapped\n",
+                base, base * GUEST_MEM_ENTRY_SIZE,
+                (base + 1) * GUEST_MEM_ENTRY_SIZE - 1);
+    }
+
+    /* map_guest_mem() takes the region index, not the in-region offset. */
+    if (guest_memory[base] == NULL)
+        if (map_guest_mem(xc_handle, domid, base))
+            return NULL;
+
+    return guest_memory[base] + offset * PAGE_SIZE;
+}
+
+/*
+ * Allocate and lock the guest_memory / guest_memory_status tables for a
+ * guest of p2m_size frames, then map every region.
+ *
+ * Returns 0 on success, -1 on allocation, locking or mapping failure.
+ * Regions that consist only of XTAB frames are left unmapped and are not
+ * treated as an error.
+ */
+static inline int init_guest_mem(int xc_handle, uint32_t dom)
+{
+    int i;
+
+    guest_memory_size = p2m_size / GUEST_MEM_ENTRY_SIZE + 1;
+    DPRINTF("guest_memory_size: %lu\n", guest_memory_size);
+
+    /* mapped memory */
+    guest_memory = xg_memalign(PAGE_SIZE,
+                               guest_memory_size * sizeof(guest_memory[0]));
+    if (guest_memory == NULL)
+    {
+        PERROR("failed to allocate guest_memory");
+        return -1;
+    }
+    if ( lock_pages(guest_memory, guest_memory_size * sizeof(guest_memory[0])))
+    {
+        ERROR("Unable to lock guest_memory array");
+        return -1;
+    }
+
+    /* memory status */
+    guest_memory_status = xg_memalign(PAGE_SIZE,
+        guest_memory_size * sizeof(guest_memory_status[0]));
+    if ( guest_memory_status == NULL )
+    {
+        ERROR("failed to alloc memory for guest_memory_status");
+        errno = ENOMEM;
+        return -1;
+    }
+    if ( lock_pages(guest_memory_status,
+                    guest_memory_size * sizeof(guest_memory_status[0])))
+    {
+        ERROR("Unable to lock guest_memory_status array");
+        return -1;
+    }
+
+    for (i = 0; i < guest_memory_size; i++) {
+        guest_memory_status[i] = xg_memalign(PAGE_SIZE,
+            GUEST_MEM_ENTRY_SIZE * sizeof(guest_memory_status[0][0]));
+        if (guest_memory_status[i] == NULL) {
+            ERROR("failed to alloc memory for guest_memory_status[%d]", i);
+            errno = ENOMEM;
+            return -1;
+        }
+        /* Lock the per-region array just allocated, with its own size. */
+        if ( lock_pages(guest_memory_status[i],
+                        GUEST_MEM_ENTRY_SIZE * sizeof(guest_memory_status[0][0])))
+        {
+            ERROR("Unable to lock guest_memory_status[%d]", i);
+            return -1;
+        }
+    }
+
+    for (i = 0; i < guest_memory_size; i++)
+        if (map_guest_mem(xc_handle, dom, i) < 0)
+            return -1;
+
+    return 0;
+}
+
+/*
+ * Write all `count` buffers of iov to fd with a single writev() call.
+ *
+ * Returns 0 if writev() reported exactly the summed length, -1 otherwise.
+ * NOTE(review): a short write is reported as failure rather than retried;
+ * confirm that callers only use this on descriptors where that is
+ * acceptable.
+ */
+static int writev_exact(int fd, const struct iovec *iov, size_t count)
+{
+    size_t i;
+    size_t sum = 0;
+    ssize_t written;
+
+    for (i = 0; i < count; i++)
+        sum += iov[i].iov_len;
+
+    written = writev(fd, iov, count);
+    if (written < 0 || (size_t)written != sum)
+        return -1;
+
+    return 0;
+}
+
+/* grep fodder: machine_to_phys */
+
+
+/*
+** During (live) save/migrate, we maintain a number of bitmaps to track
+** which pages we have to send, to fixup, and to skip.
+*/ + +#define BITS_PER_LONG (sizeof(unsigned long) * 8) +#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG) +#define BITMAP_SIZE (BITS_TO_LONGS(p2m_size) * sizeof(unsigned long)) + +#define BITMAP_ENTRY(_nr,_bmap) \ + ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG] + +#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG) + +static inline int test_bit (int nr, volatile void * addr) +{ + return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1; +} + +static inline void clear_bit (int nr, volatile void * addr) +{ + BITMAP_ENTRY(nr, addr) &= ~(1UL << BITMAP_SHIFT(nr)); +} + +static inline void set_bit ( int nr, volatile void * addr) +{ + BITMAP_ENTRY(nr, addr) |= (1UL << BITMAP_SHIFT(nr)); +} + +static uint64_t tv_delta(struct timeval *new, struct timeval *old) +{ + return (((new->tv_sec - old->tv_sec)*1000000) + + (new->tv_usec - old->tv_usec)); +} + +static int noncached_write(int fd, void *buffer, int len) +{ + static int write_count = 0; + int rc = (write_exact(fd, buffer, len) == 0) ? len : -1; + + write_count += len; + if ( write_count >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) ) + { + /* Time to discard cache - dont care if this fails */ + discard_file_cache(fd, 0 /* no flush */); + write_count = 0; + } + + return rc; +} + +#ifdef ADAPTIVE_SAVE + +/* +** We control the rate at which we transmit (or save) to minimize impact +** on running domains (including the target if we''re doing live migrate). +*/ + +#define MAX_MBIT_RATE 500 /* maximum transmit rate for migrate */ +#define START_MBIT_RATE 100 /* initial transmit rate for migrate */ + +/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */ +#define RATE_TO_BTU 781250 + +/* Amount in bytes we allow ourselves to send in a burst */ +#define BURST_BUDGET (100*1024) + +/* We keep track of the current and previous transmission rate */ +static int mbit_rate, ombit_rate = 0; + +/* Have we reached the maximum transmission rate? 
*/ +#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE) + +static inline void initialize_mbit_rate() +{ + mbit_rate = START_MBIT_RATE; +} + +static int ratewrite(int io_fd, void *buf, int n) +{ + static int budget = 0; + static int burst_time_us = -1; + static struct timeval last_put = { 0 }; + struct timeval now; + struct timespec delay; + long long delta; + + if ( START_MBIT_RATE == 0 ) + return noncached_write(io_fd, buf, n); + + budget -= n; + if ( budget < 0 ) + { + if ( mbit_rate != ombit_rate ) + { + burst_time_us = RATE_TO_BTU / mbit_rate; + ombit_rate = mbit_rate; + DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n", + mbit_rate, BURST_BUDGET, burst_time_us); + } + if ( last_put.tv_sec == 0 ) + { + budget += BURST_BUDGET; + gettimeofday(&last_put, NULL); + } + else + { + while ( budget < 0 ) + { + gettimeofday(&now, NULL); + delta = tv_delta(&now, &last_put); + while ( delta > burst_time_us ) + { + budget += BURST_BUDGET; + last_put.tv_usec += burst_time_us; + if ( last_put.tv_usec > 1000000 ) + { + last_put.tv_usec -= 1000000; + last_put.tv_sec++; + } + delta -= burst_time_us; + } + if ( budget > 0 ) + break; + delay.tv_sec = 0; + delay.tv_nsec = 1000 * (burst_time_us - delta); + while ( delay.tv_nsec > 0 ) + if ( nanosleep(&delay, &delay) == 0 ) + break; + } + } + } + return noncached_write(io_fd, buf, n); +} + +#else /* ! 
ADAPTIVE SAVE */ + +#define RATE_IS_MAX() (0) +#define ratewrite(_io_fd, _buf, _n) noncached_write((_io_fd), (_buf), (_n)) +#define initialize_mbit_rate() + +#endif + +static int print_stats(int xc_handle, uint32_t domid, int pages_sent, + xc_shadow_op_stats_t *stats, int print) +{ + static struct timeval wall_last; + static long long d0_cpu_last; + static long long d1_cpu_last; + + struct timeval wall_now; + long long wall_delta; + long long d0_cpu_now, d0_cpu_delta; + long long d1_cpu_now, d1_cpu_delta; + + gettimeofday(&wall_now, NULL); + + d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0)/1000; + d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0)/1000; + + if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) ) + DPRINTF("ARRHHH!!\n"); + + wall_delta = tv_delta(&wall_now,&wall_last)/1000; + if ( wall_delta == 0 ) + wall_delta = 1; + + d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000; + d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000; + + if ( print ) + DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, " + "dirtied %dMb/s %" PRId32 " pages\n", + wall_delta, + (int)((d0_cpu_delta*100)/wall_delta), + (int)((d1_cpu_delta*100)/wall_delta), + (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))), + (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))), + stats->dirty_count); + +#ifdef ADAPTIVE_SAVE + if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate ) + { + mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) + + 50; + if ( mbit_rate > MAX_MBIT_RATE ) + mbit_rate = MAX_MBIT_RATE; + } +#endif + + d0_cpu_last = d0_cpu_now; + d1_cpu_last = d1_cpu_now; + wall_last = wall_now; + + return 0; +} + +static int send_qemu_image(int xc_handle, int io_fd, uint32_t dom) +{ + char path[128]; + struct stat st; + struct { + int minusfour; + uint32_t image_size; + } chunk = { -1, 0 }; + int qemu_fd; + int rc = -1; + + snprintf(path, sizeof(path), "/dev/shm/qemu-save.%d", dom); + if ((qemu_fd = open(path, 
O_RDONLY)) == -1) + { + PERROR("Error when opening qemu image %s", path); + goto out; + } + + if (fstat(qemu_fd, &st) == -1) + { + PERROR("Error fstat qemu file %s", path); + goto out; + } + chunk.image_size = st.st_size; + + if ( write_exact(io_fd, &chunk, sizeof(chunk)) ) + { + PERROR("Error when writing header for qemu image"); + goto out; + } + + if ( sendfile(io_fd, qemu_fd, NULL, chunk.image_size) !+ chunk.image_size) + { + PERROR("Error when writing qemu image"); + goto out; + } + close(qemu_fd); + + rc = 0; +out: + return rc; +} + +static int send_hvm_params(int xc_handle, int io_fd, uint32_t dom) +{ + struct { + int id; + uint32_t pad; + uint64_t data; + } chunk = { 0, 0 }; + + chunk.id = -3; + xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT, + &chunk.data); + + if ( (chunk.data != 0) && + write_exact(io_fd, &chunk, sizeof(chunk)) ) + { + PERROR("Error when writing the ident_pt for EPT guest"); + return -1; + } + + chunk.id = -4; + xc_get_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS, + &chunk.data); + + if ( (chunk.data != 0) && + write_exact(io_fd, &chunk, sizeof(chunk)) ) + { + PERROR("Error when writing the vm86 TSS for guest"); + return -1; + } + + return 0; +} + +static int send_hvm_context(int xc_handle, int io_fd, + struct kemari_ring *ring, uint32_t dom) +{ + uint32_t buf_size = ring->hvm_ctxt.buf_size; + uint32_t rec_size = ring->hvm_ctxt.rec_size; + uint8_t *hvm_buf = (uint8_t *)ring + ring->hvm_ctxt.buf_offset; + int rc = -1; + + /* Get HVM context from Xen and save it too */ + if ( (rec_size = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf, + buf_size)) == -1 ) + { + ERROR("HVM:Could not get hvm buffer"); + goto out; + } + + if ( write_exact(io_fd, &rec_size, sizeof(uint32_t)) ) + { + PERROR("error write hvm buffer size"); + goto out; + } + + if ( write_exact(io_fd, hvm_buf, rec_size) ) + { + PERROR("write HVM info failed!\n"); + goto out; + } + rc = 0; + +out: + return rc; +} + +int xc_kemari_save(int xc_handle, int io_fd, uint32_t dom, 
+ void *kemari_ring, uint32_t flags, + int hvm, void *(*init_qemu_maps)(int, unsigned)) +{ + int rc = 1, i, j, iter = 0; + int debug = (flags & XCFLAGS_DEBUG); + int sent_last_iter, skip_this_iter; + xc_dominfo_t info; + struct kemari_ring *ring = (struct kemari_ring *)kemari_ring; + + /* base of the region in which domain memory is mapped */ + unsigned char *region_base = NULL; + + /* bitmap of pages: + - that should be sent this iteration (unless later marked as skip); + - to skip this iteration because already dirty; + - to fixup by sending at the end if not already resent; */ + unsigned long *to_send = NULL, *to_fix = NULL; + + xc_shadow_op_stats_t stats; + + unsigned long needed_to_fix = 0; + unsigned long total_sent = 0; + + /* HVM: magic frames for ioreqs and xenstore comms. */ + uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */ + + /* callback irq */ + uint64_t callback_irq = 0; + + if ( !hvm ) + { + ERROR("HVM domain is required for the kemari migration."); + return 1; + } + + initialize_mbit_rate(); + + if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 ) + { + ERROR("Could not get domain info"); + return 1; + } + + shared_info_frame = info.shared_info_frame; + DPRINTF("xc_kemari_save: shared_info_frame: %lu\n", shared_info_frame); + + /* Get the size of the P2M table */ + p2m_size = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &dom) + 1; + DPRINTF("xc_kemari_save: p2m_size: %lu\n", p2m_size); + + /* Domain is still running at this point */ + { + /* Get qemu-dm logging dirty pages too */ + void *seg = init_qemu_maps(dom, BITMAP_SIZE); + qemu_bitmaps[0] = seg; + qemu_bitmaps[1] = seg + BITMAP_SIZE; + qemu_active = 0; + qemu_non_active = 1; + } + + /* pretend we sent all the pages last iteration */ + sent_last_iter = p2m_size; + + /* Setup to_send / to_fix bitmaps */ + to_send = xg_memalign(PAGE_SIZE, ROUNDUP(BITMAP_SIZE, PAGE_SHIFT)); + to_fix = calloc(1, BITMAP_SIZE); + + if ( !to_send || !to_fix ) + { + ERROR("Couldn''t allocate to_send 
array"); + goto out; + } + + memset(to_send, 0xff, BITMAP_SIZE); + + if ( lock_pages(to_send, BITMAP_SIZE) ) + { + ERROR("Unable to lock to_send"); + return 1; + } + + pfn_type = xg_memalign(PAGE_SIZE, ROUNDUP( + MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT)); + if ( pfn_type == NULL ) + { + ERROR("failed to alloc memory for pfn_type arrays"); + errno = ENOMEM; + goto out; + } + memset(pfn_type, 0, + ROUNDUP(MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT)); + + if ( lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type)) ) + { + ERROR("Unable to lock pfn_type array"); + goto out; + } + + /* Start writing out the saved-domain record. */ + if ( write_exact(io_fd, &p2m_size, sizeof(unsigned long)) ) + { + PERROR("write: p2m_size"); + goto out; + } + + /* send shared_info_frame */ + if ( write_exact(io_fd, &shared_info_frame, sizeof(unsigned long)) ) + { + PERROR("write: shared_info_frame"); + goto out; + } + + /* Save magic-page locations. */ + memset(magic_pfns, 0, sizeof(magic_pfns)); + xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN, + &magic_pfns[0]); + xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN, + &magic_pfns[1]); + xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN, + &magic_pfns[2]); + DPRINTF("kemari_restore: magic_pfns 0: %lld, 1: %lld, 2: %lld\n", + magic_pfns[0], magic_pfns[1], magic_pfns[2]); + if ( write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) ) + { + PERROR("Error when writing to state file (7)"); + goto out; + } + + xc_get_hvm_param(xc_handle, dom, HVM_PARAM_CALLBACK_IRQ, + &callback_irq); + DPRINTF("kemari_restore: callback irq %llx", callback_irq); + if ( write_exact(io_fd, &callback_irq, sizeof(callback_irq)) ) + { + PERROR("Error when writing to state file (8)"); + goto out; + } + + print_stats(xc_handle, dom, 0, &stats, 0); + + /* Now write out each data page, canonicalising page tables as we go... 
*/ + { + unsigned int prev_pc, sent_this_iter, N, batch, run; + + iter++; + sent_this_iter = 0; + skip_this_iter = 0; + prev_pc = 0; + N = 0; + + DPRINTF("Saving memory pages: iter %d 0%%", iter); + + while ( N < p2m_size ) + { + unsigned int this_pc = (N * 100) / p2m_size; + + if ( (this_pc - prev_pc) >= 5 ) + { + DPRINTF("\b\b\b\b%3d%%", this_pc); + prev_pc = this_pc; + } + + /* load pfn_type[] with the mfn of all the pages we''re doing in + this batch. */ + for ( batch = 0; + (batch < MAX_BATCH_SIZE) && (N < p2m_size); + N++ ) + { + int n = N; + + if ( debug ) + { + DPRINTF("%d pfn= %08lx mfn= %08lx %d", + iter, (unsigned long)n, + (long unsigned int)0, + test_bit(n, to_send)); + DPRINTF("\n"); + } + + if ( !( (test_bit(n, to_send)) || (test_bit(n, to_fix))) ) + continue; + + /* + ** we get here if: + ** 1. page is marked to_send & hasn''t already been re-dirtied + ** 2. add in pages that still need fixup (net bufs) + */ + + /* Hypercall interfaces operate in PFNs for HVM guests + * and MFNs for PV guests */ + pfn_type[batch] = n; + + if ( !is_mapped(pfn_type[batch]) ) + { + /* + ** not currently in psuedo-physical map -- set bit + ** in to_fix since we must send this page in last_iter + ** unless its sent sooner anyhow, or it never enters + ** pseudo-physical map (e.g. for ballooned down doms) + */ + set_bit(n, to_fix); + continue; + } + + if ( test_bit(n, to_fix) && + !test_bit(n, to_send) ) + { + needed_to_fix++; + DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n", + iter, n, pfn_type[batch]); + } + + clear_bit(n, to_fix); + + batch++; + } + + if ( batch == 0 ) + goto skip; /* vanishingly unlikely... */ + + region_base = xc_map_foreign_batch( + xc_handle, dom, PROT_READ, pfn_type, batch); + if ( region_base == NULL ) + { + ERROR("map batch failed"); + goto out; + } + + { + /* Look for and skip completely empty batches. 
*/ + for ( j = 0; j < batch; j++ ) + if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) !+ XEN_DOMCTL_PFINFO_XTAB ) + break; + if ( j == batch ) + { + munmap(region_base, batch*PAGE_SIZE); + continue; /* bail on this batch: no valid pages */ + } + } + + if ( write_exact(io_fd, &batch, sizeof(unsigned int)) ) + { + PERROR("Error when writing to state file (2)"); + goto out; + } + + if ( write_exact(io_fd, pfn_type, sizeof(unsigned long)*batch) ) + { + PERROR("Error when writing to state file (3)"); + goto out; + } + + /* entering this loop, pfn_type is now in pfns (Not mfns) */ + run = 0; + for ( j = 0; j < batch; j++ ) + { + unsigned long pfn, pagetype; + + pfn = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK; + pagetype = pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK; + + if ( pagetype != 0 ) + { + /* If the page is not a normal data page, write out any + run of pages we may have previously acumulated */ + if ( run ) + { + if ( ratewrite(io_fd, + (char*)region_base+(PAGE_SIZE*(j-run)), + PAGE_SIZE*run) != PAGE_SIZE*run ) + { + ERROR("Error when writing to state file (4a)" + " (errno %d)", errno); + goto out; + } + run = 0; + } + } + + /* skip pages that aren''t present */ + if ( pagetype == XEN_DOMCTL_PFINFO_XTAB ) + continue; + + pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK; + + if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) && + (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) ) + { + DPRINTF("canonicalize_pagetable pagetype = %lx pfn = %lu\n", pagetype, pfn); + } + else + { + /* We have a normal page: accumulate it for writing. 
*/ + run++; + } + } /* end of the write out for this batch */ + + if ( run ) + { + /* write out the last accumulated run of pages */ + if ( ratewrite(io_fd, + (char*)region_base+(PAGE_SIZE*(j-run)), + PAGE_SIZE*run) != PAGE_SIZE*run ) + { + ERROR("Error when writing to state file (4c)" + " (errno %d)", errno); + goto out; + } + } + + sent_this_iter += batch; + + munmap(region_base, batch*PAGE_SIZE); + + } /* end of this while loop for this iteration */ + + skip: + + total_sent += sent_this_iter; + + DPRINTF("\r %d: sent %d, skipped %d, ", + iter, sent_this_iter, skip_this_iter ); + + { + print_stats( xc_handle, dom, sent_this_iter, &stats, 1); + + DPRINTF("Total pages sent= %ld (%.2fx)\n", + total_sent, ((float)total_sent)/p2m_size ); + DPRINTF("(of which %ld were fixups)\n", needed_to_fix ); + } + } /* end of infinite for loop */ + + DPRINTF("All memory is saved\n"); + + if (send_hvm_params(xc_handle, io_fd, dom) < 0) + goto out; + + /* Zero terminate */ + i = 0; + if ( write_exact(io_fd, &i, sizeof(int)) ) + { + PERROR("Error when writing to state file (6'')"); + goto out; + } + + if (send_hvm_context(xc_handle, io_fd, ring, dom) < 0) + goto out; + + if (!debug) + { + int rcv_status; + if ( read_exact(io_fd, &rcv_status, sizeof(int))) { + ERROR("Error when reading receiver status"); + goto out; + } + DPRINTF("status received: %d\n", rcv_status); + } + + if (init_guest_mem(xc_handle, dom) < 0) + goto out; + + /* HVM guests are done now */ + rc = 0; + + out: + + /* Flush last write and discard cache for file. 
*/ + discard_file_cache(io_fd, 1 /* flush */); + + free(to_send); + free(to_fix); + + DPRINTF("Save exit rc=%d\n",rc); + + return !!rc; +} + + +int xc_kemari_update(int xc_handle, int io_fd, uint32_t dom, + void *kemari_ring, uint32_t flags, + void (*qemu_save_image)(int), + void (*qemu_end_flip)(void), + void (*qemu_end_save)(void), + void (*qemu_image_sent)(void)) +{ + int rc = 1, k; + int debug = (flags & XCFLAGS_DEBUG); + uint32_t i, j, index = 0; + unsigned int batch = 0; + struct kemari_ring *ring = (struct kemari_ring *)kemari_ring; + struct kemari_ent *buf; + struct iovec iov[MAX_BATCH_SIZE + 2]; /* 2 for batch and pfn_type */ + int iovcnt = 2; + +#define ADD_IOV(base, len) do { \ + iov[iovcnt].iov_base = base; \ + iov[iovcnt].iov_len = len; \ + iovcnt++; \ +} while (0) + + + + /* flip active qemu */ + qemu_active = qemu_non_active; + qemu_non_active = qemu_active ? 0 : 1; + qemu_save_image(qemu_active); + + /* + * main iteration starts from here + */ + while (ring->cons < ring->prod) { + + kemari_ring_read(ring, &buf); + + for (i = buf->u.index.start, j = buf->u.index.end; i < j; i++) { + + int next, offset = 0; + + index = i * BITS_PER_LONG; + + kemari_ring_read(ring, &buf); + + while (buf->u.dirty_bitmap && offset < BITS_PER_LONG) { + int n; + next = ffs(buf->u.dirty_bitmap); + buf->u.dirty_bitmap >>= next; + offset += next; + n = offset + index - 1; + ADD_IOV(search_guest_mem(xc_handle, dom, n), PAGE_SIZE); + pfn_type[batch] = n; + batch++; + } + + if ((batch + BITS_PER_LONG - 1 < MAX_BATCH_SIZE) && + !(ring->cons == ring->prod)) + continue; + + /* Pull in the dirty bits from qemu-dm too */ + qemu_end_flip(); + for ( k = 0; k < BITMAP_SIZE / BITS_PER_LONG; k++) { + if (qemu_bitmaps[qemu_non_active][k] != 0) { + unsigned int bmp = qemu_bitmaps[qemu_non_active][k]; + + index = k * BITS_PER_LONG; + while (bmp && offset < BITS_PER_LONG) { + int n, next, offset = 0; + next = ffs(bmp); + bmp >>= next; + offset += next; + n = offset + index - 1; + + 
ADD_IOV(search_guest_mem(xc_handle, dom, n), PAGE_SIZE); + pfn_type[batch] = n; + batch++; + } + qemu_bitmaps[qemu_non_active][k] = 0; + } + if (batch >= MAX_BATCH_SIZE) { + ERROR("Sorry, reached MAX_BATCH_SIZE. " + "We will fix this lator."); + goto out; + } + } + + PPRINTF("batch %d\n", batch); + + /* send pages */ + iov[0].iov_base = &batch; + iov[0].iov_len = sizeof(batch); + + iov[1].iov_base = pfn_type; + iov[1].iov_len = sizeof(pfn_type[0]) * batch; + + for (k = 0; k < iovcnt / IOV_MAX + 1; k++) { + int count = (iovcnt<IOV_MAX*(k+1))?(iovcnt-IOV_MAX*k):IOV_MAX; + if (writev_exact(io_fd, &iov[IOV_MAX * k], count)) { + ERROR("Error when writing pages state file (2--4)" + " (errno %d)", errno); + goto out; + } + } + + batch = 0; + } + } + + if (send_hvm_params(xc_handle, io_fd, dom) < 0) + goto out; + qemu_end_save(); + if (!debug && send_qemu_image(xc_handle, io_fd, dom) < 0) + goto out; + qemu_image_sent(); + + /* Zero terminate */ + i = 0; + if ( write_exact(io_fd, &i, sizeof(int)) ) + { + PERROR("Error when writing to state file (6'')"); + goto out; + } + + if (send_hvm_context(xc_handle, io_fd, ring, dom) < 0) + goto out; + + if (!debug) + { + int rcv_status; + if ( read_exact(io_fd, &rcv_status, sizeof(int))) { + ERROR("Error when reading receiver status"); + goto out; + } + } + + rc = 0; +out: + + return rc; +} + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r b249f3e979a5 -r 06b950859c92 tools/xcutils/xc_kemari_save.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/xcutils/xc_kemari_save.c Tue Mar 24 15:11:38 2009 +0900 @@ -0,0 +1,525 @@ +/* + * xc_kemari_save.c + * + * Save the state of a running Linux session. + * + * Copyright (c) 2008-2009 Nippon Telegraph and Telephone Corporation. + * + * This file is subject to the terms and conditions of the GNU General + * Public License. 
See the file "COPYING" in the main directory of + * this archive for more details. + * + * This source code is based on xc_save.c. + * Copied qemu_destroy_buffer and init_qemu_maps from xc_save.c. + * + * Copyright (C) 2005 by Christian Limpach + * + */ + + +#include <err.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <stdio.h> +#include <sys/ipc.h> +#include <sys/shm.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <signal.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <netinet/tcp.h> + +#include <xs.h> +#include <xenctrl.h> +#include <xenguest.h> +#include <xc_private.h> +#include <xen/kemari.h> + +static volatile sig_atomic_t run = 1; +static int xc_handle, xce_handle, io_fd; +static struct kemari_ring *ring = NULL; +static uint32_t kemari_ring_size = 0; +static int qemu_port; +static int is_finalized = 0; +static int domid; + +/* For HVM guests, there are two sources of dirty pages: the Xen shadow + * log-dirty bitmap, which we get with a hypercall, and qemu''s version. + * The protocol for getting page-dirtying data from qemu uses a + * double-buffered shared memory interface directly between xc_save and + * qemu-dm. + * + * xc_save calculates the size of the bitmaps and notifies qemu-dm + * through the store that it wants to share the bitmaps. qemu-dm then + * starts filling in the ''active'' buffer. + * + * To change the buffers over, xc_save writes the other buffer number to + * the store and waits for qemu to acknowledge that it is now writing to + * the new active buffer. xc_save can then process and clear the old + * active buffer. 
*/ + +static char *qemu_active_path; +static char *qemu_next_active_path; +static int qemu_shmid = -1; +static struct xs_handle *xs; + + +/* Mark the shared-memory segment for destruction */ +static void qemu_destroy_buffer(void) +{ + if (qemu_shmid != -1) + shmctl(qemu_shmid, IPC_RMID, NULL); + qemu_shmid = -1; +} + +static char *kemari_qemu_info = NULL; +static void qemu_save_image(int next_active) +{ + kemari_qemu_info[0] = next_active; + kemari_qemu_info[1] = 0; + xen_wmb(); + xc_evtchn_notify(xce_handle, qemu_port); +} + +static void qemu_end_flip(void) +{ + while (kemari_qemu_info[1] == 0) + xen_rmb(); +} + +static void qemu_end_save(void) +{ + while (kemari_qemu_info[2] == 0) + xen_rmb(); +} + +static void qemu_image_sent(void) +{ + /* after QEMU image sent */ + kemari_qemu_info[2] = 0; + xen_wmb(); +} + +static void *init_qemu_maps(int domid, unsigned int bitmap_size) +{ + key_t key; + char key_ascii[17] = {0,}; + void *seg; + char *path, *p; + + /* Make a shared-memory segment */ + do { + key = rand(); /* No security, just a sequence of numbers */ + qemu_shmid = shmget(key, 2 * bitmap_size + PAGE_SIZE, + IPC_CREAT|IPC_EXCL|S_IRUSR|S_IWUSR); + if (qemu_shmid == -1 && errno != EEXIST) + errx(1, "can''t get shmem to talk to qemu-dm"); + } while (qemu_shmid == -1); + + /* Remember to tidy up after ourselves */ + atexit(qemu_destroy_buffer); + + /* Map it into our address space */ + seg = shmat(qemu_shmid, NULL, 0); + if (seg == (void *) -1) + errx(1, "can''t map shmem to talk to qemu-dm"); + memset(seg, 0, 2 * bitmap_size + PAGE_SIZE); + + /* Write the size of it into the first 32 bits */ + *(uint32_t *)seg = bitmap_size; + + /* Tell qemu about it */ + if ((xs = xs_daemon_open()) == NULL) + errx(1, "Couldn''t contact xenstore"); + if (!(path = strdup("/local/domain/0/device-model/"))) + errx(1, "can''t get domain path in store"); + if (!(path = realloc(path, strlen(path) + + 10 + + strlen("/logdirty/next-active") + 1))) + errx(1, "no memory for constructing 
xenstore path"); + snprintf(path + strlen(path), 11, "%i", domid); + strcat(path, "/logdirty/"); + p = path + strlen(path); + + strcpy(p, "key"); + snprintf(key_ascii, 17, "%16.16llx", (unsigned long long) key); + if (!xs_write(xs, XBT_NULL, path, key_ascii, 16)) + errx(1, "can''t write key (%s) to store path (%s)\n", key_ascii, path); + + /* Watch for qemu''s indication of the active buffer, and request it + * to start writing to buffer 0 */ + strcpy(p, "active"); + if (!xs_watch(xs, path, "qemu-active-buffer")) + errx(1, "can''t set watch in store (%s)\n", path); + if (!(qemu_active_path = strdup(path))) + errx(1, "no memory for copying xenstore path"); + + strcpy(p, "next-active"); + if (!(qemu_next_active_path = strdup(path))) + errx(1, "no memory for copying xenstore path"); + + kemari_qemu_info = seg + 2 * bitmap_size; + xen_wmb(); + qemu_save_image(0); + + free(path); + return seg; +} + +static void close_handler(int sig_type) +{ + run = 0; +} + +static int handle_event(int domid, unsigned int flags) +{ + int ret = 1, rcv_port; + + if ((rcv_port = xc_evtchn_pending(xce_handle)) < 0) { + ERROR("Failed to read from event fd"); + goto out; + } + + if (xc_kemari_update(xc_handle, io_fd, domid, ring, flags, + qemu_save_image, qemu_end_flip, qemu_end_save, qemu_image_sent) != 0) { + xc_domain_pause(xc_handle, domid); + ERROR("xc_kemari_update failed"); + goto out; + } + + if (xc_evtchn_unmask(xce_handle, rcv_port) < 0) { + ERROR("Failed to write to event fd"); + goto out; + } + + ret = 0; +out: + return ret; +} + +static void set_signal_handler(void (*handler)(int)) +{ + struct sigaction act; + + act.sa_handler = handler; + sigemptyset(&act.sa_mask); + act.sa_flags = 0; + sigaction(SIGQUIT, &act, 0); + sigaction(SIGINT, &act, 0); + sigaction(SIGHUP, &act, 0); + sigaction(SIGTERM, &act, 0); +} + +static int attach_ports(int domid) +{ + struct xs_handle *xs_handle; + char **list, *data; + unsigned int list_size, data_size; + char path[128]; + uint32_t port; + int i, 
ret = 1; + + if ((xs_handle = xs_daemon_open()) == NULL) + errx(1, "Couldn''t contact xenstore"); + + /* + * attach block port. + */ + snprintf(path, sizeof(path), "/local/domain/%d/device/vbd", domid); + list = xs_directory(xs_handle, XBT_NULL, path, &list_size); + if (list == NULL) + errx(1, "xs_directory (%s) failed", path); + + for (i = 0; i < list_size; i++) { + snprintf(path, sizeof(path), + "/local/domain/%d/device/vbd/%s/event-channel", domid, list[i]); + data = xs_read(xs_handle, XBT_NULL, path, &data_size); + if (data == NULL) + continue; + port = strtoul(data, NULL, 10); + if ((ret = xc_kemari_control(xc_handle, domid, XEN_KEMARI_OP_attach, + &port, NULL, + NULL, KEMARI_TAP_OUT)) != 0) { + ERROR("Error when attaching blk_port (%d) on kemari", port); + goto out; + } + free(data); + DPRINTF("blk_port %d attached\n", port); + } + free(list); + + /* + * attach net port. + */ + snprintf(path, sizeof(path), "/local/domain/%d/device/vif", domid); + list = xs_directory(xs_handle, XBT_NULL, path, &list_size); + if (list == NULL) + errx(1, "xs_directory (%s) failed", path); + + for (i = 0; i < list_size; i++) { + snprintf(path, sizeof(path), + "/local/domain/%d/device/vif/%s/event-channel", domid, list[i]); + data = xs_read(xs_handle, XBT_NULL, path, &data_size); + if (data == NULL) + continue; + port = strtoul(data, NULL, 10); + if ((ret = xc_kemari_control(xc_handle, domid, XEN_KEMARI_OP_attach, + &port, NULL, + NULL, KEMARI_TAP_OUT)) != 0) { + ERROR("Error when attaching net_port (%d) on kemari", port); + goto out; + } + free(data); + DPRINTF("net_port %d attached\n", port); + } + free(list); + + /* attach success */ + ret = 0; + +out: + xs_daemon_close(xs_handle); + + return ret; +} + +static int get_qemu_port(unsigned int domid) +{ + struct xs_handle *xs_handle; + char path[128]; + char *data; + unsigned int data_size; + int port, inter_port = -1; + + if ((xs_handle = xs_daemon_open()) == NULL) + errx(1, "Couldn''t contact xenstore"); + + snprintf(path, 
sizeof(path), + "/local/domain/%u/kemari/event-channel", domid); + + data = xs_read(xs_handle, XBT_NULL, path, &data_size); + if (data == NULL) { + ERROR("Could not find QEMU port for domid %d", domid); + goto out; + } + port = strtoul(data, NULL, 10); + free(data); + + inter_port = xc_evtchn_bind_interdomain(xce_handle, DOMID_SELF, port); + if (inter_port < 0) + errx(1, "Port assigned by Xen is strange: %d", inter_port); + + DPRINTF("qemu_port: %d %d\n", port, inter_port); + +out: + xs_daemon_close(xs_handle); + + return inter_port; +} + +static void finalize(void) +{ + int ret; + + if (is_finalized) + return; + + set_signal_handler(SIG_IGN); + if (ring != NULL) + munmap(ring, kemari_ring_size * PAGE_SIZE); + + if ((ret = xc_kemari_control(xc_handle, domid, XEN_KEMARI_OP_off, + NULL, NULL, NULL, 0)) != 0) { + ERROR("Error when turning off kemari"); + } else { + DPRINTF("successufully execute KEMARI_OP_off\n"); + } + + if ( xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_OFF, + NULL, 0, NULL, 0, NULL) < 0 ) { + ERROR("Warning - couldn''t disable shadow mode"); + } + + if (!run) + xc_domain_destroy(xc_handle, domid); + + xc_interface_close(xc_handle); + + is_finalized = 1; +} + +int +main(int argc, char **argv) +{ + unsigned int maxit, max_f, flags; + int ret; + int evtchn_fd; + uint32_t port, kemari_port; + uint64_t kemari_mfn; + fd_set inset; + + if (argc != 6) + errx(1, "usage: %s iofd domid maxit maxf flags", argv[0]); + + xc_handle = xc_interface_open(); + if (xc_handle < 0) + errx(1, "failed to open control interface"); + + io_fd = atoi(argv[1]); + domid = atoi(argv[2]); + maxit = atoi(argv[3]); + max_f = atoi(argv[4]); + flags = atoi(argv[5]); + + set_signal_handler(close_handler); + atexit(finalize); + + if (io_fd == -1) /* means test mode */ + { + io_fd = open("/dev/null", O_RDWR); + flags |= XCFLAGS_DEBUG; + } + else + { + int one = 1; + if (setsockopt(io_fd, IPPROTO_TCP, TCP_NODELAY, + &one, sizeof(one)) < 0) { + ERROR("failed to set 
TCP_NODELAY"); + } + } + + if ((xce_handle = xc_evtchn_open()) < 0) { + errx(1, "failed to open control interface"); + } + + evtchn_fd = xc_evtchn_fd(xce_handle); + + if ((qemu_port = get_qemu_port(domid)) < 0) + errx(1, "failed to get qemu port"); + + if ( xc_shadow_control(xc_handle, domid, + XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY, + NULL, 0, NULL, 0, NULL) < 0 ) + { + int frc; + /* log-dirty already enabled? There''s no test op, + so attempt to disable then reenable it */ + frc = xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_OFF, + NULL, 0, NULL, 0, NULL); + if ( frc >= 0 ) + { + frc = xc_shadow_control(xc_handle, domid, + XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY, + NULL, 0, NULL, 0, NULL); + } + + if ( frc < 0 ) + { + err(errno, "Couldn''t enable shadow mode (rc %d)", frc); + } + } + + if ((ret = xc_kemari_control(xc_handle, domid, XEN_KEMARI_OP_enable, + &kemari_port, &kemari_ring_size, + &kemari_mfn, 0) != 0)) { + errx(1, "Error when turning on kemari"); + } + + DPRINTF("kemari_port=%u, kemari_mfn=%llu, kemari_ring_size=%u\n", + kemari_port, kemari_mfn, kemari_ring_size); + + if (attach_ports(domid) != 0) { + ERROR("attaching port failed "); + goto out; + } + + if ((port = xc_evtchn_bind_interdomain(xce_handle, domid, + kemari_port)) < 0) { + ERROR("xc_evtchn_bind_interdomain failed "); + goto out; + } + + if ((ring = xc_map_foreign_range(xc_handle, DOMID_XEN, + kemari_ring_size * PAGE_SIZE, PROT_READ | PROT_WRITE, + kemari_mfn)) == 0) { + ERROR("xc_map_foreign_range failed"); + goto out; + } + + if (xc_domain_pause(xc_handle, domid) < 0) { + ERROR("Domain appears not to have paused"); + goto out; + } + + ret = xc_kemari_save(xc_handle, io_fd, domid, ring, flags, + !!(flags & XCFLAGS_HVM), + &init_qemu_maps); + if (ret != 0) { + ERROR("xc_kemari_save failed"); + goto out; + } + + FD_ZERO(&inset); + FD_SET(evtchn_fd, &inset); + + if (xc_domain_unpause(xc_handle, domid) < 0) { + ERROR("Domain appears not to have unpaused"); + goto out; + } + + 
DPRINTF("running start"); + + while (run) { + + if (select(evtchn_fd + 1, &inset, NULL, NULL, NULL) < 0) { + if (errno == EINTR) + continue; + ERROR("Error when waiting events by select()"); + break; + } + + if (evtchn_fd != -1 && FD_ISSET(evtchn_fd, &inset)) { + + if ((ret = handle_event(domid, flags)) != 0) { + ERROR("Error when handling events"); + break; + } + + /* usleep(10000); */ + + if (xc_evtchn_notify(xce_handle, port) < 0) { + ERROR("xc_evtchn_notify failed"); + /* goto out; */ + break; + } + + if(xc_domain_unpause(xc_handle, domid) < 0) { + ERROR("xc_domain_unpause"); + /* goto out; */ + break; + } + + } + } + + out: + close(io_fd); + finalize(); + + return ret; +} + + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ + _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Yoshiaki Tamura
2009-Mar-24 06:59 UTC
Re: [Xen-devel] [RFC][PATCH 13/13] Kemari: use shared region to flip logdirty_bitmap
This is an updated version of the following patch. With this version, QEMU receives the notification of a log-dirty bitmap flip through an event channel. http://lists.xensource.com/archives/html/xen-devel/2009-03/msg00757.html Signed-off-by: Yoshisato Yanagisawa <yanagisawa.yoshisato@lab.ntt.co.jp> Signed-off-by: Yoshi Tamura <tamura.yoshiaki@lab.ntt.co.jp> --- xenstore.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/xenstore.c b/xenstore.c index 928e950..caef5ef 100644 --- a/xenstore.c +++ b/xenstore.c @@ -33,6 +33,7 @@ struct xs_handle *xsh = NULL; static char *media_filename[MAX_DRIVES+1]; static QEMUTimer *insert_timer = NULL; +static int xce = 0; #define UWAIT_MAX (30*1000000) /* thirty seconds */ #define UWAIT (100000) /* 1/10th second */ @@ -301,6 +302,42 @@ const char *xenstore_get_guest_uuid(void) return already_computed; } +/* prototype of xenstore_process_logdirty_event */ +void xenstore_process_logdirty_event(void); + +static int get_kemari_port(void) +{ + static int kemari_port = -1; + if (kemari_port > 0) + return kemari_port; + + kemari_port = xc_evtchn_bind_unbound_port(xce, DOMID_SELF); + return kemari_port; +} + +static void kemari_handler(void *dummy) +{ + int port; + + port = xc_evtchn_pending(xce); + if (port < 0) { + fprintf(logfile, "xc_evtchn_pending failed"); + return; + } + + if (port == get_kemari_port()) { + kemari_enabled = 1; /* QEMU will run in kemari mode */ + xenstore_process_logdirty_event(); + } else { + fprintf(logfile, "unexpected port %d fired", port); + } + + if (xc_evtchn_unmask(xce, port) < 0) { + fprintf(logfile, "xc_evtchn_unmask failed"); + return; + } +} + #define DIRECT_PCI_STR_LEN 512 #define PT_PCI_MSITRANSLATE_DEFAULT 1 char direct_pci_str[DIRECT_PCI_STR_LEN]; @@ -326,6 +363,12 @@ void xenstore_parse_domain_config(int hvm_domid) xenstore_get_guest_uuid(); + xce = xc_evtchn_open(); + if (xce < 0) { + fprintf(logfile, "Could not open event channel\n"); + return; + } + xsh = 
xs_daemon_open(); if (xsh == NULL) { fprintf(logfile, "Could not contact xenstore for domain config\n"); @@ -363,6 +406,27 @@ void xenstore_parse_domain_config(int hvm_domid) break; } } + + /* kemari */ + { + int port; + char port_string[128]; + port = get_kemari_port(); + if (port < 0) { + fprintf(stderr, "failed to get kemari port\n"); + goto out; + } + snprintf(port_string, sizeof(port_string), "%d", port); + + if (pasprintf(&buf, "/local/domain/%u/kemari/event-channel", + hvm_domid) == -1) + goto out; + + xs_write(xsh, XBT_NULL, buf, port_string, strlen(port_string)); + qemu_set_fd_handler2(xc_evtchn_fd(xce), + NULL, kemari_handler, NULL, NULL); + fprintf(stderr, "Kemari port is enabled: %d\n", port); + } for (i = 0; i < num; i++) { format = NULL; /* don''t know what the format is yet */ @@ -639,6 +703,8 @@ void xenstore_process_logdirty_event(void) static char *active_path = NULL; static char *next_active_path = NULL; static char *seg = NULL; + static char *kemari_qemu_info = NULL; + static char *qemu_file = NULL; unsigned int len; int i; @@ -705,6 +771,8 @@ void xenstore_process_logdirty_event(void) seg = NULL; return; } + kemari_qemu_info = seg + logdirty_bitmap_size * 2; + asprintf(&qemu_file, "/dev/shm/qemu-save.%d", domid); /* use tmpfs */ #endif /* Remember the paths for the next-active and active entries */ @@ -722,6 +790,30 @@ void xenstore_process_logdirty_event(void) } } +#ifndef CONFIG_STUBDOM + if (kemari_enabled) { + while (kemari_qemu_info[1]) + xen_rmb(); + + /* Switch buffers */ + i = kemari_qemu_info[0]; + if (i != 0 && i != 1) { + fprintf(logfile, "Log-dirty: bad next-active entry: %d\n", i); + exit(1); + } + logdirty_bitmap = (unsigned long *)(seg + i * logdirty_bitmap_size); + kemari_qemu_info[1] = 1; + xen_wmb(); + + /* Save QEMU status */ + while (kemari_qemu_info[2]) + xen_rmb(); + do_savevm(qemu_file); + kemari_qemu_info[2] = 1; + xen_wmb(); + return; + } +#endif /* !CONFIG_STUBDOM */ fprintf(logfile, "Triggered log-dirty buffer 
switch\n"); /* Read the required active buffer from the store */ _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel