This patch series introduces a basic support framework to incorporate Remus into the libxl toolstack. The only functionality currently implemented is memory checkpointing. Shriram
rshriram@cs.ubc.ca
2012-Jan-23 22:46 UTC
[PATCH 1 of 3] tools/libxl: QEMU device model suspend/resume
# HG changeset patch # User Shriram Rajagopalan <rshriram@cs.ubc.ca> # Date 1327358638 28800 # Node ID 11fb1dfda7de4d6759dec87d80cd16cf137f7369 # Parent 80fdf2182bc62ca358ba2f1a3513b47a4f8d9dfd tools/libxl: QEMU device model suspend/resume * Refactor the libxl__domain_save_device_model. * Add support for suspend/resume QEMU device model (both traditional xen and QMP). Signed-off-by: Shriram Rajagopalan <rshriram@cs.ubc.ca> diff -r 80fdf2182bc6 -r 11fb1dfda7de tools/libxl/libxl.c --- a/tools/libxl/libxl.c Sat Jan 21 17:15:40 2012 +0000 +++ b/tools/libxl/libxl.c Mon Jan 23 14:43:58 2012 -0800 @@ -477,7 +477,7 @@ rc = libxl__domain_suspend_common(gc, domid, fd, type, live, debug); if (!rc && type == LIBXL_DOMAIN_TYPE_HVM) - rc = libxl__domain_save_device_model(gc, domid, fd); + rc = libxl__domain_save_device_model(gc, domid, fd, /* No Remus */ 0); GC_FREE; return rc; } diff -r 80fdf2182bc6 -r 11fb1dfda7de tools/libxl/libxl_dom.c --- a/tools/libxl/libxl_dom.c Sat Jan 21 17:15:40 2012 +0000 +++ b/tools/libxl/libxl_dom.c Mon Jan 23 14:43:58 2012 -0800 @@ -534,6 +534,53 @@ return 0; } +static int libxl__remus_domain_suspend_qemu(libxl__gc *gc, uint32_t domid) +{ + + switch (libxl__device_model_version_running(gc, domid)) { + case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL: { + char *path = NULL; + path = libxl__sprintf(gc, "/local/domain/0/device-model/%d/command", + domid); + libxl__xs_write(gc, XBT_NULL, path, "save"); + libxl__wait_for_device_model(gc, domid, "paused", NULL, NULL, NULL); + break; + } + case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN: + /* Stop QEMU */ + if (libxl__qmp_stop(gc, domid)) + return ERROR_FAIL; + break; + default: + return ERROR_INVAL; + } + + return 0; +} + +static int libxl__remus_domain_resume_qemu(libxl__gc *gc, uint32_t domid) +{ + + switch (libxl__device_model_version_running(gc, domid)) { + case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL: { + char *path = NULL; + path = libxl__sprintf(gc, "/local/domain/0/device-model/%d/command", + domid); + libxl__xs_write(gc, XBT_NULL, path, "continue"); + break; + } + case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN: + /* Stop QEMU */ + if (libxl__qmp_resume(gc, domid)) + return ERROR_FAIL; + break; + default: + return ERROR_INVAL; + } + + return 0; +} + int libxl__domain_suspend_common(libxl__gc *gc, uint32_t domid, int fd, libxl_domain_type type, int live, int debug) @@ -620,7 +667,7 @@ return rc; } -int libxl__domain_save_device_model(libxl__gc *gc, uint32_t domid, int fd) +int libxl__domain_save_device_model(libxl__gc *gc, uint32_t domid, int fd, int remus) { libxl_ctx *ctx = libxl__gc_owner(gc); int ret, fd2 = -1, c; @@ -631,13 +678,12 @@ switch (libxl__device_model_version_running(gc, domid)) { case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL: { - char *path = NULL; - LIBXL__LOG(ctx, LIBXL__LOG_DEBUG, - "Saving device model state to %s", filename); - path = libxl__sprintf(gc, "/local/domain/0/device-model/%d/command", - domid); - libxl__xs_write(gc, XBT_NULL, path, "save"); - libxl__wait_for_device_model(gc, domid, "paused", NULL, NULL, NULL); + /* For Remus,we issue suspend_qemu call separately */ + if (!remus) { + c = libxl__remus_domain_suspend_qemu(gc, domid); + if (c) + return c; + } break; } case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN: @@ -668,8 +714,9 @@ qemu_state_len = st.st_size; LIBXL__LOG(ctx, LIBXL__LOG_DEBUG, "Qemu state is %d bytes\n", qemu_state_len); - ret = libxl_write_exactly(ctx, fd, QEMU_SIGNATURE, strlen(QEMU_SIGNATURE), - "saved-state file", "qemu signature"); + ret = libxl_write_exactly(ctx, fd, remus ? REMUS_QEMU_SIGNATURE : QEMU_SIGNATURE, + remus ? strlen(REMUS_QEMU_SIGNATURE): strlen(QEMU_SIGNATURE), + "saved-state file", "qemu signature"); if (ret) goto out; diff -r 80fdf2182bc6 -r 11fb1dfda7de tools/libxl/libxl_internal.h --- a/tools/libxl/libxl_internal.h Sat Jan 21 17:15:40 2012 +0000 +++ b/tools/libxl/libxl_internal.h Mon Jan 23 14:43:58 2012 -0800 @@ -73,6 +73,7 @@ #define LIBXL_HVM_EXTRA_MEMORY 2048 #define LIBXL_MIN_DOM0_MEM (128*1024) #define QEMU_SIGNATURE "DeviceModelRecord0002" +#define REMUS_QEMU_SIGNATURE "RemusDeviceModelState" #define STUBDOM_CONSOLE_LOGGING 0 #define STUBDOM_CONSOLE_SAVE 1 #define STUBDOM_CONSOLE_RESTORE 2 @@ -273,7 +274,8 @@ libxl_domain_type type, int live, int debug); _hidden const char *libxl__device_model_savefile(libxl__gc *gc, uint32_t domid); -_hidden int libxl__domain_save_device_model(libxl__gc *gc, uint32_t domid, int fd); +_hidden int libxl__domain_save_device_model(libxl__gc *gc, uint32_t domid, int fd, + int remus); _hidden void libxl__userdata_destroyall(libxl__gc *gc, uint32_t domid); _hidden int libxl__domain_pvcontrol_available(libxl__gc *gc, uint32_t domid); @@ -616,6 +618,10 @@ _hidden int libxl__qmp_pci_add(libxl__gc *gc, int d, libxl_device_pci *pcidev); _hidden int libxl__qmp_pci_del(libxl__gc *gc, int domid, libxl_device_pci *pcidev); +/* Suspend QEMU. */ +_hidden int libxl__qmp_stop(libxl__gc *gc, int domid); +/* Resume QEMU. */ +_hidden int libxl__qmp_resume(libxl__gc *gc, int domid); /* Save current QEMU state into fd. */ _hidden int libxl__qmp_migrate(libxl__gc *gc, int domid, int fd); /* close and free the QMP handler */ diff -r 80fdf2182bc6 -r 11fb1dfda7de tools/libxl/libxl_qmp.c --- a/tools/libxl/libxl_qmp.c Sat Jan 21 17:15:40 2012 +0000 +++ b/tools/libxl/libxl_qmp.c Mon Jan 23 14:43:58 2012 -0800 @@ -878,6 +878,38 @@ return rc; } +int libxl__qmp_stop(libxl__gc *gc, int domid) +{ + libxl__qmp_handler *qmp = NULL; + int rc = 0; + + qmp = libxl__qmp_initialize(libxl__gc_owner(gc), domid); + if (!qmp) + return ERROR_FAIL; + + rc = qmp_synchronous_send(qmp, "stop", NULL, + NULL, NULL, qmp->timeout); + + libxl__qmp_close(qmp); + return rc; +} + +int libxl__qmp_resume(libxl__gc *gc, int domid) +{ + libxl__qmp_handler *qmp = NULL; + int rc = 0; + + qmp = libxl__qmp_initialize(libxl__gc_owner(gc), domid); + if (!qmp) + return ERROR_FAIL; + + rc = qmp_synchronous_send(qmp, "cont", NULL, + NULL, NULL, qmp->timeout); + + libxl__qmp_close(qmp); + return rc; +} + int libxl__qmp_initializations(libxl_ctx *ctx, uint32_t domid) { libxl__qmp_handler *qmp = NULL;
rshriram@cs.ubc.ca
2012-Jan-23 22:46 UTC
[PATCH 2 of 3] tools/libxl: suspend/postflush/commit callbacks
# HG changeset patch # User Shriram Rajagopalan <rshriram@cs.ubc.ca> # Date 1327358640 28800 # Node ID 0446591bee86eb4e767d75b70c885549c7a3cfef # Parent 11fb1dfda7de4d6759dec87d80cd16cf137f7369 tools/libxl: suspend/postflush/commit callbacks * Add libxl callbacks for Remus checkpoint suspend, postflush (aka resume) and checkpoint commit callbacks. * suspend callback just bounces off libxl__domain_suspend_common_callback * resume callback directly calls xc_domain_resume instead of re-using libxl_domain_resume (as the latter does not set the fast suspend argument to xc_domain_resume - aka suspend_cancel support). * commit callback just sleeps for the checkpoint interval (for the moment). * Future patches will augument these callbacks with more functionalities like issuing network buffer plug/unplug commands, disk checkpoint commands, etc. Signed-off-by: Shriram Rajagopalan <rshriram@cs.ubc.ca> diff -r 11fb1dfda7de -r 0446591bee86 tools/libxl/libxl.c --- a/tools/libxl/libxl.c Mon Jan 23 14:43:58 2012 -0800 +++ b/tools/libxl/libxl.c Mon Jan 23 14:44:00 2012 -0800 @@ -475,7 +475,9 @@ int debug = info != NULL && info->flags & XL_SUSPEND_DEBUG; int rc = 0; - rc = libxl__domain_suspend_common(gc, domid, fd, type, live, debug); + rc = libxl__domain_suspend_common(gc, domid, fd, type, live, debug, + /* No Remus */ NULL); + if (!rc && type == LIBXL_DOMAIN_TYPE_HVM) rc = libxl__domain_save_device_model(gc, domid, fd, /* No Remus */ 0); GC_FREE; diff -r 11fb1dfda7de -r 0446591bee86 tools/libxl/libxl.h --- a/tools/libxl/libxl.h Mon Jan 23 14:43:58 2012 -0800 +++ b/tools/libxl/libxl.h Mon Jan 23 14:44:00 2012 -0800 @@ -212,6 +212,12 @@ int (*suspend_callback)(void *, int); } libxl_domain_suspend_info; +typedef struct { + int interval; + int blackhole; + int compression; +} libxl_domain_remus_info; + enum { ERROR_NONSPECIFIC = -1, ERROR_VERSION = -2, diff -r 11fb1dfda7de -r 0446591bee86 tools/libxl/libxl_dom.c --- a/tools/libxl/libxl_dom.c Mon Jan 23 14:43:58 2012 -0800 +++ b/tools/libxl/libxl_dom.c Mon Jan 23 14:44:00 2012 -0800 @@ -395,6 +395,8 @@ int hvm; unsigned int flags; int guest_responded; + int save_fd; /* Migration stream fd (for Remus) */ + int interval; /* remus checkpoint interval */ }; static int libxl__domain_suspend_common_switch_qemu_logdirty(int domid, unsigned int enable, void *data) @@ -581,9 +583,69 @@ return 0; } +static int libxl__remus_domain_suspend_callback(void *data) +{ + struct suspendinfo *si = data; + + if (libxl__domain_suspend_common_callback(data)) { + if (si->hvm) { + if (!libxl__remus_domain_suspend_qemu(si->gc, si->domid)) + return 1; + else + return 0; + } + return 1; + } + return 0; +} + +static int libxl__remus_domain_resume_callback(void *data) +{ + struct suspendinfo *si = data; + libxl_ctx *ctx = libxl__gc_owner(si->gc); + + /* Assumes that SUSPEND_CANCEL is available - just like + * xm version of Remus. Without that, remus wont work. + * Since libxl_domain_resume calls xc_domain_resume with + * fast_suspend = 0, we cant re-use that api. + */ + if (xc_domain_resume(ctx->xch, si->domid, 1)) { + LIBXL__LOG(ctx, LIBXL__LOG_ERROR, + "xc_domain_resume failed for domain %u", + si->domid); + return 0; + } + if (!xs_resume_domain(ctx->xsh, si->domid)) { + LIBXL__LOG(ctx, LIBXL__LOG_ERROR, + "xs_resume_domain failed for domain %u", + si->domid); + return 0; + } + + if (si->hvm) { + if (libxl__remus_domain_resume_qemu(si->gc, si->domid)) + return 0; + } + return 1; +} + +/* For now, just sleep. Later, we need to release disk/netbuf */ +static int libxl__remus_domain_checkpoint_callback(void *data) +{ + struct suspendinfo *si = data; + + if (si->hvm && libxl__domain_save_device_model(si->gc, si->domid, + si->save_fd, 1)) + return 0; + + usleep(si->interval * 1000); + return 1; +} + int libxl__domain_suspend_common(libxl__gc *gc, uint32_t domid, int fd, libxl_domain_type type, - int live, int debug) + int live, int debug, + const libxl_domain_remus_info *r_info) { libxl_ctx *ctx = libxl__gc_owner(gc); int flags; @@ -614,10 +676,20 @@ return ERROR_INVAL; } + memset(&si, 0, sizeof(si)); flags = (live) ? XCFLAGS_LIVE : 0 | (debug) ? XCFLAGS_DEBUG : 0 | (hvm) ? XCFLAGS_HVM : 0; + if (r_info != NULL) { + si.interval = r_info->interval; + if (r_info->compression) + flags |= XCFLAGS_CHECKPOINT_COMPRESS; + si.save_fd = fd; + } + else + si.save_fd = -1; + si.domid = domid; si.flags = flags; si.hvm = hvm; @@ -641,7 +713,13 @@ } memset(&callbacks, 0, sizeof(callbacks)); - callbacks.suspend = libxl__domain_suspend_common_callback; + if (r_info != NULL) { + callbacks.suspend = libxl__remus_domain_suspend_callback; + callbacks.postcopy = libxl__remus_domain_resume_callback; + callbacks.checkpoint = libxl__remus_domain_checkpoint_callback; + } else + callbacks.suspend = libxl__domain_suspend_common_callback; + callbacks.switch_qemu_logdirty = libxl__domain_suspend_common_switch_qemu_logdirty; callbacks.data = &si; diff -r 11fb1dfda7de -r 0446591bee86 tools/libxl/libxl_internal.h --- a/tools/libxl/libxl_internal.h Mon Jan 23 14:43:58 2012 -0800 +++ b/tools/libxl/libxl_internal.h Mon Jan 23 14:44:00 2012 -0800 @@ -272,7 +272,8 @@ int fd); _hidden int libxl__domain_suspend_common(libxl__gc *gc, uint32_t domid, int fd, libxl_domain_type type, - int live, int debug); + int live, int debug, + const libxl_domain_remus_info *r_info); _hidden const char *libxl__device_model_savefile(libxl__gc *gc, uint32_t domid); _hidden int libxl__domain_save_device_model(libxl__gc *gc, uint32_t domid, int fd, int remus);
rshriram@cs.ubc.ca
2012-Jan-23 22:46 UTC
[PATCH 3 of 3] tools/libxl: xl remus/remus-receive commands
# HG changeset patch # User Shriram Rajagopalan <rshriram@cs.ubc.ca> # Date 1327358642 28800 # Node ID 822536df4aeced5aee00f1f26299086faa622681 # Parent 0446591bee86eb4e767d75b70c885549c7a3cfef tools/libxl: xl remus/remus-receive commands * xl remus (and its receive counterpart remus-receive) act as frontends to enable remus for a given domain. * At the moment, only memory checkpointing and blackhole replication are supported. Support for disk checkpointing and network buffering will be added in future. * xl remus borrows some aspects of xl migrate. Replication is currently done over a ssh connection. Future versions will use a low-overhead plain tcp socket for replication (similar to xend/remus). Signed-off-by: Shriram Rajagopalan <rshriram@cs.ubc.ca> diff -r 0446591bee86 -r 822536df4aec tools/libxl/libxl.c --- a/tools/libxl/libxl.c Mon Jan 23 14:44:00 2012 -0800 +++ b/tools/libxl/libxl.c Mon Jan 23 14:44:02 2012 -0800 @@ -466,6 +466,40 @@ return ptr; } +int libxl_domain_remus_start(libxl_ctx *ctx, libxl_domain_remus_info *info, + uint32_t domid, int fd) +{ + GC_INIT(ctx); + libxl_domain_type type = libxl__domain_type(gc, domid); + int rc = 0; + + if (info == NULL) { + LIBXL__LOG(ctx, LIBXL__LOG_ERROR, + "No remus_info structure supplied for domain %d", domid); + rc = -1; + goto remus_fail; + } + + /* TBD: Remus setup - i.e. attach qdisc, enable disk buffering, etc */ + + /* Point of no return */ + rc = libxl__domain_suspend_common(gc, domid, fd, type, /* live */ 1, + /* debug */ 0, info); + + /* + * With Remus, if we reach this point, it means either + * backup died or some network error occurred preventing us + * from sending checkpoints. + */ + + /* TBD: Remus cleanup - i.e. detach qdisc, release other + * resources. + */ + remus_fail: + GC_FREE; + return rc; +} + int libxl_domain_suspend(libxl_ctx *ctx, libxl_domain_suspend_info *info, uint32_t domid, int fd) { diff -r 0446591bee86 -r 822536df4aec tools/libxl/libxl.h --- a/tools/libxl/libxl.h Mon Jan 23 14:44:00 2012 -0800 +++ b/tools/libxl/libxl.h Mon Jan 23 14:44:02 2012 -0800 @@ -272,6 +272,8 @@ int libxl_domain_create_new(libxl_ctx *ctx, libxl_domain_config *d_config, libxl_console_ready cb, void *priv, uint32_t *domid); int libxl_domain_create_restore(libxl_ctx *ctx, libxl_domain_config *d_config, libxl_console_ready cb, void *priv, uint32_t *domid, int restore_fd); void libxl_domain_config_dispose(libxl_domain_config *d_config); +int libxl_domain_remus_start(libxl_ctx *ctx, libxl_domain_remus_info *info, + uint32_t domid, int fd); int libxl_domain_suspend(libxl_ctx *ctx, libxl_domain_suspend_info *info, uint32_t domid, int fd); int libxl_domain_resume(libxl_ctx *ctx, uint32_t domid); diff -r 0446591bee86 -r 822536df4aec tools/libxl/xl.h --- a/tools/libxl/xl.h Mon Jan 23 14:44:00 2012 -0800 +++ b/tools/libxl/xl.h Mon Jan 23 14:44:02 2012 -0800 @@ -93,6 +93,8 @@ int main_getenforce(int argc, char **argv); int main_setenforce(int argc, char **argv); int main_loadpolicy(int argc, char **argv); +int main_remus_receive(int argc, char **argv); +int main_remus(int argc, char **argv); void help(const char *command); diff -r 0446591bee86 -r 822536df4aec tools/libxl/xl_cmdimpl.c --- a/tools/libxl/xl_cmdimpl.c Mon Jan 23 14:44:00 2012 -0800 +++ b/tools/libxl/xl_cmdimpl.c Mon Jan 23 14:44:02 2012 -0800 @@ -1814,7 +1814,7 @@ * If we have daemonized then do not return to the caller -- this has * already happened in the parent. */ - if ( !need_daemon ) + if ( daemonize && !need_daemon ) exit(ret); return ret; @@ -5853,6 +5853,175 @@ return ret; } +int main_remus_receive(int argc, char **argv) +{ + int rc; + char *migration_domname; + struct domain_create dom_info; + + signal(SIGPIPE, SIG_IGN); + memset(&dom_info, 0, sizeof(dom_info)); + dom_info.debug = 1; + dom_info.no_incr_generationid = 1; + dom_info.restore_file = "incoming checkpoint stream"; + dom_info.migrate_fd = 0; /* stdin - will change in future*/ + dom_info.migration_domname_r = &migration_domname; + + rc = create_domain(&dom_info); + if (rc < 0) { + fprintf(stderr, "migration target (Remus): Domain creation failed" + " (code %d) domid %u.\n", rc, domid); + exit(-rc); + } + + /* If we are here, it means that the sender (primary) has crashed. + * If domain renaming fails, lets just continue (as we need the domain + * to be up & dom names may not matter much, as long as its reachable + * over network). + * + * If domain unpausing fails, destroy domain ? Or is it better to have + * a consistent copy of the domain (memory, cpu state, disk) + * on atleast one physical host ? Right now, lets just leave the domain + * as is and let the Administrator decide (or troubleshoot). + */ + fprintf(stderr, "migration target: Remus Failover for domain %u\n", domid); + if (migration_domname) { + rc = libxl_domain_rename(ctx, domid, migration_domname, + common_domname); + if (rc) + fprintf(stderr, "migration target (Remus): " + "Failed to rename domain from %s to %s:%d\n", + migration_domname, common_domname, rc); + + rc = libxl_domain_unpause(ctx, domid); + if (rc) + fprintf(stderr, "migration target (Remus): " + "Failed to unpause domain %s (id: %u):%d\n", + common_domname, domid, rc); + } + + return -rc; +} + +int main_remus(int argc, char **argv) +{ + int opt, rc; + const char *ssh_command = "ssh"; + char *host = NULL, *rune = NULL, *domain = NULL; + + int sendpipe[2], recvpipe[2]; + int send_fd = -1, recv_fd = -1; + pid_t child = -1; + + uint8_t *config_data; + int config_len; + + libxl_domain_remus_info r_info; + + memset(&r_info, 0, sizeof(libxl_domain_remus_info)); + /* Defaults */ + r_info.interval = 200; + r_info.blackhole = 0; + r_info.compression = 1; + + while ((opt = def_getopt(argc, argv, "bui:s:", "remus", 2)) != -1) { + switch (opt) { + case 0: case 2: + return opt; + + case ''i'': + r_info.interval = atoi(optarg); + break; + case ''b'': + r_info.blackhole = 1; + break; + case ''u'': + r_info.compression = 0; + break; + case ''s'': + ssh_command = optarg; + break; + } + } + + domain = argv[optind]; + host = argv[optind + 1]; + + if (r_info.blackhole) { + find_domain(domain); + send_fd = open("/dev/null", O_RDWR, 0644); + if (send_fd < 0) { + perror("failed to open /dev/null"); + exit(-1); + } + } else { + + if (!ssh_command[0]) { + rune = host; + } else { + if (asprintf(&rune, "exec %s %s xl remus-receive", + ssh_command, host) < 0) + return 1; + } + + save_domain_core_begin(domain, NULL, &config_data, &config_len); + + if (!config_len) { + fprintf(stderr, "No config file stored for running domain and " + "none supplied - cannot start remus.\n"); + exit(1); + } + + MUST( libxl_pipe(ctx, sendpipe) ); + MUST( libxl_pipe(ctx, recvpipe) ); + + child = libxl_fork(ctx); + if (child==-1) exit(1); + + /* TODO: change this to plain TCP socket based channel + * instead of SSH. + */ + if (!child) { + dup2(sendpipe[0], 0); + dup2(recvpipe[1], 1); + close(sendpipe[0]); close(sendpipe[1]); + close(recvpipe[0]); close(recvpipe[1]); + execlp("sh","sh","-c",rune,(char*)0); + perror("failed to exec sh"); + exit(-1); + } + + close(sendpipe[0]); + close(recvpipe[1]); + send_fd = sendpipe[1]; + recv_fd = recvpipe[0]; + + signal(SIGPIPE, SIG_IGN); + + save_domain_core_writeconfig(send_fd, "migration stream", + config_data, config_len); + } + + /* Point of no return */ + rc = libxl_domain_remus_start(ctx, &r_info, domid, send_fd); + + /* If we are here, it means backup has failed/domain suspend failed. + * Try to resume the domain and exit gracefully. + */ + fprintf(stderr, "remus sender: libxl_domain_suspend failed" + " (rc=%d)\n", rc); + + if (rc == ERROR_GUEST_TIMEDOUT) + fprintf(stderr, "Failed to suspend domain at primary.\n"); + else { + fprintf(stderr, "Remus: Backup failed? resuming domain at primary.\n"); + libxl_domain_resume(ctx, domid); + } + + close(send_fd); + return -ERROR_FAIL; +} + /* * Local variables: * mode: C diff -r 0446591bee86 -r 822536df4aec tools/libxl/xl_cmdtable.c --- a/tools/libxl/xl_cmdtable.c Mon Jan 23 14:44:00 2012 -0800 +++ b/tools/libxl/xl_cmdtable.c Mon Jan 23 14:44:02 2012 -0800 @@ -407,6 +407,22 @@ "Loads a new policy int the Flask Xen security module", "<policy file>", }, + { "remus-receive", + &main_remus_receive, 0, + "Remus Checkpoint Receiver", + "- for internal use only", + }, + { "remus", + &main_remus, 0, + "Enable Remus HA for domain", + "[options] <Domain> [<DestinationHost>]", + "-i MS Checkpoint domain memory every MS milliseconds (def. 200ms).\n" + "-b Replicate memory checkpoints to /dev/null (blackhole)\n" + "-u Disable memory checkpoint compression.\n" + "-s <sshcommand> Use <sshcommand> instead of ssh. String will be passed\n" + " to sh. If empty, run <host> instead of \n" + " ssh <host> xl remus-receive\n" + }, }; int cmdtable_len = sizeof(cmd_table)/sizeof(struct cmd_spec);
Ian Campbell
2012-Jan-25 11:08 UTC
Re: [PATCH 1 of 3] tools/libxl: QEMU device model suspend/resume
On Mon, 2012-01-23 at 22:46 +0000, rshriram@cs.ubc.ca wrote:> # HG changeset patch > # User Shriram Rajagopalan <rshriram@cs.ubc.ca> > # Date 1327358638 28800 > # Node ID 11fb1dfda7de4d6759dec87d80cd16cf137f7369 > # Parent 80fdf2182bc62ca358ba2f1a3513b47a4f8d9dfd > tools/libxl: QEMU device model suspend/resume > > * Refactor the libxl__domain_save_device_model. > * Add support for suspend/resume QEMU device model > (both traditional xen and QMP). > > > Signed-off-by: Shriram Rajagopalan <rshriram@cs.ubc.ca> > > diff -r 80fdf2182bc6 -r 11fb1dfda7de tools/libxl/libxl.c > --- a/tools/libxl/libxl.c Sat Jan 21 17:15:40 2012 +0000 > +++ b/tools/libxl/libxl.c Mon Jan 23 14:43:58 2012 -0800 > @@ -477,7 +477,7 @@ > > rc = libxl__domain_suspend_common(gc, domid, fd, type, live, debug); > if (!rc && type == LIBXL_DOMAIN_TYPE_HVM) > - rc = libxl__domain_save_device_model(gc, domid, fd); > + rc = libxl__domain_save_device_model(gc, domid, fd, /* No Remus */ 0); > GC_FREE; > return rc; > } > diff -r 80fdf2182bc6 -r 11fb1dfda7de tools/libxl/libxl_dom.c > --- a/tools/libxl/libxl_dom.c Sat Jan 21 17:15:40 2012 +0000 > +++ b/tools/libxl/libxl_dom.c Mon Jan 23 14:43:58 2012 -0800 > @@ -534,6 +534,53 @@ > return 0; > } > > +static int libxl__remus_domain_suspend_qemu(libxl__gc *gc, uint32_t domid) > +{ > + > + switch (libxl__device_model_version_running(gc, domid)) { > + case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL: { > + char *path = NULL; > + path = libxl__sprintf(gc, "/local/domain/0/device-model/%d/command", > + domid); > + libxl__xs_write(gc, XBT_NULL, path, "save"); > + libxl__wait_for_device_model(gc, domid, "paused", NULL, NULL, NULL);This and libxl__remus_domain_resume_qemu, qemu_pci_add_xenstore, qemu_pci_remove_xenstore, libxl__domain_save_device_model and libxl_domain_unpause would probably benefit from a helper function to send a command to a traditional qemu.> + break; > + } > + case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN: > + /* Stop QEMU */ > + if (libxl__qmp_stop(gc, domid)) > + return ERROR_FAIL; > + break; > + default: > + return ERROR_INVAL; > + } > + > + return 0; > +} > + > +static int libxl__remus_domain_resume_qemu(libxl__gc *gc, uint32_t domid) > +{ > + > + switch (libxl__device_model_version_running(gc, domid)) { > + case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL: { > + char *path = NULL; > + path = libxl__sprintf(gc, "/local/domain/0/device-model/%d/command", > + domid); > + libxl__xs_write(gc, XBT_NULL, path, "continue"); > + break; > + } > + case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN: > + /* Stop QEMU */ > + if (libxl__qmp_resume(gc, domid)) > + return ERROR_FAIL; > + break; > + default: > + return ERROR_INVAL; > + } > + > + return 0; > +} > + > int libxl__domain_suspend_common(libxl__gc *gc, uint32_t domid, int fd, > libxl_domain_type type, > int live, int debug) > @@ -620,7 +667,7 @@ > return rc; > } > > -int libxl__domain_save_device_model(libxl__gc *gc, uint32_t domid, int fd) > +int libxl__domain_save_device_model(libxl__gc *gc, uint32_t domid, int fd, int remus) > { > libxl_ctx *ctx = libxl__gc_owner(gc); > int ret, fd2 = -1, c; > @@ -631,13 +678,12 @@ > > switch (libxl__device_model_version_running(gc, domid)) { > case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL: { > - char *path = NULL; > - LIBXL__LOG(ctx, LIBXL__LOG_DEBUG, > - "Saving device model state to %s", filename); > - path = libxl__sprintf(gc, "/local/domain/0/device-model/%d/command", > - domid); > - libxl__xs_write(gc, XBT_NULL, path, "save"); > - libxl__wait_for_device_model(gc, domid, "paused", NULL, NULL, NULL); > + /* For Remus,we issue suspend_qemu call separately */Why? How does this interact with Stefano''s XC_SAVEID_TOOLSTACK patches? I think some sort of high level description of the control flow used by Remus and how/why it differs from normal save/restore would be useful for review.> + if (!remus) { > + c = libxl__remus_domain_suspend_qemu(gc, domid);It seems odd to call this function libxl__remus_FOO and the use it when !remus. The function doesn''t look to be inherently specific to either remus or !remus anyhow.> + if (c) > + return c; > + } > break; > } > case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN: > @@ -668,8 +714,9 @@ > qemu_state_len = st.st_size; > LIBXL__LOG(ctx, LIBXL__LOG_DEBUG, "Qemu state is %d bytes\n", qemu_state_len); > > - ret = libxl_write_exactly(ctx, fd, QEMU_SIGNATURE, strlen(QEMU_SIGNATURE), > - "saved-state file", "qemu signature"); > + ret = libxl_write_exactly(ctx, fd, remus ? REMUS_QEMU_SIGNATURE : QEMU_SIGNATURE, > + remus ? strlen(REMUS_QEMU_SIGNATURE): strlen(QEMU_SIGNATURE), > + "saved-state file", "qemu signature");QEMU_SIGNATURE is "DeviceModelRecord0002" which I believe libxc treats identically to the Remus signature? Again consideration of how this interacts with Stefano''s patch would be useful. If possible we''d quite like to pull the qemu-restore our of xc_domain_restore for consistency with how xc_domain_save saves it, the current layering is quite mad. Ian.
Ian Campbell
2012-Jan-25 11:22 UTC
Re: [PATCH 2 of 3] tools/libxl: suspend/postflush/commit callbacks
On Mon, 2012-01-23 at 22:46 +0000, rshriram@cs.ubc.ca wrote:> # HG changeset patch > # User Shriram Rajagopalan <rshriram@cs.ubc.ca> > # Date 1327358640 28800 > # Node ID 0446591bee86eb4e767d75b70c885549c7a3cfef > # Parent 11fb1dfda7de4d6759dec87d80cd16cf137f7369 > tools/libxl: suspend/postflush/commit callbacks > > * Add libxl callbacks for Remus checkpoint suspend, postflush (aka resume) > and checkpoint commit callbacks. > * suspend callback just bounces off libxl__domain_suspend_common_callback > * resume callback directly calls xc_domain_resume instead of re-using > libxl_domain_resume (as the latter does not set the fast suspend argument > to xc_domain_resume - aka suspend_cancel support). > * commit callback just sleeps for the checkpoint interval (for the moment). > > * Future patches will augument these callbacks with more functionalities like > issuing network buffer plug/unplug commands, disk checkpoint commands, etc. > > Signed-off-by: Shriram Rajagopalan <rshriram@cs.ubc.ca> >> diff -r 11fb1dfda7de -r 0446591bee86 tools/libxl/libxl.h > --- a/tools/libxl/libxl.h Mon Jan 23 14:43:58 2012 -0800 > +++ b/tools/libxl/libxl.h Mon Jan 23 14:44:00 2012 -0800 > @@ -212,6 +212,12 @@ > int (*suspend_callback)(void *, int); > } libxl_domain_suspend_info; > > +typedef struct { > + int interval; > + int blackhole; > + int compression; > +} libxl_domain_remus_info;This should be defined via the libxl IDL (see libxl_types.idl and idl.txt)> + > enum { > ERROR_NONSPECIFIC = -1, > ERROR_VERSION = -2, > diff -r 11fb1dfda7de -r 0446591bee86 tools/libxl/libxl_dom.c > --- a/tools/libxl/libxl_dom.c Mon Jan 23 14:43:58 2012 -0800 > +++ b/tools/libxl/libxl_dom.c Mon Jan 23 14:44:00 2012 -0800 > @@ -395,6 +395,8 @@ > int hvm; > unsigned int flags; > int guest_responded; > + int save_fd; /* Migration stream fd (for Remus) */ > + int interval; /* remus checkpoint interval */ > }; > > static int libxl__domain_suspend_common_switch_qemu_logdirty(int domid, unsigned int enable, void *data) > @@ -581,9 +583,69 @@ > return 0; > } > > +static int libxl__remus_domain_suspend_callback(void *data) > +{ > + struct suspendinfo *si = data; > + > + if (libxl__domain_suspend_common_callback(data)) {if (!libxl_domain_suspend_common_callback) return 1; if (si->hvm && !libxl__remus_domain_suspend(...) return 1; return 0; Is would be clearer than the multiple nested if/elses and returns.> + if (si->hvm) { > + if (!libxl__remus_domain_suspend_qemu(si->gc, si->domid))Perhaps libxl__domain_suspend_common_callback should do this in both the Remus and non-Remus cases? Likewise perhaps normal save should use the checkpoint hook to call libxl__domain_save_device_model() as well. I''m in favour of tweaking the normal suspend/resume case as necessary to reduce the amount of Remus special casing that is required. IMHO a normal suspend should look a lot like a Remus suspend which happened to only do a single iteration.> + return 1; > + else > + return 0; > + } > + return 1; > + } > + return 0; > +} > + > +static int libxl__remus_domain_resume_callback(void *data) > +{ > + struct suspendinfo *si = data; > + libxl_ctx *ctx = libxl__gc_owner(si->gc); > + > + /* Assumes that SUSPEND_CANCEL is available - just like > + * xm version of Remus. Without that, remus wont work. > + * Since libxl_domain_resume calls xc_domain_resume with > + * fast_suspend = 0, we cant re-use that api.You could modify that API which would be better than duplicating its content. I think the "fast" flag is useful to expose to users of libxl but if not then you could refactor into an internal function which takes the flag and call it from both here and libxl_domain_resume.> + */ > + if (xc_domain_resume(ctx->xch, si->domid, 1)) { > + LIBXL__LOG(ctx, LIBXL__LOG_ERROR, > + "xc_domain_resume failed for domain %u", > + si->domid); > + return 0; > + } > + if (!xs_resume_domain(ctx->xsh, si->domid)) { > + LIBXL__LOG(ctx, LIBXL__LOG_ERROR, > + "xs_resume_domain failed for domain %u", > + si->domid); > + return 0; > + } > + > + if (si->hvm) { > + if (libxl__remus_domain_resume_qemu(si->gc, si->domid)) > + return 0; > + } > + return 1; > +} > + > +/* For now, just sleep. Later, we need to release disk/netbuf */ > +static int libxl__remus_domain_checkpoint_callback(void *data) > +{ > + struct suspendinfo *si = data; > + > + if (si->hvm && libxl__domain_save_device_model(si->gc, si->domid, > + si->save_fd, 1)) > + return 0; > + > + usleep(si->interval * 1000); > + return 1; > +} > + > int libxl__domain_suspend_common(libxl__gc *gc, uint32_t domid, int fd, > libxl_domain_type type, > - int live, int debug) > + int live, int debug, > + const libxl_domain_remus_info *r_info) > { > libxl_ctx *ctx = libxl__gc_owner(gc); > int flags; > @@ -614,10 +676,20 @@Please can you add : [diff] showfunc = True to your ~/.hgrc.> return ERROR_INVAL; > } > > + memset(&si, 0, sizeof(si)); > flags = (live) ? XCFLAGS_LIVE : 0 > | (debug) ? XCFLAGS_DEBUG : 0 > | (hvm) ? XCFLAGS_HVM : 0; > > + if (r_info != NULL) { > + si.interval = r_info->interval; > + if (r_info->compression) > + flags |= XCFLAGS_CHECKPOINT_COMPRESS; > + si.save_fd = fd; > + } > + else > + si.save_fd = -1; > + > si.domid = domid; > si.flags = flags; > si.hvm = hvm; > @@ -641,7 +713,13 @@ > } > > memset(&callbacks, 0, sizeof(callbacks)); > - callbacks.suspend = libxl__domain_suspend_common_callback; > + if (r_info != NULL) { > + callbacks.suspend = libxl__remus_domain_suspend_callback; > + callbacks.postcopy = libxl__remus_domain_resume_callback; > + callbacks.checkpoint = libxl__remus_domain_checkpoint_callback; > + } else > + callbacks.suspend = libxl__domain_suspend_common_callback; > + > callbacks.switch_qemu_logdirty = libxl__domain_suspend_common_switch_qemu_logdirty; > callbacks.data = &si; > > diff -r 11fb1dfda7de -r 0446591bee86 tools/libxl/libxl_internal.h > --- a/tools/libxl/libxl_internal.h Mon Jan 23 14:43:58 2012 -0800 > +++ b/tools/libxl/libxl_internal.h Mon Jan 23 14:44:00 2012 -0800 > @@ -272,7 +272,8 @@ > int fd); > _hidden int libxl__domain_suspend_common(libxl__gc *gc, uint32_t domid, int fd, > libxl_domain_type type, > - int live, int debug); > + int live, int debug, > + const libxl_domain_remus_info *r_info); > _hidden const char *libxl__device_model_savefile(libxl__gc *gc, uint32_t domid); > _hidden int libxl__domain_save_device_model(libxl__gc *gc, uint32_t domid, int fd, > int remus); > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xensource.com > http://lists.xensource.com/xen-devel
Ian Campbell
2012-Jan-25 11:52 UTC
Re: [PATCH 3 of 3] tools/libxl: xl remus/remus-receive commands
On Mon, 2012-01-23 at 22:46 +0000, rshriram@cs.ubc.ca wrote:> # HG changeset patch > # User Shriram Rajagopalan <rshriram@cs.ubc.ca> > # Date 1327358642 28800 > # Node ID 822536df4aeced5aee00f1f26299086faa622681 > # Parent 0446591bee86eb4e767d75b70c885549c7a3cfef > tools/libxl: xl remus/remus-receive commands > > * xl remus (and its receive counterpart remus-receive) act as frontends > to enable remus for a given domain. > * At the moment, only memory checkpointing and blackhole replication are > supported. Support for disk checkpointing and network buffering will > be added in future. > * xl remus borrows some aspects of xl migrate. Replication is currently > done over a ssh connection. Future versions will use a low-overhead > plain tcp socket for replication (similar to xend/remus).Please also patch docs/man/xl.*.pod as required. Other references to relevant documentation and/or the addition of such documentation to the tree would also be useful.> > Signed-off-by: Shriram Rajagopalan <rshriram@cs.ubc.ca> > > diff -r 0446591bee86 -r 822536df4aec tools/libxl/libxl.c > --- a/tools/libxl/libxl.c Mon Jan 23 14:44:00 2012 -0800 > +++ b/tools/libxl/libxl.c Mon Jan 23 14:44:02 2012 -0800 > @@ -466,6 +466,40 @@ > return ptr; > } > > +int libxl_domain_remus_start(libxl_ctx *ctx, libxl_domain_remus_info *info, > + uint32_t domid, int fd) > +{ > + GC_INIT(ctx); > + libxl_domain_type type = libxl__domain_type(gc, domid); > + int rc = 0; > + > + if (info == NULL) { > + LIBXL__LOG(ctx, LIBXL__LOG_ERROR, > + "No remus_info structure supplied for domain %d", domid); > + rc = -1; > + goto remus_fail; > + } > + > + /* TBD: Remus setup - i.e. attach qdisc, enable disk buffering, etc */ > + > + /* Point of no return */ > + rc = libxl__domain_suspend_common(gc, domid, fd, type, /* live */ 1, > + /* debug */ 0, info); > + > + /* > + * With Remus, if we reach this point, it means either > + * backup died or some network error occurred preventing us > + * from sending checkpoints. > + */ > + > + /* TBD: Remus cleanup - i.e. detach qdisc, release other > + * resources. > + */ > + remus_fail: > + GC_FREE; > + return rc; > +} > + > int libxl_domain_suspend(libxl_ctx *ctx, libxl_domain_suspend_info *info, > uint32_t domid, int fd) > {> diff -r 0446591bee86 -r 822536df4aec tools/libxl/xl_cmdimpl.c > --- a/tools/libxl/xl_cmdimpl.c Mon Jan 23 14:44:00 2012 -0800 > +++ b/tools/libxl/xl_cmdimpl.c Mon Jan 23 14:44:02 2012 -0800 > @@ -1814,7 +1814,7 @@ > * If we have daemonized then do not return to the caller -- this has > * already happened in the parent. > */ > - if ( !need_daemon ) > + if ( daemonize && !need_daemon )What is this change for?> exit(ret); > > return ret; > @@ -5853,6 +5853,175 @@ > return ret; > } > > +int main_remus_receive(int argc, char **argv)Can this not be done as a flag to migrate-receive? Likewise is there common code between the remus migrate and regular migration? Is there any reason that remus would not benefit from the xl migration protocol preamble?> +{ > + int rc; > + char *migration_domname; > + struct domain_create dom_info; > + > + signal(SIGPIPE, SIG_IGN); > + memset(&dom_info, 0, sizeof(dom_info)); > + dom_info.debug = 1; > + dom_info.no_incr_generationid = 1; > + dom_info.restore_file = "incoming checkpoint stream"; > + dom_info.migrate_fd = 0; /* stdin - will change in future*/ > + dom_info.migration_domname_r = &migration_domname; > + > + rc = create_domain(&dom_info);I''m totally failing to see where the Remus''ness of this create_domain comes from.> + if (rc < 0) { > + fprintf(stderr, "migration target (Remus): Domain creation failed" > + " (code %d) domid %u.\n", rc, domid); > + exit(-rc); > + } > + > + /* If we are here, it means that the sender (primary) has crashed. > + * If domain renaming fails, lets just continue (as we need the domain > + * to be up & dom names may not matter much, as long as its reachable > + * over network). > + * > + * If domain unpausing fails, destroy domain ? Or is it better to have > + * a consistent copy of the domain (memory, cpu state, disk) > + * on atleast one physical host ? Right now, lets just leave the domainat least> + * as is and let the Administrator decide (or troubleshoot). > + */ > + fprintf(stderr, "migration target: Remus Failover for domain %u\n", domid); > + if (migration_domname) { > + rc = libxl_domain_rename(ctx, domid, migration_domname, > + common_domname); > + if (rc) > + fprintf(stderr, "migration target (Remus): " > + "Failed to rename domain from %s to %s:%d\n", > + migration_domname, common_domname, rc); > + > + rc = libxl_domain_unpause(ctx, domid); > + if (rc) > + fprintf(stderr, "migration target (Remus): " > + "Failed to unpause domain %s (id: %u):%d\n", > + common_domname, domid, rc); > + } > + > + return -rc; > +} > + > +int main_remus(int argc, char **argv) > +{ > + int opt, rc; > + const char *ssh_command = "ssh"; > + char *host = NULL, *rune = NULL, *domain = NULL; > + > + int sendpipe[2], recvpipe[2]; > + int send_fd = -1, recv_fd = -1; > + pid_t child = -1; > + > + uint8_t *config_data; > + int config_len; > + > + libxl_domain_remus_info r_info; > + > + memset(&r_info, 0, sizeof(libxl_domain_remus_info)); > + /* Defaults */ > + r_info.interval = 200; > + r_info.blackhole = 0; > + r_info.compression = 1; > + > + while ((opt = def_getopt(argc, argv, "bui:s:", "remus", 2)) != -1) {main_migrate handles a bunch of other options which might also be interesting in the Remus case? Better would be to add all this as an option to the std migrate.> + switch (opt) { > + case 0: case 2: > + return opt; > + > + case ''i'': > + r_info.interval = atoi(optarg); > + break; > + case ''b'': > + r_info.blackhole = 1; > + break; > + case ''u'': > + r_info.compression = 0; > + break; > + case ''s'': > + ssh_command = optarg; > + break; > + } > + } > + > + domain = argv[optind]; > + host = argv[optind + 1]; > + > + if (r_info.blackhole) { > + find_domain(domain); > + send_fd = open("/dev/null", O_RDWR, 0644); > + if (send_fd < 0) { > + perror("failed to open /dev/null"); > + exit(-1); > + } > + } else {The following duplicates an awful lot of main_migrate which begs for them to either be the same underlying command or at least for some refactoring.> + > + if (!ssh_command[0]) { > + rune = host; > + } else { > + if (asprintf(&rune, "exec %s %s xl remus-receive", > + ssh_command, host) < 0) > + return 1; > + } > + > + save_domain_core_begin(domain, NULL, &config_data, &config_len); > + > + if (!config_len) { > + fprintf(stderr, "No config file stored for running domain and " > + "none supplied - cannot start remus.\n"); > + exit(1); > + } > + > + MUST( libxl_pipe(ctx, sendpipe) ); > + MUST( libxl_pipe(ctx, recvpipe) ); > + > + child = libxl_fork(ctx); > + if (child==-1) exit(1); > + > + /* TODO: change this to plain TCP socket based channel > + * instead of SSH.There are good reasons for using ssh though. However the user can supply another command which is not encrypted if they want to. Whatever happens here the same arguments would apply to non-remus migration so the changes should be done for both (or better the code should be made more common. Ian.
Shriram Rajagopalan
2012-Jan-25 23:07 UTC
Re: [PATCH 1 of 3] tools/libxl: QEMU device model suspend/resume
On Wed, Jan 25, 2012 at 3:08 AM, Ian Campbell <Ian.Campbell@citrix.com>wrote:> On Mon, 2012-01-23 at 22:46 +0000, rshriram@cs.ubc.ca wrote: > > # HG changeset patch > > # User Shriram Rajagopalan <rshriram@cs.ubc.ca> > > # Date 1327358638 28800 > > # Node ID 11fb1dfda7de4d6759dec87d80cd16cf137f7369 > > # Parent 80fdf2182bc62ca358ba2f1a3513b47a4f8d9dfd > > tools/libxl: QEMU device model suspend/resume > > > > * Refactor the libxl__domain_save_device_model. > > * Add support for suspend/resume QEMU device model > > (both traditional xen and QMP). > > > > > > Signed-off-by: Shriram Rajagopalan <rshriram@cs.ubc.ca> > > > > diff -r 80fdf2182bc6 -r 11fb1dfda7de tools/libxl/libxl.c > > --- a/tools/libxl/libxl.c Sat Jan 21 17:15:40 2012 +0000 > > +++ b/tools/libxl/libxl.c Mon Jan 23 14:43:58 2012 -0800 > > @@ -477,7 +477,7 @@ > > > > rc = libxl__domain_suspend_common(gc, domid, fd, type, live, debug); > > if (!rc && type == LIBXL_DOMAIN_TYPE_HVM) > > - rc = libxl__domain_save_device_model(gc, domid, fd); > > + rc = libxl__domain_save_device_model(gc, domid, fd, /* No Remus > */ 0); > > GC_FREE; > > return rc; > > } > > diff -r 80fdf2182bc6 -r 11fb1dfda7de tools/libxl/libxl_dom.c > > --- a/tools/libxl/libxl_dom.c Sat Jan 21 17:15:40 2012 +0000 > > +++ b/tools/libxl/libxl_dom.c Mon Jan 23 14:43:58 2012 -0800 > > @@ -534,6 +534,53 @@ > > return 0; > > } > > > > +static int libxl__remus_domain_suspend_qemu(libxl__gc *gc, uint32_t > domid) > > +{ > > + > > + switch (libxl__device_model_version_running(gc, domid)) { > > + case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL: { > > + char *path = NULL; > > + path = libxl__sprintf(gc, > "/local/domain/0/device-model/%d/command", > > + domid); > > + libxl__xs_write(gc, XBT_NULL, path, "save"); > > + libxl__wait_for_device_model(gc, domid, "paused", NULL, NULL, > NULL); > > This and libxl__remus_domain_resume_qemu, qemu_pci_add_xenstore, > qemu_pci_remove_xenstore, libxl__domain_save_device_model and > libxl_domain_unpause would probably benefit from a helper function to > send a command to a traditional qemu. > > > + break; > > + } > > + case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN: > > + /* Stop QEMU */ > > + if (libxl__qmp_stop(gc, domid)) > > + return ERROR_FAIL; > > + break; > > + default: > > + return ERROR_INVAL; > > + } > > + > > + return 0; > > +} > > + > > +static int libxl__remus_domain_resume_qemu(libxl__gc *gc, uint32_t > domid) > > +{ > > + > > + switch (libxl__device_model_version_running(gc, domid)) { > > + case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL: { > > + char *path = NULL; > > + path = libxl__sprintf(gc, > "/local/domain/0/device-model/%d/command", > > + domid); > > + libxl__xs_write(gc, XBT_NULL, path, "continue"); > > + break; > > + } > > + case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN: > > + /* Stop QEMU */ > > + if (libxl__qmp_resume(gc, domid)) > > + return ERROR_FAIL; > > + break; > > + default: > > + return ERROR_INVAL; > > + } > > + > > + return 0; > > +} > > + > > int libxl__domain_suspend_common(libxl__gc *gc, uint32_t domid, int fd, > > libxl_domain_type type, > > int live, int debug) > > @@ -620,7 +667,7 @@ > > return rc; > > } > > > > -int libxl__domain_save_device_model(libxl__gc *gc, uint32_t domid, int > fd) > > +int libxl__domain_save_device_model(libxl__gc *gc, uint32_t domid, int > fd, int remus) > > { > > libxl_ctx *ctx = libxl__gc_owner(gc); > > int ret, fd2 = -1, c; > > @@ -631,13 +678,12 @@ > > > > switch (libxl__device_model_version_running(gc, domid)) { > > case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL: { > > - char *path = NULL; > > - LIBXL__LOG(ctx, LIBXL__LOG_DEBUG, > > - "Saving device model state to %s", filename); > > - path = libxl__sprintf(gc, > "/local/domain/0/device-model/%d/command", > > - domid); > > - libxl__xs_write(gc, XBT_NULL, path, "save"); > > - libxl__wait_for_device_model(gc, domid, "paused", NULL, NULL, > NULL); > > + /* For Remus,we issue suspend_qemu call separately */ > > Why? > >In remus, save and transmit device model phases are decoupled. This code (sending the saved device model to the target) is executed after the domain is resumed. The control flow in current Remus is like this: 1 suspend vm 1.1 suspend qemu (save command, that saves the device model to a file) 2 copy out dirty memory into a buffer 3 resume vm 3.1 resume qemu 4 send the data in the buffer 5 send the saved device model (xc_domain_restore extracts this from the tailbuf)> How does this interact with Stefano''s XC_SAVEID_TOOLSTACK patches? > >I couldnt find anything online by this name. Can you point me to the patch series? I think some sort of high level description of the control flow used by> Remus and how/why it differs from normal save/restore would be useful > for review. > > > + if (!remus) { > > + c = libxl__remus_domain_suspend_qemu(gc, domid); > > It seems odd to call this function libxl__remus_FOO and the use it > when !remus. The function doesn''t look to be inherently specific to > either remus or !remus anyhow. > > > + if (c) > > + return c; > > + } > > break; > > } > > case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN: > > @@ -668,8 +714,9 @@ > > qemu_state_len = st.st_size; > > LIBXL__LOG(ctx, LIBXL__LOG_DEBUG, "Qemu state is %d bytes\n", > qemu_state_len); > > > > - ret = libxl_write_exactly(ctx, fd, QEMU_SIGNATURE, > strlen(QEMU_SIGNATURE), > > - "saved-state file", "qemu signature"); > > + ret = libxl_write_exactly(ctx, fd, remus ? REMUS_QEMU_SIGNATURE : > QEMU_SIGNATURE, > > + remus ? strlen(REMUS_QEMU_SIGNATURE): > strlen(QEMU_SIGNATURE), > > + "saved-state file", "qemu signature"); > > QEMU_SIGNATURE is "DeviceModelRecord0002" which I believe libxc treats > identically to the Remus signature? > >Thanks. Yep, the remus_signature is redundant.> Again consideration of how this interacts with Stefano''s patch would be > useful. If possible we''d quite like to pull the qemu-restore our of > xc_domain_restore for consistency with how xc_domain_save saves it, the > current layering is quite mad. > > Ian. > >_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Shriram Rajagopalan
2012-Jan-25 23:15 UTC
Re: [PATCH 2 of 3] tools/libxl: suspend/postflush/commit callbacks
On Wed, Jan 25, 2012 at 3:22 AM, Ian Campbell <Ian.Campbell@citrix.com>wrote:> On Mon, 2012-01-23 at 22:46 +0000, rshriram@cs.ubc.ca wrote: > > # HG changeset patch > > # User Shriram Rajagopalan <rshriram@cs.ubc.ca> > > # Date 1327358640 28800 > > # Node ID 0446591bee86eb4e767d75b70c885549c7a3cfef > > # Parent 11fb1dfda7de4d6759dec87d80cd16cf137f7369 > > tools/libxl: suspend/postflush/commit callbacks > > > > * Add libxl callbacks for Remus checkpoint suspend, postflush (aka > resume) > > and checkpoint commit callbacks. > > * suspend callback just bounces off > libxl__domain_suspend_common_callback > > * resume callback directly calls xc_domain_resume instead of re-using > > libxl_domain_resume (as the latter does not set the fast suspend > argument > > to xc_domain_resume - aka suspend_cancel support). > > * commit callback just sleeps for the checkpoint interval (for the > moment). > > > > * Future patches will augument these callbacks with more > functionalities like > > issuing network buffer plug/unplug commands, disk checkpoint > commands, etc. > > > > Signed-off-by: Shriram Rajagopalan <rshriram@cs.ubc.ca> > > > > > > +static int libxl__remus_domain_resume_callback(void *data) > > +{ > > + struct suspendinfo *si = data; > > + libxl_ctx *ctx = libxl__gc_owner(si->gc); > > + > > + /* Assumes that SUSPEND_CANCEL is available - just like > > + * xm version of Remus. Without that, remus wont work. > > + * Since libxl_domain_resume calls xc_domain_resume with > > + * fast_suspend = 0, we cant re-use that api. > > You could modify that API which would be better than duplicating its > content. I think the "fast" flag is useful to expose to users of libxl > but if not then you could refactor into an internal function which takes > the flag and call it from both here and libxl_domain_resume. > >I didnt want to touch any existing public interfaces, structures, etc, especially something so common like domain_resume. While users of xl resume (or unpause) wont see any difference, other libraries using the libxl API might be affected. But I am in favor of "exposing" the flag instead of hiding it in an internal function, if there are no objections :). shriram _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Shriram Rajagopalan
2012-Jan-26 00:09 UTC
Re: [PATCH 3 of 3] tools/libxl: xl remus/remus-receive commands
On Wed, Jan 25, 2012 at 3:52 AM, Ian Campbell <Ian.Campbell@citrix.com>wrote:> > > > > diff -r 0446591bee86 -r 822536df4aec tools/libxl/xl_cmdimpl.c > > --- a/tools/libxl/xl_cmdimpl.c Mon Jan 23 14:44:00 2012 -0800 > > +++ b/tools/libxl/xl_cmdimpl.c Mon Jan 23 14:44:02 2012 -0800 > > @@ -1814,7 +1814,7 @@ > > * If we have daemonized then do not return to the caller -- this > has > > * already happened in the parent. > > */ > > - if ( !need_daemon ) > > + if ( daemonize && !need_daemon ) > > What is this change for? > >Sorry my bad. I should have sent it out as a separate patch. Well, if you look at the control flow in the create_domain function , you would realize that this is a bug. create_domain() { int daemonize = dom_info->daemonize; .... int need_daemon = daemonize; restore_domain() or create_new() if (!daemonize && !monitor) goto out; if (need_daemon) { fork() daemon().. need_daemon = 0; } ... out: /* If we have daemonized then do not return to the caller -- this has * already happened in the parent. */ if ( !need_daemon ) exit() } So, even if one explicitly sets daemonize to 0, the function will never return to the caller. I needed it to return to the caller (remus_receiver() ), to take further actions. > exit(ret);> > > > return ret; > > @@ -5853,6 +5853,175 @@ > > return ret; > > } > > > > +int main_remus_receive(int argc, char **argv) > > Can this not be done as a flag to migrate-receive? Likewise is there > common code between the remus migrate and regular migration? > >It can but it would basically terminate after the create_domain() call in the migrate_receive function. I would have to have something like if (remus) { dont wait for permission to start domain try renaming domain unpause domain return } and a bunch of if (remus) flags peppered around the migrate receive function. Having it in a separate function allows me to add support for creating a TCP channel that receives the checkpoints, add hooks or call external libraries/binaries that allows the user to check for split-brain before resuming the domain. Is there any reason that remus would not benefit from the xl migration> protocol preamble? > >No specific reason.> +{ > > + int rc; > > + char *migration_domname; > > + struct domain_create dom_info; > > + > > + signal(SIGPIPE, SIG_IGN); > > + memset(&dom_info, 0, sizeof(dom_info)); > > + dom_info.debug = 1; > > + dom_info.no_incr_generationid = 1; > > + dom_info.restore_file = "incoming checkpoint stream"; > > + dom_info.migrate_fd = 0; /* stdin - will change in future*/ > > + dom_info.migration_domname_r = &migration_domname; > > + > > + rc = create_domain(&dom_info); > > I''m totally failing to see where the Remus''ness of this create_domain > comes from. > >dom_info.daemonize = 0, .monitor = 0 and .paused = 0; As I said earlier, this part is probably the only duplication between migrate-receive and remus-receive. With Xend, there was no separate remus receiver, to control what happens on the backup host. Now that I have an opportunity to incorporate remus into the toolstack from start, I thought I will keep the remus control flow as separate as possible and not have to worry about breaking live migration (nor complicate its code).> + if (rc < 0) { > > + fprintf(stderr, "migration target (Remus): Domain creation > failed" > > + " (code %d) domid %u.\n", rc, domid); > > + exit(-rc); > > + } > > + > > + /* If we are here, it means that the sender (primary) has crashed. > > + * If domain renaming fails, lets just continue (as we need the > domain > > + * to be up & dom names may not matter much, as long as its > reachable > > + * over network). > > + * > > + * If domain unpausing fails, destroy domain ? Or is it better to > have > > + * a consistent copy of the domain (memory, cpu state, disk) > > + * on atleast one physical host ? Right now, lets just leave the > domain > > at least > > > + * as is and let the Administrator decide (or troubleshoot). > > + */ > > + fprintf(stderr, "migration target: Remus Failover for domain %u\n", > domid); > > + if (migration_domname) { > > + rc = libxl_domain_rename(ctx, domid, migration_domname, > > + common_domname); > > + if (rc) > > + fprintf(stderr, "migration target (Remus): " > > + "Failed to rename domain from %s to %s:%d\n", > > + migration_domname, common_domname, rc); > > + > > + rc = libxl_domain_unpause(ctx, domid); > > + if (rc) > > + fprintf(stderr, "migration target (Remus): " > > + "Failed to unpause domain %s (id: %u):%d\n", > > + common_domname, domid, rc); > > + } > > + > > + return -rc; > > +} > > + > > +int main_remus(int argc, char **argv) > > +{ > > + int opt, rc; > > + const char *ssh_command = "ssh"; > > + char *host = NULL, *rune = NULL, *domain = NULL; > > + > > + int sendpipe[2], recvpipe[2]; > > + int send_fd = -1, recv_fd = -1; > > + pid_t child = -1; > > + > > + uint8_t *config_data; > > + int config_len; > > + > > + libxl_domain_remus_info r_info; > > + > > + memset(&r_info, 0, sizeof(libxl_domain_remus_info)); > > + /* Defaults */ > > + r_info.interval = 200; > > + r_info.blackhole = 0; > > + r_info.compression = 1; > > + > > + while ((opt = def_getopt(argc, argv, "bui:s:", "remus", 2)) != -1) { > > main_migrate handles a bunch of other options which might also be > interesting in the Remus case? Better would be to add all this as an > option to the std migrate. >Negative. It would look totally unintuitive to have a checkpoint interval option (or blackhole replication option) in a live migration command. To an end user, remus is just a HA system and live migration is for moving VMs around. What is the point in trying to educate him/her that remus is basically "continuous live migration" ? Imagine a command like "xl migrate --R --i 100 --N Guest Backup" and help options like "-R enable remus HA -i remus checkpoint interval -N disable network buffering in Remus" To you and me, as devs, it may not matter. But to common users who are mostly going to use just live migration, this would be utterly confusing.> > + switch (opt) { > > + case 0: case 2: > > + return opt; > > + > > + case ''i'': > > + r_info.interval = atoi(optarg); > > + break; > > + case ''b'': > > + r_info.blackhole = 1; > > + break; > > + case ''u'': > > + r_info.compression = 0; > > + break; > > + case ''s'': > > + ssh_command = optarg; > > + break; > > + } > > + } > > + > > + domain = argv[optind]; > > + host = argv[optind + 1]; > > + > > + if (r_info.blackhole) { > > + find_domain(domain); > > + send_fd = open("/dev/null", O_RDWR, 0644); > > + if (send_fd < 0) { > > + perror("failed to open /dev/null"); > > + exit(-1); > > + } > > + } else { > > The following duplicates an awful lot of main_migrate which begs for > them to either be the same underlying command or at least for some > refactoring. > >I agree.> > + > > + if (!ssh_command[0]) { > > + rune = host; > > + } else { > > + if (asprintf(&rune, "exec %s %s xl remus-receive", > > + ssh_command, host) < 0) > > + return 1; > > + } > > + > > + save_domain_core_begin(domain, NULL, &config_data, &config_len); > > + > > + if (!config_len) { > > + fprintf(stderr, "No config file stored for running domain > and " > > + "none supplied - cannot start remus.\n"); > > + exit(1); > > + } > > + > > + MUST( libxl_pipe(ctx, sendpipe) ); > > + MUST( libxl_pipe(ctx, recvpipe) ); > > + > > + child = libxl_fork(ctx); > > + if (child==-1) exit(1); > > + > > + /* TODO: change this to plain TCP socket based channel > > + * instead of SSH. > > There are good reasons for using ssh though. However the user can supply > another command which is not encrypted if they want to. >Whatever happens here the same arguments would apply to non-remus> migration so the changes should be done for both (or better the code > should be made more common. > > Ian. > > >_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Ian Campbell
2012-Jan-26 09:18 UTC
Re: [PATCH 1 of 3] tools/libxl: QEMU device model suspend/resume
On Wed, 2012-01-25 at 23:07 +0000, Shriram Rajagopalan wrote:> On Wed, Jan 25, 2012 at 3:08 AM, Ian Campbell> > - libxl__xs_write(gc, XBT_NULL, path, "save"); > > - libxl__wait_for_device_model(gc, domid, "paused", > NULL, NULL, NULL); > > + /* For Remus,we issue suspend_qemu call separately > */ > > > Why? > > > In remus, save and transmit device model phases are decoupled. This > code (sending the saved device model to the target) is executed after > the domain is resumed. > > The control flow in current Remus is like this: > 1 suspend vm > 1.1 suspend qemu (save command, that saves the device model to a > file) > 2 copy out dirty memory into a buffer > 3 resume vm > 3.1 resume qemu > 4 send the data in the buffer > 5 send the saved device model > (xc_domain_restore extracts this from the tailbuf)Is this basic flow incompatible with regular save? Isn''t it equivalent to to omitting phase 3? (I know regular save is structured a bit different right now, but it could be done this way to minimise the amount of Remus specific code). Which "struct save_callbacks" members do each of these stages correspond to? I think a high level overview of the ordering of operations and the usage of these hooks would make an ideal comment just above the definition of "struct save_callbacks", what do you think?> How does this interact with Stefano''s XC_SAVEID_TOOLSTACK > patches? > > > I couldnt find anything online by this name. Can you point me to the > patch series?"libxl: save/restore qemu physmap" posted last Friday. We would also like to see the qemu restore done in a callback hook for symmetry with the save hook. Ian.
Ian Campbell
2012-Jan-26 09:23 UTC
Re: [PATCH 2 of 3] tools/libxl: suspend/postflush/commit callbacks
On Wed, 2012-01-25 at 23:15 +0000, Shriram Rajagopalan wrote:> On Wed, Jan 25, 2012 at 3:22 AM, Ian Campbell > <Ian.Campbell@citrix.com> wrote: > On Mon, 2012-01-23 at 22:46 +0000, rshriram@cs.ubc.ca wrote: > > # HG changeset patch > > # User Shriram Rajagopalan <rshriram@cs.ubc.ca> > > # Date 1327358640 28800 > > # Node ID 0446591bee86eb4e767d75b70c885549c7a3cfef > > # Parent 11fb1dfda7de4d6759dec87d80cd16cf137f7369 > > tools/libxl: suspend/postflush/commit callbacks > > > > * Add libxl callbacks for Remus checkpoint suspend, > postflush (aka resume) > > and checkpoint commit callbacks. > > * suspend callback just bounces off > libxl__domain_suspend_common_callback > > * resume callback directly calls xc_domain_resume instead > of re-using > > libxl_domain_resume (as the latter does not set the fast > suspend argument > > to xc_domain_resume - aka suspend_cancel support). > > * commit callback just sleeps for the checkpoint interval > (for the moment). > > > > * Future patches will augument these callbacks with more > functionalities like > > issuing network buffer plug/unplug commands, disk > checkpoint commands, etc. > > > > Signed-off-by: Shriram Rajagopalan <rshriram@cs.ubc.ca> > > > > > > > +static int libxl__remus_domain_resume_callback(void *data) > > +{ > > + struct suspendinfo *si = data; > > + libxl_ctx *ctx = libxl__gc_owner(si->gc); > > + > > + /* Assumes that SUSPEND_CANCEL is available - just like > > + * xm version of Remus. Without that, remus wont work. > > + * Since libxl_domain_resume calls xc_domain_resume > with > > + * fast_suspend = 0, we cant re-use that api. > > > You could modify that API which would be better than > duplicating its > content. I think the "fast" flag is useful to expose to users > of libxl > but if not then you could refactor into an internal function > which takes > the flag and call it from both here and libxl_domain_resume. > > > > I didnt want to touch any existing public interfaces, structures, etc, > especially something so common like domain_resume.Until Xen 4.2 it is fine to improve the public interface of libxl, including adding a new parameter here if you think that is best.> While users of xl resume (or unpause) wont see any difference, other > libraries using the libxl API might be affected.So long as they can still ask for the current behaviour that is fine.> But I am in favor of "exposing" the flag instead of hiding it in an > internal function, if there are no objections :).None. Ian.
Ian Campbell
2012-Jan-26 10:37 UTC
Re: [PATCH 3 of 3] tools/libxl: xl remus/remus-receive commands
On Thu, 2012-01-26 at 00:09 +0000, Shriram Rajagopalan wrote:> On Wed, Jan 25, 2012 at 3:52 AM, Ian Campbell > > Sorry my bad. I should have sent it out as a separate patch. Well, if > you look at the control flow in the create_domain function , you would > realize that this is a bug. > > create_domain() > { > int daemonize = dom_info->daemonize; > .... > int need_daemon = daemonize; > > restore_domain() or create_new() > > if (!daemonize && !monitor) goto out; > > if (need_daemon) { > fork() > daemon().. > need_daemon = 0; > } > ... > out: > /* If we have daemonized then do not return to the caller -- this has > * already happened in the parent. > */ > if ( !need_daemon ) > exit() > > } > > > So, even if one explicitly sets daemonize to 0, the function will never return to the > caller. I needed it to return to the caller (remus_receiver() ), to take further actions.Makes sense. Although the default for remus-receive should be to daemonize otherwise when the domain is resumed you won''t get any automatic reboot/shutdown handling etc.> > exit(ret); > > > > return ret; > > @@ -5853,6 +5853,175 @@ > > return ret; > > } > > > > +int main_remus_receive(int argc, char **argv) > > > Can this not be done as a flag to migrate-receive? Likewise is > there common code between the remus migrate and regular > migration? > > > It can but it would basically terminate after the create_domain() call > in the migrate_receive function. I would have to have something like > if (remus) { > dont wait for permission to start domain > try renaming domain > unpause domain > return > }I think actually it becomes if (!remus) wait for permission to start domain try renaming domain unpause domain However I take your point that there isn''t that much in common after create_domain once you remove the post migration interlock protocol.> and a bunch of if (remus) flags peppered around the migrate receive > function. > > Having it in a separate function allows me to add support for creating > a TCP channel that receives the checkpoints,If this is useful for Remus then I can''t see any reason it wouldn''t also be useful for regular migration.> This add hooks or call external libraries/binaries that allows the > user to check for split-brain before resuming the domain.Hooks are OK, if they are not used by standard migration they can be NULL. They should of course be properly documented...> > +{ > > + int rc; > > + char *migration_domname; > > + struct domain_create dom_info; > > + > > + signal(SIGPIPE, SIG_IGN); > > + memset(&dom_info, 0, sizeof(dom_info)); > > + dom_info.debug = 1;BTW, this should be an option.> > + dom_info.no_incr_generationid = 1; > > + dom_info.restore_file = "incoming checkpoint stream"; > > + dom_info.migrate_fd = 0; /* stdin - will change in > future*/ > > + dom_info.migration_domname_r = &migration_domname; > > + > > + rc = create_domain(&dom_info); > > > I''m totally failing to see where the Remus''ness of this > create_domain comes from.> dom_info.daemonize = 0, .monitor = 0 and .paused = 0;And how do those options combine to == Remus? migrate doesn''t currently have a "paused" option but it is not beyond the realms of possibility that such a thing might be desirable in the future.> As I said earlier, this part is probably the only duplication between > migrate-receive and remus-receive. With Xend, there was no separate > remus receiver, to control what happens on the backup host. Now that I > have an opportunity to incorporate remus into the toolstack from > start, I thought I will keep the remus control flow as separate as > possible and not have to worry about breaking live migration (nor > complicate its code).I think this is the wrong approach and motivation. We want to combine the two as much as possible to reduce the amount of Remus specific code which requires maintenance. I don''t think supporting Remus via the migration code paths will unduly complicate things. [...]> > + while ((opt = def_getopt(argc, argv, "bui:s:", "remus", > 2)) != -1) { > > > main_migrate handles a bunch of other options which might also > be > interesting in the Remus case? Better would be to add all this > as an option to the std migrate. > > > Negative. It would look totally unintuitive to have a checkpoint > interval option (or blackhole replication option) in a live migration > command. To an end user, remus is just a HA system and live migration > is for moving VMs around. What is the point in trying to educate > him/her that remus is basically "continuous live migration" ?You''ve convinced me on the user interface point. However I think in terms of implementation we should aim for as much common code as possible. Likewise even if xl remus and xl migrate are separate commands they should support the same common options where appropriate (e.g. -F,-d,-e, possible -C etc). I''m undecided where *-receive (which is not user visible) should be separate or not -- this is the sort of thing which could in principle be negotiated as part of the protocol. Based on the above it''s not clear how much code sharing there would actually be. I expect we can revisit this once the bits which can should be shared actually are. Ian.