Hi all,

The following small set of patches introduces an event-channel-based mechanism for suspending guests during the final stage of live migration. They were written in support of our Remus high availability project, presented at NSDI this April. The full paper is available here:

http://www.usenix.org/event/nsdi08/tech/cully.html

Because Remus takes checkpoints many times per second, it cannot afford the tens of milliseconds currently spent in signalling between the checkpoint process and the target domain (largely due to xenstore). So this patch set uses event channels instead of xenstore watches to perform this signalling, greatly reducing the amount of time spent waiting for message delivery and process scheduling. It is a revised version of the prototype patches originally submitted here:

http://lists.xensource.com/archives/html/xen-devel/2007-05/msg00276.html

This code is backwards-compatible with unmodified guest kernels (it simply falls back to the current xenstore-based notification mechanism for these guests).

I've added timestamps to the suspend_and_state function in xc_domain_save, before and after the suspend callback. The difference in execution times (5 runs, idle dom0, idle guest, one vcpu for dom0 and domU, each pinned to separate hyperthreads on a P4):

Old method: 84ms, 87ms, 92ms, 89ms, 92ms
New method: 1ms, 1ms, 1ms, 1ms, 1ms

Could this code be considered for the upcoming 3.3 release?

Thanks,
Brendan
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
Brendan Cully
2008-Jul-03  23:42 UTC
[Xen-devel] [PATCH 1/3] Set up an event channel to accept suspend requests.
# HG changeset patch
# User Brendan Cully <brendan@cs.ubc.ca>
# Date 1215120859 25200
# Node ID af2845b0a746f9be5d7d087951ef99c0ca4a1687
# Parent  88e01408ec0da0c406fde22a0003bd870144f180
Set up an event channel to accept suspend requests.
This is much faster than going through xenstore.
Signed-off-by: Brendan Cully <brendan@cs.ubc.ca>
diff --git a/drivers/xen/core/machine_reboot.c b/drivers/xen/core/machine_reboot.c
--- a/drivers/xen/core/machine_reboot.c
+++ b/drivers/xen/core/machine_reboot.c
@@ -25,6 +25,8 @@
  */
 void (*pm_power_off)(void);
 EXPORT_SYMBOL(pm_power_off);
+
+int setup_suspend_evtchn(void);
 
 void machine_emergency_restart(void)
 {
@@ -241,6 +243,7 @@
 	if (!suspend_cancelled) {
 		xencons_resume();
 		xenbus_resume();
+		setup_suspend_evtchn();
 	} else {
 		xenbus_suspend_cancel();
 	}
diff --git a/drivers/xen/core/reboot.c b/drivers/xen/core/reboot.c
--- a/drivers/xen/core/reboot.c
+++ b/drivers/xen/core/reboot.c
@@ -7,6 +7,7 @@
 #include <linux/sysrq.h>
 #include <asm/hypervisor.h>
 #include <xen/xenbus.h>
+#include <xen/evtchn.h>
 #include <linux/kmod.h>
 #include <linux/slab.h>
 #include <linux/workqueue.h>
@@ -217,6 +218,36 @@
 	.callback = sysrq_handler
 };
 
+static irqreturn_t suspend_int(int irq, void* dev_id, struct pt_regs *ptregs)
+{
+	shutting_down = SHUTDOWN_SUSPEND;
+	schedule_work(&shutdown_work);
+
+	return IRQ_HANDLED;
+}
+
+int setup_suspend_evtchn(void)
+{
+	static int irq = -1;
+	int port;
+	char portstr[5]; /* 1024 max */
+
+	if (irq > 0)
+		unbind_from_irqhandler(irq, NULL);
+
+	irq = bind_listening_port_to_irqhandler(0, suspend_int, 0, "suspend",
+						NULL);
+	if (irq <= 0) {
+		return -1;
+	}
+	port = irq_to_evtchn_port(irq);
+	printk(KERN_INFO "suspend: event channel %d\n", port);
+	sprintf(portstr, "%d", port);
+	xenbus_write(XBT_NIL, "device/suspend", "event-channel", portstr);
+
+	return 0;
+}
+
 static int setup_shutdown_watcher(void)
 {
 	int err;
@@ -234,6 +265,13 @@
 	err = register_xenbus_watch(&sysrq_watch);
 	if (err) {
 		printk(KERN_ERR "Failed to set sysrq watcher\n");
+		return err;
+	}
+
+	/* suspend event channel */
+	err = setup_suspend_evtchn();
+	if (err) {
+		printk(KERN_ERR "Failed to register suspend event channel\n");
 		return err;
 	}
 
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
Brendan Cully
2008-Jul-03  23:43 UTC
[Xen-devel] [PATCH 2/3] Add facility to get notification of domain suspend by event channel.
# HG changeset patch
# User Brendan Cully <brendan@cs.ubc.ca>
# Date 1215103876 25200
# Node ID ca55cf40cd134b41812c8c4b851d7df51031de07
# Parent  52a388ec09f852ffd7e42a71593c63f21a7b9fad
Add facility to get notification of domain suspend by event channel.
This event channel will be notified when the domain transitions to the
suspended state, which can be much faster than raising VIRQ_DOM_EXC
and waiting for the notification to be propagated via xenstore.
No attempt is made here to prevent multiple subscribers (last one
wins), or to detect that the subscriber has gone away. Userspace tools
should take care.
Signed-off-by: Brendan Cully <brendan@cs.ubc.ca>
diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c
--- a/tools/libxc/xc_domain.c
+++ b/tools/libxc/xc_domain.c
@@ -981,6 +981,17 @@
     return do_domctl(xc_handle, &domctl);
 }
 
+int xc_dom_subscribe(int xc_handle, domid_t dom, evtchn_port_t port)
+{
+    DECLARE_DOMCTL;
+
+    domctl.cmd = XEN_DOMCTL_subscribe;
+    domctl.domain = dom;
+    domctl.u.subscribe.port = port;
+
+    return do_domctl(xc_handle, &domctl);
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h
+++ b/tools/libxc/xenctrl.h
@@ -810,6 +810,12 @@
 
 int xc_flask_op(int xc_handle, flask_op_t *op);
 
+/*
+ * Subscribe to state changes in a domain via evtchn.
+ * Returns -1 on failure, in which case errno will be set appropriately.
+ */
+int xc_dom_subscribe(int xc_handle, domid_t domid, evtchn_port_t port);
+
 /**************************
  * GRANT TABLE OPERATIONS *
  **************************/
diff --git a/xen/common/domain.c b/xen/common/domain.c
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -97,7 +97,13 @@
             return;
 
     d->is_shut_down = 1;
-    send_guest_global_virq(dom0, VIRQ_DOM_EXC);
+    if ( d->shutdown_code == SHUTDOWN_suspend
+         && d->suspend_evtchn > 0 )
+    {
+        evtchn_set_pending(dom0->vcpu[0], d->suspend_evtchn);
+    }
+    else
+        send_guest_global_virq(dom0, VIRQ_DOM_EXC);
 }
 
 static void vcpu_check_shutdown(struct vcpu *v)
diff --git a/xen/common/domctl.c b/xen/common/domctl.c
--- a/xen/common/domctl.c
+++ b/xen/common/domctl.c
@@ -824,6 +824,21 @@
     }
     break;
 
+    case XEN_DOMCTL_subscribe:
+    {
+        struct domain *d;
+
+        ret = -ESRCH;
+        d = rcu_lock_domain_by_id(op->domain);
+        if ( d != NULL )
+        {
+            d->suspend_evtchn = op->u.subscribe.port;
+            rcu_unlock_domain(d);
+            ret = 0;
+        }
+    }
+    break;
+
     default:
         ret = arch_do_domctl(op, u_domctl);
         break;
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -600,6 +600,13 @@
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_cpuid_t);
 #endif
 
+#define XEN_DOMCTL_subscribe          29
+struct xen_domctl_subscribe {
+    uint32_t port; /* IN */
+};
+typedef struct xen_domctl_subscribe xen_domctl_subscribe_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_subscribe_t);
+
 struct xen_domctl {
     uint32_t cmd;
     uint32_t interface_version; /* XEN_DOMCTL_INTERFACE_VERSION */
@@ -638,6 +645,7 @@
         struct xen_domctl_ext_vcpucontext   ext_vcpucontext;
         struct xen_domctl_set_opt_feature   set_opt_feature;
         struct xen_domctl_set_target        set_target;
+        struct xen_domctl_subscribe         subscribe;
 #if defined(__i386__) || defined(__x86_64__)
         struct xen_domctl_cpuid             cpuid;
 #endif
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -208,6 +208,10 @@
     bool_t           is_shutting_down; /* in process of shutting down? */
     bool_t           is_shut_down;     /* fully shut down? */
     int              shutdown_code;
+
+    /* If this is not 0, send suspend notification here instead of
+     * raising DOM_EXC */
+    int              suspend_evtchn;
 
     atomic_t         pause_count;
 
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
Brendan Cully
2008-Jul-03  23:44 UTC
[Xen-devel] [PATCH 3/3] Teach xc_save to use event-channel-based domain suspend if available.
# HG changeset patch
# User Brendan Cully <brendan@cs.ubc.ca>
# Date 1215120748 25200
# Node ID 3e1d3cb03e8bf2b2f44066e6d7ec9729995d1db1
# Parent  ca55cf40cd134b41812c8c4b851d7df51031de07
Teach xc_save to use event-channel-based domain suspend if available.
If the guest provides a suspend event channel through xenstore,
xc_save will use it in preference to the old xenstore-based method.
Xend is still informed when the domain has suspended so that it can
perform device migration in parallel with last-round migration.
Signed-off-by: Brendan Cully <brendan@cs.ubc.ca>
diff --git a/tools/python/xen/xend/XendCheckpoint.py b/tools/python/xen/xend/XendCheckpoint.py
--- a/tools/python/xen/xend/XendCheckpoint.py
+++ b/tools/python/xen/xend/XendCheckpoint.py
@@ -108,6 +108,7 @@
                 log.debug("Suspending %d ...", dominfo.getDomid())
                 dominfo.shutdown('suspend')
                 dominfo.waitForShutdown()
+            if line in ('suspend', 'suspended'):
                 dominfo.migrateDevices(network, dst, DEV_MIGRATE_STEP2,
                                        domain_name)
                 log.info("Domain %d suspended.", dominfo.getDomid())
@@ -116,6 +117,7 @@
                 if hvm:
                     dominfo.image.saveDeviceModel()
 
+            if line == "suspend":
                 tochild.write("done\n")
                 tochild.flush()
                 log.debug('Written done')
diff --git a/tools/xcutils/xc_save.c b/tools/xcutils/xc_save.c
--- a/tools/xcutils/xc_save.c
+++ b/tools/xcutils/xc_save.c
@@ -23,11 +23,18 @@
 #include <xenctrl.h>
 #include <xenguest.h>
 
+static struct suspendinfo {
+    int xce; /* event channel handle */
+
+    int suspend_evtchn;
+    int suspended_evtchn;
+} si;
+
 /**
  * Issue a suspend request through stdout, and receive the acknowledgement
  * from stdin.  This is handled by XendCheckpoint in the Python layer.
  */
-static int suspend(int domid)
+static int compat_suspend(int domid)
 {
     char ans[30];
 
@@ -36,6 +43,131 @@
 
     return (fgets(ans, sizeof(ans), stdin) != NULL &&
             !strncmp(ans, "done\n", 5));
+}
+
+static int suspend_evtchn_release(int xc, int domid)
+{
+    if (si.suspended_evtchn >= 0) {
+	xc_dom_subscribe(xc, domid, 0);
+	xc_evtchn_unbind(si.xce, si.suspended_evtchn);
+	si.suspended_evtchn = -1;
+    }
+    if (si.suspend_evtchn >= 0) {
+	xc_evtchn_unbind(si.xce, si.suspend_evtchn);
+	si.suspend_evtchn = -1;
+    }
+    if (si.xce >= 0) {
+	xc_evtchn_close(si.xce);
+	si.xce = -1;
+    }
+
+    return 0;
+}
+
+static int suspend_evtchn_init(int xc, int domid)
+{
+    struct xs_handle *xs;
+    char path[128];
+    char *portstr;
+    unsigned int plen;
+    int port;
+    int rc;
+
+    si.xce = -1;
+    si.suspend_evtchn = -1;
+    si.suspended_evtchn = -1;
+
+    xs = xs_daemon_open();
+    if (!xs) {
+	errx(1, "failed to get xenstore handle");
+	return -1;
+    }
+    sprintf(path, "/local/domain/%d/device/suspend/event-channel", domid);
+    portstr = xs_read(xs, XBT_NULL, path, &plen);
+    xs_daemon_close(xs);
+
+    if (!portstr || !plen) {
+	warnx("could not read suspend event channel");
+	return -1;
+    }
+
+    port = atoi(portstr);
+    free(portstr);
+
+    si.xce = xc_evtchn_open();
+    if (si.xce < 0) {
+	errx(1, "failed to open event channel handle");
+	goto cleanup;
+    }
+
+    si.suspend_evtchn = xc_evtchn_bind_interdomain(si.xce, domid, port);
+    if (si.suspend_evtchn < 0) {
+	errx(1, "failed to bind suspend event channel: %d",
+	     si.suspend_evtchn);
+	goto cleanup;
+    }
+
+    si.suspended_evtchn = xc_evtchn_bind_unbound_port(si.xce, domid);
+    if (si.suspended_evtchn < 0) {
+	errx(1, "failed to allocate suspend notification port: %d",
+	     si.suspended_evtchn);
+	goto cleanup;
+    }
+
+    rc = xc_dom_subscribe(xc, domid, si.suspended_evtchn);
+    if (rc < 0) {
+	errx(1, "failed to subscribe to domain: %d", rc);
+	goto cleanup;
+    }
+
+    return 0;
+
+  cleanup:
+    suspend_evtchn_release(xc, domid);
+
+    return -1;
+}
+
+/**
+ * Issue a suspend request to a dedicated event channel in the guest, and
+ * receive the acknowledgement from the subscribe event channel. */
+static int evtchn_suspend(int domid)
+{
+    int xcefd;
+    int rc;
+
+    rc = xc_evtchn_notify(si.xce, si.suspend_evtchn);
+    if (rc < 0) {
+	errx(1, "failed to notify suspend request channel: %d", rc);
+	return 0;
+    }
+
+    xcefd = xc_evtchn_fd(si.xce);
+    do {
+      rc = xc_evtchn_pending(si.xce);
+      if (rc < 0) {
+	errx(1, "error polling suspend notification channel: %d", rc);
+	return 0;
+      }
+    } while (rc != si.suspended_evtchn);
+
+    /* harmless for one-off suspend */
+    if (xc_evtchn_unmask(si.xce, si.suspended_evtchn) < 0)
+	errx(1, "failed to unmask suspend notification channel: %d", rc);
+
+    /* notify xend that it can do device migration */
+    printf("suspended\n");
+    fflush(stdout);
+
+    return 1;
+}
+
+static int suspend(int domid)
+{
+    if (si.suspend_evtchn >= 0)
+	return evtchn_suspend(domid);
+
+    return compat_suspend(domid);
 }
 
 /* For HVM guests, there are two sources of dirty pages: the Xen shadow
@@ -188,9 +320,13 @@
     max_f = atoi(argv[4]);
     flags = atoi(argv[5]);
 
+    suspend_evtchn_init(xc_fd, domid);
+
     ret = xc_domain_save(xc_fd, io_fd, domid, maxit, max_f, flags, 
                          &suspend, !!(flags & XCFLAGS_HVM),
                          &init_qemu_maps, &qemu_flip_buffer);
+
+    suspend_evtchn_release(xc_fd, domid);
 
     xc_interface_close(xc_fd);
 
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel