Steven Hand
2008-May-08 13:14 UTC
[Xen-devel] [PATCH] - fix/improve error handling for failed suspend/migrate
This has been broken since cset 16964:5d84464dc1fc.

Also deal better with very early errors (close sender side socket).

Signed-off-by: Steven Hand <steven.hand@cl.cam.ac.uk>

diff -r b0d7780794eb tools/python/xen/xend/XendCheckpoint.py
--- a/tools/python/xen/xend/XendCheckpoint.py Thu May 08 13:40:40 2008 +0100
+++ b/tools/python/xen/xend/XendCheckpoint.py Thu May 08 14:08:39 2008 +0100
@@ -81,8 +81,6 @@ def save(fd, dominfo, network, live, dst
     # thing is useful for debugging.
     dominfo.setName('migrating-' + domain_name)
 
-    done_suspend = 0
-
     try:
         dominfo.migrateDevices(network, dst, DEV_MIGRATE_STEP1, domain_name)
 
@@ -110,7 +108,6 @@ def save(fd, dominfo, network, live, dst
                 log.debug("Suspending %d ...", dominfo.getDomid())
                 dominfo.shutdown('suspend')
                 dominfo.waitForShutdown()
-                done_suspend = 1
                 dominfo.migrateDevices(network, dst, DEV_MIGRATE_STEP2,
                                        domain_name)
                 log.info("Domain %d suspended.", dominfo.getDomid())
@@ -154,16 +151,9 @@ def save(fd, dominfo, network, live, dst
                 pass
 
     except Exception, exn:
-        log.exception("Save failed on domain %s (%s).", domain_name,
+        log.exception("Save failed on domain %s (%s) - resuming.", domain_name,
                       dominfo.getDomid())
-
-        # If we didn't get as far as suspending the domain (for
-        # example, we couldn't balloon enough memory for the new
-        # domain), then we don't want to re-plumb the devices, as the
-        # domU will not be expecting it.
-        if done_suspend:
-            log.debug("XendCheckpoint.save: resumeDomain")
-            dominfo.resumeDomain()
+        dominfo.resumeDomain()
 
         try:
             dominfo.setName(domain_name)
diff -r b0d7780794eb tools/python/xen/xend/XendDomain.py
--- a/tools/python/xen/xend/XendDomain.py Thu May 08 13:40:40 2008 +0100
+++ b/tools/python/xen/xend/XendDomain.py Thu May 08 14:05:56 2008 +0100
@@ -1308,8 +1308,10 @@ class XendDomain:
 
         sock.send("receive\n")
         sock.recv(80)
-        XendCheckpoint.save(sock.fileno(), dominfo, True, live, dst, node=node)
-        sock.close()
+        try:
+            XendCheckpoint.save(sock.fileno(), dominfo, True, live, dst, node=node)
+        finally:
+            sock.close()
 
     def domain_save(self, domid, dst, checkpoint=False):
         """Start saving a domain to file.
diff -r b0d7780794eb tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py Thu May 08 13:40:40 2008 +0100
+++ b/tools/python/xen/xend/XendDomainInfo.py Thu May 08 14:07:20 2008 +0100
@@ -2378,8 +2378,19 @@ class XendDomainInfo:
     def resumeDomain(self):
         log.debug("XendDomainInfo.resumeDomain(%s)", str(self.domid))
 
-        if self.domid is None:
+        # resume a suspended domain (e.g. after live checkpoint, or after
+        # a later error during save or migrate); checks that the domain
+        # is currently suspended first so safe to call from anywhere
+
+        xeninfo = dom_get(self.domid)
+        if xeninfo is None:
             return
+        if not xeninfo['shutdown']:
+            return
+        reason = shutdown_reason(xeninfo['shutdown_reason'])
+        if reason != 'suspend':
+            return
+
         try:
             # could also fetch a parsed note from xenstore
             fast = self.info.get_notes().get('SUSPEND_CANCEL') and 1 or 0

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel