# HG changeset patch # User John Levon <john.levon@sun.com> # Date 1236822336 25200 # Node ID 88b3a560b0fb2a5adca969d9b192220d64dfd105 # Parent e92a56f3581975496d5d9f250823e46493e58548 Domain core-dumping fixes The code was attempting to use the domain''s current number of pages (info.nr_pages) as a maximum index. We then walk the memory map and can easily over-write past the end of the nr_pages-sized array, if the domain has more pages mapped in than earlier (live dump). Restrict ourselves to the current number of pages. Also fix the dump core method in xend to actually implement the crash and live options. In particular this means that xend clients other than xm now get non-live dumps by default. Signed-off-by: John Levon <john.levon@sun.com> diff --git a/tools/libxc/xc_core.c b/tools/libxc/xc_core.c --- a/tools/libxc/xc_core.c +++ b/tools/libxc/xc_core.c @@ -518,7 +518,17 @@ xc_domain_dumpcore_via_callback(int xc_h if ( sts != 0 ) goto out; + /* + * Note: this is the *current* number of pages and may change under + * a live dump-core. We''ll just take this value, and if more pages + * exist, we''ll skip them. If there''s less, then we''ll just not use + * all the array... + * + * We don''t want to use the total potential size of the memory map + * since that is usually much higher than info.nr_pages. + */ nr_pages = info.nr_pages; + if ( !auto_translated_physmap ) { /* obtain p2m table */ @@ -770,7 +780,7 @@ xc_domain_dumpcore_via_callback(int xc_h pfn_start = memory_map[map_idx].addr >> PAGE_SHIFT; pfn_end = pfn_start + (memory_map[map_idx].size >> PAGE_SHIFT); - for ( i = pfn_start; i < pfn_end; i++ ) + for ( i = pfn_start; i < pfn_end && j < nr_pages; i++ ) { uint64_t gmfn; void *vaddr; diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h --- a/tools/libxc/xenctrl.h +++ b/tools/libxc/xenctrl.h @@ -158,7 +158,7 @@ typedef struct xc_dominfo { paused:1, blocked:1, running:1, hvm:1, debugged:1; unsigned int shutdown_reason; /* only meaningful if shutdown==1 */ - unsigned long nr_pages; + unsigned long nr_pages; /* current number, not maximum */ unsigned long shared_info_frame; uint64_t cpu_time; unsigned long max_memkb; diff --git a/tools/python/xen/xend/XendDomain.py b/tools/python/xen/xend/XendDomain.py --- a/tools/python/xen/xend/XendDomain.py +++ b/tools/python/xen/xend/XendDomain.py @@ -1252,13 +1252,23 @@ class XendDomain: POWER_STATE_NAMES[DOM_STATE_PAUSED], POWER_STATE_NAMES[dominfo._stateGet()]) - try: - log.info("Domain core dump requested for domain %s (%d) " - "live=%d crash=%d.", - dominfo.getName(), dominfo.getDomid(), live, crash) - return dominfo.dumpCore(filename) - except Exception, ex: - raise XendError(str(ex)) + dopause = (not live and dominfo._stateGet() == DOM_STATE_RUNNING) + if dopause: + dominfo.pause() + + try: + try: + log.info("Domain core dump requested for domain %s (%d) " + "live=%d crash=%d.", + dominfo.getName(), dominfo.getDomid(), live, crash) + dominfo.dumpCore(filename) + if crash: + self.domain_destroy(domid) + except Exception, ex: + raise XendError(str(ex)) + finally: + if dopause and not crash: + dominfo.unpause() def domain_destroy(self, domid): """Terminate domain immediately. diff --git a/tools/python/xen/xend/XendDomainInfo.py b/tools/python/xen/xend/XendDomainInfo.py --- a/tools/python/xen/xend/XendDomainInfo.py +++ b/tools/python/xen/xend/XendDomainInfo.py @@ -2036,26 +2036,31 @@ class XendDomainInfo: @raise: XendError if core dumping failed. """ + if not corefile: + this_time = time.strftime("%Y-%m%d-%H%M.%S", time.localtime()) + corefile = "/var/xen/dump/%s-%s.%s.core" % (this_time, + self.info[''name_label''], self.domid) + + if os.path.isdir(corefile): + raise XendError("Cannot dump core in a directory: %s" % + corefile) + try: - if not corefile: - this_time = time.strftime("%Y-%m%d-%H%M.%S", time.localtime()) - corefile = "/var/xen/dump/%s-%s.%s.core" % (this_time, - self.info[''name_label''], self.domid) - - if os.path.isdir(corefile): - raise XendError("Cannot dump core in a directory: %s" % - corefile) - - self._writeVm(DUMPCORE_IN_PROGRESS, ''True'') - xc.domain_dumpcore(self.domid, corefile) + try: + self._writeVm(DUMPCORE_IN_PROGRESS, ''True'') + xc.domain_dumpcore(self.domid, corefile) + except RuntimeError, ex: + corefile_incomp = corefile+''-incomplete'' + try: + os.rename(corefile, corefile_incomp) + except: + pass + + log.error("core dump failed: id = %s name = %s: %s", + self.domid, self.info[''name_label''], str(ex)) + raise XendError("Failed to dump core: %s" % str(ex)) + finally: self._removeVm(DUMPCORE_IN_PROGRESS) - except RuntimeError, ex: - corefile_incomp = corefile+''-incomplete'' - os.rename(corefile, corefile_incomp) - self._removeVm(DUMPCORE_IN_PROGRESS) - log.exception("XendDomainInfo.dumpCore failed: id = %s name = %s", - self.domid, self.info[''name_label'']) - raise XendError("Failed to dump core: %s" % str(ex)) # # Device creation/deletion functions diff --git a/tools/python/xen/xm/main.py b/tools/python/xen/xm/main.py --- a/tools/python/xen/xm/main.py +++ b/tools/python/xen/xm/main.py @@ -1351,22 +1351,10 @@ def xm_dump_core(args): else: filename = None - if not live: - ds = server.xend.domain.pause(dom, True) - - try: - print "Dumping core of domain: %s ..." % str(dom) - server.xend.domain.dump(dom, filename, live, crash) - - if crash: - print "Destroying domain: %s ..." % str(dom) - server.xend.domain.destroy(dom) - elif reset: - print "Resetting domain: %s ..." % str(dom) - server.xend.domain.reset(dom) - finally: - if not live and not crash and not reset and ds == DOM_STATE_RUNNING: - server.xend.domain.unpause(dom) + print "Dumping core of domain: %s ..." % str(dom) + server.xend.domain.dump(dom, filename, live, crash) + if reset: + server.xend.domain.reset(dom) def xm_rename(args): arg_check(args, "rename", 2) _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On Wed, Mar 11, 2009 at 06:45:49PM -0700, John Levon wrote:> diff --git a/tools/libxc/xc_core.c b/tools/libxc/xc_core.c > --- a/tools/libxc/xc_core.c > +++ b/tools/libxc/xc_core.c > @@ -518,7 +518,17 @@ xc_domain_dumpcore_via_callback(int xc_h > if ( sts != 0 ) > goto out; > > + /* > + * Note: this is the *current* number of pages and may change under > + * a live dump-core. We''ll just take this value, and if more pages > + * exist, we''ll skip them. If there''s less, then we''ll just not use > + * all the array... > + * > + * We don''t want to use the total potential size of the memory map > + * since that is usually much higher than info.nr_pages. > + */ > nr_pages = info.nr_pages; > + > if ( !auto_translated_physmap ) > { > /* obtain p2m table */ > @@ -770,7 +780,7 @@ xc_domain_dumpcore_via_callback(int xc_h > > pfn_start = memory_map[map_idx].addr >> PAGE_SHIFT; > pfn_end = pfn_start + (memory_map[map_idx].size >> PAGE_SHIFT); > - for ( i = pfn_start; i < pfn_end; i++ ) > + for ( i = pfn_start; i < pfn_end && j < nr_pages; i++ ) > { > uint64_t gmfn; > void *vaddr;Did the issue really happen? I believe the following if clause handles the case. or j orverflowed? pfn_start = memory_map[map_idx].addr >> PAGE_SHIFT; pfn_end = pfn_start + (memory_map[map_idx].size >> PAGE_SHIFT); for ( i = pfn_start; i < pfn_end; i++ ) { uint64_t gmfn; void *vaddr; if ( j >= nr_pages ) <<<<<<<<<<<<<< HERE! >>>>>>>>>>>>>>>> { /* * When live dump-mode (-L option) is specified, * guest domain may increase memory. */ IPRINTF("exceeded nr_pages (%ld) losing pages", nr_pages); goto copy_done; } -- yamahata _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On Thu, Mar 12, 2009 at 11:24:57AM +0900, Isaku Yamahata wrote:> > + for ( i = pfn_start; i < pfn_end && j < nr_pages; i++ ) > > { > > uint64_t gmfn; > > void *vaddr; > > Did the issue really happen? > I believe the following if clause handles the case. or j orverflowed?Yep, I missed this when forward-porting these fixes - thanks. regards john _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Hi John, Please don''t reset a domain after unpaused the domain. Could you move the reset into xend? e.g. try: log.info("Domain core dump requested for domain %s (%d) " "live=%d crash=%d.", dominfo.getName(), dominfo.getDomid(), live, crash) dominfo.dumpCore(filename) if crash: self.domain_destroy(domid) + elif reset: + self.domain_reset(domid) Best regards, Kan Wed, 11 Mar 2009 18:45:49 -0700, John Levon wrote:># HG changeset patch ># User John Levon <john.levon@sun.com> ># Date 1236822336 25200 ># Node ID 88b3a560b0fb2a5adca969d9b192220d64dfd105 ># Parent e92a56f3581975496d5d9f250823e46493e58548 >Domain core-dumping fixes > >The code was attempting to use the domain''s current number of pages >(info.nr_pages) as a maximum index. We then walk the memory map and can >easily over-write past the end of the nr_pages-sized array, if the >domain has more pages mapped in than earlier (live dump). Restrict >ourselves to the current number of pages. > >Also fix the dump core method in xend to actually implement the crash >and live options. In particular this means that xend clients other than >xm now get non-live dumps by default. > >Signed-off-by: John Levon <john.levon@sun.com> > >diff --git a/tools/libxc/xc_core.c b/tools/libxc/xc_core.c >--- a/tools/libxc/xc_core.c >+++ b/tools/libxc/xc_core.c >@@ -518,7 +518,17 @@ xc_domain_dumpcore_via_callback(int xc_h > if ( sts != 0 ) > goto out; > >+ /* >+ * Note: this is the *current* number of pages and may change under >+ * a live dump-core. We''ll just take this value, and if more pages >+ * exist, we''ll skip them. If there''s less, then we''ll just not use >+ * all the array... >+ * >+ * We don''t want to use the total potential size of the memory map >+ * since that is usually much higher than info.nr_pages. >+ */ > nr_pages = info.nr_pages; >+ > if ( !auto_translated_physmap ) > { > /* obtain p2m table */ >@@ -770,7 +780,7 @@ xc_domain_dumpcore_via_callback(int xc_h > > pfn_start = memory_map[map_idx].addr >> PAGE_SHIFT; > pfn_end = pfn_start + (memory_map[map_idx].size >> PAGE_SHIFT); >- for ( i = pfn_start; i < pfn_end; i++ ) >+ for ( i = pfn_start; i < pfn_end && j < nr_pages; i++ ) > { > uint64_t gmfn; > void *vaddr; >diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h >--- a/tools/libxc/xenctrl.h >+++ b/tools/libxc/xenctrl.h >@@ -158,7 +158,7 @@ typedef struct xc_dominfo { > paused:1, blocked:1, running:1, > hvm:1, debugged:1; > unsigned int shutdown_reason; /* only meaningful if shutdown==1 */ >- unsigned long nr_pages; >+ unsigned long nr_pages; /* current number, not maximum */ > unsigned long shared_info_frame; > uint64_t cpu_time; > unsigned long max_memkb; >diff --git a/tools/python/xen/xend/XendDomain.py b/tools/python/xen/xend/ >XendDomain.py >--- a/tools/python/xen/xend/XendDomain.py >+++ b/tools/python/xen/xend/XendDomain.py >@@ -1252,13 +1252,23 @@ class XendDomain: > POWER_STATE_NAMES[DOM_STATE_PAUSED], > POWER_STATE_NAMES[dominfo._stateGet()]) > >- try: >- log.info("Domain core dump requested for domain %s (%d) " >- "live=%d crash=%d.", >- dominfo.getName(), dominfo.getDomid(), live, crash) >- return dominfo.dumpCore(filename) >- except Exception, ex: >- raise XendError(str(ex)) >+ dopause = (not live and dominfo._stateGet() == DOM_STATE_RUNNING) >+ if dopause: >+ dominfo.pause() >+ >+ try: >+ try: >+ log.info("Domain core dump requested for domain %s (%d) " >+ "live=%d crash=%d.", >+ dominfo.getName(), dominfo.getDomid(), live, crash) >+ dominfo.dumpCore(filename) >+ if crash: >+ self.domain_destroy(domid) >+ except Exception, ex: >+ raise XendError(str(ex)) >+ finally: >+ if dopause and not crash: >+ dominfo.unpause() > > def domain_destroy(self, domid): > """Terminate domain immediately. >diff --git a/tools/python/xen/xend/XendDomainInfo.py b/tools/python/xen/ >xend/XendDomainInfo.py >--- a/tools/python/xen/xend/XendDomainInfo.py >+++ b/tools/python/xen/xend/XendDomainInfo.py >@@ -2036,26 +2036,31 @@ class XendDomainInfo: > @raise: XendError if core dumping failed. > """ > >+ if not corefile: >+ this_time = time.strftime("%Y-%m%d-%H%M.%S", time.localtime()) >+ corefile = "/var/xen/dump/%s-%s.%s.core" % (this_time, >+ self.info[''name_label''], self.domid) >+ >+ if os.path.isdir(corefile): >+ raise XendError("Cannot dump core in a directory: %s" % >+ corefile) >+ > try: >- if not corefile: >- this_time = time.strftime("%Y-%m%d-%H%M.%S", time. >localtime()) >- corefile = "/var/xen/dump/%s-%s.%s.core" % (this_time, >- self.info[''name_label''], self.domid) >- >- if os.path.isdir(corefile): >- raise XendError("Cannot dump core in a directory: %s" % >- corefile) >- >- self._writeVm(DUMPCORE_IN_PROGRESS, ''True'') >- xc.domain_dumpcore(self.domid, corefile) >+ try: >+ self._writeVm(DUMPCORE_IN_PROGRESS, ''True'') >+ xc.domain_dumpcore(self.domid, corefile) >+ except RuntimeError, ex: >+ corefile_incomp = corefile+''-incomplete'' >+ try: >+ os.rename(corefile, corefile_incomp) >+ except: >+ pass >+ >+ log.error("core dump failed: id = %s name = %s: %s", >+ self.domid, self.info[''name_label''], str(ex)) >+ raise XendError("Failed to dump core: %s" % str(ex)) >+ finally: > self._removeVm(DUMPCORE_IN_PROGRESS) >- except RuntimeError, ex: >- corefile_incomp = corefile+''-incomplete'' >- os.rename(corefile, corefile_incomp) >- self._removeVm(DUMPCORE_IN_PROGRESS) >- log.exception("XendDomainInfo.dumpCore failed: id = %s name = >%s", >- self.domid, self.info[''name_label'']) >- raise XendError("Failed to dump core: %s" % str(ex)) > > # > # Device creation/deletion functions >diff --git a/tools/python/xen/xm/main.py b/tools/python/xen/xm/main.py >--- a/tools/python/xen/xm/main.py >+++ b/tools/python/xen/xm/main.py >@@ -1351,22 +1351,10 @@ def xm_dump_core(args): > else: > filename = None > >- if not live: >- ds = server.xend.domain.pause(dom, True) >- >- try: >- print "Dumping core of domain: %s ..." % str(dom) >- server.xend.domain.dump(dom, filename, live, crash) >- >- if crash: >- print "Destroying domain: %s ..." % str(dom) >- server.xend.domain.destroy(dom) >- elif reset: >- print "Resetting domain: %s ..." % str(dom) >- server.xend.domain.reset(dom) >- finally: >- if not live and not crash and not reset and ds == DOM_STATE_RUNNING: >- server.xend.domain.unpause(dom) >+ print "Dumping core of domain: %s ..." % str(dom) >+ server.xend.domain.dump(dom, filename, live, crash) >+ if reset: >+ server.xend.domain.reset(dom) > > def xm_rename(args): > arg_check(args, "rename", 2) > >_______________________________________________ >Xen-devel mailing list >Xen-devel@lists.xensource.com >http://lists.xensource.com/xen-devel_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On Thu, Mar 12, 2009 at 01:53:39PM +0900, Masaki Kanno wrote:> Please don''t reset a domain after unpaused the domain.I must admit I don''t really understand what this option is for.> Could you move the reset into xend?Not without breaking backwards compatibility. Unfortunately, the xend code in server/SrvDomain.py (I think) does not allow optional arguments: so every time you add a parameter like ''reset'' (which is not there now), every client breaks. In particular, libvirt. regards john _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Hi John,>> Please don''t reset a domain after unpaused the domain. > >I must admit I don''t really understand what this option is for. >The reset option cuts down the number of operations for users. Without the reset option: 1) xm dump-core --crash vm1 2) xm create/start vm1 With the reset option: 1) xm dump-core --reset vm1 If a crashed guest works between unpause and reset, then the guest system may corrupt. So we want to reset the crashed guest before unpause. If we cannot do that, the reset option is worthless. May I remove the reset option from xm dump-core command?>> Could you move the reset into xend? > >Not without breaking backwards compatibility. Unfortunately, the xend >code in server/SrvDomain.py (I think) does not allow optional arguments: >so every time you add a parameter like ''reset'' (which is not there now), >every client breaks. In particular, libvirt. >I cannot complain. Should we not add options for clients anymore? Will anyone improve server/SrvDomain.py? Best regards, Kan _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
On Thu, Mar 12, 2009 at 04:03:40PM +0900, Masaki Kanno wrote:> The reset option cuts down the number of operations for users. > > Without the reset option: > 1) xm dump-core --crash vm1 > 2) xm create/start vm1 > > With the reset option: > 1) xm dump-core --reset vm1It does seem fairly marginal use... needing to grab cores is hopefully rare :)> I cannot complain. > Should we not add options for clients anymore? > > Will anyone improve server/SrvDomain.py?I''d like to say that somebody will, but I don''t have the energy to detangle that code, for sure... regards john _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel