Stefano Stabellini
2013-Jul-04 18:19 UTC
workaround for dom0 crash due to QEMU using O_DIRECT
Hi Alex, speaking with Ian about the dom0 kernel crash caused by using O_DIRECT in QEMU, we came up with a simple workaround that should turn the crash into a data corruption problem (same as native). The idea is that when we balloon out pages, we replace the original page with a mapping of a scrub page, so that if the network stack wants to access an old grant that doesn''t exist anymore, it should find a valid page mapped there (the scrub page). Could you please try the appended patch for Linux with QEMU that uses O_DIRECT to open a file on NFS? Thanks! - Stefano --- diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 930fb68..0663fda 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -88,6 +88,7 @@ EXPORT_SYMBOL_GPL(balloon_stats); /* We increase/decrease in batches which fit in a page */ static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)]; +static struct page* trade_page; #ifdef CONFIG_HIGHMEM #define inc_totalhigh_pages() (totalhigh_pages++) @@ -423,7 +424,7 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) if (xen_pv_domain() && !PageHighMem(page)) { ret = HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn << PAGE_SHIFT), - __pte_ma(0), 0); + pfn_pte(page_to_pfn(trade_page), PAGE_KERNEL), 0); BUG_ON(ret); } #endif @@ -436,7 +437,7 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) /* No more mappings: invalidate P2M and add to balloon. */ for (i = 0; i < nr_pages; i++) { pfn = mfn_to_pfn(frame_list[i]); - __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + __set_phys_to_machine(pfn, pfn_to_mfn(page_to_pfn(trade_page))); balloon_append(pfn_to_page(pfn)); } @@ -591,6 +592,10 @@ static int __init balloon_init(void) if (!xen_domain()) return -ENODEV; + trade_page = alloc_page(GFP_KERNEL); + if (trade_page == NULL) + return -ENOMEM; + pr_info("xen/balloon: Initialising balloon driver.\n"); balloon_stats.current_pages = xen_pv_domain()
Stefano, We''ll test this (or more precisely Diana will when she has a minute). Remind me how you''d like O_DIRECT re-enabled. Back out the patch? Alex --On 4 July 2013 19:19:40 +0100 Stefano Stabellini <stefano.stabellini@eu.citrix.com> wrote:> Hi Alex, > speaking with Ian about the dom0 kernel crash caused by using O_DIRECT > in QEMU, we came up with a simple workaround that should turn the crash > into a data corruption problem (same as native). > > The idea is that when we balloon out pages, we replace the original page > with a mapping of a scrub page, so that if the network stack wants to > access an old grant that doesn''t exist anymore, it should find a valid > page mapped there (the scrub page). > > Could you please try the appended patch for Linux with QEMU that uses > O_DIRECT to open a file on NFS? > > Thanks! > > - Stefano > > --- > > > diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c > index 930fb68..0663fda 100644 > --- a/drivers/xen/balloon.c > +++ b/drivers/xen/balloon.c > @@ -88,6 +88,7 @@ EXPORT_SYMBOL_GPL(balloon_stats); > > /* We increase/decrease in batches which fit in a page */ > static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)]; > +static struct page* trade_page; > > #ifdef CONFIG_HIGHMEM > #define inc_totalhigh_pages() (totalhigh_pages++) > @@ -423,7 +424,7 @@ static enum bp_state decrease_reservation(unsigned > long nr_pages, gfp_t gfp) if (xen_pv_domain() && !PageHighMem(page)) { > ret = HYPERVISOR_update_va_mapping( > (unsigned long)__va(pfn << PAGE_SHIFT), > - __pte_ma(0), 0); > + pfn_pte(page_to_pfn(trade_page), PAGE_KERNEL), 0); > BUG_ON(ret); > } > #endif > @@ -436,7 +437,7 @@ static enum bp_state decrease_reservation(unsigned > long nr_pages, gfp_t gfp) /* No more mappings: invalidate P2M and add > to balloon. */ > for (i = 0; i < nr_pages; i++) { > pfn = mfn_to_pfn(frame_list[i]); > - __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); > + __set_phys_to_machine(pfn, pfn_to_mfn(page_to_pfn(trade_page))); > balloon_append(pfn_to_page(pfn)); > } > > @@ -591,6 +592,10 @@ static int __init balloon_init(void) > if (!xen_domain()) > return -ENODEV; > > + trade_page = alloc_page(GFP_KERNEL); > + if (trade_page == NULL) > + return -ENOMEM; > + > pr_info("xen/balloon: Initialising balloon driver.\n"); > > balloon_stats.current_pages = xen_pv_domain() > >-- Alex Bligh
Stefano Stabellini
2013-Jul-04 19:09 UTC
Re: workaround for dom0 crash due to QEMU using O_DIRECT
Use the new directiosafe option, or simply: diff --git a/hw/xen_disk.c b/hw/xen_disk.c index 4108ce8..70c9ea0 100644 --- a/hw/xen_disk.c +++ b/hw/xen_disk.c @@ -790,12 +790,7 @@ static int blk_connect(struct XenDevice *xendev) struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev); int pers, index, qflags; - /* read-only ? */ - if (blkdev->directiosafe) { - qflags = BDRV_O_NOCACHE | BDRV_O_NATIVE_AIO; - } else { - qflags = BDRV_O_CACHE_WB; - } + qflags = BDRV_O_NOCACHE | BDRV_O_NATIVE_AIO; if (strcmp(blkdev->mode, "w") == 0) { qflags |= BDRV_O_RDWR; } On Thu, 4 Jul 2013, Alex Bligh wrote:> Stefano, > > We''ll test this (or more precisely Diana will when she has a minute). Remind > me how you''d like O_DIRECT re-enabled. Back out the patch? > > Alex > > --On 4 July 2013 19:19:40 +0100 Stefano Stabellini > <stefano.stabellini@eu.citrix.com> wrote: > > > Hi Alex, > > speaking with Ian about the dom0 kernel crash caused by using O_DIRECT > > in QEMU, we came up with a simple workaround that should turn the crash > > into a data corruption problem (same as native). > > > > The idea is that when we balloon out pages, we replace the original page > > with a mapping of a scrub page, so that if the network stack wants to > > access an old grant that doesn''t exist anymore, it should find a valid > > page mapped there (the scrub page). > > > > Could you please try the appended patch for Linux with QEMU that uses > > O_DIRECT to open a file on NFS? > > > > Thanks! > > > > - Stefano > > > > --- > > > > > > diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c > > index 930fb68..0663fda 100644 > > --- a/drivers/xen/balloon.c > > +++ b/drivers/xen/balloon.c > > @@ -88,6 +88,7 @@ EXPORT_SYMBOL_GPL(balloon_stats); > > > > /* We increase/decrease in batches which fit in a page */ > > static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)]; > > +static struct page* trade_page; > > > > #ifdef CONFIG_HIGHMEM > > #define inc_totalhigh_pages() (totalhigh_pages++) > > @@ -423,7 +424,7 @@ static enum bp_state decrease_reservation(unsigned > > long nr_pages, gfp_t gfp) if (xen_pv_domain() && > > !PageHighMem(page)) { > > ret = HYPERVISOR_update_va_mapping( > > (unsigned long)__va(pfn << PAGE_SHIFT), > > - __pte_ma(0), 0); > > + pfn_pte(page_to_pfn(trade_page), PAGE_KERNEL), > > 0); > > BUG_ON(ret); > > } > > #endif > > @@ -436,7 +437,7 @@ static enum bp_state decrease_reservation(unsigned > > long nr_pages, gfp_t gfp) /* No more mappings: invalidate P2M and add > > to balloon. */ > > for (i = 0; i < nr_pages; i++) { > > pfn = mfn_to_pfn(frame_list[i]); > > - __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); > > + __set_phys_to_machine(pfn, > > pfn_to_mfn(page_to_pfn(trade_page))); > > balloon_append(pfn_to_page(pfn)); > > } > > > > @@ -591,6 +592,10 @@ static int __init balloon_init(void) > > if (!xen_domain()) > > return -ENODEV; > > > > + trade_page = alloc_page(GFP_KERNEL); > > + if (trade_page == NULL) > > + return -ENOMEM; > > + > > pr_info("xen/balloon: Initialising balloon driver.\n"); > > > > balloon_stats.current_pages = xen_pv_domain() > > > > > > > > -- > Alex Bligh >
Konrad Rzeszutek Wilk
2013-Jul-08 19:18 UTC
Re: workaround for dom0 crash due to QEMU using O_DIRECT
On Thu, Jul 04, 2013 at 07:19:40PM +0100, Stefano Stabellini wrote:> Hi Alex, > speaking with Ian about the dom0 kernel crash caused by using O_DIRECT > in QEMU, we came up with a simple workaround that should turn the crash > into a data corruption problem (same as native).<chuckles> You should for fun also do 0xEE on the ''trade_page'' whenever we update the PTE. That way we can detect the corruption as by default the trade_page would be 00.> > The idea is that when we balloon out pages, we replace the original page > with a mapping of a scrub page, so that if the network stack wants to > access an old grant that doesn''t exist anymore, it should find a valid > page mapped there (the scrub page).This will I think make it hard to do increase_reservation as it consults the P2M to make sure the PFN is indeed ''free''. But then this is a debug patch..> > Could you please try the appended patch for Linux with QEMU that uses > O_DIRECT to open a file on NFS? > > Thanks! > > - Stefano > > --- > > > diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c > index 930fb68..0663fda 100644 > --- a/drivers/xen/balloon.c > +++ b/drivers/xen/balloon.c > @@ -88,6 +88,7 @@ EXPORT_SYMBOL_GPL(balloon_stats); > > /* We increase/decrease in batches which fit in a page */ > static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)]; > +static struct page* trade_page; > > #ifdef CONFIG_HIGHMEM > #define inc_totalhigh_pages() (totalhigh_pages++) > @@ -423,7 +424,7 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) > if (xen_pv_domain() && !PageHighMem(page)) { > ret = HYPERVISOR_update_va_mapping( > (unsigned long)__va(pfn << PAGE_SHIFT), > - __pte_ma(0), 0); > + pfn_pte(page_to_pfn(trade_page), PAGE_KERNEL), 0); > BUG_ON(ret); > } > #endif > @@ -436,7 +437,7 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) > /* No more mappings: invalidate P2M and add to balloon. */ > for (i = 0; i < nr_pages; i++) { > pfn = mfn_to_pfn(frame_list[i]); > - __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); > + __set_phys_to_machine(pfn, pfn_to_mfn(page_to_pfn(trade_page))); > balloon_append(pfn_to_page(pfn)); > } > > @@ -591,6 +592,10 @@ static int __init balloon_init(void) > if (!xen_domain()) > return -ENODEV; > > + trade_page = alloc_page(GFP_KERNEL); > + if (trade_page == NULL) > + return -ENOMEM; > + > pr_info("xen/balloon: Initialising balloon driver.\n"); > > balloon_stats.current_pages = xen_pv_domain()
--On 8 July 2013 15:18:53 -0400 Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:>> speaking with Ian about the dom0 kernel crash caused by using O_DIRECT >> in QEMU, we came up with a simple workaround that should turn the crash >> into a data corruption problem (same as native). > > <chuckles> You should for fun also do 0xEE on the ''trade_page'' whenever > we update the PTE. That way we can detect the corruption as by default > the trade_page would be 00.Ha ha I''ve just read the patch. If I read right, what happens is there''s a fixed ''junk'' page which gets mapped in whenever the granted page gets mapped out. Let''s put aside the minor issue here that we''ve got a kernel patch which doesn''t actually fix the kernel''s problem :-) Have we not got a danger here that trade_page could end up written to with VM A''s data, and this could then find itself in VM B''s disk? Or do we know that every access by the kernel after withdrawal of the grant is guaranteed to be a read? In which case making it read only might be safer. Also, our normal config has dom0 with completely fixed memory (no ballooning) I believe. Is that something Diana needs to change when testing this? -- Alex Bligh
Ian Campbell
2013-Jul-08 20:48 UTC
Re: workaround for dom0 crash due to QEMU using O_DIRECT
On Mon, 2013-07-08 at 20:40 +0100, Alex Bligh wrote:> Have we not got a danger here that trade_page could end up written > to with VM A''s data, and this could then find itself in VM B''s disk? > Or do we know that every access by the kernel after withdrawal > of the grant is guaranteed to be a read? In which case making it read > only might be safer.It absolutely should be read only. AFAICT that means PAGE_KERNEL_RO rather than the PAGE_KERNEL in the patch. The case we are worried about is read-after-free on the network tx path. There can be no write-after-free on the network rx path.> Also, our normal config has dom0 with completely fixed memory (no > ballooning) I believe. Is that something Diana needs to change when > testing this?No, the "ballooned" pages here are ones which are specially allocated for use by the PV backends. Ian.
Ian, --On 8 July 2013 21:48:17 +0100 Ian Campbell <ian.campbell@citrix.com> wrote:>> Have we not got a danger here that trade_page could end up written >> to with VM A''s data, and this could then find itself in VM B''s disk? >> Or do we know that every access by the kernel after withdrawal >> of the grant is guaranteed to be a read? In which case making it read >> only might be safer. > > It absolutely should be read only. AFAICT that means PAGE_KERNEL_RO > rather than the PAGE_KERNEL in the patch.OK. We should test with that then (Diana: NB)> The case we are worried about is read-after-free on the network tx path. > There can be no write-after-free on the network rx path.Yes I couldn''t think of a realistic scenario where this would happen, but something just feels wrong about the page not being read-only. -- Alex Bligh
George Dunlap
2013-Jul-09 13:39 UTC
Re: workaround for dom0 crash due to QEMU using O_DIRECT
On Mon, Jul 8, 2013 at 8:40 PM, Alex Bligh <alex@alex.org.uk> wrote:> > > --On 8 July 2013 15:18:53 -0400 Konrad Rzeszutek Wilk > <konrad.wilk@oracle.com> wrote: > >>> speaking with Ian about the dom0 kernel crash caused by using O_DIRECT >>> in QEMU, we came up with a simple workaround that should turn the crash >>> into a data corruption problem (same as native). >> >> >> <chuckles> You should for fun also do 0xEE on the ''trade_page'' whenever >> we update the PTE. That way we can detect the corruption as by default >> the trade_page would be 00. > > > Ha ha I''ve just read the patch. If I read right, what happens is there''s > a fixed ''junk'' page which gets mapped in whenever the granted page gets > mapped out. > > Let''s put aside the minor issue here that we''ve got a kernel patch > which doesn''t actually fix the kernel''s problem :-)It is a bit weird. :-) But what it actually does is make the behavior of the bug running on Xen the same as the behavior of the bug on other systems. On bare metal, the contents of a random page will be sent during the "retransmit" phase. At the moment, under Xen, it crashes. This makes it send random data instead. -George
George, --On 9 July 2013 14:39:49 +0100 George Dunlap <dunlapg@umich.edu> wrote:> It is a bit weird. :-) But what it actually does is make the > behavior of the bug running on Xen the same as the behavior of the bug > on other systems. On bare metal, the contents of a random page will > be sent during the "retransmit" phase. At the moment, under Xen, it > crashes. This makes it send random data instead.It''s not *quite* the same. On other systems the read post I/O complete is ''quite likely'' (for some value of ''quite'') to return the same page, so the write operation might just write the right data or something approximating it. ''Quite likely'' is going to be quite high when it''s something in the guest''s page cache undergoing normal write out. When virtualised through Xen any such operation is guaranteed to (a) kill dom0 (without the patch) or (b) corrupt the data (without the patch), unless the data happened to be all zeros. Of course there''s nothing you can do about this with O_DIRECT switched on, but we shouldn''t be under the impression this makes anything safer. I''m wondering whether we could count accesses to this horrible page, and thus detect the bug occurring? -- Alex Bligh
Diana Crisan
2013-Jul-16 14:25 UTC
Re: workaround for dom0 crash due to QEMU using O_DIRECT
On 04/07/13 19:25, Alex Bligh wrote:> Stefano, > > We''ll test this (or more precisely Diana will when she has a minute). > Remind me how you''d like O_DIRECT re-enabled. Back out the patch? > > Alex > > --On 4 July 2013 19:19:40 +0100 Stefano Stabellini > <stefano.stabellini@eu.citrix.com> wrote: > >> Hi Alex, >> speaking with Ian about the dom0 kernel crash caused by using O_DIRECT >> in QEMU, we came up with a simple workaround that should turn the crash >> into a data corruption problem (same as native). >> >> The idea is that when we balloon out pages, we replace the original page >> with a mapping of a scrub page, so that if the network stack wants to >> access an old grant that doesn''t exist anymore, it should find a valid >> page mapped there (the scrub page). >> >> Could you please try the appended patch for Linux with QEMU that uses >> O_DIRECT to open a file on NFS? >> >> Thanks! >> >> - Stefano >> >> --- >> >> >> diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c >> index 930fb68..0663fda 100644 >> --- a/drivers/xen/balloon.c >> +++ b/drivers/xen/balloon.c >> @@ -88,6 +88,7 @@ EXPORT_SYMBOL_GPL(balloon_stats); >> >> /* We increase/decrease in batches which fit in a page */ >> static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)]; >> +static struct page* trade_page; >> >> #ifdef CONFIG_HIGHMEM >> #define inc_totalhigh_pages() (totalhigh_pages++) >> @@ -423,7 +424,7 @@ static enum bp_state decrease_reservation(unsigned >> long nr_pages, gfp_t gfp) if (xen_pv_domain() && >> !PageHighMem(page)) { >> ret = HYPERVISOR_update_va_mapping( >> (unsigned long)__va(pfn << PAGE_SHIFT), >> - __pte_ma(0), 0); >> + pfn_pte(page_to_pfn(trade_page), PAGE_KERNEL), 0); >> BUG_ON(ret); >> } >> #endif >> @@ -436,7 +437,7 @@ static enum bp_state decrease_reservation(unsigned >> long nr_pages, gfp_t gfp) /* No more mappings: invalidate P2M >> and add >> to balloon. */ >> for (i = 0; i < nr_pages; i++) { >> pfn = mfn_to_pfn(frame_list[i]); >> - __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); >> + __set_phys_to_machine(pfn, >> pfn_to_mfn(page_to_pfn(trade_page))); >> balloon_append(pfn_to_page(pfn)); >> } >> >> @@ -591,6 +592,10 @@ static int __init balloon_init(void) >> if (!xen_domain()) >> return -ENODEV; >> >> + trade_page = alloc_page(GFP_KERNEL); >> + if (trade_page == NULL) >> + return -ENOMEM; >> + >> pr_info("xen/balloon: Initialising balloon driver.\n"); >> >> balloon_stats.current_pages = xen_pv_domain() >> >> > > >Hello, I have tested the above patch against xen 4.3 with O_DIRECT *not* enabled and this patch makes dom0 crash when opening a file on nfs. Please see below my findings and a trace from the crashed dom0. Environment: Linux 3.10 custom build with the patch that can be found below. O_DIRECT disabled Actions perfomed: mount an nfs storage xl create xl.conf (which refers to a disk located in the nfs storage) Findings: dom0 crashes before the guest fully boots up. Regards, Diana ----------------------------------------------------------------------- diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 2a2ef97..3632707 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -82,6 +82,7 @@ enum bp_state { BP_ECANCELED }; +static struct page *trade_page; static DEFINE_MUTEX(balloon_mutex); @@ -412,7 +413,7 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) if (xen_pv_domain() && !PageHighMem(page)) { ret = HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn << PAGE_SHIFT), - __pte_ma(0), 0); + pfn_pte(page_to_pfn(trade_page), PAGE_KERNEL_RO), 0); BUG_ON(ret); } #endif @@ -425,7 +426,7 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) /* No more mappings: invalidate P2M and add to balloon. */ for (i = 0; i < nr_pages; i++) { pfn = mfn_to_pfn(frame_list[i]); - __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + __set_phys_to_machine(pfn, pfn_to_mfn(page_to_pfn(trade_page))); balloon_append(pfn_to_page(pfn)); } @@ -580,6 +581,10 @@ static int __init balloon_init(void) if (!xen_domain()) return -ENODEV; + trade_page = alloc_page(GFP_KERNEL); + if (trade_page == NULL) + return -ENOMEM; + pr_info("Initialising balloon driver\n"); balloon_stats.current_pages = xen_pv_domain() --------------------------------------------------------------------------------------- [ 295.787439] ------------[ cut here ]------------ [ 295.787460] kernel BUG at drivers/xen/balloon.c:350! [ 295.787467] invalid opcode: 0000 [#1] SMP [ 295.787475] Modules linked in: xt_physdev iptable_filter ip_tables x_tables xen_pciback xen_netback xen_blkback xen_gntalloc xen_gntdev xen_evtchn xenfs xen_privcmd rpcsec_gss_krb5 nfsv4 nfsd nfs_acl auth_rpcgss oid_registry nfs fscache lockd sunrpc radeon bridge stp llc ttm drm_kms_helper drm sp5100_tco edac_core i2c_piix4 k10temp edac_mce_amd mac_hid i2c_algo_bit shpchp lp parport hid_generic pata_atiixp e1000e usbhid ptp hid pps_core [ 295.787557] CPU: 0 PID: 57 Comm: kworker/0:2 Not tainted 3.10.0-custom #4 [ 295.787564] Hardware name: HP ProLiant MicroServer, BIOS O41 07/29/2011 [ 295.787578] Workqueue: events balloon_process [ 295.787585] task: ffff88015b815c40 ti: ffff88013fce2000 task.ti: ffff88013fce2000 [ 295.787592] RIP: e030:[<ffffffff814068fa>] [<ffffffff814068fa>] balloon_process+0x42a/0x440 [ 295.787605] RSP: e02b:ffff88013fce3d88 EFLAGS: 00010217 [ 295.787611] RAX: 00000000003408e3 RBX: ffffea000559b880 RCX: 0000000000000005 [ 295.787618] RDX: 00000000001566e2 RSI: 0000000000000001 RDI: 00000000000000e2 [ 295.787625] RBP: ffff88013fce3de8 R08: 0001f8daf2c923c0 R09: 1e00000000000000 [ 295.787631] R10: 0001f8daf2c923c0 R11: 0000000000000000 R12: 0000000000000000 [ 295.787638] R13: 0000160000000000 R14: 0000000000000001 R15: 0000000000000003 [ 295.787650] FS: 00007f695e9f0900(0000) GS:ffff880167400000(0000) knlGS:0000000000000000 [ 295.787657] CS: e033 DS: 0000 ES: 0000 CR0: 000000008005003b [ 295.787662] CR2: 00007f695fd6f000 CR3: 0000000156bc4000 CR4: 0000000000000660 [ 295.787670] Stack: [ 295.787674] 00000000001566e2 ffff88013fce3fd8 ffffffff81f896e0 0000000000000001 [ 295.787685] 0000000000000000 0000000000007ff0 ffff88013fce3e38 ffffffff81c923c0 [ 295.787696] ffff88013fc7bd00 ffff880167413d00 ffff880167417d00 0000000000000000 [ 295.787706] Call Trace: [ 295.787717] [<ffffffff81082170>] process_one_work+0x170/0x4a0 [ 295.787726] [<ffffffff810832d1>] worker_thread+0x121/0x390 [ 295.787734] [<ffffffff810831b0>] ? manage_workers.isra.21+0x2f0/0x2f0 [ 295.787743] [<ffffffff8108a210>] kthread+0xc0/0xd0 [ 295.787751] [<ffffffff8108a150>] ? flush_kthread_worker+0xb0/0xb0 [ 295.787761] [<ffffffff816d642c>] ret_from_fork+0x7c/0xb0 [ 295.787768] [<ffffffff8108a150>] ? flush_kthread_worker+0xb0/0xb0 [ 295.787774] Code: 01 00 00 e8 99 a2 c7 ff e9 59 ff ff ff 0f 0b 0f 0b 48 89 d7 48 89 55 a0 e8 a4 53 c0 ff 48 83 f8 ff 48 8b 55 a0 0f 84 de fd ff ff <0f> 0b 89 45 a0 e8 4c 5a 2c 00 8b 45 a0 e9 a4 fc ff ff 90 90 90 [ 295.787856] RIP [<ffffffff814068fa>] balloon_process+0x42a/0x440 [ 295.787865] RSP <ffff88013fce3d88> [ 295.787872] ---[ end trace 0fb1d800275d4c7f ]--- [ 295.787944] BUG: unable to handle kernel paging request at ffffffffffffffd8 [ 295.787952] IP: [<ffffffff8108a520>] kthread_data+0x10/0x20 [ 295.787960] PGD 1c0f067 PUD 1c11067 PMD 0 [ 295.787969] Oops: 0000 [#2] SMP [ 295.787974] Modules linked in: xt_physdev iptable_filter ip_tables x_tables xen_pciback xen_netback xen_blkback xen_gntalloc xen_gntdev xen_evtchn xenfs xen_privcmd rpcsec_gss_krb5 nfsv4 nfsd nfs_acl auth_rpcgss oid_registry nfs fscache lockd sunrpc radeon bridge stp llc ttm drm_kms_helper drm sp5100_tco edac_core i2c_piix4 k10temp edac_mce_amd mac_hid i2c_algo_bit shpchp lp parport hid_generic pata_atiixp e1000e usbhid ptp hid pps_core [ 295.788050] CPU: 0 PID: 57 Comm: kworker/0:2 Tainted: G D 3.10.0-custom #4 [ 295.788056] Hardware name: HP ProLiant MicroServer, BIOS O41 07/29/2011 [ 295.788079] task: ffff88015b815c40 ti: ffff88013fce2000 task.ti: ffff88013fce2000 [ 295.788085] RIP: e030:[<ffffffff8108a520>] [<ffffffff8108a520>] kthread_data+0x10/0x20 [ 295.788095] RSP: e02b:ffff88013fce3a28 EFLAGS: 00010046 [ 295.788100] RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff81ecec00 [ 295.788107] RDX: 0000000000000002 RSI: 0000000000000000 RDI: ffff88015b815c40 [ 295.788114] RBP: ffff88013fce3a28 R08: 0000000033c6f12c R09: 0000000000000000 [ 295.788121] R10: ffffffff8132f132 R11: 000000000000000e R12: 0000000000000000 [ 295.788128] R13: ffff88015b816038 R14: ffff88015c0e8000 R15: ffff88015b815f40 [ 295.788137] FS: 00007f695e9f0900(0000) GS:ffff880167400000(0000) knlGS:0000000000000000 [ 295.788145] CS: e033 DS: 0000 ES: 0000 CR0: 000000008005003b [ 295.788151] CR2: 0000000000000028 CR3: 0000000156bc4000 CR4: 0000000000000660 [ 295.788158] Stack: [ 295.788162] ffff88013fce3a48 ffffffff81083b56 ffff880167414480 0000000000000000 [ 295.788173] ffff88013fce3ac8 ffffffff816cb63f ffff88013fce3a78 0000000000000000 [ 295.788183] ffff88015b815c40 ffff88013fce3fd8 ffff88013fce3fd8 ffff88013fce3fd8 [ 295.788194] Call Trace: [ 295.788201] [<ffffffff81083b56>] wq_worker_sleeping+0x16/0x90 [ 295.788211] [<ffffffff816cb63f>] __schedule+0x5df/0x840 [ 295.788218] [<ffffffff816cc379>] schedule+0x29/0x70 [ 295.788227] [<ffffffff810691a4>] do_exit+0x704/0xa80 [ 295.788235] [<ffffffff816ceb69>] oops_end+0xb9/0x100 [ 295.788245] [<ffffffff81016be8>] die+0x58/0x90 [ 295.788252] [<ffffffff816ce45b>] do_trap+0xcb/0x170 [ 295.788261] [<ffffffff81013f85>] do_invalid_op+0x95/0xb0 [ 295.788269] [<ffffffff814068fa>] ? balloon_process+0x42a/0x440 [ 295.788278] [<ffffffff810a01d3>] ? update_curr+0x143/0x200 [ 295.788287] [<ffffffff816d7b9e>] invalid_op+0x1e/0x30 [ 295.788302] [<ffffffff814068fa>] ? balloon_process+0x42a/0x440 [ 295.788311] [<ffffffff814068ec>] ? balloon_process+0x41c/0x440 [ 295.788319] [<ffffffff81082170>] process_one_work+0x170/0x4a0 [ 295.788328] [<ffffffff810832d1>] worker_thread+0x121/0x390 [ 295.788336] [<ffffffff810831b0>] ? manage_workers.isra.21+0x2f0/0x2f0 [ 295.788344] [<ffffffff8108a210>] kthread+0xc0/0xd0 [ 295.788351] [<ffffffff8108a150>] ? flush_kthread_worker+0xb0/0xb0 [ 295.788360] [<ffffffff816d642c>] ret_from_fork+0x7c/0xb0 [ 295.788367] [<ffffffff8108a150>] ? flush_kthread_worker+0xb0/0xb0 [ 295.788373] Code: 00 48 89 e5 5d 48 8b 40 c8 48 c1 e8 02 83 e0 01 c3 66 2e 0f 1f 84 00 00 00 00 00 66 66 66 66 90 48 8b 87 a0 03 00 00 55 48 89 e5 <48> 8b 40 d8 5d c3 66 2e 0f 1f 84 00 00 00 00 00 66 66 66 66 90 [ 295.788455] RIP [<ffffffff8108a520>] kthread_data+0x10/0x20 [ 295.788462] RSP <ffff88013fce3a28> [ 295.788467] CR2: ffffffffffffffd8 [ 295.788472] ---[ end trace 0fb1d800275d4c80 ]--- [ 295.788477] Fixing recursive fault but reboot is needed! [ 365.066096] INFO: rcu_sched detected stalls on CPUs/tasks: { 0} (detected by 1, t=15002 jiffies, g=2073, c=2072, q=958) [ 365.066142] sending NMI to all CPUs: [ 365.066154] xen: vector 0x2 is not implemented [ 545.086096] INFO: rcu_sched detected stalls on CPUs/tasks: { 0} (detected by 1, t=60007 jiffies, g=2073, c=2072, q=5360) [ 545.086142] sending NMI to all CPUs: [ 545.086154] xen: vector 0x2 is not implemented [ 725.106096] INFO: rcu_sched detected stalls on CPUs/tasks: { 0} (detected by 1, t=105012 jiffies, g=2073, c=2072, q=9732) [ 725.106142] sending NMI to all CPUs: [ 725.106154] xen: vector 0x2 is not implemented [ 905.126096] INFO: rcu_sched detected stalls on CPUs/tasks: { 0} (detected by 1, t=150017 jiffies, g=2073, c=2072, q=14126) [ 905.126143] sending NMI to all CPUs: [ 905.126154] xen: vector 0x2 is not implemented [ 1085.146095] INFO: rcu_sched detected stalls on CPUs/tasks: { 0} (detected by 1, t=195022 jiffies, g=2073, c=2072, q=18484) [ 1085.146141] sending NMI to all CPUs: [ 1085.146153] xen: vector 0x2 is not implemented [ 1265.166096] INFO: rcu_sched detected stalls on CPUs/tasks: { 0} (detected by 1, t=240027 jiffies, g=2073, c=2072, q=22884) [ 1265.166144] sending NMI to all CPUs: [ 1265.166155] xen: vector 0x2 is not implemented
Stefano Stabellini
2013-Jul-18 17:27 UTC
Re: workaround for dom0 crash due to QEMU using O_DIRECT
On Tue, 16 Jul 2013, Diana Crisan wrote:> Hello, > > I have tested the above patch against xen 4.3 with O_DIRECT *not* enabled and > this patch makes dom0 crash when opening a file on nfs. Please see below my > findings and a trace from the crashed dom0. > > Environment: > Linux 3.10 custom build with the patch that can be found below. > O_DIRECT disabled > > Actions perfomed: > mount an nfs storage > xl create xl.conf (which refers to a disk located in the nfs storage) > > Findings: dom0 crashes before the guest fully boots up.I can''t really explain why it crashes for you without even using O_DIRECT. I am testing it with and without O_DIRECT and with or without NFS and it works OK for me. Also all those NMIs are suspicious. Unfortunately I found out that the patch I posted is not complete because unmapping the grants and restoring the old mappings is not a single atomic operation at the moment. The real issue is that the grant unmap operation doesn''t restore the original mapping automatically. We do have a GNTTABOP_unmap_and_replace operation but it''s not implemented on x86 if GNTMAP_contains_pte. I cannot see any solutions other than implementing a new grant table hypercall or maybe force the usage of multicall. For this test patch I have taken the second approach. diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 95fb2aa..f70aa46 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -968,7 +968,9 @@ int m2p_remove_override(struct page *page, if (!PageHighMem(page)) { struct multicall_space mcs; struct gnttab_unmap_grant_ref *unmap_op; + struct mmu_update *u; + WARN_ON(paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE); /* * It might be that we queued all the m2p grant table * hypercalls in a multicall, then m2p_remove_override @@ -989,8 +991,9 @@ int m2p_remove_override(struct page *page, return -1; } - mcs = xen_mc_entry( - sizeof(struct gnttab_unmap_grant_ref)); + xen_mc_batch(); + + mcs = __xen_mc_entry(sizeof(*unmap_op)); unmap_op = mcs.args; unmap_op->host_addr = kmap_op->host_addr; unmap_op->handle = kmap_op->handle; @@ -999,10 +1002,15 @@ int m2p_remove_override(struct page *page, MULTI_grant_table_op(mcs.mc, GNTTABOP_unmap_grant_ref, unmap_op, 1); + mcs = __xen_mc_entry(sizeof(*u)); + u = mcs.args; + u->ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; + u->val = pte_val_ma(pfn_pte(pfn, PAGE_KERNEL)); + + MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); + xen_mc_issue(PARAVIRT_LAZY_MMU); - set_pte_at(&init_mm, address, ptep, - pfn_pte(pfn, PAGE_KERNEL)); __flush_tlb_single(address); kmap_op->host_addr = 0; } diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 930fb68..ef9bc91 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -88,6 +88,7 @@ EXPORT_SYMBOL_GPL(balloon_stats); /* We increase/decrease in batches which fit in a page */ static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)]; +struct page* trade_page; #ifdef CONFIG_HIGHMEM #define inc_totalhigh_pages() (totalhigh_pages++) @@ -423,7 +424,7 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) if (xen_pv_domain() && !PageHighMem(page)) { ret = HYPERVISOR_update_va_mapping( (unsigned long)__va(pfn << PAGE_SHIFT), - __pte_ma(0), 0); + pfn_pte(page_to_pfn(trade_page), PAGE_KERNEL_RO), 0); BUG_ON(ret); } #endif @@ -436,7 +437,7 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) /* No more mappings: invalidate P2M and add to balloon. */ for (i = 0; i < nr_pages; i++) { pfn = mfn_to_pfn(frame_list[i]); - __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + __set_phys_to_machine(pfn, pfn_to_mfn(page_to_pfn(trade_page))); balloon_append(pfn_to_page(pfn)); } @@ -591,6 +592,10 @@ static int __init balloon_init(void) if (!xen_domain()) return -ENODEV; + trade_page = alloc_page(GFP_KERNEL); + if (trade_page == NULL) + return -ENOMEM; + pr_info("xen/balloon: Initialising balloon driver.\n"); balloon_stats.current_pages = xen_pv_domain()