Hey all,

I ran into some problems trying Remus on Xen 4.0.1-rc4 with the 2.6.31.13 dom0 (checkout from yesterday):

Upon creating a VM with a Remus tapdisk, the domU starts up fine, but keeps hanging on mountall. The tapdisk process is only active on server1. The domU shows up in xm list as blocked. Once Remus is started, the tapdisk process also shows up on the backup server, but where I was expecting the domU to show up in the paused state, it appeared as blocked. Console output on the initial server remains unresponsive, but on the backup server a login shell appears. After logging in, it turns out the disk is read-only.

Remus doesn't give any output anymore after the first two syncs, and the following errors show up in dmesg on the primary server:

[ 6600.224933] INFO: task xenwatch:55 blocked for more than 120 seconds.
[ 6600.224969] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 6600.225025] xenwatch D ffff8802d7511040 0 55 2 0x00000000
[ 6600.225071] ffff8802e2e65c10 0000000000000246 ffff8802d5011d68 ffffffff8103819d
[ 6600.225145] ffff8802e2e65be0 ffffffff81038a12 0000000000000000 00000000ffffffff
[ 6600.225217] ffff8802e2e683b0 000000000000f668 00000000000153c0 ffff8802e2e683b0
[ 6600.225287] Call Trace:
[ 6600.225320] [<ffffffff8103819d>] ? xen_force_evtchn_callback+0xd/0x10
[ 6600.225357] [<ffffffff81038a12>] ? check_events+0x12/0x20
[ 6600.225396] [<ffffffff810389ff>] ? xen_restore_fl_direct_end+0x0/0x1
[ 6600.225432] [<ffffffff81577aa7>] ? _spin_unlock_irqrestore+0x37/0x60
[ 6600.225468] [<ffffffff81344c55>] blkif_disconnect+0x75/0xf0
[ 6600.225502] [<ffffffff810af240>] ? autoremove_wake_function+0x0/0x40
[ 6600.225542] [<ffffffff8114fd80>] ? kfree+0xf0/0x180
[ 6600.225576] [<ffffffff8134448c>] blkback_remove+0x4c/0x90
[ 6600.225609] [<ffffffff8133e890>] xenbus_dev_remove+0x50/0x70
[ 6600.225644] [<ffffffff8138b9d8>] __device_release_driver+0x58/0xb0
[ 6600.225679] [<ffffffff8138bb4d>] device_release_driver+0x2d/0x40
[ 6600.225712] [<ffffffff8138ac0a>] bus_remove_device+0x9a/0xc0
[ 6600.225744] [<ffffffff81388da7>] device_del+0x127/0x1d0
[ 6600.225777] [<ffffffff81388e66>] device_unregister+0x16/0x30
[ 6600.225813] [<ffffffff813441a0>] frontend_changed+0x90/0x2a0
[ 6600.225848] [<ffffffff8133eb82>] xenbus_otherend_changed+0xb2/0xc0
[ 6600.225883] [<ffffffff81577aa7>] ? _spin_unlock_irqrestore+0x37/0x60
[ 6600.225918] [<ffffffff8133f150>] frontend_changed+0x10/0x20
[ 6600.225951] [<ffffffff8133c794>] xenwatch_thread+0xb4/0x190
[ 6600.225986] [<ffffffff810af240>] ? autoremove_wake_function+0x0/0x40
[ 6600.226023] [<ffffffff8133c6e0>] ? xenwatch_thread+0x0/0x190
[ 6600.226060] [<ffffffff810aecb6>] kthread+0xa6/0xb0
[ 6600.226091] [<ffffffff8103f3ea>] child_rip+0xa/0x20
[ 6600.226125] [<ffffffff8103e5d7>] ? int_ret_from_sys_call+0x7/0x1b
[ 6600.226158] [<ffffffff8103ed5d>] ? retint_restore_args+0x5/0x6
[ 6600.226193] [<ffffffff8103f3e0>] ? child_rip+0x0/0x20

soon followed by:

[ 6691.837161] ------------[ cut here ]------------
[ 6691.837163] kernel BUG at /usr/src/xen-4.0-testing.hg/linux-2.6-pvops.git/fs/sysfs/dir.c:511!
[ 6691.837165] invalid opcode: 0000 [#1] SMP
[ 6691.837167] last sysfs file: /sys/devices/pci0000:00/0000:00:05.0/0000:1a:00.0/host4/target4:2:0/4:2:0:0/block/sda/sda6/uevent
[ 6691.837169] CPU 1
[ 6691.837171] Modules linked in: iptable_filter ip_tables bridge stp qla4xxx scsi_transport_iscsi qla3xxx sch_queue bonding ebt_imq x_tables lp parport serio_raw shpchp joydev bnx2 ses enclosure usbhid hid megaraid_sas e1000e
[ 6691.837191] Pid: 55, comm: xenwatch Not tainted 2.6.31.13 #4 System x3650 M3 -[7945M2G]-
[ 6691.837192] RIP: e030:[<ffffffff811c2b4b>] [<ffffffff811c2b4b>] sysfs_remove_one+0x9b/0xa0
[ 6691.837199] RSP: e02b:ffff8802e2e65a30 EFLAGS: 00010202
[ 6691.837201] RAX: 0000000000000000 RBX: ffff8802e2e65a40 RCX: 0000000000000201
[ 6691.837202] RDX: ffffffff81a1a9a0 RSI: ffff8802644649b0 RDI: ffff8802e2e65a40
[ 6691.837204] RBP: ffff8802e2e65a30 R08: 000000004e84bbbc R09: 0000000000000000
[ 6691.837206] R10: 0000000000000005 R11: 0000000000000000 R12: ffff8802644649b0
[ 6691.837207] R13: ffff8802644649b0 R14: ffff880264464780 R15: 000000000000000a
[ 6691.837212] FS: 00007f723b663710(0000) GS:ffffc90000018000(0000) knlGS:0000000000000000
[ 6691.837214] CS: e033 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 6691.837215] CR2: 00007f47c3dfe000 CR3: 00000002da830000 CR4: 0000000000002660
[ 6691.837217] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 6691.837219] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 6691.837221] Process xenwatch (pid: 55, threadinfo ffff8802e2e64000, task ffff8802e2e68000)
[ 6691.837222] Stack:
[ 6691.837224] ffff8802e2e65a70 ffffffff811c3863 ffff880264464780 0000000000000000
[ 6691.837227] <0> 0000000000000000 0000000000000000 ffffffff81865320 ffffffff81865348
[ 6691.837231] <0> ffff8802e2e65a80 ffffffff811c388e ffff8802e2e65ab0 ffffffff811c4ed6
[ 6691.837235] Call Trace:
[ 6691.837239] [<ffffffff811c3863>] remove_dir+0x33/0x50
[ 6691.837242] [<ffffffff811c388e>] sysfs_remove_subdir+0xe/0x10
[ 6691.837244] [<ffffffff811c4ed6>] sysfs_remove_group+0x76/0x100
[ 6691.837248] [<ffffffff8138fd09>] dpm_sysfs_remove+0x19/0x20
[ 6691.837252] [<ffffffff81388cc9>] device_del+0x49/0x1d0
[ 6691.837254] [<ffffffff81388e66>] device_unregister+0x16/0x30
[ 6691.837258] [<ffffffff81130fff>] bdi_unregister+0x3f/0x60
[ 6691.837262] [<ffffffff812b60d4>] unlink_gendisk+0x34/0x60
[ 6691.837265] [<ffffffff811bbcec>] del_gendisk+0x9c/0x150
[ 6691.837268] [<ffffffff812ab8ba>] ? blk_remove_plug+0x4a/0xd0
[ 6691.837271] [<ffffffff813475f3>] blktap_device_destroy+0xc3/0x130
[ 6691.837274] [<ffffffff81347688>] blktap_device_release+0x28/0x30
[ 6691.837278] [<ffffffff81187cf1>] __blkdev_put+0xd1/0x1a0
[ 6691.837281] [<ffffffff81187dd0>] blkdev_put+0x10/0x20
[ 6691.837284] [<ffffffff81344fea>] vbd_free+0x2a/0x40
[ 6691.837287] [<ffffffff81344499>] blkback_remove+0x59/0x90
[ 6691.837289] [<ffffffff8133e890>] xenbus_dev_remove+0x50/0x70
[ 6691.837292] [<ffffffff8138b9d8>] __device_release_driver+0x58/0xb0
[ 6691.837295] [<ffffffff8138bb4d>] device_release_driver+0x2d/0x40
[ 6691.837298] [<ffffffff8138ac0a>] bus_remove_device+0x9a/0xc0
[ 6691.837301] [<ffffffff81388da7>] device_del+0x127/0x1d0
[ 6691.837303] [<ffffffff81388e66>] device_unregister+0x16/0x30
[ 6691.837306] [<ffffffff813441a0>] frontend_changed+0x90/0x2a0
[ 6691.837309] [<ffffffff8133eb82>] xenbus_otherend_changed+0xb2/0xc0
[ 6691.837313] [<ffffffff81577aa7>] ? _spin_unlock_irqrestore+0x37/0x60
[ 6691.837316] [<ffffffff8133f150>] frontend_changed+0x10/0x20
[ 6691.837318] [<ffffffff8133c794>] xenwatch_thread+0xb4/0x190
[ 6691.837321] [<ffffffff810af240>] ? autoremove_wake_function+0x0/0x40
[ 6691.837324] [<ffffffff8133c6e0>] ? xenwatch_thread+0x0/0x190
[ 6691.837327] [<ffffffff810aecb6>] kthread+0xa6/0xb0
[ 6691.837331] [<ffffffff8103f3ea>] child_rip+0xa/0x20
[ 6691.837334] [<ffffffff8103e5d7>] ? int_ret_from_sys_call+0x7/0x1b
[ 6691.837337] [<ffffffff8103ed5d>] ? retint_restore_args+0x5/0x6
[ 6691.837340] [<ffffffff8103f3e0>] ? child_rip+0x0/0x20
[ 6691.837341] Code: 48 8b 46 10 48 89 02 48 c7 46 10 00 00 00 00 8b 4e 30 eb c3 0f 1f 40 00 48 8b 47 08 48 85 c0 74 ce 83 68 4c 01 83 47 18 01 c9 c3 <0f> 0b eb fe 90 55 48 89 e5 41 54 53 0f 1f 44 00 00 48 8b 5f 28
[ 6691.837378] RIP [<ffffffff811c2b4b>] sysfs_remove_one+0x9b/0xa0
[ 6691.837381] RSP <ffff8802e2e65a30>
[ 6691.837697] ---[ end trace 2bd84e059b85817e ]---

and then:

[ 6803.229132] BUG: soft lockup - CPU#1 stuck for 61s! [irqbalance:3772]
[ 6803.229205] Modules linked in: iptable_filter ip_tables bridge stp qla4xxx scsi_transport_iscsi qla3xxx sch_queue bonding ebt_imq x_tables lp parport serio_raw shpchp joydev bnx2 ses enclosure usbhid hid megaraid_sas e1000e
[ 6803.230394] CPU 1:
[ 6803.230499] Modules linked in: iptable_filter ip_tables bridge stp qla4xxx scsi_transport_iscsi qla3xxx sch_queue bonding ebt_imq x_tables lp parport serio_raw shpchp joydev bnx2 ses enclosure usbhid hid megaraid_sas e1000e
[ 6803.231669] Pid: 3772, comm: irqbalance Tainted: G D 2.6.31.13 #4 System x3650 M3 -[7945M2G]-
[ 6803.231764] RIP: e030:[<ffffffff8157611a>] [<ffffffff8157611a>] __mutex_lock_slowpath+0x3a/0x180
[ 6803.231904] RSP: e02b:ffff8802d6049bd8 EFLAGS: 00000286
[ 6803.231974] RAX: 00000000fffffffe RBX: ffffffff81841980 RCX: 0000000000000000
[ 6803.232047] RDX: 0000000000000000 RSI: ffff8802e2e64000 RDI: ffffffff81841980
[ 6803.232121] RBP: ffff8802d6049c38 R08: 000000000000004e R09: 0000000000000000
[ 6803.232194] R10: 0000000000000001 R11: 0000000000000000 R12: ffff8802d609c470
[ 6803.232268] R13: ffff8802e2e64000 R14: ffffffff81841998 R15: 0000000000000001
[ 6803.232344] FS: 00007f47c3df4720(0000) GS:ffffc90000018000(0000) knlGS:0000000000000000
[ 6803.232434] CS: e033 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 6803.232505] CR2: 00007f47c3dfe000 CR3: 00000002db8e7000 CR4: 0000000000002660
[ 6803.232579] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 6803.232651] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 6803.232724] Call Trace:
[ 6803.232789] [<ffffffff81576125>] ? __mutex_lock_slowpath+0x45/0x180
[ 6803.232863] [<ffffffff8116d3dd>] ? __d_lookup+0x12d/0x170
[ 6803.232936] [<ffffffff815760bb>] mutex_lock+0x2b/0x50
[ 6803.233009] [<ffffffff811c3ecd>] sysfs_follow_link+0x6d/0x1d0
[ 6803.233081] [<ffffffff8116f038>] ? touch_atime+0x38/0x150
[ 6803.233152] [<ffffffff81164c47>] __link_path_walk+0x5d7/0x1060
[ 6803.233229] [<ffffffff81577aa7>] ? _spin_unlock_irqrestore+0x37/0x60
[ 6803.233304] [<ffffffff812c7d2a>] ? __up_read+0x9a/0xc0
[ 6803.233379] [<ffffffff8103819d>] ? xen_force_evtchn_callback+0xd/0x10
[ 6803.233452] [<ffffffff8116594a>] path_walk+0x6a/0xe0
[ 6803.233522] [<ffffffff81165b1b>] do_path_lookup+0x5b/0xa0
[ 6803.233593] [<ffffffff8115a2a1>] ? get_empty_filp+0xa1/0x170
[ 6803.233663] [<ffffffff81166ac3>] do_filp_open+0x103/0xc60
[ 6803.233736] [<ffffffff810389ff>] ? xen_restore_fl_direct_end+0x0/0x1
[ 6803.233809] [<ffffffff8114fc0b>] ? kmem_cache_free+0xcb/0x150
[ 6803.233884] [<ffffffff8103819d>] ? xen_force_evtchn_callback+0xd/0x10
[ 6803.233957] [<ffffffff81172334>] ? alloc_fd+0x114/0x160
[ 6803.234028] [<ffffffff81156b39>] do_sys_open+0x69/0x140
[ 6803.234098] [<ffffffff81156c50>] sys_open+0x20/0x30
[ 6803.235949] [<ffffffff8103e332>] system_call_fastpath+0x16/0x1b

Which repeats several times.
Does anyone have a suggestion what's going wrong? Is it normal for the tapdisk process to only show up on the backup server once Remus is started, or should I initialise it beforehand? Thanks in advance!

Cheers,
NN
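The guest configuration that produced this is not shown in the thread. For reference, a disk served by the Remus tapdisk is normally declared in the xm domU config with a tap:remus spec naming the backup host and a replication port. The sketch below follows the format in the Remus documentation linked later in the thread; the kernel path, backup hostname, port, image path, and device name are illustrative, not the poster's actual values:

    # /etc/xen/remus1.cfg -- minimal sketch, illustrative values only
    kernel = "/boot/vmlinuz-2.6.18.8-xenU"  # a 2.6.18 PV domU kernel; pvops domUs are not yet supported
    memory = 1024
    name   = "remus1"
    vif    = [ 'bridge=eth0' ]
    # "tap:remus:<backup-host>:<port>|aio:<path>" buffers writes and replicates them to the backup
    disk   = [ 'tap:remus:backup:9000|aio:/srv/xen/remus1.img,xvda1,w' ]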
On Tuesday, 06 July 2010 at 13:36, Nomen Nescio wrote:
> Hey all,
>
> I ran into some problems trying Remus on Xen 4.0.1-rc4 with the 2.6.31.13
> dom0 (checkout from yesterday):

What's your domU kernel? pvops support was recently added to dom0, but still doesn't work for domU.

> Upon creating a VM with a Remus tapdisk, the domU starts up fine, but
> keeps hanging on mountall. The tapdisk process is only active on
> server1. The domU shows up in xm list as blocked. Once Remus is started,
> the tapdisk process also shows up on the backup server, but where I was
> expecting the domU to show up in the paused state, it appeared as blocked.
> Console output on the initial server remains unresponsive, but on the
> backup server a login shell appears. After logging in, it turns out the
> disk is read-only.

The tapdisk process on the backup is created when you start Remus on the primary. Until you start Remus, writes to the primary will block, so you'll typically want to start Remus some time close to when the first write to the primary disk occurs (mount is a good time).

If the backup is in blocked mode but the primary is still running, you're in a split-brain state (both sides running) due to a checkpointing timeout being interpreted as a guest failure. I'm guessing from the logs you post that the guest kernel on the primary is not suspending correctly.

> Remus doesn't give any output anymore after the first two syncs, and the
> following errors show up in dmesg on the primary server:

Are these in dom0 or the primary domU? Looks a bit like dom0, but I haven't seen these before.

> [ 6600.224933] INFO: task xenwatch:55 blocked for more than 120 seconds.
> [...]
> Which repeats several times.
> Does anyone have a suggestion what's going wrong? Is it normal for the
> tapdisk process to only show up on the backup server once Remus is started,
> or should I initialise it beforehand? Thanks in advance!

I'm not sure what these panics are, but it is normal for the tapdisk process not to start on the backup until you start Remus. If you're not using 2.6.18 for your guest kernel, give that a try, or possibly HVM.
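To make the recommended timing concrete, here is a sketch of the sequence, with the guest name and backup host borrowed from the follow-up message below (the config path is illustrative):

    # on the primary host: create the protected guest...
    xm create /etc/xen/remus1.cfg
    # ...then start Remus promptly, before the guest issues its first
    # disk write -- until Remus is running, writes to a tap:remus disk
    # block, which is why the guest hangs in mountall
    remus --no-net remus1 backup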
If you haven't walked through the getting started steps here:

http://nss.cs.ubc.ca/remus/doc.html

they may help narrow down the problem.

Regards,
Brendan
Hey Brendan & all,

> > I ran into some problems trying Remus on Xen 4.0.1-rc4 with the 2.6.31.13
> > dom0 (checkout from yesterday):
>
> What's your domU kernel? pvops support was recently added to dom0, but
> still doesn't work for domU.

Ah, that explains a few things. However, similar behaviour occurs with HVM. Remus starts, spits out the following output:

qemu logdirty mode: enable
1: sent 267046, skipped 218, delta 8962ms, dom0 68%, target 0%, sent 976Mb/s, dirtied 1Mb/s 290 pages
2: sent 290, skipped 0, delta 12ms, dom0 66%, target 0%, sent 791Mb/s, dirtied 43Mb/s 16 pages
3: sent 16, skipped 0,
Start last iteration
PROF: suspending at 1278503125.101352
issuing HVM suspend hypercall
suspend hypercall returned 0
pausing QEMU
SUSPEND shinfo 000fffff
delta 11ms, dom0 18%, target 0%, sent 47Mb/s, dirtied 47Mb/s 16 pages
4: sent 16, skipped 0, delta 5ms, dom0 20%, target 0%, sent 104Mb/s, dirtied 104Mb/s 16 pages
Total pages sent= 267368 (0.25x) (of which 0 were fixups)
All memory is saved
PROF: resumed at 1278503125.111614
resuming QEMU
Sending 6017 bytes of QEMU state
PROF: flushed memory at 1278503125.112014

and then seems to become inactive. The ps tree looks like this:

root 4756 0.4 0.1 82740 11040 pts/0 SLl+ 13:45 0:03 /usr/bin/python /usr/bin/remus --no-net remus1 backup

According to strace, it's stuck reading fd 6, which is a FIFO file: /var/run/tap/remus_nas1_9000.msg (see the sketch at the end of this message).

The domU comes up in the blocked state on the backup machine and seems to run fine there. However, xm list on the primary shows no state whatsoever:

Domain-0    0   10208   12   r-----   468.6
remus1      1    1024    1   ------    41.8

and after a Ctrl-C, remus segfaults:

remus[4756]: segfault at 0 ip 00007f3f49cc7376 sp 00007fffec999fd8 error 4 in libc-2.11.1.so[7f3f49ba1000+178000]

> Are these in dom0 or the primary domU? Looks a bit like dom0, but I
> haven't seen these before.

Those were in dom0. This time, dmesg shows output after destroying the domU on the primary:

[ 1920.059226] INFO: task xenwatch:55 blocked for more than 120 seconds.
[ 1920.059262] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 1920.059315] xenwatch D 0000000000000000 0 55 2 0x00000000
[ 1920.059363] ffff8802e2e656c0 0000000000000246 0000000000011200 0000000000000000
[ 1920.059439] ffff8802e2e65720 0000000000000000 ffff8802d55d20c0 00000001001586b3
[ 1920.059520] ffff8802e2e683b0 000000000000f668 00000000000153c0 ffff8802e2e683b0
[ 1920.059592] Call Trace:
[ 1920.059626] [<ffffffff8157553d>] io_schedule+0x2d/0x40
[ 1920.059661] [<ffffffff812afbc9>] get_request_wait+0xe9/0x1c0
[ 1920.059695] [<ffffffff810af240>] ? autoremove_wake_function+0x0/0x40
[ 1920.059732] [<ffffffff812a3e87>] ? elv_merge+0x37/0x200
[ 1920.059765] [<ffffffff812afd41>] __make_request+0xa1/0x470
[ 1920.059800] [<ffffffff810389ff>] ? xen_restore_fl_direct_end+0x0/0x1
[ 1920.059837] [<ffffffff8103ed5d>] ? retint_restore_args+0x5/0x6
[ 1920.059874] [<ffffffff812ae5dc>] generic_make_request+0x17c/0x4a0
[ 1920.059909] [<ffffffff8111bdf6>] ? mempool_alloc+0x56/0x140
[ 1920.059946] [<ffffffff8103819d>] ? xen_force_evtchn_callback+0xd/0x10
[ 1920.059979] [<ffffffff812ae978>] submit_bio+0x78/0xf0
[ 1920.060013] [<ffffffff81180489>] submit_bh+0xf9/0x140
[ 1920.060046] [<ffffffff81182600>] __block_write_full_page+0x1e0/0x3a0
[ 1920.060080] [<ffffffff811819c0>] ? end_buffer_async_write+0x0/0x1f0
[ 1920.060116] [<ffffffff81186980>] ? blkdev_get_block+0x0/0x70
[ 1920.060151] [<ffffffff81186980>] ? blkdev_get_block+0x0/0x70
[ 1920.060186] [<ffffffff811819c0>] ? end_buffer_async_write+0x0/0x1f0
[ 1920.060222] [<ffffffff81182ec1>] block_write_full_page_endio+0xe1/0x120
[ 1920.060259] [<ffffffff81038a12>] ? check_events+0x12/0x20
[ 1920.060294] [<ffffffff81182f15>] block_write_full_page+0x15/0x20
[ 1920.060330] [<ffffffff81187928>] blkdev_writepage+0x18/0x20
[ 1920.060365] [<ffffffff81120937>] __writepage+0x17/0x40
[ 1920.060399] [<ffffffff81121897>] write_cache_pages+0x227/0x4d0
[ 1920.060434] [<ffffffff81120920>] ? __writepage+0x0/0x40
[ 1920.060469] [<ffffffff810389ff>] ? xen_restore_fl_direct_end+0x0/0x1
[ 1920.060504] [<ffffffff81121b64>] generic_writepages+0x24/0x30
[ 1920.060539] [<ffffffff81121b9d>] do_writepages+0x2d/0x50
[ 1920.060576] [<ffffffff81119beb>] __filemap_fdatawrite_range+0x5b/0x60
[ 1920.060613] [<ffffffff8111a1ff>] filemap_fdatawrite+0x1f/0x30
[ 1920.060646] [<ffffffff8111a245>] filemap_write_and_wait+0x35/0x50
[ 1920.060681] [<ffffffff81187ba4>] __sync_blockdev+0x24/0x50
[ 1920.060716] [<ffffffff81187be3>] sync_blockdev+0x13/0x20
[ 1920.060748] [<ffffffff81187cc8>] __blkdev_put+0xa8/0x1a0
[ 1920.060784] [<ffffffff81187dd0>] blkdev_put+0x10/0x20
[ 1920.060819] [<ffffffff81344fea>] vbd_free+0x2a/0x40
[ 1920.060851] [<ffffffff81344499>] blkback_remove+0x59/0x90
[ 1920.060885] [<ffffffff8133e890>] xenbus_dev_remove+0x50/0x70
[ 1920.060921] [<ffffffff8138b9d8>] __device_release_driver+0x58/0xb0
[ 1920.060956] [<ffffffff8138bb4d>] device_release_driver+0x2d/0x40
[ 1920.060991] [<ffffffff8138ac0a>] bus_remove_device+0x9a/0xc0
[ 1920.061027] [<ffffffff81388da7>] device_del+0x127/0x1d0
[ 1920.061061] [<ffffffff81388e66>] device_unregister+0x16/0x30
[ 1920.061095] [<ffffffff813441a0>] frontend_changed+0x90/0x2a0
[ 1920.061131] [<ffffffff8133eb82>] xenbus_otherend_changed+0xb2/0xc0
[ 1920.061167] [<ffffffff81577aa7>] ? _spin_unlock_irqrestore+0x37/0x60
[ 1920.061209] [<ffffffff8133f150>] frontend_changed+0x10/0x20
[ 1920.061243] [<ffffffff8133c794>] xenwatch_thread+0xb4/0x190
[ 1920.061281] [<ffffffff810af240>] ? autoremove_wake_function+0x0/0x40
[ 1920.061314] [<ffffffff8133c6e0>] ? xenwatch_thread+0x0/0x190
[ 1920.061349] [<ffffffff810aecb6>] kthread+0xa6/0xb0
[ 1920.061383] [<ffffffff8103f3ea>] child_rip+0xa/0x20
[ 1920.061415] [<ffffffff8103e5d7>] ? int_ret_from_sys_call+0x7/0x1b
[ 1920.061451] [<ffffffff8103ed5d>] ? retint_restore_args+0x5/0x6
[ 1920.061485] [<ffffffff8103f3e0>] ? child_rip+0x0/0x20

Any idea what's going wrong? Thanks!

Cheers,
NN
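The strace observation in the last message can be double-checked with standard tools. A sketch, using the remus PID (4756) and the FIFO path from the ps and strace output above:

    # what is remus blocked on? (expect a read(6, ...) that never returns)
    strace -p 4756
    # resolve fd 6 to a path -- it should point at the tapdisk message FIFO
    ls -l /proc/4756/fd/6
    # list processes holding the FIFO open; if no tapdisk process appears,
    # the other end of the control channel is gone
    fuser -v /var/run/tap/remus_nas1_9000.msg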