# uname -a
SunOS sc420 5.11 snv_37 i86pc i386 i86pc
I run a 'dd' command five times in parallel (to the same
file):
# dd if=/dev/zero of=/share/ddfile bs=512k count=1024 &
/share is a zpool/zfs mount point [1].
Those dd commands stopped progressing some time later,
no I/O's going on, they cannot be killed.
"ls" on the target file also hangs and cannot be killed.
I searched the following key words in bug database, no perfect match:
"zfs_dirent_lock zfs_range_lock_writer txg_wait_open".
# pgrep dd
618
619
620
621
622
PID 618, 619 have the following stack:
# echo "0t618::pid2proc|::walk thread|::findstack -v" | mdb -k
stack pointer for thread ffffffff82d6d760: fffffe8000bc0ba0
[ fffffe8000bc0ba0 _resume_from_idle+0xf8() ]
  fffffe8000bc0be0 swtch+0x185()
  fffffe8000bc0c10 cv_wait+0x6f(ffffffff803a9496, ffffffff803a9450)
  fffffe8000bc0c60 txg_wait_open+0x87(ffffffff803a9380, 0)
  fffffe8000bc0d70 zfs_write+0x6a4(ffffffff84612cc0, fffffe8000bc0e20, 0,
  ffffffff82e17598, 0)
  fffffe8000bc0dd0 fop_write+0x40(ffffffff84612cc0, fffffe8000bc0e20, 0,
  ffffffff82e17598, 0)
  fffffe8000bc0e90 write+0x2da(4, 8065000, 80000)
  fffffe8000bc0ec0 write32+0x20(4, 8065000, 80000)
  fffffe8000bc0f10 _sys_sysenter_post_swapgs+0x14b()
PID 620:
# echo "0t620::pid2proc|::walk thread|::findstack -v" | mdb -k
stack pointer for thread ffffffff82d6d400: fffffe8000414820
[ fffffe8000414820 _resume_from_idle+0xf8() ]
  fffffe8000414860 swtch+0x185()
  fffffe8000414890 cv_wait+0x6f(ffffffff8a8e5180, ffffffff84625410)
  fffffe8000414910 zfs_range_lock_writer+0x166(ffffffff846253c8,
  ffffffff8aa10f20)
  fffffe8000414970 zfs_range_lock+0xb8(ffffffff846253c8, 0,
ffffffffffffffff, 1
  )
  fffffe8000414a50 zfs_create+0x477(ffffffff824cbc00, ffffffff8336fb87,
  fffffe8000414cd8, 0, 80, fffffe8000414cc0, ffffffff82e17598, 2102)
  fffffe8000414ac0 fop_create+0x50(ffffffff824cbc00, ffffffff8336fb87,
  fffffe8000414cd8, 0, 80, fffffe8000414cc0, ffffffff82e17598, 2102)
  fffffe8000414c60 vn_createat+0x591(8047d73, 0, fffffe8000414cd8, 0, 80,
  fffffe8000414cc0, 0, 2102, 12, 0)
  fffffe8000414de0 vn_openat+0x13f(8047d73, 0, 2302, 1b6, fffffe8000414e48,
0,
  12, 0)
  fffffe8000414ea0 copen+0x323(ffd19553, 8047d73, 2302, 1b6)
  fffffe8000414ec0 creat64+0x29(8047d73, 1b6)
  fffffe8000414f10 _sys_sysenter_post_swapgs+0x14b()
PID 621 and "ls" processes have the following stack:
# echo "0t621::pid2proc|::walk thread|::findstack -v" | mdb -k
stack pointer for thread ffffffff81306e40: fffffe800017f5c0
[ fffffe800017f5c0 _resume_from_idle+0xf8() ]
  fffffe800017f600 swtch+0x185()
  fffffe800017f630 cv_wait+0x6f(ffffffff88cb390e, ffffffff824c8f30)
  fffffe800017f6d0 zfs_dirent_lock+0x106(fffffe800017f718, ffffffff824c8f08,
  fffffe800017f890, fffffe800017f710, 6)
  fffffe800017f740 zfs_dirlook+0xb3(ffffffff824c8f08, fffffe800017f890,
  fffffe800017f880)
  fffffe800017f7c0 zfs_lookup+0xab(ffffffff824cbc00, fffffe800017f890,
  fffffe800017f880, fffffe800017fb28, 1, ffffffff80aada40, ffffffff82e17598)
  fffffe800017f830 fop_lookup+0x50(ffffffff824cbc00, fffffe800017f890,
  fffffe800017f880, fffffe800017fb28, 1, ffffffff80aada40, ffffffff82e17598)
  fffffe800017fa30 lookuppnvp+0x31d(fffffe800017fb28, 0, 1,
fffffe800017fb18,
  fffffe800017fcc0, ffffffff80aada40, ffffffff824cbc00, ffffffff82e17598)
  fffffe800017fad0 lookuppnat+0x13e(fffffe800017fb28, 0, 1,
fffffe800017fb18,
  fffffe800017fcc0, 0)
  fffffe800017fc60 vn_createat+0x101(8047d73, 0, fffffe800017fcd8, 0, 80,
  fffffe800017fcc0, 0, 2102, 12, 0)
  fffffe800017fde0 vn_openat+0x13f(8047d73, 0, 2302, 1b6, fffffe800017fe48,
0,
  12, 0)
  fffffe800017fea0 copen+0x323(ffd19553, 8047d73, 2302, 1b6)
  fffffe800017fec0 creat64+0x29(8047d73, 1b6)
  fffffe800017ff10 _sys_sysenter_post_swapgs+0x14b()
[1] # zpool status
  pool: share
 state: ONLINE
 scrub: none requested
config:
        NAME        STATE     READ WRITE CKSUM
        share       ONLINE       0     0     0
          c2d0s0    ONLINE       0     0     0
errors: No known data errors
I'll see if it's reproducible.
Tao
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://mail.opensolaris.org/pipermail/zfs-discuss/attachments/20060423/8bdcccd0/attachment.html>
Trying to _force_ a dump by "/usr/sbin/reboot -d".
    root   873   533   0 23:58:42 pts/1       0:00 /usr/sbin/reboot -d
# echo "0t873::pid2proc|::walk thread|::findstack -v" | mdb -k
stack pointer for thread ffffffff8eabb540: fffffe8000c86d40
[ fffffe8000c86d40 _resume_from_idle+0xf8() ]
  fffffe8000c86d80 swtch+0x185()
  fffffe8000c86db0 cv_wait+0x6f(ffffffff803a9492, ffffffff803a9450)
  fffffe8000c86e00 txg_wait_synced+0x87(ffffffff803a9380, 0)
  fffffe8000c86e30 spa_sync_allpools+0x74()
  fffffe8000c86e70 zfs_sync+0xc5(0, 0, ffffffff82e17438)
  fffffe8000c86eb0 vfs_sync+0xa4(0)
  fffffe8000c86ec0 syssync+0xd()
  fffffe8000c86f10 _sys_sysenter_post_swapgs+0x14b()
Guess that's not _forcible_ enough  :-)
Tao
On 4/23/06, Tao Chen <nameistao at gmail.com>
wrote:>
> # uname -a
> SunOS sc420 5.11 snv_37 i86pc i386 i86pc
>
> I run a 'dd' command five times in parallel (to the
same file):
>
> # dd if=/dev/zero of=/share/ddfile bs=512k count=1024 &
>
> /share is a zpool/zfs mount point [1].
>
> Those dd commands stopped progressing some time later,
> no I/O's going on, they cannot be killed.
> "ls" on the target file also hangs and cannot be killed.
>
> I searched the following key words in bug database, no perfect match:
> "zfs_dirent_lock zfs_range_lock_writer txg_wait_open".
>
> # pgrep dd
> 618
> 619
> 620
> 621
> 622
>
> PID 618, 619 have the following stack:
>
> # echo "0t618::pid2proc|::walk thread|::findstack -v" | mdb -k
> stack pointer for thread ffffffff82d6d760: fffffe8000bc0ba0
> [ fffffe8000bc0ba0 _resume_from_idle+0xf8() ]
>   fffffe8000bc0be0 swtch+0x185()
>   fffffe8000bc0c10 cv_wait+0x6f(ffffffff803a9496, ffffffff803a9450)
>   fffffe8000bc0c60 txg_wait_open+0x87(ffffffff803a9380, 0)
>   fffffe8000bc0d70 zfs_write+0x6a4(ffffffff84612cc0, fffffe8000bc0e20, 0,
>   ffffffff82e17598, 0)
>   fffffe8000bc0dd0 fop_write+0x40(ffffffff84612cc0, fffffe8000bc0e20, 0,
>   ffffffff82e17598, 0)
>   fffffe8000bc0e90 write+0x2da(4, 8065000, 80000)
>   fffffe8000bc0ec0 write32+0x20(4, 8065000, 80000)
>   fffffe8000bc0f10 _sys_sysenter_post_swapgs+0x14b()
>
> PID 620:
>
> # echo "0t620::pid2proc|::walk thread|::findstack -v" | mdb -k
> stack pointer for thread ffffffff82d6d400: fffffe8000414820
> [ fffffe8000414820 _resume_from_idle+0xf8() ]
>   fffffe8000414860 swtch+0x185()
>   fffffe8000414890 cv_wait+0x6f(ffffffff8a8e5180, ffffffff84625410)
>   fffffe8000414910 zfs_range_lock_writer+0x166(ffffffff846253c8,
>   ffffffff8aa10f20)
>   fffffe8000414970 zfs_range_lock+0xb8(ffffffff846253c8, 0,
> ffffffffffffffff, 1
>   )
>   fffffe8000414a50 zfs_create+0x477(ffffffff824cbc00, ffffffff8336fb87,
>   fffffe8000414cd8, 0, 80, fffffe8000414cc0, ffffffff82e17598, 2102)
>   fffffe8000414ac0 fop_create+0x50(ffffffff824cbc00, ffffffff8336fb87,
>   fffffe8000414cd8, 0, 80, fffffe8000414cc0, ffffffff82e17598, 2102)
>   fffffe8000414c60 vn_createat+0x591(8047d73, 0, fffffe8000414cd8, 0, 80,
>   fffffe8000414cc0, 0, 2102, 12, 0)
>   fffffe8000414de0 vn_openat+0x13f(8047d73, 0, 2302, 1b6,
> fffffe8000414e48, 0,
>   12, 0)
>   fffffe8000414ea0 copen+0x323(ffd19553, 8047d73, 2302, 1b6)
>   fffffe8000414ec0 creat64+0x29(8047d73, 1b6)
>   fffffe8000414f10 _sys_sysenter_post_swapgs+0x14b()
>
> PID 621 and "ls" processes have the following stack:
>
> # echo "0t621::pid2proc|::walk thread|::findstack -v" | mdb -k
> stack pointer for thread ffffffff81306e40: fffffe800017f5c0
> [ fffffe800017f5c0 _resume_from_idle+0xf8() ]
>   fffffe800017f600 swtch+0x185()
>   fffffe800017f630 cv_wait+0x6f(ffffffff88cb390e, ffffffff824c8f30)
>   fffffe800017f6d0 zfs_dirent_lock+0x106(fffffe800017f718,
> ffffffff824c8f08,
>   fffffe800017f890, fffffe800017f710, 6)
>   fffffe800017f740 zfs_dirlook+0xb3(ffffffff824c8f08, fffffe800017f890,
>   fffffe800017f880)
>   fffffe800017f7c0 zfs_lookup+0xab(ffffffff824cbc00, fffffe800017f890,
>   fffffe800017f880, fffffe800017fb28, 1, ffffffff80aada40,
> ffffffff82e17598)
>   fffffe800017f830 fop_lookup+0x50(ffffffff824cbc00, fffffe800017f890,
>   fffffe800017f880, fffffe800017fb28, 1, ffffffff80aada40,
> ffffffff82e17598)
>   fffffe800017fa30 lookuppnvp+0x31d(fffffe800017fb28, 0, 1,
> fffffe800017fb18,
>   fffffe800017fcc0, ffffffff80aada40, ffffffff824cbc00, ffffffff82e17598)
>   fffffe800017fad0 lookuppnat+0x13e(fffffe800017fb28, 0, 1,
> fffffe800017fb18,
>   fffffe800017fcc0, 0)
>   fffffe800017fc60 vn_createat+0x101(8047d73, 0, fffffe800017fcd8, 0, 80,
>   fffffe800017fcc0, 0, 2102, 12, 0)
>   fffffe800017fde0 vn_openat+0x13f(8047d73, 0, 2302, 1b6,
> fffffe800017fe48, 0,
>   12, 0)
>   fffffe800017fea0 copen+0x323(ffd19553, 8047d73, 2302, 1b6)
>   fffffe800017fec0 creat64+0x29(8047d73, 1b6)
>   fffffe800017ff10 _sys_sysenter_post_swapgs+0x14b()
>
>
> [1] # zpool status
>   pool: share
>  state: ONLINE
>  scrub: none requested
> config:
>
>         NAME        STATE     READ WRITE CKSUM
>         share       ONLINE       0     0     0
>           c2d0s0    ONLINE       0     0     0
>
> errors: No known data errors
>
> I'll see if it's reproducible.
>
> Tao
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL:
<http://mail.opensolaris.org/pipermail/zfs-discuss/attachments/20060424/c394cbbc/attachment.html>
Tao, We''re not sure what''s going on here. We don''t see an obvious deadlock from the stacks provided. Everything seems stacked up behind the txg_wait_open() which is presumably blocked for some reason. I''ve also tried for several hours to reproduce this without success. Did you manage to reproduce the problem or get a dump using "$<systedump" ? We really need more information to help. Thanks: Neil. Tao Chen wrote On 04/23/06 23:03,:> Trying to _force_ a dump by "/usr/sbin/reboot -d". > > root 873 533 0 23:58:42 pts/1 0:00 /usr/sbin/reboot -d > > # echo "0t873::pid2proc|::walk thread|::findstack -v" | mdb -k > stack pointer for thread ffffffff8eabb540: fffffe8000c86d40 > [ fffffe8000c86d40 _resume_from_idle+0xf8() ] > fffffe8000c86d80 swtch+0x185() > fffffe8000c86db0 cv_wait+0x6f(ffffffff803a9492, ffffffff803a9450) > fffffe8000c86e00 txg_wait_synced+0x87(ffffffff803a9380, 0) > fffffe8000c86e30 spa_sync_allpools+0x74() > fffffe8000c86e70 zfs_sync+0xc5(0, 0, ffffffff82e17438) > fffffe8000c86eb0 vfs_sync+0xa4(0) > fffffe8000c86ec0 syssync+0xd() > fffffe8000c86f10 _sys_sysenter_post_swapgs+0x14b() > > Guess that''s not _forcible_ enough :-) > > Tao > > > On 4/23/06, *Tao Chen* <nameistao at gmail.com > <mailto:nameistao at gmail.com>> wrote: > > # uname -a > SunOS sc420 5.11 snv_37 i86pc i386 i86pc > > I run a ''dd'' command fives times in parrallel (to the same file): > > # dd if=/dev/zero of=/share/ddfile bs=512k count=1024 & > > /share is a zpool/zfs mount point [1]. > > Those dd commands stopped progressing some time later, > no I/O''s going on, they cannot be killed. > "ls" on the target file also hangs and cannot be killed. > > I searched the following key words in bug database, no perfect match: > "zfs_dirent_lock zfs_range_lock_writer txg_wait_open". 
> > # pgrep dd > 618 > 619 > 620 > 621 > 622 > > PID 618, 619 have the folllowing stack: > > # echo "0t618::pid2proc|::walk thread|::findstack -v" | mdb -k > stack pointer for thread ffffffff82d6d760: fffffe8000bc0ba0 > [ fffffe8000bc0ba0 _resume_from_idle+0xf8() ] > fffffe8000bc0be0 swtch+0x185() > fffffe8000bc0c10 cv_wait+0x6f(ffffffff803a9496, ffffffff803a9450) > fffffe8000bc0c60 txg_wait_open+0x87(ffffffff803a9380, 0) > fffffe8000bc0d70 zfs_write+0x6a4(ffffffff84612cc0, > fffffe8000bc0e20, 0, > ffffffff82e17598, 0) > fffffe8000bc0dd0 fop_write+0x40(ffffffff84612cc0, fffffe8000bc0e20, 0, > ffffffff82e17598, 0) > fffffe8000bc0e90 write+0x2da(4, 8065000, 80000) > fffffe8000bc0ec0 write32+0x20(4, 8065000, 80000) > fffffe8000bc0f10 _sys_sysenter_post_swapgs+0x14b() > > PID 620: > > # echo "0t620::pid2proc|::walk thread|::findstack -v" | mdb -k > stack pointer for thread ffffffff82d6d400: fffffe8000414820 > [ fffffe8000414820 _resume_from_idle+0xf8() ] > fffffe8000414860 swtch+0x185() > fffffe8000414890 cv_wait+0x6f(ffffffff8a8e5180, ffffffff84625410) > fffffe8000414910 zfs_range_lock_writer+0x166(ffffffff846253c8, > ffffffff8aa10f20) > fffffe8000414970 zfs_range_lock+0xb8(ffffffff846253c8, 0, > ffffffffffffffff, 1 > ) > fffffe8000414a50 zfs_create+0x477(ffffffff824cbc00, ffffffff8336fb87, > fffffe8000414cd8, 0, 80, fffffe8000414cc0, ffffffff82e17598, 2102) > fffffe8000414ac0 fop_create+0x50(ffffffff824cbc00, ffffffff8336fb87, > fffffe8000414cd8, 0, 80, fffffe8000414cc0, ffffffff82e17598, 2102) > fffffe8000414c60 vn_createat+0x591(8047d73, 0, fffffe8000414cd8, > 0, 80, > fffffe8000414cc0, 0, 2102, 12, 0) > fffffe8000414de0 vn_openat+0x13f(8047d73, 0, 2302, 1b6, > fffffe8000414e48, 0, > 12, 0) > fffffe8000414ea0 copen+0x323(ffd19553, 8047d73, 2302, 1b6) > fffffe8000414ec0 creat64+0x29(8047d73, 1b6) > fffffe8000414f10 _sys_sysenter_post_swapgs+0x14b() > > PID 621 and "ls" processes have the following stack: > > # echo "0t621::pid2proc|::walk thread|::findstack -v" | 
mdb -k > stack pointer for thread ffffffff81306e40: fffffe800017f5c0 > [ fffffe800017f5c0 _resume_from_idle+0xf8() ] > fffffe800017f600 swtch+0x185() > fffffe800017f630 cv_wait+0x6f(ffffffff88cb390e, ffffffff824c8f30) > fffffe800017f6d0 zfs_dirent_lock+0x106(fffffe800017f718, > ffffffff824c8f08, > fffffe800017f890, fffffe800017f710, 6) > fffffe800017f740 zfs_dirlook+0xb3(ffffffff824c8f08, fffffe800017f890, > fffffe800017f880) > fffffe800017f7c0 zfs_lookup+0xab(ffffffff824cbc00, fffffe800017f890, > fffffe800017f880, fffffe800017fb28, 1, ffffffff80aada40, > ffffffff82e17598) > fffffe800017f830 fop_lookup+0x50(ffffffff824cbc00, fffffe800017f890, > fffffe800017f880, fffffe800017fb28, 1, ffffffff80aada40, > ffffffff82e17598) > fffffe800017fa30 lookuppnvp+0x31d(fffffe800017fb28, 0, 1, > fffffe800017fb18, > fffffe800017fcc0, ffffffff80aada40, ffffffff824cbc00, > ffffffff82e17598) > fffffe800017fad0 lookuppnat+0x13e(fffffe800017fb28, 0, 1, > fffffe800017fb18, > fffffe800017fcc0, 0) > fffffe800017fc60 vn_createat+0x101(8047d73, 0, fffffe800017fcd8, > 0, 80, > fffffe800017fcc0, 0, 2102, 12, 0) > fffffe800017fde0 vn_openat+0x13f(8047d73, 0, 2302, 1b6, > fffffe800017fe48, 0, > 12, 0) > fffffe800017fea0 copen+0x323(ffd19553, 8047d73, 2302, 1b6) > fffffe800017fec0 creat64+0x29(8047d73, 1b6) > fffffe800017ff10 _sys_sysenter_post_swapgs+0x14b() > > > [1] # zpool status > pool: share > state: ONLINE > scrub: none requested > config: > > NAME STATE READ WRITE CKSUM > share ONLINE 0 0 0 > c2d0s0 ONLINE 0 0 0 > > errors: No known data errors > > I''ll see if it''s reproducible. > > Tao > > > > ------------------------------------------------------------------------ > > _______________________________________________ > zfs-discuss mailing list > zfs-discuss at opensolaris.org > http://mail.opensolaris.org/mailman/listinfo/zfs-discuss-- Neil
Neil, On 4/24/06, Neil Perrin <Neil.Perrin at sun.com> wrote:> > Tao, > > We''re not sure what''s going on here. We don''t see an obvious > deadlock from the stacks provided. Everything seems stacked up > behind the txg_wait_open() which is presumably blocked for some reason. > I''ve also tried for several hours to reproduce this without success. > Did you manage to reproduce the problem or get a dump using "$<systedump" > ? > We really need more information to help. > >I didn''t manage to get a dump. It''s a Dell low end "server" (SC 420). I don''t have frkit''s "acpidrv" installed so that I may use powerbutton to force a dump (according to Casper). The good news is I just recreated the problem using the same procedure. Shall I try "$<systedump" ? Or would it be more helpful to let you (or anyone on this list that can investigate) login as root? It''s a personal box at home. Drop me an email if you want to get access. Thanks, Tao -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://mail.opensolaris.org/pipermail/zfs-discuss/attachments/20060424/64115cd8/attachment.html>
Report back to the list, my problem has been identified as 6405008 Reboot hung after a stress test (Summary: The transaction chain is stalled due to the range locking code holding an assigned tx and trying to acquire a range lock which another thread owns.) It is fixed in snv_38. Big thanks to Neil Perrin, and others who helped in zfs team. Tao On 4/24/06, Neil Perrin <Neil.Perrin at sun.com> wrote:> > Tao, > > We''re not sure what''s going on here. We don''t see an obvious > deadlock from the stacks provided. Everything seems stacked up > behind the txg_wait_open() which is presumably blocked for some reason. > I''ve also tried for several hours to reproduce this without success. > Did you manage to reproduce the problem or get a dump using "$<systedump" > ? > We really need more information to help. > > Thanks: Neil. > > Tao Chen wrote On 04/23/06 23:03,: > > > PID 620: > > > > # echo "0t620::pid2proc|::walk thread|::findstack -v" | mdb -k > > stack pointer for thread ffffffff82d6d400: fffffe8000414820 > > [ fffffe8000414820 _resume_from_idle+0xf8() ] > > fffffe8000414860 swtch+0x185() > > fffffe8000414890 cv_wait+0x6f(ffffffff8a8e5180, ffffffff84625410) > > fffffe8000414910 zfs_range_lock_writer+0x166(ffffffff846253c8, > > ffffffff8aa10f20) > > fffffe8000414970 zfs_range_lock+0xb8(ffffffff846253c8, 0, > > ffffffffffffffff, 1 > > ) > > fffffe8000414a50 zfs_create+0x477(ffffffff824cbc00, > ffffffff8336fb87, > > fffffe8000414cd8, 0, 80, fffffe8000414cc0, ffffffff82e17598, 2102) > > fffffe8000414ac0 fop_create+0x50(ffffffff824cbc00, > ffffffff8336fb87, > > fffffe8000414cd8, 0, 80, fffffe8000414cc0, ffffffff82e17598, 2102) > > fffffe8000414c60 vn_createat+0x591(8047d73, 0, fffffe8000414cd8, > > 0, 80, > > fffffe8000414cc0, 0, 2102, 12, 0) > > fffffe8000414de0 vn_openat+0x13f(8047d73, 0, 2302, 1b6, > > fffffe8000414e48, 0, > > 12, 0) > > fffffe8000414ea0 copen+0x323(ffd19553, 8047d73, 2302, 1b6) > > fffffe8000414ec0 creat64+0x29(8047d73, 1b6) > > fffffe8000414f10 
_sys_sysenter_post_swapgs+0x14b() > > >-------------- next part -------------- An HTML attachment was scrubbed... URL: <http://mail.opensolaris.org/pipermail/zfs-discuss/attachments/20060427/cae8863a/attachment.html>