6.2-PRERELEASE from 20061016 RELENG_6 sources. SMP with CPU: Intel(R) Core(TM)2 CPU 6400 @ 2.13GHz (2127.97-MHz 686-class CPU) GENERIC minus some devices plus: options IPSEC options IPSEC_ESP device tap device vlan device hwpmc options HWPMC_HOOKS options KDB options DDB This also happened a couple times with 6.2-PRERELEASE from 20061004 RELENG_6 sources before I updated a couple days ago. In userland from a csh session, I used Ctrl-Z to suspend a Bourne shell script writing to a file through an amd host NFS (tcp v3) mount back to my own machine. Sometimes (maybe 5 times out of 20? in the last week) when I do this, I get a locked vnode problem (see ddb session below) and can't do anything with the filesystems at all (local or nfs). Shells hang when I execute any command (until the suspended 'sh' is killed from ddb). Interrupts still work; Ctrl-Alt-F# allows switching from X to ttyv* consoles. Dropping into ddb allows me to kill the 'sh' with -9 and then continuing from ddb lets me continue working and things are back to normal. 
KDB: enter: manual escape to debugger [thread pid 19 tid 100025 ] Stopped at 0xc05d5bcf = kdb_enter+0x2b: nop db> show lockedvnods Locked vnodes 0xc6b7bdd0: tag nfs, type VDIR usecount 2, writecount 0, refcount 8 mountedhere 0 flags (VV_ROOT) v_object 0xc9d84108 ref 0 pages 0 lock type nfs: EXCL (count 1) by thread 0xc8adac00 (pid 50746) with 5 pending fileid 8 fsid 0x300ff06 db> ps pid ppid pgrp uid state wmesg wchan cmd 50761 1699 50761 0 S+ nfs 0xc6b7be28 ls 50760 1699 50760 0 S+ nfs 0xc6b7be28 lsof 50759 48969 48935 600 S+ nanslp 0xc088efcc seamonkey-bin 50758 48969 48935 600 S+ nanslp 0xc088efcc seamonkey-bin 50757 50756 50757 0 Ss nfs 0xc6b7be28 procmail 50756 1 809 0 S select 0xc0895ba4 sendmail 50751 50750 50751 0 Ss nfs 0xc6b7be28 procmail 50750 1 809 0 S select 0xc0895ba4 sendmail 50748 50747 1418 600 SV+ nfs 0xc6b7be28 csh 50747 1418 1418 600 S+ ppwait 0xc8573430 csh 50746 50000 49999 600 T+ sh 50000 1418 49999 600 T+ formail 49999 1418 49999 600 T+ pipewr 0xc910f000 cat . . db>db> trace 50746 Tracing pid 50746 tid 100231 td 0xc8adac00 sched_switch(c8adac00,0,2) at 0xc05ce0cb = sched_switch+0x173 mi_switch(2,0) at 0xc05c2b0a = mi_switch+0x1ba thread_suspend_check(1,c079e04c,c8adac00,c9206b80,1,...) at 0xc05c722d = thread_suspend_check+0x191 sleepq_catch_signals(c9206b80) at 0xc05db93f = sleepq_catch_signals+0x103 sleepq_wait_sig(c9206b80) at 0xc05dbd96 = sleepq_wait_sig+0xe msleep(c9206b80,c08a6a40,153,c0813379,0) at 0xc05c2652 = msleep+0x25a nfs_reply(c9206b80,0,c8adac00,4,c7ea7100,...) at 0xc06c33ac = nfs_reply+0x244 nfs_request(c6b7bdd0,c6ae2d00,1,c8adac00,c7815280,e8f3488c,e8f34890,e8f34894,c8adac00,e8f348a0) at 0xc06c40a5 = nfs_request+0x3c1 nfs_getattr(e8f348dc) at 0xc06c912b = nfs_getattr+0x11f VOP_GETATTR_APV(c086c700,e8f348dc) at 0xc07b260c = VOP_GETATTR_APV+0x38 nfsspec_access(e8f34a8c,c6bf7c94,0,e8f349a4,c060ca26,...) 
at 0xc06cebf1 = nfsspec_access+0x85 nfs_access(e8f34a8c) at 0xc06c8b7a = nfs_access+0x122 VOP_ACCESS_APV(c086c700,e8f34a8c) at 0xc07b25b0 = VOP_ACCESS_APV+0x38 nfs_lookup(e8f34b18) at 0xc06c96ff = nfs_lookup+0xd3 VOP_LOOKUP_APV(c086c700,e8f34b18) at 0xc07b22f7 = VOP_LOOKUP_APV+0x43 lookup(e8f34c00) at 0xc060ee79 = lookup+0x4c1 namei(e8f34c00) at 0xc060e71a = namei+0x39a kern_stat(c8adac00,806712c,0,e8f34c74) at 0xc061d3cd = kern_stat+0x35 stat(c8adac00,e8f34d04) at 0xc061d37b = stat+0x1b syscall(3b,3b,3b,1,80670ec,...) at 0xc07a9363 = syscall+0x2bf Xint0x80_syscall() at 0xc079456f = Xint0x80_syscall+0x1f --- syscall (188, FreeBSD ELF32, stat), eip = 0x28196477, esp = 0xbfbfdc1c, ebp = 0xbfbfdcb8 --- db> kill 9 50746 db> c
On Wed, Oct 18, 2006 at 10:01:45AM -0600, John E Hein wrote: > 6.2-PRERELEASE from 20061016 RELENG_6 sources. > Locked vnodes > > 0xc6b7bdd0: tag nfs, type VDIR > usecount 2, writecount 0, refcount 8 mountedhere 0 > flags (VV_ROOT) > v_object 0xc9d84108 ref 0 pages 0 > lock type nfs: EXCL (count 1) by thread 0xc8adac00 (pid 50746) with 5 pending > fileid 8 fsid 0x300ff06 > > 50746 50000 49999 600 T+ sh > . > . > db>db> trace 50746 > Tracing pid 50746 tid 100231 td 0xc8adac00 > sched_switch(c8adac00,0,2) at 0xc05ce0cb = sched_switch+0x173 > mi_switch(2,0) at 0xc05c2b0a = mi_switch+0x1ba > thread_suspend_check(1,c079e04c,c8adac00,c9206b80,1,...) at 0xc05c722d = thread_suspend_check+0x191 > sleepq_catch_signals(c9206b80) at 0xc05db93f = sleepq_catch_signals+0x103 > sleepq_wait_sig(c9206b80) at 0xc05dbd96 = sleepq_wait_sig+0xe > msleep(c9206b80,c08a6a40,153,c0813379,0) at 0xc05c2652 = msleep+0x25a > nfs_reply(c9206b80,0,c8adac00,4,c7ea7100,...) at 0xc06c33ac = nfs_reply+0x244 > nfs_request(c6b7bdd0,c6ae2d00,1,c8adac00,c7815280,e8f3488c,e8f34890,e8f34894,c8adac00,e8f348a0) at 0xc06c40a5 = nfs_request+0x3c1 > nfs_getattr(e8f348dc) at 0xc06c912b = nfs_getattr+0x11f > VOP_GETATTR_APV(c086c700,e8f348dc) at 0xc07b260c = VOP_GETATTR_APV+0x38 > nfsspec_access(e8f34a8c,c6bf7c94,0,e8f349a4,c060ca26,...) at 0xc06cebf1 = nfsspec_access+0x85 > nfs_access(e8f34a8c) at 0xc06c8b7a = nfs_access+0x122 > VOP_ACCESS_APV(c086c700,e8f34a8c) at 0xc07b25b0 = VOP_ACCESS_APV+0x38 > nfs_lookup(e8f34b18) at 0xc06c96ff = nfs_lookup+0xd3 > VOP_LOOKUP_APV(c086c700,e8f34b18) at 0xc07b22f7 = VOP_LOOKUP_APV+0x43 > lookup(e8f34c00) at 0xc060ee79 = lookup+0x4c1 > namei(e8f34c00) at 0xc060e71a = namei+0x39a > kern_stat(c8adac00,806712c,0,e8f34c74) at 0xc061d3cd = kern_stat+0x35 > stat(c8adac00,e8f34d04) at 0xc061d37b = stat+0x1b > syscall(3b,3b,3b,1,80670ec,...) 
at 0xc07a9363 = syscall+0x2bf > Xint0x80_syscall() at 0xc079456f = Xint0x80_syscall+0x1f > --- syscall (188, FreeBSD ELF32, stat), eip = 0x28196477, esp = 0xbfbfdc1c, ebp = 0xbfbfdcb8 --- > db> kill 9 50746 > db> c The nfs_reply is sleeping with the PCATCH set. The question is why SIGTSTP does not cause msleep to return with EINTR. -------------- next part -------------- A non-text attachment was scrubbed... Name: not available Type: application/pgp-signature Size: 187 bytes Desc: not available Url : http://lists.freebsd.org/pipermail/freebsd-stable/attachments/20061019/bdbbc2fa/attachment.pgp
On Thursday 19 October 2006 06:04, Kostik Belousov wrote: > On Wed, Oct 18, 2006 at 10:01:45AM -0600, John E Hein wrote: > > 6.2-PRERELEASE from 20061016 RELENG_6 sources. > > Locked vnodes > > > > 0xc6b7bdd0: tag nfs, type VDIR > > usecount 2, writecount 0, refcount 8 mountedhere 0 > > flags (VV_ROOT) > > v_object 0xc9d84108 ref 0 pages 0 > > lock type nfs: EXCL (count 1) by thread 0xc8adac00 (pid 50746) with 5 pending > > fileid 8 fsid 0x300ff06 > > > > 50746 50000 49999 600 T+ sh > > . > > . > > db>db> trace 50746 > > Tracing pid 50746 tid 100231 td 0xc8adac00 > > sched_switch(c8adac00,0,2) at 0xc05ce0cb = sched_switch+0x173 > > mi_switch(2,0) at 0xc05c2b0a = mi_switch+0x1ba > > thread_suspend_check(1,c079e04c,c8adac00,c9206b80,1,...) at 0xc05c722d = thread_suspend_check+0x191 > > sleepq_catch_signals(c9206b80) at 0xc05db93f = sleepq_catch_signals+0x103 > > sleepq_wait_sig(c9206b80) at 0xc05dbd96 = sleepq_wait_sig+0xe > > msleep(c9206b80,c08a6a40,153,c0813379,0) at 0xc05c2652 = msleep+0x25a > > nfs_reply(c9206b80,0,c8adac00,4,c7ea7100,...) at 0xc06c33ac = nfs_reply+0x244 > > nfs_request(c6b7bdd0,c6ae2d00,1,c8adac00,c7815280,e8f3488c,e8f34890,e8f34894,c8adac00,e8f348a0) at 0xc06c40a5 = nfs_request+0x3c1 > > nfs_getattr(e8f348dc) at 0xc06c912b = nfs_getattr+0x11f > > VOP_GETATTR_APV(c086c700,e8f348dc) at 0xc07b260c = VOP_GETATTR_APV+0x38 > > nfsspec_access(e8f34a8c,c6bf7c94,0,e8f349a4,c060ca26,...) at 0xc06cebf1 = nfsspec_access+0x85 > > nfs_access(e8f34a8c) at 0xc06c8b7a = nfs_access+0x122 > > VOP_ACCESS_APV(c086c700,e8f34a8c) at 0xc07b25b0 = VOP_ACCESS_APV+0x38 > > nfs_lookup(e8f34b18) at 0xc06c96ff = nfs_lookup+0xd3 > > VOP_LOOKUP_APV(c086c700,e8f34b18) at 0xc07b22f7 = VOP_LOOKUP_APV+0x43 > > lookup(e8f34c00) at 0xc060ee79 = lookup+0x4c1 > > namei(e8f34c00) at 0xc060e71a = namei+0x39a > > kern_stat(c8adac00,806712c,0,e8f34c74) at 0xc061d3cd = kern_stat+0x35 > > stat(c8adac00,e8f34d04) at 0xc061d37b = stat+0x1b > > syscall(3b,3b,3b,1,80670ec,...) 
at 0xc07a9363 = syscall+0x2bf > > Xint0x80_syscall() at 0xc079456f = Xint0x80_syscall+0x1f > > --- syscall (188, FreeBSD ELF32, stat), eip = 0x28196477, esp = 0xbfbfdc1c, ebp = 0xbfbfdcb8 --- > > db> kill 9 50746 > > db> c > > The nfs_reply is sleeping with the PCATCH set. The question is why SIGTSTP > does not cause msleep to return with EINTR. The problem is in thread_suspend_check(), not the sleepq code. -- John Baldwin