Andreas Longwitz
2013-Jul-07 22:26 UTC
Shutdown hangs on unmount of a gjournaled file system in 8-Stable
The problem occurs after an update of 8-stable from r248120 to r252111. Sometimes shutdown hangs: Waiting (max 60 seconds) for system process `vnlru' to stop...done Waiting (max 60 seconds) for system process `bufdaemon' to stop...done Waiting (max 60 seconds) for system process `syncer' to stop... Syncing disks, vnodes remaining...0 0 done All buffers synced. From the kernel dump I see the deadlock occurs on unmount of a gjournaled file system. Two processes are involved: db> ps pid ppid pgrp uid state wmesg wchan cmd 1 0 1 0 SLs mount dr 0xffffff007f7e559c [init] 18 0 0 0 SL suspwt 0xffffff007f7e5364 [g_journal switcher] (kgdb) info threads 158 Thread 100002 (PID=1: init) sched_switch (td=0xffffff000235e8e0, newtd=<value optimized out>, flags=<value optimized out>) at /usr/src/sys/kern/sched_ule.c:1932 .... 217 Thread 100076 (PID=18: g_journal switcher) sched_switch (td=0xffffff0002bd6000, newtd=<value optimized out>, flags=<value optimized out>) at /usr/src/sys/kern/sched_ule.c:1932 (kgdb) thread 158 [Switching to thread 158 (Thread 100002)]#0 sched_switch (td=0xffffff000235e8e0, newtd=<value optimized out>, flags=<value optimized out>) at /usr/src/sys/kern/sched_ule.c:1932 1932 cpuid = PCPU_GET(cpuid); (kgdb) bt #0 sched_switch (td=0xffffff000235e8e0, newtd=<value optimized out>, flags=<value optimized out>) at /usr/src/sys/kern/sched_ule.c:1932 #1 0xffffffff80407836 in mi_switch (flags=260, newtd=0x0) at /usr/src/sys/kern/kern_synch.c:466 #2 0xffffffff8043e0e2 in sleepq_wait (wchan=0xffffff007f7e559c, pri=80) at /usr/src/sys/kern/subr_sleepqueue.c:613 #3 0xffffffff80407fc6 in _sleep (ident=0xffffff007f7e559c, lock=0xffffff007f7e52f0, priority=<value optimized out>, wmesg=0xffffffff8069f595 "mount drain", timo=0) at /usr/src/sys/kern/kern_synch.c:250 #4 0xffffffff8048ee42 in dounmount (mp=0xffffff007f7e52f0, flags=524288, td=<value optimized out>) at /usr/src/sys/kern/vfs_mount.c:1266 #5 0xffffffff80493202 in vfs_unmountall () at /usr/src/sys/kern/vfs_subr.c:3321 
#6 0xffffffff803fec69 in boot (howto=<value optimized out>) at /usr/src/sys/kern/kern_shutdown.c:428 #7 0xffffffff803fef86 in reboot (td=<value optimized out>, uap=0xffffff8000238bb0) at /usr/src/sys/kern/kern_shutdown.c:191 #8 0xffffffff805db1b4 in amd64_syscall (td=0xffffff000235e8e0, traced=0) at subr_syscall.c:114 #9 0xffffffff805c282c in Xfast_syscall () at /usr/src/sys/amd64/amd64/exception.S:387 (kgdb) f 5 #5 0xffffffff80493202 in vfs_unmountall () at /usr/src/sys/kern/vfs_subr.c:3321 3321 error = dounmount(mp, MNT_FORCE, td); (kgdb) p mp->mnt_lockref $1=1 (kgdb) f 4 #4 0xffffffff8048ee42 in dounmount (mp=0xffffff007f7e52f0, flags=524288, td=<value optimized out>) at /usr/src/sys/kern/vfs_mount.c:1266 1266 error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS, (kgdb) list 1261 if (flags & MNT_FORCE) 1262 mp->mnt_kern_flag |= MNTK_UNMOUNTF; 1263 error = 0; 1264 if (mp->mnt_lockref) { 1265 mp->mnt_kern_flag |= MNTK_DRAINING; 1266 error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS, 1267 "mount drain", 0); 1268 } 1269 MNT_IUNLOCK(mp); 1270 KASSERT(mp->mnt_lockref == 0, (kgdb) thread 217 [Switching to thread 217 (Thread 100076)]#0 sched_switch (td=0xffffff0002bd6000, newtd=<value optimized out>, flags=<value optimized out>) at /usr/src/sys/kern/sched_ule.c:1932 1932 cpuid = PCPU_GET(cpuid); (kgdb) bt #0 sched_switch (td=0xffffff0002bd6000, newtd=<value optimized out>, flags=<value optimized out>) at /usr/src/sys/kern/sched_ule.c:1932 #1 0xffffffff80407836 in mi_switch (flags=260, newtd=0x0) at /usr/src/sys/kern/kern_synch.c:466 #2 0xffffffff8043e0e2 in sleepq_wait (wchan=0xffffff007f7e5364, pri=159) at /usr/src/sys/kern/subr_sleepqueue.c:613 #3 0xffffffff80407fc6 in _sleep (ident=0xffffff007f7e5364, lock=0xffffff007f7e52f0, priority=<value optimized out>, wmesg=0xffffffff806a0813 "suspwt", timo=0) at /usr/src/sys/kern/kern_synch.c:250 #4 0xffffffff804a25f0 in vfs_write_suspend (mp=0xffffff007f7e52f0) at /usr/src/sys/kern/vfs_vnops.c:1277 #5 0xffffffff80c843bd in 
g_journal_switcher (arg=<value optimized out>) at /usr/src/sys/modules/geom/geom_journal/../../../geom/journal/g_journal.c:2968 #6 0xffffffff803d326f in fork_exit (callout=0xffffffff80c838e0 <g_journal_switcher>, arg=0xffffffff80c8b140, frame=0xffffff8242e68c40) at /usr/src/sys/kern/kern_fork.c:872 #7 0xffffffff805c2a0e in fork_trampoline () at /usr/src/sys/amd64/amd64/exception.S:602 (kgdb) f 4 #4 0xffffffff804a25f0 in vfs_write_suspend (mp=0xffffff007f7e52f0) at /usr/src/sys/kern/vfs_vnops.c:1277 1277 (void) msleep(&mp->mnt_writeopcount, (kgdb) list 1272 while (mp->mnt_kern_flag & MNTK_SUSPEND) 1273 msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0); 1274 mp->mnt_kern_flag |= MNTK_SUSPEND; 1275 mp->mnt_susp_owner = curthread; 1276 if (mp->mnt_writeopcount > 0) 1277 (void) msleep(&mp->mnt_writeopcount, 1278 MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0); 1279 else 1280 MNT_IUNLOCK(mp); 1281 if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0) (kgdb) p mp->mnt_writeopcount $2 = 1 The deadlock can be explained now: pid 1 (init) sleeps on "mount drain" because mp->mnt_lockref was 1. This setting was done by pid 18 (gjournal switcher) by calling vfs_busy(). pid 18 now sleeps on "suspwt" because mp->mnt_writeopcount was 1. This setting was done by pid 1 before going to sleep by calling vn_start_write() in dounmount(). I think the reason for this deadlock is the commit r249055 which seems not to be compatible with gjournal. Andreas Longwitz
Konstantin Belousov
2013-Jul-08 05:43 UTC
Shutdown hangs on unmount of a gjournaled file system in 8-Stable
On Mon, Jul 08, 2013 at 12:26:43AM +0200, Andreas Longwitz wrote: > The deadlock can be explained now: pid 1 (init) sleeps on "mount drain" > because mp->mnt_lockref was 1. This setting was done by pid 18 (gjournal > switcher) by calling vfs_busy(). pid 18 now sleeps on "suspwt" because > mp->mnt_writeopcount was 1. This setting was done by pid 1 before going > to sleep by calling vn_start_write() in dounmount(). > > I think the reason for this deadlock is the commit r249055 which seems > not to be compatible with gjournal. Thank you for the analysis. I think 'not compatible' is something of an understatement. The situation clearly causes a deadlock, you are right. The vfs_busy(); vfs_write_suspend(); call sequence is somewhat dubious, in fact, exactly because unmount could start in between. I think that vfs_write_suspend() must avoid setting MNTK_SUSPEND if unmount was started. Patch below, for HEAD, should fix the problem, by marking the callers of vfs_write_suspend(), which are not protected by the covered vnode lock, with the VS_SKIP_UNMOUNT flag. I believe that the conflicts on stable/8 should be trivial, if any. diff --git a/sys/geom/journal/g_journal.c b/sys/geom/journal/g_journal.c index a3c996c..3ce2785 100644 --- a/sys/geom/journal/g_journal.c +++ b/sys/geom/journal/g_journal.c @@ -2960,7 +2960,7 @@ g_journal_do_switch(struct g_class *classp) GJ_TIMER_STOP(1, &bt, "BIO_FLUSH time of %s", sc->sc_name); GJ_TIMER_START(1, &bt); - error = vfs_write_suspend(mp); + error = vfs_write_suspend(mp, VS_SKIP_UNMOUNT); GJ_TIMER_STOP(1, &bt, "Suspend time of %s", mountpoint); if (error != 0) { GJ_DEBUG(0, "Cannot suspend file system %s (error=%d).", diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 7eac0ef..06e59f9 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -1668,8 +1668,7 @@ vn_finished_secondary_write(mp) * Request a filesystem to suspend write operations. 
*/ int -vfs_write_suspend(mp) - struct mount *mp; +vfs_write_suspend(struct mount *mp, int flags) { int error; @@ -1680,6 +1679,21 @@ vfs_write_suspend(mp) } while (mp->mnt_kern_flag & MNTK_SUSPEND) msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0); + + /* + * Unmount holds a write reference on the mount point. If we + * own busy reference and drain for writers, we deadlock with + * the reference draining in the unmount path. Callers of + * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if + * vfs_busy() reference is owned and caller is not in the + * unmount context. + */ + if ((flags & VS_SKIP_UNMOUNT) != 0 && + (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { + MNT_IUNLOCK(mp); + return (EBUSY); + } + mp->mnt_kern_flag |= MNTK_SUSPEND; mp->mnt_susp_owner = curthread; if (mp->mnt_writeopcount > 0) diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 42bfb65..b0cbcc0 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -398,6 +398,9 @@ extern int vttoif_tab[]; #define VR_START_WRITE 0x0001 /* vfs_write_resume: start write atomically */ #define VR_NO_SUSPCLR 0x0002 /* vfs_write_resume: do not clear suspension */ +#define VS_SKIP_UNMOUNT 0x0001 /* vfs_write_suspend: fail if the + filesystem is being unmounted */ + #define VREF(vp) vref(vp) #ifdef DIAGNOSTIC @@ -711,7 +714,7 @@ int vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, int vfs_cache_lookup(struct vop_lookup_args *ap); void vfs_timestamp(struct timespec *); void vfs_write_resume(struct mount *mp, int flags); -int vfs_write_suspend(struct mount *mp); +int vfs_write_suspend(struct mount *mp, int flags); int vop_stdbmap(struct vop_bmap_args *); int vop_stdfsync(struct vop_fsync_args *); int vop_stdgetwritemount(struct vop_getwritemount_args *); diff --git a/sys/ufs/ffs/ffs_snapshot.c b/sys/ufs/ffs/ffs_snapshot.c index 9a9c88a..ad157aa 100644 --- a/sys/ufs/ffs/ffs_snapshot.c +++ b/sys/ufs/ffs/ffs_snapshot.c @@ -423,7 +423,7 @@ restart: */ for (;;) { vn_finished_write(wrtmp); - if 
((error = vfs_write_suspend(vp->v_mount)) != 0) { + if ((error = vfs_write_suspend(vp->v_mount, 0)) != 0) { vn_start_write(NULL, &wrtmp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); goto out; diff --git a/sys/ufs/ffs/ffs_suspend.c b/sys/ufs/ffs/ffs_suspend.c index 3198c1a..a8c4578 100644 --- a/sys/ufs/ffs/ffs_suspend.c +++ b/sys/ufs/ffs/ffs_suspend.c @@ -206,7 +206,7 @@ ffs_susp_suspend(struct mount *mp) return (EPERM); #endif - if ((error = vfs_write_suspend(mp)) != 0) + if ((error = vfs_write_suspend(mp, VS_SKIP_UNMOUNT)) != 0) return (error); ump->um_writesuspended = 1; diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c index 57f092c..a87fdfa 100644 --- a/sys/ufs/ffs/ffs_vfsops.c +++ b/sys/ufs/ffs/ffs_vfsops.c @@ -257,7 +257,7 @@ ffs_mount(struct mount *mp) return (error); for (;;) { vn_finished_write(mp); - if ((error = vfs_write_suspend(mp)) != 0) + if ((error = vfs_write_suspend(mp, 0)) != 0) return (error); MNT_ILOCK(mp); if (mp->mnt_kern_flag & MNTK_SUSPENDED) { @@ -1255,7 +1255,7 @@ ffs_unmount(mp, mntflags) */ for (;;) { vn_finished_write(mp); - if ((error = vfs_write_suspend(mp)) != 0) + if ((error = vfs_write_suspend(mp, 0)) != 0) return (error); MNT_ILOCK(mp); if (mp->mnt_kern_flag & MNTK_SUSPENDED) { -------------- next part -------------- A non-text attachment was scrubbed... Name: not available Type: application/pgp-signature Size: 834 bytes Desc: not available URL: <http://lists.freebsd.org/pipermail/freebsd-stable/attachments/20130708/15fc3a2b/attachment.sig>