Attached are updated patches (against both 4.8 and 5.0) for delaying disk buffer syncing on softupdates-enabled filesystems. The original patch started a rather lengthy debate about whether fsync() processing should also be delayed while disk updates are being delayed. As Kirk McKusick already summarized, some people will prefer partial battery power savings with working fsync() semantics, while others will desire greater savings with broken semantics. Therefore, as suggested, the updated patch introduces an additional sysctl tunable, vfs.ena_lazy_fsync, which controls whether fsync() calls are ignored. Note that when vfs.sync_extdelay is set to 0, vfs.ena_lazy_fsync has no effect, i.e. fsync() always works with standard semantics. Cheers, Marko -------------- next part -------------- --- /usr/src/sys.org/dev/ata/ata-disk.c Thu Jan 30 08:19:59 2003 +++ dev/ata/ata-disk.c Sat Apr 12 00:31:26 2003 @@ -294,6 +294,7 @@ adstrategy(struct buf *bp) struct ad_softc *adp = bp->b_dev->si_drv1; int s; + stratcalls++; if (adp->device->flags & ATA_D_DETACHING) { bp->b_error = ENXIO; bp->b_flags |= B_ERROR; --- /usr/src/sys.org/kern/vfs_subr.c Sun Oct 13 18:19:12 2002 +++ kern/vfs_subr.c Mon Apr 14 23:27:52 2003 @@ -116,6 +116,13 @@ SYSCTL_INT(_vfs, OID_AUTO, reassignbufme static int nameileafonly = 0; SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, ""); +int stratcalls = 0; +int sync_extdelay = 0; +SYSCTL_INT(_vfs, OID_AUTO, sync_extdelay, CTLFLAG_RW, &sync_extdelay, 0, ""); + +int ena_lazy_fsync = 0; +SYSCTL_INT(_vfs, OID_AUTO, ena_lazy_fsync, CTLFLAG_RW, &ena_lazy_fsync, 0, ""); + #ifdef ENABLE_VFS_IOOPT int vfs_ioopt = 0; SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, ""); @@ -137,7 +144,7 @@ static vm_zone_t vnode_zone; * The workitem queue. 
*/ #define SYNCER_MAXDELAY 32 -static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ +int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ time_t syncdelay = 30; /* max time to delay syncing data */ time_t filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, ""); @@ -145,7 +152,7 @@ time_t dirdelay = 29; /* time to delay SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, ""); time_t metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, ""); -static int rushjob; /* number of slots to run ASAP */ +int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, ""); @@ -1119,7 +1127,7 @@ sched_sync(void) { struct synclist *slp; struct vnode *vp; - long starttime; + time_t starttime; int s; struct proc *p = updateproc; @@ -1127,8 +1135,6 @@ sched_sync(void) SHUTDOWN_PRI_LAST); for (;;) { - kproc_suspend_loop(p); - starttime = time_second; /* @@ -1198,8 +1204,25 @@ sched_sync(void) * matter as we are just trying to generally pace the * filesystem activity. 
*/ - if (time_second == starttime) + if (time_second != starttime) + continue; + + if (sync_extdelay >= syncer_maxdelay) + while (syncer_delayno == 0 && rushjob == 0 && + abs(time_second - starttime) < sync_extdelay) { + stratcalls = 0; tsleep(&lbolt, PPAUSE, "syncer", 0); + kproc_suspend_loop(p); + if (stratcalls != 0 && syncer_maxdelay < + abs(time_second - starttime)) { + rushjob = syncer_maxdelay; + break; + } + } + else { + tsleep(&lbolt, PPAUSE, "syncer", 0); + kproc_suspend_loop(p); + } } } --- /usr/src/sys.org/kern/vfs_syscalls.c Thu Jan 2 18:26:18 2003 +++ kern/vfs_syscalls.c Tue Apr 15 13:42:01 2003 @@ -563,6 +563,9 @@ sync(p, uap) register struct mount *mp, *nmp; int asyncflag; + /* Notify sched_sync() to try flushing syncer_workitem_pending[*] */ + rushjob += syncer_maxdelay; + simple_lock(&mountlist_slock); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { @@ -2627,6 +2630,10 @@ fsync(p, uap) struct file *fp; vm_object_t obj; int error; + + /* Just return if we are artificially delaying disk syncs */ + if (sync_extdelay && ena_lazy_fsync) + return (0); if ((error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); --- /usr/src/sys.org/ufs/ffs/ffs_alloc.c Fri Sep 21 21:15:21 2001 +++ ufs/ffs/ffs_alloc.c Sat Apr 12 00:06:20 2003 @@ -125,6 +125,10 @@ ffs_alloc(ip, lbn, bpref, size, cred, bn #endif /* DIAGNOSTIC */ if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) goto nospace; + /* Speedup flushing of syncer_wokitem_pending[*] if low on freespace */ + if (rushjob == 0 && + freespace(fs, fs->fs_minfree + 2) - numfrags(fs, size) < 0) + rushjob = syncer_maxdelay; if (cred->cr_uid != 0 && freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0) goto nospace; @@ -195,6 +199,10 @@ ffs_realloccg(ip, lbprev, bpref, osize, if (cred == NOCRED) panic("ffs_realloccg: missing credential"); #endif /* DIAGNOSTIC */ + /* Speedup flushing of syncer_wokitem_pending[*] if low on freespace */ + 
if (rushjob == 0 && + freespace(fs, fs->fs_minfree + 2) - numfrags(fs, nsize - osize) < 0) + rushjob = syncer_maxdelay; if (cred->cr_uid != 0 && freespace(fs, fs->fs_minfree) - numfrags(fs, nsize - osize) < 0) goto nospace; --- /usr/src/sys.org/sys/buf.h Sat Jan 25 20:02:23 2003 +++ sys/buf.h Sat Apr 12 00:30:48 2003 @@ -478,6 +478,7 @@ extern char *buffers; /* The buffer con extern int bufpages; /* Number of memory pages in the buffer pool. */ extern struct buf *swbuf; /* Swap I/O buffer headers. */ extern int nswbuf; /* Number of swap I/O buffer headers. */ +extern int stratcalls; /* I/O ops since last buffer sync */ extern TAILQ_HEAD(swqueue, buf) bswlist; extern TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES]; --- /usr/src/sys.org/sys/vnode.h Sun Dec 29 19:19:53 2002 +++ sys/vnode.h Mon Apr 14 23:28:36 2003 @@ -294,6 +294,10 @@ extern struct vm_zone *namei_zone; extern int prtactive; /* nonzero to call vprint() */ extern struct vattr va_null; /* predefined null vattr structure */ extern int vfs_ioopt; +extern int rushjob; +extern int syncer_maxdelay; +extern int sync_extdelay; +extern int ena_lazy_fsync; /* * Macro/function to check for client cache inconsistency w.r.t. leasing. -------------- next part -------------- --- /usr/src/sys.org/dev/ata/ata-disk.c Sat Nov 16 09:07:36 2002 +++ dev/ata/ata-disk.c Tue Apr 15 15:23:37 2003 @@ -289,6 +289,7 @@ adstrategy(struct bio *bp) struct ad_softc *adp = bp->bio_dev->si_drv1; int s; + stratcalls++; if (adp->device->flags & ATA_D_DETACHING) { biofinish(bp, NULL, ENXIO); return; --- /usr/src/sys.org/kern/vfs_subr.c Sat Nov 16 09:08:02 2002 +++ kern/vfs_subr.c Tue Apr 15 15:34:19 2003 @@ -73,6 +73,8 @@ #include <vm/vm_page.h> #include <vm/uma.h> +#define abs(x) (((x) < 0) ? 
-(x) : (x)) + static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); static void addalias(struct vnode *vp, dev_t nvp_rdev); @@ -130,6 +132,13 @@ SYSCTL_INT(_vfs, OID_AUTO, reassignbufca static int nameileafonly; SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, ""); +int stratcalls = 0; +int sync_extdelay = 0; +SYSCTL_INT(_vfs, OID_AUTO, sync_extdelay, CTLFLAG_RW, &sync_extdelay, 0, ""); + +int ena_lazy_fsync = 0; +SYSCTL_INT(_vfs, OID_AUTO, ena_lazy_fsync, CTLFLAG_RW, &ena_lazy_fsync, 0, ""); + #ifdef ENABLE_VFS_IOOPT /* See NOTES for a description of this setting. */ int vfs_ioopt; @@ -208,7 +217,7 @@ static struct synclist *syncer_workitem_ static struct mtx sync_mtx; #define SYNCER_MAXDELAY 32 -static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ +int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ static int syncdelay = 30; /* max time to delay syncing data */ static int filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, ""); @@ -216,7 +225,7 @@ static int dirdelay = 29; /* time to de SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, ""); static int metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, ""); -static int rushjob; /* number of slots to run ASAP */ +int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, ""); @@ -1669,7 +1678,7 @@ sched_sync(void) struct synclist *slp; struct vnode *vp; struct mount *mp; - long starttime; + time_t starttime; int s; struct thread *td = FIRST_THREAD_IN_PROC(updateproc); /* XXXKSE */ @@ -1679,8 +1688,6 @@ sched_sync(void) SHUTDOWN_PRI_LAST); for (;;) { - kthread_suspend_check(td->td_proc); - starttime = time_second; /* @@ -1765,8 +1772,25 @@ sched_sync(void) * 
matter as we are just trying to generally pace the * filesystem activity. */ - if (time_second == starttime) + if (time_second != starttime) + continue; + + if (sync_extdelay >= syncer_maxdelay) + while (syncer_delayno == 0 && rushjob == 0 && + abs(time_second - starttime) < sync_extdelay) { + stratcalls = 0; tsleep(&lbolt, PPAUSE, "syncer", 0); + kthread_suspend_check(td->td_proc); + if (stratcalls != 0 && syncer_maxdelay < + abs(time_second - starttime)) { + rushjob = syncer_maxdelay; + break; + } + } + else { + tsleep(&lbolt, PPAUSE, "syncer", 0); + kthread_suspend_check(td->td_proc); + } } } --- /usr/src/sys.org/kern/vfs_syscalls.c Sat Nov 16 09:08:02 2002 +++ kern/vfs_syscalls.c Tue Apr 15 17:38:55 2003 @@ -123,6 +123,9 @@ sync(td, uap) struct mount *mp, *nmp; int asyncflag; + /* Notify sched_sync to try flushing dirty buffers */ + rushjob += syncer_maxdelay; + mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { @@ -2704,6 +2707,10 @@ fsync(td, uap) struct file *fp; vm_object_t obj; int error; + + /* Just return if we are artificially delaying disk synchs */ + if (sync_extdelay && ena_lazy_fsync) + return (0); GIANT_REQUIRED; --- /usr/src/sys.org/sys/bio.h Sat Nov 16 09:08:19 2002 +++ sys/bio.h Tue Apr 15 15:24:20 2003 @@ -134,6 +134,8 @@ bioq_first(struct bio_queue_head *head) return (TAILQ_FIRST(&head->queue)); } +extern int stratcalls; + void biodone(struct bio *bp); void biofinish(struct bio *bp, struct devstat *stat, int error); int biowait(struct bio *bp, const char *wchan); --- /usr/src/sys.org/sys/vnode.h Sat Nov 16 09:08:21 2002 +++ sys/vnode.h Tue Apr 15 15:23:38 2003 @@ -361,6 +361,10 @@ extern struct uma_zone *namei_zone; extern int prtactive; /* nonzero to call vprint() */ extern struct vattr va_null; /* predefined null vattr structure */ extern int vfs_ioopt; +extern int rushjob; +extern int syncer_maxdelay; +extern int sync_extdelay; +extern int ena_lazy_fsync; 
/* * Macro/function to check for client cache inconsistency w.r.t. leasing. --- /usr/src/sys.org/ufs/ffs/ffs_alloc.c Sat Nov 16 09:08:21 2002 +++ ufs/ffs/ffs_alloc.c Tue Apr 15 15:26:37 2003 @@ -139,6 +139,10 @@ ffs_alloc(ip, lbn, bpref, size, cred, bn #endif /* DIAGNOSTIC */ reclaimed = 0; retry: + /* Speedup flushing of dirty buffers in sched_sync */ + if (rushjob == 0 && + freespace(fs, fs->fs_minfree + 2) - numfrags(fs, size) < 0) + rushjob = syncer_maxdelay; if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) goto nospace; if (suser_cred(cred, PRISON_ROOT) && @@ -222,6 +226,10 @@ ffs_realloccg(ip, lbprev, bprev, bpref, #endif /* DIAGNOSTIC */ reclaimed = 0; retry: + /* Speedup flushing of dirty buffers in sched_sync */ + if (rushjob == 0 && + freespace(fs, fs->fs_minfree + 2) - numfrags(fs, nsize - osize) < 0) + rushjob = syncer_maxdelay; if (suser_cred(cred, PRISON_ROOT) && freespace(fs, fs->fs_minfree) - numfrags(fs, nsize - osize) < 0) goto nospace;