Richard W.M. Jones
2021-Dec-07 23:07 UTC
[Libguestfs] [PATCH nbdkit] file: Fix implementation of cache=none for writes
When testing virt-v2v we found that cache=none had very pessimal performance in its current implementation when writing. See: https://github.com/libguestfs/virt-v2v/commit/ac59d3b2310511b1537d408b675b19ec9a5d384e However we know of a much better implementation - the one in nbdcopy. This commit copies that implementation (for writes only). A simple test is to do: $ ./nbdkit file out.img cache=none --run 'nbdcopy fedora-33.img $uri' and then check the cache usage of the output file, which should be around 0% (using https://github.com/Feh/nocache): $ cachestats out.img pages in cache: 409/1572864 (0.0%) [filesize=6291456.0K, pagesize=4K] For modular virt-v2v doing a local disk to local disk conversion: - before this change, without cache=none virt-v2v took 93.7 seconds, 19.1% pages cached in output file - before this change, enabling cache=none virt-v2v took 125.4 seconds, 0.0% pages cached in output file ^^^ this is the bad case which caused the investigation - after this change, without cache=none virt-v2v took 93.2 seconds, 19.1% pages cached in output file - after this change, enabling cache=none virt-v2v took 97.9 seconds, 0.1% pages cached in output file I tried to adjust NR_WINDOWS to find an optimum. Increasing it made no difference in performance but predictably caused a slight increase in cached pages. Reducing it slowed performance slightly. So I conclude that 8 is about right, but it probably depends on the hardware. --- plugins/file/nbdkit-file-plugin.pod | 3 ++ plugins/file/file.c | 79 +++++++++++++++++++++++++---- 2 files changed, 72 insertions(+), 10 deletions(-) diff --git a/plugins/file/nbdkit-file-plugin.pod b/plugins/file/nbdkit-file-plugin.pod index 0ac0ee53..f8f0e198 100644 --- a/plugins/file/nbdkit-file-plugin.pod +++ b/plugins/file/nbdkit-file-plugin.pod @@ -117,6 +117,9 @@ cache: nbdkit file disk.img fadvise=sequential cache=none +Only use fadvise=sequential if reading, and the reads are mainly +sequential. + =head2 Files on tmpfs If you want to expose a file that resides on a file system known to diff --git a/plugins/file/file.c b/plugins/file/file.c index 35270a24..caf24b2c 100644 --- a/plugins/file/file.c +++ b/plugins/file/file.c @@ -85,6 +85,69 @@ static int fadvise_mode /* cache mode */ static enum { cache_default, cache_none } cache_mode = cache_default; +/* Define EVICT_WRITES if we are going to evict the page cache + * (cache=none) after writing. This is only known to work on Linux. + */ +#ifdef __linux__ +#define EVICT_WRITES 1 +#endif + +#ifdef EVICT_WRITES +/* Queue writes so they will be evicted from the cache. See + * libnbd.git copy/file-ops.c for the rationale behind this. + */ +#define NR_WINDOWS 8 + +struct write_window { + int fd; + uint64_t offset; + size_t len; +}; + +static pthread_mutex_t window_lock = PTHREAD_MUTEX_INITIALIZER; +static struct write_window window[NR_WINDOWS]; + +static void +evict_writes (int fd, uint64_t offset, size_t len) +{ + ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&window_lock); + + /* Evict the oldest window from the page cache. */ + if (window[0].len > 0) { + sync_file_range (window[0].fd, window[0].offset, window[0].len, + SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| + SYNC_FILE_RANGE_WAIT_AFTER); + posix_fadvise (window[0].fd, window[0].offset, window[0].len, + POSIX_FADV_DONTNEED); + } + + /* Move the Nth window to N-1. */ + memmove (&window[0], &window[1], sizeof window[0] * (NR_WINDOWS-1)); + + /* Set up the current window and tell Linux to start writing it out + * to disk (asynchronously). + */ + sync_file_range (fd, offset, len, SYNC_FILE_RANGE_WRITE); + window[NR_WINDOWS-1].fd = fd; + window[NR_WINDOWS-1].offset = offset; + window[NR_WINDOWS-1].len = len; +} + +/* When we close the handle we must remove any windows which are still + * associated. They missed the boat, oh well :-( + */ +static void +remove_fd_from_window (int fd) +{ + ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&window_lock); + size_t i; + + for (i = 0; i < NR_WINDOWS; ++i) + if (window[i].len > 0 && window[i].fd == fd) + window[i].len = 0; +} +#endif /* EVICT_WRITES */ + /* Any callbacks using lseek must be protected by this lock. */ static pthread_mutex_t lseek_lock = PTHREAD_MUTEX_INITIALIZER; @@ -431,6 +494,9 @@ file_close (void *handle) { struct handle *h = handle; +#ifdef EVICT_WRITES + remove_fd_from_window (h->fd); +#endif close (h->fd); free (h); } @@ -583,15 +649,9 @@ file_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset, { struct handle *h = handle; -#if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_DONTNEED) +#if EVICT_WRITES uint32_t orig_count = count; uint64_t orig_offset = offset; - - /* If cache=none we want to force pages we have just written to the - * file to be flushed to disk so we can immediately evict them from - * the page cache. - */ - if (cache_mode == cache_none) flags |= NBDKIT_FLAG_FUA; #endif while (count > 0) { @@ -608,10 +668,9 @@ file_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset, if ((flags & NBDKIT_FLAG_FUA) && file_flush (handle, 0) == -1) return -1; -#if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_DONTNEED) - /* On Linux this will evict the pages we just wrote from the page cache. */ +#if EVICT_WRITES if (cache_mode == cache_none) - posix_fadvise (h->fd, orig_offset, orig_count, POSIX_FADV_DONTNEED); + evict_writes (h->fd, orig_offset, orig_count); #endif return 0; -- 2.32.0
Laszlo Ersek
2021-Dec-08 12:51 UTC
[Libguestfs] [PATCH nbdkit] file: Fix implementation of cache=none for writes
On 12/08/21 00:07, Richard W.M. Jones wrote:> When testing virt-v2v we found that cache=none had very pessimal > performance in its current implementation when writing. See: > > https://github.com/libguestfs/virt-v2v/commit/ac59d3b2310511b1537d408b675b19ec9a5d384e > > However we know of a much better implementation - the one in nbdcopy. > This commit copies that implementation (for writes only). > > A simple test is to do: > > $ ./nbdkit file out.img cache=none --run 'nbdcopy fedora-33.img $uri' > > and then check the cache usage of the output file, which should be > around 0% (using https://github.com/Feh/nocache): > > $ cachestats out.img > pages in cache: 409/1572864 (0.0%) [filesize=6291456.0K, pagesize=4K] > > For modular virt-v2v doing a local disk to local disk conversion: > > - before this change, without cache=none > virt-v2v took 93.7 seconds, 19.1% pages cached in output file > > - before this change, enabling cache=none > virt-v2v took 125.4 seconds, 0.0% pages cached in output file > ^^^ this is the bad case which caused the investigation > > - after this change, without cache=none > virt-v2v took 93.2 seconds, 19.1% pages cached in output file > > - after this change, enabling cache=none > virt-v2v took 97.9 seconds, 0.1% pages cached in output file > > I tried to adjust NR_WINDOWS to find an optimum. Increasing it made > no difference in performance but predictably caused a slight increase > in cached pages. Reducing it slowed performance slightly. So I > conclude that 8 is about right, but it probably depends on the > hardware. > --- > plugins/file/nbdkit-file-plugin.pod | 3 ++ > plugins/file/file.c | 79 +++++++++++++++++++++++++---- > 2 files changed, 72 insertions(+), 10 deletions(-) > > diff --git a/plugins/file/nbdkit-file-plugin.pod b/plugins/file/nbdkit-file-plugin.pod > index 0ac0ee53..f8f0e198 100644 > --- a/plugins/file/nbdkit-file-plugin.pod > +++ b/plugins/file/nbdkit-file-plugin.pod > @@ -117,6 +117,9 @@ cache: > > nbdkit file disk.img fadvise=sequential cache=none > > +Only use fadvise=sequential if reading, and the reads are mainly > +sequential. > + > =head2 Files on tmpfs > > If you want to expose a file that resides on a file system known to > diff --git a/plugins/file/file.c b/plugins/file/file.c > index 35270a24..caf24b2c 100644 > --- a/plugins/file/file.c > +++ b/plugins/file/file.c > @@ -85,6 +85,69 @@ static int fadvise_mode > /* cache mode */ > static enum { cache_default, cache_none } cache_mode = cache_default; > > +/* Define EVICT_WRITES if we are going to evict the page cache > + * (cache=none) after writing. This is only known to work on Linux. > + */ > +#ifdef __linux__ > +#define EVICT_WRITES 1 > +#endif > + > +#ifdef EVICT_WRITES > +/* Queue writes so they will be evicted from the cache. See > + * libnbd.git copy/file-ops.c for the rationale behind this. > + */ > +#define NR_WINDOWS 8 > + > +struct write_window { > + int fd; > + uint64_t offset; > + size_t len; > +}; > + > +static pthread_mutex_t window_lock = PTHREAD_MUTEX_INITIALIZER; > +static struct write_window window[NR_WINDOWS]; > + > +static void > +evict_writes (int fd, uint64_t offset, size_t len) > +{ > + ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&window_lock); > + > + /* Evict the oldest window from the page cache. */ > + if (window[0].len > 0) { > + sync_file_range (window[0].fd, window[0].offset, window[0].len, > + SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| > + SYNC_FILE_RANGE_WAIT_AFTER); > + posix_fadvise (window[0].fd, window[0].offset, window[0].len, > + POSIX_FADV_DONTNEED); > + }I didn't understand this sync_file_range() thing at all, so I looked up the original in libnbd, namely commit 64d50d994dd7 ("copy: Evict pages from the page cache when writing to local files.", 2021-03-02). That commit references: https://stackoverflow.com/a/3756466 which further references: http://lkml.iu.edu/hypermail/linux/kernel/1005.2/01845.html http://lkml.iu.edu/hypermail/linux/kernel/1005.2/01953.html I find it regrettable that we need sync_file_range() *at all*. All the behavior we desire here should be internal to posix_fadvise() with POSIX_FADV_DONTNEED. In particular the synchronous behavior of (WAIT_BEFORE | WRITE | WAIT_AFTER) is not something we actually want, in my opinion. What we want is for the data to disappear from the page cache as quickly as possible. This patch makes the tracking and flushing of the oldest writes the job of the userspace program, blocking userspace threads on a mutex while the mutex is held by a thread that's blocked in the sync_file_range() call above, inside the critical section. I think the kernel should be able to do the same tracking, when informed by POSIX_FADV_DONTNEED. That said, the implementation seems to match Linus's advice.> + > + /* Move the Nth window to N-1. */ > + memmove (&window[0], &window[1], sizeof window[0] * (NR_WINDOWS-1)); > + > + /* Set up the current window and tell Linux to start writing it out > + * to disk (asynchronously). > + */ > + sync_file_range (fd, offset, len, SYNC_FILE_RANGE_WRITE); > + window[NR_WINDOWS-1].fd = fd; > + window[NR_WINDOWS-1].offset = offset; > + window[NR_WINDOWS-1].len = len; > +} > + > +/* When we close the handle we must remove any windows which are still > + * associated. They missed the boat, oh well :-( > + */ > +static void > +remove_fd_from_window (int fd) > +{ > + ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&window_lock); > + size_t i; > + > + for (i = 0; i < NR_WINDOWS; ++i) > + if (window[i].len > 0 && window[i].fd == fd) > + window[i].len = 0; > +} > +#endif /* EVICT_WRITES */ > + > /* Any callbacks using lseek must be protected by this lock. */ > static pthread_mutex_t lseek_lock = PTHREAD_MUTEX_INITIALIZER; > > @@ -431,6 +494,9 @@ file_close (void *handle) > { > struct handle *h = handle; > > +#ifdef EVICT_WRITES > + remove_fd_from_window (h->fd); > +#endif > close (h->fd); > free (h); > } > @@ -583,15 +649,9 @@ file_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset, > { > struct handle *h = handle; > > -#if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_DONTNEED) > +#if EVICT_WRITES > uint32_t orig_count = count; > uint64_t orig_offset = offset; > - > - /* If cache=none we want to force pages we have just written to the > - * file to be flushed to disk so we can immediately evict them from > - * the page cache. > - */ > - if (cache_mode == cache_none) flags |= NBDKIT_FLAG_FUA; > #endif > > while (count > 0) { > @@ -608,10 +668,9 @@ file_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset, > if ((flags & NBDKIT_FLAG_FUA) && file_flush (handle, 0) == -1) > return -1; > > -#if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_DONTNEED) > - /* On Linux this will evict the pages we just wrote from the page cache. */ > +#if EVICT_WRITES > if (cache_mode == cache_none) > - posix_fadvise (h->fd, orig_offset, orig_count, POSIX_FADV_DONTNEED); > + evict_writes (h->fd, orig_offset, orig_count); > #endif > > return 0; >Acked-by: Laszlo Ersek <lersek at redhat.com> Thanks Laszlo