Richard W.M. Jones
2021-Feb-25 17:34 UTC
[Libguestfs] [PATCH libnbd v3 0/3] copy: Preserve the host page cache.
This is a combination and evolution of two previous series: https://listman.redhat.com/archives/libguestfs/2021-February/msg00091.html https://listman.redhat.com/archives/libguestfs/2021-February/msg00094.html I made a few enhancements: - Separate macros for the platforms where we enhance reading (#ifdef PAGE_CACHE_MAPPING), vs writing (#ifdef EVICT_WRITES). This was mainly about renaming stuff. - When writing we now use 8 windows instead of 2 [both per thread]. This reduces the overhead from about 25% to about 10%. It's possible to reduce it further by using more windows, but the trade-off is that more cruft is added to the page cache while the program runs. - This patch series is much better tested than before, including combining it with TLS, and testing it from the nbdkit test suite. Rich.
Richard W.M. Jones
2021-Feb-25 17:34 UTC
[Libguestfs] [PATCH libnbd v3 1/3] copy: Set POSIX_FADV_SEQUENTIAL flag on files.
On Linux this doubles the readahead. Even though we are not strictly speaking going to read or write the file sequentially (only mostly sequentially) it may provide some minor benefit. --- configure.ac | 3 +++ copy/file-ops.c | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/configure.ac b/configure.ac index fea43c8..6cf563a 100644 --- a/configure.ac +++ b/configure.ac @@ -98,6 +98,9 @@ AC_CHECK_HEADERS([\ AC_CHECK_HEADERS([linux/vm_sockets.h], [], [], [#include <sys/socket.h>]) +dnl posix_fadvise helps to optimise linear reads and writes (optional). +AC_CHECK_FUNCS([posix_fadvise]) + dnl Check for strerrordesc_np (optional, glibc only). dnl Prefer this over sys_errlist. dnl https://lists.fedoraproject.org/archives/list/glibc at lists.fedoraproject.org/thread/WJHGG2OO7ABNAYICGA5WQZ2Q34Q2FEHU/ diff --git a/copy/file-ops.c b/copy/file-ops.c index 1d7e6a6..73cbdcb 100644 --- a/copy/file-ops.c +++ b/copy/file-ops.c @@ -116,6 +116,13 @@ file_create (const char *name, int fd, off_t st_size, bool is_block) rwf->can_fallocate = true; } + /* Set the POSIX_FADV_SEQUENTIAL flag on the file descriptor, but + * don't fail. + */ +#if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_SEQUENTIAL) + posix_fadvise (fd, 0, 0, POSIX_FADV_SEQUENTIAL); +#endif + return &rwf->rw; } -- 2.29.0.rc2
Richard W.M. Jones
2021-Feb-25 17:34 UTC
[Libguestfs] [PATCH libnbd v3 2/3] copy: Preserve the host page cache when reading from local files.
When reading from a local file we can take advantage of the page cache (i.e. not having to read the file from disk if a copy is present in memory), while at the same time not disturbing the state of the page cache. Disturbing the page cache can have bad consequences for other processes running on the host, since they will have their working set evicted, so it is something we should generally avoid. This requires Linux APIs, using the technique described here: https://insights.oetiker.ch/linux/fadvise/ This change only affects reads, since doing the same for writes is even more complicated. You can see the effect using the tools from https://github.com/Feh/nocache Before this change: $ cachestats /var/tmp/random pages in cache: 315648/8388608 (3.8%) [filesize=33554432.0K, pagesize=4K] $ ./run time nbdcopy /var/tmp/random null: 1.80user 23.70system 0:20.69elapsed 123%CPU (0avgtext+0avgdata 135084maxresident)k 64585232inputs+0outputs (6major+8391948minor)pagefaults 0swaps $ cachestats /var/tmp/random pages in cache: 7734356/8388608 (92.2%) [filesize=33554432.0K, pagesize=4K] Notice that a large part of the file has been loaded into the page cache after the run. After this change: $ cachestats /var/tmp/random pages in cache: 315008/8388608 (3.8%) [filesize=33554432.0K, pagesize=4K] $ time ./run nbdcopy /var/tmp/random null: real 0m21.602s user 0m1.944s sys 0m29.013s $ cachestats /var/tmp/random pages in cache: 315648/8388608 (3.8%) [filesize=33554432.0K, pagesize=4K] Notice there is only a small increase in the amount of the file which is cached, the elapsed time is about the same, but there is an increase in system time (presumably the overhead of POSIX_FADV_DONTNEED). 
--- copy/file-ops.c | 130 +++++++++++++++++++++++++++++++++++++++++++++++- copy/main.c | 2 +- copy/nbdcopy.h | 2 +- 3 files changed, 131 insertions(+), 3 deletions(-) diff --git a/copy/file-ops.c b/copy/file-ops.c index 73cbdcb..888a388 100644 --- a/copy/file-ops.c +++ b/copy/file-ops.c @@ -24,8 +24,10 @@ #include <fcntl.h> #include <unistd.h> #include <errno.h> +#include <limits.h> #include <sys/ioctl.h> #include <sys/types.h> +#include <sys/mman.h> #include <pthread.h> @@ -34,8 +36,27 @@ #endif #include "isaligned.h" +#include "rounding.h" + #include "nbdcopy.h" +/* If we are going to attempt page cache mapping which tries not to + * disturb the page cache when reading a file. Only do this on Linux + * systems where we understand how the page cache behaves. Since we + * need to mmap the whole file, also restrict this to 64 bit systems. + */ +#ifdef __linux__ +#ifdef __SIZEOF_POINTER__ +#if __SIZEOF_POINTER__ == 8 +#define PAGE_CACHE_MAPPING 1 +#endif +#endif +#endif + +#ifdef PAGE_CACHE_MAPPING +DEFINE_VECTOR_TYPE (byte_vector, uint8_t) +#endif + static struct rw_ops file_ops; struct rw_file { @@ -50,8 +71,96 @@ struct rw_file { * the working method. */ bool can_punch_hole, can_zero_range, can_fallocate, can_zeroout; + +#ifdef PAGE_CACHE_MAPPING + byte_vector cached_pages; +#endif }; +#ifdef PAGE_CACHE_MAPPING +static long page_size; + +static void page_size_init (void) __attribute__((constructor)); +static void +page_size_init (void) +{ + page_size = sysconf (_SC_PAGE_SIZE); +} + +/* Load the page cache map for a particular file into + * rwf->cached_pages. Only used when reading files. This doesn't + * fail: if a system call fails then rwf->cached_pages.size will be + * zero which is handled in page_was_cached. 
+ */ +static inline void +page_cache_map (struct rw_file *rwf, int fd, int64_t size) +{ + void *ptr; + + if (size == 0) return; + + ptr = mmap (NULL, size, PROT_READ, MAP_PRIVATE, fd, 0); + if (ptr == (void *)-1) return; + + const size_t veclen = ROUND_UP (size, page_size) / page_size; + + if (byte_vector_reserve (&rwf->cached_pages, veclen) == -1) + goto err; + if (mincore (ptr, size, rwf->cached_pages.ptr) == -1) + goto err; + + rwf->cached_pages.size = veclen; + err: + munmap (ptr, size); +} + +/* Test if a single page of the file was cached before nbdcopy ran. */ +static inline bool +page_was_cached (struct rw_file *rwf, uint64_t offset) +{ + uint64_t page = offset / page_size; + if (page < rwf->cached_pages.size) + return (rwf->cached_pages.ptr[page] & 1) != 0; + else + /* This path is taken if we didn't manage to map the input file + * for any reason. In this case assume that pages were mapped so + * we will not evict them: essentially fall back to doing nothing. + */ + return true; +} + +/* Evict file contents from the page cache if they were not present in + * the page cache before. + */ +static inline void +page_cache_evict (struct rw_file *rwf, uint64_t orig_offset, size_t orig_len) +{ + uint64_t offset, n; + size_t len; + + if (rwf->cached_pages.size == 0) return; + + /* Only bother with whole pages. */ + offset = ROUND_UP (orig_offset, page_size); + len = orig_len - (offset - orig_offset); + len = ROUND_DOWN (len, page_size); + + while (len > 0) { + n = page_size; + if (! page_was_cached (rwf, offset)) { + /* Try to evict runs of pages in one go. */ + while (len-n > 0 && ! 
page_was_cached (rwf, offset+n)) + n += page_size; + + posix_fadvise (rwf->fd, offset, n, POSIX_FADV_DONTNEED); + } + + offset += n; + len -= n; + } +} +#endif + static bool seek_hole_supported (int fd) { @@ -64,7 +173,8 @@ seek_hole_supported (int fd) } struct rw * -file_create (const char *name, int fd, off_t st_size, bool is_block) +file_create (const char *name, int fd, + off_t st_size, bool is_block, direction d) { struct rw_file *rwf = calloc (1, sizeof *rwf); if (rwf == NULL) { perror ("calloc"); exit (EXIT_FAILURE); } @@ -123,6 +233,11 @@ file_create (const char *name, int fd, off_t st_size, bool is_block) posix_fadvise (fd, 0, 0, POSIX_FADV_SEQUENTIAL); #endif +#if PAGE_CACHE_MAPPING + if (d == READING) + page_cache_map (rwf, fd, rwf->rw.size); +#endif + return &rwf->rw; } @@ -135,6 +250,11 @@ file_close (struct rw *rw) fprintf (stderr, "%s: close: %m\n", rw->name); exit (EXIT_FAILURE); } + +#ifdef PAGE_CACHE_MAPPING + byte_vector_reset (&rwf->cached_pages); +#endif + free (rw); } @@ -211,6 +331,10 @@ file_synch_read (struct rw *rw, void *data, size_t len, uint64_t offset) { struct rw_file *rwf = (struct rw_file *)rw; +#ifdef PAGE_CACHE_MAPPING + const uint64_t orig_offset = offset; + const size_t orig_len = len; +#endif size_t n = 0; ssize_t r; @@ -229,6 +353,10 @@ file_synch_read (struct rw *rw, n += r; } +#if PAGE_CACHE_MAPPING + page_cache_evict (rwf, orig_offset, orig_len); +#endif + return n; } diff --git a/copy/main.c b/copy/main.c index 3c574df..55c2b53 100644 --- a/copy/main.c +++ b/copy/main.c @@ -461,7 +461,7 @@ open_local (const char *filename, direction d) exit (EXIT_FAILURE); } if (S_ISBLK (stat.st_mode) || S_ISREG (stat.st_mode)) - return file_create (filename, fd, stat.st_size, S_ISBLK (stat.st_mode)); + return file_create (filename, fd, stat.st_size, S_ISBLK (stat.st_mode), d); else { /* Probably stdin/stdout, a pipe or a socket. */ synchronous = true; /* Force synchronous mode for pipes. 
*/ diff --git a/copy/nbdcopy.h b/copy/nbdcopy.h index 4496722..e4c3d4e 100644 --- a/copy/nbdcopy.h +++ b/copy/nbdcopy.h @@ -52,7 +52,7 @@ typedef enum { READING, WRITING } direction; /* Create subtypes. */ extern struct rw *file_create (const char *name, int fd, - off_t st_size, bool is_block); + off_t st_size, bool is_block, direction d); extern struct rw *nbd_rw_create_uri (const char *name, const char *uri, direction d); extern struct rw *nbd_rw_create_subprocess (const char **argv, size_t argc, -- 2.29.0.rc2
Richard W.M. Jones
2021-Feb-25 17:34 UTC
[Libguestfs] [PATCH libnbd v3 3/3] copy: Evict pages from the page cache when writing to local files.
When writing to a file or block device, we are always writing new (ie. previously uncached) data. This commit ensures that very little of that data will be in the page cache after nbdcopy finishes by evicting it as we go along. This ensures that the host page cache is largely unchanged for other host processes. This uses Linus's technique described here: https://stackoverflow.com/a/3756466 but instead of using 2 windows, it uses a configurable larger number of windows (in this case 8). Before this commit: $ rm /var/tmp/pattern ; sync ; time ./run nbdcopy [ nbdkit pattern 32G ] /var/tmp/pattern && cachestats /var/tmp/pattern real 0m34.852s user 0m18.368s sys 0m33.117s pages in cache: 7090389/8388608 (84.5%) [filesize=33554432.0K, pagesize=4K] Notice that the newly written file ends up in the cache, thus trashing the page cache on the host. After this commit: $ rm /var/tmp/pattern ; sync ; time ./run nbdcopy [ nbdkit pattern 32G ] /var/tmp/pattern && cachestats /var/tmp/pattern real 0m38.721s user 0m18.837s sys 0m40.654s pages in cache: 65536/8388608 (0.8%) [filesize=33554432.0K, pagesize=4K] The newly written file does not disturb the page cache. However there is about 11% slow down. --- copy/file-ops.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/copy/file-ops.c b/copy/file-ops.c index 888a388..a2860ea 100644 --- a/copy/file-ops.c +++ b/copy/file-ops.c @@ -21,6 +21,7 @@ #include <stdio.h> #include <stdlib.h> #include <stdbool.h> +#include <string.h> #include <fcntl.h> #include <unistd.h> #include <errno.h> @@ -53,6 +54,11 @@ #endif #endif +/* If we are going to evict the page cache after writing a new file. 
*/ +#ifdef __linux__ +#define EVICT_WRITES 1 +#endif + #ifdef PAGE_CACHE_MAPPING DEFINE_VECTOR_TYPE (byte_vector, uint8_t) #endif @@ -159,7 +165,60 @@ page_cache_evict (struct rw_file *rwf, uint64_t orig_offset, size_t orig_len) len -= n; } } -#endif +#endif /* PAGE_CACHE_MAPPING */ + +#ifdef EVICT_WRITES +/* Prepare to evict file contents from the page cache when writing. + * We cannot do this directly (as for reads above) because we have to + * wait for Linux to finish writing the pages to disk. Therefore the + * strategy is to (1) tell Linux to begin writing asynchronously and + * (2) evict the previous pages, which have hopefully been written + * already by the time we get here. We have to maintain window(s) per + * thread. + * + * For more information see https://stackoverflow.com/a/3756466 and + * the links to Linus's advice from that entry. + */ + +/* Increasing the number of windows gives better performance since + * writes are given more time to make it to disk before we have to + * pause to do the page cache eviction. But a larger number of + * windows means less success overall since (a) more page cache is + * used as the program runs, and (b) we don't evict any writes which + * are still pending when the program exits. + */ +#define NR_WINDOWS 8 + +struct write_window { + uint64_t offset; + size_t len; +}; + +static inline void +evict_writes (struct rw_file *rwf, uint64_t offset, size_t len) +{ + static __thread struct write_window window[NR_WINDOWS]; + + /* Evict the oldest window from the page cache. */ + if (window[0].len > 0) { + sync_file_range (rwf->fd, window[0].offset, window[0].len, + SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| + SYNC_FILE_RANGE_WAIT_AFTER); + posix_fadvise (rwf->fd, window[0].offset, window[0].len, + POSIX_FADV_DONTNEED); + } + + /* Move the Nth window to N-1. 
*/ + memmove (&window[0], &window[1], sizeof window[0] * (NR_WINDOWS-1)); + + /* Set up the current window and tell Linux to start writing it out + * to disk (asynchronously). + */ + sync_file_range (rwf->fd, offset, len, SYNC_FILE_RANGE_WRITE); + window[NR_WINDOWS-1].offset = offset; + window[NR_WINDOWS-1].len = len; +} +#endif /* EVICT_WRITES */ static bool seek_hole_supported (int fd) @@ -365,6 +424,10 @@ file_synch_write (struct rw *rw, const void *data, size_t len, uint64_t offset) { struct rw_file *rwf = (struct rw_file *)rw; +#ifdef PAGE_CACHE_MAPPING + const uint64_t orig_offset = offset; + const size_t orig_len = len; +#endif ssize_t r; while (len > 0) { @@ -377,6 +440,10 @@ file_synch_write (struct rw *rw, offset += r; len -= r; } + +#if EVICT_WRITES + evict_writes (rwf, orig_offset, orig_len); +#endif } static inline bool -- 2.29.0.rc2