Richard W.M. Jones
2021-Feb-19 17:56 UTC
[Libguestfs] [PATCH libnbd] copy: Set POSIX_FADV_SEQUENTIAL and POSIX_FADV_DONTNEED
I'm trying to emulate what we do in nbdkit-file-plugin to avoid the page cache getting trashed when copying to/from local files: https://gitlab.com/nbdkit/nbdkit/-/commit/aa5a2183a6d16afd919174b5ba8222b2bccf4039 This patch works well when reading from a local file as you can see from the results in the commit message. However when writing to a local file it has absolutely no effect, and I've no idea why. (Even checked with strace and it's making the correct fadvise64 system calls). For the reading case it has a slight performance impact which could be problematic. Should we only enable it with a flag? I would prefer to fix it so it's always beneficial, but it's not too easy to do that without adding more threads and synchronization. Rich.
Richard W.M. Jones
2021-Feb-19 17:56 UTC
[Libguestfs] [PATCH libnbd] copy: Set POSIX_FADV_SEQUENTIAL and POSIX_FADV_DONTNEED.
Set POSIX_FADV_SEQUENTIAL flag on the file when copying to or from a local file. This tells the operating system that we intend to read or write sequentially and to optimize for that case. Avoid polluting the page cache when copying to or from a local file or block device by calling POSIX_FADV_DONTNEED. This is similar to what we do in nbdkit-file-plugin: https://gitlab.com/nbdkit/nbdkit/-/commit/aa5a2183a6d16afd919174b5ba8222b2bccf4039 Before this change: $ cachedel /var/tmp/random $ cachestats /var/tmp/random pages in cache: 0/8388608 (0.0%) [filesize=33554432.0K, pagesize=4K] $ free -m; time ./run nbdcopy /var/tmp/random null: ; free -m ; cachestats /var/tmp/random total used free shared buff/cache available Mem: 32080 1069 14432 3 16578 30550 Swap: 16135 86 16049 real 0m21.484s user 0m2.275s sys 0m25.837s total used free shared buff/cache available Mem: 32080 1041 297 3 30741 30581 Swap: 16135 86 16049 pages in cache: 4971577/8388608 (59.3%) [filesize=33554432.0K, pagesize=4K] After this change: $ cachedel /var/tmp/random $ free -m; time ./run nbdcopy /var/tmp/random null: ; free -m ; cachestats /var/tmp/random total used free shared buff/cache available Mem: 32080 1071 14431 3 16578 30548 Swap: 16135 86 16049 real 0m22.423s user 0m2.180s sys 0m31.059s total used free shared buff/cache available Mem: 32080 1053 14238 3 16789 30566 Swap: 16135 86 16049 pages in cache: 53888/8388608 (0.6%) [filesize=33554432.0K, pagesize=4K] --- configure.ac | 3 +++ copy/file-ops.c | 17 +++++++++++++++-- copy/main.c | 11 +++++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index 58a7724..b36ae09 100644 --- a/configure.ac +++ b/configure.ac @@ -79,6 +79,9 @@ AC_CHECK_HEADERS([\ AC_CHECK_HEADERS([linux/vm_sockets.h], [], [], [#include <sys/socket.h>]) +dnl posix_fadvise helps to optimise linear reads and writes (optional). +AC_CHECK_FUNCS([posix_fadvise]) + dnl Check for strerrordesc_np (optional, glibc only). dnl Prefer this over sys_errlist. dnl https://lists.fedoraproject.org/archives/list/glibc at lists.fedoraproject.org/thread/WJHGG2OO7ABNAYICGA5WQZ2Q34Q2FEHU/ diff --git a/copy/file-ops.c b/copy/file-ops.c index 2a239d0..1310f08 100644 --- a/copy/file-ops.c +++ b/copy/file-ops.c @@ -60,11 +60,12 @@ static size_t file_synch_read (struct rw *rw, void *data, size_t len, uint64_t offset) { + int fd = rw->u.local.fd; size_t n = 0; ssize_t r; while (len > 0) { - r = pread (rw->u.local.fd, data, len, offset); + r = pread (fd, data, len, offset); if (r == -1) { perror (rw->name); exit (EXIT_FAILURE); @@ -72,6 +73,11 @@ file_synch_read (struct rw *rw, if (r == 0) return n; +#if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_DONTNEED) + /* On Linux this will evict the pages we just read from the page cache. */ + posix_fadvise (fd, offset, r, POSIX_FADV_DONTNEED); +#endif + data += r; offset += r; len -= r; @@ -85,14 +91,21 @@ static void file_synch_write (struct rw *rw, const void *data, size_t len, uint64_t offset) { + int fd = rw->u.local.fd; ssize_t r; while (len > 0) { - r = pwrite (rw->u.local.fd, data, len, offset); + r = pwrite (fd, data, len, offset); if (r == -1) { perror (rw->name); exit (EXIT_FAILURE); } + +#if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_DONTNEED) + /* On Linux this will evict the pages we just wrote from the page cache. */ + posix_fadvise (fd, offset, r, POSIX_FADV_DONTNEED); +#endif + data += r; offset += r; len -= r; diff --git a/copy/main.c b/copy/main.c index 68a6030..502ad28 100644 --- a/copy/main.c +++ b/copy/main.c @@ -460,6 +460,15 @@ open_null (struct rw *rw) rw->size = INT64_MAX; } +/* Set the POSIX_FADV_SEQUENTIAL flag on a file descriptor, but don't fail. */ +static inline void +fadvise_sequential (int fd) +{ +#if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_SEQUENTIAL) + posix_fadvise (fd, 0, 0, POSIX_FADV_SEQUENTIAL); +#endif +} + /* Open a local (non-NBD) file, ie. a file, device, or "-" for stdio. * Returns the open file descriptor which the caller must close. * @@ -524,12 +533,14 @@ open_local (const char *prog, if (ioctl (fd, BLKSSZGET, &rw->u.local.sector_size)) fprintf (stderr, "warning: cannot get sector size: %s: %m", rw->name); #endif + fadvise_sequential (fd); } else if (S_ISREG (rw->u.local.stat.st_mode)) { /* Regular file. */ rw->ops = &file_ops; rw->size = rw->u.local.stat.st_size; rw->u.local.seek_hole_supported = seek_hole_supported (fd); + fadvise_sequential (fd); } else { /* Probably stdin/stdout, a pipe or a socket. Set size == -1 -- 2.29.0.rc2