Richard W.M. Jones
2020-Apr-09 05:58 UTC
[Libguestfs] [PATCH nbdkit PRELIMINARY] file: Move file operators to a new fileops mini-library
There's a lot of code in nbdkit-file-plugin which it would be nice to reuse elsewhere. One possible approach (as outlined here) is simply to move the file callbacks (like file.pread, file.pwrite, file.zero etc.) to a new mini-library. They can then be consumed by other plugins fairly easily by doing:

  static void *
  foo_open (int readonly)
  {
    struct fileops *fops;
    int fd, flags;

    /* Allocate *fops */
    /* Set up fd however you want */

    if (init_fileops (fd, fops) == -1) {
      free (fops);
      return NULL;
    }

    return fops;
  }

  static struct nbdkit_plugin plugin = {
    .name = "foo",
    .open = foo_open,
    .close = foo_close,
    FILEOPS_CALLBACKS
  };

If we did this then it would only work for plugins which need to serve exactly one whole file or block device. In fact the only plugins we could possibly use it for are iso and tmpdisk. We can't use it for plugins like split, partitioning, linuxdisk because those don't only serve a single file. Instead they all either serve multiple files or have some extra structure (like a virtual partition table) around a single file.

Another approach (not explored yet) might be to expose more fundamental primitives like "zeroing part of a file efficiently".

I think the best option is possibly a mixed approach: We add fileops, but also we expose fundamental primitives (in libutils). Simpler plugins would use fileops, others would have a more complex implementation using the fundamental primitives.

Rich.
Richard W.M. Jones
2020-Apr-09 05:58 UTC
[Libguestfs] [PATCH nbdkit PRELIMINARY] file: Move file operators to a new common/fileops mini-library.
Writing "file-like" plugins is hard because you have to implement your own .zero, .trim, .extents, etc, and that is very complicated. However implementations of these functions already exist in the file plugin. By factoring out the file plugin into a separate "fileops" mini-library we can reuse these implementations in other plugins. This refactoring commit creates a new mini-library called fileops, and uses it to implement the file plugin. Note that the name or prefix "file" leaks into fileops in a few places: the debug option is still called ‘-D file.zero=1’ and the nbdkit --dump-plugin output will still contain ‘file_blksszget’ etc. However I think that is fine as it should make usage more consistent across future file-like plugins. --- configure.ac | 1 + Makefile.am | 1 + common/fileops/Makefile.am | 45 +++ plugins/file/Makefile.am | 2 + common/fileops/fileops.h | 141 ++++++++++ common/fileops/fileops.c | 547 +++++++++++++++++++++++++++++++++++++ plugins/file/file.c | 540 ++---------------------------------- 7 files changed, 755 insertions(+), 522 deletions(-) diff --git a/configure.ac b/configure.ac index c5db962c..7a859bca 100644 --- a/configure.ac +++ b/configure.ac @@ -981,6 +981,7 @@ AC_CONFIG_FILES([common/protocol/generate-protostrings.sh], AC_CONFIG_FILES([Makefile bash/Makefile common/bitmap/Makefile + common/fileops/Makefile common/gpt/Makefile common/include/Makefile common/protocol/Makefile diff --git a/Makefile.am b/Makefile.am index ec8ae05d..6c6e5053 100644 --- a/Makefile.am +++ b/Makefile.am @@ -81,6 +81,7 @@ SUBDIRS = \ if HAVE_PLUGINS SUBDIRS += \ common/bitmap \ + common/fileops \ common/gpt \ common/regions \ common/sparse \ diff --git a/common/fileops/Makefile.am b/common/fileops/Makefile.am new file mode 100644 index 00000000..1fce2d81 --- /dev/null +++ b/common/fileops/Makefile.am @@ -0,0 +1,45 @@ +# nbdkit +# Copyright (C) 2020 Red Hat Inc. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the name of Red Hat nor the names of its contributors may be +# used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF +# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. 
+ +include $(top_srcdir)/common-rules.mk + +noinst_LTLIBRARIES = libfileops.la + +libfileops_la_SOURCES = \ + fileops.c \ + fileops.h \ + $(NULL) +libfileops_la_CPPFLAGS = \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/common/include \ + -I$(top_srcdir)/common/utils \ + $(NULL) +libfileops_la_CFLAGS = $(WARNINGS_CFLAGS) diff --git a/plugins/file/Makefile.am b/plugins/file/Makefile.am index 7fc40cf4..f3bd86c0 100644 --- a/plugins/file/Makefile.am +++ b/plugins/file/Makefile.am @@ -42,6 +42,7 @@ nbdkit_file_plugin_la_SOURCES = \ nbdkit_file_plugin_la_CPPFLAGS = \ -I$(top_srcdir)/include \ + -I$(top_srcdir)/common/fileops \ -I$(top_srcdir)/common/include \ -I$(top_srcdir)/common/utils \ $(NULL) @@ -51,6 +52,7 @@ nbdkit_file_plugin_la_LDFLAGS = \ -Wl,--version-script=$(top_srcdir)/plugins/plugins.syms \ $(NULL) nbdkit_file_plugin_la_LIBADD = \ + $(top_builddir)/common/fileops/libfileops.la \ $(top_builddir)/common/utils/libutils.la \ $(NULL) diff --git a/common/fileops/fileops.h b/common/fileops/fileops.h new file mode 100644 index 00000000..1aecf785 --- /dev/null +++ b/common/fileops/fileops.h @@ -0,0 +1,141 @@ +/* nbdkit + * Copyright (C) 2020 Red Hat Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the name of Red Hat nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* This mini-library helps when writing plugins which are like + * nbdkit-file-plugin. It is also used to implement + * nbdkit-file-plugin itself. What this means is if your plugin + * (after perhaps some custom setup) serves a local file or local + * block device then you can implement most of the data serving part + * of the plugin using these generic fileops. The advantage is that + * this mini-library deals with the complexity of implementing + * callbacks such as .zero and .trim efficiently. + * + * To use it, your plugin per-connection handle must either have type + * ‘struct fileops *’, or include ‘struct fileops’ as the first + * member. Then add FILEOPS_CALLBACKS to your ‘struct nbdkit_plugin’. + * The mini-library is linked statically into the plugin. + * + * Your plugin must define and use NBDKIT_API_VERSION == 2. This + * library is safe to use from PARALLEL plugins (in fact, that is the + * recommended thread model). + * + * See plugins/file/file.c for an example. + */ + +#ifndef NBDKIT_FILEOPS_H +#define NBDKIT_FILEOPS_H + +#include <config.h> + +#include <stdbool.h> + +/* For SEEK_HOLE. 
*/ +#include <unistd.h> +#include <sys/types.h> + +/* Either use this as the per-connection handle, or it must appear as + * the first member of the per-connection handle. Don’t access these + * fields directly from your plugin. + */ +struct fileops { + int fd; + int sector_size; + bool is_block_device; + bool can_punch_hole; + bool can_zero_range; + bool can_fallocate; + bool can_zeroout; +}; + +/* Initialize the fileops struct. ‘fd’ is a file descriptor opened on + * the local file or block device that you want to serve. Call this + * from your .open callback after allocating the handle and setting up + * the file descriptor. + */ +extern int init_fileops (int fd, struct fileops *fops); + +/* Close the file descriptor and perform any other cleanup (but it + * doesn’t free the struct or handle). Use this in your .close + * callback. + */ +extern void close_fileops (struct fileops *fops); + +/* You may optionally define a .dump_plugin callback which calls this. */ +extern void fileops_dump_plugin (void); + +/* Use the remaining callbacks by adding FILEOPS_CALLBACKS to your + * ‘struct nbdkit_plugin’. + */ +#define FILEOPS_CALLBACKS \ + .get_size = fileops_get_size, \ + .can_trim = fileops_can_trim, \ + .can_fua = fileops_can_fua, \ + .can_cache = fileops_can_cache, \ + .pread = fileops_pread, \ + .pwrite = fileops_pwrite, \ + .flush = fileops_flush, \ + .trim = fileops_trim, \ + .zero = fileops_zero, \ + .errno_is_preserved = 1, \ + FILEOPS_MAYBE_EXTENTS \ + FILEOPS_MAYBE_CACHE + +#ifdef SEEK_HOLE +#define FILEOPS_MAYBE_EXTENTS \ + .can_extents = fileops_can_extents, \ + .extents = fileops_extents, +#else +#define FILEOPS_MAYBE_EXTENTS /* nothing */ +#endif + +#ifdef HAVE_POSIX_FADVISE +#define FILEOPS_MAYBE_CACHE \ + .cache = fileops_cache, +#else +#define FILEOPS_MAYBE_CACHE /* nothing */ +#endif + +/* Don’t call these directly. 
*/ +extern int64_t fileops_get_size (void *handle); +extern int fileops_can_trim (void *handle); +extern int fileops_can_fua (void *handle); +extern int fileops_can_cache (void *handle); +extern int fileops_flush (void *handle, uint32_t flags); +extern int fileops_pread (void *handle, void *buf, uint32_t count, uint64_t offset, uint32_t flags); +extern int fileops_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset, uint32_t flags); +extern int fileops_zero (void *handle, uint32_t count, uint64_t offset, uint32_t flags); +extern int fileops_trim (void *handle, uint32_t count, uint64_t offset, uint32_t flags); +extern int fileops_can_extents (void *handle); +extern int fileops_extents (void *handle, uint32_t count, uint64_t offset, uint32_t flags, struct nbdkit_extents *extents); +extern int fileops_cache (void *handle, uint32_t count, uint64_t offset, uint32_t flags); + +#endif /* NBDKIT_FILEOPS_H */ diff --git a/common/fileops/fileops.c b/common/fileops/fileops.c new file mode 100644 index 00000000..6eb22aab --- /dev/null +++ b/common/fileops/fileops.c @@ -0,0 +1,547 @@ +/* nbdkit + * Copyright (C) 2020 Red Hat Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * * Neither the name of Red Hat nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <stdint.h> +#include <inttypes.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/ioctl.h> + +#include <pthread.h> + +#if defined (__linux__) && !defined (FALLOC_FL_PUNCH_HOLE) +#include <linux/falloc.h> /* For FALLOC_FL_*, glibc < 2.18 */ +#endif + +#if defined (__linux__) +#include <linux/fs.h> /* For BLKZEROOUT */ +#endif + +#define NBDKIT_API_VERSION 2 +#include <nbdkit-plugin.h> + +#include "cleanup.h" +#include "fileops.h" +#include "isaligned.h" + +#ifndef HAVE_FDATASYNC +#define fdatasync fsync +#endif + +/* Any callbacks using lseek must be protected by this lock. */ +static pthread_mutex_t lseek_lock = PTHREAD_MUTEX_INITIALIZER; + +/* To enable: -D file.zero=1 */ +int file_debug_zero; + +static bool +is_enotsup (int err) +{ + return err == ENOTSUP || err == EOPNOTSUPP; +} + +/* Print some extra information about how the plugin was compiled. 
*/ +void +fileops_dump_plugin (void) +{ +#ifdef BLKSSZGET + printf ("file_blksszget=yes\n"); +#endif +#ifdef BLKZEROOUT + printf ("file_blkzeroout=yes\n"); +#endif +#ifdef FALLOC_FL_PUNCH_HOLE + printf ("file_falloc_fl_punch_hole=yes\n"); +#endif +#ifdef FALLOC_FL_ZERO_RANGE + printf ("file_falloc_fl_zero_range=yes\n"); +#endif +} + +int +init_fileops (int fd, struct fileops *fops) +{ + struct stat statbuf; + + if (fstat (fd, &statbuf) == -1) { + nbdkit_error ("fstat: %m"); + return -1; + } + + fops->fd = fd; + fops->is_block_device = S_ISBLK (statbuf.st_mode); + fops->sector_size = 4096; /* Start with safe guess */ + +#ifdef BLKSSZGET + if (fops->is_block_device) { + if (ioctl (fops->fd, BLKSSZGET, &fops->sector_size)) + nbdkit_debug ("cannot get sector size: %m"); + } +#endif + +#ifdef FALLOC_FL_PUNCH_HOLE + fops->can_punch_hole = true; +#else + fops->can_punch_hole = false; +#endif + +#ifdef FALLOC_FL_ZERO_RANGE + fops->can_zero_range = true; +#else + fops->can_zero_range = false; +#endif + + fops->can_fallocate = true; + fops->can_zeroout = fops->is_block_device; + + return 0; +} + +void +close_fileops (struct fileops *fops) +{ + close (fops->fd); +} + +/* For block devices, stat->st_size is not the true size. The caller + * grabs the lseek_lock. + */ +static int64_t +block_device_size (int fd) +{ + off_t size; + + size = lseek (fd, 0, SEEK_END); + if (size == -1) { + nbdkit_error ("lseek (to find device size): %m"); + return -1; + } + + return size; +} + +/* Get the file size. */ +int64_t +fileops_get_size (void *handle) +{ + struct fileops *fops = handle; + + if (fops->is_block_device) { + ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lseek_lock); + return block_device_size (fops->fd); + } else { + /* Regular file. 
*/ + struct stat statbuf; + + if (fstat (fops->fd, &statbuf) == -1) { + nbdkit_error ("fstat: %m"); + return -1; + } + + return statbuf.st_size; + } +} + +int +fileops_can_trim (void *handle) +{ + /* Trim is advisory, but we prefer to advertise it only when we can + * actually (attempt to) punch holes. Since not all filesystems + * support all fallocate modes, it would be nice if we had a way + * from fpathconf() to definitively learn what will work on a given + * fd for a more precise answer; oh well. */ +#ifdef FALLOC_FL_PUNCH_HOLE + return 1; +#else + return 0; +#endif +} + +int +fileops_can_fua (void *handle) +{ + return NBDKIT_FUA_NATIVE; +} + +int +fileops_can_cache (void *handle) +{ + /* Prefer posix_fadvise(), but letting nbdkit call .pread on our + * behalf also tends to work well for the local file system + * cache. + */ +#if HAVE_POSIX_FADVISE + return NBDKIT_FUA_NATIVE; +#else + return NBDKIT_FUA_EMULATE; +#endif +} + +/* Flush the file to disk. */ +int +fileops_flush (void *handle, uint32_t flags) +{ + struct fileops *fops = handle; + + if (fdatasync (fops->fd) == -1) { + nbdkit_error ("fdatasync: %m"); + return -1; + } + + return 0; +} + +/* Read data from the file. */ +int +fileops_pread (void *handle, void *buf, uint32_t count, uint64_t offset, + uint32_t flags) +{ + struct fileops *fops = handle; + + while (count > 0) { + ssize_t r = pread (fops->fd, buf, count, offset); + if (r == -1) { + nbdkit_error ("pread: %m"); + return -1; + } + if (r == 0) { + nbdkit_error ("pread: unexpected end of file"); + return -1; + } + buf += r; + count -= r; + offset += r; + } + + return 0; +} + +/* Write data to the file. 
*/ +int +fileops_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset, + uint32_t flags) +{ + struct fileops *fops = handle; + + while (count > 0) { + ssize_t r = pwrite (fops->fd, buf, count, offset); + if (r == -1) { + nbdkit_error ("pwrite: %m"); + return -1; + } + buf += r; + count -= r; + offset += r; + } + + if ((flags & NBDKIT_FLAG_FUA) && fileops_flush (handle, 0) == -1) + return -1; + + return 0; +} + +#if defined (FALLOC_FL_PUNCH_HOLE) || defined (FALLOC_FL_ZERO_RANGE) +static int +do_fallocate (int fd, int mode, off_t offset, off_t len) +{ + int r = fallocate (fd, mode, offset, len); + if (r == -1 && errno == ENODEV) { + /* kernel 3.10 fails with ENODEV for block device. Kernel >= 4.9 fails + with EOPNOTSUPP in this case. Normalize errno to simplify callers. */ + errno = EOPNOTSUPP; + } + return r; +} +#endif + +/* Write zeroes to the file. */ +int +fileops_zero (void *handle, uint32_t count, uint64_t offset, uint32_t flags) +{ + struct fileops *fops = handle; + int r; + +#ifdef FALLOC_FL_PUNCH_HOLE + if (fops->can_punch_hole && (flags & NBDKIT_FLAG_MAY_TRIM)) { + r = do_fallocate (fops->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + offset, count); + if (r == 0) { + if (file_debug_zero) + nbdkit_debug ("fops->can_punch_hole && may_trim: " + "zero succeeded using fallocate"); + goto out; + } + + if (!is_enotsup (errno)) { + nbdkit_error ("zero: %m"); + return -1; + } + + fops->can_punch_hole = false; + } +#endif + +#ifdef FALLOC_FL_ZERO_RANGE + if (fops->can_zero_range) { + r = do_fallocate (fops->fd, FALLOC_FL_ZERO_RANGE, offset, count); + if (r == 0) { + if (file_debug_zero) + nbdkit_debug ("fops->can_zero-range: " + "zero succeeded using fallocate"); + goto out; + } + + if (!is_enotsup (errno)) { + nbdkit_error ("zero: %m"); + return -1; + } + + fops->can_zero_range = false; + } +#endif + +#ifdef FALLOC_FL_PUNCH_HOLE + /* If we can punch hole but may not trim, we can combine punching hole and + * fallocate to zero a range. 
This is expected to be more efficient than + * writing zeroes manually. */ + if (fops->can_punch_hole && fops->can_fallocate) { + r = do_fallocate (fops->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + offset, count); + if (r == 0) { + r = do_fallocate (fops->fd, 0, offset, count); + if (r == 0) { + if (file_debug_zero) + nbdkit_debug ("fops->can_punch_hole && fops->can_fallocate: " + "zero succeeded using fallocate"); + goto out; + } + + if (!is_enotsup (errno)) { + nbdkit_error ("zero: %m"); + return -1; + } + + fops->can_fallocate = false; + } else { + if (!is_enotsup (errno)) { + nbdkit_error ("zero: %m"); + return -1; + } + + fops->can_punch_hole = false; + } + } +#endif + +#ifdef BLKZEROOUT + /* For aligned range and block device, we can use BLKZEROOUT. */ + if (fops->can_zeroout && IS_ALIGNED (offset | count, fops->sector_size)) { + uint64_t range[2] = {offset, count}; + + r = ioctl (fops->fd, BLKZEROOUT, &range); + if (r == 0) { + if (file_debug_zero) + nbdkit_debug ("fops->can_zeroout && IS_ALIGNED: " + "zero succeeded using BLKZEROOUT"); + goto out; + } + + if (errno != ENOTTY) { + nbdkit_error ("zero: %m"); + return -1; + } + + fops->can_zeroout = false; + } +#endif + + /* Trigger a fall back to writing */ + if (file_debug_zero) + nbdkit_debug ("zero falling back to writing"); + errno = EOPNOTSUPP; + return -1; + + out: + if ((flags & NBDKIT_FLAG_FUA) && fileops_flush (handle, 0) == -1) + return -1; + return 0; +} + +/* Punch a hole in the file. */ +int +fileops_trim (void *handle, uint32_t count, uint64_t offset, uint32_t flags) +{ +#ifdef FALLOC_FL_PUNCH_HOLE + struct fileops *fops = handle; + int r; + + if (fops->can_punch_hole) { + r = do_fallocate (fops->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + offset, count); + if (r == -1) { + /* Trim is advisory; we don't care if it fails for anything other + * than EIO or EPERM. 
*/ + if (errno == EPERM || errno == EIO) { + nbdkit_error ("fallocate: %m"); + return -1; + } + + if (is_enotsup (EOPNOTSUPP)) + fops->can_punch_hole = false; + + nbdkit_debug ("ignoring failed fallocate during trim: %m"); + } + } +#endif + + if ((flags & NBDKIT_FLAG_FUA) && fileops_flush (handle, 0) == -1) + return -1; + + return 0; +} + +#ifdef SEEK_HOLE +/* Extents. */ + +int +fileops_can_extents (void *handle) +{ + struct fileops *fops = handle; + off_t r; + + /* A simple test to see whether SEEK_HOLE etc is likely to work on + * the current filesystem. + */ + ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lseek_lock); + r = lseek (fops->fd, 0, SEEK_HOLE); + if (r == -1) { + nbdkit_debug ("extents disabled: lseek: SEEK_HOLE: %m"); + return 0; + } + return 1; +} + +static int +do_extents (void *handle, uint32_t count, uint64_t offset, + uint32_t flags, struct nbdkit_extents *extents) +{ + struct fileops *fops = handle; + const bool req_one = flags & NBDKIT_FLAG_REQ_ONE; + uint64_t end = offset + count; + + do { + off_t pos; + + pos = lseek (fops->fd, offset, SEEK_DATA); + if (pos == -1) { + if (errno == ENXIO) { + /* The current man page does not describe this situation well, + * but a proposed change to POSIX adds these words for ENXIO: + * "or the whence argument is SEEK_DATA and the offset falls + * within the final hole of the file." + */ + pos = end; + } + else { + nbdkit_error ("lseek: SEEK_DATA: %" PRIu64 ": %m", offset); + return -1; + } + } + + /* We know there is a hole from offset to pos-1. */ + if (pos > offset) { + if (nbdkit_add_extent (extents, offset, pos - offset, + NBDKIT_EXTENT_HOLE | NBDKIT_EXTENT_ZERO) == -1) + return -1; + if (req_one) + break; + } + + offset = pos; + if (offset >= end) + break; + + pos = lseek (fops->fd, offset, SEEK_HOLE); + if (pos == -1) { + nbdkit_error ("lseek: SEEK_HOLE: %" PRIu64 ": %m", offset); + return -1; + } + + /* We know there is data from offset to pos-1. 
*/ + if (pos > offset) { + if (nbdkit_add_extent (extents, offset, pos - offset, + 0 /* allocated data */) == -1) + return -1; + if (req_one) + break; + } + + offset = pos; + } while (offset < end); + + return 0; +} + +int +fileops_extents (void *handle, uint32_t count, uint64_t offset, + uint32_t flags, struct nbdkit_extents *extents) +{ + ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lseek_lock); + return do_extents (handle, count, offset, flags, extents); +} +#endif /* SEEK_HOLE */ + +#if HAVE_POSIX_FADVISE +/* Caching. */ +int +fileops_cache (void *handle, uint32_t count, uint64_t offset, uint32_t flags) +{ + struct fileops *fops = handle; + int r; + + /* Cache is advisory, we don't care if this fails */ + r = posix_fadvise (fops->fd, offset, count, POSIX_FADV_WILLNEED); + if (r) { + errno = r; + nbdkit_error ("posix_fadvise: %m"); + return -1; + } + return 0; +} +#endif /* HAVE_POSIX_FADVISE */ diff --git a/plugins/file/file.c b/plugins/file/file.c index 8c2ea077..b3446fc3 100644 --- a/plugins/file/file.c +++ b/plugins/file/file.c @@ -34,51 +34,18 @@ #include <stdio.h> #include <stdlib.h> -#include <stdbool.h> +#include <stdint.h> #include <string.h> -#include <inttypes.h> #include <fcntl.h> -#include <unistd.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <sys/ioctl.h> #include <errno.h> -#include <pthread.h> - -#if defined (__linux__) && !defined (FALLOC_FL_PUNCH_HOLE) -#include <linux/falloc.h> /* For FALLOC_FL_*, glibc < 2.18 */ -#endif - -#if defined (__linux__) -#include <linux/fs.h> /* For BLKZEROOUT */ -#endif - #define NBDKIT_API_VERSION 2 - #include <nbdkit-plugin.h> -#include "cleanup.h" -#include "isaligned.h" - -#ifndef HAVE_FDATASYNC -#define fdatasync fsync -#endif +#include "fileops.h" static char *filename = NULL; -/* Any callbacks using lseek must be protected by this lock. 
*/ -static pthread_mutex_t lseek_lock = PTHREAD_MUTEX_INITIALIZER; - -/* to enable: -D file.zero=1 */ -int file_debug_zero; - -static bool -is_enotsup (int err) -{ - return err == ENOTSUP || err == EOPNOTSUPP; -} - static void file_unload (void) { @@ -131,41 +98,18 @@ file_config_complete (void) static void file_dump_plugin (void) { -#ifdef BLKSSZGET - printf ("file_blksszget=yes\n"); -#endif -#ifdef BLKZEROOUT - printf ("file_blkzeroout=yes\n"); -#endif -#ifdef FALLOC_FL_PUNCH_HOLE - printf ("file_falloc_fl_punch_hole=yes\n"); -#endif -#ifdef FALLOC_FL_ZERO_RANGE - printf ("file_falloc_fl_zero_range=yes\n"); -#endif + fileops_dump_plugin (); } -/* The per-connection handle. */ -struct handle { - int fd; - bool is_block_device; - int sector_size; - bool can_punch_hole; - bool can_zero_range; - bool can_fallocate; - bool can_zeroout; -}; - /* Create the per-connection handle. */ static void * file_open (int readonly) { - struct handle *h; - struct stat statbuf; - int flags; + struct fileops *fops; + int fd, flags; - h = malloc (sizeof *h); - if (h == NULL) { + fops = malloc (sizeof *fops); + if (fops == NULL) { nbdkit_error ("malloc: %m"); return NULL; } @@ -176,98 +120,33 @@ file_open (int readonly) else flags |= O_RDWR; - h->fd = open (filename, flags); - if (h->fd == -1) { + fd = open (filename, flags); + if (fd == -1) { nbdkit_error ("open: %s: %m", filename); - free (h); + free (fops); return NULL; } - if (fstat (h->fd, &statbuf) == -1) { - nbdkit_error ("fstat: %s: %m", filename); - free (h); + if (init_fileops (fd, fops) == -1) { + free (fops); return NULL; } - h->is_block_device = S_ISBLK (statbuf.st_mode); - h->sector_size = 4096; /* Start with safe guess */ - -#ifdef BLKSSZGET - if (h->is_block_device) { - if (ioctl (h->fd, BLKSSZGET, &h->sector_size)) - nbdkit_debug ("cannot get sector size: %s: %m", filename); - } -#endif - -#ifdef FALLOC_FL_PUNCH_HOLE - h->can_punch_hole = true; -#else - h->can_punch_hole = false; -#endif - -#ifdef FALLOC_FL_ZERO_RANGE 
- h->can_zero_range = true; -#else - h->can_zero_range = false; -#endif - - h->can_fallocate = true; - h->can_zeroout = h->is_block_device; - - return h; + return fops; } /* Free up the per-connection handle. */ static void file_close (void *handle) { - struct handle *h = handle; + struct fileops *fops = handle; - close (h->fd); - free (h); + close_fileops (fops); + free (fops); } #define THREAD_MODEL NBDKIT_THREAD_MODEL_PARALLEL -/* For block devices, stat->st_size is not the true size. The caller - * grabs the lseek_lock. - */ -static int64_t -block_device_size (int fd) -{ - off_t size; - - size = lseek (fd, 0, SEEK_END); - if (size == -1) { - nbdkit_error ("lseek (to find device size): %m"); - return -1; - } - - return size; -} - -/* Get the file size. */ -static int64_t -file_get_size (void *handle) -{ - struct handle *h = handle; - - if (h->is_block_device) { - ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lseek_lock); - return block_device_size (h->fd); - } else { - /* Regular file. */ - struct stat statbuf; - - if (fstat (h->fd, &statbuf) == -1) { - nbdkit_error ("fstat: %m"); - return -1; - } - - return statbuf.st_size; - } -} - /* Allow multiple parallel connections from a single client. */ static int file_can_multi_conn (void *handle) @@ -275,374 +154,6 @@ file_can_multi_conn (void *handle) return 1; } -static int -file_can_trim (void *handle) -{ - /* Trim is advisory, but we prefer to advertise it only when we can - * actually (attempt to) punch holes. Since not all filesystems - * support all fallocate modes, it would be nice if we had a way - * from fpathconf() to definitively learn what will work on a given - * fd for a more precise answer; oh well. 
*/ -#ifdef FALLOC_FL_PUNCH_HOLE - return 1; -#else - return 0; -#endif -} - -static int -file_can_fua (void *handle) -{ - return NBDKIT_FUA_NATIVE; -} - -static int -file_can_cache (void *handle) -{ - /* Prefer posix_fadvise(), but letting nbdkit call .pread on our - * behalf also tends to work well for the local file system - * cache. - */ -#if HAVE_POSIX_FADVISE - return NBDKIT_FUA_NATIVE; -#else - return NBDKIT_FUA_EMULATE; -#endif -} - -/* Flush the file to disk. */ -static int -file_flush (void *handle, uint32_t flags) -{ - struct handle *h = handle; - - if (fdatasync (h->fd) == -1) { - nbdkit_error ("fdatasync: %m"); - return -1; - } - - return 0; -} - -/* Read data from the file. */ -static int -file_pread (void *handle, void *buf, uint32_t count, uint64_t offset, - uint32_t flags) -{ - struct handle *h = handle; - - while (count > 0) { - ssize_t r = pread (h->fd, buf, count, offset); - if (r == -1) { - nbdkit_error ("pread: %m"); - return -1; - } - if (r == 0) { - nbdkit_error ("pread: unexpected end of file"); - return -1; - } - buf += r; - count -= r; - offset += r; - } - - return 0; -} - -/* Write data to the file. */ -static int -file_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset, - uint32_t flags) -{ - struct handle *h = handle; - - while (count > 0) { - ssize_t r = pwrite (h->fd, buf, count, offset); - if (r == -1) { - nbdkit_error ("pwrite: %m"); - return -1; - } - buf += r; - count -= r; - offset += r; - } - - if ((flags & NBDKIT_FLAG_FUA) && file_flush (handle, 0) == -1) - return -1; - - return 0; -} - -#if defined (FALLOC_FL_PUNCH_HOLE) || defined (FALLOC_FL_ZERO_RANGE) -static int -do_fallocate (int fd, int mode, off_t offset, off_t len) -{ - int r = fallocate (fd, mode, offset, len); - if (r == -1 && errno == ENODEV) { - /* kernel 3.10 fails with ENODEV for block device. Kernel >= 4.9 fails - with EOPNOTSUPP in this case. Normalize errno to simplify callers. 
*/ - errno = EOPNOTSUPP; - } - return r; -} -#endif - -/* Write zeroes to the file. */ -static int -file_zero (void *handle, uint32_t count, uint64_t offset, uint32_t flags) -{ - struct handle *h = handle; - int r; - -#ifdef FALLOC_FL_PUNCH_HOLE - if (h->can_punch_hole && (flags & NBDKIT_FLAG_MAY_TRIM)) { - r = do_fallocate (h->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, - offset, count); - if (r == 0) { - if (file_debug_zero) - nbdkit_debug ("h->can_punch_hole && may_trim: " - "zero succeeded using fallocate"); - goto out; - } - - if (!is_enotsup (errno)) { - nbdkit_error ("zero: %m"); - return -1; - } - - h->can_punch_hole = false; - } -#endif - -#ifdef FALLOC_FL_ZERO_RANGE - if (h->can_zero_range) { - r = do_fallocate (h->fd, FALLOC_FL_ZERO_RANGE, offset, count); - if (r == 0) { - if (file_debug_zero) - nbdkit_debug ("h->can_zero-range: " - "zero succeeded using fallocate"); - goto out; - } - - if (!is_enotsup (errno)) { - nbdkit_error ("zero: %m"); - return -1; - } - - h->can_zero_range = false; - } -#endif - -#ifdef FALLOC_FL_PUNCH_HOLE - /* If we can punch hole but may not trim, we can combine punching hole and - * fallocate to zero a range. This is expected to be more efficient than - * writing zeroes manually. */ - if (h->can_punch_hole && h->can_fallocate) { - r = do_fallocate (h->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, - offset, count); - if (r == 0) { - r = do_fallocate (h->fd, 0, offset, count); - if (r == 0) { - if (file_debug_zero) - nbdkit_debug ("h->can_punch_hole && h->can_fallocate: " - "zero succeeded using fallocate"); - goto out; - } - - if (!is_enotsup (errno)) { - nbdkit_error ("zero: %m"); - return -1; - } - - h->can_fallocate = false; - } else { - if (!is_enotsup (errno)) { - nbdkit_error ("zero: %m"); - return -1; - } - - h->can_punch_hole = false; - } - } -#endif - -#ifdef BLKZEROOUT - /* For aligned range and block device, we can use BLKZEROOUT. 
*/ - if (h->can_zeroout && IS_ALIGNED (offset | count, h->sector_size)) { - uint64_t range[2] = {offset, count}; - - r = ioctl (h->fd, BLKZEROOUT, &range); - if (r == 0) { - if (file_debug_zero) - nbdkit_debug ("h->can_zeroout && IS_ALIGNED: " - "zero succeeded using BLKZEROOUT"); - goto out; - } - - if (errno != ENOTTY) { - nbdkit_error ("zero: %m"); - return -1; - } - - h->can_zeroout = false; - } -#endif - - /* Trigger a fall back to writing */ - if (file_debug_zero) - nbdkit_debug ("zero falling back to writing"); - errno = EOPNOTSUPP; - return -1; - - out: - if ((flags & NBDKIT_FLAG_FUA) && file_flush (handle, 0) == -1) - return -1; - return 0; -} - -/* Punch a hole in the file. */ -static int -file_trim (void *handle, uint32_t count, uint64_t offset, uint32_t flags) -{ -#ifdef FALLOC_FL_PUNCH_HOLE - struct handle *h = handle; - int r; - - if (h->can_punch_hole) { - r = do_fallocate (h->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, - offset, count); - if (r == -1) { - /* Trim is advisory; we don't care if it fails for anything other - * than EIO or EPERM. */ - if (errno == EPERM || errno == EIO) { - nbdkit_error ("fallocate: %m"); - return -1; - } - - if (is_enotsup (EOPNOTSUPP)) - h->can_punch_hole = false; - - nbdkit_debug ("ignoring failed fallocate during trim: %m"); - } - } -#endif - - if ((flags & NBDKIT_FLAG_FUA) && file_flush (handle, 0) == -1) - return -1; - - return 0; -} - -#ifdef SEEK_HOLE -/* Extents. */ - -static int -file_can_extents (void *handle) -{ - struct handle *h = handle; - off_t r; - - /* A simple test to see whether SEEK_HOLE etc is likely to work on - * the current filesystem. 
- */ - ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lseek_lock); - r = lseek (h->fd, 0, SEEK_HOLE); - if (r == -1) { - nbdkit_debug ("extents disabled: lseek: SEEK_HOLE: %m"); - return 0; - } - return 1; -} - -static int -do_extents (void *handle, uint32_t count, uint64_t offset, - uint32_t flags, struct nbdkit_extents *extents) -{ - struct handle *h = handle; - const bool req_one = flags & NBDKIT_FLAG_REQ_ONE; - uint64_t end = offset + count; - - do { - off_t pos; - - pos = lseek (h->fd, offset, SEEK_DATA); - if (pos == -1) { - if (errno == ENXIO) { - /* The current man page does not describe this situation well, - * but a proposed change to POSIX adds these words for ENXIO: - * "or the whence argument is SEEK_DATA and the offset falls - * within the final hole of the file." - */ - pos = end; - } - else { - nbdkit_error ("lseek: SEEK_DATA: %" PRIu64 ": %m", offset); - return -1; - } - } - - /* We know there is a hole from offset to pos-1. */ - if (pos > offset) { - if (nbdkit_add_extent (extents, offset, pos - offset, - NBDKIT_EXTENT_HOLE | NBDKIT_EXTENT_ZERO) == -1) - return -1; - if (req_one) - break; - } - - offset = pos; - if (offset >= end) - break; - - pos = lseek (h->fd, offset, SEEK_HOLE); - if (pos == -1) { - nbdkit_error ("lseek: SEEK_HOLE: %" PRIu64 ": %m", offset); - return -1; - } - - /* We know there is data from offset to pos-1. */ - if (pos > offset) { - if (nbdkit_add_extent (extents, offset, pos - offset, - 0 /* allocated data */) == -1) - return -1; - if (req_one) - break; - } - - offset = pos; - } while (offset < end); - - return 0; -} - -static int -file_extents (void *handle, uint32_t count, uint64_t offset, - uint32_t flags, struct nbdkit_extents *extents) -{ - ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lseek_lock); - return do_extents (handle, count, offset, flags, extents); -} -#endif /* SEEK_HOLE */ - -#if HAVE_POSIX_FADVISE -/* Caching. 
*/ -static int -file_cache (void *handle, uint32_t count, uint64_t offset, uint32_t flags) -{ - struct handle *h = handle; - int r; - - /* Cache is advisory, we don't care if this fails */ - r = posix_fadvise (h->fd, offset, count, POSIX_FADV_WILLNEED); - if (r) { - errno = r; - nbdkit_error ("posix_fadvise: %m"); - return -1; - } - return 0; -} -#endif /* HAVE_POSIX_FADVISE */ - static struct nbdkit_plugin plugin = { .name = "file", .longname = "nbdkit file plugin", @@ -655,24 +166,9 @@ static struct nbdkit_plugin plugin = { .dump_plugin = file_dump_plugin, .open = file_open, .close = file_close, - .get_size = file_get_size, .can_multi_conn = file_can_multi_conn, - .can_trim = file_can_trim, - .can_fua = file_can_fua, - .can_cache = file_can_cache, - .pread = file_pread, - .pwrite = file_pwrite, - .flush = file_flush, - .trim = file_trim, - .zero = file_zero, -#ifdef SEEK_HOLE - .can_extents = file_can_extents, - .extents = file_extents, -#endif -#if HAVE_POSIX_FADVISE - .cache = file_cache, -#endif - .errno_is_preserved = 1, + + FILEOPS_CALLBACKS }; NBDKIT_REGISTER_PLUGIN(plugin) -- 2.25.0
Apparently Analogous Threads
- [PATCH nbdkit v2 0/3] Implement fileops.
- [PATCH v2 0/4] file: Zero for block devices and older file systems
- [PATCH v3 0/4] file: Zero for block devices and older file systems
- [PATCH v3 0/4] file: Zero for block devices and older file systems
- [PATCH 0/3] file: Zero for block devices and older file systems