Nir Soffer
2018-Jul-29 12:04 UTC
[Libguestfs] [PATCH] file: Zero support for block devices and NFS 4.2
If we may not trim, we tried ZERO_RANGE, but this is not well supported yet, for example it is not available on NFS 4.2. ZERO_RANGE and PUNCH_HOLE are supported now on block devices, but not on RHRL 7, so we fallback to slow manual zeroing there. Change the logic to support block devices on RHEL 7, and file systems that do not support ZERO_RANGE. The new logic: - If we may trim, try PUNCH_HOLE - If we can zero range, Try ZERO_RANGE - If we can punch hole and fallocate, try fallocate(PUNCH_HOLE) followed by fallocate(0). - If underlying file is a block device, try ioctl(BLKZEROOUT) - Otherwise fallback to manual zeroing The handle keeps now the underlying file capabilities, so once we discover that an operation is not supported, we never try it again. Here are examples runs on a server based on Intel(R) Xeon(R) CPU E5-2630 v4 @ 2.20GHz, using XtremIO storage via 4G FC HBA and 4 paths to storage. $ export SOCK=/tmp/nbd.sock $ export BLOCK=/dev/e30bfac2-8e13-479d-8cd6-c6da5e306c4e/c9864222-bc52-4359-80d7-76e47d619b15 $ src/nbdkit -f plugins/file/.libs/nbdkit-file-plugin.so file=$BLOCK -U $SOCK $ time qemu-img convert -n -f raw -O raw /var/tmp/fedora-27.img nbd:unix:$SOCK real 0m2.741s user 0m0.224s sys 0m0.634s $ time qemu-img convert -n -f raw -O raw -W /var/tmp/fedora-27.img nbd:unix:$SOCK real 0m1.920s user 0m0.163s sys 0m0.735s Issues: - ioctl(BLKZEROOUT) will fail if offset or count are not aligned to logical sector size. I'm not sure if nbdkit or qemu-img ensure this. - Need testing with NFS --- plugins/file/file.c | 126 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 103 insertions(+), 23 deletions(-) diff --git a/plugins/file/file.c b/plugins/file/file.c index fb20622..bce2ed1 100644 --- a/plugins/file/file.c +++ b/plugins/file/file.c @@ -33,6 +33,7 @@ #include <config.h> +#include <stdbool.h> #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -42,6 +43,8 @@ #include <sys/stat.h> #include <errno.h> #include <linux/falloc.h> /* For FALLOC_FL_* on RHEL, glibc < 2.18 */ +#include <sys/ioctl.h> +#include <linux/fs.h> #include <nbdkit-plugin.h> @@ -116,6 +119,10 @@ file_config_complete (void) /* The per-connection handle. */ struct handle { int fd; + bool is_block_device; + bool can_punch_hole; + bool can_zero_range; + bool can_fallocate; }; /* Create the per-connection handle. */ @@ -123,6 +130,7 @@ static void * file_open (int readonly) { struct handle *h; + struct stat statbuf; int flags; h = malloc (sizeof *h); @@ -144,6 +152,23 @@ file_open (int readonly) return NULL; } + if (fstat (h->fd, &statbuf) == -1) { + nbdkit_error ("fstat: %s: %m", filename); + free (h); + return NULL; + } + + h->is_block_device = S_ISBLK(statbuf.st_mode); + + /* These flags will disabled if an operation is not supported. */ +#ifdef FALLOC_FL_PUNCH_HOLE + h->can_punch_hole = true; +#endif +#ifdef FALLOC_FL_ZERO_RANGE + h->can_zero_range = true; +#endif + h->can_fallocate = true; + return h; } @@ -164,27 +189,29 @@ static int64_t file_get_size (void *handle) { struct handle *h = handle; - struct stat statbuf; - if (fstat (h->fd, &statbuf) == -1) { - nbdkit_error ("stat: %m"); - return -1; - } - - if (S_ISBLK (statbuf.st_mode)) { + if (h->is_block_device) { + /* Block device, so st_size will not be the true size. */ off_t size; - /* Block device, so st_size will not be the true size. */ size = lseek (h->fd, 0, SEEK_END); if (size == -1) { nbdkit_error ("lseek (to find device size): %m"); return -1; } + return size; - } + } else { + /* Regular file. */ + struct stat statbuf; + + if (fstat (h->fd, &statbuf) == -1) { + nbdkit_error ("fstat: %m"); + return -1; + } - /* Else regular file. */ - return statbuf.st_size; + return statbuf.st_size; + } } static int @@ -250,33 +277,86 @@ file_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset) static int file_zero (void *handle, uint32_t count, uint64_t offset, int may_trim) { -#if defined(FALLOC_FL_PUNCH_HOLE) || defined(FALLOC_FL_ZERO_RANGE) struct handle *h = handle; -#endif int r = -1; #ifdef FALLOC_FL_PUNCH_HOLE - if (may_trim) { + /* If we can and may trim, punching hole is our best option. */ + if (h->can_punch_hole && may_trim) { r = do_fallocate (h->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, count); - if (r == -1 && errno != EOPNOTSUPP) { + if (r == 0) + return 0; + + if (errno != EOPNOTSUPP) { nbdkit_error ("zero: %m"); + return r; } - /* PUNCH_HOLE is older; if it is not supported, it is likely that - ZERO_RANGE will not work either, so fall back to write. */ - return r; + + h->can_punch_hole = false; } #endif #ifdef FALLOC_FL_ZERO_RANGE - r = do_fallocate (h->fd, FALLOC_FL_ZERO_RANGE, offset, count); - if (r == -1 && errno != EOPNOTSUPP) { - nbdkit_error ("zero: %m"); + /* ZERO_RANGE is not well supported yet, but it the next best option. */ + if (h->can_zero_range) { + r = do_fallocate (h->fd, FALLOC_FL_ZERO_RANGE, offset, count); + if (r == 0) + return 0; + + if (errno != EOPNOTSUPP) { + nbdkit_error ("zero: %m"); + return r; + } + + h->can_zero_range = false; } -#else +#endif + +#ifdef FALLOC_FL_PUNCH_HOLE + /* If we can punch hole but may not trim, we can combine punching hole and + fallocate to zero a range. This is much more efficient than writing zeros + manually. */ + if (h->can_punch_hole && h->can_fallocate) { + r = do_fallocate (h->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + offset, count); + if (r == 0) { + r = do_fallocate(h->fd, 0, offset, count); + if (r == 0) + return 0; + + if (errno != EOPNOTSUPP) { + nbdkit_error ("zero: %m"); + return r; + } + + h->can_fallocate = false; + } else { + if (errno != EOPNOTSUPP) { + nbdkit_error ("zero: %m"); + return r; + } + + h->can_punch_hole = false; + } + } +#endif + + /* For block devices, we can use BLKZEROOUT. + NOTE: count and offset must be aligned to logical block size. */ + if (h->is_block_device) { + uint64_t range[2] = {offset, count}; + + r = ioctl(h->fd, BLKZEROOUT, &range); + if (r == 0) + return 0; + + nbdkit_error("zero: %m"); + return r; + } + /* Trigger a fall back to writing */ errno = EOPNOTSUPP; -#endif return r; } -- 2.17.1
Eric Blake
2018-Jul-30 15:01 UTC
Re: [Libguestfs] [PATCH] file: Zero support for block devices and NFS 4.2
On 07/29/2018 07:04 AM, Nir Soffer wrote:> If we may not trim, we tried ZERO_RANGE, but this is not well supported > yet, for example it is not available on NFS 4.2. ZERO_RANGE and > PUNCH_HOLE are supported now on block devices, but not on RHRL 7, so we > fallback to slow manual zeroing there. > > Change the logic to support block devices on RHEL 7, and file systems > that do not support ZERO_RANGE. > > The new logic: > - If we may trim, try PUNCH_HOLE > - If we can zero range, Try ZERO_RANGE > - If we can punch hole and fallocate, try fallocate(PUNCH_HOLE) followed > by fallocate(0). > - If underlying file is a block device, try ioctl(BLKZEROOUT) > - Otherwise fallback to manual zeroing > > The handle keeps now the underlying file capabilities, so once we > discover that an operation is not supported, we never try it again. >> > Issues: > - ioctl(BLKZEROOUT) will fail if offset or count are not aligned to > logical sector size. I'm not sure if nbdkit or qemu-img ensure this.qemu-img tends to default to 512-byte alignment, but can be told to follow 4k alignment instead. nbdkit includes a filter that can force 4k alignment on top of any plugin, regardless of client alignment. Someday, I'd like to enhance nbdkit to support block size advertisement (qemu-img already knows how to honor such advertisements). It's on my todo queue, but lower in priority than getting incremental backups working in libvirt.> - Need testing with NFS > --- > plugins/file/file.c | 126 ++++++++++++++++++++++++++++++++++++-------- > 1 file changed, 103 insertions(+), 23 deletions(-)> +++ b/plugins/file/file.c > @@ -33,6 +33,7 @@ > > #include <config.h> > > +#include <stdbool.h> > #include <stdio.h> > #include <stdlib.h> > #include <string.h> > @@ -42,6 +43,8 @@ > #include <sys/stat.h> > #include <errno.h> > #include <linux/falloc.h> /* For FALLOC_FL_* on RHEL, glibc < 2.18 */ > +#include <sys/ioctl.h> > +#include <linux/fs.h>Does this need a configure-time probe to see if it exists, since it will break compilation on BSD systems? Same question to linux/falloc.h. Actually, linux/falloc.h doesn't see any use in the current nbdkit.git; does this email depend on another thread being applied first?> + > +#ifdef FALLOC_FL_PUNCH_HOLE > + /* If we can punch hole but may not trim, we can combine punching hole and > + fallocate to zero a range. This is much more efficient than writing zeros > + manually. */s/is/can be/ (it's two syscalls instead of one, and may not be as efficient as we'd like - but does indeed stand a chance of being more efficient than manual efforts)> + if (h->can_punch_hole && h->can_fallocate) { > + r = do_fallocate (h->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, > + offset, count); > + if (r == 0) { > + r = do_fallocate(h->fd, 0, offset, count); > + if (r == 0) > + return 0; > + > + if (errno != EOPNOTSUPP) { > + nbdkit_error ("zero: %m"); > + return r; > + } > + > + h->can_fallocate = false; > + } else { > + if (errno != EOPNOTSUPP) { > + nbdkit_error ("zero: %m"); > + return r; > + } > + > + h->can_punch_hole = false; > + } > + } > +#endif > + > + /* For block devices, we can use BLKZEROOUT. > + NOTE: count and offset must be aligned to logical block size. */ > + if (h->is_block_device) { > + uint64_t range[2] = {offset, count};Is it worth attempting the ioctl only when you have aligned values?> + > + r = ioctl(h->fd, BLKZEROOUT, &range);This portion of the code be conditional on whether BLKZEROOUT is defined.> + if (r == 0) > + return 0; > + > + nbdkit_error("zero: %m"); > + return r; > + } > + > /* Trigger a fall back to writing */ > errno = EOPNOTSUPP; > -#endif > > return r; > } >-- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3266 Virtualization: qemu.org | libvirt.org
Nir Soffer
2018-Jul-30 16:50 UTC
Re: [Libguestfs] [PATCH] file: Zero support for block devices and NFS 4.2
On Mon, Jul 30, 2018 at 6:01 PM Eric Blake <eblake@redhat.com> wrote: ...> > @@ -42,6 +43,8 @@ > > #include <sys/stat.h> > > #include <errno.h> > > #include <linux/falloc.h> /* For FALLOC_FL_* on RHEL, glibc < 2.18 */ > > +#include <sys/ioctl.h> > > +#include <linux/fs.h> > > Does this need a configure-time probe to see if it exists, since it will > break compilation on BSD systems? Same question to linux/falloc.h. > Actually, linux/falloc.h doesn't see any use in the current nbdkit.git; > does this email depend on another thread being applied first? >Yes, this depends on https://www.redhat.com/archives/libguestfs/2018-July/msg00083.html I plan to protect both imports with #if defined(__linux__). Any reason to use configure instead?> + > > +#ifdef FALLOC_FL_PUNCH_HOLE > > + /* If we can punch hole but may not trim, we can combine punching > hole and > > + fallocate to zero a range. This is much more efficient than > writing zeros > > + manually. */ > > s/is/can be/ (it's two syscalls instead of one, and may not be as > efficient as we'd like - but does indeed stand a chance of being more > efficient than manual efforts) >"can be" is better, but I really mean "is typically", or "is expected to be". For example in imageio same change improved upload throughout by 450% with my poor NFS server, see: https://gerrit.ovirt.org/c/92871/11//COMMIT_MSG> > + if (h->can_punch_hole && h->can_fallocate) { > > + r = do_fallocate (h->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, > > + offset, count); > > + if (r == 0) { > > + r = do_fallocate(h->fd, 0, offset, count); > > + if (r == 0) > > + return 0; > > + > > + if (errno != EOPNOTSUPP) { > > + nbdkit_error ("zero: %m"); > > + return r; > > + } > > + > > + h->can_fallocate = false; > > + } else { > > + if (errno != EOPNOTSUPP) { > > + nbdkit_error ("zero: %m"); > > + return r; > > + } > > + > > + h->can_punch_hole = false; > > + } > > + } > > +#endif > > + > > + /* For block devices, we can use BLKZEROOUT. > > + NOTE: count and offset must be aligned to logical block size. */ > > + if (h->is_block_device) { > > + uint64_t range[2] = {offset, count}; > > Is it worth attempting the ioctl only when you have aligned values? >I think it does, but this requires getting the logical sector size and keeping it in the handle. Looking in plugin_zero, if we find that the offset and count are not aligned and return EOPNOTSUPP, we will fallback to manual zeroing for this call. But this worries me: 549 int can_zero = 1; /* TODO cache this per-connection? */ Once can_zero is cached per connection, failing once because of single unaligned call will prevent efficient zero for the rest of the image.> + > > + r = ioctl(h->fd, BLKZEROOUT, &range); > > This portion of the code be conditional on whether BLKZEROOUT is defined. >Right. ... Nir
Richard W.M. Jones
2018-Jul-31 11:41 UTC
Re: [Libguestfs] [PATCH] file: Zero support for block devices and NFS 4.2
On Mon, Jul 30, 2018 at 10:01:47AM -0500, Eric Blake wrote:> On 07/29/2018 07:04 AM, Nir Soffer wrote: > >If we may not trim, we tried ZERO_RANGE, but this is not well supported > >yet, for example it is not available on NFS 4.2. ZERO_RANGE and > >PUNCH_HOLE are supported now on block devices, but not on RHRL 7, so we > >fallback to slow manual zeroing there. > > > >Change the logic to support block devices on RHEL 7, and file systems > >that do not support ZERO_RANGE. > > > >The new logic: > >- If we may trim, try PUNCH_HOLE > >- If we can zero range, Try ZERO_RANGE > >- If we can punch hole and fallocate, try fallocate(PUNCH_HOLE) followed > > by fallocate(0). > >- If underlying file is a block device, try ioctl(BLKZEROOUT) > >- Otherwise fallback to manual zeroing > > > >The handle keeps now the underlying file capabilities, so once we > >discover that an operation is not supported, we never try it again. > > > > > > >Issues: > >- ioctl(BLKZEROOUT) will fail if offset or count are not aligned to > > logical sector size. I'm not sure if nbdkit or qemu-img ensure this. > > qemu-img tends to default to 512-byte alignment, but can be told to > follow 4k alignment instead. nbdkit includes a filter that can force > 4k alignment on top of any plugin, regardless of client alignment. > > Someday, I'd like to enhance nbdkit to support block size > advertisement (qemu-img already knows how to honor such > advertisements). It's on my todo queue, but lower in priority than > getting incremental backups working in libvirt.The VDDK plugin actually requires 512 byte alignment. If a client issues < 512 byte aligned requests it returns an error :-( Rich. -- Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones Read my programming and virtualization blog: http://rwmj.wordpress.com virt-p2v converts physical machines to virtual machines. Boot with a live CD or over the network (PXE) and turn machines into KVM guests. http://libguestfs.org/virt-v2v
Possibly Parallel Threads
- [PATCH] file: Zero support for block devices and NFS 4.2
- [PATCH] file: Zero support for block devices and NFS 4.2
- [PATCH v2 1/4] file: Avoid unsupported fallocate() calls
- [PATCH v3 1/4] file: Avoid unsupported fallocate() calls
- [PATCH v4 1/4] file: Avoid unsupported fallocate() calls