Nir Soffer
2018-Aug-02  19:05 UTC
[Libguestfs] [PATCH 0/3] file: Zero for block devices and older file systems
This is the second version to support efficient zero for block devices on older kernels (e.g. RHEL 7.5), and file systems that do not support yet FALLOC_FS_ZERO_RANGE (e.g. NFS 4.2). Changes since v1: - Split to smaller patches - Skip linux only includes on other systems - Skip code using BLKZEROOUT if the macro is not defined - Try BLKZEROOUT only if the offset and count are aligned to device sector size. v1 was here: https://www.redhat.com/archives/libguestfs/2018-July/msg00084.html Nir Soffer (3): file: Avoid unsupported fallocate() calls file: Support zero without ZERO_RANGE file: Zero for block devices on old kernels plugins/file/file.c | 194 +++++++++++++++++++++++++++++++++----------- 1 file changed, 147 insertions(+), 47 deletions(-) -- 2.17.1
Nir Soffer
2018-Aug-02  19:05 UTC
[Libguestfs] [PATCH 1/3] file: Avoid unsupported fallocate() calls
When using file systems not supporting ZERO_RANGE (e.g. NFS 4.2) or
block device on kernel < 4.9, we used to call fallocate() for every
zero, fail with EOPNOTSUPP, and fallback to manual zeroing.  When
trimming, we used to try unsupported fallocate() on every call.
Change file handle to remember if punching holes or zeroing range are
supported, and avoid unsupported calls.
- can_trim changed to report the actual capability once we tried to
  punch a hole. I don't think this is useful yet.
- zero changed to:
  1. If we can punch hole and may trim, try PUNCH_HOLE
  2. If we can zero range, try ZERO_RANGE
  3. Fall back to manual writing
- trim changed to:
  1. If we can punch hole, try PUNCH_HOLE
  2. Succeed
---
 plugins/file/file.c | 96 ++++++++++++++++++++++++++++-----------------
 1 file changed, 59 insertions(+), 37 deletions(-)
diff --git a/plugins/file/file.c b/plugins/file/file.c
index 3bb4d17..8bb9a2d 100644
--- a/plugins/file/file.c
+++ b/plugins/file/file.c
@@ -33,6 +33,7 @@
 
 #include <config.h>
 
+#include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -118,6 +119,8 @@ file_config_complete (void)
 /* The per-connection handle. */
 struct handle {
   int fd;
+  bool can_punch_hole;
+  bool can_zero_range;
 };
 
 /* Create the per-connection handle. */
@@ -146,6 +149,18 @@ file_open (int readonly)
     return NULL;
   }
 
+#ifdef FALLOC_FL_PUNCH_HOLE
+  h->can_punch_hole = true;
+#else
+  h->can_punch_hole = false;
+#endif
+
+#ifdef FALLOC_FL_ZERO_RANGE
+  h->can_zero_range = true;
+#else
+  h->can_zero_range = false;
+#endif
+
   return h;
 }
 
@@ -189,19 +204,15 @@ file_get_size (void *handle)
   return statbuf.st_size;
 }
 
+/* Trim is advisory, but we prefer to advertise it only when we can actually
+ * (attempt to) punch holes. Before we tried to punch a hole, report true if
+ * FALLOC_FL_PUNCH_HOLE is defined before we did the first call. Once we tried
+ * to punch a hole, report the actual cappability of the underlying file. */
 static int
 file_can_trim (void *handle)
 {
-  /* Trim is advisory, but we prefer to advertise it only when we can
-   * actually (attempt to) punch holes.  Since not all filesystems
-   * support all fallocate modes, it would be nice if we had a way
-   * from fpathconf() to definitively learn what will work on a given
-   * fd for a more precise answer; oh well.  */
-#ifdef FALLOC_FL_PUNCH_HOLE
-  return 1;
-#else
-  return 0;
-#endif
+  struct handle *h = handle;
+  return h->can_punch_hole;
 }
 
 /* Read data from the file. */
@@ -252,34 +263,42 @@ file_pwrite (void *handle, const void *buf, uint32_t
count, uint64_t offset)
 static int
 file_zero (void *handle, uint32_t count, uint64_t offset, int may_trim)
 {
-#if defined(FALLOC_FL_PUNCH_HOLE) || defined(FALLOC_FL_ZERO_RANGE)
   struct handle *h = handle;
-#endif
   int r = -1;
 
 #ifdef FALLOC_FL_PUNCH_HOLE
-  if (may_trim) {
+  if (h->can_punch_hole && may_trim) {
     r = do_fallocate (h->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                       offset, count);
-    if (r == -1 && errno != EOPNOTSUPP) {
+    if (r == 0)
+      return r;
+
+    if (errno != EOPNOTSUPP) {
       nbdkit_error ("zero: %m");
+      return r;
     }
-    /* PUNCH_HOLE is older; if it is not supported, it is likely that
-       ZERO_RANGE will not work either, so fall back to write. */
-    return r;
+
+    h->can_punch_hole = false;
   }
 #endif
 
 #ifdef FALLOC_FL_ZERO_RANGE
-  r = do_fallocate (h->fd, FALLOC_FL_ZERO_RANGE, offset, count);
-  if (r == -1 && errno != EOPNOTSUPP) {
-    nbdkit_error ("zero: %m");
+  if (h->can_zero_range) {
+    r = do_fallocate (h->fd, FALLOC_FL_ZERO_RANGE, offset, count);
+    if (r== 0)
+      return r;
+
+    if (errno != EOPNOTSUPP) {
+      nbdkit_error ("zero: %m");
+      return r;
+    }
+
+    h->can_zero_range = false;
   }
-#else
-  /* Trigger a fall back to writing */
-  errno = EOPNOTSUPP;
 #endif
 
+  /* Trigger a fall back to writing */
+  errno = EOPNOTSUPP;
   return r;
 }
 
@@ -301,27 +320,30 @@ file_flush (void *handle)
 static int
 file_trim (void *handle, uint32_t count, uint64_t offset)
 {
-  int r = -1;
 #ifdef FALLOC_FL_PUNCH_HOLE
   struct handle *h = handle;
+  int r = -1;
+
+  if (h->can_punch_hole) {
+    r = do_fallocate (h->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+                      offset, count);
+    if (r == -1) {
+      /* Trim is advisory; we don't care if it fails for anything other
+       * than EIO or EPERM. */
+      if (errno == EPERM || errno == EIO) {
+        nbdkit_error ("fallocate: %m");
+        return r;
+      }
+
+      if (errno == EOPNOTSUPP)
+        h->can_punch_hole = false;
 
-  /* Trim is advisory; we don't care if it fails for anything other
-   * than EIO or EPERM. */
-  r = do_fallocate (h->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
-                    offset, count);
-  if (r < 0) {
-    if (errno != EPERM && errno != EIO) {
       nbdkit_debug ("ignoring failed fallocate during trim: %m");
-      r = 0;
     }
-    else
-      nbdkit_error ("fallocate: %m");
   }
-#else
-  /* Based on .can_trim, this should not be reached. */
-  errno = EOPNOTSUPP;
 #endif
-  return r;
+
+  return 0;
 }
 
 static struct nbdkit_plugin plugin = {
-- 
2.17.1
Nir Soffer
2018-Aug-02  19:05 UTC
[Libguestfs] [PATCH] file: Zero support for block devices and NFS 4.2
If we may not trim, we tried ZERO_RANGE, but this is not well supported
yet, for example it is not available on NFS 4.2. ZERO_RANGE and
PUNCH_HOLE are supported now on block devices, but not on RHRL 7, so we
fallback to slow manual zeroing there.
Change the logic to support block devices on RHEL 7, and file systems
that do not support ZERO_RANGE.
The new logic:
- If we may trim, try PUNCH_HOLE
- If we can zero range, Try ZERO_RANGE
- If we can punch hole and fallocate, try fallocate(PUNCH_HOLE) followed
  by fallocate(0).
- If underlying file is a block device, try ioctl(BLKZEROOUT)
- Otherwise fallback to manual zeroing
The handle keeps now the underlying file capabilities, so once we
discover that an operation is not supported, we never try it again.
Here are examples runs on a server based on Intel(R) Xeon(R) CPU E5-2630
v4 @ 2.20GHz, using XtremIO storage via 4G FC HBA and 4 paths to
storage.
$ export SOCK=/tmp/nbd.sock
$ export
BLOCK=/dev/e30bfac2-8e13-479d-8cd6-c6da5e306c4e/c9864222-bc52-4359-80d7-76e47d619b15
$ src/nbdkit -f plugins/file/.libs/nbdkit-file-plugin.so file=$BLOCK -U $SOCK
$ time qemu-img convert -n -f raw -O raw /var/tmp/fedora-27.img nbd:unix:$SOCK
real	0m2.741s
user	0m0.224s
sys	0m0.634s
$ time qemu-img convert -n -f raw -O raw -W /var/tmp/fedora-27.img
nbd:unix:$SOCK
real	0m1.920s
user	0m0.163s
sys	0m0.735s
Issues:
- ioctl(BLKZEROOUT) will fail if offset or count are not aligned to
  logical sector size. I'm not sure if nbdkit or qemu-img ensure this.
- Need testing with NFS
---
 plugins/file/file.c | 126 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 103 insertions(+), 23 deletions(-)
diff --git a/plugins/file/file.c b/plugins/file/file.c
index fb20622..bce2ed1 100644
--- a/plugins/file/file.c
+++ b/plugins/file/file.c
@@ -33,6 +33,7 @@
 
 #include <config.h>
 
+#include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -42,6 +43,8 @@
 #include <sys/stat.h>
 #include <errno.h>
 #include <linux/falloc.h>   /* For FALLOC_FL_* on RHEL, glibc < 2.18
*/
+#include <sys/ioctl.h>
+#include <linux/fs.h>
 
 #include <nbdkit-plugin.h>
 
@@ -116,6 +119,10 @@ file_config_complete (void)
 /* The per-connection handle. */
 struct handle {
   int fd;
+  bool is_block_device;
+  bool can_punch_hole;
+  bool can_zero_range;
+  bool can_fallocate;
 };
 
 /* Create the per-connection handle. */
@@ -123,6 +130,7 @@ static void *
 file_open (int readonly)
 {
   struct handle *h;
+  struct stat statbuf;
   int flags;
 
   h = malloc (sizeof *h);
@@ -144,6 +152,23 @@ file_open (int readonly)
     return NULL;
   }
 
+  if (fstat (h->fd, &statbuf) == -1) {
+    nbdkit_error ("fstat: %s: %m", filename);
+    free (h);
+    return NULL;
+  }
+
+  h->is_block_device = S_ISBLK(statbuf.st_mode);
+
+  /* These flags will disabled if an operation is not supported. */
+#ifdef FALLOC_FL_PUNCH_HOLE
+  h->can_punch_hole = true;
+#endif
+#ifdef FALLOC_FL_ZERO_RANGE
+  h->can_zero_range = true;
+#endif
+  h->can_fallocate = true;
+
   return h;
 }
 
@@ -164,27 +189,29 @@ static int64_t
 file_get_size (void *handle)
 {
   struct handle *h = handle;
-  struct stat statbuf;
 
-  if (fstat (h->fd, &statbuf) == -1) {
-    nbdkit_error ("stat: %m");
-    return -1;
-  }
-
-  if (S_ISBLK (statbuf.st_mode)) {
+  if (h->is_block_device) {
+    /* Block device, so st_size will not be the true size. */
     off_t size;
 
-    /* Block device, so st_size will not be the true size. */
     size = lseek (h->fd, 0, SEEK_END);
     if (size == -1) {
       nbdkit_error ("lseek (to find device size): %m");
       return -1;
     }
+
     return size;
-  }
+  } else {
+    /* Regular file. */
+    struct stat statbuf;
+
+    if (fstat (h->fd, &statbuf) == -1) {
+      nbdkit_error ("fstat: %m");
+      return -1;
+    }
 
-  /* Else regular file. */
-  return statbuf.st_size;
+    return statbuf.st_size;
+  }
 }
 
 static int
@@ -250,33 +277,86 @@ file_pwrite (void *handle, const void *buf, uint32_t
count, uint64_t offset)
 static int
 file_zero (void *handle, uint32_t count, uint64_t offset, int may_trim)
 {
-#if defined(FALLOC_FL_PUNCH_HOLE) || defined(FALLOC_FL_ZERO_RANGE)
   struct handle *h = handle;
-#endif
   int r = -1;
 
 #ifdef FALLOC_FL_PUNCH_HOLE
-  if (may_trim) {
+  /* If we can and may trim, punching hole is our best option. */
+  if (h->can_punch_hole && may_trim) {
     r = do_fallocate (h->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                       offset, count);
-    if (r == -1 && errno != EOPNOTSUPP) {
+    if (r == 0)
+        return 0;
+
+    if (errno != EOPNOTSUPP) {
       nbdkit_error ("zero: %m");
+      return r;
     }
-    /* PUNCH_HOLE is older; if it is not supported, it is likely that
-       ZERO_RANGE will not work either, so fall back to write. */
-    return r;
+
+    h->can_punch_hole = false;
   }
 #endif
 
 #ifdef FALLOC_FL_ZERO_RANGE
-  r = do_fallocate (h->fd, FALLOC_FL_ZERO_RANGE, offset, count);
-  if (r == -1 && errno != EOPNOTSUPP) {
-    nbdkit_error ("zero: %m");
+  /* ZERO_RANGE is not well supported yet, but it the next best option. */
+  if (h->can_zero_range) {
+    r = do_fallocate (h->fd, FALLOC_FL_ZERO_RANGE, offset, count);
+    if (r == 0)
+      return 0;
+
+    if (errno != EOPNOTSUPP) {
+      nbdkit_error ("zero: %m");
+      return r;
+    }
+
+    h->can_zero_range = false;
   }
-#else
+#endif
+
+#ifdef FALLOC_FL_PUNCH_HOLE
+  /* If we can punch hole but may not trim, we can combine punching hole and
+     fallocate to zero a range. This is much more efficient than writing zeros
+     manually. */
+  if (h->can_punch_hole && h->can_fallocate) {
+    r = do_fallocate (h->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+                      offset, count);
+    if (r == 0) {
+      r = do_fallocate(h->fd, 0, offset, count);
+      if (r == 0)
+        return 0;
+
+      if (errno != EOPNOTSUPP) {
+        nbdkit_error ("zero: %m");
+        return r;
+      }
+
+      h->can_fallocate = false;
+    } else {
+      if (errno != EOPNOTSUPP) {
+        nbdkit_error ("zero: %m");
+        return r;
+      }
+
+      h->can_punch_hole = false;
+    }
+  }
+#endif
+
+  /* For block devices, we can use BLKZEROOUT.
+     NOTE: count and offset must be aligned to logical block size. */
+  if (h->is_block_device) {
+    uint64_t range[2] = {offset, count};
+
+    r = ioctl(h->fd, BLKZEROOUT, &range);
+    if (r == 0)
+      return 0;
+
+    nbdkit_error("zero: %m");
+    return r;
+  }
+
   /* Trigger a fall back to writing */
   errno = EOPNOTSUPP;
-#endif
 
   return r;
 }
-- 
2.17.1
Nir Soffer
2018-Aug-02  19:05 UTC
[Libguestfs] [PATCH 2/3] file: Support zero without ZERO_RANGE
File systems not supporting FALLOC_FL_ZERO_RANGE yet fall back to manual
zeroing.
We can avoid this by combining two fallocate calls:
    fallocate(FALLOC_FL_PUNHCH_HOLE)
    fallocate(0)
Based on my tests this is much more efficient compared to manual
zeroing. The idea came from this qemu patch:
https://github.com/qemu/qemu/commit/1cdc3239f1bb
Here is an example run with NFS 4.2 without this change, converting
fedora 27 image created with virt-builder:
$ export SOCK=/tmp/nbd.sock
$ export FILE=/nfs-mount/fedora-27.img
$ src/nbdkit plugins/file/.libs/nbdkit-file-plugin.so file=$FILE -U $SOCK
$ time qemu-img convert -n -f raw -O raw /var/tmp/fedora-27.img
nbd:unix:/tmp/nbd.sock
real	0m17.481s
user	0m0.199s
sys	0m0.691s
$ time qemu-img convert -n -f raw -O raw -W /var/tmp/fedora-27.img
nbd:unix:/tmp/nbd.sock
real	0m17.072s
user	0m0.191s
sys	0m0.738s
With this change:
$ time qemu-img convert -n -f raw -O raw /var/tmp/fedora-27.img
nbd:unix:/tmp/nbd.sock
real	0m6.285s
user	0m0.217s
sys	0m0.829s
$ time qemu-img convert -n -f raw -O raw -W /var/tmp/fedora-27.img
nbd:unix:/tmp/nbd.sock
real	0m3.967s
user	0m0.193s
sys	0m0.702s
Note: the image is sparse, but nbdkit creates a fully allocated image.
This may be a bug in nbdkit or qemu-img.
---
 plugins/file/file.c | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
diff --git a/plugins/file/file.c b/plugins/file/file.c
index 8bb9a2d..aa05492 100644
--- a/plugins/file/file.c
+++ b/plugins/file/file.c
@@ -121,6 +121,7 @@ struct handle {
   int fd;
   bool can_punch_hole;
   bool can_zero_range;
+  bool can_fallocate;
 };
 
 /* Create the per-connection handle. */
@@ -161,6 +162,8 @@ file_open (int readonly)
   h->can_zero_range = false;
 #endif
 
+  h->can_fallocate = true;
+
   return h;
 }
 
@@ -297,6 +300,35 @@ file_zero (void *handle, uint32_t count, uint64_t offset,
int may_trim)
   }
 #endif
 
+#ifdef FALLOC_FL_PUNCH_HOLE
+  /* If we can punch hole but may not trim, we can combine punching hole and
+      fallocate to zero a range. This is expected to be more efficient than
+      writing zeros manually. */
+  if (h->can_punch_hole && h->can_fallocate) {
+    r = do_fallocate (h->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+                      offset, count);
+    if (r == 0) {
+      r = do_fallocate(h->fd, 0, offset, count);
+      if (r == 0)
+        return r;
+
+      if (errno != EOPNOTSUPP) {
+        nbdkit_error ("zero: %m");
+        return r;
+      }
+
+      h->can_fallocate = false;
+    } else {
+      if (errno != EOPNOTSUPP) {
+        nbdkit_error ("zero: %m");
+        return r;
+      }
+
+      h->can_punch_hole = false;
+    }
+  }
+#endif
+
   /* Trigger a fall back to writing */
   errno = EOPNOTSUPP;
   return r;
-- 
2.17.1
Nir Soffer
2018-Aug-02  19:05 UTC
[Libguestfs] [PATCH 3/3] file: Zero for block devices on old kernels
fallocate(FALLOC_FL_ZERO_RANGE) is supportd for block devices with
modern kernel, but when it is not, we fall back to manual zeroing.
Check if the underlying file is a block device when opening the file,
and fall back to ioctl(BLKZEROOUT) for aligned zero requests for a
block device.
Here is an example run without this change on RHEL 7.5:
$ export SOCK=/tmp/nbd.sock
$ export
BLOCK=/dev/e30bfac2-8e13-479d-8cd6-c6da5e306c4e/c9864222-bc52-4359-80d7-76e47d619b15
$ src/nbdkit plugins/file/.libs/nbdkit-file-plugin.so file=$BLOCK -U $SOCK
$ time qemu-img convert -n -f raw -O raw /var/tmp/fedora-27.img
nbd:unix:/tmp/nbd.sock
real	0m11.563s
user	0m0.219s
sys	0m0.746s
$ time qemu-img convert -n -f raw -O raw -W /var/tmp/fedora-27.img
nbd:unix:/tmp/nbd.sock
real	0m9.841s
user	0m0.167s
sys	0m0.715s
With this change:
$ time qemu-img convert -n -f raw -O raw /var/tmp/fedora-27.img
nbd:unix:/tmp/nbd.sock
real	0m2.796s
user	0m0.202s
sys	0m0.700s
$ time qemu-img convert -n -f raw -O raw -W /var/tmp/fedora-27.img
nbd:unix:/tmp/nbd.sock
real	0m1.908s
user	0m0.166s
sys	0m0.728s
---
 plugins/file/file.c | 68 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 57 insertions(+), 11 deletions(-)
diff --git a/plugins/file/file.c b/plugins/file/file.c
index aa05492..8877db2 100644
--- a/plugins/file/file.c
+++ b/plugins/file/file.c
@@ -45,6 +45,7 @@
 
 #if defined(__linux__) && !defined(FALLOC_FL_PUNCH_HOLE)
 #include <linux/falloc.h>   /* For FALLOC_FL_*, glibc < 2.18 */
+#include <linux/fs.h>       /* For BLKZEROOUT */
 #endif
 
 #include <nbdkit-plugin.h>
@@ -119,16 +120,25 @@ file_config_complete (void)
 /* The per-connection handle. */
 struct handle {
   int fd;
+  bool is_block_device;
+  int sector_size;
   bool can_punch_hole;
   bool can_zero_range;
   bool can_fallocate;
 };
 
+static bool
+is_aligned(struct handle *h, uint64_t n)
+{
+  return n % h->sector_size == 0;
+}
+
 /* Create the per-connection handle. */
 static void *
 file_open (int readonly)
 {
   struct handle *h;
+  struct stat statbuf;
   int flags;
 
   h = malloc (sizeof *h);
@@ -150,6 +160,26 @@ file_open (int readonly)
     return NULL;
   }
 
+  if (fstat (h->fd, &statbuf) == -1) {
+    nbdkit_error ("fstat: %s: %m", filename);
+    free (h);
+    return NULL;
+  }
+
+  h->is_block_device = S_ISBLK(statbuf.st_mode);
+
+#ifdef BLKSSZGET
+  if (h->is_block_device) {
+    if (ioctl (h->fd, BLKSSZGET, &h->sector_size)) {
+      nbdkit_error ("ioctl(BLKSSZGET): %s: %m", filename);
+      free (h);
+      return NULL;
+    }
+  }
+#else
+  h->sector_size = 4096;  /* Safe guess */
+#endif
+
 #ifdef FALLOC_FL_PUNCH_HOLE
   h->can_punch_hole = true;
 #else
@@ -184,27 +214,29 @@ static int64_t
 file_get_size (void *handle)
 {
   struct handle *h = handle;
-  struct stat statbuf;
 
-  if (fstat (h->fd, &statbuf) == -1) {
-    nbdkit_error ("stat: %m");
-    return -1;
-  }
-
-  if (S_ISBLK (statbuf.st_mode)) {
+  if (h->is_block_device) {
+    /* Block device, so st_size will not be the true size. */
     off_t size;
 
-    /* Block device, so st_size will not be the true size. */
     size = lseek (h->fd, 0, SEEK_END);
     if (size == -1) {
       nbdkit_error ("lseek (to find device size): %m");
       return -1;
     }
+
     return size;
-  }
+  } else {
+    /* Regular file. */
+    struct stat statbuf;
+
+    if (fstat (h->fd, &statbuf) == -1) {
+      nbdkit_error ("fstat: %m");
+      return -1;
+    }
 
-  /* Else regular file. */
-  return statbuf.st_size;
+    return statbuf.st_size;
+  }
 }
 
 /* Trim is advisory, but we prefer to advertise it only when we can actually
@@ -329,6 +361,20 @@ file_zero (void *handle, uint32_t count, uint64_t offset,
int may_trim)
   }
 #endif
 
+#ifdef BLKZEROOUT
+  /* For aligned range and block devices, we can use BLKZEROOUT. */
+  if (h->is_block_device && is_aligned (h, offset) &&
is_aligned (h, count)) {
+    uint64_t range[2] = {offset, count};
+
+    r = ioctl (h->fd, BLKZEROOUT, &range);
+    if (r == 0)
+      return r;
+
+    nbdkit_error ("zero: %m");
+    return r;
+  }
+#endif
+
   /* Trigger a fall back to writing */
   errno = EOPNOTSUPP;
   return r;
-- 
2.17.1
Nir Soffer
2018-Aug-02  19:09 UTC
Re: [Libguestfs] [PATCH] file: Zero support for block devices and NFS 4.2
Oops, please ignore this, this was already sent and reviewed here: https://www.redhat.com/archives/libguestfs/2018-July/msg00084.html The patch was hiding in my tree and selected by a careless glob :-) On Thu, Aug 2, 2018 at 10:06 PM Nir Soffer <nirsof@gmail.com> wrote:> If we may not trim, we tried ZERO_RANGE, but this is not well supported > yet, for example it is not available on NFS 4.2. ZERO_RANGE and > PUNCH_HOLE are supported now on block devices, but not on RHRL 7, so we > fallback to slow manual zeroing there. > > Change the logic to support block devices on RHEL 7, and file systems > that do not support ZERO_RANGE. > > The new logic: > - If we may trim, try PUNCH_HOLE > - If we can zero range, Try ZERO_RANGE > - If we can punch hole and fallocate, try fallocate(PUNCH_HOLE) followed > by fallocate(0). > - If underlying file is a block device, try ioctl(BLKZEROOUT) > - Otherwise fallback to manual zeroing > > The handle keeps now the underlying file capabilities, so once we > discover that an operation is not supported, we never try it again. > > Here are examples runs on a server based on Intel(R) Xeon(R) CPU E5-2630 > v4 @ 2.20GHz, using XtremIO storage via 4G FC HBA and 4 paths to > storage. > > $ export SOCK=/tmp/nbd.sock > $ export > BLOCK=/dev/e30bfac2-8e13-479d-8cd6-c6da5e306c4e/c9864222-bc52-4359-80d7-76e47d619b15 > > $ src/nbdkit -f plugins/file/.libs/nbdkit-file-plugin.so file=$BLOCK -U > $SOCK > > $ time qemu-img convert -n -f raw -O raw /var/tmp/fedora-27.img > nbd:unix:$SOCK > > real 0m2.741s > user 0m0.224s > sys 0m0.634s > > $ time qemu-img convert -n -f raw -O raw -W /var/tmp/fedora-27.img > nbd:unix:$SOCK > > real 0m1.920s > user 0m0.163s > sys 0m0.735s > > Issues: > - ioctl(BLKZEROOUT) will fail if offset or count are not aligned to > logical sector size. I'm not sure if nbdkit or qemu-img ensure this. > - Need testing with NFS > --- > plugins/file/file.c | 126 ++++++++++++++++++++++++++++++++++++-------- > 1 file changed, 103 insertions(+), 23 deletions(-) > > diff --git a/plugins/file/file.c b/plugins/file/file.c > index fb20622..bce2ed1 100644 > --- a/plugins/file/file.c > +++ b/plugins/file/file.c > @@ -33,6 +33,7 @@ > > #include <config.h> > > +#include <stdbool.h> > #include <stdio.h> > #include <stdlib.h> > #include <string.h> > @@ -42,6 +43,8 @@ > #include <sys/stat.h> > #include <errno.h> > #include <linux/falloc.h> /* For FALLOC_FL_* on RHEL, glibc < 2.18 */ > +#include <sys/ioctl.h> > +#include <linux/fs.h> > > #include <nbdkit-plugin.h> > > @@ -116,6 +119,10 @@ file_config_complete (void) > /* The per-connection handle. */ > struct handle { > int fd; > + bool is_block_device; > + bool can_punch_hole; > + bool can_zero_range; > + bool can_fallocate; > }; > > /* Create the per-connection handle. */ > @@ -123,6 +130,7 @@ static void * > file_open (int readonly) > { > struct handle *h; > + struct stat statbuf; > int flags; > > h = malloc (sizeof *h); > @@ -144,6 +152,23 @@ file_open (int readonly) > return NULL; > } > > + if (fstat (h->fd, &statbuf) == -1) { > + nbdkit_error ("fstat: %s: %m", filename); > + free (h); > + return NULL; > + } > + > + h->is_block_device = S_ISBLK(statbuf.st_mode); > + > + /* These flags will disabled if an operation is not supported. */ > +#ifdef FALLOC_FL_PUNCH_HOLE > + h->can_punch_hole = true; > +#endif > +#ifdef FALLOC_FL_ZERO_RANGE > + h->can_zero_range = true; > +#endif > + h->can_fallocate = true; > + > return h; > } > > @@ -164,27 +189,29 @@ static int64_t > file_get_size (void *handle) > { > struct handle *h = handle; > - struct stat statbuf; > > - if (fstat (h->fd, &statbuf) == -1) { > - nbdkit_error ("stat: %m"); > - return -1; > - } > - > - if (S_ISBLK (statbuf.st_mode)) { > + if (h->is_block_device) { > + /* Block device, so st_size will not be the true size. */ > off_t size; > > - /* Block device, so st_size will not be the true size. */ > size = lseek (h->fd, 0, SEEK_END); > if (size == -1) { > nbdkit_error ("lseek (to find device size): %m"); > return -1; > } > + > return size; > - } > + } else { > + /* Regular file. */ > + struct stat statbuf; > + > + if (fstat (h->fd, &statbuf) == -1) { > + nbdkit_error ("fstat: %m"); > + return -1; > + } > > - /* Else regular file. */ > - return statbuf.st_size; > + return statbuf.st_size; > + } > } > > static int > @@ -250,33 +277,86 @@ file_pwrite (void *handle, const void *buf, uint32_t > count, uint64_t offset) > static int > file_zero (void *handle, uint32_t count, uint64_t offset, int may_trim) > { > -#if defined(FALLOC_FL_PUNCH_HOLE) || defined(FALLOC_FL_ZERO_RANGE) > struct handle *h = handle; > -#endif > int r = -1; > > #ifdef FALLOC_FL_PUNCH_HOLE > - if (may_trim) { > + /* If we can and may trim, punching hole is our best option. */ > + if (h->can_punch_hole && may_trim) { > r = do_fallocate (h->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, > offset, count); > - if (r == -1 && errno != EOPNOTSUPP) { > + if (r == 0) > + return 0; > + > + if (errno != EOPNOTSUPP) { > nbdkit_error ("zero: %m"); > + return r; > } > - /* PUNCH_HOLE is older; if it is not supported, it is likely that > - ZERO_RANGE will not work either, so fall back to write. */ > - return r; > + > + h->can_punch_hole = false; > } > #endif > > #ifdef FALLOC_FL_ZERO_RANGE > - r = do_fallocate (h->fd, FALLOC_FL_ZERO_RANGE, offset, count); > - if (r == -1 && errno != EOPNOTSUPP) { > - nbdkit_error ("zero: %m"); > + /* ZERO_RANGE is not well supported yet, but it the next best option. */ > + if (h->can_zero_range) { > + r = do_fallocate (h->fd, FALLOC_FL_ZERO_RANGE, offset, count); > + if (r == 0) > + return 0; > + > + if (errno != EOPNOTSUPP) { > + nbdkit_error ("zero: %m"); > + return r; > + } > + > + h->can_zero_range = false; > } > -#else > +#endif > + > +#ifdef FALLOC_FL_PUNCH_HOLE > + /* If we can punch hole but may not trim, we can combine punching hole > and > + fallocate to zero a range. This is much more efficient than writing > zeros > + manually. */ > + if (h->can_punch_hole && h->can_fallocate) { > + r = do_fallocate (h->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, > + offset, count); > + if (r == 0) { > + r = do_fallocate(h->fd, 0, offset, count); > + if (r == 0) > + return 0; > + > + if (errno != EOPNOTSUPP) { > + nbdkit_error ("zero: %m"); > + return r; > + } > + > + h->can_fallocate = false; > + } else { > + if (errno != EOPNOTSUPP) { > + nbdkit_error ("zero: %m"); > + return r; > + } > + > + h->can_punch_hole = false; > + } > + } > +#endif > + > + /* For block devices, we can use BLKZEROOUT. > + NOTE: count and offset must be aligned to logical block size. */ > + if (h->is_block_device) { > + uint64_t range[2] = {offset, count}; > + > + r = ioctl(h->fd, BLKZEROOUT, &range); > + if (r == 0) > + return 0; > + > + nbdkit_error("zero: %m"); > + return r; > + } > + > /* Trigger a fall back to writing */ > errno = EOPNOTSUPP; > -#endif > > return r; > } > -- > 2.17.1 > >
Nir Soffer
2018-Aug-02  19:19 UTC
Re: [Libguestfs] [PATCH 0/3] file: Zero for block devices and older file systems
On Thu, Aug 2, 2018 at 10:06 PM Nir Soffer <nirsof@gmail.com> wrote:> This is the second version to support efficient zero for block devices > on older kernels (e.g. RHEL 7.5), and file systems that do not support > yet FALLOC_FS_ZERO_RANGE (e.g. NFS 4.2). > > Changes since v1: > - Split to smaller patches > - Skip linux only includes on other systems > - Skip code using BLKZEROOUT if the macro is not defined > - Try BLKZEROOUT only if the offset and count are aligned to device > sector size. >Other changes I forgot to mention (interrupted by a child): - initialize h->can_* properly. Before they were uninitialized if FALLOC_FL_* macros were not defined. - Use new h->can_punch_hole in file_can_trim, so now we report the actual capability once we detected it. - Use h->can_punch_hole in file_trim, so we try only once if the operation is not supported.> v1 was here: > https://www.redhat.com/archives/libguestfs/2018-July/msg00084.html > > Nir Soffer (3): > file: Avoid unsupported fallocate() calls > file: Support zero without ZERO_RANGE > file: Zero for block devices on old kernels > > plugins/file/file.c | 194 +++++++++++++++++++++++++++++++++----------- > 1 file changed, 147 insertions(+), 47 deletions(-) > > -- > 2.17.1 > >
Richard W.M. Jones
2018-Aug-02  19:28 UTC
Re: [Libguestfs] [PATCH 2/3] file: Support zero without ZERO_RANGE
On Thu, Aug 02, 2018 at 10:05:28PM +0300, Nir Soffer wrote:> File systems not supporting FALLOC_FL_ZERO_RANGE yet fall back to manual > zeroing. > > We can avoid this by combining two fallocate calls: > > fallocate(FALLOC_FL_PUNHCH_HOLE)"PUNCH"> fallocate(0)> +#ifdef FALLOC_FL_PUNCH_HOLE > + /* If we can punch hole but may not trim, we can combine punching hole and > + fallocate to zero a range. This is expected to be more efficient than > + writing zeros manually. */Although we can fix this before pushing so there's no need to submit a new version, it's better if the comment style sticks to the same as used elsewhere, ie: /* Commmnt * More comment */ Rich. -- Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones Read my programming and virtualization blog: http://rwmj.wordpress.com libguestfs lets you edit virtual machines. Supports shell scripting, bindings from many languages. http://libguestfs.org
Richard W.M. Jones
2018-Aug-02  19:30 UTC
Re: [Libguestfs] [PATCH 3/3] file: Zero for block devices on old kernels
On Thu, Aug 02, 2018 at 10:05:29PM +0300, Nir Soffer wrote:> fallocate(FALLOC_FL_ZERO_RANGE) is supportd for block devices with"supported" --- I think 1 & 2 look fine. I don't really know enough about the subject area to say anything intelligent for 3. I think Eric needs the final say on these. Thanks, Rich. -- Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones Read my programming and virtualization blog: http://rwmj.wordpress.com libguestfs lets you edit virtual machines. Supports shell scripting, bindings from many languages. http://libguestfs.org
Eric Blake
2018-Aug-02  19:30 UTC
Re: [Libguestfs] [PATCH 1/3] file: Avoid unsupported fallocate() calls
On 08/02/2018 02:05 PM, Nir Soffer wrote:> When using file systems not supporting ZERO_RANGE (e.g. NFS 4.2) or > block device on kernel < 4.9, we used to call fallocate() for every > zero, fail with EOPNOTSUPP, and fallback to manual zeroing. When > trimming, we used to try unsupported fallocate() on every call. > > Change file handle to remember if punching holes or zeroing range are > supported, and avoid unsupported calls. > > - can_trim changed to report the actual capability once we tried to > punch a hole. I don't think this is useful yet.The documentation states that filters (or the nbdkit engine itself) are free to cache a per-connection value of .can_trim(), and thus the plugin should keep the value stable per connection - in part, because they affect what is advertised to the client, but the advertisement happens only once as part of the client's initial connection. Changing it after the fact on a given connection won't change the client's behavior (turning the feature on is pointless as the client already remembers it was off; turning the feature off is detrimental as the client already assumes it works). So the best you can do is make subsequent connections more efficient after the initial connection has learned state.> > - zero changed to: > 1. If we can punch hole and may trim, try PUNCH_HOLE > 2. If we can zero range, try ZERO_RANGE > 3. Fall back to manual writing > > - trim changed to: > 1. If we can punch hole, try PUNCH_HOLE > 2. SucceedSeems reasonable from the description.> @@ -118,6 +119,8 @@ file_config_complete (void) > /* The per-connection handle. */ > struct handle { > int fd; > + bool can_punch_hole; > + bool can_zero_range;Would it be better to make these tri-state rather than merely bool? (Indeterminate, supported, known to fail)> }; > > /* Create the per-connection handle. */ > @@ -146,6 +149,18 @@ file_open (int readonly) > return NULL; > } > > +#ifdef FALLOC_FL_PUNCH_HOLE > + h->can_punch_hole = true; > +#else > + h->can_punch_hole = false;If you use tri-state, then the existence of the macro results in an initialization of indeterminate, whereas the absence results in known to fail.> +#endif > + > +#ifdef FALLOC_FL_ZERO_RANGE > + h->can_zero_range = true; > +#else > + h->can_zero_range = false;Likewise.> +#endif > + > return h; > } > > @@ -189,19 +204,15 @@ file_get_size (void *handle) > return statbuf.st_size; > } > > +/* Trim is advisory, but we prefer to advertise it only when we can actually > + * (attempt to) punch holes. Before we tried to punch a hole, report true if > + * FALLOC_FL_PUNCH_HOLE is defined before we did the first call. Once we tried > + * to punch a hole, report the actual cappability of the underlying file. */s/cappability/capability/ If you use a tri-state, then report true if the variable is indeterminate or works, false if known to fail.> static int > file_can_trim (void *handle) > { > - /* Trim is advisory, but we prefer to advertise it only when we can > - * actually (attempt to) punch holes. Since not all filesystems > - * support all fallocate modes, it would be nice if we had a way > - * from fpathconf() to definitively learn what will work on a given > - * fd for a more precise answer; oh well. */The comment about fpathconf() would still be nice to keep in some form.> -#ifdef FALLOC_FL_PUNCH_HOLE > - return 1; > -#else > - return 0; > -#endif > + struct handle *h = handle; > + return h->can_punch_hole;I'm a bit worried about whether changing the return value within the context of a single connection is wise, or if we need to further maintain a per-connection state that is initialized according to the overall plugin state.> }Also, it might be nice to add a .can_zero() callback, so that nbdkit won't even waste time calling into .zero() if we know we will just be deferring right back to a full write (either because the macro is not available, or because we encountered a failure trying to use it on a previous connection).> @@ -301,27 +320,30 @@ file_flush (void *handle) > static int > file_trim (void *handle, uint32_t count, uint64_t offset) > { > - int r = -1; > #ifdef FALLOC_FL_PUNCH_HOLE > struct handle *h = handle; > + int r = -1; > + > + if (h->can_punch_hole) { > + r = do_fallocate (h->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, > + offset, count);Should we also use FALLOC_FL_NO_HIDE_STALE here, if it is available? -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3266 Virtualization: qemu.org | libvirt.org
Eric Blake
2018-Aug-02  19:39 UTC
Re: [Libguestfs] [PATCH 3/3] file: Zero for block devices on old kernels
On 08/02/2018 02:05 PM, Nir Soffer wrote:> fallocate(FALLOC_FL_ZERO_RANGE) is supportd for block devices with > modern kernel, but when it is not, we fall back to manual zeroing. > > Check if the underlying file is a block device when opening the file, > and fall back to ioctl(BLKZEROOUT) for aligned zero requests for a > block device. > > +++ b/plugins/file/file.c > @@ -45,6 +45,7 @@ > > #if defined(__linux__) && !defined(FALLOC_FL_PUNCH_HOLE) > #include <linux/falloc.h> /* For FALLOC_FL_*, glibc < 2.18 */ > +#include <linux/fs.h> /* For BLKZEROOUT */Will this pick up BLKZEROOUT in all cases where it is needed? Or do we need to relax the !defined(FALLOC_FL_PUNCH_HOLE), and just blindly include both of these headers for all Linux compilations?> +static bool > +is_aligned(struct handle *h, uint64_t n) > +{ > + return n % h->sector_size == 0;Since we know (but the compiler doesn't) that sector_size is a power of 2, it is slightly faster to use bitwise math: return !(n & (h->sector_size - 1))> +#ifdef BLKSSZGET > + if (h->is_block_device) { > + if (ioctl (h->fd, BLKSSZGET, &h->sector_size)) { > + nbdkit_error ("ioctl(BLKSSZGET): %s: %m", filename); > + free (h); > + return NULL;If the ioctl() fails, would it be better to just fall back...> + } > + } > +#else > + h->sector_size = 4096; /* Safe guess */...to the safe guess, instead of giving up entirely? (Might matter on a system with newer headers that have the macro, but where the kernel does not support the ioctl).> @@ -329,6 +361,20 @@ file_zero (void *handle, uint32_t count, uint64_t offset, int may_trim) > } > #endif > > +#ifdef BLKZEROOUT > + /* For aligned range and block devices, we can use BLKZEROOUT. */ > + if (h->is_block_device && is_aligned (h, offset) && is_aligned (h, count)) {Since alignment is a power of 2, you can compress this as: if (h->is_block_device && is_aligned (h, offset | count)) {> + uint64_t range[2] = {offset, count}; > + > + r = ioctl (h->fd, BLKZEROOUT, &range); > + if (r == 0) > + return r; > + > + nbdkit_error ("zero: %m"); > + return r;Are we sure that treating ALL errors as fatal is worthwhile, or should we still attempt to trigger a fall back to writing?> + } > +#endif > + > /* Trigger a fall back to writing */ > errno = EOPNOTSUPP; > return r; >-- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3266 Virtualization: qemu.org | libvirt.org
Reasonably Related Threads
- [PATCH 0/3] file: Zero for block devices and older file systems
- Re: [PATCH 1/3] file: Avoid unsupported fallocate() calls
- [PATCH 1/3] file: Avoid unsupported fallocate() calls
- [PATCH v2 0/4] file: Zero for block devices and older file systems
- [PATCH] file: Zero support for block devices and NFS 4.2