Nir Soffer
2021-Feb-21 01:09 UTC
[Libguestfs] [PATCH libnbd 0/2] Improve zero correctness, reliability and compatibility
Don't use trim() for zeroing. NBD_CMD_TRIM does not guarantee anything
about the content of the trimmed range.

Add an allocate flag to zero(), so it can be used both for sparse and
allocated copy.

The zero strategy was less advanced than what we have in the nbdkit
file plugin. Port the zero strategy from the file plugin, improving
reliability and compatibility.

Tested with loop devices, local files, and qemu-nbd.

Nir Soffer (2):
  nbdcopy: Do not use trim for zeroing
  nbdcopy: Port zero strategy from nbdkit

 copy/copy-sparse.sh         |  66 ++++++++---------
 copy/file-ops.c             | 141 +++++++++++++++++++++++++++++-------
 copy/main.c                 |  19 +++++
 copy/multi-thread-copying.c |  20 ++---
 copy/nbd-ops.c              |   8 +-
 copy/nbdcopy.h              |  11 ++-
 copy/null-ops.c             |   4 +-
 copy/synch-copying.c        |   8 +-
 8 files changed, 190 insertions(+), 87 deletions(-)

-- 
2.26.2
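[Editor's note] For context, the behaviour the cover letter relies on can be shown with plain libnbd calls. This is a minimal standalone sketch, not part of the series; the URI and sizes are placeholders and per-request error checking is omitted.

#include <stdio.h>
#include <stdlib.h>
#include <libnbd.h>

int
main (void)
{
  struct nbd_handle *nbd = nbd_create ();

  if (nbd == NULL || nbd_connect_uri (nbd, "nbd://localhost") == -1) {
    fprintf (stderr, "%s\n", nbd_get_error ());
    exit (EXIT_FAILURE);
  }

  /* NBD_CMD_TRIM is only a hint: the trimmed range may read back as
   * anything afterwards, so it cannot be used for zeroing the target.
   */
  nbd_trim (nbd, 65536, 0, 0);

  /* NBD_CMD_WRITE_ZEROES guarantees the range reads back as zeroes.
   * Without LIBNBD_CMD_FLAG_NO_HOLE the server may deallocate the range
   * (sparse copy); with the flag it must allocate it (allocated copy).
   */
  nbd_zero (nbd, 65536, 0, 0);
  nbd_zero (nbd, 65536, 65536, LIBNBD_CMD_FLAG_NO_HOLE);

  nbd_shutdown (nbd, 0);
  nbd_close (nbd);
  return 0;
}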
Nir Soffer
2021-Feb-21 01:09 UTC
[Libguestfs] [PATCH libnbd 1/2] nbdcopy: Do not use trim for zeroing
Current code uses NBD_CMD_TRIM for writing zeroes without allocating
space (--allocate not specified). This is incorrect and can lead to
corrupting the target image. The NBD manual warns about NBD_CMD_TRIM:

    After issuing this command, a client MUST NOT make any assumptions
    about the contents of the export affected by this command, until
    overwriting it again with NBD_CMD_WRITE or NBD_CMD_WRITE_ZEROES

Add an allocate flag to the zero operation, to allow zeroing by
punching a hole (the default) or zeroing by allocating space. nbd-ops
uses NBD_CMD_WRITE_ZEROES with or without the NBD_FLAG_NO_HOLE flag.
file-ops uses fallocate with the PUNCH_HOLE or ZERO_RANGE flag.

The copy-sparse test was modified to require a zero operation with the
may_trim flag.

Signed-off-by: Nir Soffer <nsoffer at redhat.com>
---
 copy/copy-sparse.sh         | 66 ++++++++++++++++++-------------------
 copy/file-ops.c             | 49 +++++++++++++++++++--------
 copy/multi-thread-copying.c | 20 ++++-------
 copy/nbd-ops.c              |  8 ++---
 copy/nbdcopy.h              |  5 +--
 copy/null-ops.c             |  4 +--
 copy/synch-copying.c        |  8 ++---
 7 files changed, 88 insertions(+), 72 deletions(-)

diff --git a/copy/copy-sparse.sh b/copy/copy-sparse.sh
index 846767b..c43b41a 100755
--- a/copy/copy-sparse.sh
+++ b/copy/copy-sparse.sh
@@ -33,7 +33,7 @@ cleanup_fn rm -f $out
 
 # Copy from a sparse data disk to an nbdkit-eval-plugin instance which
 # is logging everything.  This allows us to see exactly what nbdcopy
-# is writing, to ensure it is writing and trimming the target as
+# is writing, to ensure it is writing and zeroing the target as
 # expected.
 $VG nbdcopy -S 0 -- \
     [ nbdkit --exit-with-parent data data='
@@ -60,38 +60,38 @@ if [ "$(cat $out)" != "pwrite 1 4294967296
 pwrite 32768 0
 pwrite 32768 1073709056
 pwrite 32768 4294934528
-trim 134184960 32768
-trim 134184960 4160749568
-trim 134184960 939524096
-trim 134217728 1073741824
-trim 134217728 1207959552
-trim 134217728 134217728
-trim 134217728 1342177280
-trim 134217728 1476395008
-trim 134217728 1610612736
-trim 134217728 1744830464
-trim 134217728 1879048192
-trim 134217728 2013265920
-trim 134217728 2147483648
-trim 134217728 2281701376
-trim 134217728 2415919104
-trim 134217728 2550136832
-trim 134217728 268435456
-trim 134217728 2684354560
-trim 134217728 2818572288
-trim 134217728 2952790016
-trim 134217728 3087007744
-trim 134217728 3221225472
-trim 134217728 3355443200
-trim 134217728 3489660928
-trim 134217728 3623878656
-trim 134217728 3758096384
-trim 134217728 3892314112
-trim 134217728 402653184
-trim 134217728 4026531840
-trim 134217728 536870912
-trim 134217728 671088640
-trim 134217728 805306368" ]; then
+zero 134184960 32768 may_trim
+zero 134184960 4160749568 may_trim
+zero 134184960 939524096 may_trim
+zero 134217728 1073741824 may_trim
+zero 134217728 1207959552 may_trim
+zero 134217728 1342177280 may_trim
+zero 134217728 134217728 may_trim
+zero 134217728 1476395008 may_trim
+zero 134217728 1610612736 may_trim
+zero 134217728 1744830464 may_trim
+zero 134217728 1879048192 may_trim
+zero 134217728 2013265920 may_trim
+zero 134217728 2147483648 may_trim
+zero 134217728 2281701376 may_trim
+zero 134217728 2415919104 may_trim
+zero 134217728 2550136832 may_trim
+zero 134217728 2684354560 may_trim
+zero 134217728 268435456 may_trim
+zero 134217728 2818572288 may_trim
+zero 134217728 2952790016 may_trim
+zero 134217728 3087007744 may_trim
+zero 134217728 3221225472 may_trim
+zero 134217728 3355443200 may_trim
+zero 134217728 3489660928 may_trim
+zero 134217728 3623878656 may_trim
+zero 134217728 3758096384 may_trim
+zero 134217728 3892314112 may_trim
+zero 134217728 4026531840 may_trim
+zero 134217728 402653184 may_trim
+zero 134217728 536870912 may_trim
+zero 134217728 671088640 may_trim
+zero 134217728 805306368 may_trim" ]; then
     echo "$0: output does not match expected"
     exit 1
 fi
diff --git a/copy/file-ops.c b/copy/file-ops.c
index 2a239d0..d0b9447 100644
--- a/copy/file-ops.c
+++ b/copy/file-ops.c
@@ -100,10 +100,9 @@ file_synch_write (struct rw *rw,
 }
 
 static bool
-file_synch_trim (struct rw *rw, uint64_t offset, uint64_t count)
+file_punch_hole(int fd, uint64_t offset, uint64_t count)
 {
 #ifdef FALLOC_FL_PUNCH_HOLE
-  int fd = rw->u.local.fd;
   int r;
 
   r = fallocate (fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
@@ -113,17 +112,14 @@ file_synch_trim (struct rw *rw, uint64_t offset, uint64_t count)
     exit (EXIT_FAILURE);
   }
   return true;
-#else /* !FALLOC_FL_PUNCH_HOLE */
-  return false;
 #endif
+  return false;
 }
 
 static bool
-file_synch_zero (struct rw *rw, uint64_t offset, uint64_t count)
+file_zero_range(int fd, uint64_t offset, uint64_t count)
 {
-  if (S_ISREG (rw->u.local.stat.st_mode)) {
 #ifdef FALLOC_FL_ZERO_RANGE
-    int fd = rw->u.local.fd;
     int r;
 
     r = fallocate (fd, FALLOC_FL_ZERO_RANGE, offset, count);
@@ -133,11 +129,13 @@ file_synch_zero (struct rw *rw, uint64_t offset, uint64_t count)
     }
     return true;
 #endif
-  }
-  else if (S_ISBLK (rw->u.local.stat.st_mode) &&
-           IS_ALIGNED (offset | count, rw->u.local.sector_size)) {
+  return false;
+}
+
+static bool
+file_zeroout(int fd, uint64_t offset, uint64_t count)
+{
 #ifdef BLKZEROOUT
-    int fd = rw->u.local.fd;
     int r;
     uint64_t range[2] = {offset, count};
 
@@ -148,6 +146,31 @@ file_synch_zero (struct rw *rw, uint64_t offset, uint64_t count)
     }
     return true;
 #endif
+  return false;
+}
+
+static bool
+file_synch_trim (struct rw *rw, uint64_t offset, uint64_t count)
+{
+  return file_punch_hole(rw->u.local.fd, offset, count);
+}
+
+static bool
+file_synch_zero (struct rw *rw, uint64_t offset, uint64_t count, bool allocate)
+{
+  int fd = rw->u.local.fd;
+
+  if (S_ISREG (rw->u.local.stat.st_mode)) {
+    if (allocate) {
+      return file_zero_range (fd, offset, count);
+    } else {
+      return file_punch_hole (fd, offset, count);
+    }
+  }
+  else if (S_ISBLK (rw->u.local.stat.st_mode) &&
+           IS_ALIGNED (offset | count, rw->u.local.sector_size)) {
+    /* Always allocate, discard and gurantee zeroing. */
+    return file_zeroout (fd, offset, count);
   }
 
   return false;
@@ -197,9 +220,9 @@ file_asynch_trim (struct rw *rw, struct command *command,
 
 static bool
 file_asynch_zero (struct rw *rw, struct command *command,
-                  nbd_completion_callback cb)
+                  nbd_completion_callback cb, bool allocate)
 {
-  if (!file_synch_zero (rw, command->offset, command->slice.len))
+  if (!file_synch_zero (rw, command->offset, command->slice.len, allocate))
     return false;
   errno = 0;
   if (cb.callback (cb.user_data, &errno) == -1) {
diff --git a/copy/multi-thread-copying.c b/copy/multi-thread-copying.c
index 98b4056..8061787 100644
--- a/copy/multi-thread-copying.c
+++ b/copy/multi-thread-copying.c
@@ -485,11 +485,12 @@ finished_read (void *vp, int *error)
  * --destination-is-zero:
  *                 do nothing
  *
- * --allocated:    we must write zeroes either using an efficient
+ * --allocated:    write zeroes allocating space using an efficient
  *                 zeroing command or writing a command of zeroes
  *
- * (neither flag)  try trimming if supported, else write zeroes
- *                 as above
+ * (neither flag)  write zeroes punching a hole using an efficient
+ *                 zeroing command or fallback to writing a command
+ *                 of zeroes.
  *
  * This takes over ownership of the command and frees it eventually.
  */
@@ -501,22 +502,13 @@ fill_dst_range_with_zeroes (struct command *command)
   if (destination_is_zero)
     goto free_and_return;
 
-  if (!allocated) {
-    /* Try trimming. */
-    if (dst.ops->asynch_trim (&dst, command,
-                              (nbd_completion_callback) {
-                                .callback = free_command,
-                                .user_data = command,
-                              }))
-      return;
-  }
-
   /* Try efficient zeroing. */
   if (dst.ops->asynch_zero (&dst, command,
                             (nbd_completion_callback) {
                               .callback = free_command,
                               .user_data = command,
-                            }))
+                            },
+                            allocated))
     return;
 
   /* Fall back to loop writing zeroes.  This is going to be slow
diff --git a/copy/nbd-ops.c b/copy/nbd-ops.c
index 0bcf29b..f3a9744 100644
--- a/copy/nbd-ops.c
+++ b/copy/nbd-ops.c
@@ -97,13 +97,13 @@ nbd_ops_synch_trim (struct rw *rw, uint64_t offset, uint64_t count)
 }
 
 static bool
-nbd_ops_synch_zero (struct rw *rw, uint64_t offset, uint64_t count)
+nbd_ops_synch_zero (struct rw *rw, uint64_t offset, uint64_t count, bool allocate)
 {
   if (!rw->u.nbd.can_zero)
     return false;
 
   if (nbd_zero (rw->u.nbd.handles.ptr[0],
-                count, offset, LIBNBD_CMD_FLAG_NO_HOLE) == -1) {
+                count, offset, allocate ? LIBNBD_CMD_FLAG_NO_HOLE : 0) == -1) {
     fprintf (stderr, "%s: %s\n", rw->name, nbd_get_error ());
     exit (EXIT_FAILURE);
   }
@@ -158,7 +158,7 @@ nbd_ops_asynch_trim (struct rw *rw, struct command *command,
 
 static bool
 nbd_ops_asynch_zero (struct rw *rw, struct command *command,
-                     nbd_completion_callback cb)
+                     nbd_completion_callback cb, bool allocate)
 {
   if (!rw->u.nbd.can_zero)
     return false;
@@ -167,7 +167,7 @@ nbd_ops_asynch_zero (struct rw *rw, struct command *command,
 
   if (nbd_aio_zero (rw->u.nbd.handles.ptr[command->index],
                     command->slice.len, command->offset,
-                    cb, LIBNBD_CMD_FLAG_NO_HOLE) == -1) {
+                    cb, allocate ? LIBNBD_CMD_FLAG_NO_HOLE : 0) == -1) {
     fprintf (stderr, "%s: %s\n", rw->name, nbd_get_error ());
     exit (EXIT_FAILURE);
   }
diff --git a/copy/nbdcopy.h b/copy/nbdcopy.h
index 69fac2a..21d09bf 100644
--- a/copy/nbdcopy.h
+++ b/copy/nbdcopy.h
@@ -134,7 +134,8 @@ struct rw_ops {
   bool (*synch_trim) (struct rw *rw, uint64_t offset, uint64_t count);
 
   /* Synchronously zero.  If not possible, returns false. */
-  bool (*synch_zero) (struct rw *rw, uint64_t offset, uint64_t count);
+  bool (*synch_zero) (struct rw *rw, uint64_t offset, uint64_t count,
+                      bool allocate);
 
   /* Asynchronous I/O operations.  These start the operation and call
    * 'cb' on completion.
@@ -162,7 +163,7 @@ struct rw_ops {
    * returns false.
    */
   bool (*asynch_zero) (struct rw *rw, struct command *command,
-                       nbd_completion_callback cb);
+                       nbd_completion_callback cb, bool allocate);
 
   /* Number of asynchronous commands in flight for a particular thread. */
   unsigned (*in_flight) (struct rw *rw, uintptr_t index);
diff --git a/copy/null-ops.c b/copy/null-ops.c
index b2ca66f..0a7b0fb 100644
--- a/copy/null-ops.c
+++ b/copy/null-ops.c
@@ -63,7 +63,7 @@ null_synch_trim (struct rw *rw, uint64_t offset, uint64_t count)
 }
 
 static bool
-null_synch_zero (struct rw *rw, uint64_t offset, uint64_t count)
+null_synch_zero (struct rw *rw, uint64_t offset, uint64_t count, bool allocate)
 {
   return true;
 }
@@ -102,7 +102,7 @@ null_asynch_trim (struct rw *rw, struct command *command,
 
 static bool
 null_asynch_zero (struct rw *rw, struct command *command,
-                  nbd_completion_callback cb)
+                  nbd_completion_callback cb, bool allocate)
 {
   errno = 0;
   if (cb.callback (cb.user_data, &errno) == -1) {
diff --git a/copy/synch-copying.c b/copy/synch-copying.c
index 2712c10..37d2d7f 100644
--- a/copy/synch-copying.c
+++ b/copy/synch-copying.c
@@ -69,10 +69,10 @@ synch_copying (void)
         assert (exts.ptr[i].length <= count);
 
         if (exts.ptr[i].zero) {
-          if (!dst.ops->synch_trim (&dst, offset, exts.ptr[i].length) &&
-              !dst.ops->synch_zero (&dst, offset, exts.ptr[i].length)) {
-            /* If neither trimming nor efficient zeroing are possible,
-             * write zeroes the hard way.
+          if (!dst.ops->synch_zero (&dst, offset, exts.ptr[i].length, false) &&
+              !dst.ops->synch_zero (&dst, offset, exts.ptr[i].length, true)) {
+            /* If efficient zeroing (punching a hole or allocating space) are
+             * possible, write zeroes the hard way.
              */
             memset (buf, 0, exts.ptr[i].length);
             dst.ops->synch_write (&dst, buf, exts.ptr[i].length, offset);
-- 
2.26.2
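[Editor's note] To make the file-ops side of the new allocate flag concrete, here is a rough standalone sketch of the two fallocate() modes it selects between on a regular file. This is not code from the patch: the helper name is made up, and falling back to writing zeroes is left to the caller.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdint.h>
#include <fcntl.h>

/* Zero count bytes at offset in a regular file.  allocate=false punches a
 * hole (sparse result), allocate=true zeroes the range but keeps it
 * allocated.  Returns false if the needed fallocate mode is not compiled
 * in, so the caller can fall back to writing zeroes.
 */
static bool
zero_file_range (int fd, uint64_t offset, uint64_t count, bool allocate)
{
  int mode;

  if (allocate) {
#ifdef FALLOC_FL_ZERO_RANGE
    mode = FALLOC_FL_ZERO_RANGE;
#else
    return false;
#endif
  }
  else {
#ifdef FALLOC_FL_PUNCH_HOLE
    mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
#else
    return false;
#endif
  }

  if (fallocate (fd, mode, offset, count) == -1) {
    perror ("fallocate");
    exit (EXIT_FAILURE);
  }
  return true;
}

On an NBD destination the same flag only decides whether LIBNBD_CMD_FLAG_NO_HOLE is passed to nbd_zero()/nbd_aio_zero(), as in the nbd-ops.c hunks above.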
Nir Soffer
2021-Feb-21 01:09 UTC
[Libguestfs] [PATCH libnbd 2/2] nbdcopy: Port zero strategy from nbdkit
Port the zero strategy from the nbdkit file plugin, improving
reliability and compatibility with block devices on modern kernels.

Local rw now has capability flags: can_punch_hole, can_zero_range,
can_fallocate, and can_zeroout. The flags are initialized based on the
type of the file descriptor and compile time checks:

- For a regular file, we enable can_punch_hole, can_zero_range, and
  can_fallocate.
- For a block device, we enable can_punch_hole, can_zero_range, and
  can_zeroout.
- For pipes and sockets we don't enable anything.

When calling zero() for the first time, we try the following methods,
returning on the first success:

- If we don't need to allocate, try to punch a hole.
- Try to zero the range.
- Try to combine punching a hole and fallocate.
- Try the BLKZEROOUT ioctl.

If a method is not supported, we disable its capability flag, so the
next call tries only what works.

The fallocate and ioctl wrappers return false when the call is not
supported by the underlying storage, so we can disable the capability.
Previously the process would exit with an error.

Signed-off-by: Nir Soffer <nsoffer at redhat.com>
---
 copy/file-ops.c | 108 ++++++++++++++++++++++++++++++++++++----------
 copy/main.c     |  19 ++++++++
 copy/nbdcopy.h  |   6 +++
 3 files changed, 110 insertions(+), 23 deletions(-)

diff --git a/copy/file-ops.c b/copy/file-ops.c
index d0b9447..c4d8a67 100644
--- a/copy/file-ops.c
+++ b/copy/file-ops.c
@@ -99,6 +99,12 @@ file_synch_write (struct rw *rw,
   }
 }
 
+static inline bool
+is_not_supported (int err)
+{
+  return err == ENOTSUP || err == EOPNOTSUPP;
+}
+
 static bool
 file_punch_hole(int fd, uint64_t offset, uint64_t count)
 {
@@ -108,6 +114,9 @@ file_punch_hole(int fd, uint64_t offset, uint64_t count)
   r = fallocate (fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                  offset, count);
   if (r == -1) {
+    if (is_not_supported (errno))
+      return false;
+
     perror ("fallocate: FALLOC_FL_PUNCH_HOLE");
     exit (EXIT_FAILURE);
   }
@@ -120,14 +129,17 @@ static bool
 file_zero_range(int fd, uint64_t offset, uint64_t count)
 {
 #ifdef FALLOC_FL_ZERO_RANGE
-    int r;
+  int r;
 
-    r = fallocate (fd, FALLOC_FL_ZERO_RANGE, offset, count);
-    if (r == -1) {
-      perror ("fallocate: FALLOC_FL_ZERO_RANGE");
-      exit (EXIT_FAILURE);
-    }
-    return true;
+  r = fallocate (fd, FALLOC_FL_ZERO_RANGE, offset, count);
+  if (r == -1) {
+    if (is_not_supported (errno))
+      return false;
+
+    perror ("fallocate: FALLOC_FL_ZERO_HOLE");
+    exit (EXIT_FAILURE);
+  }
+  return true;
 #endif
   return false;
 }
@@ -136,15 +148,18 @@ static bool
 file_zeroout(int fd, uint64_t offset, uint64_t count)
 {
 #ifdef BLKZEROOUT
-    int r;
-    uint64_t range[2] = {offset, count};
+  int r;
+  uint64_t range[2] = {offset, count};
 
-    r = ioctl (fd, BLKZEROOUT, &range);
-    if (r == -1) {
-      perror ("ioctl: BLKZEROOUT");
-      exit (EXIT_FAILURE);
-    }
-    return true;
+  r = ioctl (fd, BLKZEROOUT, &range);
+  if (r == -1) {
+    if (errno == ENOTTY)
+      return false;
+
+    perror ("ioctl: BLKZEROOUT");
+    exit (EXIT_FAILURE);
+  }
+  return true;
 #endif
   return false;
 }
@@ -152,7 +167,14 @@ file_zeroout(int fd, uint64_t offset, uint64_t count)
 static bool
 file_synch_trim (struct rw *rw, uint64_t offset, uint64_t count)
 {
-  return file_punch_hole(rw->u.local.fd, offset, count);
+  if (rw->u.local.can_punch_hole) {
+    if (file_punch_hole(rw->u.local.fd, offset, count))
+      return true;
+
+    rw->u.local.can_punch_hole = false;
+  }
+
+  return false;
 }
 
 static bool
@@ -160,17 +182,57 @@ file_synch_zero (struct rw *rw, uint64_t offset, uint64_t count, bool allocate)
 {
   int fd = rw->u.local.fd;
 
-  if (S_ISREG (rw->u.local.stat.st_mode)) {
-    if (allocate) {
-      return file_zero_range (fd, offset, count);
+  /* The first call will try several options, discovering the capabilities of
+   * the underlying storage, and disabling non working options. The next calls
+   * will try only what works.
+   *
+   * If we don't need to allocate try to punch a hole. This works for both
+   * files and block devices with modern kernels.
+   */
+
+  if (!allocate && rw->u.local.can_punch_hole) {
+    if (file_punch_hole (fd, offset, count))
+      return true;
+
+    rw->u.local.can_punch_hole = false;
+  }
+
+  /* Try to zero the range. This works for both files and block devices with
+   * modern kernels.
+   */
+
+  if (rw->u.local.can_zero_range) {
+    if (file_zero_range (fd, offset, count))
+      return true;
+
+    rw->u.local.can_zero_range = false;
+  }
+
+  /* If we can punch a hole and fallocate, we can combine both operations. This
+   * is expected to be more efficient than actually writing zeroes. This works
+   * only for files.
+   */
+
+  if (rw->u.local.can_punch_hole && rw->u.local.can_fallocate) {
+    if (file_punch_hole (fd, offset, count)) {
+      if (fallocate (fd, 0, offset, count))
+        return true;
+
+      rw->u.local.can_fallocate = false;
     } else {
-      return file_punch_hole (fd, offset, count);
+      rw->u.local.can_punch_hole = false;
     }
   }
-  else if (S_ISBLK (rw->u.local.stat.st_mode) &&
+
+  /* Finally try BLKZEROOUT. This works only for block device if offset and
+   * count are aligned to device sector size.
+   */
+  else if (rw->u.local.can_zeroout &&
            IS_ALIGNED (offset | count, rw->u.local.sector_size)) {
-    /* Always allocate, discard and gurantee zeroing. */
-    return file_zeroout (fd, offset, count);
+    if (file_zeroout(fd, offset, count))
+      return true;
+
+    rw->u.local.can_zeroout = false;
   }
 
   return false;
diff --git a/copy/main.c b/copy/main.c
index 68a6030..78fdff8 100644
--- a/copy/main.c
+++ b/copy/main.c
@@ -523,6 +523,16 @@ open_local (const char *prog,
 #ifdef BLKSSZGET
     if (ioctl (fd, BLKSSZGET, &rw->u.local.sector_size))
       fprintf (stderr, "warning: cannot get sector size: %s: %m", rw->name);
+#endif
+    /* Possible efficient zero methods for block device. */
+#ifdef FALLOC_FL_PUNCH_HOLE
+    rw->u.local.can_punch_hole = true;
+#endif
+#ifdef FALLOC_FL_ZERO_RANGE
+    rw->u.local.can_zero_range = true;
+#endif
+#ifdef BLKZEROOUT
+    rw->u.local.can_zeroout = true;
 #endif
   }
   else if (S_ISREG (rw->u.local.stat.st_mode)) {
@@ -530,6 +540,15 @@ open_local (const char *prog,
     rw->ops = &file_ops;
     rw->size = rw->u.local.stat.st_size;
     rw->u.local.seek_hole_supported = seek_hole_supported (fd);
+
+    /* Possible efficient zero methods for regular file. */
+#ifdef FALLOC_FL_PUNCH_HOLE
+    rw->u.local.can_punch_hole = true;
+#endif
+#ifdef FALLOC_FL_ZERO_RANGE
+    rw->u.local.can_zero_range = true;
+#endif
+    rw->u.local.can_fallocate = true;
   }
   else {
     /* Probably stdin/stdout, a pipe or a socket.  Set size == -1
diff --git a/copy/nbdcopy.h b/copy/nbdcopy.h
index 21d09bf..e33ea8e 100644
--- a/copy/nbdcopy.h
+++ b/copy/nbdcopy.h
@@ -51,6 +51,12 @@ struct rw {
       struct stat stat;
       bool seek_hole_supported;
       int sector_size;
+
+      /* We try to use the most eficient zeroing first. If an efficent zero
+       * method is not available, we disable the flag so next time we use the
+       * working method.
+       */
+      bool can_punch_hole, can_zero_range, can_fallocate, can_zeroout;
     } local;
     struct {
       handles handles;           /* For NBD, one handle per connection. */
-- 
2.26.2
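[Editor's note] As a rough illustration of the block device path added here, the sketch below shows a BLKZEROOUT probe with the alignment check and the ENOTTY handling that disables the capability instead of exiting. It is not code from the patch; the static flag stands in for the can_zeroout field kept in struct rw.

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdint.h>
#include <errno.h>
#include <sys/ioctl.h>
#include <linux/fs.h>             /* BLKZEROOUT */

static bool can_zeroout = true;   /* cleared after the first failed probe */

static bool
blkdev_zeroout (int fd, uint64_t offset, uint64_t count, unsigned sector_size)
{
#ifdef BLKZEROOUT
  uint64_t range[2] = { offset, count };

  /* The ioctl only accepts sector-aligned offset and count. */
  if (!can_zeroout || (offset | count) % sector_size != 0)
    return false;

  if (ioctl (fd, BLKZEROOUT, &range) == -1) {
    if (errno == ENOTTY) {        /* not supported: disable and fall back */
      can_zeroout = false;
      return false;
    }
    perror ("ioctl: BLKZEROOUT");
    exit (EXIT_FAILURE);
  }
  return true;
#else
  return false;
#endif
}

The fallocate-based methods follow the same pattern, treating ENOTSUP/EOPNOTSUPP as "not supported", as the is_not_supported() helper in the patch does.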