Nir Soffer
2021-Feb-22 19:34 UTC
[Libguestfs] [PATCH libnbd v2 0/3] Improve zero correctness, reliability and compatibility
Don't use trim() for zeroing.  NBD_CMD_TRIM does not guarantee anything
about the content of the trimmed range.

Add an allocate flag to zero(), so it can be used both for sparse and
allocated copy.

The zero strategy was less advanced than what we have in the nbdkit
file plugin.  Port the zero strategy from the file plugin, improving
reliability and compatibility.

Tested with loop devices, local files, and qemu-nbd.

Changes since v1:
- Rebase after struct rw refactoring
- Fix block device checks
- Improve formatting (Rich)

Nir Soffer (3):
  copy/file-ops.c: Fix copy for block device
  copy: Do not use trim for zeroing
  copy/file-ops.c: Port zero strategy from nbdkit

 copy/copy-sparse.sh         |  66 +++++++-------
 copy/file-ops.c             | 168 +++++++++++++++++++++++++++++-------
 copy/multi-thread-copying.c |  20 ++---
 copy/nbd-ops.c              |  11 +--
 copy/nbdcopy.h              |   5 +-
 copy/null-ops.c             |   4 +-
 copy/synch-copying.c        |   8 +-
 7 files changed, 193 insertions(+), 89 deletions(-)

-- 
2.26.2
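For illustration, a minimal libnbd client sketch (not part of this series;
the URI and sizes are placeholders) of the guarantee that
NBD_CMD_WRITE_ZEROES gives and NBD_CMD_TRIM does not:

  /* zero-vs-trim.c: why nbdcopy must zero with write-zeroes, not trim. */
  #include <stdio.h>
  #include <stdlib.h>
  #include <libnbd.h>

  int
  main (void)
  {
    struct nbd_handle *nbd = nbd_create ();

    if (nbd == NULL || nbd_connect_uri (nbd, "nbd://localhost") == -1) {
      fprintf (stderr, "%s\n", nbd_get_error ());
      exit (EXIT_FAILURE);
    }

    /* After this call the first 1M is guaranteed to read back as zeroes.
     * Without LIBNBD_CMD_FLAG_NO_HOLE the server may punch a hole; with
     * the flag it must also keep the range allocated.
     */
    if (nbd_zero (nbd, 1024 * 1024, 0, 0) == -1) {
      fprintf (stderr, "%s\n", nbd_get_error ());
      exit (EXIT_FAILURE);
    }

    /* After this call the content of the second 1M is undefined; the
     * server may discard the range or do nothing at all, so trim cannot
     * be used for zeroing.
     */
    if (nbd_trim (nbd, 1024 * 1024, 1024 * 1024, 0) == -1) {
      fprintf (stderr, "%s\n", nbd_get_error ());
      exit (EXIT_FAILURE);
    }

    nbd_shutdown (nbd, 0);
    nbd_close (nbd);
    return 0;
  }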
Nir Soffer
2021-Feb-22 19:34 UTC
[Libguestfs] [PATCH libnbd v2 1/3] copy/file-ops.c: Fix copy for block device
Initialize is_block so operations using it work correctly with block
devices.

Fixes: commit 48d1e2066c1f9ea73cbad19e86565960ef7fbebf
Signed-off-by: Nir Soffer <nsoffer at redhat.com>
---
 copy/file-ops.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/copy/file-ops.c b/copy/file-ops.c
index 937620d..6d098c2 100644
--- a/copy/file-ops.c
+++ b/copy/file-ops.c
@@ -66,6 +66,7 @@ file_create (const char *name, int fd, off_t st_size, bool is_block)
   rwf->rw.ops = &file_ops;
   rwf->rw.name = name;
   rwf->fd = fd;
+  rwf->is_block = is_block;
 
   if (is_block) {
     /* Block device - ignore size passed in. */
-- 
2.26.2
Nir Soffer
2021-Feb-22 19:34 UTC
[Libguestfs] [PATCH libnbd v2 2/3] copy: Do not use trim for zeroing
Current code uses NBD_CMD_TRIM for writing zeroes without allocating
space (--allocate not specified).  This is incorrect and can lead to
corrupting the target image.  The NBD manual warns about NBD_CMD_TRIM:

  After issuing this command, a client MUST NOT make any assumptions
  about the contents of the export affected by this command, until
  overwriting it again with NBD_CMD_WRITE or NBD_CMD_WRITE_ZEROES

Add an allocate flag to the zero operation, to allow zeroing by
punching a hole (the default) or zeroing by allocating space.  nbd-ops
uses NBD_CMD_WRITE_ZEROES with or without the NBD_FLAG_NO_HOLE flag.
file-ops uses fallocate with the PUNCH_HOLE or ZERO_RANGE flag.

The copy-sparse test was modified to require a zero operation with the
may_trim flag.

Signed-off-by: Nir Soffer <nsoffer at redhat.com>
---
 copy/copy-sparse.sh         | 66 ++++++++++++++---------
 copy/file-ops.c             | 82 +++++++++++++++++++++++--------------
 copy/multi-thread-copying.c | 20 +++------
 copy/nbd-ops.c              | 11 ++---
 copy/nbdcopy.h              |  5 ++-
 copy/null-ops.c             |  4 +-
 copy/synch-copying.c        |  8 ++--
 7 files changed, 106 insertions(+), 90 deletions(-)

diff --git a/copy/copy-sparse.sh b/copy/copy-sparse.sh
index 846767b..c43b41a 100755
--- a/copy/copy-sparse.sh
+++ b/copy/copy-sparse.sh
@@ -33,7 +33,7 @@ cleanup_fn rm -f $out
 # Copy from a sparse data disk to an nbdkit-eval-plugin instance which
 # is logging everything.  This allows us to see exactly what nbdcopy
-# is writing, to ensure it is writing and trimming the target as
+# is writing, to ensure it is writing and zeroing the target as
 # expected.
 
 $VG nbdcopy -S 0 -- \
     [ nbdkit --exit-with-parent data data='
@@ -60,38 +60,38 @@ if [ "$(cat $out)" != "pwrite 1 4294967296
 pwrite 32768 0
 pwrite 32768 1073709056
 pwrite 32768 4294934528
-trim 134184960 32768
-trim 134184960 4160749568
-trim 134184960 939524096
-trim 134217728 1073741824
-trim 134217728 1207959552
-trim 134217728 134217728
-trim 134217728 1342177280
-trim 134217728 1476395008
-trim 134217728 1610612736
-trim 134217728 1744830464
-trim 134217728 1879048192
-trim 134217728 2013265920
-trim 134217728 2147483648
-trim 134217728 2281701376
-trim 134217728 2415919104
-trim 134217728 2550136832
-trim 134217728 268435456
-trim 134217728 2684354560
-trim 134217728 2818572288
-trim 134217728 2952790016
-trim 134217728 3087007744
-trim 134217728 3221225472
-trim 134217728 3355443200
-trim 134217728 3489660928
-trim 134217728 3623878656
-trim 134217728 3758096384
-trim 134217728 3892314112
-trim 134217728 402653184
-trim 134217728 4026531840
-trim 134217728 536870912
-trim 134217728 671088640
-trim 134217728 805306368" ]; then
+zero 134184960 32768 may_trim
+zero 134184960 4160749568 may_trim
+zero 134184960 939524096 may_trim
+zero 134217728 1073741824 may_trim
+zero 134217728 1207959552 may_trim
+zero 134217728 1342177280 may_trim
+zero 134217728 134217728 may_trim
+zero 134217728 1476395008 may_trim
+zero 134217728 1610612736 may_trim
+zero 134217728 1744830464 may_trim
+zero 134217728 1879048192 may_trim
+zero 134217728 2013265920 may_trim
+zero 134217728 2147483648 may_trim
+zero 134217728 2281701376 may_trim
+zero 134217728 2415919104 may_trim
+zero 134217728 2550136832 may_trim
+zero 134217728 2684354560 may_trim
+zero 134217728 268435456 may_trim
+zero 134217728 2818572288 may_trim
+zero 134217728 2952790016 may_trim
+zero 134217728 3087007744 may_trim
+zero 134217728 3221225472 may_trim
+zero 134217728 3355443200 may_trim
+zero 134217728 3489660928 may_trim
+zero 134217728 3623878656 may_trim
+zero 134217728 3758096384 may_trim
+zero 134217728 3892314112 may_trim
+zero 134217728 4026531840 may_trim
+zero 134217728 402653184 may_trim
+zero 134217728 536870912 may_trim
+zero 134217728 671088640 may_trim
+zero 134217728 805306368 may_trim" ]; then
     echo "$0: output does not match expected"
     exit 1
 fi
diff --git a/copy/file-ops.c b/copy/file-ops.c
index 6d098c2..d22273a 100644
--- a/copy/file-ops.c
+++ b/copy/file-ops.c
@@ -221,11 +221,9 @@ file_synch_write (struct rw *rw,
 }
 
 static bool
-file_synch_trim (struct rw *rw, uint64_t offset, uint64_t count)
+file_punch_hole(int fd, uint64_t offset, uint64_t count)
 {
 #ifdef FALLOC_FL_PUNCH_HOLE
-  struct rw_file *rwf = (struct rw_file *)rw;
-  int fd = rwf->fd;
   int r;
 
   r = fallocate (fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
@@ -235,42 +233,66 @@ file_synch_trim (struct rw *rw, uint64_t offset, uint64_t count)
     exit (EXIT_FAILURE);
   }
   return true;
-#else /* !FALLOC_FL_PUNCH_HOLE */
-  return false;
 #endif
+  return false;
 }
 
 static bool
-file_synch_zero (struct rw *rw, uint64_t offset, uint64_t count)
+file_zero_range(int fd, uint64_t offset, uint64_t count)
 {
-  struct rw_file *rwf = (struct rw_file *)rw;
-
-  if (! rwf->is_block) {
 #ifdef FALLOC_FL_ZERO_RANGE
-    int fd = rwf->fd;
-    int r;
+  int r;
 
-    r = fallocate (fd, FALLOC_FL_ZERO_RANGE, offset, count);
-    if (r == -1) {
-      perror ("fallocate: FALLOC_FL_ZERO_RANGE");
-      exit (EXIT_FAILURE);
-    }
-    return true;
-#endif
+  r = fallocate (fd, FALLOC_FL_ZERO_RANGE, offset, count);
+  if (r == -1) {
+    perror ("fallocate: FALLOC_FL_ZERO_RANGE");
+    exit (EXIT_FAILURE);
   }
-  else if (rwf->is_block && IS_ALIGNED (offset | count, rwf->sector_size)) {
+  return true;
+#endif
+  return false;
+}
+
+static bool
+file_zeroout(int fd, uint64_t offset, uint64_t count)
+{
 #ifdef BLKZEROOUT
-    int fd = rwf->fd;
-    int r;
-    uint64_t range[2] = {offset, count};
+  int r;
+  uint64_t range[2] = {offset, count};
 
-    r = ioctl (fd, BLKZEROOUT, &range);
-    if (r == -1) {
-      perror ("ioctl: BLKZEROOUT");
-      exit (EXIT_FAILURE);
-    }
-    return true;
+  r = ioctl (fd, BLKZEROOUT, &range);
+  if (r == -1) {
+    perror ("ioctl: BLKZEROOUT");
+    exit (EXIT_FAILURE);
+  }
+  return true;
 #endif
+  return false;
+}
+
+static bool
+file_synch_trim (struct rw *rw, uint64_t offset, uint64_t count)
+{
+  struct rw_file *rwf = (struct rw_file *)rw;
+
+  return file_punch_hole (rwf->fd, offset, count);
+}
+
+static bool
+file_synch_zero (struct rw *rw, uint64_t offset, uint64_t count, bool allocate)
+{
+  struct rw_file *rwf = (struct rw_file *)rw;
+
+  if (!rwf->is_block) {
+    if (allocate) {
+      return file_zero_range (rwf->fd, offset, count);
+    } else {
+      return file_punch_hole (rwf->fd, offset, count);
+    }
+  }
+  else if (IS_ALIGNED (offset | count, rwf->sector_size)) {
+    /* Always allocate, discard and gurantee zeroing. */
+    return file_zeroout (rwf->fd, offset, count);
   }
 
   return false;
@@ -320,9 +342,9 @@ file_asynch_trim (struct rw *rw, struct command *command,
 
 static bool
 file_asynch_zero (struct rw *rw, struct command *command,
-                  nbd_completion_callback cb)
+                  nbd_completion_callback cb, bool allocate)
 {
-  if (!file_synch_zero (rw, command->offset, command->slice.len))
+  if (!file_synch_zero (rw, command->offset, command->slice.len, allocate))
     return false;
   errno = 0;
   if (cb.callback (cb.user_data, &errno) == -1) {
diff --git a/copy/multi-thread-copying.c b/copy/multi-thread-copying.c
index 4f57054..768bca1 100644
--- a/copy/multi-thread-copying.c
+++ b/copy/multi-thread-copying.c
@@ -487,11 +487,12 @@ finished_read (void *vp, int *error)
  * --destination-is-zero:
  *                 do nothing
  *
- * --allocated:    we must write zeroes either using an efficient
+ * --allocated:    write zeroes allocating space using an efficient
  *                 zeroing command or writing a command of zeroes
  *
- * (neither flag)  try trimming if supported, else write zeroes
- *                 as above
+ * (neither flag)  write zeroes punching a hole using an efficient
+ *                 zeroing command or fallback to writing a command
+ *                 of zeroes.
  *
  * This takes over ownership of the command and frees it eventually.
  */
@@ -503,22 +504,13 @@ fill_dst_range_with_zeroes (struct command *command)
   if (destination_is_zero)
     goto free_and_return;
 
-  if (!allocated) {
-    /* Try trimming. */
-    if (dst->ops->asynch_trim (dst, command,
-                               (nbd_completion_callback) {
-                                 .callback = free_command,
-                                 .user_data = command,
-                               }))
-      return;
-  }
-
   /* Try efficient zeroing. */
   if (dst->ops->asynch_zero (dst, command,
                              (nbd_completion_callback) {
                                .callback = free_command,
                                .user_data = command,
-                             }))
+                             },
+                             allocated))
     return;
 
   /* Fall back to loop writing zeroes.  This is going to be slow
diff --git a/copy/nbd-ops.c b/copy/nbd-ops.c
index f001d02..388bd53 100644
--- a/copy/nbd-ops.c
+++ b/copy/nbd-ops.c
@@ -277,15 +277,16 @@ nbd_ops_synch_trim (struct rw *rw, uint64_t offset, uint64_t count)
 }
 
 static bool
-nbd_ops_synch_zero (struct rw *rw, uint64_t offset, uint64_t count)
+nbd_ops_synch_zero (struct rw *rw, uint64_t offset, uint64_t count,
+                    bool allocate)
 {
   struct rw_nbd *rwn = (struct rw_nbd *) rw;
 
   if (!rwn->can_zero)
     return false;
 
-  if (nbd_zero (rwn->handles.ptr[0],
-                count, offset, LIBNBD_CMD_FLAG_NO_HOLE) == -1) {
+  if (nbd_zero (rwn->handles.ptr[0], count, offset,
+                allocate ? LIBNBD_CMD_FLAG_NO_HOLE : 0) == -1) {
     fprintf (stderr, "%s: %s\n", rw->name, nbd_get_error ());
     exit (EXIT_FAILURE);
   }
@@ -346,7 +347,7 @@ nbd_ops_asynch_trim (struct rw *rw, struct command *command,
 
 static bool
 nbd_ops_asynch_zero (struct rw *rw, struct command *command,
-                     nbd_completion_callback cb)
+                     nbd_completion_callback cb, bool allocate)
 {
   struct rw_nbd *rwn = (struct rw_nbd *) rw;
 
@@ -357,7 +358,7 @@ nbd_ops_asynch_zero (struct rw *rw, struct command *command,
 
   if (nbd_aio_zero (rwn->handles.ptr[command->index],
                     command->slice.len, command->offset,
-                    cb, LIBNBD_CMD_FLAG_NO_HOLE) == -1) {
+                    cb, allocate ? LIBNBD_CMD_FLAG_NO_HOLE : 0) == -1) {
     fprintf (stderr, "%s: %s\n", rw->name, nbd_get_error ());
     exit (EXIT_FAILURE);
   }
diff --git a/copy/nbdcopy.h b/copy/nbdcopy.h
index 97e3b49..d553399 100644
--- a/copy/nbdcopy.h
+++ b/copy/nbdcopy.h
@@ -152,7 +152,8 @@ struct rw_ops {
   bool (*synch_trim) (struct rw *rw, uint64_t offset, uint64_t count);
 
   /* Synchronously zero.  If not possible, returns false. */
-  bool (*synch_zero) (struct rw *rw, uint64_t offset, uint64_t count);
+  bool (*synch_zero) (struct rw *rw, uint64_t offset, uint64_t count,
+                      bool allocate);
 
   /* Asynchronous I/O operations.  These start the operation and call
    * 'cb' on completion.
@@ -180,7 +181,7 @@ struct rw_ops {
    * returns false.
    */
   bool (*asynch_zero) (struct rw *rw, struct command *command,
-                       nbd_completion_callback cb);
+                       nbd_completion_callback cb, bool allocate);
 
   /* Number of asynchronous commands in flight for a particular thread. */
   unsigned (*in_flight) (struct rw *rw, uintptr_t index);
diff --git a/copy/null-ops.c b/copy/null-ops.c
index b75ca1f..aed2a03 100644
--- a/copy/null-ops.c
+++ b/copy/null-ops.c
@@ -105,7 +105,7 @@ null_synch_trim (struct rw *rw, uint64_t offset, uint64_t count)
 }
 
 static bool
-null_synch_zero (struct rw *rw, uint64_t offset, uint64_t count)
+null_synch_zero (struct rw *rw, uint64_t offset, uint64_t count, bool allocate)
 {
   return true;
 }
@@ -144,7 +144,7 @@ null_asynch_trim (struct rw *rw, struct command *command,
 
 static bool
 null_asynch_zero (struct rw *rw, struct command *command,
-                  nbd_completion_callback cb)
+                  nbd_completion_callback cb, bool allocate)
 {
   errno = 0;
   if (cb.callback (cb.user_data, &errno) == -1) {
diff --git a/copy/synch-copying.c b/copy/synch-copying.c
index 985f005..17bda16 100644
--- a/copy/synch-copying.c
+++ b/copy/synch-copying.c
@@ -69,10 +69,10 @@ synch_copying (void)
         assert (exts.ptr[i].length <= count);
 
         if (exts.ptr[i].zero) {
-          if (!dst->ops->synch_trim (dst, offset, exts.ptr[i].length) &&
-              !dst->ops->synch_zero (dst, offset, exts.ptr[i].length)) {
-            /* If neither trimming nor efficient zeroing are possible,
-             * write zeroes the hard way.
+          if (!dst->ops->synch_zero (dst, offset, exts.ptr[i].length, false) &&
+              !dst->ops->synch_zero (dst, offset, exts.ptr[i].length, true)) {
+            /* If efficient zeroing (punching a hole or allocating
+             * space) are possible, write zeroes the hard way.
              */
             memset (buf, 0, exts.ptr[i].length);
             dst->ops->synch_write (dst, buf, exts.ptr[i].length, offset);
-- 
2.26.2
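As a standalone illustration of the two file-side zeroing modes the new
file_synch_zero() chooses between (this sketch is not part of the patch;
the file name and sizes are placeholders, and error handling is
simplified):

  /* fallocate-zero.c: sparse vs. allocated zeroing with fallocate. */
  #define _GNU_SOURCE
  #include <stdio.h>
  #include <stdlib.h>
  #include <fcntl.h>
  #include <unistd.h>

  /* Zero [offset, offset+count) of fd.  If allocate is false, punch a
   * hole so the range becomes sparse; if true, zero the range while
   * keeping it allocated.
   */
  static int
  zero_range (int fd, off_t offset, off_t count, int allocate)
  {
    int mode = allocate ? FALLOC_FL_ZERO_RANGE
                        : FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;

    return fallocate (fd, mode, offset, count);
  }

  int
  main (void)
  {
    int fd = open ("test.img", O_RDWR);
    if (fd == -1) { perror ("open"); exit (EXIT_FAILURE); }

    if (zero_range (fd, 0, 1024 * 1024, 0) == -1)            /* sparse */
      perror ("punch hole");
    if (zero_range (fd, 1024 * 1024, 1024 * 1024, 1) == -1)  /* allocated */
      perror ("zero range");

    close (fd);
    return 0;
  }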
Nir Soffer
2021-Feb-22 19:34 UTC
[Libguestfs] [PATCH libnbd v2 3/3] copy/file-ops.c: Port zero strategy from nbdkit
Port the zero strategy from the nbdkit file plugin, improving
reliability and compatibility with block devices on modern kernels.

The local rw struct now has capability flags: can_punch_hole,
can_zero_range, can_fallocate, and can_zeroout.  The flags are
initialized based on the type of the file descriptor and compile-time
checks:

- For regular files, we enable can_punch_hole, can_zero_range, and
  can_fallocate.
- For block devices, we enable can_punch_hole, can_zero_range, and
  can_zeroout.
- For pipes and sockets we don't enable anything.

The first time zero() is called, we try the following methods,
returning on the first success:

- If we don't need to allocate, try to punch a hole.
- Try to zero the range.
- Try to combine punching a hole and fallocate.
- Try the BLKZEROOUT ioctl.

If a method is not supported, we disable its capability flag, so the
next call tries only what works.

The fallocate and ioctl wrappers return false when the call is not
supported by the underlying storage, so we can disable the capability.
Previously the process would exit with an error.

Signed-off-by: Nir Soffer <nsoffer at redhat.com>
---
 copy/file-ops.c | 103 ++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 95 insertions(+), 8 deletions(-)

diff --git a/copy/file-ops.c b/copy/file-ops.c
index d22273a..970e81e 100644
--- a/copy/file-ops.c
+++ b/copy/file-ops.c
@@ -44,6 +44,12 @@ struct rw_file {
   bool is_block;
   bool seek_hole_supported;
   int sector_size;
+
+  /* We try to use the most eficient zeroing first. If an efficent zero
+   * method is not available, we disable the flag so next time we use
+   * the working method.
+   */
+  bool can_punch_hole, can_zero_range, can_fallocate, can_zeroout;
 };
 
 static bool
@@ -84,12 +90,30 @@ file_create (const char *name, int fd, off_t st_size, bool is_block)
 #ifdef BLKSSZGET
     if (ioctl (fd, BLKSSZGET, &rwf->sector_size))
       fprintf (stderr, "warning: cannot get sector size: %s: %m", name);
+#endif
+    /* Possible efficient zero methods for block device. */
+#ifdef FALLOC_FL_PUNCH_HOLE
+    rwf->can_punch_hole = true;
+#endif
+#ifdef FALLOC_FL_ZERO_RANGE
+    rwf->can_zero_range = true;
+#endif
+#ifdef BLKZEROOUT
+    rwf->can_zeroout = true;
 #endif
   }
   else {
     /* Regular file. */
     rwf->rw.size = st_size;
     rwf->seek_hole_supported = seek_hole_supported (fd);
+    /* Possible efficient zero methods for regular file. */
+#ifdef FALLOC_FL_PUNCH_HOLE
+    rwf->can_punch_hole = true;
+#endif
+#ifdef FALLOC_FL_ZERO_RANGE
+    rwf->can_zero_range = true;
+#endif
+    rwf->can_fallocate = true;
   }
 
   return &rwf->rw;
@@ -220,6 +244,12 @@ file_synch_write (struct rw *rw,
   }
 }
 
+static inline bool
+is_not_supported (int err)
+{
+  return err == ENOTSUP || err == EOPNOTSUPP;
+}
+
 static bool
 file_punch_hole(int fd, uint64_t offset, uint64_t count)
 {
@@ -229,6 +259,9 @@ file_punch_hole(int fd, uint64_t offset, uint64_t count)
   r = fallocate (fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                  offset, count);
   if (r == -1) {
+    if (is_not_supported (errno))
+      return false;
+
     perror ("fallocate: FALLOC_FL_PUNCH_HOLE");
     exit (EXIT_FAILURE);
   }
@@ -245,6 +278,9 @@ file_zero_range(int fd, uint64_t offset, uint64_t count)
 
   r = fallocate (fd, FALLOC_FL_ZERO_RANGE, offset, count);
   if (r == -1) {
+    if (is_not_supported (errno))
+      return false;
+
     perror ("fallocate: FALLOC_FL_ZERO_RANGE");
     exit (EXIT_FAILURE);
   }
@@ -262,6 +298,9 @@ file_zeroout(int fd, uint64_t offset, uint64_t count)
 
   r = ioctl (fd, BLKZEROOUT, &range);
   if (r == -1) {
+    if (errno == ENOTTY)
+      return false;
+
     perror ("ioctl: BLKZEROOUT");
     exit (EXIT_FAILURE);
   }
@@ -275,7 +314,14 @@ file_synch_trim (struct rw *rw, uint64_t offset, uint64_t count)
 {
   struct rw_file *rwf = (struct rw_file *)rw;
 
-  return file_punch_hole (rwf->fd, offset, count);
+  if (rwf->can_punch_hole) {
+    if (file_punch_hole (rwf->fd, offset, count))
+      return true;
+
+    rwf->can_punch_hole = false;
+  }
+
+  return false;
 }
 
 static bool
@@ -283,16 +329,57 @@ file_synch_zero (struct rw *rw, uint64_t offset, uint64_t count, bool allocate)
 {
   struct rw_file *rwf = (struct rw_file *)rw;
 
-  if (!rwf->is_block) {
-    if (allocate) {
-      return file_zero_range (rwf->fd, offset, count);
+  /* The first call will try several options, discovering the
+   * capabilities of the underlying storage, and disabling non working
+   * options. The next calls will try only what works.
+   *
+   * If we don't need to allocate try to punch a hole. This works for
+   * both files and block devices with modern kernels.
+   */
+
+  if (!allocate && rwf->can_punch_hole) {
+    if (file_punch_hole (rwf->fd, offset, count))
+      return true;
+
+    rwf->can_punch_hole = false;
+  }
+
+  /* Try to zero the range. This works for both files and block devices
+   * with modern kernels.
+   */
+
+  if (rwf->can_zero_range) {
+    if (file_zero_range (rwf->fd, offset, count))
+      return true;
+
+    rwf->can_zero_range = false;
+  }
+
+  /* If we can punch a hole and fallocate, we can combine both
+   * operations. This is expected to be more efficient than actually
+   * writing zeroes. This works only for files.
+   */
+
+  if (rwf->can_punch_hole && rwf->can_fallocate) {
+    if (file_punch_hole (rwf->fd, offset, count)) {
+      if (fallocate (rwf->fd, 0, offset, count))
+        return true;
+
+      rwf->can_fallocate = false;
     } else {
-      return file_punch_hole (rwf->fd, offset, count);
+      rwf->can_punch_hole = false;
     }
   }
-  else if (IS_ALIGNED (offset | count, rwf->sector_size)) {
-    /* Always allocate, discard and gurantee zeroing. */
-    return file_zeroout (rwf->fd, offset, count);
+
+  /* Finally try BLKZEROOUT. This works only for block device if offset
+   * and count are aligned to device sector size.
+   */
+  else if (rwf->can_zeroout &&
+           IS_ALIGNED (offset | count, rwf->sector_size)) {
+    if (file_zeroout (rwf->fd, offset, count))
+      return true;
+
+    rwf->can_zeroout = false;
   }
 
   return false;
-- 
2.26.2
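A simplified, self-contained sketch of the capability-flag fallback
described above (this is not the nbdcopy code: names are made up, the
block-device/BLKZEROOUT and fallocate-combining cases are omitted, and
hard failures are not turned into exit calls):

  /* zero-fallback.c: try efficient zeroing, disable methods that the
   * underlying storage reports as unsupported, and let the caller fall
   * back to writing a buffer of zeroes.
   */
  #define _GNU_SOURCE
  #include <errno.h>
  #include <stdbool.h>
  #include <sys/types.h>
  #include <fcntl.h>

  /* One flag per method; all start true and are cleared the first time
   * the kernel reports the method is unsupported.
   */
  static bool can_punch_hole = true;
  static bool can_zero_range = true;

  static bool
  try_punch_hole (int fd, off_t offset, off_t count)
  {
    if (fallocate (fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                   offset, count) == 0)
      return true;
    if (errno == ENOTSUP || errno == EOPNOTSUPP)
      can_punch_hole = false;     /* don't try this method again */
    return false;
  }

  static bool
  try_zero_range (int fd, off_t offset, off_t count)
  {
    if (fallocate (fd, FALLOC_FL_ZERO_RANGE, offset, count) == 0)
      return true;
    if (errno == ENOTSUP || errno == EOPNOTSUPP)
      can_zero_range = false;
    return false;
  }

  /* Returns true if the range was zeroed efficiently. */
  bool
  zero_efficiently (int fd, off_t offset, off_t count, bool allocate)
  {
    if (!allocate && can_punch_hole && try_punch_hole (fd, offset, count))
      return true;
    if (can_zero_range && try_zero_range (fd, offset, count))
      return true;
    return false;
  }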