Richard W.M. Jones
2022-Jun-29 11:03 UTC
[Libguestfs] [PATCH libnbd v2 0/3] copy: Use preferred block size for copying
This is not really an evolution of the previous patch set, I started almost from scratch, but for reference version 1 was here: https://listman.redhat.com/archives/libguestfs/2022-June/029301.html This version does appear to work, all tests pass, and I ran the nbdkit test suite as well (which uses nbdcopy in a bunch of places) and that passes also. Note that patch 3 is a generic fix of a test which was previously broken, but now that larger requests are being made by default, it breaks much more often. Rich.
Richard W.M. Jones
2022-Jun-29 11:03 UTC
[Libguestfs] [PATCH libnbd v2 1/3] copy: Store the preferred block size in the operations struct
This will be used in a subsequent commit. At the moment the preferred block size for all sources / destinations is simply calculated and stored. --- copy/file-ops.c | 9 ++++++++- copy/main.c | 9 ++++++--- copy/nbd-ops.c | 9 +++++++++ copy/nbdcopy.h | 4 +++- copy/null-ops.c | 1 + copy/pipe-ops.c | 1 + 6 files changed, 28 insertions(+), 5 deletions(-) diff --git a/copy/file-ops.c b/copy/file-ops.c index ab3787545c..3b901bcdb8 100644 --- a/copy/file-ops.c +++ b/copy/file-ops.c @@ -241,7 +241,8 @@ seek_hole_supported (int fd) struct rw * file_create (const char *name, int fd, - off_t st_size, bool is_block, direction d) + off_t st_size, blksize_t st_blksize, + bool is_block, direction d) { struct rw_file *rwf = calloc (1, sizeof *rwf); if (rwf == NULL) { perror ("calloc"); exit (EXIT_FAILURE); } @@ -262,6 +263,11 @@ file_create (const char *name, int fd, perror ("lseek"); exit (EXIT_FAILURE); } + rwf->rw.preferred = 4096; +#ifdef BLKIOOPT + if (ioctl (fd, BLKIOOPT, &rwf->rw.preferred)) + fprintf (stderr, "warning: cannot get optimal I/O size: %s: %m", name); +#endif rwf->seek_hole_supported = seek_hole_supported (fd); rwf->sector_size = 4096; #ifdef BLKSSZGET @@ -282,6 +288,7 @@ file_create (const char *name, int fd, else { /* Regular file. */ rwf->rw.size = st_size; + rwf->rw.preferred = st_blksize; rwf->seek_hole_supported = seek_hole_supported (fd); /* Possible efficient zero methods for regular file. */ #ifdef FALLOC_FL_PUNCH_HOLE diff --git a/copy/main.c b/copy/main.c index cc379e98bc..9b551de664 100644 --- a/copy/main.c +++ b/copy/main.c @@ -513,7 +513,9 @@ open_local (const char *filename, direction d) exit (EXIT_FAILURE); } if (S_ISBLK (stat.st_mode) || S_ISREG (stat.st_mode)) - return file_create (filename, fd, stat.st_size, S_ISBLK (stat.st_mode), d); + return file_create (filename, fd, + stat.st_size, stat.st_blksize, S_ISBLK (stat.st_mode), + d); else { /* Probably stdin/stdout, a pipe or a socket. */ synchronous = true; /* Force synchronous mode for pipes. 
*/ @@ -528,8 +530,9 @@ print_rw (struct rw *rw, const char *prefix, FILE *fp) char buf[HUMAN_SIZE_LONGEST]; fprintf (fp, "%s: %s \"%s\"\n", prefix, rw->ops->ops_name, rw->name); - fprintf (fp, "%s: size=%" PRIi64 " (%s)\n", - prefix, rw->size, human_size (buf, rw->size, NULL)); + fprintf (fp, "%s: size=%" PRIi64 " (%s), preferred block size=%" PRIu64 "\n", + prefix, rw->size, human_size (buf, rw->size, NULL), + rw->preferred); } /* Default implementation of rw->ops->get_extents for backends which diff --git a/copy/nbd-ops.c b/copy/nbd-ops.c index 3bc26ba613..6c5d59c578 100644 --- a/copy/nbd-ops.c +++ b/copy/nbd-ops.c @@ -113,11 +113,20 @@ open_one_nbd_handle (struct rw_nbd *rwn) */ if (rwn->handles.len == 0) { rwn->can_zero = nbd_can_zero (nbd) > 0; + rwn->rw.size = nbd_get_size (nbd); if (rwn->rw.size == -1) { fprintf (stderr, "%s: %s: %s\n", prog, rwn->rw.name, nbd_get_error ()); exit (EXIT_FAILURE); } + + rwn->rw.preferred = nbd_get_block_size (nbd, LIBNBD_SIZE_PREFERRED); + if (rwn->rw.preferred == -1) { + fprintf (stderr, "%s: %s: %s\n", prog, rwn->rw.name, nbd_get_error ()); + exit (EXIT_FAILURE); + } + if (rwn->rw.preferred == 0) + rwn->rw.preferred = 4096; } if (handles_append (&rwn->handles, nbd) == -1) { diff --git a/copy/nbdcopy.h b/copy/nbdcopy.h index 19797dfd66..9b02ddf270 100644 --- a/copy/nbdcopy.h +++ b/copy/nbdcopy.h @@ -43,6 +43,7 @@ struct rw { struct rw_ops *ops; /* Operations. */ const char *name; /* Printable name, for error messages etc. */ int64_t size; /* May be -1 for streams. */ + uint64_t preferred; /* Preferred block size. */ /* Followed by private data for the particular subtype. */ }; @@ -53,7 +54,8 @@ typedef enum { READING, WRITING } direction; /* Create subtypes. 
*/ extern struct rw *file_create (const char *name, int fd, - off_t st_size, bool is_block, direction d); + off_t st_size, blksize_t st_blksize, + bool is_block, direction d); extern struct rw *nbd_rw_create_uri (const char *name, const char *uri, direction d); extern struct rw *nbd_rw_create_subprocess (const char **argv, size_t argc, diff --git a/copy/null-ops.c b/copy/null-ops.c index 1218a623e2..854e291613 100644 --- a/copy/null-ops.c +++ b/copy/null-ops.c @@ -45,6 +45,7 @@ null_create (const char *name) rw->rw.ops = &null_ops; rw->rw.name = name; rw->rw.size = INT64_MAX; + rw->rw.preferred = 4 * 1024 * 1024; return &rw->rw; } diff --git a/copy/pipe-ops.c b/copy/pipe-ops.c index 3c8b6c2b39..3815f824fc 100644 --- a/copy/pipe-ops.c +++ b/copy/pipe-ops.c @@ -43,6 +43,7 @@ pipe_create (const char *name, int fd) rwp->rw.ops = &pipe_ops; rwp->rw.name = name; rwp->rw.size = -1; + rwp->rw.preferred = 4096; rwp->fd = fd; return &rwp->rw; } -- 2.37.0.rc2
Richard W.M. Jones
2022-Jun-29 11:03 UTC
[Libguestfs] [PATCH libnbd v2 2/3] copy: Use preferred block size for copying
You're not supposed to read or write NBD servers at a granularity less than the advertised minimum block size. nbdcopy has ignored this requirement, and this is usually fine because the NBD servers we care about support 512-byte sector granularity, and never advertise sizes / extents less granular than sectors (even if it's a bit suboptimal in a few cases). However there is one new case where we do care: When writing to a compressed qcow2 file, qemu advertises a minimum and preferred block size of 64K, and it really means it. You cannot write blocks smaller than this because of the way qcow2 compression is implemented. This commit attempts to do the least work possible to fix this. The previous multi-thread-copying loop was driven by the extent map received from the source. I have modified the loop so that it iterates over request_size blocks. request_size is set from the command line (--request-size) but will be adjusted upwards if either the source or destination preferred block size is larger. So this will always copy blocks which are at least the preferred block size (except for the very last block of the disk). While copying these blocks we consult the source extent map. If it contains only zero regions covering the whole block (only_zeroes function) then we can skip straight to zeroing the target (fill_dst_range_with_zeroes), else we do read + write as before. I only modified the multi-thread-copying loop, not the synchronous loop. That should be updated in the same way later. One side effect of this change is it always makes larger requests, even for regions we know are sparse. This is clear in the copy-sparse.sh and copy-sparse-allocated.sh tests which were previously driven by the 32K sparse map granularity of the source. Without changing these tests, they would make 256K reads & writes (and also read from areas of the disk even though we know they are sparse). I adjusted these tests to use --request-size=32768 to force the existing behaviour. 
Note this doesn't attempt to limit the maximum block size when reading or writing. That is for future work. This is a partial fix for https://bugzilla.redhat.com/2047660. Further changes will be required in virt-v2v. Link: https://lists.gnu.org/archive/html/qemu-block/2022-01/threads.html#00729 Link: https://bugzilla.redhat.com/show_bug.cgi?id=2047660 --- TODO | 4 +- copy/copy-sparse-allocated.sh | 4 +- copy/copy-sparse.sh | 7 +- copy/main.c | 7 ++ copy/multi-thread-copying.c | 143 +++++++++++++++++++++++++--------- 5 files changed, 119 insertions(+), 46 deletions(-) diff --git a/TODO b/TODO index 7c9c15e245..bc38d7045e 100644 --- a/TODO +++ b/TODO @@ -28,7 +28,9 @@ Performance: Chart it over various buffer sizes and threads, as that Examine other fuzzers: https://gitlab.com/akihe/radamsa nbdcopy: - - Minimum/preferred/maximum block size. + - Enforce maximum block size. + - Synchronous loop should be adjusted to take into account + the NBD preferred block size, as was done for multi-thread loop. - Benchmark. - Better page cache usage, see nbdkit-file-plugin options fadvise=sequential cache=none. diff --git a/copy/copy-sparse-allocated.sh b/copy/copy-sparse-allocated.sh index 203c3b9850..465e347067 100755 --- a/copy/copy-sparse-allocated.sh +++ b/copy/copy-sparse-allocated.sh @@ -17,8 +17,6 @@ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # Adapted from copy-sparse.sh. -# -# This test depends on the nbdkit default sparse block size (32K). . 
../tests/functions.sh @@ -33,7 +31,7 @@ requires nbdkit eval --version out=copy-sparse-allocated.out cleanup_fn rm -f $out -$VG nbdcopy --allocated -- \ +$VG nbdcopy --allocated --request-size=32768 -- \ [ nbdkit --exit-with-parent data data=' 1 @1073741823 1 diff --git a/copy/copy-sparse.sh b/copy/copy-sparse.sh index 1a6da868e9..7912a2120a 100755 --- a/copy/copy-sparse.sh +++ b/copy/copy-sparse.sh @@ -16,8 +16,6 @@ # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -# This test depends on the nbdkit default sparse block size (32K). - . ../tests/functions.sh set -e @@ -34,8 +32,9 @@ cleanup_fn rm -f $out # Copy from a sparse data disk to an nbdkit-eval-plugin instance which # is logging everything. This allows us to see exactly what nbdcopy # is writing, to ensure it is writing and zeroing the target as -# expected. -$VG nbdcopy -S 0 -- \ +# expected. Force request size to match nbdkit default sparse +# allocator block size (32K). +$VG nbdcopy -S 0 --request-size=32768 -- \ [ nbdkit --exit-with-parent data data=' 1 @1073741823 1 diff --git a/copy/main.c b/copy/main.c index 9b551de664..cbfe5b5436 100644 --- a/copy/main.c +++ b/copy/main.c @@ -40,6 +40,7 @@ #include "ispowerof2.h" #include "human-size.h" +#include "minmax.h" #include "version.h" #include "nbdcopy.h" @@ -379,6 +380,12 @@ main (int argc, char *argv[]) if (threads < connections) connections = threads; + /* request_size must always be larger than preferred size of + * source or destination. + */ + request_size = MAX (request_size, src->preferred); + request_size = MAX (request_size, dst->preferred); + /* Adapt queue to size to request size if needed. 
*/ if (request_size > queue_size) queue_size = request_size; diff --git a/copy/multi-thread-copying.c b/copy/multi-thread-copying.c index 06cdb8eef6..7194f4fafd 100644 --- a/copy/multi-thread-copying.c +++ b/copy/multi-thread-copying.c @@ -166,6 +166,58 @@ decrease_queue_size (struct worker *worker, size_t len) worker->queue_size -= len; } +/* Using the extents map 'exts', check if the region + * [offset..offset+len-1] contains only zero extents. + * + * The invariant for '*i' is always an extent which starts before or + * equal to the current offset. + */ +static bool +only_zeroes (const extent_list exts, size_t *i, + uint64_t offset, unsigned len) +{ + size_t j; + + /* Invariant. */ + assert (0 <= *i && *i < exts.len); + assert (exts.ptr[*i].offset <= offset); + + /* Update the variant. Search for the last possible extent in the + * list which is <= offset. + */ + for (j = *i + 1; j < exts.len; ++j) { + if (exts.ptr[j].offset <= offset) + *i = j; + else + break; + } + + /* Check invariant again. */ + assert (0 <= *i && *i < exts.len); + assert (exts.ptr[*i].offset <= offset); + + /* Search forward, look for any non-zero extents covering the region. */ + for (j = *i; j < exts.len; ++j) { + uint64_t start, end; + + /* [start..end] of the current extent. */ + start = exts.ptr[j].offset; + end = exts.ptr[j].offset + exts.ptr[j].length; + + if (end <= offset) + continue; + + if (start >= offset + len) + break; + + /* Non-zero extent covering this region => test failed. */ + if (!exts.ptr[j].zero) + return false; + } + + return true; +} + /* There are 'threads' worker threads, each copying work ranges from * src to dst until there are no more work ranges. 
*/ @@ -177,7 +229,10 @@ worker_thread (void *wp) extent_list exts = empty_vector; while (get_next_offset (&offset, &count)) { + struct command *command; size_t i; + bool is_zeroing = false; + uint64_t zeroing_start = 0; assert (0 < count && count <= THREAD_WORK_SIZE); if (extents) @@ -185,52 +240,64 @@ worker_thread (void *wp) else default_get_extents (src, w->index, offset, count, &exts); - for (i = 0; i < exts.len; ++i) { - struct command *command; - size_t len; + i = 0; // index into extents array + while (count) { + const size_t len = MIN (count, request_size); - if (exts.ptr[i].zero) { + if (only_zeroes (exts, &i, offset, len)) { /* The source is zero so we can proceed directly to skipping, - * fast zeroing, or writing zeroes at the destination. + * fast zeroing, or writing zeroes at the destination. Defer + * zeroing so we can send it as a single large command. */ - command = create_command (exts.ptr[i].offset, exts.ptr[i].length, - true, w); - fill_dst_range_with_zeroes (command); + if (!is_zeroing) { + is_zeroing = true; + zeroing_start = offset; + } } - else /* data */ { - /* As the extent might be larger than permitted for a single - * command, we may have to split this into multiple read - * requests. - */ - while (exts.ptr[i].length > 0) { - len = exts.ptr[i].length; - if (len > request_size) - len = request_size; - - command = create_command (exts.ptr[i].offset, len, - false, w); - - wait_for_request_slots (w); - - /* NOTE: Must increase the queue size after waiting. */ - increase_queue_size (w, len); - - /* Begin the asynch read operation. */ - src->ops->asynch_read (src, command, - (nbd_completion_callback) { - .callback = finished_read, - .user_data = command, - }); - - exts.ptr[i].offset += len; - exts.ptr[i].length -= len; + /* If we were in the middle of deferred zeroing, do it now. */ + if (is_zeroing) { + /* Note that offset-zeroing_start can never exceed + * THREAD_WORK_SIZE, so there is no danger of overflowing + * size_t. 
+ */ + command = create_command (zeroing_start, offset-zeroing_start, + true, w); + fill_dst_range_with_zeroes (command); + is_zeroing = false; } + + /* Issue the asynchronous read command. */ + command = create_command (offset, len, false, w); + + wait_for_request_slots (w); + + /* NOTE: Must increase the queue size after waiting. */ + increase_queue_size (w, len); + + /* Begin the asynch read operation. */ + src->ops->asynch_read (src, command, + (nbd_completion_callback) { + .callback = finished_read, + .user_data = command, + }); } - offset += count; - count = 0; - } /* for extents */ + offset += len; + count -= len; + } /* while (count) */ + + /* If we were in the middle of deferred zeroing, do it now. */ + if (is_zeroing) { + /* Note that offset-zeroing_start can never exceed + * THREAD_WORK_SIZE, so there is no danger of overflowing + * size_t. + */ + command = create_command (zeroing_start, offset - zeroing_start, + true, w); + fill_dst_range_with_zeroes (command); + is_zeroing = false; + } } /* Wait for in flight NBD requests to finish. */ -- 2.37.0.rc2
Richard W.M. Jones
2022-Jun-29 11:03 UTC
[Libguestfs] [PATCH libnbd v2 3/3] copy/copy-nbd-error.sh: Make this test non-stochastic
Because the test previously used error rates of 50%, it could sometimes "fail to fail". This is noticeable if you run the test repeatedly: $ while make -C copy check TESTS=copy-nbd-error.sh >& /tmp/log ; do echo -n . ; done This now happens more often because of the larger requests made by the new multi-threaded loop, resulting in fewer calls to the error filter, so a greater chance that a series of 50% coin tosses will come up all heads in the test. Fix this by making the test non-stochastic. Fixes: commit 8d444b41d09a700c7ee6f9182a649f3f2d325abb --- copy/copy-nbd-error.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/copy/copy-nbd-error.sh b/copy/copy-nbd-error.sh index 0088807f54..01524a890c 100755 --- a/copy/copy-nbd-error.sh +++ b/copy/copy-nbd-error.sh @@ -40,7 +40,7 @@ $VG nbdcopy -- [ nbdkit --exit-with-parent -v --filter=error pattern 5M \ # Failure to read should be fatal echo "Testing read failures on non-sparse source" $VG nbdcopy -- [ nbdkit --exit-with-parent -v --filter=error pattern 5M \ - error-pread-rate=0.5 ] null: && fail=1 + error-pread-rate=1 ] null: && fail=1 # However, reliable block status on a sparse image can avoid the need to read echo "Testing read failures on sparse source" @@ -51,7 +51,7 @@ $VG nbdcopy -- [ nbdkit --exit-with-parent -v --filter=error null 5M \ echo "Testing write data failures on arbitrary destination" $VG nbdcopy -- [ nbdkit --exit-with-parent -v pattern 5M ] \ [ nbdkit --exit-with-parent -v --filter=error --filter=noextents \ - memory 5M error-pwrite-rate=0.5 ] && fail=1 + memory 5M error-pwrite-rate=1 ] && fail=1 # However, writing zeroes can bypass the need for normal writes echo "Testing write data failures from sparse source" -- 2.37.0.rc2