Nir Soffer
2020-Aug-06 20:22 UTC
[Libguestfs] [PATCH nbdkit] Experiment with parallel python plugin
This is a quick hack to experiment with the parallel threading model in the
python plugin.
Changes:
- Use aligned buffers to make it possible to use O_DIRECT. Using
parallel I/O does not buy us much when using buffered I/O. pwrite()
copies data to the page cache, and pread() reads data from the page
cache.
- Disable extents in the file plugin. This way we can compare it with
the python file example.
- Implement flush in the file example.
With these changes, I could compare the file plugin with the new python
file example, and it seems that the parallel threading model works
nicely, and we get similar performance for the case of a fully allocated
image.
I created a test image using:
$ virt-builder fedora-32 -o /var/tmp/fedora-32.raw --root-password=password:root
And a fully allocated test image using:
$ fallocate --length 6g /var/tmp/disk.raw
$ dd if=/var/tmp/fedora-32.raw bs=8M of=/var/tmp/disk.raw iflag=direct
oflag=direct conv=fsync,notrunc
$ qemu-img map --output json /var/tmp/disk.raw
[{ "start": 0, "length": 6442450944, "depth": 0,
"zero": false, "data": true, "offset": 0}]
For reference, copying this image with dd using direct I/O:
$ dd if=/var/tmp/disk.raw bs=2M of=/dev/shm/disk.raw iflag=direct conv=fsync
status=progress
6442450944 bytes (6.4 GB, 6.0 GiB) copied, 10.4783 s, 615 MB/s
Copying same image with qemu-img convert, disabling zero detection,
using different number of coroutines:
$ time qemu-img convert -f raw -O raw -T none -S0 -m1 -W /var/tmp/disk.raw
/dev/shm/disk.raw
real 0m11.527s
user 0m0.102s
sys 0m2.330s
$ time qemu-img convert -f raw -O raw -T none -S0 -m2 -W /var/tmp/disk.raw
/dev/shm/disk.raw
real 0m5.971s
user 0m0.080s
sys 0m2.749s
$ time qemu-img convert -f raw -O raw -T none -S0 -m4 -W /var/tmp/disk.raw
/dev/shm/disk.raw
real 0m3.674s
user 0m0.071s
sys 0m3.140s
$ time qemu-img convert -f raw -O raw -T none -S0 -m8 -W /var/tmp/disk.raw
/dev/shm/disk.raw
real 0m3.408s
user 0m0.069s
sys 0m3.813s
$ time qemu-img convert -f raw -O raw -T none -S0 -m16 -W /var/tmp/disk.raw
/dev/shm/disk.raw
real 0m3.305s
user 0m0.054s
sys 0m3.767s
Same with the modified file plugin, using direct I/O and without
extents:
$ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t1 -f -r file
file=/var/tmp/disk.raw
$ time qemu-img convert -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock
/dev/shm/disk.raw
real 0m12.167s
user 0m5.798s
sys 0m2.477s
$ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t2 -f -r file
file=/var/tmp/disk.raw
$ time qemu-img convert -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock
/dev/shm/disk.raw
real 0m7.981s
user 0m5.204s
sys 0m2.740s
$ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t4 -f -r file
file=/var/tmp/disk.raw
$ time qemu-img convert -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock
/dev/shm/disk.raw
real 0m6.568s
user 0m4.996s
sys 0m3.167s
$ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t8 -f -r file
file=/var/tmp/disk.raw
$ time qemu-img convert -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock
/dev/shm/disk.raw
real 0m6.493s
user 0m4.950s
sys 0m3.492s
$ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t16 -f -r file
file=/var/tmp/disk.raw
$ time qemu-img convert -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock
/dev/shm/disk.raw
real 0m6.138s
user 0m4.621s
sys 0m3.550s
Finally, same with the python file example:
$ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t1 -f -r python
./plugins/python/examples/file.py file=/var/tmp/disk.raw
$ time qemu-img convert -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock
/dev/shm/disk.raw
real 0m12.398s
user 0m6.652s
sys 0m2.484s
$ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t2 -f -r python
./plugins/python/examples/file.py file=/var/tmp/disk.raw
$ time qemu-img convert -p -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock
/dev/shm/disk.raw
real 0m8.169s
user 0m5.418s
sys 0m2.736s
$ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t4 -f -r python
./plugins/python/examples/file.py file=/var/tmp/disk.raw
$ time qemu-img convert -p -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock
/dev/shm/disk.raw
real 0m6.419s
user 0m4.891s
sys 0m3.103s
$ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t8 -f -r python
./plugins/python/examples/file.py file=/var/tmp/disk.raw
$ time qemu-img convert -p -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock
/dev/shm/disk.raw
real 0m6.610s
user 0m5.115s
sys 0m3.377s
$ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t16 -f -r python
./plugins/python/examples/file.py file=/var/tmp/disk.raw
$ time qemu-img convert -p -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock
/dev/shm/disk.raw
real 0m6.093s
user 0m4.520s
sys 0m3.567s
I think this shows that the parallel threading model works for the python
plugin as well as for the file plugin.
---
plugins/file/file.c | 4 ++--
plugins/python/examples/file.py | 5 ++++-
server/plugins.c | 20 ++++++++++++++------
server/threadlocal.c | 7 +++++--
4 files changed, 25 insertions(+), 11 deletions(-)
diff --git a/plugins/file/file.c b/plugins/file/file.c
index dc99f992..27316b9f 100644
--- a/plugins/file/file.c
+++ b/plugins/file/file.c
@@ -170,7 +170,7 @@ file_open (int readonly)
return NULL;
}
- flags = O_CLOEXEC|O_NOCTTY;
+ flags = O_CLOEXEC|O_NOCTTY|O_DIRECT;
if (readonly)
flags |= O_RDONLY;
else
@@ -551,7 +551,7 @@ file_can_extents (void *handle)
nbdkit_debug ("extents disabled: lseek: SEEK_HOLE: %m");
return 0;
}
- return 1;
+ return 0;
}
static int
diff --git a/plugins/python/examples/file.py b/plugins/python/examples/file.py
index 866b8244..3652eb52 100644
--- a/plugins/python/examples/file.py
+++ b/plugins/python/examples/file.py
@@ -49,7 +49,7 @@ def open(readonly):
flags = os.O_RDONLY
else:
flags = os.O_RDWR
- fd = os.open(filename, flags)
+ fd = os.open(filename, flags | os.O_DIRECT)
return { 'fd': fd }
def get_size(h):
@@ -65,3 +65,6 @@ def pwrite(h, buf, offset, flags):
n = os.pwritev(h['fd'], [buf], offset)
if n != len(buf):
raise RuntimeError("short write")
+
+def flush(h, flags):
+ os.fsync(h['fd'])
diff --git a/server/plugins.c b/server/plugins.c
index d4364cd2..ce4700a3 100644
--- a/server/plugins.c
+++ b/server/plugins.c
@@ -631,6 +631,8 @@ plugin_zero (struct backend *b, void *handle,
bool fast_zero = flags & NBDKIT_FLAG_FAST_ZERO;
bool emulate = false;
bool need_flush = false;
+ void *zero_buffer = NULL;
+ int buffer_size = MIN (MAX_REQUEST_SIZE, count);
if (fua && backend_can_fua (b) != NBDKIT_FUA_NATIVE) {
flags &= ~NBDKIT_FLAG_FUA;
@@ -669,19 +671,25 @@ plugin_zero (struct backend *b, void *handle,
threadlocal_set_error (0);
*err = 0;
+ *err = posix_memalign(&zero_buffer, 4096, buffer_size);
+ if (*err != 0) {
+ r = -1;
+ goto done;
+ }
+
+ memset(zero_buffer, 0, buffer_size);
+
while (count) {
- /* Always contains zeroes, but we can't use const or else gcc 9
- * will use .rodata instead of .bss and inflate the binary size.
- */
- static /* const */ char buf[MAX_REQUEST_SIZE];
- uint32_t limit = MIN (count, sizeof buf);
+ uint32_t limit = MIN (count, buffer_size);
- r = plugin_pwrite (b, handle, buf, limit, offset, flags, err);
+ r = plugin_pwrite (b, handle, zero_buffer, limit, offset, flags, err);
if (r == -1)
break;
count -= limit;
}
+ free(zero_buffer);
+
done:
if (r != -1 && need_flush)
r = plugin_flush (b, handle, 0, err);
diff --git a/server/threadlocal.c b/server/threadlocal.c
index 90230028..04c82842 100644
--- a/server/threadlocal.c
+++ b/server/threadlocal.c
@@ -195,13 +195,16 @@ threadlocal_buffer (size_t size)
if (threadlocal->buffer_size < size) {
void *ptr;
+ int err;
- ptr = realloc (threadlocal->buffer, size);
- if (ptr == NULL) {
+ err = posix_memalign (&ptr, 4096, size);
+ if (err != 0) {
nbdkit_error ("threadlocal_buffer: realloc: %m");
return NULL;
}
+
memset (ptr, 0, size);
+ free(threadlocal->buffer);
threadlocal->buffer = ptr;
threadlocal->buffer_size = size;
}
--
2.25.4
Richard W.M. Jones
2020-Aug-06 20:52 UTC
Re: [Libguestfs] [PATCH nbdkit] Experiment with parallel python plugin
On Thu, Aug 06, 2020 at 11:22:00PM +0300, Nir Soffer wrote: > This is a quick hack to experiment with parallel threading model in the > python plugin. > > Changes: > > - Use aligned buffers to make it possible to use O_DIRECT. Using > parallel I/O does not buy us much when using buffered I/O. pwrite() > copies data to the page cache, and pread() reads data from the page > cache. O_DIRECT is unfortunately a bit too fragile to consider using routinely. But I wonder if one of the posix_fadvise(2) hints could be used (eg. POSIX_FADV_SEQUENTIAL + POSIX_FADV_DONTNEED). Adding fadvise hints as a parameter for the file plugin is a very plausible change. > - Disable extents in the file plugin. This way we can compare it with > the python file example. > > - Implement flush in the file example. > > With these changes, I could compare the file plugin with the new python > file example, and it seems that the parallel threading models works > nicely, and we get similar performance for the case of fully allocated > image. 
> > I created a test image using: > > $ virt-builder fedora-32 -o /var/tmp/fedora-32.raw --root-password=password:root > > And a fully allocated test image using: > > $ fallocate --length 6g /var/tmp/disk.raw > $ dd if=/var/tmp/fedora-32.raw bs=8M of=/var/tmp/disk.raw iflag=direct oflag=direct conv=fsync,notrunc > > $ qemu-img map --output json /var/tmp/disk.raw > [{ "start": 0, "length": 6442450944, "depth": 0, "zero": false, "data": true, "offset": 0}] > > For reference, copying this image with dd using direct I/O: > > $ dd if=/var/tmp/disk.raw bs=2M of=/dev/shm/disk.raw iflag=direct conv=fsync status=progress > 6442450944 bytes (6.4 GB, 6.0 GiB) copied, 10.4783 s, 615 MB/s > > Copying same image with qemu-img convert, disabling zero detection, > using different number of coroutines: > > $ time qemu-img convert -f raw -O raw -T none -S0 -m1 -W /var/tmp/disk.raw /dev/shm/disk.raw > > real 0m11.527s > user 0m0.102s > sys 0m2.330s > > $ time qemu-img convert -f raw -O raw -T none -S0 -m2 -W /var/tmp/disk.raw /dev/shm/disk.raw > > real 0m5.971s > user 0m0.080s > sys 0m2.749s > > $ time qemu-img convert -f raw -O raw -T none -S0 -m4 -W /var/tmp/disk.raw /dev/shm/disk.raw > > real 0m3.674s > user 0m0.071s > sys 0m3.140s > > $ time qemu-img convert -f raw -O raw -T none -S0 -m8 -W /var/tmp/disk.raw /dev/shm/disk.raw > > real 0m3.408s > user 0m0.069s > sys 0m3.813s > > $ time qemu-img convert -f raw -O raw -T none -S0 -m16 -W /var/tmp/disk.raw /dev/shm/disk.raw > > real 0m3.305s > user 0m0.054s > sys 0m3.767s > > Same with the modified file plugin, using direct I/O and without > extents: > > $ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t1 -f -r file file=/var/tmp/disk.raw > $ time qemu-img convert -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock /dev/shm/disk.raw > > real 0m12.167s > user 0m5.798s > sys 0m2.477s > > $ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t2 -f -r file file=/var/tmp/disk.raw > $ time qemu-img convert -f raw -O raw -S0 -m16 -W 
nbd:unix:/tmp/nbd.sock /dev/shm/disk.raw > > real 0m7.981s > user 0m5.204s > sys 0m2.740s > > $ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t4 -f -r file file=/var/tmp/disk.raw > $ time qemu-img convert -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock /dev/shm/disk.raw > > real 0m6.568s > user 0m4.996s > sys 0m3.167s > > $ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t8 -f -r file file=/var/tmp/disk.raw > $ time qemu-img convert -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock /dev/shm/disk.raw > > real 0m6.493s > user 0m4.950s > sys 0m3.492s > > $ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t16 -f -r file file=/var/tmp/disk.raw > $ time qemu-img convert -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock /dev/shm/disk.raw > > real 0m6.138s > user 0m4.621s > sys 0m3.550s > > Finally, same with the python file example: > > $ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t1 -f -r python ./plugins/python/examples/file.py file=/var/tmp/disk.raw > $ time qemu-img convert -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock /dev/shm/disk.raw > > real 0m12.398s > user 0m6.652s > sys 0m2.484s > > $ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t2 -f -r python ./plugins/python/examples/file.py file=/var/tmp/disk.raw > $ time qemu-img convert -p -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock /dev/shm/disk.raw > > real 0m8.169s > user 0m5.418s > sys 0m2.736s > > $ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t4 -f -r python ./plugins/python/examples/file.py file=/var/tmp/disk.raw > $ time qemu-img convert -p -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock /dev/shm/disk.raw > > real 0m6.419s > user 0m4.891s > sys 0m3.103s > > $ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t8 -f -r python ./plugins/python/examples/file.py file=/var/tmp/disk.raw > $ time qemu-img convert -p -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock /dev/shm/disk.raw > > real 0m6.610s > user 0m5.115s > sys 0m3.377s > > $ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t16 
-f -r python ./plugins/python/examples/file.py file=/var/tmp/disk.raw > $ time qemu-img convert -p -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock /dev/shm/disk.raw > > real 0m6.093s > user 0m4.520s > sys 0m3.567sAll pretty excellent for an interpreted programming language.> I think this show that the parallel threading model works for the python > plugin as good as for the file plugin. > --- > plugins/file/file.c | 4 ++-- > plugins/python/examples/file.py | 5 ++++- > server/plugins.c | 20 ++++++++++++++------ > server/threadlocal.c | 7 +++++-- > 4 files changed, 25 insertions(+), 11 deletions(-) > > diff --git a/plugins/file/file.c b/plugins/file/file.c > index dc99f992..27316b9f 100644 > --- a/plugins/file/file.c > +++ b/plugins/file/file.c > @@ -170,7 +170,7 @@ file_open (int readonly) > return NULL; > } > > - flags = O_CLOEXEC|O_NOCTTY; > + flags = O_CLOEXEC|O_NOCTTY|O_DIRECT; > if (readonly) > flags |= O_RDONLY; > else > @@ -551,7 +551,7 @@ file_can_extents (void *handle) > nbdkit_debug ("extents disabled: lseek: SEEK_HOLE: %m"); > return 0; > } > - return 1; > + return 0; > } > > static int > diff --git a/plugins/python/examples/file.py b/plugins/python/examples/file.py > index 866b8244..3652eb52 100644 > --- a/plugins/python/examples/file.py > +++ b/plugins/python/examples/file.py > @@ -49,7 +49,7 @@ def open(readonly): > flags = os.O_RDONLY > else: > flags = os.O_RDWR > - fd = os.open(filename, flags) > + fd = os.open(filename, flags | os.O_DIRECT) > return { 'fd': fd } > > def get_size(h): > @@ -65,3 +65,6 @@ def pwrite(h, buf, offset, flags): > n = os.pwritev(h['fd'], [buf], offset) > if n != len(buf): > raise RuntimeError("short write") > + > +def flush(h, flags): > + os.fsync(h['fd']) > diff --git a/server/plugins.c b/server/plugins.c > index d4364cd2..ce4700a3 100644 > --- a/server/plugins.c > +++ b/server/plugins.c > @@ -631,6 +631,8 @@ plugin_zero (struct backend *b, void *handle, > bool fast_zero = flags & NBDKIT_FLAG_FAST_ZERO; > bool emulate = 
false; > bool need_flush = false; > + void *zero_buffer = NULL; > + int buffer_size = MIN (MAX_REQUEST_SIZE, count); > > if (fua && backend_can_fua (b) != NBDKIT_FUA_NATIVE) { > flags &= ~NBDKIT_FLAG_FUA; > @@ -669,19 +671,25 @@ plugin_zero (struct backend *b, void *handle, > threadlocal_set_error (0); > *err = 0; > > + *err = posix_memalign(&zero_buffer, 4096, buffer_size); > + if (*err != 0) { > + r = -1; > + goto done; > + } > + > + memset(zero_buffer, 0, buffer_size); > + > while (count) { > - /* Always contains zeroes, but we can't use const or else gcc 9 > - * will use .rodata instead of .bss and inflate the binary size. > - */ > - static /* const */ char buf[MAX_REQUEST_SIZE]; > - uint32_t limit = MIN (count, sizeof buf); > + uint32_t limit = MIN (count, buffer_size); > > - r = plugin_pwrite (b, handle, buf, limit, offset, flags, err); > + r = plugin_pwrite (b, handle, zero_buffer, limit, offset, flags, err); > if (r == -1) > break; > count -= limit; > } > > + free(zero_buffer); > + > done: > if (r != -1 && need_flush) > r = plugin_flush (b, handle, 0, err); > diff --git a/server/threadlocal.c b/server/threadlocal.c > index 90230028..04c82842 100644 > --- a/server/threadlocal.c > +++ b/server/threadlocal.c > @@ -195,13 +195,16 @@ threadlocal_buffer (size_t size) > > if (threadlocal->buffer_size < size) { > void *ptr; > + int err; > > - ptr = realloc (threadlocal->buffer, size); > - if (ptr == NULL) { > + err = posix_memalign (&ptr, 4096, size); > + if (err != 0) { > nbdkit_error ("threadlocal_buffer: realloc: %m"); > return NULL; > } > + > memset (ptr, 0, size); > + free(threadlocal->buffer); > threadlocal->buffer = ptr; > threadlocal->buffer_size = size; > } > -- > 2.25.4Rich. -- Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones Read my programming and virtualization blog: http://rwmj.wordpress.com virt-builder quickly builds VMs from scratch http://libguestfs.org/virt-builder.1.html
Nir Soffer
2020-Aug-06 22:06 UTC
Re: [Libguestfs] [PATCH nbdkit] Experiment with parallel python plugin
On Thu, Aug 6, 2020 at 11:52 PM Richard W.M. Jones <rjones@redhat.com> wrote: > > On Thu, Aug 06, 2020 at 11:22:00PM +0300, Nir Soffer wrote: > > This is a quick hack to experiment with parallel threading model in the > > python plugin. > > > > Changes: > > > > - Use aligned buffers to make it possible to use O_DIRECT. Using > > parallel I/O does not buy us much when using buffered I/O. pwrite() > > copies data to the page cache, and pread() reads data from the page > > cache. > > O_DIRECT is unfortunately a bit too fragile to consider using > routinely. But I wonder if one of the posix_fadvise(2) hints could be > used (eg. POSIX_FADV_SEQUENTIAL + POSIX_FADV_DONTNEED). Adding > fadvise hints as a parameter for the file plugin is a very plausible > change. This does not help much, and does not replace direct I/O when using shared storage. In oVirt we always use direct I/O for image copies, for several reasons: - When reading using buffered I/O you may get stale data from the page cache, after the image was modified on another host - Reading and writing pollute the page cache with data we will never need, since VMs also use direct I/O. - Writing using buffered I/O leads to I/O delays in other programs when the kernel flushes gigabytes from the page cache to storage. This easily causes timeouts in sanlock lease renewal (using a 10 second timeout), and these timeouts can cause leases to expire, killing VMs holding the expired leases, or in the worst case rebooting the host if sanlock could not kill a program using an expired lease. - Performance using direct I/O is usually better and always more consistent. For nbdkit this is nice to have. If we change the buffer allocation to create aligned buffers, plugins like the file plugin can add a direct=false option to use direct I/O. Supporting unaligned requests will not be easy, but a plugin can reject such requests. > > - Disable extents in the file plugin. This way we can compare it with > > the python file example. 
> > > > - Implement flush in the file example. > > > > With these changes, I could compare the file plugin with the new python > > file example, and it seems that the parallel threading models works > > nicely, and we get similar performance for the case of fully allocated > > image. > > > > I created a test image using: > > > > $ virt-builder fedora-32 -o /var/tmp/fedora-32.raw --root-password=password:root > > > > And a fully allocated test image using: > > > > $ fallocate --length 6g /var/tmp/disk.raw > > $ dd if=/var/tmp/fedora-32.raw bs=8M of=/var/tmp/disk.raw iflag=direct oflag=direct conv=fsync,notrunc > > > > $ qemu-img map --output json /var/tmp/disk.raw > > [{ "start": 0, "length": 6442450944, "depth": 0, "zero": false, "data": true, "offset": 0}] > > > > For reference, copying this image with dd using direct I/O: > > > > $ dd if=/var/tmp/disk.raw bs=2M of=/dev/shm/disk.raw iflag=direct conv=fsync status=progress > > 6442450944 bytes (6.4 GB, 6.0 GiB) copied, 10.4783 s, 615 MB/s > > > > Copying same image with qemu-img convert, disabling zero detection, > > using different number of coroutines: > > > > $ time qemu-img convert -f raw -O raw -T none -S0 -m1 -W /var/tmp/disk.raw /dev/shm/disk.raw > > > > real 0m11.527s > > user 0m0.102s > > sys 0m2.330s > > > > $ time qemu-img convert -f raw -O raw -T none -S0 -m2 -W /var/tmp/disk.raw /dev/shm/disk.raw > > > > real 0m5.971s > > user 0m0.080s > > sys 0m2.749s > > > > $ time qemu-img convert -f raw -O raw -T none -S0 -m4 -W /var/tmp/disk.raw /dev/shm/disk.raw > > > > real 0m3.674s > > user 0m0.071s > > sys 0m3.140s > > > > $ time qemu-img convert -f raw -O raw -T none -S0 -m8 -W /var/tmp/disk.raw /dev/shm/disk.raw > > > > real 0m3.408s > > user 0m0.069s > > sys 0m3.813s > > > > $ time qemu-img convert -f raw -O raw -T none -S0 -m16 -W /var/tmp/disk.raw /dev/shm/disk.raw > > > > real 0m3.305s > > user 0m0.054s > > sys 0m3.767s > > > > Same with the modified file plugin, using direct I/O and without > > extents: 
> > > > $ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t1 -f -r file file=/var/tmp/disk.raw > > $ time qemu-img convert -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock /dev/shm/disk.raw > > > > real 0m12.167s > > user 0m5.798s > > sys 0m2.477s > > > > $ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t2 -f -r file file=/var/tmp/disk.raw > > $ time qemu-img convert -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock /dev/shm/disk.raw > > > > real 0m7.981s > > user 0m5.204s > > sys 0m2.740s > > > > $ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t4 -f -r file file=/var/tmp/disk.raw > > $ time qemu-img convert -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock /dev/shm/disk.raw > > > > real 0m6.568s > > user 0m4.996s > > sys 0m3.167s > > > > $ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t8 -f -r file file=/var/tmp/disk.raw > > $ time qemu-img convert -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock /dev/shm/disk.raw > > > > real 0m6.493s > > user 0m4.950s > > sys 0m3.492s > > > > $ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t16 -f -r file file=/var/tmp/disk.raw > > $ time qemu-img convert -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock /dev/shm/disk.raw > > > > real 0m6.138s > > user 0m4.621s > > sys 0m3.550s > > > > Finally, same with the python file example: > > > > $ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t1 -f -r python ./plugins/python/examples/file.py file=/var/tmp/disk.raw > > $ time qemu-img convert -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock /dev/shm/disk.raw > > > > real 0m12.398s > > user 0m6.652s > > sys 0m2.484s > > > > $ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t2 -f -r python ./plugins/python/examples/file.py file=/var/tmp/disk.raw > > $ time qemu-img convert -p -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock /dev/shm/disk.raw > > > > real 0m8.169s > > user 0m5.418s > > sys 0m2.736s > > > > $ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t4 -f -r python ./plugins/python/examples/file.py 
file=/var/tmp/disk.raw > > $ time qemu-img convert -p -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock /dev/shm/disk.raw > > > > real 0m6.419s > > user 0m4.891s > > sys 0m3.103s > > > > $ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t8 -f -r python ./plugins/python/examples/file.py file=/var/tmp/disk.raw > > $ time qemu-img convert -p -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock /dev/shm/disk.raw > > > > real 0m6.610s > > user 0m5.115s > > sys 0m3.377s > > > > $ rm -f /tmp/nbd.sock && ./nbdkit -U /tmp/nbd.sock -t16 -f -r python ./plugins/python/examples/file.py file=/var/tmp/disk.raw > > $ time qemu-img convert -p -f raw -O raw -S0 -m16 -W nbd:unix:/tmp/nbd.sock /dev/shm/disk.raw > > > > real 0m6.093s > > user 0m4.520s > > sys 0m3.567s > > All pretty excellent for an interpreted programming language. This test is the best case for the python plugin since without extents, all reads are 2 MiB. If we enable extents we will have many small reads, and the file plugin will be faster. > > I think this show that the parallel threading model works for the python > > plugin as good as for the file plugin. 
> > --- > > plugins/file/file.c | 4 ++-- > > plugins/python/examples/file.py | 5 ++++- > > server/plugins.c | 20 ++++++++++++++------ > > server/threadlocal.c | 7 +++++-- > > 4 files changed, 25 insertions(+), 11 deletions(-) > > > > diff --git a/plugins/file/file.c b/plugins/file/file.c > > index dc99f992..27316b9f 100644 > > --- a/plugins/file/file.c > > +++ b/plugins/file/file.c > > @@ -170,7 +170,7 @@ file_open (int readonly) > > return NULL; > > } > > > > - flags = O_CLOEXEC|O_NOCTTY; > > + flags = O_CLOEXEC|O_NOCTTY|O_DIRECT; > > if (readonly) > > flags |= O_RDONLY; > > else > > @@ -551,7 +551,7 @@ file_can_extents (void *handle) > > nbdkit_debug ("extents disabled: lseek: SEEK_HOLE: %m"); > > return 0; > > } > > - return 1; > > + return 0; > > } > > > > static int > > diff --git a/plugins/python/examples/file.py b/plugins/python/examples/file.py > > index 866b8244..3652eb52 100644 > > --- a/plugins/python/examples/file.py > > +++ b/plugins/python/examples/file.py > > @@ -49,7 +49,7 @@ def open(readonly): > > flags = os.O_RDONLY > > else: > > flags = os.O_RDWR > > - fd = os.open(filename, flags) > > + fd = os.open(filename, flags | os.O_DIRECT) > > return { 'fd': fd } > > > > def get_size(h): > > @@ -65,3 +65,6 @@ def pwrite(h, buf, offset, flags): > > n = os.pwritev(h['fd'], [buf], offset) > > if n != len(buf): > > raise RuntimeError("short write") > > + > > +def flush(h, flags): > > + os.fsync(h['fd']) > > diff --git a/server/plugins.c b/server/plugins.c > > index d4364cd2..ce4700a3 100644 > > --- a/server/plugins.c > > +++ b/server/plugins.c > > @@ -631,6 +631,8 @@ plugin_zero (struct backend *b, void *handle, > > bool fast_zero = flags & NBDKIT_FLAG_FAST_ZERO; > > bool emulate = false; > > bool need_flush = false; > > + void *zero_buffer = NULL; > > + int buffer_size = MIN (MAX_REQUEST_SIZE, count); > > > > if (fua && backend_can_fua (b) != NBDKIT_FUA_NATIVE) { > > flags &= ~NBDKIT_FLAG_FUA; > > @@ -669,19 +671,25 @@ plugin_zero (struct backend *b, void 
*handle, > > threadlocal_set_error (0); > > *err = 0; > > > > + *err = posix_memalign(&zero_buffer, 4096, buffer_size); > > + if (*err != 0) { > > + r = -1; > > + goto done; > > + } > > + > > + memset(zero_buffer, 0, buffer_size); > > + > > while (count) { > > - /* Always contains zeroes, but we can't use const or else gcc 9 > > - * will use .rodata instead of .bss and inflate the binary size. > > - */ > > - static /* const */ char buf[MAX_REQUEST_SIZE]; > > - uint32_t limit = MIN (count, sizeof buf); > > + uint32_t limit = MIN (count, buffer_size); > > > > - r = plugin_pwrite (b, handle, buf, limit, offset, flags, err); > > + r = plugin_pwrite (b, handle, zero_buffer, limit, offset, flags, err); > > if (r == -1) > > break; > > count -= limit; > > } > > > > + free(zero_buffer); > > + > > done: > > if (r != -1 && need_flush) > > r = plugin_flush (b, handle, 0, err); > > diff --git a/server/threadlocal.c b/server/threadlocal.c > > index 90230028..04c82842 100644 > > --- a/server/threadlocal.c > > +++ b/server/threadlocal.c > > @@ -195,13 +195,16 @@ threadlocal_buffer (size_t size) > > > > if (threadlocal->buffer_size < size) { > > void *ptr; > > + int err; > > > > - ptr = realloc (threadlocal->buffer, size); > > - if (ptr == NULL) { > > + err = posix_memalign (&ptr, 4096, size); > > + if (err != 0) { > > nbdkit_error ("threadlocal_buffer: realloc: %m"); > > return NULL; > > } > > + > > memset (ptr, 0, size); > > + free(threadlocal->buffer); > > threadlocal->buffer = ptr; > > threadlocal->buffer_size = size; > > } > > -- > > 2.25.4 > > Rich. > > -- > Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones > Read my programming and virtualization blog: http://rwmj.wordpress.com > virt-builder quickly builds VMs from scratch > http://libguestfs.org/virt-builder.1.html >
Apparently Analogous Threads
- [PATCH nbdkit] Experiment with parallel python plugin
- Re: [PATCH nbdkit] Experiment with parallel python plugin
- Re: [PATCH nbdkit v2 2/2] server: Use a thread-local pread/pwrite buffer to avoid leaking heap data.
- [PATCH nbdkit v2 0/2] Be careful not to leak server heap memory to the client.
- [nbdkit PATCH 0/3] More responsive shutdown