Richard W.M. Jones
2021-Feb-19 15:43 UTC
[Libguestfs] [PATCH nbdkit] cow: Use more fine-grained locking.
perf analysis of the cow filter showed that a great deal of time was
spent holding the global lock.  This change makes the locking more
fine-grained so now we are only using the global lock to protect the
bitmap from concurrent access and nothing else.

I added a new read-modify-write lock to protect a few places where we
use an RMW cycle to modify an unaligned block, which should be a
fairly rare occurrence.  Previously the global lock protected these.

A benchmark shows this is a considerable improvement.  Running the test
from here:
http://git.annexia.org/?p=libguestfs-talks.git;a=blob;f=2021-pipelines/benchmarking/testA6.sh;h=e19dca2504a9b3d76b8f472dc7caefd85113a54b;hb=HEAD

Before this change:
real    3m51.086s
user    0m0.994s
sys     0m3.568s

After this change:
real    2m1.091s
user    0m1.038s
sys     0m3.576s

Compared to a similar pipeline using qemu-img convert and qcow2:
http://git.annexia.org/?p=libguestfs-talks.git;a=blob;f=2021-pipelines/benchmarking/testA4.sh;h=88085d0aa6bb479cad83346292dc72d2b4d3117f;hb=HEAD
real    2m22.368s
user    0m54.415s
sys     1m27.410s
---
 filters/cow/blk.h |  7 -------
 filters/cow/blk.c | 27 ++++++++++++++++++++++++++-
 filters/cow/cow.c | 41 ++++++++++++++---------------------------
 3 files changed, 40 insertions(+), 35 deletions(-)

diff --git a/filters/cow/blk.h b/filters/cow/blk.h
index d96b9924..be3e764a 100644
--- a/filters/cow/blk.h
+++ b/filters/cow/blk.h
@@ -44,13 +44,6 @@ extern int blk_init (void);
 /* Close the overlay, free the bitmap. */
 extern void blk_free (void);
 
-/*----------------------------------------------------------------------
- * ** NOTE **
- *
- * An exclusive lock must be held when you call any function below
- * this line.
- */
-
 /* Allocate or resize the overlay and bitmap. */
 extern int blk_set_size (uint64_t new_size);
 
diff --git a/filters/cow/blk.c b/filters/cow/blk.c
index 51ba91c4..cbd40188 100644
--- a/filters/cow/blk.c
+++ b/filters/cow/blk.c
@@ -90,9 +90,12 @@
 #include <alloca.h>
 #endif
 
+#include <pthread.h>
+
 #include <nbdkit-filter.h>
 
 #include "bitmap.h"
+#include "cleanup.h"
 #include "fdatasync.h"
 #include "rounding.h"
 #include "pread.h"
@@ -104,6 +107,9 @@
 /* The temporary overlay. */
 static int fd = -1;
 
+/* This lock protects the bitmap from parallel access. */
+static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+
 /* Bitmap. */
 static struct bitmap bm;
 
@@ -186,6 +192,8 @@ static uint64_t size = 0;
 int
 blk_set_size (uint64_t new_size)
 {
+  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+
   size = new_size;
 
   if (bitmap_resize (&bm, size) == -1)
@@ -205,6 +213,7 @@ blk_set_size (uint64_t new_size)
 void
 blk_status (uint64_t blknum, bool *present, bool *trimmed)
 {
+  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
   enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
 
   *present = state != BLOCK_NOT_ALLOCATED;
@@ -219,7 +228,18 @@ blk_read (struct nbdkit_next_ops *next_ops, void *nxdata,
           uint64_t blknum, uint8_t *block, int *err)
 {
   off_t offset = blknum * BLKSIZE;
-  enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
+  enum bm_entry state;
+
+  /* The state might be modified from another thread - for example
+   * another thread might write (BLOCK_NOT_ALLOCATED ->
+   * BLOCK_ALLOCATED) while we are reading from the plugin, returning
+   * the old data.  However a read issued after the write returns
+   * should always return the correct data.
+   */
+  {
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
+  }
 
   nbdkit_debug ("cow: blk_read block %" PRIu64 " (offset %" PRIu64 ") is %s",
                 blknum, (uint64_t) offset, state_to_string (state));
@@ -260,6 +280,8 @@ int
 blk_cache (struct nbdkit_next_ops *next_ops, void *nxdata,
            uint64_t blknum, uint8_t *block, enum cache_mode mode, int *err)
 {
+  /* XXX Could make this lock more fine-grained with some thought. */
+  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
   off_t offset = blknum * BLKSIZE;
   enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
   unsigned n = BLKSIZE, tail = 0;
@@ -322,6 +344,8 @@ blk_write (uint64_t blknum, const uint8_t *block, int *err)
     nbdkit_error ("pwrite: %m");
     return -1;
   }
+
+  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
   bitmap_set_blk (&bm, blknum, BLOCK_ALLOCATED);
 
   return 0;
@@ -339,6 +363,7 @@ blk_trim (uint64_t blknum, int *err)
    * here.  However it's not trivial since BLKSIZE is unrelated to the
    * overlay filesystem block size.
    */
+  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
   bitmap_set_blk (&bm, blknum, BLOCK_TRIMMED);
   return 0;
 }
diff --git a/filters/cow/cow.c b/filters/cow/cow.c
index 93e10f24..54291476 100644
--- a/filters/cow/cow.c
+++ b/filters/cow/cow.c
@@ -45,16 +45,17 @@
 #include <nbdkit-filter.h>
 
 #include "cleanup.h"
-
-#include "blk.h"
 #include "isaligned.h"
 #include "minmax.h"
 #include "rounding.h"
 
-/* In order to handle parallel requests safely, this lock must be held
- * when calling any blk_* functions.
+#include "blk.h"
+
+/* Read-modify-write requests are serialized through this global lock.
+ * This is only used for unaligned requests which should be
+ * infrequent.
  */
-static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t rmw_lock = PTHREAD_MUTEX_INITIALIZER;
 
 bool cow_on_cache;
 
@@ -117,7 +118,6 @@ cow_get_size (struct nbdkit_next_ops *next_ops, void *nxdata,
 
   nbdkit_debug ("cow: underlying file size: %" PRIi64, size);
 
-  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
   r = blk_set_size (size);
   if (r == -1)
     return -1;
@@ -221,7 +221,6 @@ cow_pread (struct nbdkit_next_ops *next_ops, void *nxdata,
     uint64_t n = MIN (BLKSIZE - blkoffs, count);
 
     assert (block);
-    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
     r = blk_read (next_ops, nxdata, blknum, block, err);
     if (r == -1)
       return -1;
@@ -242,7 +241,6 @@ cow_pread (struct nbdkit_next_ops *next_ops, void *nxdata,
    * smarter here.
    */
   while (count >= BLKSIZE) {
-    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
     r = blk_read (next_ops, nxdata, blknum, buf, err);
     if (r == -1)
       return -1;
@@ -256,7 +254,6 @@ cow_pread (struct nbdkit_next_ops *next_ops, void *nxdata,
   /* Unaligned tail */
   if (count) {
     assert (block);
-    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
     r = blk_read (next_ops, nxdata, blknum, block, err);
     if (r == -1)
       return -1;
@@ -294,10 +291,10 @@ cow_pwrite (struct nbdkit_next_ops *next_ops, void *nxdata,
     uint64_t n = MIN (BLKSIZE - blkoffs, count);
 
     /* Do a read-modify-write operation on the current block.
-     * Hold the lock over the whole operation.
+     * Hold the rmw_lock over the whole operation.
      */
     assert (block);
-    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
     r = blk_read (next_ops, nxdata, blknum, block, err);
     if (r != -1) {
       memcpy (&block[blkoffs], buf, n);
@@ -314,7 +311,6 @@ cow_pwrite (struct nbdkit_next_ops *next_ops, void *nxdata,
 
   /* Aligned body */
   while (count >= BLKSIZE) {
-    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
     r = blk_write (blknum, buf, err);
     if (r == -1)
       return -1;
@@ -328,7 +324,7 @@ cow_pwrite (struct nbdkit_next_ops *next_ops, void *nxdata,
   /* Unaligned tail */
   if (count) {
     assert (block);
-    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
     r = blk_read (next_ops, nxdata, blknum, block, err);
     if (r != -1) {
       memcpy (block, buf, count);
@@ -376,9 +372,9 @@ cow_zero (struct nbdkit_next_ops *next_ops, void *nxdata,
     uint64_t n = MIN (BLKSIZE - blkoffs, count);
 
     /* Do a read-modify-write operation on the current block.
-     * Hold the lock over the whole operation.
+     * Hold the rmw_lock over the whole operation.
      */
-    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
     r = blk_read (next_ops, nxdata, blknum, block, err);
     if (r != -1) {
       memset (&block[blkoffs], 0, n);
@@ -399,7 +395,6 @@ cow_zero (struct nbdkit_next_ops *next_ops, void *nxdata,
     /* XXX There is the possibility of optimizing this: since this loop is
      * writing a whole, aligned block, we should use FALLOC_FL_ZERO_RANGE.
      */
-    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
     r = blk_write (blknum, block, err);
     if (r == -1)
       return -1;
@@ -411,7 +406,7 @@ cow_zero (struct nbdkit_next_ops *next_ops, void *nxdata,
 
   /* Unaligned tail */
   if (count) {
-    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
     r = blk_read (next_ops, nxdata, blknum, block, err);
     if (r != -1) {
       memset (&block[count], 0, BLKSIZE - count);
@@ -455,7 +450,7 @@ cow_trim (struct nbdkit_next_ops *next_ops, void *nxdata,
     /* Do a read-modify-write operation on the current block.
      * Hold the lock over the whole operation.
      */
-    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
     r = blk_read (next_ops, nxdata, blknum, block, err);
     if (r != -1) {
       memset (&block[blkoffs], 0, n);
@@ -471,7 +466,6 @@ cow_trim (struct nbdkit_next_ops *next_ops, void *nxdata,
 
   /* Aligned body */
   while (count >= BLKSIZE) {
-    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
     r = blk_trim (blknum, err);
     if (r == -1)
       return -1;
@@ -483,7 +477,7 @@ cow_trim (struct nbdkit_next_ops *next_ops, void *nxdata,
 
   /* Unaligned tail */
   if (count) {
-    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&rmw_lock);
     r = blk_read (next_ops, nxdata, blknum, block, err);
     if (r != -1) {
       memset (&block[count], 0, BLKSIZE - count);
@@ -553,7 +547,6 @@ cow_cache (struct nbdkit_next_ops *next_ops, void *nxdata,
 
   /* Aligned body */
   while (remaining) {
-    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
     r = blk_cache (next_ops, nxdata, blknum, block, mode, err);
     if (r == -1)
       return -1;
@@ -588,12 +581,6 @@ cow_extents (struct nbdkit_next_ops *next_ops, void *nxdata,
   assert (IS_ALIGNED (count, BLKSIZE));
   assert (count > 0);         /* We must make forward progress. */
 
-  /* We hold the lock for the whole time, even when requesting extents
-   * from the plugin, because we want to present an atomic picture of
-   * the current state.
-   */
-  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
-
   while (count > 0) {
     bool present, trimmed;
     struct nbdkit_extent e;
-- 
2.29.0.rc2
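
A note for readers unfamiliar with the locking macro used throughout
the diff above: ACQUIRE_LOCK_FOR_CURRENT_SCOPE comes from nbdkit's
cleanup.h and holds the mutex for the remainder of the enclosing C
scope, releasing it automatically on every exit path.  The exact
definition lives in the nbdkit sources; what follows is only a minimal
sketch of how such a scope guard can be built on the GCC/Clang cleanup
attribute (the helper name scoped_unlock and the bump() example are
invented for illustration, not nbdkit code):

#include <pthread.h>

/* Sketch only -- not nbdkit's actual macro definition. */
static inline void
scoped_unlock (pthread_mutex_t **ptr)
{
  if (*ptr)
    pthread_mutex_unlock (*ptr);
}

#define ACQUIRE_LOCK_FOR_CURRENT_SCOPE(mutex)                   \
  __attribute__ ((cleanup (scoped_unlock)))                     \
  pthread_mutex_t *_scoped_lock = (mutex);                      \
  pthread_mutex_lock (_scoped_lock)

/* Example use, mirroring the pattern in blk.c: the lock is dropped
 * automatically when bump() returns, whichever path it takes.
 */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int counter;

static void
bump (void)
{
  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
  counter++;
}

Because the unlock is tied to scope exit, narrowing a critical section
is just a matter of moving the macro into a smaller block, which is
exactly what the patch does in blk_read above.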
Eric Blake
2021-Feb-19 16:24 UTC
[Libguestfs] [PATCH nbdkit] cow: Use more fine-grained locking.
On 2/19/21 9:43 AM, Richard W.M. Jones wrote:
> perf analysis of the cow filter showed that a great deal of time was
> spent holding the global lock.  This change makes the locking more
> fine-grained so now we are only using the global lock to protect the
> bitmap from concurrent access and nothing else.
>
> I added a new read-modify-write lock to protect a few places where we
> use an RMW cycle to modify an unaligned block, which should be a
> fairly rare occurrence.  Previously the global lock protected these.
>
> A benchmark shows this is a considerable improvement.  Running the test
> from here:
> http://git.annexia.org/?p=libguestfs-talks.git;a=blob;f=2021-pipelines/benchmarking/testA6.sh;h=e19dca2504a9b3d76b8f472dc7caefd85113a54b;hb=HEAD
>
> Before this change:
> real    3m51.086s
> user    0m0.994s
> sys     0m3.568s
>
> After this change:
> real    2m1.091s
> user    0m1.038s
> sys     0m3.576s
>
> Compared to a similar pipeline using qemu-img convert and qcow2:
> http://git.annexia.org/?p=libguestfs-talks.git;a=blob;f=2021-pipelines/benchmarking/testA4.sh;h=88085d0aa6bb479cad83346292dc72d2b4d3117f;hb=HEAD
> real    2m22.368s
> user    0m54.415s
> sys     1m27.410s

Nice speedups.  In my review, I'm double-checking whether my recent
proposal to advertise multi-conn will be impacted in any way.

> +++ b/filters/cow/blk.c
> @@ -90,9 +90,12 @@
>  #include <alloca.h>
>  #endif
>
> +#include <pthread.h>
> +
>  #include <nbdkit-filter.h>
>
>  #include "bitmap.h"
> +#include "cleanup.h"
>  #include "fdatasync.h"
>  #include "rounding.h"
>  #include "pread.h"
> @@ -104,6 +107,9 @@
>  /* The temporary overlay. */
>  static int fd = -1;
>
> +/* This lock protects the bitmap from parallel access. */
> +static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

So pre-patch, we were locking both the bitmap access and the
next_ops->act().  Without reading further, it looks like your goal is
to move next_ops->act() out of the lock.

> @@ -205,6 +213,7 @@ blk_set_size (uint64_t new_size)
>  void
>  blk_status (uint64_t blknum, bool *present, bool *trimmed)
>  {
> +  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
>    enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
>
>    *present = state != BLOCK_NOT_ALLOCATED;
> @@ -219,7 +228,18 @@ blk_read (struct nbdkit_next_ops *next_ops, void *nxdata,
>            uint64_t blknum, uint8_t *block, int *err)
>  {
>    off_t offset = blknum * BLKSIZE;
> -  enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
> +  enum bm_entry state;
> +
> +  /* The state might be modified from another thread - for example
> +   * another thread might write (BLOCK_NOT_ALLOCATED ->
> +   * BLOCK_ALLOCATED) while we are reading from the plugin, returning
> +   * the old data.  However a read issued after the write returns
> +   * should always return the correct data.
> +   */
> +  {
> +    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
> +    state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
> +  }

Pre-patch, if two threads compete on a read and write, we have exactly
one of two outcomes:

threadA                       threadB
CMD_READ                      CMD_WRITE
lock                          blocked on lock...
state = not allocated
read old data
return
                              lock
                              write new data, update state
                              return

or

threadA                       threadB
CMD_READ                      CMD_WRITE
blocked on lock...            lock
                              write new data, update state
                              return
lock
state = allocated
read new data
return

As we were locking per-block, we could shred large requests (a read
could see some old blocks and some new ones), but shredding is already
a problem for the client to worry about and not unique to our cow
filter.

The new code now allows reading old data to occur in parallel with
writing new data (for more performance), but otherwise doesn't seem to
be any worse in behavior; I still only see one of two valid outcomes
(the read either sees an entirely old block, or an entirely new block).
Large reads may still be shredded by block, but a client that is
already avoiding overlapping reads at the same time as a write will not
be any worse off.

>
>    nbdkit_debug ("cow: blk_read block %" PRIu64 " (offset %" PRIu64 ") is %s",
>                  blknum, (uint64_t) offset, state_to_string (state));
> @@ -260,6 +280,8 @@ int
>  blk_cache (struct nbdkit_next_ops *next_ops, void *nxdata,
>             uint64_t blknum, uint8_t *block, enum cache_mode mode, int *err)
>  {
> +  /* XXX Could make this lock more fine-grained with some thought. */
> +  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);

On the other hand, NBD_CMD_CACHE is not frequently used, so I'm less
worried about this one being heavy-handed.

>    off_t offset = blknum * BLKSIZE;
>    enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
>    unsigned n = BLKSIZE, tail = 0;
> @@ -322,6 +344,8 @@ blk_write (uint64_t blknum, const uint8_t *block, int *err)
>      nbdkit_error ("pwrite: %m");
>      return -1;
>    }
> +
> +  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
>    bitmap_set_blk (&bm, blknum, BLOCK_ALLOCATED);

Pre-patch, if two threads contend on a write, we had:

threadA                       threadB
CMD_WRITE                     CMD_WRITE
lock                          blocked on lock...
write new data
update map
return
                              lock
                              write new data
                              update map (no-op)
                              return

Post-patch, we have:

threadA                       threadB
CMD_WRITE                     CMD_WRITE
write new data                write new data
lock                          blocked on lock...
update map
return
                              lock
                              update map (no-op)
                              return

Because we always pwrite() exactly one block, we're depending on the OS
to guarantee that we end up with exactly one of the two writes as our
final state.  (For client transactions larger than a block, shredding
is still possible, but that was true before this patch; as with reads
above, avoiding overlapping I/O is the client's responsibility.)
Again, looks safe to me.

> +++ b/filters/cow/cow.c
> @@ -45,16 +45,17 @@
>  #include <nbdkit-filter.h>
>
>  #include "cleanup.h"
> -
> -#include "blk.h"
>  #include "isaligned.h"
>  #include "minmax.h"
>  #include "rounding.h"
>
> -/* In order to handle parallel requests safely, this lock must be held
> - * when calling any blk_* functions.
> +#include "blk.h"

Looks a bit odd to move the #include, but not wrong.

> +
> +/* Read-modify-write requests are serialized through this global lock.
> + * This is only used for unaligned requests which should be
> + * infrequent.
>   */

They would be even more infrequent if I ever got around to finishing my
patches on exposing block sizing from plugins and filters all the way
back to the client...  But not this patch's problem.

> @@ -588,12 +581,6 @@ cow_extents (struct nbdkit_next_ops *next_ops, void *nxdata,
>    assert (IS_ALIGNED (count, BLKSIZE));
>    assert (count > 0);         /* We must make forward progress. */
>
> -  /* We hold the lock for the whole time, even when requesting extents
> -   * from the plugin, because we want to present an atomic picture of
> -   * the current state.
> -   */
> -  ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
> -
>    while (count > 0) {
>      bool present, trimmed;
>      struct nbdkit_extent e;

Our extents picture is no longer atomic, but I don't think that makes
us any worse off.  I agree with your analysis that any read issued by
the client on one connection after a write/flush has completed on
another connection will see only the cached data.

The only race where we can see stale data on a read is when the read is
done in parallel with an outstanding write or flush - but those aren't
the cases we worry about when deciding whether multi-conn makes sense.
So I don't think anything in this patch interferes with my patch to
enable multi-conn.

ACK.  Looks like a nice job!

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.           +1-919-301-3226
Virtualization:  qemu.org | libvirt.org
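
A closing illustration of why the unaligned head/tail paths still need
serializing at all: each of those paths performs a read-modify-write of
a whole block, and two such cycles racing on the same block can
silently drop one of the updates.  The self-contained sketch below uses
plain pthreads and is not nbdkit code (shared_block, struct job and
unaligned_write are invented names); it keeps the whole RMW cycle under
a single rmw_lock, mirroring what cow.c now does, and removing the lock
re-introduces the lost-update window.

/* Lost-update illustration; assumes nothing from nbdkit itself. */
#include <pthread.h>
#include <stdio.h>
#include <string.h>

#define BLKSIZE 4096

static unsigned char shared_block[BLKSIZE];  /* stands in for one overlay block */
static pthread_mutex_t rmw_lock = PTHREAD_MUTEX_INITIALIZER;

struct job { size_t offset; size_t len; unsigned char fill; };

/* Emulate an unaligned write: read the whole block, patch a byte
 * range, write the whole block back.
 */
static void *
unaligned_write (void *arg)
{
  struct job *j = arg;
  unsigned char copy[BLKSIZE];

  pthread_mutex_lock (&rmw_lock);              /* serialize the whole cycle */
  memcpy (copy, shared_block, BLKSIZE);        /* read */
  memset (&copy[j->offset], j->fill, j->len);  /* modify */
  memcpy (shared_block, copy, BLKSIZE);        /* write back */
  pthread_mutex_unlock (&rmw_lock);
  return NULL;
}

int
main (void)
{
  pthread_t t1, t2;
  struct job a = { 0, BLKSIZE / 2, 0xaa };            /* unaligned head */
  struct job b = { BLKSIZE / 2, BLKSIZE / 2, 0xbb };  /* unaligned tail */

  pthread_create (&t1, NULL, unaligned_write, &a);
  pthread_create (&t2, NULL, unaligned_write, &b);
  pthread_join (t1, NULL);
  pthread_join (t2, NULL);

  /* With the lock held over the whole cycle this always prints
   * 0xaa / 0xbb; without it, one half can come back as 0.
   */
  printf ("first byte %#x, last byte %#x\n",
          shared_block[0], shared_block[BLKSIZE - 1]);
  return 0;
}

In the real filter the "read" step may come from the plugin and the
write back goes through blk_write(), but the hazard is the same, which
is why cow_pwrite(), cow_zero() and cow_trim() take rmw_lock around
their unaligned head and tail handling.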