Richard W.M. Jones
2022-Jan-25 11:26 UTC
[Libguestfs] [PATCH nbdkit] common/allocators: Always align mlock buffer
On platforms with 64K page sizes (RHEL + aarch64 or ppc64le) we have hit a problem in test-memory-allocator-malloc-mlock.sh which appears to be caused by: 1. The test requests a 2K mlocked buffer. 2. The allocated buffer straddles a page boundary sometimes. 3. Linux expands the mlock request to two pages, ie. the requested size becomes 2 * 64K = 128K 4. The standard mlock limit on Linux is 64K so the mlock request is rejected and the test fails. Also POSIX requires mlock parameters to be page aligned (although Linux does not require this). This commit attempts to fix both problems by ensuring the mlock request is always page aligned. On the separate mlock path we use the usual vector reserve function to do overflow checks and so on, but then we reallocate the buffer using posix_memalign (or valloc) so it is page aligned before locking it. This requires a duplicate allocation and memcpy but this path is not performance critical. I extended the test mlock size from 2K to 8K to try to provoke the original bug (if it still happens) more often. Thanks: Laszlo Ersek --- configure.ac | 4 +- common/allocators/malloc.c | 89 ++++++++++++++++++--- tests/test-memory-allocator-malloc-mlock.sh | 16 +++- 3 files changed, 94 insertions(+), 15 deletions(-) diff --git a/configure.ac b/configure.ac index c426aec8..1ab85e3e 100644 --- a/configure.ac +++ b/configure.ac @@ -384,7 +384,9 @@ AC_CHECK_FUNCS([\ pipe \ pipe2 \ ppoll \ - posix_fadvise]) + posix_fadvise \ + posix_memalign \ + valloc]) dnl Check for structs and members. AC_CHECK_MEMBERS([struct dirent.d_type], [], [], [[#include <dirent.h>]]) diff --git a/common/allocators/malloc.c b/common/allocators/malloc.c index eea44432..986a45c9 100644 --- a/common/allocators/malloc.c +++ b/common/allocators/malloc.c @@ -36,6 +36,9 @@ #include <stdlib.h> #include <stdbool.h> #include <string.h> +#include <unistd.h> +#include <errno.h> +#include <assert.h> #ifdef HAVE_SYS_MMAN_H #include <sys/mman.h> @@ -46,6 +49,7 @@ #include <nbdkit-plugin.h> #include "cleanup.h" +#include "rounding.h" #include "vector.h" #include "allocator.h" @@ -81,13 +85,46 @@ m_alloc_free (struct allocator *a) } } -/* Extend the underlying bytearray if needed. mlock if requested. */ +/* Extend the underlying bytearray if needed. */ static int -extend (struct m_alloc *ma, uint64_t new_size) +extend_without_mlock (struct m_alloc *ma, uint64_t new_size) { - ACQUIRE_WRLOCK_FOR_CURRENT_SCOPE (&ma->lock); size_t old_size, n; + if (ma->ba.cap < new_size) { + old_size = ma->ba.cap; + n = new_size - ma->ba.cap; + + if (bytearray_reserve (&ma->ba, n) == -1) { + nbdkit_error ("realloc: %m"); + return -1; + } + + /* Initialize the newly allocated memory to 0. */ + memset (ma->ba.ptr + old_size, 0, n); + } + + return 0; +} + +#ifdef HAVE_MLOCK +static int +extend_with_mlock (struct m_alloc *ma, uint64_t new_size) +{ + size_t old_size, n; + void *p; +#ifdef HAVE_POSIX_MEMALIGN + int r; +#endif + + long pagesize = sysconf (_SC_PAGE_SIZE); + assert (pagesize > 0); + + /* POSIX requires both base and size of mlock to be page aligned, + * even though Linux does not. + */ + new_size = ROUND_UP (new_size, pagesize); + if (ma->ba.cap < new_size) { old_size = ma->ba.cap; n = new_size - ma->ba.cap; @@ -96,30 +133,62 @@ extend (struct m_alloc *ma, uint64_t new_size) /* Since the memory might be moved by realloc, we must unlock the * original array. */ - if (ma->use_mlock) + if (ma->use_mlock && ma->ba.ptr != NULL) munlock (ma->ba.ptr, ma->ba.cap); #endif + /* Call the normal vector reserve function so that it does the + * overflow checks. + */ if (bytearray_reserve (&ma->ba, n) == -1) { nbdkit_error ("realloc: %m"); return -1; } + assert (new_size <= ma->ba.cap); /* Initialize the newly allocated memory to 0. */ memset (ma->ba.ptr + old_size, 0, n); -#ifdef HAVE_MLOCK - if (ma->use_mlock) { - if (mlock (ma->ba.ptr, ma->ba.cap) == -1) { - nbdkit_error ("allocator=malloc: mlock: %m"); - return -1; - } + /* But now reallocate it as a page-aligned buffer so we can mlock it. */ +#ifdef HAVE_POSIX_MEMALIGN + if ((r = posix_memalign (&p, pagesize, ma->ba.cap)) != 0) { + errno = r; + nbdkit_error ("posix_memalign: %m"); + return -1; } +#elif HAVE_VALLOC + p = valloc (ma->ba.cap); + if (p == NULL) { + nbdkit_error ("valloc: %m"); + return -1; + } +#else +#error "this platform does not have posix_memalign or valloc" #endif + + memcpy (p, ma->ba.ptr, new_size); + if (mlock (ma->ba.ptr, new_size) == -1) { + nbdkit_error ("allocator=malloc: mlock: %m"); + return -1; + } } return 0; } +#endif /* HAVE_MLOCK */ + +static int +extend (struct m_alloc *ma, uint64_t new_size) +{ + ACQUIRE_WRLOCK_FOR_CURRENT_SCOPE (&ma->lock); + +#ifdef HAVE_MLOCK + if (ma->use_mlock) + return extend_with_mlock (ma, new_size); +#endif + + return extend_without_mlock (ma, new_size); +} static int m_alloc_set_size_hint (struct allocator *a, uint64_t size_hint) diff --git a/tests/test-memory-allocator-malloc-mlock.sh b/tests/test-memory-allocator-malloc-mlock.sh index 4ef92db6..983ef22b 100755 --- a/tests/test-memory-allocator-malloc-mlock.sh +++ b/tests/test-memory-allocator-malloc-mlock.sh @@ -46,9 +46,9 @@ if ! nbdkit memory --dump-plugin | grep -sq mlock=yes; then fi # ulimit -l is measured in kilobytes and so for this test must be at -# least 2 (kilobytes) and we actually check it's a bit larger to allow +# least 8 (kilobytes) and we actually check it's a bit larger to allow # room for error. On Linux the default is usually 64. -requires test `ulimit -l` -gt 8 +requires test `ulimit -l` -gt 16 sock=$(mktemp -u /tmp/nbdkit-test-sock.XXXXXX) files="memory-allocator-malloc-mlock.pid $sock" @@ -57,17 +57,21 @@ cleanup_fn rm -f $files # Run nbdkit with memory plugin. start_nbdkit -P memory-allocator-malloc-mlock.pid -U $sock \ - memory 2048 allocator=malloc,mlock=true + memory 8192 allocator=malloc,mlock=true nbdsh --connect "nbd+unix://?socket=$sock" \ -c ' -# Write some stuff to the beginning, middle and end. +# Write some stuff. buf1 = b"1" * 512 h.pwrite(buf1, 0) buf2 = b"2" * 512 h.pwrite(buf2, 1024) buf3 = b"3" * 512 h.pwrite(buf3, 1536) +buf4 = b"4" * 512 +h.pwrite(buf4, 4096) +buf5 = b"5" * 1024 +h.pwrite(buf5, 8192-1024) # Read it back. buf11 = h.pread(len(buf1), 0) @@ -76,4 +80,8 @@ buf22 = h.pread(len(buf2), 1024) assert buf2 == buf22 buf33 = h.pread(len(buf3), 1536) assert buf3 == buf33 +buf44 = h.pread(len(buf4), 4096) +assert buf4 == buf44 +buf55 = h.pread(len(buf5), 8192-1024) +assert buf5 == buf55 ' -- 2.32.0
Laszlo Ersek
2022-Jan-26 10:26 UTC
[Libguestfs] [PATCH nbdkit] common/allocators: Always align mlock buffer
On 01/25/22 12:26, Richard W.M. Jones wrote:> On platforms with 64K page sizes (RHEL + aarch64 or ppc64le) we have > hit a problem in test-memory-allocator-malloc-mlock.sh which appears > to be caused by: > > 1. The test requests a 2K mlocked buffer. > > 2. The allocated buffer straddles a page boundary sometimes. > > 3. Linux expands the mlock request to two pages, ie. the requested > size becomes 2 * 64K = 128K > > 4. The standard mlock limit on Linux is 64K so the mlock request is > rejected and the test fails. > > Also POSIX requires mlock parameters to be page aligned (although > Linux does not require this).(1) This is not precise; POSIX says that the implementation *may* require the *base address* to be page aligned.> > This commit attempts to fix both problems by ensuring the mlock > request is always page aligned. On the separate mlock path we use the > usual vector reserve function to do overflow checks and so on, but > then we reallocate the buffer using posix_memalign (or valloc) so it > is page aligned before locking it. This requires a duplicate > allocation and memcpy but this path is not performance critical. > > I extended the test mlock size from 2K to 8K to try to provoke the > original bug (if it still happens) more often. > > Thanks: Laszlo Ersek > --- > configure.ac | 4 +- > common/allocators/malloc.c | 89 ++++++++++++++++++--- > tests/test-memory-allocator-malloc-mlock.sh | 16 +++- > 3 files changed, 94 insertions(+), 15 deletions(-) > > diff --git a/configure.ac b/configure.ac > index c426aec8..1ab85e3e 100644 > --- a/configure.ac > +++ b/configure.ac > @@ -384,7 +384,9 @@ AC_CHECK_FUNCS([\ > pipe \ > pipe2 \ > ppoll \ > - posix_fadvise]) > + posix_fadvise \ > + posix_memalign \ > + valloc]) > > dnl Check for structs and members. > AC_CHECK_MEMBERS([struct dirent.d_type], [], [], [[#include <dirent.h>]]) > diff --git a/common/allocators/malloc.c b/common/allocators/malloc.c > index eea44432..986a45c9 100644 > --- a/common/allocators/malloc.c > +++ b/common/allocators/malloc.c > @@ -36,6 +36,9 @@ > #include <stdlib.h> > #include <stdbool.h> > #include <string.h> > +#include <unistd.h> > +#include <errno.h> > +#include <assert.h> > > #ifdef HAVE_SYS_MMAN_H > #include <sys/mman.h> > @@ -46,6 +49,7 @@ > #include <nbdkit-plugin.h> > > #include "cleanup.h" > +#include "rounding.h" > #include "vector.h" > > #include "allocator.h" > @@ -81,13 +85,46 @@ m_alloc_free (struct allocator *a) > } > } > > -/* Extend the underlying bytearray if needed. mlock if requested. */ > +/* Extend the underlying bytearray if needed. */ > static int > -extend (struct m_alloc *ma, uint64_t new_size) > +extend_without_mlock (struct m_alloc *ma, uint64_t new_size) > { > - ACQUIRE_WRLOCK_FOR_CURRENT_SCOPE (&ma->lock); > size_t old_size, n; > > + if (ma->ba.cap < new_size) { > + old_size = ma->ba.cap; > + n = new_size - ma->ba.cap; > + > + if (bytearray_reserve (&ma->ba, n) == -1) { > + nbdkit_error ("realloc: %m"); > + return -1; > + } > + > + /* Initialize the newly allocated memory to 0. */ > + memset (ma->ba.ptr + old_size, 0, n); > + } > + > + return 0; > +} > + > +#ifdef HAVE_MLOCK > +static int > +extend_with_mlock (struct m_alloc *ma, uint64_t new_size) > +{ > + size_t old_size, n; > + void *p; > +#ifdef HAVE_POSIX_MEMALIGN > + int r; > +#endif > + > + long pagesize = sysconf (_SC_PAGE_SIZE); > + assert (pagesize > 0); > + > + /* POSIX requires both base and size of mlock to be page aligned, > + * even though Linux does not. > + */ > + new_size = ROUND_UP (new_size, pagesize);(2) Same comment as (1). So this is likely unnecessary (but not necessarily wrong either).> + > if (ma->ba.cap < new_size) { > old_size = ma->ba.cap; > n = new_size - ma->ba.cap; > @@ -96,30 +133,62 @@ extend (struct m_alloc *ma, uint64_t new_size) > /* Since the memory might be moved by realloc, we must unlock the > * original array. > */ > - if (ma->use_mlock) > + if (ma->use_mlock && ma->ba.ptr != NULL)(3) I don't understand the nullity check on "ma->ba.ptr".> munlock (ma->ba.ptr, ma->ba.cap); > #endif > > + /* Call the normal vector reserve function so that it does the > + * overflow checks. > + */ > if (bytearray_reserve (&ma->ba, n) == -1) { > nbdkit_error ("realloc: %m"); > return -1; > } > + assert (new_size <= ma->ba.cap); > > /* Initialize the newly allocated memory to 0. */ > memset (ma->ba.ptr + old_size, 0, n); > > -#ifdef HAVE_MLOCK > - if (ma->use_mlock) { > - if (mlock (ma->ba.ptr, ma->ba.cap) == -1) { > - nbdkit_error ("allocator=malloc: mlock: %m"); > - return -1; > - } > + /* But now reallocate it as a page-aligned buffer so we can mlock it. */ > +#ifdef HAVE_POSIX_MEMALIGN > + if ((r = posix_memalign (&p, pagesize, ma->ba.cap)) != 0) { > + errno = r; > + nbdkit_error ("posix_memalign: %m"); > + return -1; > } > +#elif HAVE_VALLOC > + p = valloc (ma->ba.cap); > + if (p == NULL) { > + nbdkit_error ("valloc: %m"); > + return -1; > + } > +#else > +#error "this platform does not have posix_memalign or valloc" > #endif > + > + memcpy (p, ma->ba.ptr, new_size); > + if (mlock (ma->ba.ptr, new_size) == -1) {(4) I don't understand the mlock() call. The memcpy() copies the extended byte-array's contents from the original location to the aligned allocation. But then we don't lock "p", we lock the original byte-array's base address again. I think this was not found by your testing because the size is rounded up, and that masks the problem with realloc() inside the generic vector implementation. (5) Even if we mlocked "p", I still don't understand how the aligned allocation would be saved in "ma". (6) In connection, how does m_alloc_free() deal with the aligned allocation?> + nbdkit_error ("allocator=malloc: mlock: %m"); > + return -1; > + } > } > > return 0; > } > +#endif /* HAVE_MLOCK */ > + > +static int > +extend (struct m_alloc *ma, uint64_t new_size) > +{ > + ACQUIRE_WRLOCK_FOR_CURRENT_SCOPE (&ma->lock); > + > +#ifdef HAVE_MLOCK > + if (ma->use_mlock) > + return extend_with_mlock (ma, new_size); > +#endif > + > + return extend_without_mlock (ma, new_size); > +} > > static int > m_alloc_set_size_hint (struct allocator *a, uint64_t size_hint) > diff --git a/tests/test-memory-allocator-malloc-mlock.sh b/tests/test-memory-allocator-malloc-mlock.sh > index 4ef92db6..983ef22b 100755 > --- a/tests/test-memory-allocator-malloc-mlock.sh > +++ b/tests/test-memory-allocator-malloc-mlock.sh > @@ -46,9 +46,9 @@ if ! nbdkit memory --dump-plugin | grep -sq mlock=yes; then > fi > > # ulimit -l is measured in kilobytes and so for this test must be at > -# least 2 (kilobytes) and we actually check it's a bit larger to allow > +# least 8 (kilobytes) and we actually check it's a bit larger to allow > # room for error. On Linux the default is usually 64. > -requires test `ulimit -l` -gt 8 > +requires test `ulimit -l` -gt 16 > > sock=$(mktemp -u /tmp/nbdkit-test-sock.XXXXXX) > files="memory-allocator-malloc-mlock.pid $sock" > @@ -57,17 +57,21 @@ cleanup_fn rm -f $files > > # Run nbdkit with memory plugin. > start_nbdkit -P memory-allocator-malloc-mlock.pid -U $sock \ > - memory 2048 allocator=malloc,mlock=true > + memory 8192 allocator=malloc,mlock=true > > nbdsh --connect "nbd+unix://?socket=$sock" \ > -c ' > -# Write some stuff to the beginning, middle and end. > +# Write some stuff. > buf1 = b"1" * 512 > h.pwrite(buf1, 0) > buf2 = b"2" * 512 > h.pwrite(buf2, 1024) > buf3 = b"3" * 512 > h.pwrite(buf3, 1536) > +buf4 = b"4" * 512 > +h.pwrite(buf4, 4096) > +buf5 = b"5" * 1024 > +h.pwrite(buf5, 8192-1024) > > # Read it back. > buf11 = h.pread(len(buf1), 0) > @@ -76,4 +80,8 @@ buf22 = h.pread(len(buf2), 1024) > assert buf2 == buf22 > buf33 = h.pread(len(buf3), 1536) > assert buf3 == buf33 > +buf44 = h.pread(len(buf4), 4096) > +assert buf4 == buf44 > +buf55 = h.pread(len(buf5), 8192-1024) > +assert buf5 == buf55 > ' >(7) For consistency, we should perhaps replace 8192-1024 with 8192-len(buf5) Thanks, Laszlo