Richard W.M. Jones
2022-Aug-25 12:10 UTC
[Libguestfs] [PATCH libnbd] ublk: Add new nbdublk program
This patch adds simple support for a ublk-based NBD client. It is also available here:

  https://gitlab.com/rwmjones/libnbd/-/tree/nbdublk/ublk

ublk is a way to write Linux block device drivers in userspace:

  https://lwn.net/Articles/903855/

For simplicity of implementation and because I don't currently understand the thread model of ublksrv, this only implements synchronous calls for now. It should be possible to extend this to a fully asynchronous client without too much difficulty.

It does appear to work, at least for simple cases. I have created filesystems, files, etc. on a ublk device backed by an nbdkit RAM disk, e.g.:

On one machine do:

  $ nbdkit memory 1G

On the client machine with the right kernel etc. [see below] do:

  # modprobe ublk_drv
  # nbdublk /dev/ublkb0 nbd://remote
  # ublk list
  # blockdev --getsize64 /dev/ublkb0
  # mke2fs /dev/ublkb0
  # ...
  # ublk del -n 0

Testing this is not for the fainthearted. I would start with a throwaway Fedora Rawhide virtual machine, fully upgraded.

You will need to recompile the kernel with CONFIG_BLK_DEV_UBLK=m

You will need to upgrade to liburing 2.2 (I pushed this to Rawhide a few days ago).

You will need to download & compile: https://github.com/ming1/ubdsrv

Apply this patch to libnbd and compile it with:

  export PKG_CONFIG_PATH=$HOME/ubdsrv
  export CFLAGS="$CFLAGS -I$HOME/ubdsrv/include"
  export CXXFLAGS="$CXXFLAGS -I$HOME/ubdsrv/include"
  export LDFLAGS="$LDFLAGS -L$HOME/ubdsrv/lib"
  ./configure
  make

(Check that ublk dependencies are found and nbdublk is compiled)

You will then be able to run nbdublk from the compile directory using:

  sudo ./run nbdublk --help

Rich.
Richard W.M. Jones
2022-Aug-25 12:10 UTC
[Libguestfs] [PATCH libnbd] ublk: Add new nbdublk program
--- .gitignore | 3 + Makefile.am | 3 +- README.md | 2 + bash-completion/Makefile.am | 8 +- bash-completion/nbdsh | 6 + configure.ac | 20 ++ copy/nbdcopy.pod | 1 + docs/libnbd.pod | 1 + fuse/nbdfuse.pod | 1 + info/nbdinfo.pod | 1 + run.in | 1 + sh/nbdsh.pod | 1 + ublk/Makefile.am | 68 +++++ ublk/nbdublk.c | 569 ++++++++++++++++++++++++++++++++++++ ublk/nbdublk.h | 47 +++ ublk/nbdublk.pod | 228 +++++++++++++++ ublk/not.cpp | 23 ++ ublk/tgt.c | 332 +++++++++++++++++++++ 18 files changed, 1312 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index bd4650dd77..071393fbf6 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,7 @@ Makefile.in /bash-completion/nbddump /bash-completion/nbdfuse /bash-completion/nbdinfo +/bash-completion/nbdublk /common/include/test-array-size /common/include/test-checked-overflow /common/include/test-ispowerof2 @@ -240,4 +241,6 @@ Makefile.in /tests/shutdown-flags /tests/synch-parallel /tests/synch-parallel-tls +/ublk/nbdublk +/ublk/nbdublk.1 /valgrind/suppressions diff --git a/Makefile.am b/Makefile.am index 09a56db04b..dab4ffab46 100644 --- a/Makefile.am +++ b/Makefile.am @@ -49,6 +49,7 @@ SUBDIRS = \ copy \ dump \ fuse \ + ublk \ ocaml \ ocaml/examples \ ocaml/tests \ @@ -81,7 +82,7 @@ maintainer-check-extra-dist: @echo PASS: EXTRA_DIST tests check-valgrind: all - @for d in tests info copy fuse ocaml/tests interop; do \ + @for d in tests info copy fuse ublk ocaml/tests interop; do \ $(MAKE) -C $$d check-valgrind || exit 1; \ done diff --git a/README.md b/README.md index 9e9169a467..b4aa0d5a42 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ The key features are: * Hexdump tool (nbddump) to print NBD content. * Query tool (nbdinfo) to query NBD servers. * FUSE support (nbdfuse) to mount NBD in the local filesystem. +* Linux ublk support (nbdublk) to create the userspace block device. For documentation, see the [docs](docs/) and [examples](examples/) subdirectories. 
@@ -103,6 +104,7 @@ Optional: * OCaml and ocamlfind are both needed to generate the OCaml bindings. * Python >= 3.3 to build the Python 3 bindings and NBD shell (nbdsh). * FUSE 3 to build the nbdfuse program. +* Linux >= 6.0 and ublksrv library to build nbdublk program. * go and cgo, for compiling the golang bindings and tests. * bash-completion >= 1.99 for tab completion. diff --git a/bash-completion/Makefile.am b/bash-completion/Makefile.am index cab8ffbd8c..dc5d98c75f 100644 --- a/bash-completion/Makefile.am +++ b/bash-completion/Makefile.am @@ -24,7 +24,7 @@ EXTRA_DIST = \ if HAVE_BASH_COMPLETION -bashcomp_DATA = nbddump nbdfuse nbdsh +bashcomp_DATA = nbddump nbdfuse nbdsh nbdublk if HAVE_LIBXML2 bashcomp_DATA += nbdcopy nbdinfo @@ -46,6 +46,10 @@ nbdinfo: nbdsh rm -f $@ $(LN_S) $(srcdir)/nbdsh $@ -CLEANFILES += nbdcopy nbddump nbdfuse nbdinfo +nbdublk: nbdsh + rm -f $@ + $(LN_S) $(srcdir)/nbdsh $@ + +CLEANFILES += nbdcopy nbddump nbdfuse nbdinfo nbdublk endif diff --git a/bash-completion/nbdsh b/bash-completion/nbdsh index a3420038a1..bba0b46fc7 100644 --- a/bash-completion/nbdsh +++ b/bash-completion/nbdsh @@ -67,9 +67,15 @@ _nbdsh () _libnbd_command nbdsh } +_nbdublk () +{ + _libnbd_command nbdublk +} + # Install the handler function. complete -o default -F _nbdcopy nbdcopy complete -o default -F _nbddump nbddump complete -o default -F _nbdfuse nbdfuse complete -o default -F _nbdinfo nbdinfo complete -o default -F _nbdsh nbdsh +complete -o default -F _nbdublk nbdublk diff --git a/configure.ac b/configure.ac index dbb4d250f7..135a2bb0fa 100644 --- a/configure.ac +++ b/configure.ac @@ -337,6 +337,24 @@ AS_IF([test "x$enable_fuse" != "xno"],[ ]) AM_CONDITIONAL([HAVE_FUSE],[test "x$enable_fuse" != "xno"]) +dnl libublksrv is optional to build the nbdublk program. 
+AC_ARG_ENABLE([ublk], + AS_HELP_STRING([--disable-ublk], [disable ublk (nbdublk) support]), + [], + [enable_ublk=yes]) +AS_IF([test "x$enable_ublk" != "xno"],[ + PKG_CHECK_MODULES([UBLKSRV],[ublksrv],[ + printf "ublksrv version is "; $PKG_CONFIG --modversion ublksrv + AC_SUBST([UBLKSRV_CFLAGS]) + AC_SUBST([UBLKSRV_LIBS]) + AC_DEFINE([HAVE_UBLK],[1],[Define to 1 if you have ublk.]) + ],[ + enable_ublk=no + AC_MSG_WARN([libublksrv (ublk server) library and headers are missing, so optional nbdublk program won't be built]) + ]) +]) +AM_CONDITIONAL([HAVE_UBLK],[test "x$enable_ublk" != "xno"]) + dnl Check we have enough to run podwrapper. AC_CHECK_PROG([PERL],[perl],[perl],[no]) AS_IF([test "x$PERL" != "xno"],[ @@ -605,6 +623,7 @@ AC_CONFIG_FILES([Makefile sh/Makefile tests/Makefile tests/functions.sh + ublk/Makefile valgrind/Makefile]) AC_OUTPUT @@ -640,6 +659,7 @@ echo feature "TLS support" test "x$HAVE_GNUTLS_TRUE" = "x" feature "NBD URI support" test "x$HAVE_LIBXML2_TRUE" = "x" feature "FUSE support" test "x$HAVE_FUSE_TRUE" = "x" +feature "ublk support" test "x$HAVE_UBLK_TRUE" = "x" feature "Manual pages" test "x$HAVE_POD_TRUE" = "x" feature "Bash tab completion" test "x$HAVE_BASH_COMPLETION_TRUE" = "x" diff --git a/copy/nbdcopy.pod b/copy/nbdcopy.pod index f06d1123a7..dc4e8dd428 100644 --- a/copy/nbdcopy.pod +++ b/copy/nbdcopy.pod @@ -304,6 +304,7 @@ L<nbddump(1)>, L<nbdfuse(1)>, L<nbdinfo(1)>, L<nbdsh(1)>, +L<nbdublk(1)>, L<nbdkit(1)>, L<qemu-img(1)>. diff --git a/docs/libnbd.pod b/docs/libnbd.pod index dd880c3bff..7a01179a68 100644 --- a/docs/libnbd.pod +++ b/docs/libnbd.pod @@ -1061,6 +1061,7 @@ L<nbddump(1)>, L<nbdfuse(1)>, L<nbdinfo(1)>, L<nbdsh(1)>, +L<nbdublk(1)>, L<qemu(1)>. 
=head1 AUTHORS diff --git a/fuse/nbdfuse.pod b/fuse/nbdfuse.pod index daa79c1050..6d23340df5 100644 --- a/fuse/nbdfuse.pod +++ b/fuse/nbdfuse.pod @@ -415,6 +415,7 @@ L<nbdcopy(1)>, L<nbddump(1)>, L<nbdinfo(1)>, L<nbdsh(1)>, +L<nbdublk(1)>, L<fusermount3(1)>, L<mount.fuse3(8)>, L<nbd_connect_uri(3)>, diff --git a/info/nbdinfo.pod b/info/nbdinfo.pod index 7dfb9edb60..c3ec3ee73d 100644 --- a/info/nbdinfo.pod +++ b/info/nbdinfo.pod @@ -421,6 +421,7 @@ L<nbdcopy(1)>, L<nbddump(1)>, L<nbdfuse(1)>, L<nbdsh(1)>, +L<nbdublk(1)>, L<file(1)>, L<jq(1)>, L<qemu-img(1)>, diff --git a/run.in b/run.in index 89226fd0e9..38c80db233 100755 --- a/run.in +++ b/run.in @@ -62,6 +62,7 @@ prepend PATH "$b/dump" prepend PATH "$b/fuse" prepend PATH "$b/info" prepend PATH "$b/sh" +prepend PATH "$b/ublk" export PATH # Set LD_LIBRARY_PATH and DYLD_LIBRARY_PATH to contain library. diff --git a/sh/nbdsh.pod b/sh/nbdsh.pod index c9dac4a7ca..4d14118cd4 100644 --- a/sh/nbdsh.pod +++ b/sh/nbdsh.pod @@ -149,6 +149,7 @@ L<libnbd-security(3)>, L<nbdcopy(1)>, L<nbddump(1)>, L<nbdfuse(1)>, +L<nbdublk(1)>, L<nbdinfo(1)>, L<qemu-img(1)>. diff --git a/ublk/Makefile.am b/ublk/Makefile.am new file mode 100644 index 0000000000..d3e1328ec6 --- /dev/null +++ b/ublk/Makefile.am @@ -0,0 +1,68 @@ +# nbd client library in userspace +# Copyright (C) 2013-2022 Red Hat Inc. +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +include $(top_srcdir)/subdir-rules.mk + +EXTRA_DIST = \ + nbdublk.pod \ + $(NULL) + +TESTS_ENVIRONMENT = \ + LIBNBD_DEBUG=1 \ + $(MALLOC_CHECKS) \ + EXPECTED_VERSION=$(VERSION) \ + $(NULL) +LOG_COMPILER = $(top_builddir)/run +TESTS + +if HAVE_UBLK + +bin_PROGRAMS = nbdublk + +nbdublk_SOURCES = \ + nbdublk.c \ + nbdublk.h \ + tgt.c \ + not.cpp \ + $(NULL) +nbdublk_CPPFLAGS = \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/common/include \ + -I$(top_srcdir)/common/utils \ + $(NULL) +nbdublk_CFLAGS = $(WARNINGS_CFLAGS) $(UBLKSRV_CFLAGS) +nbdublk_CXXFLAGS = $(WARNINGS_CFLAGS) $(UBLKSRV_CFLAGS) +nbdublk_LDADD = \ + $(top_builddir)/common/utils/libutils.la \ + $(top_builddir)/lib/libnbd.la \ + $(UBLKSRV_LIBS) \ + $(NULL) + +if HAVE_POD + +man_MANS = \ + nbdublk.1 \ + $(NULL) + +nbdublk.1: nbdublk.pod $(top_builddir)/podwrapper.pl + $(PODWRAPPER) --section=1 --man $@ \ + --html $(top_builddir)/html/$@.html \ + $< + +endif HAVE_POD + +endif HAVE_UBLK diff --git a/ublk/nbdublk.c b/ublk/nbdublk.c new file mode 100644 index 0000000000..e5c7bf0253 --- /dev/null +++ b/ublk/nbdublk.c @@ -0,0 +1,569 @@ +/* NBD client library in userspace + * Copyright (C) 2013-2022 Red Hat Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* ublk support. */ + +#include <config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <limits.h> +#include <signal.h> +#include <getopt.h> + +#include <ublksrv.h> + +#include <libnbd.h> + +#include "nbdublk.h" + +#include "ispowerof2.h" +#include "vector.h" +#include "version.h" + +#define DEVICE_PREFIX "/dev/ublkb" +#define DEVICE_PREFIX_LEN 10 + +handles nbd = empty_vector; +unsigned connections = 4; +bool readonly = false; +bool rotational; +bool can_fua; +uint64_t size; +uint64_t min_block_size; +uint64_t pref_block_size; +bool verbose = false; + +/* The single control device. This is a global so the signal handler + * can attempt to stop the device. + */ +static struct ublksrv_ctrl_dev *dev; + +enum mode { + MODE_URI, /* URI */ + MODE_COMMAND, /* --command */ + MODE_FD, /* --fd */ + MODE_SQUARE_BRACKET, /* [ CMD ], same as --socket-activation*/ + MODE_SOCKET_ACTIVATION, /* --socket-activation */ + MODE_TCP, /* --tcp */ + MODE_UNIX, /* --unix */ + MODE_VSOCK, /* --vsock */ +}; + +static void __attribute__((noreturn)) +usage (FILE *fp, int exitcode) +{ + fprintf (fp, +"\n" +"Mount NBD server as a virtual device:\n" +"\n" +#ifdef HAVE_LIBXML2 +" nbdublk [-C N|--connections N] [-r] [-v|--verbose]\n" +" " DEVICE_PREFIX "<N> URI\n" +"\n" +"Other modes:\n" +"\n" +#endif +" nbdublk " DEVICE_PREFIX "<N> [ CMD [ARGS ...] 
]\n" +" nbdublk " DEVICE_PREFIX "<N> --command CMD [ARGS ...]\n" +" nbdublk " DEVICE_PREFIX "<N> --fd N\n" +" nbdublk " DEVICE_PREFIX "<N> --tcp HOST PORT\n" +" nbdublk " DEVICE_PREFIX "<N> --unix SOCKET\n" +" nbdublk " DEVICE_PREFIX "<N> --vsock CID PORT\n" +"\n" +"You can also use just the device number or '-' to allocate one:\n" +"\n" +" nbdublk <N> ...\n" +" nbdublk - ...\n" +"\n" +"To unmount:\n" +"\n" +" ublk del -n <N>\n" +"\n" +"Other options:\n" +"\n" +" nbdublk --help\n" +" nbdublk -V|--version\n" +"\n" +"Please read the nbdublk(1) manual page for full usage.\n" +"\n" +); + exit (exitcode); +} + +/* Which modes support multi-conn? We cannot connect multiple times + * to subprocesses (since we'd have to launch multiple subprocesses). + */ +static bool +mode_is_multi_conn_compatible (enum mode mode) +{ + switch (mode) { + case MODE_COMMAND: + case MODE_SQUARE_BRACKET: + case MODE_SOCKET_ACTIVATION: + case MODE_FD: + return false; + case MODE_URI: + case MODE_TCP: + case MODE_UNIX: + case MODE_VSOCK: + return true; + default: + abort (); + } +} + +static struct nbd_handle *create_and_connect (enum mode mode, + int argc, char **argv); +static void signal_handler (int sig); + +int +main (int argc, char *argv[]) +{ + enum mode mode = MODE_URI; + enum { + HELP_OPTION = CHAR_MAX + 1, + LONG_OPTIONS, + SHORT_OPTIONS, + }; + /* Note the "+" means we stop processing as soon as we get to the + * first non-option argument (the device) and then we parse the rest + * of the command line without getopt. 
+ */ + const char *short_options = "+C:rvV"; + const struct option long_options[] = { + { "help", no_argument, NULL, HELP_OPTION }, + { "long-options", no_argument, NULL, LONG_OPTIONS }, + { "connections", required_argument, NULL, 'C' }, + { "readonly", no_argument, NULL, 'r' }, + { "read-only", no_argument, NULL, 'r' }, + { "short-options", no_argument, NULL, SHORT_OPTIONS }, + { "verbose", no_argument, NULL, 'v' }, + { "version", no_argument, NULL, 'V' }, + + { NULL } + }; + int c, r; + size_t i; + struct nbd_handle *h; + int64_t rs; + uint64_t max_block_size; + const char *s; + struct ublksrv_dev_data data = { .dev_id = -1 }; + struct sigaction sa = { 0 }; + + for (;;) { + c = getopt_long (argc, argv, short_options, long_options, NULL); + if (c == -1) + break; + + switch (c) { + case HELP_OPTION: + usage (stdout, EXIT_SUCCESS); + + case LONG_OPTIONS: + for (i = 0; long_options[i].name != NULL; ++i) { + if (strcmp (long_options[i].name, "long-options") != 0 && + strcmp (long_options[i].name, "short-options") != 0) + printf ("--%s\n", long_options[i].name); + } + exit (EXIT_SUCCESS); + + case SHORT_OPTIONS: + for (i = 0; short_options[i]; ++i) { + if (short_options[i] != ':' && short_options[i] != '+') + printf ("-%c\n", short_options[i]); + } + exit (EXIT_SUCCESS); + + case 'C': + if (sscanf (optarg, "%u", &connections) != 1 || + connections < 1 || connections > 1024) { + fprintf (stderr, "%s: --connections parameter must be an unsigned integer >= 1\n", + argv[0]); + exit (EXIT_FAILURE); + } + break; + + case 'r': + readonly = true; + break; + + case 'v': + verbose = true; + break; + + case 'V': + display_version ("nbdublk"); + exit (EXIT_SUCCESS); + + default: + usage (stderr, EXIT_FAILURE); + } + } + + /* There must be at least 2 parameters (device and + * URI/--command/etc). + */ + if (argc - optind < 2) + usage (stderr, EXIT_FAILURE); + + /* Parse and check the device name. 
*/ + s = argv[optind++]; + /* /dev/ublkc<N> */ + if (strncmp (s, DEVICE_PREFIX, DEVICE_PREFIX_LEN) == 0) { + if (sscanf (&s[DEVICE_PREFIX_LEN], "%u", &data.dev_id) != 1) { + fprintf (stderr, "%s: could not parse ublk device name: %s\n", + argv[0], s); + exit (EXIT_FAILURE); + } + } + else if (s[0] >= '0' && s[0] <= '9') { + if (sscanf (s, "%u", &data.dev_id) != 1) { + fprintf (stderr, "%s: could not parse ublk device name: %s\n", + argv[0], s); + exit (EXIT_FAILURE); + } + } + else if (s[0] == '-') { + data.dev_id = -1; /* autoallocate */ + } + else { + fprintf (stderr, "%s: expecting device name %s<N>\n", + argv[0], DEVICE_PREFIX); + exit (EXIT_FAILURE); + } + + /* The next parameter is either a URI or a mode switch. */ + if (strcmp (argv[optind], "--command") == 0 || + strcmp (argv[optind], "--cmd") == 0) { + mode = MODE_COMMAND; + optind++; + } + else if (strcmp (argv[optind], "[") == 0) { + mode = MODE_SQUARE_BRACKET; + optind++; + } + else if (strcmp (argv[optind], "--socket-activation") == 0 || + strcmp (argv[optind], "--systemd-socket-activation") == 0) { + mode = MODE_SOCKET_ACTIVATION; + optind++; + } + else if (strcmp (argv[optind], "--fd") == 0) { + mode = MODE_FD; + optind++; + } + else if (strcmp (argv[optind], "--tcp") == 0) { + mode = MODE_TCP; + optind++; + } + else if (strcmp (argv[optind], "--unix") == 0) { + mode = MODE_UNIX; + optind++; + } + else if (strcmp (argv[optind], "--vsock") == 0) { + mode = MODE_VSOCK; + optind++; + } + /* This is undocumented, but allow either URI or --uri URI. 
*/ + else if (strcmp (argv[optind], "--uri") == 0) { + mode = MODE_URI; + optind++; + } + else if (argv[optind][0] == '-') { + fprintf (stderr, "%s: unknown mode: %s\n", argv[0], argv[optind]); + usage (stderr, EXIT_FAILURE); + } + +#ifndef HAVE_LIBXML2 + if (mode == MODE_URI) { + fprintf (stderr, "%s: URIs are not supported in this build of libnbd\n", + argv[0]); + exit (EXIT_FAILURE); + } +#endif + + /* Check there are enough parameters following given the mode. */ + switch (mode) { + case MODE_URI: + case MODE_FD: + case MODE_UNIX: + if (argc - optind != 1) + usage (stderr, EXIT_FAILURE); + break; + case MODE_TCP: + case MODE_VSOCK: + if (argc - optind != 2) + usage (stderr, EXIT_FAILURE); + break; + case MODE_COMMAND: + case MODE_SOCKET_ACTIVATION: + if (argc - optind < 1) + usage (stderr, EXIT_FAILURE); + break; + case MODE_SQUARE_BRACKET: + if (argc - optind < 2 || strcmp (argv[argc-1], "]") != 0) + usage (stderr, EXIT_FAILURE); + break; + } + /* At this point we know the command line is valid, and so can start + * opening FUSE and libnbd. + */ + + /* Create the libnbd handle and connect to it. */ + h = create_and_connect (mode, argc, argv); + if (handles_append (&nbd, h) == -1) { + perror ("realloc"); + exit (EXIT_FAILURE); + } + + /* If the server supports multi-conn, and we are able to, try to + * open more handles. + */ + if (connections > 1 && + mode_is_multi_conn_compatible (mode) && + nbd_can_multi_conn (nbd.ptr[0]) >= 1) { + if (handles_reserve (&nbd, connections-1) == -1) { + perror ("realloc"); + exit (EXIT_FAILURE); + } + for (i = 2; i <= connections; ++i) { + h = create_and_connect (mode, argc, argv); + handles_append (&nbd, h); /* reserved above, so can't fail */ + } + } + connections = (unsigned) nbd.len; + + /* Get the size and preferred block sizes. 
*/ + rs = nbd_get_size (nbd.ptr[0]); + if (rs == -1) { + fprintf (stderr, "%s\n", nbd_get_error ()); + exit (EXIT_FAILURE); + } + size = (uint64_t) rs; + + rs = nbd_get_block_size (nbd.ptr[0], LIBNBD_SIZE_MAXIMUM); + if (rs <= 0 || rs > 64 * 1024 * 1024) + max_block_size = 64 * 1024 * 1024; + else + max_block_size = rs; + if (!is_power_of_2 (max_block_size)) { + fprintf (stderr, + "%s: %s block size is not a power of two: %" PRIu64 "\n", + argv[0], "maximum", max_block_size); + exit (EXIT_FAILURE); + } + + rs = nbd_get_block_size (nbd.ptr[0], LIBNBD_SIZE_PREFERRED); + if (rs <= 0) + pref_block_size = 4096; + else + pref_block_size = rs; + if (!is_power_of_2 (pref_block_size)) { + fprintf (stderr, + "%s: %s block size is not a power of two: %" PRIu64 "\n", + argv[0], "preferred", pref_block_size); + exit (EXIT_FAILURE); + } + + rs = nbd_get_block_size (nbd.ptr[0], LIBNBD_SIZE_MINIMUM); + if (rs <= 0) + min_block_size = 512; /* minimum that the kernel supports */ + else + min_block_size = rs; + if (!is_power_of_2 (min_block_size)) { + fprintf (stderr, + "%s: %s block size is not a power of two: %" PRIu64 "\n", + argv[0], "minimum", min_block_size); + exit (EXIT_FAILURE); + } + + /* If the remote NBD server is readonly, then act as if the '-r' + * flag was given on the nbdublk command line. + */ + if (nbd_is_read_only (nbd.ptr[0]) > 0) + readonly = true; + + rotational = nbd_is_rotational (nbd.ptr[0]) > 0; + can_fua = nbd_can_fua (nbd.ptr[0]) > 0; + + if (verbose) + fprintf (stderr, "%s: size: %" PRIu64 " connections: %u%s\n", + argv[0], size, connections, readonly ? " readonly" : ""); + + /* Fill in other fields in 'data' struct. 
*/ + data.max_io_buf_bytes = max_block_size; + data.nr_hw_queues = connections; + data.queue_depth = 64; + data.tgt_type = "nbd"; + data.tgt_ops = &tgt_type; + data.flags = 0; + + dev = ublksrv_ctrl_init (&data); + if (!dev) { + fprintf (stderr, "%s: ublksrv_ctrl_init: %m\n", argv[0]); + exit (EXIT_FAILURE); + } + + /* Register signal handlers to try to stop the device. */ + sa.sa_handler = signal_handler; + sigaction (SIGHUP, &sa, NULL); + sigaction (SIGINT, &sa, NULL); + sigaction (SIGTERM, &sa, NULL); + sa.sa_handler = SIG_IGN; + sigaction (SIGPIPE, &sa, NULL); + + r = ublksrv_ctrl_add_dev (dev); + if (r < 0) { + errno = -r; + fprintf (stderr, "%s: ublksrv_ctrl_add_dev: "DEVICE_PREFIX "%d: %m\n", + argv[0], dev->dev_info.dev_id); + ublksrv_ctrl_deinit (dev); + exit (EXIT_FAILURE); + } + + if (verbose) + fprintf (stderr, "%s: created %s%d\n", + argv[0], DEVICE_PREFIX, dev->dev_info.dev_id); + + /* XXX nbdfuse creates a pid file. However I reason that you can + * tell if the service is available when the block device is created + * so a pid file is not necessary. May need to revisit this. + */ + + if (start_daemon (dev) == -1) { + ublksrv_ctrl_del_dev (dev); + ublksrv_ctrl_deinit (dev); + for (i = 0; i < nbd.len; ++i) + nbd_close (nbd.ptr[i]); + exit (EXIT_FAILURE); + } + + /* Close ublk device. */ + ublksrv_ctrl_del_dev (dev); + ublksrv_ctrl_deinit (dev); + + /* Close NBD handle(s). */ + for (i = 0; i < nbd.len; ++i) + nbd_close (nbd.ptr[i]); + + exit (EXIT_SUCCESS); +} + +/* Called from main() above to create an NBD handle and connect to it. + * For multi-conn, this may be called several times. + */ +static struct nbd_handle * +create_and_connect (enum mode mode, int argc, char **argv) +{ + int fd; + uint32_t cid, port; + struct nbd_handle *h; + + /* Create the libnbd handle. 
*/ + h = nbd_create (); + if (h == NULL) { + fprintf (stderr, "%s\n", nbd_get_error ()); + exit (EXIT_FAILURE); + } + nbd_set_debug (h, verbose); + + /* Connect to the NBD server synchronously. */ + switch (mode) { + case MODE_URI: + if (nbd_connect_uri (h, argv[optind]) == -1) { + fprintf (stderr, "%s\n", nbd_get_error ()); + exit (EXIT_FAILURE); + } + break; + + case MODE_COMMAND: + if (nbd_connect_command (h, &argv[optind]) == -1) { + fprintf (stderr, "%s\n", nbd_get_error ()); + exit (EXIT_FAILURE); + } + break; + + case MODE_SQUARE_BRACKET: + /* This is the same as MODE_SOCKET_ACTIVATION but we must eat the + * closing square bracket on the command line. + */ + assert (strcmp (argv[argc-1], "]") == 0); /* checked above */ + argv[argc-1] = NULL; + /*FALLTHROUGH*/ + case MODE_SOCKET_ACTIVATION: + if (nbd_connect_systemd_socket_activation (h, &argv[optind]) == -1) { + fprintf (stderr, "%s\n", nbd_get_error ()); + exit (EXIT_FAILURE); + } + break; + + case MODE_FD: + if (sscanf (argv[optind], "%d", &fd) != 1) { + fprintf (stderr, "%s: could not parse file descriptor: %s\n\n", + argv[0], argv[optind]); + exit (EXIT_FAILURE); + } + if (nbd_connect_socket (h, fd) == -1) { + fprintf (stderr, "%s\n", nbd_get_error ()); + exit (EXIT_FAILURE); + } + break; + + case MODE_TCP: + if (nbd_connect_tcp (h, argv[optind], argv[optind+1]) == -1) { + fprintf (stderr, "%s\n", nbd_get_error ()); + exit (EXIT_FAILURE); + } + break; + + case MODE_UNIX: + if (nbd_connect_unix (h, argv[optind]) == -1) { + fprintf (stderr, "%s\n", nbd_get_error ()); + exit (EXIT_FAILURE); + } + break; + + case MODE_VSOCK: + if (sscanf (argv[optind], "%" SCNu32, &cid) != 1) { + fprintf (stderr, "%s: could not parse vsock cid: %s\n\n", + argv[0], argv[optind]); + exit (EXIT_FAILURE); + } + if (sscanf (argv[optind+1], "%" SCNu32, &port) != 1) { + fprintf (stderr, "%s: could not parse vsock port: %s\n\n", + argv[0], argv[optind]); + exit (EXIT_FAILURE); + } + if (nbd_connect_vsock (h, cid, port) == -1) { + 
fprintf (stderr, "%s\n", nbd_get_error ()); + exit (EXIT_FAILURE); + } + break; + } + + return h; +} + +static void +signal_handler (int sig) +{ + /* XXX Racy, but not much else we can do. */ + ublksrv_ctrl_stop_dev (dev); +} diff --git a/ublk/nbdublk.h b/ublk/nbdublk.h new file mode 100644 index 0000000000..086352e9d1 --- /dev/null +++ b/ublk/nbdublk.h @@ -0,0 +1,47 @@ +/* NBD client library in userspace + * Copyright (C) 2013-2022 Red Hat Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef LIBNBD_NBDUBLK_H +#define LIBNBD_NBDUBLK_H + +#include <stdbool.h> + +#include <ublksrv.h> + +#include "vector.h" + +DEFINE_VECTOR_TYPE (handles, struct nbd_handle *) + +#define UBLKSRV_TGT_TYPE_NBD 0 + +extern handles nbd; +extern unsigned connections; +extern bool readonly; +extern bool rotational; +extern bool can_fua; +extern char *filename; +extern uint64_t size; +extern uint64_t min_block_size; +extern uint64_t pref_block_size; +extern bool verbose; + +extern struct ublksrv_tgt_type tgt_type; + +extern int start_daemon (struct ublksrv_ctrl_dev *dev); + +#endif /* LIBNBD_NBDUBLK_H */ diff --git a/ublk/nbdublk.pod b/ublk/nbdublk.pod new file mode 100644 index 0000000000..ef532f1b5b --- /dev/null +++ b/ublk/nbdublk.pod @@ -0,0 +1,228 @@ +=head1 NAME + +nbdublk - 
connect network block device to a local device + +=head1 SYNOPSIS + + nbdublk [-C N|--connections N] [-r] [-v|--verbose] /dev/ublkb<N> URI + + nbdublk [-C N|--connections N] [-r] [-v|--verbose] <N> URI + + nbdublk [-C N|--connections N] [-r] [-v|--verbose] - URI + +=for paragraph + + nbdublk /dev/ublkb<N> [ CMD [ARGS ...] ] + +=for paragraph + + nbdublk /dev/ublkb<N> --command CMD [ARGS ...] + +=for paragraph + + nbdublk /dev/ublkb<N> --fd N + +=for paragraph + + nbdublk /dev/ublkb<N> --tcp HOST PORT + +=for paragraph + + nbdublk /dev/ublkb<N> --unix SOCKET + +=for paragraph + + nbdublk /dev/ublkb<N> --vsock CID PORT + +To list devices: + + ublk list + +To unmount: + + ublk del -n <N> + +Other options: + + nbdublk --help + +=for paragraph + + nbdublk -V|--version + +=head1 DESCRIPTION + +nbdublk is used to create a Linux F</dev/ublkbI<N>> device from a +network block device server. Reads and writes to the virtual device +are turned into reads and writes to the NBD server. + +The first parameter is the Linux device name of the form +F</dev/ublkbI<N>> (for some number I<N>), for example F</dev/ublkb0>, +F</dev/ublkb1>, &c. You can just use the number on its own, or use +C<-> to get ublk to allocate an unused device. + +The second and following parameters refer to the NBD server, which can +be local or remote. The server can be specified as an NBD URI (like +C<nbd://localhost>), or as an NBD server running as a subprocess of +nbdublk (using S<C<[ ... ]>>), or in various other ways (see +L</MODES>). + +Use L<ublk(8)> to list and delete devices. + +=head2 Requires Linux + +This program requires Linux E<ge> 6.0 and the C<ublk_drv.ko> kernel +module. You may need to load the kernel module and you usually have +to run nbdublk as root. 
+ +=head1 EXAMPLE + +Create an NBD ublk device connected to a remote NBD server: + + # nbdublk /dev/ublkb1 nbd://pick + +List the device: + + # ublk list + dev id 1: nr_hw_queues 4 queue_depth 64 block size 1 dev_capacity 0 + max rq size 67108864 daemon pid 32382 flags 0x0 state LIVE + +You can then use C</dev/ublkb1> as a regular device. To disconnect +the device use: + + # ublk del -n 1 + +=head1 OPTIONS + +=over 4 + +=item B<--help> + +Display brief command line help and exit. + +=item B<-C> N + +=item B<--connections> N + +If multi-conn is used, use N connections to the server. The default +is 4. + +Multi-conn is enabled by default when possible. Modes which run a +subprocess, such as I<--command>, are not able to use multi-conn. Mode +I<--fd> also cannot use multi-conn. Also the server must advertise +multi-conn (use L<nbdinfo(1)> to query what the server supports). + +=item B<-C 1> + +=item B<--connections 1> + +Disable multi-conn. Only use a single connection to the NBD server. +See L</THREAD MODEL> below. + +=item B<-r> + +=item B<--readonly> + +Access the network block device read-only. The virtual device will be +read-only, and any writes will return errors. + +If the remote NBD server is read-only then this flag is added +automatically. (Check C<is_read_only:> field in the output of +L<nbdinfo(1)>). + +=item B<-v> + +=item B<--verbose> + +Enable verbose messages to stderr. This enables libnbd debugging and +other messages. + +=item B<-V> + +=item B<--version> + +Display the package name and version and exit. + +=back + +=head1 MODES + +Modes are used to select the NBD server. Possible modes are: + +=over 4 + +=item nbdublk DEVICE URI + +This mode uses an NBD URI (see L<nbd_connect_uri(3)> and +L<https://github.com/NetworkBlockDevice/nbd/blob/master/doc/uri.md>). 
+For example this specifies a TLS-encrypted connection to +C<example.com> port C<10809>, with export name C<disk>: + + nbdublk /dev/ublkb0 nbds://example.com/disk + +=item nbdublk DEVICE B<[> CMD [ARGS ...] B<]> + +Run an NBD server as a subprocess. In this mode an NBD server can be +run directly from the command line with nbdublk communicating with the +server over a socket. This requires that the NBD server supports +systemd socket activation. See L</EXAMPLE> above and +L<nbd_connect_systemd_socket_activation(3)>. + +=item nbdublk DEVICE B<--command> CMD [ARGS ...] + +Select command mode. In this mode an NBD server can be run directly +from the command line with nbdublk communicating with the server over +the server's stdin/stdout. Normally you would use this with +C<nbdkit -s>. See L<nbd_connect_command(3)>. + +=item nbdublk DEVICE B<--fd> N + +Select file descriptor mode. In this mode a connected socket is +passed to nbdublk. nbdublk connects to the socket on the numbered +file descriptor. See also L<nbd_connect_socket(3)>. + +=item nbdublk DEVICE B<--tcp> HOST PORT + +Select TCP mode. Connect to an NBD server on a host and port over an +unencrypted TCP socket. See also L<nbd_connect_tcp(3)>. + +=item nbdublk DEVICE B<--unix> SOCKET + +Select Unix mode. Connect to an NBD server on a Unix domain socket. +See also L<nbd_connect_unix(3)>. + +=item nbdublk DEVICE B<--vsock> CID PORT + +Select vsock mode. Connect to an NBD server on a C<AF_VSOCK> socket. +See also L<nbd_connect_vsock(3)>. + +=back + +=head1 SEE ALSO + +L<libnbd(3)>, +L<nbdcopy(1)>, +L<nbddump(1)>, +L<nbdfuse(1)>, +L<nbdinfo(1)>, +L<nbdsh(1)>, +L<ublk(8)>, +L<nbd_connect_uri(3)>, +L<nbd_connect_command(3)>, +L<nbd_connect_socket(3)>, +L<nbd_connect_systemd_socket_activation(3)>, +L<nbd_connect_tcp(3)>, +L<nbd_connect_unix(3)>, +L<nbd_connect_vsock(3)>, +L<nbdkit(1)>, +L<nbdkit-loop(1)>, +L<qemu-nbd(8)>, +L<nbd-client(8)>. + +=head1 AUTHORS + +Richard W.M. 
Jones + +=head1 COPYRIGHT + +Copyright (C) 2019-2022 Red Hat Inc. diff --git a/ublk/not.cpp b/ublk/not.cpp new file mode 100644 index 0000000000..23970e346c --- /dev/null +++ b/ublk/not.cpp @@ -0,0 +1,23 @@ +/* NBD client library in userspace + * Copyright (C) 2013-2022 Red Hat Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <config.h> + +/* This file does nothing except to force nbdublk to be linked as a + * C++ program because libublksrv requires it. + */ diff --git a/ublk/tgt.c b/ublk/tgt.c new file mode 100644 index 0000000000..2a53804e42 --- /dev/null +++ b/ublk/tgt.c @@ -0,0 +1,332 @@ +/* NBD client library in userspace + * Copyright (C) 2013-2022 Red Hat Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <stdbool.h> +#include <errno.h> +#include <pthread.h> + +#include <ublksrv.h> + +#include <libnbd.h> + +#include "ispowerof2.h" + +#include "nbdublk.h" + +/* Per-thread information. */ +struct thread_info { + struct ublksrv_dev *dev; + size_t thread_num; + pthread_t thread; +}; + +static char jbuf[4096]; +static pthread_mutex_t jbuf_lock = PTHREAD_MUTEX_INITIALIZER; + +static void * +io_thread (void *vpinfo) +{ + struct thread_info *thread_info = vpinfo; + struct ublksrv_dev *dev = thread_info->dev; + const unsigned dev_id = dev->ctrl_dev->dev_info.dev_id; + const size_t q_id = thread_info->thread_num; + struct ublksrv_queue *q; + int r; + + pthread_mutex_lock (&jbuf_lock); + ublksrv_json_write_queue_info (dev->ctrl_dev, jbuf, sizeof jbuf, + q_id, gettid ()); + pthread_mutex_unlock (&jbuf_lock); + + q = ublksrv_queue_init (dev, q_id, NULL); + if (!q) { + perror ("ublksrv_queue_init"); + return NULL; + } + + if (verbose) + fprintf (stderr, "%s: ublk tid %d dev %d queue %d started\n", + "nbdublk", q->tid, dev_id, q->q_id); + + for (;;) { + r = ublksrv_process_io (q); + if (r < 0) { + if (r != -ENODEV) { /* ENODEV is expected when the device is deleted */ + errno = -r; + perror ("ublksrv_process_io"); + } + break; + } + } + + if (verbose) + fprintf (stderr, "%s: ublk tid %d dev %d queue %d exited\n", + "nbdublk", q->tid, dev_id, q->q_id); + + ublksrv_queue_deinit (q); + return NULL; +} + +static int +set_parameters (struct ublksrv_ctrl_dev *ctrl_dev, + const struct ublksrv_dev *dev) +{ + struct ublksrv_ctrl_dev_info *dinfo = &ctrl_dev->dev_info; + const unsigned attrs + (readonly ? UBLK_ATTR_READ_ONLY : 0) | + (rotational ? 
UBLK_ATTR_ROTATIONAL : 0) | + (can_fua ? UBLK_ATTR_FUA : 0); + struct ublk_params p = { + .types = UBLK_PARAM_TYPE_BASIC, + .basic = { + .attrs = attrs, + .logical_bs_shift = 9, + .physical_bs_shift = 9, + .io_opt_shift = log_2_bits (pref_block_size), + .io_min_shift = log_2_bits (min_block_size), + .max_sectors = dinfo->max_io_buf_bytes >> 9, + .dev_sectors = dev->tgt.dev_size >> 9, + }, + .discard = { + .max_discard_sectors = UINT_MAX >> 9, + .max_discard_segments = 1, + }, + }; + int r; + + pthread_mutex_lock (&jbuf_lock); + ublksrv_json_write_params (&p, jbuf, sizeof jbuf); + pthread_mutex_unlock (&jbuf_lock); + + r = ublksrv_ctrl_set_params (ctrl_dev, &p); + if (r < 0) { + errno = -r; + perror ("ublksrv_ctrl_set_params"); + return -1; + } + + return 0; +} + +int +start_daemon (struct ublksrv_ctrl_dev *ctrl_dev) +{ + const struct ublksrv_ctrl_dev_info *dinfo = &ctrl_dev->dev_info; + struct thread_info *thread_info; + struct ublksrv_dev *dev; + size_t i; + int r; + + if (verbose) + fprintf (stderr, "%s: starting daemon\n", "nbdublk"); + + r = ublksrv_ctrl_get_affinity(ctrl_dev); + if (r < 0) { + errno = r; + perror ("ublksrv_ctrl_get_affinity"); + return -1; + } + + thread_info = calloc (dinfo->nr_hw_queues, sizeof (struct thread_info)); + if (thread_info == NULL) { + perror ("calloc"); + return -1; + } + + dev = ublksrv_dev_init (ctrl_dev); + if (!dev) { + /* Annoyingly libublksrv logs some not very useful information to + * syslog when this fails. + */ + fprintf (stderr, "%s: ublksrv_dev_init failed: " + "there may be more information in syslog\n", + "nbdublk"); + return -1; + } + + /* Create the io threads. 
*/ + for (i = 0; i < dinfo->nr_hw_queues; ++i) { + thread_info[i].dev = dev; + thread_info[i].thread_num = i; + r = pthread_create (&thread_info[i].thread, NULL, + io_thread, &thread_info[i]); + if (r != 0) { + errno = r; + perror ("pthread_create"); + ublksrv_dev_deinit (dev); + return -1; + } + } + + if (set_parameters (ctrl_dev, dev) == -1) { + ublksrv_dev_deinit (dev); + return -1; + } + + /* Start the device. */ + r = ublksrv_ctrl_start_dev (ctrl_dev, getpid ()); + if (r < 0) { + errno = -r; + perror ("ublksrv_ctrl_start_dev"); + ublksrv_dev_deinit (dev); + return -1; + } + + ublksrv_ctrl_get_info (ctrl_dev); + ublksrv_ctrl_dump (ctrl_dev, jbuf); + + /* Wait for threads to exit. */ + for (i = 0; i < dinfo->nr_hw_queues; ++i) + pthread_join (thread_info[i].thread, NULL); + + ublksrv_dev_deinit (dev); + free (thread_info); + return 0; +} + +static int +init_tgt (struct ublksrv_dev *dev, int type, int argc, char *argv[]) +{ + const struct ublksrv_ctrl_dev_info *info = &dev->ctrl_dev->dev_info; + struct ublksrv_tgt_info *tgt = &dev->tgt; + struct ublksrv_tgt_base_json tgt_json = { + .type = type, + .name = "nbd", + }; + + if (verbose) + fprintf (stderr, "%s: init_tgt: type = %d\n", "nbdublk", type); + + if (type != UBLKSRV_TGT_TYPE_NBD) + return -1; + + tgt_json.dev_size = tgt->dev_size = size; + tgt->tgt_ring_depth = info->queue_depth; + tgt->nr_fds = 0; + + ublksrv_json_write_dev_info (dev->ctrl_dev, jbuf, sizeof jbuf); + ublksrv_json_write_target_base_info (jbuf, sizeof jbuf, &tgt_json); + + return 0; +} + +static int +handle_io_async (struct ublksrv_queue *q, int tag) +{ + const struct ublksrv_io_desc *iod = ublksrv_get_iod (q, tag); + const unsigned op = ublksrv_get_op (iod); + const unsigned flags = ublksrv_get_flags (iod); + const bool fua = flags & UBLK_IO_F_FUA; + const bool alloc_zero = flags & UBLK_IO_F_NOUNMAP; /* else punch hole */ + const size_t q_id = q->q_id; /* also the NBD handle number */ + struct nbd_handle *h = nbd.ptr[q_id]; + uint32_t 
nbd_flags = 0; + int r, res; + + if (verbose) + fprintf (stderr, "%s: handle_io_async: tag = %d q_id = %zu\n", + "nbdublk", tag, q_id); + + /* XXX reimplement this using asynch operations */ + switch (op) { + case UBLK_IO_OP_READ: + r = nbd_pread (h, (void *) iod->addr, iod->nr_sectors << 9, + iod->start_sector << 9, 0); + if (r == -1) { + fprintf (stderr, "%s: %s\n", "nbdublk", nbd_get_error ()); + res = - (nbd_get_errno () ? : EINVAL); + } + else + res = iod->nr_sectors << 9; /* NBD always does complete op. */ + break; + + case UBLK_IO_OP_WRITE: + if (fua && can_fua) + nbd_flags |= LIBNBD_CMD_FLAG_FUA; + + r = nbd_pwrite (h, (const void *) iod->addr, iod->nr_sectors << 9, + iod->start_sector << 9, nbd_flags); + if (r == -1) { + fprintf (stderr, "%s: %s\n", "nbdublk", nbd_get_error ()); + res = - (nbd_get_errno () ? : EINVAL); + } + else + res = iod->nr_sectors << 9; /* NBD always does complete op. */ + break; + + case UBLK_IO_OP_FLUSH: + r = nbd_flush (h, 0); + if (r == -1) { + fprintf (stderr, "%s: %s\n", "nbdublk", nbd_get_error ()); + res = - (nbd_get_errno () ? : EINVAL); + } + else + res = 0; + break; + + case UBLK_IO_OP_DISCARD: + if (fua && can_fua) + nbd_flags |= LIBNBD_CMD_FLAG_FUA; + + r = nbd_trim (h, iod->nr_sectors << 9, iod->start_sector << 9, nbd_flags); + if (r == -1) { + fprintf (stderr, "%s: %s\n", "nbdublk", nbd_get_error ()); + res = - (nbd_get_errno () ? : EINVAL); + } + else + res = iod->nr_sectors << 9; + break; + + case UBLK_IO_OP_WRITE_ZEROES: + if (fua && can_fua) + nbd_flags |= LIBNBD_CMD_FLAG_FUA; + + if (alloc_zero) + nbd_flags |= LIBNBD_CMD_FLAG_NO_HOLE; + + r = nbd_zero (h, iod->nr_sectors << 9, iod->start_sector << 9, nbd_flags); + if (r == -1) { + fprintf (stderr, "%s: %s\n", "nbdublk", nbd_get_error ()); + res = - (nbd_get_errno () ? 
: EINVAL); + } + else + res = iod->nr_sectors << 9; + break; + + default: + fprintf (stderr, "%s: unknown operation %u\n", "nbdublk", op); + res = -ENOTSUP; + break; + } + + ublksrv_complete_io (q, tag, res); + + return 0; +} + +struct ublksrv_tgt_type tgt_type = { + .type = UBLKSRV_TGT_TYPE_NBD, + .name = "nbd", + .init_tgt = init_tgt, + .handle_io_async = handle_io_async, +}; -- 2.37.0.rc2
Hi Jones, On Thu, Aug 25, 2022 at 01:10:55PM +0100, Richard W.M. Jones wrote:> This patch adds simple support for a ublk-based NBD client. > It is also available here: > https://gitlab.com/rwmjones/libnbd/-/tree/nbdublk/ublk > > ublk is a way to write Linux block device drivers in userspace:Just looked at your nbdublk implementation a bit, basically it is good, and one really nice work. Also follows two suggestions: 1) the io_uring context is multilexed with ublk io command handling, so we should avoid to block in both ->handle_io_async() and ->handle_event(), otherwise performance may be bad 2) in the implementation of nbd worker thread, there are two sleep points(wait for incoming io command, and network FD), I'd suggest to use poll to wait on any of them Recently I are working to add ublksrv io offloading or aio interfaces on this sort of case in which io_uring can't be used, which may simplified this area, please see the attached patch which applies the above two points against your patch. And obvious improvement can be observed on my simple fio test( randread, io, 4k bs, libaio) against backend of 'nbdkit file'. But these interfaces aren't merged to ublksrv github tree yet, you can find them in the aio branch, and demo_event.c is one example wrt. how to use them: https://github.com/ming1/ubdsrv/tree/aio Actually this interface can be improved further for nbdublk case, and the request allocation isn't needed actually for this direct offloading. But they are added for covering some IOs not from ublk driver, such as meta data, so 'struct ublksrv_aio' is allocated. I will try best to finalize them and merge to master branch. BTW, IOPS on nbdublk(backend: nbdkit file) still has big gap compared with ublk-loop, so I guess in future maybe io_uring should be tried and see if big improvement can be observed. 
diff --git a/generator/API.ml b/generator/API.ml index 3e948aa..bdd0fb8 100644 --- a/generator/API.ml +++ b/generator/API.ml @@ -2289,6 +2289,26 @@ that eventual action is actually expected - for example, if the connection is established but there are no commands in flight, using an infinite timeout will permanently block). +This function is mainly useful as an example of how you might +integrate libnbd with your own main loop, rather than being +intended as something you would use."; + example = Some "examples/aio-connect-read.c"; + }; + + "poll2", { + default_call with + args = [Int "evt"; Int "timeout" ]; ret = RInt; + shortdesc = "poll the handle once with eventfd"; + longdesc = "\ +This is a simple implementation of L<poll(2)> which is used +internally by synchronous API calls. On success, it returns +C<0> if the C<timeout> (in milliseconds) occurs, or C<1> if +the poll completed and the state machine progressed. Set +C<timeout> to C<-1> to block indefinitely (but be careful +that eventual action is actually expected - for example, if +the connection is established but there are no commands in +flight, using an infinite timeout will permanently block). + This function is mainly useful as an example of how you might integrate libnbd with your own main loop, rather than being intended as something you would use."; @@ -3153,6 +3173,7 @@ let first_version = [ "zero", (1, 0); "block_status", (1, 0); "poll", (1, 0); + "poll2", (1, 0); "aio_connect", (1, 0); "aio_connect_uri", (1, 0); "aio_connect_unix", (1, 0); diff --git a/lib/poll.c b/lib/poll.c index df01d94..e9d7924 100644 --- a/lib/poll.c +++ b/lib/poll.c @@ -27,14 +27,21 @@ #include "internal.h" /* A simple main loop implementation using poll(2). 
*/ -int -nbd_unlocked_poll (struct nbd_handle *h, int timeout) +static int +__nbd_unlocked_poll (struct nbd_handle *h, int evt, int timeout) { - struct pollfd fds[1]; - int r; + struct pollfd fds[2]; + int r, nr_fds = 1; /* fd might be negative, and poll will ignore it. */ fds[0].fd = nbd_unlocked_aio_get_fd (h); + if (evt > 0) { + fds[1].fd = evt; + fds[1].events = POLLIN; + fds[1].revents = 0; + nr_fds = 2; + } + switch (nbd_internal_aio_get_direction (get_next_state (h))) { case LIBNBD_AIO_DIRECTION_READ: fds[0].events = POLLIN; @@ -58,7 +65,7 @@ nbd_unlocked_poll (struct nbd_handle *h, int timeout) * passed to poll. */ do { - r = poll (fds, 1, timeout); + r = poll (fds, nr_fds, timeout); debug (h, "poll end: r=%d revents=%x", r, fds[0].revents); } while (r == -1 && errno == EINTR); @@ -91,3 +98,15 @@ nbd_unlocked_poll (struct nbd_handle *h, int timeout) return 1; } + +int +nbd_unlocked_poll (struct nbd_handle *h, int timeout) +{ + return __nbd_unlocked_poll (h, -1, timeout); +} + +int +nbd_unlocked_poll2 (struct nbd_handle *h, int evt, int timeout) +{ + return __nbd_unlocked_poll (h, evt, timeout); +} diff --git a/ublk/tgt.c b/ublk/tgt.c index 4cdd42a..2ab995a 100644 --- a/ublk/tgt.c +++ b/ublk/tgt.c @@ -35,6 +35,7 @@ #endif #include <ublksrv.h> +#include <ublksrv_aio.h> #include <libnbd.h> @@ -46,14 +47,6 @@ /* Number of seconds to wait for commands to complete when closing the dev. */ #define RELEASE_TIMEOUT 5 -/* List of completed commands. */ -struct completion { - struct ublksrv_queue *q; - int tag; - int res; /* The normal return value, if the command completes OK. */ -}; -DEFINE_VECTOR_TYPE(completions, struct completion) - /* Thread model: * * There are two threads per NBD connection. One thread @@ -69,32 +62,170 @@ struct thread_info { pthread_t io_uring_thread; pthread_t nbd_work_thread; - /* This counts the number of commands in flight. The condition is - * used to allow the operations thread to process commands when - * in_flight goes from 0 -> 1. 
This is roughly equivalent to - * nbd_aio_in_flight, but we need to count it ourselves in order to - * use the condition. - */ - _Atomic size_t in_flight; - pthread_mutex_t in_flight_mutex; - pthread_cond_t in_flight_cond; - - /* Commands have to be completed on the io_uring thread, but they - * run on the NBD thread. So when the NBD command completes we put - * the command on this queue and they are passed to the io_uring - * thread to call ublksrv_complete_io. - */ - pthread_mutex_t completed_commands_lock; - completions completed_commands; + struct ublksrv_aio_list compl; }; DEFINE_VECTOR_TYPE(thread_infos, struct thread_info) static thread_infos thread_info; static pthread_barrier_t barrier; +static struct ublksrv_aio_ctx *aio_ctx = NULL; static char jbuf[4096]; static pthread_mutex_t jbuf_lock = PTHREAD_MUTEX_INITIALIZER; +/* Command completion callback (called on the NBD thread). */ +static int +command_completed (void *vpdata, int *error) +{ + struct ublksrv_aio *req = vpdata; + int q_id = ublksrv_aio_qid(req->id); + struct ublksrv_queue *q = ublksrv_get_queue(aio_ctx->dev, q_id); + struct ublksrv_aio_list *compl = &thread_info.ptr[q_id].compl; + + if (verbose) + fprintf (stderr, + "%s: command_completed: tag=%d q_id=%zu error=%d\n", + "nbdublk", ublksrv_aio_tag(req->id), + ublksrv_aio_qid(req->id), *error); + + /* If the command failed, override the normal result. 
*/ + if (*error != 0) + req->res = *error; + + pthread_spin_lock(&compl->lock); + aio_list_add(&compl->list, req); + pthread_spin_unlock(&compl->lock); + + return 1; +} + + +int aio_submitter(struct ublksrv_aio_ctx *ctx, + struct ublksrv_aio *req) +{ + const struct ublksrv_io_desc *iod = &req->io; + const unsigned op = ublksrv_get_op (iod); + const unsigned flags = ublksrv_get_flags (iod); + const bool fua = flags & UBLK_IO_F_FUA; + const bool alloc_zero = flags & UBLK_IO_F_NOUNMAP; /* else punch hole */ + const size_t q_id = ublksrv_aio_qid(req->id); /* also the NBD handle number */ + struct nbd_handle *h = nbd.ptr[q_id]; + uint32_t nbd_flags = 0; + int64_t r; + nbd_completion_callback cb; + bool sync = false; + + if (verbose) + fprintf (stderr, "%s: handle_io_async: tag = %d q_id = %zu\n", + "nbdublk", ublksrv_aio_tag(req->id), q_id); + + req->res = iod->nr_sectors << 9; + cb.callback = command_completed; + cb.user_data = req; + cb.free = NULL; + + switch (op) { + case UBLK_IO_OP_READ: + r = nbd_aio_pread (h, (void *) iod->addr, iod->nr_sectors << 9, + iod->start_sector << 9, cb, 0); + if (r == -1) { + fprintf (stderr, "%s: %s\n", "nbdublk", nbd_get_error ()); + return -EINVAL; + } + break; + + case UBLK_IO_OP_WRITE: + if (fua && can_fua) + nbd_flags |= LIBNBD_CMD_FLAG_FUA; + + r = nbd_aio_pwrite (h, (const void *) iod->addr, iod->nr_sectors << 9, + iod->start_sector << 9, cb, nbd_flags); + if (r == -1) { + fprintf (stderr, "%s: %s\n", "nbdublk", nbd_get_error ()); + return -EINVAL; + } + break; + + case UBLK_IO_OP_FLUSH: + r = nbd_flush (h, 0); + if (r == -1) { + fprintf (stderr, "%s: %s\n", "nbdublk", nbd_get_error ()); + return -EINVAL; + } + sync = true; + break; + + case UBLK_IO_OP_DISCARD: + if (fua && can_fua) + nbd_flags |= LIBNBD_CMD_FLAG_FUA; + + r = nbd_trim (h, iod->nr_sectors << 9, iod->start_sector << 9, nbd_flags); + if (r == -1) { + fprintf (stderr, "%s: %s\n", "nbdublk", nbd_get_error ()); + return -EINVAL; + } + sync = true; + break; + + case 
UBLK_IO_OP_WRITE_ZEROES: + if (fua && can_fua) + nbd_flags |= LIBNBD_CMD_FLAG_FUA; + + if (alloc_zero) + nbd_flags |= LIBNBD_CMD_FLAG_NO_HOLE; + + r = nbd_zero (h, iod->nr_sectors << 9, iod->start_sector << 9, nbd_flags); + if (r == -1) { + fprintf (stderr, "%s: %s\n", "nbdublk", nbd_get_error ()); + return -EINVAL; + } + sync = true; + break; + + default: + fprintf (stderr, "%s: unknown operation %u\n", "nbdublk", op); + return -ENOTSUP; + } + + /* return if this request is completed */ + if (sync) + return 1; + return 0; +} + +static void * +nbd_work_thread (void *vpinfo) +{ + struct thread_info *ti = vpinfo; + struct nbd_handle *h = nbd.ptr[ti->i]; + struct ublksrv_queue *q = ublksrv_get_queue(aio_ctx->dev, ti->i); + struct ublksrv_aio_list *c = &thread_info.ptr[ti->i].compl; + + /* Signal to the main thread that we have initialized. */ + pthread_barrier_wait (&barrier); + + while (!ublksrv_aio_ctx_dead(aio_ctx)) { + struct aio_list compl; + + aio_list_init(&compl); + ublksrv_aio_submit_worker(aio_ctx, aio_submitter, &compl); + + pthread_spin_lock(&c->lock); + aio_list_splice(&c->list, &compl); + pthread_spin_unlock(&c->lock); + + ublksrv_aio_complete_worker(aio_ctx, &compl); + + if (nbd_poll2 (h, aio_ctx->efd, -1) == -1) { + fprintf (stderr, "%s\n", nbd_get_error ()); + exit (EXIT_FAILURE); + } + } + + /*NOTREACHED*/ + return NULL; +} + static void * io_uring_thread (void *vpinfo) { @@ -139,37 +270,6 @@ io_uring_thread (void *vpinfo) return NULL; } -static void * -nbd_work_thread (void *vpinfo) -{ - struct thread_info *thread_info = vpinfo; - const size_t i = thread_info->i; - struct nbd_handle *h = nbd.ptr[i]; - - /* Signal to the main thread that we have initialized. */ - pthread_barrier_wait (&barrier); - - while (1) { - /* Sleep until at least one command is in flight. 
*/ - pthread_mutex_lock (&thread_info->in_flight_mutex); - while (thread_info->in_flight == 0) - pthread_cond_wait (&thread_info->in_flight_cond, - &thread_info->in_flight_mutex); - pthread_mutex_unlock (&thread_info->in_flight_mutex); - - /* Dispatch work while there are commands in flight. */ - while (thread_info->in_flight > 0) { - if (nbd_poll (h, -1) == -1) { - fprintf (stderr, "%s\n", nbd_get_error ()); - exit (EXIT_FAILURE); - } - } - } - - /*NOTREACHED*/ - return NULL; -} - static int set_parameters (struct ublksrv_ctrl_dev *ctrl_dev, const struct ublksrv_dev *dev) @@ -215,6 +315,7 @@ int start_daemon (struct ublksrv_ctrl_dev *ctrl_dev) { const struct ublksrv_ctrl_dev_info *dinfo = &ctrl_dev->dev_info; + int dev_id = ctrl_dev->dev_info.dev_id; struct ublksrv_dev *dev; size_t i; int r; @@ -260,22 +361,21 @@ start_daemon (struct ublksrv_ctrl_dev *ctrl_dev) return -1; } + aio_ctx = ublksrv_aio_ctx_init(dev, 0); + if (!aio_ctx) { + fprintf(stderr, "dev %d call ublk_aio_ctx_init failed\n", dev_id); + return -ENOMEM; + } + /* Create the threads. */ for (i = 0; i < nbd.len; ++i) { /* Note this cannot fail because of previous reserve. */ thread_infos_append (&thread_info, (struct thread_info) - { .dev = dev, .i = i, .in_flight = 0 }); + { .dev = dev, .i = i,}); + + ublksrv_aio_init_list(&thread_info.ptr[i].compl); - r = pthread_mutex_init (&thread_info.ptr[i].in_flight_mutex, NULL); - if (r != 0) - goto bad_pthread; - r = pthread_cond_init (&thread_info.ptr[i].in_flight_cond, NULL); - if (r != 0) - goto bad_pthread; - r = pthread_mutex_init (&thread_info.ptr[i].completed_commands_lock, NULL); - if (r != 0) - goto bad_pthread; r = pthread_create (&thread_info.ptr[i].io_uring_thread, NULL, io_uring_thread, &thread_info.ptr[i]); if (r != 0) @@ -316,25 +416,11 @@ start_daemon (struct ublksrv_ctrl_dev *ctrl_dev) for (i = 0; i < nbd.len; ++i) pthread_join (thread_info.ptr[i].io_uring_thread, NULL); - /* Wait until a timeout while there are NBD commands in flight. 
*/ - time (&st); - while (time (NULL) - st <= RELEASE_TIMEOUT) { - for (i = 0; i < nbd.len; ++i) { - if (thread_info.ptr[i].in_flight > 0) - break; - } - if (i == nbd.len) /* no commands in flight */ - break; - - /* Signal to the operations threads to work. */ - for (i = 0; i < nbd.len; ++i) { - pthread_mutex_lock (&thread_info.ptr[i].in_flight_mutex); - pthread_cond_signal (&thread_info.ptr[i].in_flight_cond); - pthread_mutex_unlock (&thread_info.ptr[i].in_flight_mutex); - } - - sleep (1); + for (i = 0; i < nbd.len; ++i) { + ublksrv_aio_ctx_shutdown(aio_ctx); + pthread_join (thread_info.ptr[i].nbd_work_thread, NULL); } + ublksrv_aio_ctx_deinit(aio_ctx); ublksrv_dev_deinit (dev); //thread_infos_reset (&thread_info); @@ -367,176 +453,31 @@ init_tgt (struct ublksrv_dev *dev, int type, int argc, char *argv[]) return 0; } -/* Command completion callback (called on the NBD thread). */ -static int -command_completed (void *vpdata, int *error) -{ - struct completion *completion = vpdata; - struct ublksrv_queue *q = completion->q; - const size_t i = q->q_id; - - if (verbose) - fprintf (stderr, - "%s: command_completed: tag=%d q_id=%zu res=%d error=%d\n", - "nbdublk", completion->tag, i, completion->res, *error); - - /* If the command failed, override the normal result. */ - if (*error != 0) - completion->res = *error; - - assert (thread_info.ptr[i].in_flight >= 1); - thread_info.ptr[i].in_flight--; - - /* Copy the command to the list of completed commands. - * - * Note *completion is freed by the .free handler that we added to - * this completion callback. - */ - pthread_mutex_lock (&thread_info.ptr[i].completed_commands_lock); - completions_append (&thread_info.ptr[i].completed_commands, *completion); - - /* Signal io_uring thread that the command has been completed. - * It will call us back in a different thread on ->handle_event - * and we can finally complete the command(s) there. 
- */ - ublksrv_queue_send_event (q); - pthread_mutex_unlock (&thread_info.ptr[i].completed_commands_lock); - - /* Retire the NBD command. */ - return 1; -} - static void -handle_event (struct ublksrv_queue *q) +nbd_handle_event (struct ublksrv_queue *q) { - const size_t i = q->q_id; - size_t j; - if (verbose) - fprintf (stderr, "%s: handle_event: q_id = %d\n", "nbdublk", q->q_id); + fprintf (stderr, "%s: handle_event: q_id = %d\n", "nbdublk", q->q_id); - pthread_mutex_lock (&thread_info.ptr[i].completed_commands_lock); - - for (j = 0; j < thread_info.ptr[i].completed_commands.len; ++j) { - struct completion *completion - &thread_info.ptr[i].completed_commands.ptr[j]; - ublksrv_complete_io (completion->q, completion->tag, completion->res); - } - completions_reset (&thread_info.ptr[i].completed_commands); - ublksrv_queue_handled_event (q); - - pthread_mutex_unlock (&thread_info.ptr[i].completed_commands_lock); + ublksrv_aio_handle_event(aio_ctx, q); } -/* Start a single command. */ -static int -handle_io_async (struct ublksrv_queue *q, int tag) +static int nbd_handle_io_async(struct ublksrv_queue *q, int tag) { - const struct ublksrv_io_desc *iod = ublksrv_get_iod (q, tag); - const unsigned op = ublksrv_get_op (iod); - const unsigned flags = ublksrv_get_flags (iod); - const bool fua = flags & UBLK_IO_F_FUA; - const bool alloc_zero = flags & UBLK_IO_F_NOUNMAP; /* else punch hole */ - const size_t q_id = q->q_id; /* also the NBD handle number */ - struct nbd_handle *h = nbd.ptr[q_id]; - uint32_t nbd_flags = 0; - int64_t r; - nbd_completion_callback cb; - struct completion *completion; + const struct ublksrv_io_desc *iod = ublksrv_get_iod(q, tag); + struct ublksrv_aio *req = ublksrv_aio_alloc_req(aio_ctx, 0); - if (verbose) - fprintf (stderr, "%s: handle_io_async: tag = %d q_id = %zu\n", - "nbdublk", tag, q_id); - - /* Set up a completion callback and its user data. 
*/ - completion = malloc (sizeof *completion); - if (completion == NULL) abort (); - completion->q = q; - completion->tag = tag; - completion->res = iod->nr_sectors << 9; - cb.callback = command_completed; - cb.user_data = completion; - cb.free = free; - - switch (op) { - case UBLK_IO_OP_READ: - r = nbd_aio_pread (h, (void *) iod->addr, iod->nr_sectors << 9, - iod->start_sector << 9, cb, 0); - if (r == -1) { - fprintf (stderr, "%s: %s\n", "nbdublk", nbd_get_error ()); - ublksrv_complete_io (q, tag, - (nbd_get_errno () ? : EINVAL)); - return 0; - } - break; - - case UBLK_IO_OP_WRITE: - if (fua && can_fua) - nbd_flags |= LIBNBD_CMD_FLAG_FUA; - - r = nbd_aio_pwrite (h, (const void *) iod->addr, iod->nr_sectors << 9, - iod->start_sector << 9, cb, nbd_flags); - if (r == -1) { - fprintf (stderr, "%s: %s\n", "nbdublk", nbd_get_error ()); - ublksrv_complete_io (q, tag, - (nbd_get_errno () ? : EINVAL)); - return 0; - } - break; - - case UBLK_IO_OP_FLUSH: - r = nbd_flush (h, 0); - if (r == -1) { - fprintf (stderr, "%s: %s\n", "nbdublk", nbd_get_error ()); - ublksrv_complete_io (q, tag, - (nbd_get_errno () ? : EINVAL)); - return 0; - } - break; + req->io = *iod; + req->id = ublksrv_aio_pid_tag(q->q_id, tag); + ublksrv_aio_submit_req(aio_ctx, req); - case UBLK_IO_OP_DISCARD: - if (fua && can_fua) - nbd_flags |= LIBNBD_CMD_FLAG_FUA; - - r = nbd_trim (h, iod->nr_sectors << 9, iod->start_sector << 9, nbd_flags); - if (r == -1) { - fprintf (stderr, "%s: %s\n", "nbdublk", nbd_get_error ()); - ublksrv_complete_io (q, tag, - (nbd_get_errno () ? : EINVAL)); - return 0; - } - break; - - case UBLK_IO_OP_WRITE_ZEROES: - if (fua && can_fua) - nbd_flags |= LIBNBD_CMD_FLAG_FUA; - - if (alloc_zero) - nbd_flags |= LIBNBD_CMD_FLAG_NO_HOLE; - - r = nbd_zero (h, iod->nr_sectors << 9, iod->start_sector << 9, nbd_flags); - if (r == -1) { - fprintf (stderr, "%s: %s\n", "nbdublk", nbd_get_error ()); - ublksrv_complete_io (q, tag, - (nbd_get_errno () ? 
: EINVAL)); - return 0; - } - break; - - default: - fprintf (stderr, "%s: unknown operation %u\n", "nbdublk", op); - ublksrv_complete_io (q, tag, -ENOTSUP); - return 0; - } - - /* Make sure the corresponding NBD worker sees the command. */ - pthread_mutex_lock (&thread_info.ptr[q_id].in_flight_mutex); - thread_info.ptr[q_id].in_flight++; - pthread_cond_signal (&thread_info.ptr[q_id].in_flight_cond); - pthread_mutex_unlock (&thread_info.ptr[q_id].in_flight_mutex); - - return 0; + return 0; } struct ublksrv_tgt_type tgt_type = { .type = UBLKSRV_TGT_TYPE_NBD, .name = "nbd", .init_tgt = init_tgt, - .handle_io_async = handle_io_async, - .handle_event = handle_event, + .handle_io_async = nbd_handle_io_async, + .handle_event = nbd_handle_event, }; Thanks, Ming
On Tue, Aug 30, 2022 at 10:32:02AM +0800, Ming Lei wrote:> Hi Jones, > > On Thu, Aug 25, 2022 at 01:10:55PM +0100, Richard W.M. Jones wrote: > > This patch adds simple support for a ublk-based NBD client. > > It is also available here: > > https://gitlab.com/rwmjones/libnbd/-/tree/nbdublk/ublk > > > > ublk is a way to write Linux block device drivers in userspace: > > Just looked at your nbdublk implementation a bit, basically it is good, > and one really nice work. > > Also follows two suggestions: > > 1) the io_uring context is multilexed with ublk io command handling, so > we should avoid to block in both ->handle_io_async() and > ->handle_event(), otherwise performance may be bad > > 2) in the implementation of nbd worker thread, there are two sleep > points(wait for incoming io command, and network FD), I'd suggest to use > poll to wait on any of them > > Recently I are working to add ublksrv io offloading or aio interfaces on this > sort of case in which io_uring can't be used, which may simplified this > area, please see the attached patch which applies the above two points against > your patch. And obvious improvement can be observed on my simple fio test( > randread, io, 4k bs, libaio) against backend of 'nbdkit file'. > > But these interfaces aren't merged to ublksrv github tree yet, you can find > them in the aio branch, and demo_event.c is one example wrt. how to use > them: > > https://github.com/ming1/ubdsrv/tree/aio > > Actually this interface can be improved further for nbdublk case, > and the request allocation isn't needed actually for this direct > offloading. But they are added for covering some IOs not from ublk > driver, such as meta data, so 'struct ublksrv_aio' is allocated. > I will try best to finalize them and merge to master branch. > > BTW, IOPS on nbdublk(backend: nbdkit file) still has big gap compared > with ublk-loop, so I guess in future maybe io_uring should be tried and > see if big improvement can be observed. 
The patch sent in the last email may cause an io hang on MQ; the fixed version follows:
*/ -int -nbd_unlocked_poll (struct nbd_handle *h, int timeout) +static int +__nbd_unlocked_poll (struct nbd_handle *h, int evt, int timeout) { - struct pollfd fds[1]; - int r; + struct pollfd fds[2]; + int r, nr_fds = 1; /* fd might be negative, and poll will ignore it. */ fds[0].fd = nbd_unlocked_aio_get_fd (h); + if (evt > 0) { + fds[1].fd = evt; + fds[1].events = POLLIN; + fds[1].revents = 0; + nr_fds = 2; + } + switch (nbd_internal_aio_get_direction (get_next_state (h))) { case LIBNBD_AIO_DIRECTION_READ: fds[0].events = POLLIN; @@ -58,7 +65,7 @@ nbd_unlocked_poll (struct nbd_handle *h, int timeout) * passed to poll. */ do { - r = poll (fds, 1, timeout); + r = poll (fds, nr_fds, timeout); debug (h, "poll end: r=%d revents=%x", r, fds[0].revents); } while (r == -1 && errno == EINTR); @@ -91,3 +98,15 @@ nbd_unlocked_poll (struct nbd_handle *h, int timeout) return 1; } + +int +nbd_unlocked_poll (struct nbd_handle *h, int timeout) +{ + return __nbd_unlocked_poll (h, -1, timeout); +} + +int +nbd_unlocked_poll2 (struct nbd_handle *h, int evt, int timeout) +{ + return __nbd_unlocked_poll (h, evt, timeout); +} diff --git a/ublk/tgt.c b/ublk/tgt.c index 4cdd42a..5b478ae 100644 --- a/ublk/tgt.c +++ b/ublk/tgt.c @@ -35,6 +35,7 @@ #endif #include <ublksrv.h> +#include <ublksrv_aio.h> #include <libnbd.h> @@ -46,14 +47,6 @@ /* Number of seconds to wait for commands to complete when closing the dev. */ #define RELEASE_TIMEOUT 5 -/* List of completed commands. */ -struct completion { - struct ublksrv_queue *q; - int tag; - int res; /* The normal return value, if the command completes OK. */ -}; -DEFINE_VECTOR_TYPE(completions, struct completion) - /* Thread model: * * There are two threads per NBD connection. One thread @@ -69,23 +62,8 @@ struct thread_info { pthread_t io_uring_thread; pthread_t nbd_work_thread; - /* This counts the number of commands in flight. The condition is - * used to allow the operations thread to process commands when - * in_flight goes from 0 -> 1. 
This is roughly equivalent to - * nbd_aio_in_flight, but we need to count it ourselves in order to - * use the condition. - */ - _Atomic size_t in_flight; - pthread_mutex_t in_flight_mutex; - pthread_cond_t in_flight_cond; - - /* Commands have to be completed on the io_uring thread, but they - * run on the NBD thread. So when the NBD command completes we put - * the command on this queue and they are passed to the io_uring - * thread to call ublksrv_complete_io. - */ - pthread_mutex_t completed_commands_lock; - completions completed_commands; + struct ublksrv_aio_ctx *aio_ctx; + struct ublksrv_aio_list compl; }; DEFINE_VECTOR_TYPE(thread_infos, struct thread_info) static thread_infos thread_info; @@ -95,6 +73,161 @@ static pthread_barrier_t barrier; static char jbuf[4096]; static pthread_mutex_t jbuf_lock = PTHREAD_MUTEX_INITIALIZER; +/* Command completion callback (called on the NBD thread). */ +static int +command_completed (void *vpdata, int *error) +{ + struct ublksrv_aio *req = vpdata; + int q_id = ublksrv_aio_qid(req->id); + struct ublksrv_aio_ctx *aio_ctx = thread_info.ptr[q_id].aio_ctx; + struct ublksrv_queue *q = ublksrv_get_queue(aio_ctx->dev, q_id); + struct ublksrv_aio_list *compl = &thread_info.ptr[q_id].compl; + + if (verbose) + fprintf (stderr, + "%s: command_completed: tag=%d q_id=%zu error=%d\n", + "nbdublk", ublksrv_aio_tag(req->id), + ublksrv_aio_qid(req->id), *error); + + /* If the command failed, override the normal result. 
*/ + if (*error != 0) + req->res = *error; + + pthread_spin_lock(&compl->lock); + aio_list_add(&compl->list, req); + pthread_spin_unlock(&compl->lock); + + return 1; +} + + +int aio_submitter(struct ublksrv_aio_ctx *ctx, + struct ublksrv_aio *req) +{ + const struct ublksrv_io_desc *iod = &req->io; + const unsigned op = ublksrv_get_op (iod); + const unsigned flags = ublksrv_get_flags (iod); + const bool fua = flags & UBLK_IO_F_FUA; + const bool alloc_zero = flags & UBLK_IO_F_NOUNMAP; /* else punch hole */ + const size_t q_id = ublksrv_aio_qid(req->id); /* also the NBD handle number */ + struct nbd_handle *h = nbd.ptr[q_id]; + uint32_t nbd_flags = 0; + int64_t r; + nbd_completion_callback cb; + bool sync = false; + + if (verbose) + fprintf (stderr, "%s: handle_io_async: tag = %d q_id = %zu\n", + "nbdublk", ublksrv_aio_tag(req->id), q_id); + + req->res = iod->nr_sectors << 9; + cb.callback = command_completed; + cb.user_data = req; + cb.free = NULL; + + switch (op) { + case UBLK_IO_OP_READ: + r = nbd_aio_pread (h, (void *) iod->addr, iod->nr_sectors << 9, + iod->start_sector << 9, cb, 0); + if (r == -1) { + fprintf (stderr, "%s: %s\n", "nbdublk", nbd_get_error ()); + return -EINVAL; + } + break; + + case UBLK_IO_OP_WRITE: + if (fua && can_fua) + nbd_flags |= LIBNBD_CMD_FLAG_FUA; + + r = nbd_aio_pwrite (h, (const void *) iod->addr, iod->nr_sectors << 9, + iod->start_sector << 9, cb, nbd_flags); + if (r == -1) { + fprintf (stderr, "%s: %s\n", "nbdublk", nbd_get_error ()); + return -EINVAL; + } + break; + + case UBLK_IO_OP_FLUSH: + r = nbd_flush (h, 0); + if (r == -1) { + fprintf (stderr, "%s: %s\n", "nbdublk", nbd_get_error ()); + return -EINVAL; + } + sync = true; + break; + + case UBLK_IO_OP_DISCARD: + if (fua && can_fua) + nbd_flags |= LIBNBD_CMD_FLAG_FUA; + + r = nbd_trim (h, iod->nr_sectors << 9, iod->start_sector << 9, nbd_flags); + if (r == -1) { + fprintf (stderr, "%s: %s\n", "nbdublk", nbd_get_error ()); + return -EINVAL; + } + sync = true; + break; + + case 
UBLK_IO_OP_WRITE_ZEROES: + if (fua && can_fua) + nbd_flags |= LIBNBD_CMD_FLAG_FUA; + + if (alloc_zero) + nbd_flags |= LIBNBD_CMD_FLAG_NO_HOLE; + + r = nbd_zero (h, iod->nr_sectors << 9, iod->start_sector << 9, nbd_flags); + if (r == -1) { + fprintf (stderr, "%s: %s\n", "nbdublk", nbd_get_error ()); + return -EINVAL; + } + sync = true; + break; + + default: + fprintf (stderr, "%s: unknown operation %u\n", "nbdublk", op); + return -ENOTSUP; + } + + /* return if this request is completed */ + if (sync) + return 1; + return 0; +} + +static void * +nbd_work_thread (void *vpinfo) +{ + struct thread_info *ti = vpinfo; + struct nbd_handle *h = nbd.ptr[ti->i]; + struct ublksrv_aio_ctx *aio_ctx = thread_info.ptr[ti->i].aio_ctx; + struct ublksrv_queue *q = ublksrv_get_queue(aio_ctx->dev, ti->i); + struct ublksrv_aio_list *c = &thread_info.ptr[ti->i].compl; + + /* Signal to the main thread that we have initialized. */ + pthread_barrier_wait (&barrier); + + while (!ublksrv_aio_ctx_dead(aio_ctx)) { + struct aio_list compl; + + aio_list_init(&compl); + ublksrv_aio_submit_worker(aio_ctx, aio_submitter, &compl); + + pthread_spin_lock(&c->lock); + aio_list_splice(&c->list, &compl); + pthread_spin_unlock(&c->lock); + + ublksrv_aio_complete_worker(aio_ctx, &compl); + + if (nbd_poll2 (h, aio_ctx->efd, -1) == -1) { + fprintf (stderr, "%s\n", nbd_get_error ()); + exit (EXIT_FAILURE); + } + } + + /*NOTREACHED*/ + return NULL; +} + static void * io_uring_thread (void *vpinfo) { @@ -139,37 +272,6 @@ io_uring_thread (void *vpinfo) return NULL; } -static void * -nbd_work_thread (void *vpinfo) -{ - struct thread_info *thread_info = vpinfo; - const size_t i = thread_info->i; - struct nbd_handle *h = nbd.ptr[i]; - - /* Signal to the main thread that we have initialized. */ - pthread_barrier_wait (&barrier); - - while (1) { - /* Sleep until at least one command is in flight. 
*/ - pthread_mutex_lock (&thread_info->in_flight_mutex); - while (thread_info->in_flight == 0) - pthread_cond_wait (&thread_info->in_flight_cond, - &thread_info->in_flight_mutex); - pthread_mutex_unlock (&thread_info->in_flight_mutex); - - /* Dispatch work while there are commands in flight. */ - while (thread_info->in_flight > 0) { - if (nbd_poll (h, -1) == -1) { - fprintf (stderr, "%s\n", nbd_get_error ()); - exit (EXIT_FAILURE); - } - } - } - - /*NOTREACHED*/ - return NULL; -} - static int set_parameters (struct ublksrv_ctrl_dev *ctrl_dev, const struct ublksrv_dev *dev) @@ -215,6 +317,7 @@ int start_daemon (struct ublksrv_ctrl_dev *ctrl_dev) { const struct ublksrv_ctrl_dev_info *dinfo = &ctrl_dev->dev_info; + int dev_id = ctrl_dev->dev_info.dev_id; struct ublksrv_dev *dev; size_t i; int r; @@ -265,17 +368,16 @@ start_daemon (struct ublksrv_ctrl_dev *ctrl_dev) /* Note this cannot fail because of previous reserve. */ thread_infos_append (&thread_info, (struct thread_info) - { .dev = dev, .i = i, .in_flight = 0 }); + { .dev = dev, .i = i,}); + + thread_info.ptr[i].aio_ctx = ublksrv_aio_ctx_init(dev, 0); + if (!thread_info.ptr[i].aio_ctx) { + fprintf(stderr, "dev %d queue %d call ublk_aio_ctx_init failed\n", + dev_id, i); + return -ENOMEM; + } + ublksrv_aio_init_list(&thread_info.ptr[i].compl); - r = pthread_mutex_init (&thread_info.ptr[i].in_flight_mutex, NULL); - if (r != 0) - goto bad_pthread; - r = pthread_cond_init (&thread_info.ptr[i].in_flight_cond, NULL); - if (r != 0) - goto bad_pthread; - r = pthread_mutex_init (&thread_info.ptr[i].completed_commands_lock, NULL); - if (r != 0) - goto bad_pthread; r = pthread_create (&thread_info.ptr[i].io_uring_thread, NULL, io_uring_thread, &thread_info.ptr[i]); if (r != 0) @@ -316,24 +418,10 @@ start_daemon (struct ublksrv_ctrl_dev *ctrl_dev) for (i = 0; i < nbd.len; ++i) pthread_join (thread_info.ptr[i].io_uring_thread, NULL); - /* Wait until a timeout while there are NBD commands in flight. 
*/ - time (&st); - while (time (NULL) - st <= RELEASE_TIMEOUT) { - for (i = 0; i < nbd.len; ++i) { - if (thread_info.ptr[i].in_flight > 0) - break; - } - if (i == nbd.len) /* no commands in flight */ - break; - - /* Signal to the operations threads to work. */ - for (i = 0; i < nbd.len; ++i) { - pthread_mutex_lock (&thread_info.ptr[i].in_flight_mutex); - pthread_cond_signal (&thread_info.ptr[i].in_flight_cond); - pthread_mutex_unlock (&thread_info.ptr[i].in_flight_mutex); - } - - sleep (1); + for (i = 0; i < nbd.len; ++i) { + ublksrv_aio_ctx_shutdown(thread_info.ptr[i].aio_ctx); + pthread_join (thread_info.ptr[i].nbd_work_thread, NULL); + ublksrv_aio_ctx_deinit(thread_info.ptr[i].aio_ctx); } ublksrv_dev_deinit (dev); @@ -367,176 +455,37 @@ init_tgt (struct ublksrv_dev *dev, int type, int argc, char *argv[]) return 0; } -/* Command completion callback (called on the NBD thread). */ -static int -command_completed (void *vpdata, int *error) -{ - struct completion *completion = vpdata; - struct ublksrv_queue *q = completion->q; - const size_t i = q->q_id; - - if (verbose) - fprintf (stderr, - "%s: command_completed: tag=%d q_id=%zu res=%d error=%d\n", - "nbdublk", completion->tag, i, completion->res, *error); - - /* If the command failed, override the normal result. */ - if (*error != 0) - completion->res = *error; - - assert (thread_info.ptr[i].in_flight >= 1); - thread_info.ptr[i].in_flight--; - - /* Copy the command to the list of completed commands. - * - * Note *completion is freed by the .free handler that we added to - * this completion callback. - */ - pthread_mutex_lock (&thread_info.ptr[i].completed_commands_lock); - completions_append (&thread_info.ptr[i].completed_commands, *completion); - - /* Signal io_uring thread that the command has been completed. - * It will call us back in a different thread on ->handle_event - * and we can finally complete the command(s) there. 
- */ - ublksrv_queue_send_event (q); - pthread_mutex_unlock (&thread_info.ptr[i].completed_commands_lock); - - /* Retire the NBD command. */ - return 1; -} - static void -handle_event (struct ublksrv_queue *q) +nbd_handle_event (struct ublksrv_queue *q) { - const size_t i = q->q_id; - size_t j; + struct ublksrv_aio_ctx *aio_ctx = thread_info.ptr[q->q_id].aio_ctx; if (verbose) - fprintf (stderr, "%s: handle_event: q_id = %d\n", "nbdublk", q->q_id); + fprintf (stderr, "%s: handle_event: q_id = %d\n", "nbdublk", q->q_id); - pthread_mutex_lock (&thread_info.ptr[i].completed_commands_lock); - - for (j = 0; j < thread_info.ptr[i].completed_commands.len; ++j) { - struct completion *completion - &thread_info.ptr[i].completed_commands.ptr[j]; - ublksrv_complete_io (completion->q, completion->tag, completion->res); - } - completions_reset (&thread_info.ptr[i].completed_commands); - ublksrv_queue_handled_event (q); - - pthread_mutex_unlock (&thread_info.ptr[i].completed_commands_lock); + ublksrv_aio_handle_event(aio_ctx, q); } -/* Start a single command. */ -static int -handle_io_async (struct ublksrv_queue *q, int tag) +static int nbd_handle_io_async(struct ublksrv_queue *q, int tag) { - const struct ublksrv_io_desc *iod = ublksrv_get_iod (q, tag); - const unsigned op = ublksrv_get_op (iod); - const unsigned flags = ublksrv_get_flags (iod); - const bool fua = flags & UBLK_IO_F_FUA; - const bool alloc_zero = flags & UBLK_IO_F_NOUNMAP; /* else punch hole */ - const size_t q_id = q->q_id; /* also the NBD handle number */ - struct nbd_handle *h = nbd.ptr[q_id]; - uint32_t nbd_flags = 0; - int64_t r; - nbd_completion_callback cb; - struct completion *completion; - - if (verbose) - fprintf (stderr, "%s: handle_io_async: tag = %d q_id = %zu\n", - "nbdublk", tag, q_id); - - /* Set up a completion callback and its user data. 
*/ - completion = malloc (sizeof *completion); - if (completion == NULL) abort (); - completion->q = q; - completion->tag = tag; - completion->res = iod->nr_sectors << 9; - cb.callback = command_completed; - cb.user_data = completion; - cb.free = free; - - switch (op) { - case UBLK_IO_OP_READ: - r = nbd_aio_pread (h, (void *) iod->addr, iod->nr_sectors << 9, - iod->start_sector << 9, cb, 0); - if (r == -1) { - fprintf (stderr, "%s: %s\n", "nbdublk", nbd_get_error ()); - ublksrv_complete_io (q, tag, - (nbd_get_errno () ? : EINVAL)); - return 0; - } - break; - - case UBLK_IO_OP_WRITE: - if (fua && can_fua) - nbd_flags |= LIBNBD_CMD_FLAG_FUA; - - r = nbd_aio_pwrite (h, (const void *) iod->addr, iod->nr_sectors << 9, - iod->start_sector << 9, cb, nbd_flags); - if (r == -1) { - fprintf (stderr, "%s: %s\n", "nbdublk", nbd_get_error ()); - ublksrv_complete_io (q, tag, - (nbd_get_errno () ? : EINVAL)); - return 0; - } - break; - - case UBLK_IO_OP_FLUSH: - r = nbd_flush (h, 0); - if (r == -1) { - fprintf (stderr, "%s: %s\n", "nbdublk", nbd_get_error ()); - ublksrv_complete_io (q, tag, - (nbd_get_errno () ? : EINVAL)); - return 0; - } - break; - - case UBLK_IO_OP_DISCARD: - if (fua && can_fua) - nbd_flags |= LIBNBD_CMD_FLAG_FUA; - - r = nbd_trim (h, iod->nr_sectors << 9, iod->start_sector << 9, nbd_flags); - if (r == -1) { - fprintf (stderr, "%s: %s\n", "nbdublk", nbd_get_error ()); - ublksrv_complete_io (q, tag, - (nbd_get_errno () ? : EINVAL)); - return 0; - } - break; - - case UBLK_IO_OP_WRITE_ZEROES: - if (fua && can_fua) - nbd_flags |= LIBNBD_CMD_FLAG_FUA; - - if (alloc_zero) - nbd_flags |= LIBNBD_CMD_FLAG_NO_HOLE; - - r = nbd_zero (h, iod->nr_sectors << 9, iod->start_sector << 9, nbd_flags); - if (r == -1) { - fprintf (stderr, "%s: %s\n", "nbdublk", nbd_get_error ()); - ublksrv_complete_io (q, tag, - (nbd_get_errno () ? 
: EINVAL)); - return 0; - } - break; - - default: - fprintf (stderr, "%s: unknown operation %u\n", "nbdublk", op); - ublksrv_complete_io (q, tag, -ENOTSUP); - return 0; - } - - /* Make sure the corresponding NBD worker sees the command. */ - pthread_mutex_lock (&thread_info.ptr[q_id].in_flight_mutex); - thread_info.ptr[q_id].in_flight++; - pthread_cond_signal (&thread_info.ptr[q_id].in_flight_cond); - pthread_mutex_unlock (&thread_info.ptr[q_id].in_flight_mutex); - - return 0; + struct ublksrv_aio_ctx *aio_ctx = thread_info.ptr[q->q_id].aio_ctx; + const struct ublksrv_io_desc *iod = ublksrv_get_iod(q, tag); + struct ublksrv_aio *req = ublksrv_aio_alloc_req(aio_ctx, 0); + + req->io = *iod; + req->id = ublksrv_aio_pid_tag(q->q_id, tag); + if (verbose) + fprintf (stderr, "%s %d qid %d tag %d\n", __func__, __LINE__, + q->q_id, tag); + ublksrv_aio_submit_req(aio_ctx, req); + + return 0; } struct ublksrv_tgt_type tgt_type = { .type = UBLKSRV_TGT_TYPE_NBD, .name = "nbd", .init_tgt = init_tgt, - .handle_io_async = handle_io_async, - .handle_event = handle_event, + .handle_io_async = nbd_handle_io_async, + .handle_event = nbd_handle_event, }; -- Ming