Richard W.M. Jones
2019-Mar-08 10:04 UTC
[Libguestfs] [PATCH nbdkit] Minimal implementation of NBD Structured Replies.
This is about the simplest implementation of NBD Structured Replies (SR) that we can do right now. It accepts NBD_OPT_STRUCTURED_REPLIES when negotiated by the client, but only sends back the simplest possible SR when required to by NBD_CMD_READ. The rest of the time it will send back simple replies as before. We do not modify the plugin API so plugins are unable to send complex SRs. Also we do not handle human-readable error messages yet because that would require changes in how we handle nbdkit_error(). Also we do not understand NBD_CMD_FLAG_DF, but that seems to be OK because (a) we don't advertize the feature and (b) we only send back a single chunk anyway. --- docs/nbdkit-protocol.pod | 6 +- server/protocol.h | 59 +++++++++--- plugins/nbd/nbd.c | 4 +- server/connections.c | 189 ++++++++++++++++++++++++++++++++------- tests/test-layers.c | 2 +- 5 files changed, 212 insertions(+), 48 deletions(-) diff --git a/docs/nbdkit-protocol.pod b/docs/nbdkit-protocol.pod index c44db6a..68438fa 100644 --- a/docs/nbdkit-protocol.pod +++ b/docs/nbdkit-protocol.pod @@ -122,7 +122,11 @@ Supported in nbdkit E<ge> 1.9.9. =item Structured Replies -I<Not supported>. +Supported in nbdkit E<ge> 1.11.8. + +However we don’t expose the capability to send structured replies to +plugins yet, nor do we send human-readable error messages using this +facility. =item Block Status diff --git a/server/protocol.h b/server/protocol.h index 003fc45..0aadd46 100644 --- a/server/protocol.h +++ b/server/protocol.h @@ -1,5 +1,5 @@ /* nbdkit - * Copyright (C) 2013-2018 Red Hat Inc. + * Copyright (C) 2013-2019 Red Hat Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -98,12 +98,13 @@ extern const char *name_of_nbd_flag (int); /* NBD options (new style handshake only). */ extern const char *name_of_nbd_opt (int); -#define NBD_OPT_EXPORT_NAME 1 -#define NBD_OPT_ABORT 2 -#define NBD_OPT_LIST 3 -#define NBD_OPT_STARTTLS 5 -#define NBD_OPT_INFO 6 -#define NBD_OPT_GO 7 +#define NBD_OPT_EXPORT_NAME 1 +#define NBD_OPT_ABORT 2 +#define NBD_OPT_LIST 3 +#define NBD_OPT_STARTTLS 5 +#define NBD_OPT_INFO 6 +#define NBD_OPT_GO 7 +#define NBD_OPT_STRUCTURED_REPLY 8 extern const char *name_of_nbd_rep (int); #define NBD_REP_ACK 1 @@ -144,15 +145,49 @@ struct request { uint32_t count; /* Request length. */ } __attribute__((packed)); -/* Reply (server -> client). */ -struct reply { - uint32_t magic; /* NBD_REPLY_MAGIC. */ +/* Simple reply (server -> client). */ +struct simple_reply { + uint32_t magic; /* NBD_SIMPLE_REPLY_MAGIC. */ uint32_t error; /* NBD_SUCCESS or one of NBD_E*. */ uint64_t handle; /* Opaque handle. */ } __attribute__((packed)); -#define NBD_REQUEST_MAGIC 0x25609513 -#define NBD_REPLY_MAGIC 0x67446698 +/* Structured reply (server -> client). */ +struct structured_reply { + uint32_t magic; /* NBD_STRUCTURED_REPLY_MAGIC. */ + uint16_t flags; /* NBD_REPLY_FLAG_* */ + uint16_t type; /* NBD_REPLY_TYPE_* */ + uint64_t handle; /* Opaque handle. */ + uint32_t length; /* Length of payload which follows. */ +} __attribute__((packed)); + +struct structured_reply_offset_data { + uint64_t offset; /* offset */ + /* Followed by data. */ +} __attribute__((packed)); + +struct structured_reply_error { + uint32_t error; /* NBD_E* error number */ + uint16_t len; /* Length of human readable error. */ + /* Followed by human readable error string. */ +} __attribute__((packed)); + +#define NBD_REQUEST_MAGIC 0x25609513 +#define NBD_SIMPLE_REPLY_MAGIC 0x67446698 +#define NBD_STRUCTURED_REPLY_MAGIC 0x668e33ef + +/* Structured reply flags. */ +extern const char *name_of_nbd_reply_flag (int); +#define NBD_REPLY_FLAG_DONE (1<<0) + +/* Structured reply types. */ +extern const char *name_of_nbd_reply_type (int); +#define NBD_REPLY_TYPE_NONE 0 +#define NBD_REPLY_TYPE_OFFSET_DATA 1 +#define NBD_REPLY_TYPE_OFFSET_HOLE 2 +#define NBD_REPLY_TYPE_BLOCK_STATUS 3 +#define NBD_REPLY_TYPE_ERROR 32769 +#define NBD_REPLY_TYPE_ERROR_OFFSET 32770 /* NBD commands. */ extern const char *name_of_nbd_cmd (int); diff --git a/plugins/nbd/nbd.c b/plugins/nbd/nbd.c index 674f4a4..2f494cd 100644 --- a/plugins/nbd/nbd.c +++ b/plugins/nbd/nbd.c @@ -345,7 +345,7 @@ nbd_request (struct handle *h, uint16_t flags, uint16_t type, uint64_t offset, static int nbd_reply_raw (struct handle *h, int *fd) { - struct reply rep; + struct simple_reply rep; struct transaction *trans; void *buf; uint32_t count; @@ -353,7 +353,7 @@ nbd_reply_raw (struct handle *h, int *fd) *fd = -1; if (read_full (h->fd, &rep, sizeof rep) < 0) return nbd_mark_dead (h); - if (be32toh (rep.magic) != NBD_REPLY_MAGIC) + if (be32toh (rep.magic) != NBD_SIMPLE_REPLY_MAGIC) return nbd_mark_dead (h); nbdkit_debug ("received reply for cookie %#" PRIx64 ", status %s", rep.handle, name_of_nbd_error(be32toh (rep.error))); diff --git a/server/connections.c b/server/connections.c index 51d4912..aeb27f8 100644 --- a/server/connections.c +++ b/server/connections.c @@ -86,6 +86,7 @@ struct connection { bool can_fua; bool can_multi_conn; bool using_tls; + bool structured_replies; int sockin, sockout; connection_recv_function recv; @@ -905,6 +906,26 @@ _negotiate_handshake_newstyle_options (struct connection *conn) break; + case NBD_OPT_STRUCTURED_REPLY: + if (optlen != 0) { + if (send_newstyle_option_reply (conn, option, NBD_REP_ERR_INVALID) + == -1) + return -1; + if (conn_recv_full (conn, data, optlen, + "read: %s: %m", name_of_nbd_opt (option)) == -1) + return -1; + continue; + } + + debug ("newstyle negotiation: %s: client requested structured replies", + name_of_nbd_opt (option)); + + if (send_newstyle_option_reply (conn, option, NBD_REP_ACK) == -1) + return -1; + + conn->structured_replies = true; + break; + default: /* Unknown option. */ if (send_newstyle_option_reply (conn, option, NBD_REP_ERR_UNSUP) == -1) @@ -1224,12 +1245,123 @@ nbd_errno (int error) } } +static int +send_simple_reply (struct connection *conn, + uint64_t handle, uint16_t cmd, + const char *buf, uint32_t count, + uint32_t error) +{ + ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn->write_lock); + struct simple_reply reply; + int r; + + reply.magic = htobe32 (NBD_SIMPLE_REPLY_MAGIC); + reply.handle = handle; + reply.error = htobe32 (nbd_errno (error)); + + r = conn->send (conn, &reply, sizeof reply); + if (r == -1) { + nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd)); + return set_status (conn, -1); + } + + /* Send the read data buffer. */ + if (cmd == NBD_CMD_READ && !error) { + r = conn->send (conn, buf, count); + if (r == -1) { + nbdkit_error ("write data: %s: %m", name_of_nbd_cmd (cmd)); + return set_status (conn, -1); + } + } + + return 1; /* command processed ok */ +} + +static int +send_structured_reply_read (struct connection *conn, + uint64_t handle, uint16_t cmd, + const char *buf, uint32_t count, uint64_t offset) +{ + /* Once we are really using structured replies and sending data back + * in chunks, we'll be able to grab the write lock for each chunk, + * allowing other threads to interleave replies. As we're not doing + * that yet we acquire the lock for the whole function. + */ + ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn->write_lock); + struct structured_reply reply; + struct structured_reply_offset_data offset_data; + int r; + + assert (cmd == NBD_CMD_READ); + + reply.magic = htobe32 (NBD_STRUCTURED_REPLY_MAGIC); + reply.handle = handle; + reply.flags = htobe16 (NBD_REPLY_FLAG_DONE); + reply.type = htobe16 (NBD_REPLY_TYPE_OFFSET_DATA); + reply.length = htobe32 (sizeof offset_data + count); + + r = conn->send (conn, &reply, sizeof reply); + if (r == -1) { + nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd)); + return set_status (conn, -1); + } + + /* Send the offset + read data buffer. */ + offset_data.offset = htobe64 (offset); + r = conn->send (conn, &offset_data, sizeof offset_data); + if (r == -1) { + nbdkit_error ("write data: %s: %m", name_of_nbd_cmd (cmd)); + return set_status (conn, -1); + } + + r = conn->send (conn, buf, count); + if (r == -1) { + nbdkit_error ("write data: %s: %m", name_of_nbd_cmd (cmd)); + return set_status (conn, -1); + } + + return 1; /* command processed ok */ +} + +static int +send_structured_reply_error (struct connection *conn, + uint64_t handle, uint16_t cmd, uint32_t error) +{ + ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn->write_lock); + struct structured_reply reply; + struct structured_reply_error error_data; + int r; + + reply.magic = htobe32 (NBD_STRUCTURED_REPLY_MAGIC); + reply.handle = handle; + reply.flags = htobe16 (NBD_REPLY_FLAG_DONE); + reply.type = htobe16 (NBD_REPLY_TYPE_ERROR); + reply.length = htobe32 (sizeof error_data + 0 /* no human readable error */); + + r = conn->send (conn, &reply, sizeof reply); + if (r == -1) { + nbdkit_error ("write error reply: %m"); + return set_status (conn, -1); + } + + /* Send the error. */ + error_data.error = htobe32 (error); + error_data.len = htobe16 (0); + r = conn->send (conn, &error_data, sizeof error_data); + if (r == -1) { + nbdkit_error ("write data: %s: %m", name_of_nbd_cmd (cmd)); + return set_status (conn, -1); + } + /* No human readable error message at the moment. */ + + return 1; /* command processed ok */ +} + static int recv_request_send_reply (struct connection *conn) { int r; struct request request; - struct reply reply; uint16_t cmd, flags; uint32_t magic, count, error = 0; uint64_t offset; @@ -1317,40 +1449,33 @@ recv_request_send_reply (struct connection *conn) /* Send the reply packet. */ send_reply: - { - ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn->write_lock); - if (get_status (conn) < 0) - return -1; - reply.magic = htobe32 (NBD_REPLY_MAGIC); - reply.handle = request.handle; - reply.error = htobe32 (nbd_errno (error)); + if (get_status (conn) < 0) + return -1; - if (error != 0) { - /* Since we're about to send only the limited NBD_E* errno to the - * client, don't lose the information about what really happened - * on the server side. Make sure there is a way for the operator - * to retrieve the real error. - */ - debug ("sending error reply: %s", strerror (error)); - } - - r = conn->send (conn, &reply, sizeof reply); - if (r == -1) { - nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd)); - return set_status (conn, -1); - } - - /* Send the read data buffer. */ - if (cmd == NBD_CMD_READ && !error) { - r = conn->send (conn, buf, count); - if (r == -1) { - nbdkit_error ("write data: %s: %m", name_of_nbd_cmd (cmd)); - return set_status (conn, -1); - } - } + if (error != 0) { + /* Since we're about to send only the limited NBD_E* errno to the + * client, don't lose the information about what really happened + * on the server side. Make sure there is a way for the operator + * to retrieve the real error. + */ + debug ("sending error reply: %s", strerror (error)); } - return 1; /* command processed ok */ + /* Currently we prefer to send simple replies for everything except + * where we have to (ie. NBD_CMD_READ when structured_replies have + * been negotiated). However this prevents us from sending + * human-readable error messages to the client, so we should + * reconsider this in future. + */ + if (conn->structured_replies && cmd == NBD_CMD_READ) { + if (!error) + return send_structured_reply_read (conn, request.handle, cmd, + buf, count, offset); + else + return send_structured_reply_error (conn, request.handle, cmd, error); + } + else + return send_simple_reply (conn, request.handle, cmd, buf, count, error); } /* Write buffer to conn->sockout and either succeed completely diff --git a/tests/test-layers.c b/tests/test-layers.c index c8d15d2..6149c02 100644 --- a/tests/test-layers.c +++ b/tests/test-layers.c @@ -92,7 +92,7 @@ main (int argc, char *argv[]) struct new_handshake_finish handshake_finish; uint16_t eflags; struct request request; - struct reply reply; + struct simple_reply reply; char data[512]; #ifndef HAVE_EXIT_WITH_PARENT -- 2.20.1
Eric Blake
2019-Mar-08 15:03 UTC
Re: [Libguestfs] [PATCH nbdkit] Minimal implementation of NBD Structured Replies.
On 3/8/19 4:04 AM, Richard W.M. Jones wrote:> This is about the simplest implementation of NBD Structured Replies > (SR) that we can do right now. > > It accepts NBD_OPT_STRUCTURED_REPLIES when negotiated by the client, > but only sends back the simplest possible SR when required to by > NBD_CMD_READ. The rest of the time it will send back simple replies > as before. We do not modify the plugin API so plugins are unable to > send complex SRs.Yep, sounds compliant, and similar to how I added it in qemu. I'll read through this one in detail, but interoperability with qemu is already a good sign that it's workable.> > Also we do not handle human-readable error messages yet because that > would require changes in how we handle nbdkit_error(). > > Also we do not understand NBD_CMD_FLAG_DF, but that seems to be OK > because (a) we don't advertize the feature and (b) we only send back a > single chunk anyway.Or, we COULD advertise it because we always honor it (but that's a larger diffstat, and thus at odds with "minimal implementation"). Either way works.> +/* Structured reply types. */ > +extern const char *name_of_nbd_reply_type (int); > +#define NBD_REPLY_TYPE_NONE 0 > +#define NBD_REPLY_TYPE_OFFSET_DATA 1 > +#define NBD_REPLY_TYPE_OFFSET_HOLE 2 > +#define NBD_REPLY_TYPE_BLOCK_STATUS 3 > +#define NBD_REPLY_TYPE_ERROR 32769 > +#define NBD_REPLY_TYPE_ERROR_OFFSET 32770Worth writing these later ones in hex or via a helper macro that does ((1 << 15) | value)? Or would that mess up the generated protocol-to-lookup magic?> +++ b/plugins/nbd/nbd.c > @@ -345,7 +345,7 @@ nbd_request (struct handle *h, uint16_t flags, uint16_t type, uint64_t offset, > static int > nbd_reply_raw (struct handle *h, int *fd) > { > - struct reply rep; > + struct simple_reply rep; > struct transaction *trans; > void *buf; > uint32_t count; > @@ -353,7 +353,7 @@ nbd_reply_raw (struct handle *h, int *fd) > *fd = -1; > if (read_full (h->fd, &rep, sizeof rep) < 0) > return nbd_mark_dead (h); > - if (be32toh (rep.magic) != NBD_REPLY_MAGIC) > + if (be32toh (rep.magic) != NBD_SIMPLE_REPLY_MAGIC) > return nbd_mark_dead (h); > nbdkit_debug ("received reply for cookie %#" PRIx64 ", status %s", > rep.handle, name_of_nbd_error(be32toh (rep.error)));Teaching the nbd plugin to negotiate structured replies as the client is obviously a separate patch, I'm fine if you leave that to me :)> +static int > +send_structured_reply_read (struct connection *conn, > + uint64_t handle, uint16_t cmd, > + const char *buf, uint32_t count, uint64_t offset) > +{ > + /* Once we are really using structured replies and sending data back > + * in chunks, we'll be able to grab the write lock for each chunk, > + * allowing other threads to interleave replies. As we're not doing > + * that yet we acquire the lock for the whole function. > + */Fair enough.> + ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn->write_lock); > + struct structured_reply reply; > + struct structured_reply_offset_data offset_data; > + int r; > + > + assert (cmd == NBD_CMD_READ); > + > + reply.magic = htobe32 (NBD_STRUCTURED_REPLY_MAGIC); > + reply.handle = handle; > + reply.flags = htobe16 (NBD_REPLY_FLAG_DONE); > + reply.type = htobe16 (NBD_REPLY_TYPE_OFFSET_DATA); > + reply.length = htobe32 (sizeof offset_data + count);This line is correct, but I had to remind myself of C precedence rules on this one; writing 'count + sizeof offset_data' instead has the same effect without worrying whether sizeof binds with higher or lower precedence than +.> @@ -1317,40 +1449,33 @@ recv_request_send_reply (struct connection *conn) > > /* Send the reply packet. */ > send_reply: > - { > - ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn->write_lock); > - if (get_status (conn) < 0) > - return -1; > - reply.magic = htobe32 (NBD_REPLY_MAGIC); > - reply.handle = request.handle; > - reply.error = htobe32 (nbd_errno (error)); > + if (get_status (conn) < 0) > + return -1;Hmm, previously get_status() was checked under lock. But since it is a thread-local variable, we should be safe grabbing it while unlocked. ACK. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Richard W.M. Jones
2019-Mar-08 16:05 UTC
Re: [Libguestfs] [PATCH nbdkit] Minimal implementation of NBD Structured Replies.
On Fri, Mar 08, 2019 at 09:03:05AM -0600, Eric Blake wrote:> On 3/8/19 4:04 AM, Richard W.M. Jones wrote: > > Also we do not understand NBD_CMD_FLAG_DF, but that seems to be OK > > because (a) we don't advertize the feature and (b) we only send back a > > single chunk anyway. > > Or, we COULD advertise it because we always honor it (but that's a > larger diffstat, and thus at odds with "minimal implementation"). Either > way works.What's also unclear to me is how NBD_CMD_FLAG_DF interacts with NBD_CMD_BLOCK_STATUS. What does it mean for extents which are by their nature fragmented?> > +/* Structured reply types. */ > > +extern const char *name_of_nbd_reply_type (int); > > +#define NBD_REPLY_TYPE_NONE 0 > > +#define NBD_REPLY_TYPE_OFFSET_DATA 1 > > +#define NBD_REPLY_TYPE_OFFSET_HOLE 2 > > +#define NBD_REPLY_TYPE_BLOCK_STATUS 3 > > +#define NBD_REPLY_TYPE_ERROR 32769 > > +#define NBD_REPLY_TYPE_ERROR_OFFSET 32770 > > Worth writing these later ones in hex or via a helper macro that does > ((1 << 15) | value)? Or would that mess up the generated > protocol-to-lookup magic?Could do it either way really. The sed magic uses the symbol (eg. NBD_REPLY_TYPE_ERROR) not the value so either should work. I'll play around with it to see which looks nicer.> > + reply.magic = htobe32 (NBD_STRUCTURED_REPLY_MAGIC); > > + reply.handle = handle; > > + reply.flags = htobe16 (NBD_REPLY_FLAG_DONE); > > + reply.type = htobe16 (NBD_REPLY_TYPE_OFFSET_DATA); > > + reply.length = htobe32 (sizeof offset_data + count); > > This line is correct, but I had to remind myself of C precedence rules > on this one; writing 'count + sizeof offset_data' instead has the same > effect without worrying whether sizeof binds with higher or lower > precedence than +.Yup I'll change this. Thanks, Rich. -- Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones Read my programming and virtualization blog: http://rwmj.wordpress.com virt-df lists disk usage of guests without needing to install any software inside the virtual machine. Supports Linux and Windows. http://people.redhat.com/~rjones/virt-df/
Possibly Parallel Threads
- [PATCH nbdkit 2/2] server: Split out NBD protocol code from connections code.
- Re: [PATCH nbdkit] Minimal implementation of NBD Structured Replies.
- [nbdkit PATCH v2 2/2] server: Group related transmission send()s
- [PATCH nbdkit 3/4] common/protocol: Update nbd-protocol.h so it matches libnbd’s copy.
- [nbdkit PATCH 2/2] server: Cork around grouped transmission send()s