Eric Blake
2023-Jul-21 16:08 UTC
[Libguestfs] [libnbd PATCH v4 0/6] NBD 64-bit extensions (libnbd portions, prep work)
This is preliminary work towards enabling 64-bit NBD extensions in libnbd, based on review feedback given by Laszlo and others. The v3 patch series is here: https://listman.redhat.com/archives/libguestfs/2023-May/031617.html This series focuses on patches 1 and 3-6 of v3 (patch 2 from that series was already pushed earlier); there remains more work to do on 7-22 that will follow later. For convenience, I've tagged this as https://repo.or.cz/libnbd/ericb.git/shortlog/refs/tags/exthdr-v4a Comparison to v3 (more notes in individual patches): 001/6:[down] 'internal: Track chunk payload length left' 002/6:[0106] [FC] 'block_status: Refactor array storage' 003/6:[0033] [FC] 'protocol: Add definitions for extended headers' 004/6:[0032] [FC] 'states: Prepare to send 64-bit requests' 005/6:[0137] [FC] 'states: Prepare to receive 64-bit replies' 006/6:[0017] [FC] 'states: Break deadlock if server goofs on extended replies' Eric Blake (6): internal: Track chunk payload length left block_status: Refactor array storage protocol: Add definitions for extended headers states: Prepare to send 64-bit requests states: Prepare to receive 64-bit replies states: Break deadlock if server goofs on extended replies lib/internal.h | 16 ++- lib/nbd-protocol.h | 71 ++++++++--- generator/API.ml | 20 ++-- generator/state_machine.ml | 9 +- generator/states-issue-command.c | 29 +++-- generator/states-reply.c | 109 +++++++++++++---- generator/states-reply-chunk.c | 197 ++++++++++++++++++++----------- lib/rw.c | 17 ++- tests/Makefile.am | 4 + tests/pwrite-extended.c | 108 +++++++++++++++++ .gitignore | 1 + 11 files changed, 448 insertions(+), 133 deletions(-) create mode 100644 tests/pwrite-extended.c base-commit: 10b37a26360c19effec2733449ee5a37439181bf -- 2.41.0
Eric Blake
2023-Jul-21 16:08 UTC
[Libguestfs] [libnbd PATCH v4 1/6] internal: Track chunk payload length left
While working on later patches, I noticed that I was frequently referring to the wire contents of h->sbuf.reply.hdr.structured.length, byte-swapping it, then repeating open-coded math for how many bytes were consumed in earlier states. It's easier to normalize the remaining bytes to be parsed into a variable that persists across the life of the reply, and also makes it possible to track that by the end of any structured reply, we have dealt with all of the bytes that the header advertised. Some of the worst implicit assumptions in block status code will be fixed in the next patch with the help of this new variable. Signed-off-by: Eric Blake <eblake at redhat.com> --- v4: new patch, replacing part of v3's 5/22 --- lib/internal.h | 3 ++ generator/states-reply-chunk.c | 59 ++++++++++++++++------------------ 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/lib/internal.h b/lib/internal.h index 4b0043b3..b38ae524 100644 --- a/lib/internal.h +++ b/lib/internal.h @@ -304,6 +304,9 @@ struct nbd_handle { string_vector querylist; size_t querynum; + /* Chunk payload length remaining to be parsed */ + size_t payload_left; + /* When receiving block status, this is used. */ uint32_t *bs_entries; diff --git a/generator/states-reply-chunk.c b/generator/states-reply-chunk.c index 26b8a6b0..da5fc426 100644 --- a/generator/states-reply-chunk.c +++ b/generator/states-reply-chunk.c @@ -69,6 +69,7 @@ REPLY.CHUNK_REPLY.START: /* Skip an unexpected structured reply, including to an unknown cookie. */ if (cmd == NULL || !h->structured_replies) goto resync; + h->payload_left = length; switch (type) { case NBD_REPLY_TYPE_NONE: @@ -87,6 +88,7 @@ REPLY.CHUNK_REPLY.START: goto resync; h->rbuf = &h->sbuf.reply.payload.offset_data; h->rlen = sizeof h->sbuf.reply.payload.offset_data; + h->payload_left -= h->rlen; SET_NEXT_STATE (%RECV_OFFSET_DATA); break; @@ -96,6 +98,7 @@ REPLY.CHUNK_REPLY.START: goto resync; h->rbuf = &h->sbuf.reply.payload.offset_hole; h->rlen = sizeof h->sbuf.reply.payload.offset_hole; + h->payload_left -= h->rlen; SET_NEXT_STATE (%RECV_OFFSET_HOLE); break; @@ -141,7 +144,8 @@ REPLY.CHUNK_REPLY.START: resync: h->rbuf = NULL; - h->rlen = length; + h->rlen = h->payload_left; + h->payload_left = 0; SET_NEXT_STATE (%RESYNC); return 0; @@ -156,7 +160,8 @@ REPLY.CHUNK_REPLY.RECV_ERROR: SET_NEXT_STATE (%.READY); return 0; case 0: - length = be32toh (h->sbuf.reply.hdr.structured.length); + length = h->payload_left; + h->payload_left -= MIN (length, sizeof h->sbuf.reply.payload.error.error); assert (length >= sizeof h->sbuf.reply.payload.error.error.error); assert (cmd); @@ -164,12 +169,13 @@ REPLY.CHUNK_REPLY.RECV_ERROR: goto resync; msglen = be16toh (h->sbuf.reply.payload.error.error.len); - if (msglen > length - sizeof h->sbuf.reply.payload.error.error || + if (msglen > h->payload_left || msglen > sizeof h->sbuf.reply.payload.error.msg) goto resync; h->rbuf = h->sbuf.reply.payload.error.msg; h->rlen = msglen; + h->payload_left -= h->rlen; SET_NEXT_STATE (%RECV_ERROR_MESSAGE); } return 0; @@ -180,12 +186,13 @@ REPLY.CHUNK_REPLY.RECV_ERROR: if (cmd->error == 0) cmd->error = nbd_internal_errno_of_nbd_error (error); h->rbuf = NULL; - h->rlen = length - MIN (length, sizeof h->sbuf.reply.payload.error.error); + h->rlen = h->payload_left; + h->payload_left = 0; SET_NEXT_STATE (%RESYNC); return 0; REPLY.CHUNK_REPLY.RECV_ERROR_MESSAGE: - uint32_t length, msglen; + uint32_t msglen; uint16_t type; switch (recv_into_rbuf (h)) { @@ -195,33 +202,31 @@ REPLY.CHUNK_REPLY.RECV_ERROR_MESSAGE: SET_NEXT_STATE (%.READY); return 0; case 0: - length = be32toh (h->sbuf.reply.hdr.structured.length); msglen = be16toh (h->sbuf.reply.payload.error.error.len); type = be16toh (h->sbuf.reply.hdr.structured.type); - length -= sizeof h->sbuf.reply.payload.error.error + msglen; - if (msglen) debug (h, "structured error server message: %.*s", (int)msglen, h->sbuf.reply.payload.error.msg); /* Special case two specific errors; silently ignore tail for all others */ h->rbuf = NULL; - h->rlen = length; + h->rlen = h->payload_left; switch (type) { case NBD_REPLY_TYPE_ERROR: - if (length != 0) + if (h->payload_left != 0) debug (h, "ignoring unexpected slop after error message, " "the server may have a bug"); break; case NBD_REPLY_TYPE_ERROR_OFFSET: - if (length != sizeof h->sbuf.reply.payload.error.offset) + if (h->payload_left != sizeof h->sbuf.reply.payload.error.offset) debug (h, "unable to safely extract error offset, " "the server may have a bug"); else h->rbuf = &h->sbuf.reply.payload.error.offset; break; } + h->payload_left = 0; SET_NEXT_STATE (%RECV_ERROR_TAIL); } return 0; @@ -286,7 +291,6 @@ REPLY.CHUNK_REPLY.RECV_ERROR_TAIL: REPLY.CHUNK_REPLY.RECV_OFFSET_DATA: struct command *cmd = h->reply_cmd; uint64_t offset; - uint32_t length; switch (recv_into_rbuf (h)) { case -1: SET_NEXT_STATE (%.DEAD); return 0; @@ -295,29 +299,24 @@ REPLY.CHUNK_REPLY.RECV_OFFSET_DATA: SET_NEXT_STATE (%.READY); return 0; case 0: - length = be32toh (h->sbuf.reply.hdr.structured.length); offset = be64toh (h->sbuf.reply.payload.offset_data.offset); assert (cmd); /* guaranteed by CHECK */ - assert (cmd->data && cmd->type == NBD_CMD_READ); - /* Length of the data following. */ - length -= 8; - /* Is the data within bounds? */ - if (! structured_reply_in_bounds (offset, length, cmd)) { + if (! structured_reply_in_bounds (offset, h->payload_left, cmd)) { SET_NEXT_STATE (%.DEAD); return 0; } if (cmd->data_seen <= cmd->count) - cmd->data_seen += length; + cmd->data_seen += h->payload_left; /* Now this is the byte offset in the read buffer. */ offset -= cmd->offset; /* Set up to receive the data directly to the user buffer. */ h->rbuf = (char *)cmd->data + offset; - h->rlen = length; + h->rlen = h->payload_left; SET_NEXT_STATE (%RECV_OFFSET_DATA_DATA); } return 0; @@ -325,7 +324,6 @@ REPLY.CHUNK_REPLY.RECV_OFFSET_DATA: REPLY.CHUNK_REPLY.RECV_OFFSET_DATA_DATA: struct command *cmd = h->reply_cmd; uint64_t offset; - uint32_t length; switch (recv_into_rbuf (h)) { case -1: SET_NEXT_STATE (%.DEAD); return 0; @@ -334,7 +332,6 @@ REPLY.CHUNK_REPLY.RECV_OFFSET_DATA_DATA: SET_NEXT_STATE (%.READY); return 0; case 0: - length = be32toh (h->sbuf.reply.hdr.structured.length); offset = be64toh (h->sbuf.reply.payload.offset_data.offset); assert (cmd); /* guaranteed by CHECK */ @@ -343,11 +340,12 @@ REPLY.CHUNK_REPLY.RECV_OFFSET_DATA_DATA: if (CALL_CALLBACK (cmd->cb.fn.chunk, (char *)cmd->data + (offset - cmd->offset), - length - sizeof offset, offset, + h->payload_left, offset, LIBNBD_READ_DATA, &error) == -1) if (cmd->error == 0) cmd->error = error ? error : EPROTO; } + h->payload_left = 0; SET_NEXT_STATE (%FINISH); } @@ -369,7 +367,6 @@ REPLY.CHUNK_REPLY.RECV_OFFSET_HOLE: length = be32toh (h->sbuf.reply.payload.offset_hole.length); assert (cmd); /* guaranteed by CHECK */ - assert (cmd->data && cmd->type == NBD_CMD_READ); /* Is the data within bounds? */ @@ -405,8 +402,8 @@ REPLY.CHUNK_REPLY.RECV_OFFSET_HOLE: REPLY.CHUNK_REPLY.RECV_BS_ENTRIES: struct command *cmd = h->reply_cmd; - uint32_t length; size_t i; + size_t count; uint32_t context_id; switch (recv_into_rbuf (h)) { @@ -416,20 +413,20 @@ REPLY.CHUNK_REPLY.RECV_BS_ENTRIES: SET_NEXT_STATE (%.READY); return 0; case 0: - length = be32toh (h->sbuf.reply.hdr.structured.length); - assert (cmd); /* guaranteed by CHECK */ assert (cmd->type == NBD_CMD_BLOCK_STATUS); assert (CALLBACK_IS_NOT_NULL (cmd->cb.fn.extent)); assert (h->bs_entries); - assert (length >= 12); + assert (h->payload_left >= 12); assert (h->meta_valid); /* Need to byte-swap the entries returned, but apart from that we * don't validate them. */ - for (i = 0; i < length/4; ++i) + for (i = 0; i < h->payload_left / sizeof *h->bs_entries; ++i) h->bs_entries[i] = be32toh (h->bs_entries[i]); + count = (h->payload_left / sizeof *h->bs_entries) - 1; + h->payload_left = 0; /* Look up the context ID. */ context_id = h->bs_entries[0]; @@ -443,8 +440,7 @@ REPLY.CHUNK_REPLY.RECV_BS_ENTRIES: if (CALL_CALLBACK (cmd->cb.fn.extent, h->meta_contexts.ptr[i].name, cmd->offset, - &h->bs_entries[1], (length-4) / 4, - &error) == -1) + &h->bs_entries[1], count, &error) == -1) if (cmd->error == 0) cmd->error = error ? error : EPROTO; } @@ -494,6 +490,7 @@ REPLY.CHUNK_REPLY.RESYNC: REPLY.CHUNK_REPLY.FINISH: uint16_t flags; + assert (h->payload_left == 0); flags = be16toh (h->sbuf.reply.hdr.structured.flags); if (flags & NBD_REPLY_FLAG_DONE) { SET_NEXT_STATE (%^FINISH_COMMAND); -- 2.41.0
Eric Blake
2023-Jul-21 16:08 UTC
[Libguestfs] [libnbd PATCH v4 2/6] block_status: Refactor array storage
For 32-bit block status, we were able to cheat and use an array with an odd number of elements, with array[0] holding the context id, and passing &array[1] to the user's callback. But once we have 64-bit extents, we can no longer abuse array element 0 like that, for two reasons: 64-bit extents contain uint64_t which might not be alignment-compatible with an array of uint32_t on all architectures, and the new NBD_REPLY_TYPE_BLOCK_STATUS_EXT adds an additional count field before the array. What's more, there's no need to byte-swap the entries if we won't be calling the user's callback, when the server sent us an unknown context id. Split out a new state CHUNK_REPLY.BS_HEADER to receive the context id (and eventually the new count field for 64-bit replies) separately from the extents array. Add another type nbd_chunk_block_status_32 in the payload section for tracking it (with a name anticipating the 64-bit counterpart type under extended headers). Add a helper function bs_reply_length_ok() to enable more assertions that our memory management is sane without quite so much open-coding of magic values. With these changes, the byteswap of the entries can be delayed until we know we need to call the user's callback. No behavioral change, other than the rare possibility of landing in the new state. Signed-off-by: Eric Blake <eblake at redhat.com> --- v4: add more assertions, add bs_reply_length_ok() to factor out computation of length checks [Laszlo], rebase to state machine rename, do byte-swap later, add bs->count for tracking number of descriptors --- lib/internal.h | 2 + lib/nbd-protocol.h | 16 +++-- generator/state_machine.ml | 9 ++- generator/states-reply-chunk.c | 108 ++++++++++++++++++++++++--------- 4 files changed, 98 insertions(+), 37 deletions(-) diff --git a/lib/internal.h b/lib/internal.h index b38ae524..c7793f1f 100644 --- a/lib/internal.h +++ b/lib/internal.h @@ -254,6 +254,7 @@ struct nbd_handle { uint64_t align_; /* Start reply.payload on an 8-byte alignment */ struct nbd_chunk_offset_data offset_data; struct nbd_chunk_offset_hole offset_hole; + struct nbd_chunk_block_status_32 bs_hdr_32; struct { struct nbd_chunk_error error; char msg[NBD_MAX_STRING]; /* Common to all error types */ @@ -308,6 +309,7 @@ struct nbd_handle { size_t payload_left; /* When receiving block status, this is used. */ + size_t bs_count; /* count of block descriptors (not array entries!) */ uint32_t *bs_entries; /* Commands which are waiting to be issued [meaning the request diff --git a/lib/nbd-protocol.h b/lib/nbd-protocol.h index c967c840..58583d1d 100644 --- a/lib/nbd-protocol.h +++ b/lib/nbd-protocol.h @@ -182,12 +182,6 @@ struct nbd_fixed_new_option_reply_meta_context { /* followed by a string */ } NBD_ATTRIBUTE_PACKED; -/* NBD_REPLY_TYPE_BLOCK_STATUS block descriptor. */ -struct nbd_block_descriptor { - uint32_t length; /* length of block */ - uint32_t status_flags; /* block type (hole etc) */ -} NBD_ATTRIBUTE_PACKED; - /* Request (client -> server). */ struct nbd_request { uint32_t magic; /* NBD_REQUEST_MAGIC. */ @@ -224,6 +218,16 @@ struct nbd_chunk_offset_hole { uint32_t length; /* Length of hole. */ } NBD_ATTRIBUTE_PACKED; +struct nbd_chunk_block_status_32 { + uint32_t context_id; /* metadata context ID */ + /* followed by array of nbd_block_descriptor_32 extents */ +} NBD_ATTRIBUTE_PACKED; + +struct nbd_block_descriptor_32 { + uint32_t length; /* length of block */ + uint32_t status_flags; /* block type (hole etc) */ +} NBD_ATTRIBUTE_PACKED; + struct nbd_chunk_error { uint32_t error; /* NBD_E* error number */ uint16_t len; /* Length of human readable error. */ diff --git a/generator/state_machine.ml b/generator/state_machine.ml index 3a912508..7a5bc59b 100644 --- a/generator/state_machine.ml +++ b/generator/state_machine.ml @@ -864,10 +864,17 @@ and external_events = []; }; + State { + default_state with + name = "RECV_BS_HEADER"; + comment = "Receive header of a chunk reply block-status payload"; + external_events = []; + }; + State { default_state with name = "RECV_BS_ENTRIES"; - comment = "Receive a chunk reply block-status payload"; + comment = "Receive entries array of chunk reply block-status payload"; external_events = []; }; diff --git a/generator/states-reply-chunk.c b/generator/states-reply-chunk.c index da5fc426..0d76c159 100644 --- a/generator/states-reply-chunk.c +++ b/generator/states-reply-chunk.c @@ -43,6 +43,28 @@ structured_reply_in_bounds (uint64_t offset, uint32_t length, return true; } +/* Return true if payload length of block status reply is valid. + */ +static bool +bs_reply_length_ok (uint16_t type, uint32_t length) +{ + /* TODO support 64-bit replies */ + size_t prefix_len = sizeof (struct nbd_chunk_block_status_32); + size_t extent_len = sizeof (struct nbd_block_descriptor_32); + assert (type == NBD_REPLY_TYPE_BLOCK_STATUS); + + /* At least one descriptor is needed after id prefix */ + if (length < prefix_len + extent_len) + return false; + + /* There must be an integral number of extents */ + length -= prefix_len; + if (length % extent_len != 0) + return false; + + return true; +} + STATE_MACHINE { REPLY.CHUNK_REPLY.START: struct command *cmd = h->reply_cmd; @@ -105,22 +127,13 @@ REPLY.CHUNK_REPLY.START: case NBD_REPLY_TYPE_BLOCK_STATUS: if (cmd->type != NBD_CMD_BLOCK_STATUS || !h->meta_valid || h->meta_contexts.len == 0 || - length < 12 || ((length-4) & 7) != 0) + !bs_reply_length_ok (type, length)) goto resync; assert (CALLBACK_IS_NOT_NULL (cmd->cb.fn.extent)); - /* We read the context ID followed by all the entries into a - * single array and deal with it at the end. - */ - free (h->bs_entries); - h->bs_entries = malloc (length); - if (h->bs_entries == NULL) { - SET_NEXT_STATE (%.DEAD); - set_error (errno, "malloc"); - break; - } - h->rbuf = h->bs_entries; - h->rlen = length; - SET_NEXT_STATE (%RECV_BS_ENTRIES); + /* Start by reading the context ID. */ + h->rbuf = &h->sbuf.reply.payload.bs_hdr_32; + h->rlen = sizeof h->sbuf.reply.payload.bs_hdr_32; + SET_NEXT_STATE (%RECV_BS_HEADER); break; default: @@ -400,10 +413,46 @@ REPLY.CHUNK_REPLY.RECV_OFFSET_HOLE: } return 0; + REPLY.CHUNK_REPLY.RECV_BS_HEADER: + struct command *cmd = h->reply_cmd; + uint16_t type; + + switch (recv_into_rbuf (h)) { + case -1: SET_NEXT_STATE (%.DEAD); return 0; + case 1: + save_reply_state (h); + SET_NEXT_STATE (%.READY); + return 0; + case 0: + type = be16toh (h->sbuf.reply.hdr.structured.type); + + assert (cmd); /* guaranteed by CHECK */ + assert (cmd->type == NBD_CMD_BLOCK_STATUS); + assert (bs_reply_length_ok (type, h->payload_left)); + STATIC_ASSERT (sizeof (struct nbd_block_descriptor_32) =+ 2 * sizeof *h->bs_entries, + _block_desc_is_multiple_of_bs_entry); + h->payload_left -= sizeof h->sbuf.reply.payload.bs_hdr_32; + assert (h->payload_left % sizeof (struct nbd_block_descriptor_32) == 0); + h->bs_count = h->payload_left / sizeof (struct nbd_block_descriptor_32); + + free (h->bs_entries); + h->bs_entries = malloc (h->payload_left); + if (h->bs_entries == NULL) { + SET_NEXT_STATE (%.DEAD); + set_error (errno, "malloc"); + return 0; + } + h->rbuf = h->bs_entries; + h->rlen = h->payload_left; + h->payload_left = 0; + SET_NEXT_STATE (%RECV_BS_ENTRIES); + } + return 0; + REPLY.CHUNK_REPLY.RECV_BS_ENTRIES: struct command *cmd = h->reply_cmd; size_t i; - size_t count; uint32_t context_id; switch (recv_into_rbuf (h)) { @@ -416,31 +465,30 @@ REPLY.CHUNK_REPLY.RECV_BS_ENTRIES: assert (cmd); /* guaranteed by CHECK */ assert (cmd->type == NBD_CMD_BLOCK_STATUS); assert (CALLBACK_IS_NOT_NULL (cmd->cb.fn.extent)); - assert (h->bs_entries); - assert (h->payload_left >= 12); + assert (h->bs_count && h->bs_entries); assert (h->meta_valid); - /* Need to byte-swap the entries returned, but apart from that we - * don't validate them. - */ - for (i = 0; i < h->payload_left / sizeof *h->bs_entries; ++i) - h->bs_entries[i] = be32toh (h->bs_entries[i]); - count = (h->payload_left / sizeof *h->bs_entries) - 1; - h->payload_left = 0; - /* Look up the context ID. */ - context_id = h->bs_entries[0]; + context_id = be32toh (h->sbuf.reply.payload.bs_hdr_32.context_id); for (i = 0; i < h->meta_contexts.len; ++i) if (context_id == h->meta_contexts.ptr[i].context_id) break; if (i < h->meta_contexts.len) { - /* Call the caller's extent function. */ int error = cmd->error; + const char *name = h->meta_contexts.ptr[i].name; - if (CALL_CALLBACK (cmd->cb.fn.extent, - h->meta_contexts.ptr[i].name, cmd->offset, - &h->bs_entries[1], count, &error) == -1) + /* Need to byte-swap the entries returned, but apart from that + * we don't validate them. Yes, our 32-bit public API foolishly + * tracks the number of uint32_t instead of block descriptors; + * see _block_desc_is_multiple_of_bs_entry above. + */ + for (i = 0; i < h->bs_count * 2; ++i) + h->bs_entries[i] = be32toh (h->bs_entries[i]); + + /* Call the caller's extent function. */ + if (CALL_CALLBACK (cmd->cb.fn.extent, name, cmd->offset, + h->bs_entries, h->bs_count * 2, &error) == -1) if (cmd->error == 0) cmd->error = error ? error : EPROTO; } -- 2.41.0
Eric Blake
2023-Jul-21 16:08 UTC
[Libguestfs] [libnbd PATCH v4 3/6] protocol: Add definitions for extended headers
Add the magic numbers and new structs necessary to implement the NBD protocol extension of extended headers providing 64-bit lengths. This corresponds to upstream nbd commits 36abf47d and a9384e2f on the extension-ext-header branch[1] (commit e6f3b94a for NBD_FLAG_BLOCK_STATUS_PAYLOAD is saved for a later patch). [1] https://github.com/NetworkBlockDevice/nbd/blob/extension-ext-header/doc/proto.md Signed-off-by: Eric Blake <eblake at redhat.com> Reviewed-by: Laszlo Ersek <lersek at redhat.com> --- v4: tweak comment on block_status_64.count [Laszlo], rebase to earlier struct renames (for example, nbd_block_descriptor_64 instead of nbd_structured_reply_block_status_hdr). Seemed obvious enough that I felt okay keeping Laszlo's R-b. --- lib/nbd-protocol.h | 55 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 9 deletions(-) diff --git a/lib/nbd-protocol.h b/lib/nbd-protocol.h index 58583d1d..b5a28ae4 100644 --- a/lib/nbd-protocol.h +++ b/lib/nbd-protocol.h @@ -124,6 +124,7 @@ struct nbd_fixed_new_option_reply { #define NBD_OPT_STRUCTURED_REPLY 8 #define NBD_OPT_LIST_META_CONTEXT 9 #define NBD_OPT_SET_META_CONTEXT 10 +#define NBD_OPT_EXTENDED_HEADERS 11 #define NBD_REP_ERR(val) (0x80000000 | (val)) #define NBD_REP_IS_ERR(val) (!!((val) & 0x80000000)) @@ -141,6 +142,7 @@ struct nbd_fixed_new_option_reply { #define NBD_REP_ERR_SHUTDOWN NBD_REP_ERR (7) #define NBD_REP_ERR_BLOCK_SIZE_REQD NBD_REP_ERR (8) #define NBD_REP_ERR_TOO_BIG NBD_REP_ERR (9) +#define NBD_REP_ERR_EXT_HEADER_REQD NBD_REP_ERR (10) #define NBD_INFO_EXPORT 0 #define NBD_INFO_NAME 1 @@ -182,16 +184,26 @@ struct nbd_fixed_new_option_reply_meta_context { /* followed by a string */ } NBD_ATTRIBUTE_PACKED; -/* Request (client -> server). */ +/* Compact request (client -> server). */ struct nbd_request { uint32_t magic; /* NBD_REQUEST_MAGIC. */ - uint16_t flags; /* Request flags. */ - uint16_t type; /* Request type. */ + uint16_t flags; /* Request flags: NBD_CMD_FLAG_*. */ + uint16_t type; /* Request type: NBD_CMD_*. */ uint64_t cookie; /* Opaque handle. */ uint64_t offset; /* Request offset. */ uint32_t count; /* Request length. */ } NBD_ATTRIBUTE_PACKED; +/* Extended request (client -> server). */ +struct nbd_request_ext { + uint32_t magic; /* NBD_EXTENDED_REQUEST_MAGIC. */ + uint16_t flags; /* Request flags: NBD_CMD_FLAG_*. */ + uint16_t type; /* Request type: NBD_CMD_*. */ + uint64_t cookie; /* Opaque handle. */ + uint64_t offset; /* Request offset. */ + uint64_t count; /* Request effect or payload length. */ +} NBD_ATTRIBUTE_PACKED; + /* Simple reply (server -> client). */ struct nbd_simple_reply { uint32_t magic; /* NBD_SIMPLE_REPLY_MAGIC. */ @@ -208,6 +220,16 @@ struct nbd_structured_reply { uint32_t length; /* Length of following nbd_chunk_* payload. */ } NBD_ATTRIBUTE_PACKED; +/* Extended reply (server -> client). */ +struct nbd_extended_reply { + uint32_t magic; /* NBD_EXTENDED_REPLY_MAGIC. */ + uint16_t flags; /* NBD_REPLY_FLAG_* */ + uint16_t type; /* NBD_REPLY_TYPE_* */ + uint64_t cookie; /* Opaque handle. */ + uint64_t offset; /* Client's offset. */ + uint64_t length; /* Length of following nbd_chunk_* payload. */ +} NBD_ATTRIBUTE_PACKED; + struct nbd_chunk_offset_data { uint64_t offset; /* offset */ /* Followed by data. */ @@ -228,6 +250,17 @@ struct nbd_block_descriptor_32 { uint32_t status_flags; /* block type (hole etc) */ } NBD_ATTRIBUTE_PACKED; +struct nbd_chunk_block_status_64 { + uint32_t context_id; /* metadata context ID */ + uint32_t count; /* non-zero descriptor count */ + /* followed by nbd_block_descriptor_64[count] extents */ +} NBD_ATTRIBUTE_PACKED; + +struct nbd_block_descriptor_64 { + uint64_t length; /* length of block */ + uint64_t status_flags; /* block type (hole etc) */ +} NBD_ATTRIBUTE_PACKED; + struct nbd_chunk_error { uint32_t error; /* NBD_E* error number */ uint16_t len; /* Length of human readable error. */ @@ -235,8 +268,10 @@ struct nbd_chunk_error { } NBD_ATTRIBUTE_PACKED; #define NBD_REQUEST_MAGIC 0x25609513 +#define NBD_EXTENDED_REQUEST_MAGIC 0x21e41c71 #define NBD_SIMPLE_REPLY_MAGIC 0x67446698 #define NBD_STRUCTURED_REPLY_MAGIC 0x668e33ef +#define NBD_EXTENDED_REPLY_MAGIC 0x6e8a278c /* Structured reply flags. */ #define NBD_REPLY_FLAG_DONE (1U << 0) @@ -245,12 +280,13 @@ struct nbd_chunk_error { #define NBD_REPLY_TYPE_IS_ERR(val) (!!((val) & (1U<<15))) /* Structured reply types. */ -#define NBD_REPLY_TYPE_NONE 0 -#define NBD_REPLY_TYPE_OFFSET_DATA 1 -#define NBD_REPLY_TYPE_OFFSET_HOLE 2 -#define NBD_REPLY_TYPE_BLOCK_STATUS 5 -#define NBD_REPLY_TYPE_ERROR NBD_REPLY_TYPE_ERR (1) -#define NBD_REPLY_TYPE_ERROR_OFFSET NBD_REPLY_TYPE_ERR (2) +#define NBD_REPLY_TYPE_NONE 0 +#define NBD_REPLY_TYPE_OFFSET_DATA 1 +#define NBD_REPLY_TYPE_OFFSET_HOLE 2 +#define NBD_REPLY_TYPE_BLOCK_STATUS 5 +#define NBD_REPLY_TYPE_BLOCK_STATUS_EXT 6 +#define NBD_REPLY_TYPE_ERROR NBD_REPLY_TYPE_ERR (1) +#define NBD_REPLY_TYPE_ERROR_OFFSET NBD_REPLY_TYPE_ERR (2) /* NBD commands. */ #define NBD_CMD_READ 0 @@ -267,6 +303,7 @@ struct nbd_chunk_error { #define NBD_CMD_FLAG_DF (1U << 2) #define NBD_CMD_FLAG_REQ_ONE (1U << 3) #define NBD_CMD_FLAG_FAST_ZERO (1U << 4) +#define NBD_CMD_FLAG_PAYLOAD_LEN (1U << 5) /* NBD error codes. */ #define NBD_SUCCESS 0 -- 2.41.0
Eric Blake
2023-Jul-21 16:08 UTC
[Libguestfs] [libnbd PATCH v4 4/6] states: Prepare to send 64-bit requests
Support sending 64-bit requests if extended headers were negotiated. This includes setting NBD_CMD_FLAG_PAYLOAD_LEN any time we send an extended NBD_CMD_WRITE; this is such a fundamental part of the protocol that for now it is easier to silently ignore whatever value the user passes in for that bit in the flags parameter of nbd_pwrite regardless of the current settings in set_strict_mode, rather than trying to force the user to pass in the correct value to match whether extended mode is negotiated. However, when we later add APIs to give the user more control for interoperability testing, it may be worth adding a new set_strict_mode control knob to explicitly enable the client to intentionally violate the protocol (the testsuite added in this patch would then be updated to match). At this point, h->extended_headers is permanently false (we can't enable it until all other aspects of the protocol have likewise been converted). Note that struct command.data_seen remains typed as uint32_t despite being an accumulator for count values that are now stored in 64 bits; this is okay, because we drop the connection if count exceeds MAX_REQUEST_SIZE on any one transaction, such that data_seen's use as a bounded accumulator can never exceed 2*MAX_REQUEST_SIZE. Support for using FLAG_PAYLOAD_LEN with NBD_CMD_BLOCK_STATUS is less fundamental, and deserves to be in its own patch. Signed-off-by: Eric Blake <eblake at redhat.com> Reviewed-by: Richard W.M. Jones <rjones at redhat.com> --- v4: fix Makefile indentation [Laszlo], prefer progname over argv[0] in test [Laszlo], enhance commit message about command.data_seen --- lib/internal.h | 10 ++- generator/API.ml | 20 +++--- generator/states-issue-command.c | 29 ++++++--- generator/states-reply-chunk.c | 2 +- lib/rw.c | 17 +++-- tests/Makefile.am | 4 ++ tests/pwrite-extended.c | 108 +++++++++++++++++++++++++++++++ .gitignore | 1 + 8 files changed, 165 insertions(+), 26 deletions(-) create mode 100644 tests/pwrite-extended.c diff --git a/lib/internal.h b/lib/internal.h index c7793f1f..2e4f987c 100644 --- a/lib/internal.h +++ b/lib/internal.h @@ -110,6 +110,9 @@ struct nbd_handle { char *tls_username; /* Username, NULL = use current username */ char *tls_psk_file; /* PSK filename, NULL = no PSK */ + /* Extended headers. */ + bool extended_headers; /* If we negotiated NBD_OPT_EXTENDED_HEADERS */ + /* Desired metadata contexts. */ bool request_sr; bool request_meta; @@ -273,7 +276,10 @@ struct nbd_handle { /* Issuing a command must use a buffer separate from sbuf, for the * case when we interrupt a request to service a reply. */ - struct nbd_request request; + union { + struct nbd_request compact; + struct nbd_request_ext extended; + } req; bool in_write_payload; bool in_write_shutdown; @@ -378,7 +384,7 @@ struct command { uint16_t type; uint64_t cookie; uint64_t offset; - uint32_t count; + uint64_t count; void *data; /* Buffer for read/write */ struct command_cb cb; bool initialized; /* For read, true if getting a hole may skip memset */ diff --git a/generator/API.ml b/generator/API.ml index 5fcb0e1f..02c1260d 100644 --- a/generator/API.ml +++ b/generator/API.ml @@ -198,11 +198,12 @@ let cmd_flags flag_prefix = "CMD_FLAG"; guard = Some "((h->strict & LIBNBD_STRICT_FLAGS) || flags > UINT16_MAX)"; flags = [ - "FUA", 1 lsl 0; - "NO_HOLE", 1 lsl 1; - "DF", 1 lsl 2; - "REQ_ONE", 1 lsl 3; - "FAST_ZERO", 1 lsl 4; + "FUA", 1 lsl 0; + "NO_HOLE", 1 lsl 1; + "DF", 1 lsl 2; + "REQ_ONE", 1 lsl 3; + "FAST_ZERO", 1 lsl 4; + "PAYLOAD_LEN", 1 lsl 5; ] } let handshake_flags = { @@ -2507,7 +2508,7 @@ "pread_structured", { "pwrite", { default_call with args = [ BytesIn ("buf", "count"); UInt64 "offset" ]; - optargs = [ OFlags ("flags", cmd_flags, Some ["FUA"]) ]; + optargs = [ OFlags ("flags", cmd_flags, Some ["FUA"; "PAYLOAD_LEN"]) ]; ret = RErr; permitted_states = [ Connected ]; shortdesc = "write to the NBD server"; @@ -2530,7 +2531,10 @@ "pwrite", { C<LIBNBD_CMD_FLAG_FUA> meaning that the server should not return until the data has been committed to permanent storage (if that is supported - some servers cannot do this, see -L<nbd_can_fua(3)>)." +L<nbd_can_fua(3)>). For convenience, libnbd ignores the presence +or absence of the flag C<LIBNBD_CMD_FLAG_PAYLOAD_LEN> in C<flags>, +while correctly using the flag over the wire according to whether +extended headers were negotiated." ^ strict_call_description; see_also = [Link "can_fua"; Link "is_read_only"; Link "aio_pwrite"; Link "get_block_size"; @@ -3220,7 +3224,7 @@ "aio_pwrite", { default_call with args = [ BytesPersistIn ("buf", "count"); UInt64 "offset" ]; optargs = [ OClosure completion_closure; - OFlags ("flags", cmd_flags, Some ["FUA"]) ]; + OFlags ("flags", cmd_flags, Some ["FUA"; "PAYLOAD_LEN"]) ]; ret = RCookie; permitted_states = [ Connected ]; shortdesc = "write to the NBD server"; diff --git a/generator/states-issue-command.c b/generator/states-issue-command.c index 30721946..59b05835 100644 --- a/generator/states-issue-command.c +++ b/generator/states-issue-command.c @@ -41,15 +41,24 @@ ISSUE_COMMAND.START: return 0; } - h->request.magic = htobe32 (NBD_REQUEST_MAGIC); - h->request.flags = htobe16 (cmd->flags); - h->request.type = htobe16 (cmd->type); - h->request.cookie = htobe64 (cmd->cookie); - h->request.offset = htobe64 (cmd->offset); - h->request.count = htobe32 (cmd->count); + /* These fields are coincident between req.compact and req.extended */ + h->req.compact.flags = htobe16 (cmd->flags); + h->req.compact.type = htobe16 (cmd->type); + h->req.compact.cookie = htobe64 (cmd->cookie); + h->req.compact.offset = htobe64 (cmd->offset); + if (h->extended_headers) { + h->req.extended.magic = htobe32 (NBD_EXTENDED_REQUEST_MAGIC); + h->req.extended.count = htobe64 (cmd->count); + h->wlen = sizeof (h->req.extended); + } + else { + assert (cmd->count <= UINT32_MAX); + h->req.compact.magic = htobe32 (NBD_REQUEST_MAGIC); + h->req.compact.count = htobe32 (cmd->count); + h->wlen = sizeof (h->req.compact); + } h->chunks_sent++; - h->wbuf = &h->request; - h->wlen = sizeof (h->request); + h->wbuf = &h->req; if (cmd->type == NBD_CMD_WRITE || cmd->next) h->wflags = MSG_MORE; SET_NEXT_STATE (%SEND_REQUEST); @@ -74,7 +83,7 @@ ISSUE_COMMAND.PREPARE_WRITE_PAYLOAD: assert (h->cmds_to_issue != NULL); cmd = h->cmds_to_issue; - assert (cmd->cookie == be64toh (h->request.cookie)); + assert (cmd->cookie == be64toh (h->req.compact.cookie)); if (cmd->type == NBD_CMD_WRITE) { h->wbuf = cmd->data; h->wlen = cmd->count; @@ -120,7 +129,7 @@ ISSUE_COMMAND.FINISH: assert (!h->wlen); assert (h->cmds_to_issue != NULL); cmd = h->cmds_to_issue; - assert (cmd->cookie == be64toh (h->request.cookie)); + assert (cmd->cookie == be64toh (h->req.compact.cookie)); h->cmds_to_issue = cmd->next; if (h->cmds_to_issue_tail == cmd) h->cmds_to_issue_tail = NULL; diff --git a/generator/states-reply-chunk.c b/generator/states-reply-chunk.c index 0d76c159..1758ac34 100644 --- a/generator/states-reply-chunk.c +++ b/generator/states-reply-chunk.c @@ -34,7 +34,7 @@ structured_reply_in_bounds (uint64_t offset, uint32_t length, offset + length > cmd->offset + cmd->count) { set_error (0, "range of structured reply is out of bounds, " "offset=%" PRIu64 ", cmd->offset=%" PRIu64 ", " - "length=%" PRIu32 ", cmd->count=%" PRIu32 ": " + "length=%" PRIu32 ", cmd->count=%" PRIu64 ": " "this is likely to be a bug in the NBD server", offset, cmd->offset, length, cmd->count); return false; diff --git a/lib/rw.c b/lib/rw.c index 3dc3499e..8b2bd4cc 100644 --- a/lib/rw.c +++ b/lib/rw.c @@ -223,13 +223,11 @@ nbd_internal_command_common (struct nbd_handle *h, } break; - /* Other commands are currently limited by the 32 bit field in the - * command structure on the wire, but in future we hope to support - * 64 bit values here with a change to the NBD protocol which is - * being discussed upstream. + /* Other commands are limited by the 32 bit field in the command + * structure on the wire, unless extended headers were negotiated. */ default: - if (count > UINT32_MAX) { + if (!h->extended_headers && count > UINT32_MAX) { set_error (ERANGE, "request too large: maximum request size is %" PRIu32, UINT32_MAX); goto err; @@ -358,6 +356,15 @@ nbd_unlocked_aio_pwrite (struct nbd_handle *h, const void *buf, return -1; } } + /* It is more convenient to manage PAYLOAD_LEN by what was negotiated + * than to require the user to have to set it correctly. + * TODO: Add new h->strict bit to allow intentional protocol violation + * for interoperability testing. + */ + if (h->extended_headers) + flags |= LIBNBD_CMD_FLAG_PAYLOAD_LEN; + else + flags &= ~LIBNBD_CMD_FLAG_PAYLOAD_LEN; SET_CALLBACK_TO_NULL (*completion); return nbd_internal_command_common (h, flags, NBD_CMD_WRITE, offset, count, diff --git a/tests/Makefile.am b/tests/Makefile.am index 6eddcb7a..52fadd9c 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -233,6 +233,7 @@ check_PROGRAMS += \ closure-lifetimes \ pread-initialize \ socket-activation-name \ + pwrite-extended \ $(NULL) TESTS += \ @@ -655,6 +656,9 @@ socket_activation_name_SOURCES = \ requires.h socket_activation_name_LDADD = $(top_builddir)/lib/libnbd.la +pwrite_extended_SOURCES = pwrite-extended.c +pwrite_extended_LDADD = $(top_builddir)/lib/libnbd.la + #---------------------------------------------------------------------- # Testing TLS support. diff --git a/tests/pwrite-extended.c b/tests/pwrite-extended.c new file mode 100644 index 00000000..c6f4dd29 --- /dev/null +++ b/tests/pwrite-extended.c @@ -0,0 +1,108 @@ +/* NBD client library in userspace + * Copyright Red Hat + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* Check behavior of pwrite with PAYLOAD_LEN flag for extended headers. */ + +#include <config.h> + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <sys/stat.h> + +#include <libnbd.h> + +static char *progname; +static char buf[512]; + +static void +check (int experr, const char *prefix) +{ + const char *msg = nbd_get_error (); + int errnum = nbd_get_errno (); + + fprintf (stderr, "error: \"%s\"\n", msg); + fprintf (stderr, "errno: %d (%s)\n", errnum, strerror (errnum)); + if (strncmp (msg, prefix, strlen (prefix)) != 0) { + fprintf (stderr, "%s: test failed: missing context prefix: %s\n", + progname, msg); + exit (EXIT_FAILURE); + } + if (errnum != experr) { + fprintf (stderr, "%s: test failed: " + "expected errno = %d (%s), but got %d\n", + progname, experr, strerror (experr), errnum); + exit (EXIT_FAILURE); + } +} + +int +main (int argc, char *argv[]) +{ + struct nbd_handle *nbd; + const char *cmd[] = { + "nbdkit", "-s", "-v", "--exit-with-parent", "memory", "1048576", NULL + }; + uint32_t strict; + + progname = argv[0]; + + nbd = nbd_create (); + if (nbd == NULL) { + fprintf (stderr, "%s: %s\n", progname, nbd_get_error ()); + exit (EXIT_FAILURE); + } + + /* Connect to the server. */ + if (nbd_connect_command (nbd, (char **)cmd) == -1) { + fprintf (stderr, "%s: %s\n", progname, nbd_get_error ()); + exit (EXIT_FAILURE); + } + + /* FIXME: future API addition to test if server negotiated extended mode. + * Until then, strict flags must ignore the PAYLOAD_LEN flag for pwrite, + * even though it is rejected for other commands. + */ + strict = nbd_get_strict_mode (nbd); + if (!(strict & LIBNBD_STRICT_FLAGS)) { + fprintf (stderr, "%s: test failed: " + "nbd_get_strict_mode did not have expected flag set\n", + progname); + exit (EXIT_FAILURE); + } + if (nbd_aio_pread (nbd, buf, 512, 0, NBD_NULL_COMPLETION, + LIBNBD_CMD_FLAG_PAYLOAD_LEN) != -1) { + fprintf (stderr, "%s: test failed: " + "nbd_aio_pread did not fail with unexpected flag\n", + progname); + exit (EXIT_FAILURE); + } + check (EINVAL, "nbd_aio_pread: "); + + if (nbd_aio_pwrite (nbd, buf, 512, 0, NBD_NULL_COMPLETION, + LIBNBD_CMD_FLAG_PAYLOAD_LEN) == -1 || + nbd_aio_pwrite (nbd, buf, 512, 0, NBD_NULL_COMPLETION, 0) == -1) { + fprintf (stderr, "%s: %s\n", progname, nbd_get_error ()); + exit (EXIT_FAILURE); + } + + nbd_close (nbd); + exit (EXIT_SUCCESS); +} diff --git a/.gitignore b/.gitignore index b43e83c0..e5304f16 100644 --- a/.gitignore +++ b/.gitignore @@ -252,6 +252,7 @@ Makefile.in /tests/pki/ /tests/pread-initialize /tests/private-data +/tests/pwrite-extended /tests/read-only-flag /tests/read-write-flag /tests/server-death -- 2.41.0
Eric Blake
2023-Jul-21 16:08 UTC
[Libguestfs] [libnbd PATCH v4 5/6] states: Prepare to receive 64-bit replies
Support receiving headers for 64-bit replies if extended headers were negotiated. We already insist that the server not send us too much payload in one reply, so we can exploit that and continue to use h->payload_left as a 32-bit length for the rest of the payload parsing. The NBD protocol specifically documents that extended mode takes precedence over structured replies, and that there are no simple replies in extended mode. We can also take advantage that the cookie field is in the same offset in all the various reply types. Checking that the server sent back the correct offset adds a bit more verbosity for the sake of nicer debug messages. Note that if we negotiate extended headers, but a non-compliant server replies with a non-extended header, this patch will stall waiting for the server to send more bytes rather than noticing that the magic number is wrong (for aio operations, you'll get a magic number mismatch once you send a second command that elicits a reply; but for blocking operations, we basically deadlock). The easy alternative would be to read just the first 4 bytes of magic, then determine how many more bytes to expect; but that would require more states and syscalls, and not worth it since the typical server will be compliant. The other alternative is what the next patch implements: teaching REPLY.RECV_REPLY to handle short reads that were at least long enough to transmit magic to specifically look for magic number mismatch. At this point, h->extended_headers is permanently false (we can't enable it until all other aspects of the protocol have likewise been converted). Signed-off-by: Eric Blake <eblake at redhat.com> --- v4: don't normalize length in-place [Laszlo], rebase on earlier changes, better debug reporting if offsets mismatch --- lib/internal.h | 1 + generator/states-reply.c | 89 ++++++++++++++++++++++++---------- generator/states-reply-chunk.c | 40 +++++++++++---- 3 files changed, 95 insertions(+), 35 deletions(-) diff --git a/lib/internal.h b/lib/internal.h index 2e4f987c..9df6a685 100644 --- a/lib/internal.h +++ b/lib/internal.h @@ -244,6 +244,7 @@ struct nbd_handle { union reply_header { struct nbd_simple_reply simple; struct nbd_structured_reply structured; + struct nbd_extended_reply extended; /* The wire formats share magic and cookie at the same offsets; * provide aliases for one less level of typing. */ diff --git a/generator/states-reply.c b/generator/states-reply.c index 0d200513..05301ae8 100644 --- a/generator/states-reply.c +++ b/generator/states-reply.c @@ -68,22 +68,30 @@ REPLY.START: */ ssize_t r; - /* We read all replies initially as if they are simple replies, but - * check the magic in CHECK_REPLY_MAGIC below. This works because - * the structured_reply header is larger, and because the last - * member of a simple reply, cookie, is coincident between the two - * structs (an intentional design decision in the NBD spec when - * structured replies were added). + /* With extended headers, there is only one size to read, so we can + * do it all in one syscall. But for older structured replies, we + * don't know if we have a simple or structured reply until we read + * the magic number, requiring a two-part read with + * CHECK_REPLY_MAGIC below. This works because the structured_reply + * header is larger, and because the last member of a simple reply, + * cookie, is coincident between all three structs (intentional + * design decisions in the NBD spec when structured and extended + * replies were added). */ ASSERT_MEMBER_ALIAS (union reply_header, simple.magic, magic); ASSERT_MEMBER_ALIAS (union reply_header, simple.cookie, cookie); ASSERT_MEMBER_ALIAS (union reply_header, structured.magic, magic); ASSERT_MEMBER_ALIAS (union reply_header, structured.cookie, cookie); + ASSERT_MEMBER_ALIAS (union reply_header, extended.magic, magic); + ASSERT_MEMBER_ALIAS (union reply_header, extended.cookie, cookie); assert (h->reply_cmd == NULL); assert (h->rlen == 0); h->rbuf = &h->sbuf.reply.hdr; - h->rlen = sizeof h->sbuf.reply.hdr.simple; + if (h->extended_headers) + h->rlen = sizeof h->sbuf.reply.hdr.extended; + else + h->rlen = sizeof h->sbuf.reply.hdr.simple; r = h->sock->ops->recv (h, h->sock, h->rbuf, h->rlen); if (r == -1) { @@ -129,10 +137,25 @@ REPLY.CHECK_REPLY_MAGIC: uint64_t cookie; magic = be32toh (h->sbuf.reply.hdr.magic); - if (magic == NBD_SIMPLE_REPLY_MAGIC) { + switch (magic) { + case NBD_SIMPLE_REPLY_MAGIC: + if (h->extended_headers) + /* Server is non-compliant, and we've already read more bytes + * than a simple header contains; no recovery possible + */ + goto invalid; + + /* All other payload checks handled in the simple payload engine */ SET_NEXT_STATE (%SIMPLE_REPLY.START); - } - else if (magic == NBD_STRUCTURED_REPLY_MAGIC) { + break; + + case NBD_STRUCTURED_REPLY_MAGIC: + if (h->extended_headers) + /* Server is non-compliant, and we've already read more bytes + * than a structured header contains; no recovery possible + */ + goto invalid; + /* We've only read the bytes that fill hdr.simple. But * hdr.structured is longer, so prepare to read the remaining * bytes. We depend on the memory aliasing in union sbuf to @@ -145,23 +168,29 @@ REPLY.CHECK_REPLY_MAGIC: h->rlen = sizeof h->sbuf.reply.hdr.structured; h->rlen -= sizeof h->sbuf.reply.hdr.simple; SET_NEXT_STATE (%RECV_STRUCTURED_REMAINING); - } - else { - /* We've probably lost synchronization. */ - SET_NEXT_STATE (%.DEAD); - set_error (0, "invalid reply magic 0x%" PRIx32, magic); -#if 0 /* uncomment to see desynchronized data */ - nbd_internal_hexdump (&h->sbuf.reply.hdr.simple, - sizeof (h->sbuf.reply.hdr.simple), - stderr); -#endif - return 0; + break; + + case NBD_EXTENDED_REPLY_MAGIC: + if (!h->extended_headers) + /* Server is non-compliant. We could continue reading bytes up + * to the length of an extended reply to regain sync, but old + * servers are unlikely to send this magic, so it's just as easy + * to punt. + */ + goto invalid; + + /* All other payload checks handled in the chunk payload engine */ + SET_NEXT_STATE (%CHUNK_REPLY.START); + break; + + default: + goto invalid; } - /* NB: This works for both simple and structured replies, even - * though we haven't finished reading the structured header yet, - * because the cookie is stored at the same offset. See the - * STATIC_ASSERT above in state REPLY.START that confirmed this. + /* NB: This works for all three reply types, even though we haven't + * finished reading a structured header yet, because the cookie is + * stored at the same offset. See the ASSERT_MEMBER_ALIAS above in + * state REPLY.START that confirmed this. */ h->chunks_received++; cookie = be64toh (h->sbuf.reply.hdr.cookie); @@ -175,6 +204,16 @@ REPLY.CHECK_REPLY_MAGIC: h->reply_cmd = cmd; return 0; + invalid: + SET_NEXT_STATE (%.DEAD); /* We've probably lost synchronization. */ + set_error (0, "invalid or unexpected reply magic 0x%" PRIx32, magic); +#if 0 /* uncomment to see desynchronized data */ + nbd_internal_hexdump (&h->sbuf.reply.hdr.simple, + sizeof (h->sbuf.reply.hdr.simple), + stderr); +#endif + return 0; + REPLY.RECV_STRUCTURED_REMAINING: switch (recv_into_rbuf (h)) { case -1: SET_NEXT_STATE (%.DEAD); return 0; diff --git a/generator/states-reply-chunk.c b/generator/states-reply-chunk.c index 1758ac34..17bb5149 100644 --- a/generator/states-reply-chunk.c +++ b/generator/states-reply-chunk.c @@ -69,11 +69,17 @@ STATE_MACHINE { REPLY.CHUNK_REPLY.START: struct command *cmd = h->reply_cmd; uint16_t flags, type; - uint32_t length; + uint64_t length; + uint64_t offset = -1; flags = be16toh (h->sbuf.reply.hdr.structured.flags); type = be16toh (h->sbuf.reply.hdr.structured.type); - length = be32toh (h->sbuf.reply.hdr.structured.length); + if (h->extended_headers) { + length = be64toh (h->sbuf.reply.hdr.extended.length); + offset = be64toh (h->sbuf.reply.hdr.extended.offset); + } + else + length = be32toh (h->sbuf.reply.hdr.structured.length); /* Reject a server that replies with too much information, but don't * reject a single structured reply to NBD_CMD_READ on the largest @@ -83,13 +89,14 @@ REPLY.CHUNK_REPLY.START: * not worth keeping the connection alive. */ if (length > MAX_REQUEST_SIZE + sizeof h->sbuf.reply.payload.offset_data) { - set_error (0, "invalid server reply length %" PRIu32, length); + set_error (0, "invalid server reply length %" PRIu64, length); SET_NEXT_STATE (%.DEAD); return 0; } /* Skip an unexpected structured reply, including to an unknown cookie. */ - if (cmd == NULL || !h->structured_replies) + if (cmd == NULL || !h->structured_replies || + (h->extended_headers && offset != cmd->offset)) goto resync; h->payload_left = length; @@ -504,7 +511,8 @@ REPLY.CHUNK_REPLY.RECV_BS_ENTRIES: REPLY.CHUNK_REPLY.RESYNC: struct command *cmd = h->reply_cmd; uint16_t type; - uint32_t length; + uint64_t length; + uint64_t offset = -1; assert (h->rbuf == NULL); switch (recv_into_rbuf (h)) { @@ -524,11 +532,23 @@ REPLY.CHUNK_REPLY.RESYNC: return 0; } type = be16toh (h->sbuf.reply.hdr.structured.type); - length = be32toh (h->sbuf.reply.hdr.structured.length); - debug (h, "unexpected reply type %u or payload length %" PRIu32 - " for cookie %" PRIu64 " and command %" PRIu32 - ", this is probably a server bug", - type, length, cmd->cookie, cmd->type); + if (h->extended_headers) { + length = be64toh (h->sbuf.reply.hdr.extended.length); + offset = be64toh (h->sbuf.reply.hdr.extended.offset); + if (offset != cmd->offset) + debug (h, "unexpected reply offset %" PRIu64 " for cookie %" PRIu64 + " and command %" PRIu32 ", this is probably a server bug", + length, cmd->cookie, cmd->type); + else + offset = -1; + } + else + length = be32toh (h->sbuf.reply.hdr.structured.length); + if (offset == -1) + debug (h, "unexpected reply type %u or payload length %" PRIu64 + " for cookie %" PRIu64 " and command %" PRIu32 + ", this is probably a server bug", + type, length, cmd->cookie, cmd->type); if (cmd->error == 0) cmd->error = EPROTO; SET_NEXT_STATE (%FINISH); -- 2.41.0
Eric Blake
2023-Jul-21 16:08 UTC
[Libguestfs] [libnbd PATCH v4 6/6] states: Break deadlock if server goofs on extended replies
One of the benefits of extended replies is that we can do a fixed-length read for the entire header of every server reply, which is fewer syscalls than the split-read approach required by structured replies. But one of the drawbacks of doing a large read is that if the server is non-compliant (not a problem for normal servers, but something I hit rather more than I'd like to admit while developing extended header support in servers), nbd_pwrite() and friends will deadlock if the server replies with the wrong header. Add in some code to catch that failure mode and move the state machine to DEAD sooner, to make it easier to diagnose the fault in the server. Unlike in the case of an unexpected simple reply from a structured server (where, since b31e7bac, we can merely fail the command with EPROTO and successfully move on to the next server reply, because we didn't read too many bytes yet), in this case we really do have to move to DEAD: we cannot assume our short read was just the 16 (simple) or 20 (structured) bytes that the server sent for this command, because it is also possible that we can see a short read even while reading two back-to-back replies; if we have already read the initial bytes of the server's next reply, we have no way to push those extra bytes back onto our read stream for parsing on our next pass through the state machine. Signed-off-by: Eric Blake <eblake at redhat.com> --- v4: rebase to master, improve comment accuracy [Laszlo], drop bogus #if 0 --- generator/states-reply.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/generator/states-reply.c b/generator/states-reply.c index 05301ae8..9f0918e2 100644 --- a/generator/states-reply.c +++ b/generator/states-reply.c @@ -126,7 +126,25 @@ REPLY.START: REPLY.RECV_REPLY: switch (recv_into_rbuf (h)) { case -1: SET_NEXT_STATE (%.DEAD); return 0; - case 1: SET_NEXT_STATE (%.READY); return 0; + case 1: SET_NEXT_STATE (%.READY); + /* Special case: if we have a short read, but got at least far + * enough to decode the magic number, we can check if the server + * is matching our expectations. This lets us avoid deadlocking if + * we are blocked waiting for a 32-byte extended reply, while a + * buggy server only sent a shorter simple or structured reply. + * Magic number checks here must be repeated in CHECK_REPLY_MAGIC, + * since we do not always encounter a short read. + */ + if (h->extended_headers && + (char *)h->rbuf >+ (char *)&h->sbuf.reply.hdr + sizeof h->sbuf.reply.hdr.magic) { + uint32_t magic = be32toh (h->sbuf.reply.hdr.magic); + if (magic != NBD_EXTENDED_REPLY_MAGIC) { + SET_NEXT_STATE (%.DEAD); /* We've probably lost synchronization. */ + set_error (0, "invalid or unexpected reply magic 0x%" PRIx32, magic); + } + } + return 0; case 0: SET_NEXT_STATE (%CHECK_REPLY_MAGIC); } return 0; -- 2.41.0