Eric Blake
2019-May-22 21:29 UTC
[Libguestfs] [libnbd PATCH v3 0/7] Avoid deadlock with in-flight commands
Since v2: - rebase to Rich's new API calls - more refactoring in patch 1 (retitled) - new patches 3 and 4 - fix data corruption in patch 6 (was 4) - more tweaks to the reproducer example (including using new API from 3) Eric Blake (7): lib: Refactor command_common() to do more common work commands: Allow for a command queue commands: Expose FIFO ordering of server completions disconnect: Allow shutdown during processing states: Split ISSUE_COMMAND.SEND_REQUEST states: Allow in-flight read while writing next command examples: Add example to demonstrate just-fixed deadlock scenario .gitignore | 1 + examples/Makefile.am | 10 ++ examples/batched-read-write.c | 214 +++++++++++++++++++++++++++++++ generator/generator | 52 ++++++-- generator/states-issue-command.c | 85 ++++++++---- generator/states-reply.c | 13 +- generator/states.c | 5 + lib/aio.c | 13 ++ lib/disconnect.c | 23 +--- lib/internal.h | 14 +- lib/rw.c | 131 ++++++++----------- 11 files changed, 429 insertions(+), 132 deletions(-) create mode 100644 examples/batched-read-write.c -- 2.20.1
Eric Blake
2019-May-22 21:29 UTC
[Libguestfs] [libnbd PATCH v3 1/7] lib: Refactor command_common() to do more common work
Our construction of struct command_in_flight was ad hoc; most parameters were set in command_common(), but extent callbacks were done after the fact, and NBD_CMD_DISC was open-coding things. Furthermore, every caller was triggering nbd_internal_run() for the cmd_issue event; doing that in a central place makes it easier for the next patch to improve that logic without duplicating the fix at each caller. Fix these things by renaming the function to nbd_internal_command_common, which is necessary for exporting it, and by adding parameters and an updated return type. --- lib/disconnect.c | 20 +++------ lib/internal.h | 7 ++++ lib/rw.c | 103 +++++++++++++---------------------------------- 3 files changed, 39 insertions(+), 91 deletions(-) diff --git a/lib/disconnect.c b/lib/disconnect.c index bc43b4c..9706835 100644 --- a/lib/disconnect.c +++ b/lib/disconnect.c @@ -60,27 +60,17 @@ nbd_unlocked_shutdown (struct nbd_handle *h) int nbd_unlocked_aio_disconnect (struct nbd_connection *conn) { - struct command_in_flight *cmd; + int64_t id; - cmd = malloc (sizeof *cmd); - if (cmd == NULL) { - set_error (errno, "malloc"); + id = nbd_internal_command_common (conn, 0, NBD_CMD_DISC, 0, 0, NULL, + 0, NULL); + if (id == -1) return -1; - } - cmd->flags = 0; - cmd->type = NBD_CMD_DISC; - cmd->handle = conn->h->unique++; - cmd->offset = 0; - cmd->count = 0; - cmd->data = NULL; - - cmd->next = conn->cmds_to_issue; - conn->cmds_to_issue = cmd; /* This will leave the command on the in-flight list. Is this a * problem? Probably it isn't. If it is, we could add a flag to * the command struct to tell SEND_REQUEST not to add it to the * in-flight list. */ - return nbd_internal_run (conn->h, conn, cmd_issue); + return 0; } diff --git a/lib/internal.h b/lib/internal.h index 1f742da..de9b8bc 100644 --- a/lib/internal.h +++ b/lib/internal.h @@ -265,6 +265,13 @@ extern void nbd_internal_set_last_error (int errnum, char *error); extern int nbd_internal_errno_of_nbd_error (uint32_t error); extern const char *nbd_internal_name_of_nbd_cmd (uint16_t type); +/* rw.c */ +extern int64_t nbd_internal_command_common (struct nbd_connection *conn, + uint16_t flags, uint16_t type, + uint64_t offset, uint64_t count, + void *data, int64_t id, + extent_fn extent); + /* socket.c */ struct socket *nbd_internal_socket_create (int fd); diff --git a/lib/rw.c b/lib/rw.c index 861ab67..8f6227d 100644 --- a/lib/rw.c +++ b/lib/rw.c @@ -241,10 +241,11 @@ nbd_unlocked_block_status (struct nbd_handle *h, return r == -1 ? -1 : 0; } -static struct command_in_flight * -command_common (struct nbd_connection *conn, - uint16_t flags, uint16_t type, - uint64_t offset, uint64_t count, void *data) +int64_t +nbd_internal_command_common (struct nbd_connection *conn, + uint16_t flags, uint16_t type, + uint64_t offset, uint64_t count, void *data, + int64_t id, extent_fn extent) { struct command_in_flight *cmd; @@ -255,7 +256,7 @@ command_common (struct nbd_connection *conn, if (count > MAX_REQUEST_SIZE) { set_error (ERANGE, "request too large: maximum request size is %d", MAX_REQUEST_SIZE); - return NULL; + return -1; } break; @@ -268,7 +269,7 @@ command_common (struct nbd_connection *conn, if (count > UINT32_MAX) { set_error (ERANGE, "request too large: maximum request size is %" PRIu32, UINT32_MAX); - return NULL; + return -1; } break; } @@ -276,7 +277,7 @@ command_common (struct nbd_connection *conn, cmd = calloc (1, sizeof *cmd); if (cmd == NULL) { set_error (errno, "calloc"); - return NULL; + return -1; } cmd->flags = flags; cmd->type = type; @@ -284,6 +285,8 @@ command_common (struct nbd_connection *conn, cmd->offset = offset; cmd->count = count; cmd->data = data; + cmd->extent_id = id; + cmd->extent_fn = extent; /* If structured replies were negotiated then we trust the server to * send back sufficient data to cover the whole buffer. It's tricky @@ -298,23 +301,18 @@ command_common (struct nbd_connection *conn, cmd->next = conn->cmds_to_issue; conn->cmds_to_issue = cmd; + if (nbd_internal_run (conn->h, conn, cmd_issue) == -1) + return -1; - return cmd; + return cmd->handle; } int64_t nbd_unlocked_aio_pread (struct nbd_connection *conn, void *buf, size_t count, uint64_t offset) { - struct command_in_flight *cmd; - - cmd = command_common (conn, 0, NBD_CMD_READ, offset, count, buf); - if (!cmd) - return -1; - if (nbd_internal_run (conn->h, conn, cmd_issue) == -1) - return -1; - - return cmd->handle; + return nbd_internal_command_common (conn, 0, NBD_CMD_READ, offset, count, + buf, 0, NULL); } int64_t @@ -322,8 +320,6 @@ nbd_unlocked_aio_pwrite (struct nbd_connection *conn, const void *buf, size_t count, uint64_t offset, uint32_t flags) { - struct command_in_flight *cmd; - if (nbd_unlocked_read_only (conn->h) == 1) { set_error (EINVAL, "server does not support write operations"); return -1; @@ -340,33 +336,20 @@ nbd_unlocked_aio_pwrite (struct nbd_connection *conn, const void *buf, return -1; } - cmd = command_common (conn, flags, NBD_CMD_WRITE, offset, count, - (void *) buf); - if (!cmd) - return -1; - if (nbd_internal_run (conn->h, conn, cmd_issue) == -1) - return -1; - - return cmd->handle; + return nbd_internal_command_common (conn, flags, NBD_CMD_WRITE, offset, count, + (void *) buf, 0, NULL); } int64_t nbd_unlocked_aio_flush (struct nbd_connection *conn) { - struct command_in_flight *cmd; - if (nbd_unlocked_can_flush (conn->h) != 1) { set_error (EINVAL, "server does not support flush operations"); return -1; } - cmd = command_common (conn, 0, NBD_CMD_FLUSH, 0, 0, NULL); - if (!cmd) - return -1; - if (nbd_internal_run (conn->h, conn, cmd_issue) == -1) - return -1; - - return cmd->handle; + return nbd_internal_command_common (conn, 0, NBD_CMD_FLUSH, 0, 0, + NULL, 0, NULL); } int64_t @@ -374,8 +357,6 @@ nbd_unlocked_aio_trim (struct nbd_connection *conn, uint64_t count, uint64_t offset, uint32_t flags) { - struct command_in_flight *cmd; - if (nbd_unlocked_read_only (conn->h) == 1) { set_error (EINVAL, "server does not support write operations"); return -1; @@ -397,21 +378,14 @@ nbd_unlocked_aio_trim (struct nbd_connection *conn, return -1; } - cmd = command_common (conn, flags, NBD_CMD_TRIM, offset, count, NULL); - if (!cmd) - return -1; - if (nbd_internal_run (conn->h, conn, cmd_issue) == -1) - return -1; - - return cmd->handle; + return nbd_internal_command_common (conn, flags, NBD_CMD_TRIM, offset, count, + NULL, 0, NULL); } int64_t nbd_unlocked_aio_cache (struct nbd_connection *conn, uint64_t count, uint64_t offset) { - struct command_in_flight *cmd; - /* Actually according to the NBD protocol document, servers do exist * that support NBD_CMD_CACHE but don't advertize the * NBD_FLAG_SEND_CACHE bit, but we ignore those. @@ -421,13 +395,8 @@ nbd_unlocked_aio_cache (struct nbd_connection *conn, return -1; } - cmd = command_common (conn, 0, NBD_CMD_CACHE, offset, count, NULL); - if (!cmd) - return -1; - if (nbd_internal_run (conn->h, conn, cmd_issue) == -1) - return -1; - - return cmd->handle; + return nbd_internal_command_common (conn, 0, NBD_CMD_CACHE, offset, count, + NULL, 0, NULL); } int64_t @@ -435,8 +404,6 @@ nbd_unlocked_aio_zero (struct nbd_connection *conn, uint64_t count, uint64_t offset, uint32_t flags) { - struct command_in_flight *cmd; - if (nbd_unlocked_read_only (conn->h) == 1) { set_error (EINVAL, "server does not support write operations"); return -1; @@ -458,13 +425,8 @@ nbd_unlocked_aio_zero (struct nbd_connection *conn, return -1; } - cmd = command_common (conn, flags, NBD_CMD_WRITE_ZEROES, offset, count, NULL); - if (!cmd) - return -1; - if (nbd_internal_run (conn->h, conn, cmd_issue) == -1) - return -1; - - return cmd->handle; + return nbd_internal_command_common (conn, flags, NBD_CMD_WRITE_ZEROES, offset, + count, NULL, 0, NULL); } int64_t @@ -474,8 +436,6 @@ nbd_unlocked_aio_block_status (struct nbd_connection *conn, int64_t id, extent_fn extent) { - struct command_in_flight *cmd; - if (!conn->structured_replies) { set_error (ENOTSUP, "server does not support structured replies"); return -1; @@ -498,15 +458,6 @@ nbd_unlocked_aio_block_status (struct nbd_connection *conn, return -1; } - cmd = command_common (conn, flags, NBD_CMD_BLOCK_STATUS, offset, count, NULL); - if (!cmd) - return -1; - - cmd->extent_fn = extent; - cmd->extent_id = id; - - if (nbd_internal_run (conn->h, conn, cmd_issue) == -1) - return -1; - - return cmd->handle; + return nbd_internal_command_common (conn, flags, NBD_CMD_BLOCK_STATUS, offset, + count, NULL, id, extent); } -- 2.20.1
Eric Blake
2019-May-22 21:29 UTC
[Libguestfs] [libnbd PATCH v3 2/7] commands: Allow for a command queue
Previously, our 'cmds_to_issue' list was at most 1 element long, because we reject all commands except from state READY, but don't get back to state READY until the issue-commands sequence has completed. However, this is not as friendly on the client - once we are in transmission phase, a client may want to queue up another command whether or not the state machine is still tied up in processing a previous command. We still want to reject commands sent before the first time we reach READY, as well as keep the cmd_issue event for kicking the state machine into action when there is no previous command being worked on, but otherwise, the state machine itself can recognize when the command queue needs draining. --- generator/states.c | 5 +++++ lib/rw.c | 32 +++++++++++++++++++++++++++----- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/generator/states.c b/generator/states.c index 202c305..22ce853 100644 --- a/generator/states.c +++ b/generator/states.c @@ -110,6 +110,11 @@ send_from_wbuf (struct nbd_connection *conn) /*----- End of prologue. -----*/ /* STATE MACHINE */ { + READY: + if (conn->cmds_to_issue) + SET_NEXT_STATE (%ISSUE_COMMAND.START); + return 0; + DEAD: if (conn->sock) { conn->sock->ops->close (conn->sock); diff --git a/lib/rw.c b/lib/rw.c index 8f6227d..594593a 100644 --- a/lib/rw.c +++ b/lib/rw.c @@ -24,6 +24,7 @@ #include <stdint.h> #include <inttypes.h> #include <errno.h> +#include <assert.h> #include "internal.h" @@ -247,7 +248,15 @@ nbd_internal_command_common (struct nbd_connection *conn, uint64_t offset, uint64_t count, void *data, int64_t id, extent_fn extent) { - struct command_in_flight *cmd; + struct command_in_flight *cmd, *prev_cmd; + + if (!nbd_unlocked_aio_is_ready (conn) && + !nbd_unlocked_aio_is_processing (conn)) { + set_error (0, "command request %s is invalid in state %s", + nbd_internal_name_of_nbd_cmd (type), + nbd_internal_state_short_string (conn->state)); + return -1; + } switch (type) { /* Commands which send or receive data are limited to MAX_REQUEST_SIZE. */ @@ -299,10 +308,23 @@ nbd_internal_command_common (struct nbd_connection *conn, if (conn->structured_replies && cmd->data && type == NBD_CMD_READ) memset (cmd->data, 0, cmd->count); - cmd->next = conn->cmds_to_issue; - conn->cmds_to_issue = cmd; - if (nbd_internal_run (conn->h, conn, cmd_issue) == -1) - return -1; + /* Add the command to the end of the queue. Kick the state machine + * if there is no other command being processed, otherwise, it will + * be handled automatically on a future cycle around to READY. + */ + if (conn->cmds_to_issue != NULL) { + assert (nbd_unlocked_aio_is_processing (conn)); + prev_cmd = conn->cmds_to_issue; + while (prev_cmd->next) + prev_cmd = prev_cmd->next; + prev_cmd->next = cmd; + } + else { + conn->cmds_to_issue = cmd; + if (nbd_unlocked_aio_is_ready (conn) && + nbd_internal_run (conn->h, conn, cmd_issue) == -1) + return -1; + } return cmd->handle; } -- 2.20.1
Eric Blake
2019-May-22 21:29 UTC
[Libguestfs] [libnbd PATCH v3 3/7] commands: Expose FIFO ordering of server completions
A generic client exploiting multiple in-flight commands should be prepared for out-of-order responses (and should probably ensure that there are no offset/count overlaps between parallel in-flight commands to avoid unspecified disk contents if the server acts on commands in an arbitrary order or even exposing non-atomic splicing effects). But a specific client aware of a specific server's behavior may have reason to depend on fully serialized answers. For example, knowing whether a write response arrived prior to a flush response can be used to learn whether the flush covered the write, or whether another flush may be needed. We need both a way to let the client query which command completed first (a new nbd_aio_peek_command_completed) and to treat the completed list as a queue rather than a stack to preserve FIFO order. --- generator/generator | 12 ++++++++++++ generator/states-reply.c | 13 ++++++++++--- lib/aio.c | 13 +++++++++++++ 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/generator/generator b/generator/generator index 60dfb72..a1523e1 100755 --- a/generator/generator +++ b/generator/generator @@ -1628,6 +1628,18 @@ The C<handle> parameter is the unique 64 bit handle for the command, as returned by a call such as C<nbd_aio_pread>."; }; + "aio_peek_command_completed", { + default_call with + args = []; ret = RInt64; + shortdesc = "check if any command has completed"; + longdesc = "\ +Return the unique 64 bit handle of the first non-retired but completed +command, or -1 if no command is awaiting retirement. Any handle +returned by this function must still be passed to +C<aio_command_completed> to actually retire the command and learn +whether the command was successful."; + }; + "connection_state", { default_call with args = []; ret = RConstString; diff --git a/generator/states-reply.c b/generator/states-reply.c index 93f6cda..45362d4 100644 --- a/generator/states-reply.c +++ b/generator/states-reply.c @@ -103,13 +103,20 @@ } assert (cmd != NULL); - /* Move it to the cmds_done list. */ + /* Move it to the end of the cmds_done list. */ if (prev_cmd != NULL) prev_cmd->next = cmd->next; else conn->cmds_in_flight = cmd->next; - cmd->next = conn->cmds_done; - conn->cmds_done = cmd; + cmd->next = NULL; + if (conn->cmds_done) { + prev_cmd = conn->cmds_done; + while (prev_cmd->next) + prev_cmd = prev_cmd->next; + prev_cmd->next = cmd; + } + else + conn->cmds_done = cmd; SET_NEXT_STATE (%.READY); return 0; diff --git a/lib/aio.c b/lib/aio.c index c7764f8..105651f 100644 --- a/lib/aio.c +++ b/lib/aio.c @@ -158,3 +158,16 @@ nbd_unlocked_aio_command_completed (struct nbd_connection *conn, nbd_internal_name_of_nbd_cmd (type), strerror (error)); return -1; } + +int64_t +nbd_unlocked_aio_peek_command_completed (struct nbd_connection *conn) +{ + if (conn->cmds_done != NULL) + return conn->cmds_done->handle; + + if (conn->cmds_in_flight != NULL || conn->cmds_to_issue != NULL) + set_error (0, "no in-flight command has completed yet"); + else + set_error (0, "no commands are in flight"); + return -1; +} -- 2.20.1
Eric Blake
2019-May-22 21:29 UTC
[Libguestfs] [libnbd PATCH v3 4/7] disconnect: Allow shutdown during processing
Although most of our synchronous commands are fine doing a round-robin for the first ready connection, and will leave the connection back in the ready state on conclusion, shutdown is a different beast. Once handshake has finished, we want a shutdown request to reach all connections, even if it has to be queued behind other commands already being processed on that connection. In the future, we may want to consider a flags argument on whether to wait for the connection to have no in-flight commands, and/or to force a disconnect request to the front of the queue of commands to issue, depending on whether the client wants a fast shutdown vs. completion of outstanding requests to avoid data corruption. The NBD spec recommends that NBD_CMD_DISC not be sent while any other commands are in flight, but similarly that a server should not shut down due to NBD_CMD_DISC until all other responses have been flushed. We may also want to update the state machine to track that once NBD_CMD_DISC has been sent, it is not valid to queue or send any further commands on that connection, even if it is still up to receive replies to previous in-flight commands. --- lib/disconnect.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/disconnect.c b/lib/disconnect.c index 9706835..0b8fea0 100644 --- a/lib/disconnect.c +++ b/lib/disconnect.c @@ -31,7 +31,8 @@ nbd_unlocked_shutdown (struct nbd_handle *h) size_t i; for (i = 0; i < h->multi_conn; ++i) { - if (nbd_unlocked_aio_is_ready (h->conns[i])) { + if (nbd_unlocked_aio_is_ready (h->conns[i]) || + nbd_unlocked_aio_is_processing (h->conns[i])) { if (nbd_unlocked_aio_disconnect (h->conns[i]) == -1) return -1; } -- 2.20.1
Eric Blake
2019-May-22 21:29 UTC
[Libguestfs] [libnbd PATCH v3 5/7] states: Split ISSUE_COMMAND.SEND_REQUEST
In order to handle reading an in-flight response while in the middle of sending a second command, we'll need a way to jump back into the middle of a command being sent. This is easier if the state that sets wbuf/wlen is distinct from the state that reads into wbuf and alters wlen, and also easier if we don't move the command to the in-flight queue until after the writes finish (since the previous patch was careful to queue new pending commands to the end of the list, the head of the list is always the current command needing to be sent). --- generator/generator | 14 ++++++++++ generator/states-issue-command.c | 45 ++++++++++++++++++++------------ 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/generator/generator b/generator/generator index a1523e1..f8d59a0 100755 --- a/generator/generator +++ b/generator/generator @@ -638,12 +638,26 @@ and issue_command_state_machine = [ external_events = [ NotifyWrite, "" ]; }; + State { + default_state with + name = "PREPARE_WRITE_PAYLOAD"; + comment = "Prepare the write payload to send to the remote server"; + external_events = []; + }; + State { default_state with name = "SEND_WRITE_PAYLOAD"; comment = "Sending the write payload to the remote server"; external_events = [ NotifyWrite, "" ]; }; + +State { + default_state with + name = "FINISH"; + comment = "Finish issuing a command"; + external_events = []; + }; ] (* Receiving a reply from the server. *) diff --git a/generator/states-issue-command.c b/generator/states-issue-command.c index a57f40f..e24ea34 100644 --- a/generator/states-issue-command.c +++ b/generator/states-issue-command.c @@ -24,9 +24,6 @@ assert (conn->cmds_to_issue != NULL); cmd = conn->cmds_to_issue; - conn->cmds_to_issue = cmd->next; - cmd->next = conn->cmds_in_flight; - conn->cmds_in_flight = cmd; conn->sbuf.request.magic = htobe32 (NBD_REQUEST_MAGIC); conn->sbuf.request.flags = htobe16 (cmd->flags); @@ -40,29 +37,43 @@ return 0; ISSUE_COMMAND.SEND_REQUEST: - struct command_in_flight *cmd; - switch (send_from_wbuf (conn)) { case -1: SET_NEXT_STATE (%.DEAD); return -1; - case 0: - assert (conn->cmds_in_flight != NULL); - cmd = conn->cmds_in_flight; - assert (cmd->handle == be64toh (conn->sbuf.request.handle)); - if (cmd->type == NBD_CMD_WRITE) { - conn->wbuf = cmd->data; - conn->wlen = cmd->count; - SET_NEXT_STATE (%SEND_WRITE_PAYLOAD); - } - else - SET_NEXT_STATE (%.READY); + case 0: SET_NEXT_STATE (%PREPARE_WRITE_PAYLOAD); } return 0; + ISSUE_COMMAND.PREPARE_WRITE_PAYLOAD: + struct command_in_flight *cmd; + + assert (conn->cmds_to_issue != NULL); + cmd = conn->cmds_to_issue; + assert (cmd->handle == be64toh (conn->sbuf.request.handle)); + if (cmd->type == NBD_CMD_WRITE) { + conn->wbuf = cmd->data; + conn->wlen = cmd->count; + SET_NEXT_STATE (%SEND_WRITE_PAYLOAD); + } + else + SET_NEXT_STATE (%FINISH); + return 0; + ISSUE_COMMAND.SEND_WRITE_PAYLOAD: switch (send_from_wbuf (conn)) { case -1: SET_NEXT_STATE (%.DEAD); return -1; - case 0: SET_NEXT_STATE (%.READY); + case 0: SET_NEXT_STATE (%FINISH); } return 0; + ISSUE_COMMAND.FINISH: + struct command_in_flight *cmd; + + assert (conn->cmds_to_issue != NULL); + cmd = conn->cmds_to_issue; + conn->cmds_to_issue = cmd->next; + cmd->next = conn->cmds_in_flight; + conn->cmds_in_flight = cmd; + SET_NEXT_STATE (%.READY); + return 0; + } /* END STATE MACHINE */ -- 2.20.1
Eric Blake
2019-May-22 21:29 UTC
[Libguestfs] [libnbd PATCH v3 6/7] states: Allow in-flight read while writing next command
As already noted in our state machine, a client that batches up a large read followed by large writes, coupled with a server that only processes commands in order, can result in deadlock (the server won't read more until we unblock its ability to write out its reply to our first command; but we aren't willing to read until we are done writing out our second command). Break the deadlock by teaching the generator that while we are in the middle of writing a command, we must remain responsive to read_notify events; if the server has data for us to read, we should consume that before jumping back into the middle of our command issue. With the new overlapping execution, we also have to rearrange the storage - a reply will scribble over conn->sbuf, but we must not corrupt the bytes being sent during SEND_REQUEST, so we have to split the reply storage to a separate part of conn. --- generator/generator | 26 ++++++++++++++------ generator/states-issue-command.c | 42 +++++++++++++++++++++++++------- lib/internal.h | 7 +++++- 3 files changed, 57 insertions(+), 18 deletions(-) diff --git a/generator/generator b/generator/generator index f8d59a0..e37c71b 100755 --- a/generator/generator +++ b/generator/generator @@ -621,12 +621,6 @@ and issue_command_state_machine = [ State { default_state with name = "START"; - (* XXX There's a possible deadlock here if a server cannot - * handle multiple requests pipelined on a single connection. - * We could try to issue a command and block, but reads might - * be available. It should be possible to break this with - * another state. - *) comment = "Begin issuing a command to the remote server"; external_events = []; }; @@ -635,7 +629,15 @@ and issue_command_state_machine = [ default_state with name = "SEND_REQUEST"; comment = "Sending a request to the remote server"; - external_events = [ NotifyWrite, "" ]; + external_events = [ NotifyWrite, ""; + NotifyRead, "PAUSE_SEND_REQUEST" ]; + }; + + State { + default_state with + name = "PAUSE_SEND_REQUEST"; + comment = "Interrupt send request to receive an earlier command's reply"; + external_events = []; }; State { @@ -649,7 +651,15 @@ and issue_command_state_machine = [ default_state with name = "SEND_WRITE_PAYLOAD"; comment = "Sending the write payload to the remote server"; - external_events = [ NotifyWrite, "" ]; + external_events = [ NotifyWrite, ""; + NotifyRead, "PAUSE_WRITE_PAYLOAD" ]; + }; + +State { + default_state with + name = "PAUSE_WRITE_PAYLOAD"; + comment = "Interrupt write payload to receive an earlier command's reply"; + external_events = []; }; State { diff --git a/generator/states-issue-command.c b/generator/states-issue-command.c index e24ea34..5f00aa7 100644 --- a/generator/states-issue-command.c +++ b/generator/states-issue-command.c @@ -25,14 +25,23 @@ assert (conn->cmds_to_issue != NULL); cmd = conn->cmds_to_issue; - conn->sbuf.request.magic = htobe32 (NBD_REQUEST_MAGIC); - conn->sbuf.request.flags = htobe16 (cmd->flags); - conn->sbuf.request.type = htobe16 (cmd->type); - conn->sbuf.request.handle = htobe64 (cmd->handle); - conn->sbuf.request.offset = htobe64 (cmd->offset); - conn->sbuf.request.count = htobe32 ((uint32_t) cmd->count); - conn->wbuf = &conn->sbuf; - conn->wlen = sizeof (conn->sbuf.request); + /* Were we interrupted by reading a reply to an earlier command? */ + if (conn->wlen) { + if (conn->in_write_payload) + SET_NEXT_STATE(%SEND_WRITE_PAYLOAD); + else + SET_NEXT_STATE(%SEND_REQUEST); + return 0; + } + + conn->request.magic = htobe32 (NBD_REQUEST_MAGIC); + conn->request.flags = htobe16 (cmd->flags); + conn->request.type = htobe16 (cmd->type); + conn->request.handle = htobe64 (cmd->handle); + conn->request.offset = htobe64 (cmd->offset); + conn->request.count = htobe32 ((uint32_t) cmd->count); + conn->wbuf = &conn->request; + conn->wlen = sizeof (conn->request); SET_NEXT_STATE (%SEND_REQUEST); return 0; @@ -43,12 +52,19 @@ } return 0; + ISSUE_COMMAND.PAUSE_SEND_REQUEST: + assert (conn->wlen); + assert (conn->cmds_to_issue != NULL); + conn->in_write_payload = false; + SET_NEXT_STATE (%^REPLY.START); + return 0; + ISSUE_COMMAND.PREPARE_WRITE_PAYLOAD: struct command_in_flight *cmd; assert (conn->cmds_to_issue != NULL); cmd = conn->cmds_to_issue; - assert (cmd->handle == be64toh (conn->sbuf.request.handle)); + assert (cmd->handle == be64toh (conn->request.handle)); if (cmd->type == NBD_CMD_WRITE) { conn->wbuf = cmd->data; conn->wlen = cmd->count; @@ -65,9 +81,17 @@ } return 0; + ISSUE_COMMAND.PAUSE_WRITE_PAYLOAD: + assert (conn->wlen); + assert (conn->cmds_to_issue != NULL); + conn->in_write_payload = true; + SET_NEXT_STATE (%^REPLY.START); + return 0; + ISSUE_COMMAND.FINISH: struct command_in_flight *cmd; + assert (!conn->wlen); assert (conn->cmds_to_issue != NULL); cmd = conn->cmds_to_issue; conn->cmds_to_issue = cmd->next; diff --git a/lib/internal.h b/lib/internal.h index de9b8bc..13832d7 100644 --- a/lib/internal.h +++ b/lib/internal.h @@ -133,7 +133,6 @@ struct nbd_connection { } payload; } __attribute__((packed)) or; struct nbd_export_name_option_reply export_name_reply; - struct nbd_request request; struct nbd_simple_reply simple_reply; struct { struct nbd_structured_reply structured_reply; @@ -149,6 +148,12 @@ struct nbd_connection { uint32_t nrqueries; } sbuf; + /* Issuing a command must use a buffer separate from sbuf, for the + * case when we interrupt a request to service a reply. + */ + struct nbd_request request; + bool in_write_payload; + /* When connecting, this stores the socket address. */ struct sockaddr_storage connaddr; socklen_t connaddrlen; -- 2.20.1
Eric Blake
2019-May-22 21:29 UTC
[Libguestfs] [libnbd PATCH v3 7/7] examples: Add example to demonstrate just-fixed deadlock scenario
Added merely as an example to be run by hand, rather than automated into 'make check', since nbdkit --filter=noparallel is quite new. --- .gitignore | 1 + examples/Makefile.am | 10 ++ examples/batched-read-write.c | 214 ++++++++++++++++++++++++++++++++++ 3 files changed, 225 insertions(+) create mode 100644 examples/batched-read-write.c diff --git a/.gitignore b/.gitignore index 66ff811..2ea7da0 100644 --- a/.gitignore +++ b/.gitignore @@ -30,6 +30,7 @@ Makefile.in /docs/libnbd.3 /docs/libnbd-api.3 /docs/libnbd-api.pod +/examples/batched-read-write /examples/threaded-reads-and-writes /examples/simple-fetch-first-sector /examples/simple-reads-and-writes diff --git a/examples/Makefile.am b/examples/Makefile.am index be3f21d..6825384 100644 --- a/examples/Makefile.am +++ b/examples/Makefile.am @@ -18,6 +18,7 @@ include $(top_srcdir)/subdir-rules.mk noinst_PROGRAMS = \ + batched-read-write \ simple-fetch-first-sector \ simple-reads-and-writes \ threaded-reads-and-writes @@ -50,3 +51,12 @@ threaded_reads_and_writes_CFLAGS = \ threaded_reads_and_writes_LDADD = \ $(top_builddir)/lib/libnbd.la \ $(PTHREAD_LIBS) + +batched_read_write_SOURCES = \ + batched-read-write.c +batched_read_write_CPPFLAGS = \ + -I$(top_srcdir)/include +batched_read_write_CFLAGS = \ + $(WARNINGS_CFLAGS) +batched_read_write_LDADD = \ + $(top_builddir)/lib/libnbd.la diff --git a/examples/batched-read-write.c b/examples/batched-read-write.c new file mode 100644 index 0000000..5dda2e2 --- /dev/null +++ b/examples/batched-read-write.c @@ -0,0 +1,214 @@ +/* This example can be copied, used and modified for any purpose + * without restrictions. + * + * Example usage with nbdkit: + * + * nbdkit -U - --filter=noparallel memory 2M \ + * --run './batched-read-write $unixsocket' + * + * This will attempt to batch a large aio read request immediately + * followed by a large aio write request, prior to waiting for any + * command replies from the server. A naive client that does not check + * for available read data related to the first command while trying + * to write data for the second command, coupled with a server that + * only processes commands serially, would cause deadlock (both + * processes fill up their write buffers waiting for a reader); thus, + * this tests that libnbd is smart enough to always respond to replies + * for in-flight requests even when it has batched up other commands + * to write. + * + * To run it against a remote server over TCP: + * + * ./batched-read-write hostname port + */ + +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <stdint.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <poll.h> +#include <time.h> +#include <assert.h> +#include <signal.h> + +#include <libnbd.h> + +/* The single NBD handle. */ +static struct nbd_handle *nbd; + +/* Buffers used for the test. */ +static char *in, *out; +static int64_t packetsize; + +static int +try_deadlock (void *arg) +{ + struct pollfd fds[1]; + struct nbd_connection *conn; + size_t i; + int64_t handles[2], done; + size_t in_flight; /* counts number of requests in flight */ + int dir, r; + + /* The single thread "owns" the connection. */ + conn = nbd_get_connection (nbd, 0); + + /* Issue commands. */ + in_flight = 0; + handles[0] = nbd_aio_pread (conn, in, packetsize, 0); + if (handles[0] == -1) { + fprintf (stderr, "%s\n", nbd_get_error ()); + goto error; + } + in_flight++; + handles[1] = nbd_aio_pwrite (conn, out, packetsize, packetsize, 0); + if (handles[1] == -1) { + fprintf (stderr, "%s\n", nbd_get_error ()); + goto error; + } + in_flight++; + + /* Now wait for commands to retire, or for deadlock to occur */ + while (in_flight > 0) { + if (nbd_aio_is_dead (conn) || nbd_aio_is_closed (conn)) { + fprintf (stderr, "connection is dead or closed\n"); + goto error; + } + + fds[0].fd = nbd_aio_get_fd (conn); + fds[0].events = 0; + fds[0].revents = 0; + dir = nbd_aio_get_direction (conn); + if ((dir & LIBNBD_AIO_DIRECTION_READ) != 0) + fds[0].events |= POLLIN; + if ((dir & LIBNBD_AIO_DIRECTION_WRITE) != 0) + fds[0].events |= POLLOUT; + + if (poll (fds, 1, -1) == -1) { + perror ("poll"); + goto error; + } + + if ((dir & LIBNBD_AIO_DIRECTION_READ) != 0 && + (fds[0].revents & POLLIN) != 0) + nbd_aio_notify_read (conn); + else if ((dir & LIBNBD_AIO_DIRECTION_WRITE) != 0 && + (fds[0].revents & POLLOUT) != 0) + nbd_aio_notify_write (conn); + + /* If a command is ready to retire, retire it. */ + while ((done = nbd_aio_peek_command_completed (conn)) >= 0) { + for (i = 0; i < in_flight; ++i) { + if (handles[i] == done) { + r = nbd_aio_command_completed (conn, handles[i]); + if (r == -1) { + fprintf (stderr, "%s\n", nbd_get_error ()); + goto error; + } + assert (r); + memmove (&handles[i], &handles[i+1], + sizeof (handles[0]) * (in_flight - i - 1)); + break; + } + } + assert (i < in_flight); + in_flight--; + } + } + + printf ("finished OK\n"); + + return 0; + + error: + fprintf (stderr, "failed\n"); + return -1; +} + +static void +alarm_handler (int sig) +{ + fprintf (stderr, "alarm fired; deadlock probably occurred\n"); + exit (EXIT_FAILURE); +} + +int +main (int argc, char *argv[]) +{ + int64_t exportsize; + + if (argc < 2 || argc > 3) { + fprintf (stderr, "%s socket | hostname port\n", argv[0]); + exit (EXIT_FAILURE); + } + + nbd = nbd_create (); + if (nbd == NULL) { + fprintf (stderr, "%s\n", nbd_get_error ()); + exit (EXIT_FAILURE); + } + + /* Connect synchronously as this is simpler. */ + if (argc == 2) { + if (nbd_connect_unix (nbd, argv[1]) == -1) { + fprintf (stderr, "%s\n", nbd_get_error ()); + exit (EXIT_FAILURE); + } + } + else { + if (nbd_connect_tcp (nbd, argv[1], argv[2]) == -1) { + fprintf (stderr, "%s\n", nbd_get_error ()); + exit (EXIT_FAILURE); + } + } + + if (nbd_read_only (nbd) == 1) { + fprintf (stderr, "%s: error: this NBD export is read-only\n", argv[0]); + exit (EXIT_FAILURE); + } + + exportsize = nbd_get_size (nbd); + if (exportsize == -1) { + fprintf (stderr, "%s\n", nbd_get_error ()); + exit (EXIT_FAILURE); + } + packetsize = exportsize / 2; + if (packetsize > 2 * 1024 * 1024) + packetsize = 2 * 1024 * 1024; + + in = malloc (packetsize); + out = malloc (packetsize); + if (!in || !out) { + fprintf (stderr, "insufficient memory\n"); + exit (EXIT_FAILURE); + } + + /* Attempt to be non-destructive, by writing what file already contains */ + if (nbd_pread (nbd, out, packetsize, packetsize) == -1) { + fprintf (stderr, "sync read failed: %s\n", nbd_get_error ()); + exit (EXIT_FAILURE); + } + + /* When not debugging, set an alarm, in case this test deadlocks + * instead of succeeding + */ + if (nbd_get_debug (nbd) < 1) { + signal (SIGALRM, alarm_handler); + alarm (10); + } + + if (try_deadlock (NULL) == -1) + exit (EXIT_FAILURE); + + if (nbd_shutdown (nbd) == -1) { + fprintf (stderr, "%s\n", nbd_get_error ()); + exit (EXIT_FAILURE); + } + + nbd_close (nbd); + + return EXIT_SUCCESS; +} -- 2.20.1
Richard W.M. Jones
2019-May-22 22:12 UTC
Re: [Libguestfs] [libnbd PATCH v3 1/7] lib: Refactor command_common() to do more common work
This patch is now fine, although unfortunately I think you'll find it won't apply as-is since I made a mechanical change of turning int64_t extent_id into void *extent_data. But other than that, ACK. Rich. -- Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones Read my programming and virtualization blog: http://rwmj.wordpress.com libguestfs lets you edit virtual machines. Supports shell scripting, bindings from many languages. http://libguestfs.org
Richard W.M. Jones
2019-May-22 22:14 UTC
Re: [Libguestfs] [libnbd PATCH v3 2/7] commands: Allow for a command queue
This one is fine, ACK. Rich. -- Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones Read my programming and virtualization blog: http://rwmj.wordpress.com virt-p2v converts physical machines to virtual machines. Boot with a live CD or over the network (PXE) and turn machines into KVM guests. http://libguestfs.org/virt-v2v
Richard W.M. Jones
2019-May-22 22:17 UTC
Re: [Libguestfs] [libnbd PATCH v3 3/7] commands: Expose FIFO ordering of server completions
On Wed, May 22, 2019 at 04:29:03PM -0500, Eric Blake wrote:> +int64_t > +nbd_unlocked_aio_peek_command_completed (struct nbd_connection *conn) > +{ > + if (conn->cmds_done != NULL) > + return conn->cmds_done->handle; > + > + if (conn->cmds_in_flight != NULL || conn->cmds_to_issue != NULL) > + set_error (0, "no in-flight command has completed yet"); > + else > + set_error (0, "no commands are in flight"); > + return -1; > +} > --There are two ways I think we could improve this. Either: (1) I believe that all handles are >= 1 (and if this isn't true, it's easy to achieve by starting the h->unique counter at 1). Return 0 to mean "no completed command". Of course "no commands in flight" is still an error. I think (1) is more elegant, but the alternative is: (2) Use the errno field (currently 0) to distinguish the two cases with a suitable errno value (EAGAIN maybe?) for the "no completed command" case. No need to post this patch again, ACK if you make this kind of change. Rich. -- Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones Read my programming and virtualization blog: http://rwmj.wordpress.com virt-df lists disk usage of guests without needing to install any software inside the virtual machine. Supports Linux and Windows. http://people.redhat.com/~rjones/virt-df/
Richard W.M. Jones
2019-May-22 22:20 UTC
Re: [Libguestfs] [libnbd PATCH v3 4/7] disconnect: Allow shutdown during processing
On Wed, May 22, 2019 at 04:29:04PM -0500, Eric Blake wrote:> Although most of our synchronous commands are fine doing a round-robin > for the first ready connection, and will leave the connection back in > the ready state on conclusion, shutdown is a different beast. Once > handshake has finished, we want a shutdown request to reach all > connections, even if it has to be queued behind other commands already > being processed on that connection. > > In the future, we may want to consider a flags argument on whether to > wait for the connection to have no in-flight commands, and/or to force > a disconnect request to the front of the queue of commands to issue, > depending on whether the client wants a fast shutdown vs. completion > of outstanding requests to avoid data corruption. The NBD spec > recommends that NBD_CMD_DISC not be sent while any other commands are > in flight, but similarly that a server should not shut down due to > NBD_CMD_DISC until all other responses have been flushed. We may also > want to update the state machine to track that once NBD_CMD_DISC has > been sent, it is not valid to queue or send any further commands on > that connection, even if it is still up to receive replies to previous > in-flight commands. > --- > lib/disconnect.c | 3 ++- > 1 file changed, 2 insertions(+), 1 deletion(-) > > diff --git a/lib/disconnect.c b/lib/disconnect.c > index 9706835..0b8fea0 100644 > --- a/lib/disconnect.c > +++ b/lib/disconnect.c > @@ -31,7 +31,8 @@ nbd_unlocked_shutdown (struct nbd_handle *h) > size_t i; > > for (i = 0; i < h->multi_conn; ++i) { > - if (nbd_unlocked_aio_is_ready (h->conns[i])) { > + if (nbd_unlocked_aio_is_ready (h->conns[i]) || > + nbd_unlocked_aio_is_processing (h->conns[i])) { > if (nbd_unlocked_aio_disconnect (h->conns[i]) == -1) > return -1; > }ACK Rich. -- Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones Read my programming and virtualization blog: http://rwmj.wordpress.com virt-p2v converts physical machines to virtual machines. Boot with a live CD or over the network (PXE) and turn machines into KVM guests. http://libguestfs.org/virt-v2v
Richard W.M. Jones
2019-May-22 22:20 UTC
Re: [Libguestfs] [libnbd PATCH v3 7/7] examples: Add example to demonstrate just-fixed deadlock scenario
I admit I'm a bit tired now, but this all seems plausible to me, so ACK series. Let's see what breaks ... Rich. -- Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones Read my programming and virtualization blog: http://rwmj.wordpress.com virt-df lists disk usage of guests without needing to install any software inside the virtual machine. Supports Linux and Windows. http://people.redhat.com/~rjones/virt-df/
Possibly Parallel Threads
- [libnbd PATCH v2 0/5] Avoid deadlock with in-flight commands
- [libnbd PATCH 0/3] Avoid deadlock with in-flight commands
- [PATCH libnbd] api: Get rid of nbd_connection.
- [libnbd PATCH 0/2] in_flight improvements
- [libnbd PATCH 0/6] new APIs: aio_in_flight, aio_FOO_notify