Eric Blake
2019-May-22 21:29 UTC
[Libguestfs] [libnbd PATCH v3 0/7] Avoid deadlock with in-flight commands
Since v2: - rebase to Rich's new API calls - more refactoring in patch 1 (retitled) - new patches 3 and 4 - fix data corruption in patch 6 (was 4) - more tweaks to the reproducer example (including using new API from 3) Eric Blake (7): lib: Refactor command_common() to do more common work commands: Allow for a command queue commands: Expose FIFO ordering of server completions disconnect: Allow shutdown during processing states: Split ISSUE_COMMAND.SEND_REQUEST states: Allow in-flight read while writing next command examples: Add example to demonstrate just-fixed deadlock scenario .gitignore | 1 + examples/Makefile.am | 10 ++ examples/batched-read-write.c | 214 +++++++++++++++++++++++++++++++ generator/generator | 52 ++++++-- generator/states-issue-command.c | 85 ++++++++---- generator/states-reply.c | 13 +- generator/states.c | 5 + lib/aio.c | 13 ++ lib/disconnect.c | 23 +--- lib/internal.h | 14 +- lib/rw.c | 131 ++++++++----------- 11 files changed, 429 insertions(+), 132 deletions(-) create mode 100644 examples/batched-read-write.c -- 2.20.1
Eric Blake
2019-May-22 21:29 UTC
[Libguestfs] [libnbd PATCH v3 1/7] lib: Refactor command_common() to do more common work
Our construction of struct command_in_flight was ad hoc; most
parameters were set in command_common(), but extent callbacks were
done after the fact, and NBD_CMD_DISC was open-coding things.
Furthermore, every caller was triggering nbd_internal_run() for the
cmd_issue event; doing that in a central place makes it easier for the
next patch to improve that logic without duplicating the fix at each
caller. Fix these things by renaming the function to
nbd_internal_command_common, which is necessary for exporting it, and
by adding parameters and an updated return type.
---
lib/disconnect.c | 20 +++------
lib/internal.h | 7 ++++
lib/rw.c | 103 +++++++++++++----------------------------------
3 files changed, 39 insertions(+), 91 deletions(-)
diff --git a/lib/disconnect.c b/lib/disconnect.c
index bc43b4c..9706835 100644
--- a/lib/disconnect.c
+++ b/lib/disconnect.c
@@ -60,27 +60,17 @@ nbd_unlocked_shutdown (struct nbd_handle *h)
int
nbd_unlocked_aio_disconnect (struct nbd_connection *conn)
{
- struct command_in_flight *cmd;
+ int64_t id;
- cmd = malloc (sizeof *cmd);
- if (cmd == NULL) {
- set_error (errno, "malloc");
+ id = nbd_internal_command_common (conn, 0, NBD_CMD_DISC, 0, 0, NULL,
+ 0, NULL);
+ if (id == -1)
return -1;
- }
- cmd->flags = 0;
- cmd->type = NBD_CMD_DISC;
- cmd->handle = conn->h->unique++;
- cmd->offset = 0;
- cmd->count = 0;
- cmd->data = NULL;
-
- cmd->next = conn->cmds_to_issue;
- conn->cmds_to_issue = cmd;
/* This will leave the command on the in-flight list. Is this a
* problem? Probably it isn't. If it is, we could add a flag to
* the command struct to tell SEND_REQUEST not to add it to the
* in-flight list.
*/
- return nbd_internal_run (conn->h, conn, cmd_issue);
+ return 0;
}
diff --git a/lib/internal.h b/lib/internal.h
index 1f742da..de9b8bc 100644
--- a/lib/internal.h
+++ b/lib/internal.h
@@ -265,6 +265,13 @@ extern void nbd_internal_set_last_error (int errnum, char
*error);
extern int nbd_internal_errno_of_nbd_error (uint32_t error);
extern const char *nbd_internal_name_of_nbd_cmd (uint16_t type);
+/* rw.c */
+extern int64_t nbd_internal_command_common (struct nbd_connection *conn,
+ uint16_t flags, uint16_t type,
+ uint64_t offset, uint64_t count,
+ void *data, int64_t id,
+ extent_fn extent);
+
/* socket.c */
struct socket *nbd_internal_socket_create (int fd);
diff --git a/lib/rw.c b/lib/rw.c
index 861ab67..8f6227d 100644
--- a/lib/rw.c
+++ b/lib/rw.c
@@ -241,10 +241,11 @@ nbd_unlocked_block_status (struct nbd_handle *h,
return r == -1 ? -1 : 0;
}
-static struct command_in_flight *
-command_common (struct nbd_connection *conn,
- uint16_t flags, uint16_t type,
- uint64_t offset, uint64_t count, void *data)
+int64_t
+nbd_internal_command_common (struct nbd_connection *conn,
+ uint16_t flags, uint16_t type,
+ uint64_t offset, uint64_t count, void *data,
+ int64_t id, extent_fn extent)
{
struct command_in_flight *cmd;
@@ -255,7 +256,7 @@ command_common (struct nbd_connection *conn,
if (count > MAX_REQUEST_SIZE) {
set_error (ERANGE, "request too large: maximum request size is
%d",
MAX_REQUEST_SIZE);
- return NULL;
+ return -1;
}
break;
@@ -268,7 +269,7 @@ command_common (struct nbd_connection *conn,
if (count > UINT32_MAX) {
set_error (ERANGE, "request too large: maximum request size is
%" PRIu32,
UINT32_MAX);
- return NULL;
+ return -1;
}
break;
}
@@ -276,7 +277,7 @@ command_common (struct nbd_connection *conn,
cmd = calloc (1, sizeof *cmd);
if (cmd == NULL) {
set_error (errno, "calloc");
- return NULL;
+ return -1;
}
cmd->flags = flags;
cmd->type = type;
@@ -284,6 +285,8 @@ command_common (struct nbd_connection *conn,
cmd->offset = offset;
cmd->count = count;
cmd->data = data;
+ cmd->extent_id = id;
+ cmd->extent_fn = extent;
/* If structured replies were negotiated then we trust the server to
* send back sufficient data to cover the whole buffer. It's tricky
@@ -298,23 +301,18 @@ command_common (struct nbd_connection *conn,
cmd->next = conn->cmds_to_issue;
conn->cmds_to_issue = cmd;
+ if (nbd_internal_run (conn->h, conn, cmd_issue) == -1)
+ return -1;
- return cmd;
+ return cmd->handle;
}
int64_t
nbd_unlocked_aio_pread (struct nbd_connection *conn, void *buf,
size_t count, uint64_t offset)
{
- struct command_in_flight *cmd;
-
- cmd = command_common (conn, 0, NBD_CMD_READ, offset, count, buf);
- if (!cmd)
- return -1;
- if (nbd_internal_run (conn->h, conn, cmd_issue) == -1)
- return -1;
-
- return cmd->handle;
+ return nbd_internal_command_common (conn, 0, NBD_CMD_READ, offset, count,
+ buf, 0, NULL);
}
int64_t
@@ -322,8 +320,6 @@ nbd_unlocked_aio_pwrite (struct nbd_connection *conn, const
void *buf,
size_t count, uint64_t offset,
uint32_t flags)
{
- struct command_in_flight *cmd;
-
if (nbd_unlocked_read_only (conn->h) == 1) {
set_error (EINVAL, "server does not support write operations");
return -1;
@@ -340,33 +336,20 @@ nbd_unlocked_aio_pwrite (struct nbd_connection *conn,
const void *buf,
return -1;
}
- cmd = command_common (conn, flags, NBD_CMD_WRITE, offset, count,
- (void *) buf);
- if (!cmd)
- return -1;
- if (nbd_internal_run (conn->h, conn, cmd_issue) == -1)
- return -1;
-
- return cmd->handle;
+ return nbd_internal_command_common (conn, flags, NBD_CMD_WRITE, offset,
count,
+ (void *) buf, 0, NULL);
}
int64_t
nbd_unlocked_aio_flush (struct nbd_connection *conn)
{
- struct command_in_flight *cmd;
-
if (nbd_unlocked_can_flush (conn->h) != 1) {
set_error (EINVAL, "server does not support flush operations");
return -1;
}
- cmd = command_common (conn, 0, NBD_CMD_FLUSH, 0, 0, NULL);
- if (!cmd)
- return -1;
- if (nbd_internal_run (conn->h, conn, cmd_issue) == -1)
- return -1;
-
- return cmd->handle;
+ return nbd_internal_command_common (conn, 0, NBD_CMD_FLUSH, 0, 0,
+ NULL, 0, NULL);
}
int64_t
@@ -374,8 +357,6 @@ nbd_unlocked_aio_trim (struct nbd_connection *conn,
uint64_t count, uint64_t offset,
uint32_t flags)
{
- struct command_in_flight *cmd;
-
if (nbd_unlocked_read_only (conn->h) == 1) {
set_error (EINVAL, "server does not support write operations");
return -1;
@@ -397,21 +378,14 @@ nbd_unlocked_aio_trim (struct nbd_connection *conn,
return -1;
}
- cmd = command_common (conn, flags, NBD_CMD_TRIM, offset, count, NULL);
- if (!cmd)
- return -1;
- if (nbd_internal_run (conn->h, conn, cmd_issue) == -1)
- return -1;
-
- return cmd->handle;
+ return nbd_internal_command_common (conn, flags, NBD_CMD_TRIM, offset, count,
+ NULL, 0, NULL);
}
int64_t
nbd_unlocked_aio_cache (struct nbd_connection *conn,
uint64_t count, uint64_t offset)
{
- struct command_in_flight *cmd;
-
/* Actually according to the NBD protocol document, servers do exist
* that support NBD_CMD_CACHE but don't advertize the
* NBD_FLAG_SEND_CACHE bit, but we ignore those.
@@ -421,13 +395,8 @@ nbd_unlocked_aio_cache (struct nbd_connection *conn,
return -1;
}
- cmd = command_common (conn, 0, NBD_CMD_CACHE, offset, count, NULL);
- if (!cmd)
- return -1;
- if (nbd_internal_run (conn->h, conn, cmd_issue) == -1)
- return -1;
-
- return cmd->handle;
+ return nbd_internal_command_common (conn, 0, NBD_CMD_CACHE, offset, count,
+ NULL, 0, NULL);
}
int64_t
@@ -435,8 +404,6 @@ nbd_unlocked_aio_zero (struct nbd_connection *conn,
uint64_t count, uint64_t offset,
uint32_t flags)
{
- struct command_in_flight *cmd;
-
if (nbd_unlocked_read_only (conn->h) == 1) {
set_error (EINVAL, "server does not support write operations");
return -1;
@@ -458,13 +425,8 @@ nbd_unlocked_aio_zero (struct nbd_connection *conn,
return -1;
}
- cmd = command_common (conn, flags, NBD_CMD_WRITE_ZEROES, offset, count,
NULL);
- if (!cmd)
- return -1;
- if (nbd_internal_run (conn->h, conn, cmd_issue) == -1)
- return -1;
-
- return cmd->handle;
+ return nbd_internal_command_common (conn, flags, NBD_CMD_WRITE_ZEROES,
offset,
+ count, NULL, 0, NULL);
}
int64_t
@@ -474,8 +436,6 @@ nbd_unlocked_aio_block_status (struct nbd_connection *conn,
int64_t id,
extent_fn extent)
{
- struct command_in_flight *cmd;
-
if (!conn->structured_replies) {
set_error (ENOTSUP, "server does not support structured
replies");
return -1;
@@ -498,15 +458,6 @@ nbd_unlocked_aio_block_status (struct nbd_connection *conn,
return -1;
}
- cmd = command_common (conn, flags, NBD_CMD_BLOCK_STATUS, offset, count,
NULL);
- if (!cmd)
- return -1;
-
- cmd->extent_fn = extent;
- cmd->extent_id = id;
-
- if (nbd_internal_run (conn->h, conn, cmd_issue) == -1)
- return -1;
-
- return cmd->handle;
+ return nbd_internal_command_common (conn, flags, NBD_CMD_BLOCK_STATUS,
offset,
+ count, NULL, id, extent);
}
--
2.20.1
Eric Blake
2019-May-22 21:29 UTC
[Libguestfs] [libnbd PATCH v3 2/7] commands: Allow for a command queue
Previously, our 'cmds_to_issue' list was at most 1 element long,
because we reject all commands except from state READY, but don't get
back to state READY until the issue-commands sequence has completed.
However, this is not as friendly on the client - once we are in
transmission phase, a client may want to queue up another command
whether or not the state machine is still tied up in processing a
previous command. We still want to reject commands sent before the
first time we reach READY, as well as keep the cmd_issue event for
kicking the state machine into action when there is no previous
command being worked on, but otherwise, the state machine itself can
recognize when the command queue needs draining.
---
generator/states.c | 5 +++++
lib/rw.c | 32 +++++++++++++++++++++++++++-----
2 files changed, 32 insertions(+), 5 deletions(-)
diff --git a/generator/states.c b/generator/states.c
index 202c305..22ce853 100644
--- a/generator/states.c
+++ b/generator/states.c
@@ -110,6 +110,11 @@ send_from_wbuf (struct nbd_connection *conn)
/*----- End of prologue. -----*/
/* STATE MACHINE */ {
+ READY:
+ if (conn->cmds_to_issue)
+ SET_NEXT_STATE (%ISSUE_COMMAND.START);
+ return 0;
+
DEAD:
if (conn->sock) {
conn->sock->ops->close (conn->sock);
diff --git a/lib/rw.c b/lib/rw.c
index 8f6227d..594593a 100644
--- a/lib/rw.c
+++ b/lib/rw.c
@@ -24,6 +24,7 @@
#include <stdint.h>
#include <inttypes.h>
#include <errno.h>
+#include <assert.h>
#include "internal.h"
@@ -247,7 +248,15 @@ nbd_internal_command_common (struct nbd_connection *conn,
uint64_t offset, uint64_t count, void *data,
int64_t id, extent_fn extent)
{
- struct command_in_flight *cmd;
+ struct command_in_flight *cmd, *prev_cmd;
+
+ if (!nbd_unlocked_aio_is_ready (conn) &&
+ !nbd_unlocked_aio_is_processing (conn)) {
+ set_error (0, "command request %s is invalid in state %s",
+ nbd_internal_name_of_nbd_cmd (type),
+ nbd_internal_state_short_string (conn->state));
+ return -1;
+ }
switch (type) {
/* Commands which send or receive data are limited to MAX_REQUEST_SIZE. */
@@ -299,10 +308,23 @@ nbd_internal_command_common (struct nbd_connection *conn,
if (conn->structured_replies && cmd->data && type ==
NBD_CMD_READ)
memset (cmd->data, 0, cmd->count);
- cmd->next = conn->cmds_to_issue;
- conn->cmds_to_issue = cmd;
- if (nbd_internal_run (conn->h, conn, cmd_issue) == -1)
- return -1;
+ /* Add the command to the end of the queue. Kick the state machine
+ * if there is no other command being processed, otherwise, it will
+ * be handled automatically on a future cycle around to READY.
+ */
+ if (conn->cmds_to_issue != NULL) {
+ assert (nbd_unlocked_aio_is_processing (conn));
+ prev_cmd = conn->cmds_to_issue;
+ while (prev_cmd->next)
+ prev_cmd = prev_cmd->next;
+ prev_cmd->next = cmd;
+ }
+ else {
+ conn->cmds_to_issue = cmd;
+ if (nbd_unlocked_aio_is_ready (conn) &&
+ nbd_internal_run (conn->h, conn, cmd_issue) == -1)
+ return -1;
+ }
return cmd->handle;
}
--
2.20.1
Eric Blake
2019-May-22 21:29 UTC
[Libguestfs] [libnbd PATCH v3 3/7] commands: Expose FIFO ordering of server completions
A generic client exploiting multiple in-flight commands should be
prepared for out-of-order responses (and should probably ensure that
there are no offset/count overlaps between parallel in-flight commands
to avoid unspecified disk contents if the server acts on commands in
an arbitrary order or even exposing non-atomic splicing effects). But
a specific client aware of a specific server's behavior may have
reason to depend on fully serialized answers. For example, knowing
whether a write response arrived prior to a flush response can be used
to learn whether the flush covered the write, or whether another flush
may be needed. We need both a way to let the client query which
command completed first (a new nbd_aio_peek_command_completed) and to
treat the completed list as a queue rather than a stack to preserve
FIFO order.
---
generator/generator | 12 ++++++++++++
generator/states-reply.c | 13 ++++++++++---
lib/aio.c | 13 +++++++++++++
3 files changed, 35 insertions(+), 3 deletions(-)
diff --git a/generator/generator b/generator/generator
index 60dfb72..a1523e1 100755
--- a/generator/generator
+++ b/generator/generator
@@ -1628,6 +1628,18 @@ The C<handle> parameter is the unique 64 bit handle
for the command,
as returned by a call such as C<nbd_aio_pread>.";
};
+ "aio_peek_command_completed", {
+ default_call with
+ args = []; ret = RInt64;
+ shortdesc = "check if any command has completed";
+ longdesc = "\
+Return the unique 64 bit handle of the first non-retired but completed
+command, or -1 if no command is awaiting retirement. Any handle
+returned by this function must still be passed to
+C<aio_command_completed> to actually retire the command and learn
+whether the command was successful.";
+ };
+
"connection_state", {
default_call with
args = []; ret = RConstString;
diff --git a/generator/states-reply.c b/generator/states-reply.c
index 93f6cda..45362d4 100644
--- a/generator/states-reply.c
+++ b/generator/states-reply.c
@@ -103,13 +103,20 @@
}
assert (cmd != NULL);
- /* Move it to the cmds_done list. */
+ /* Move it to the end of the cmds_done list. */
if (prev_cmd != NULL)
prev_cmd->next = cmd->next;
else
conn->cmds_in_flight = cmd->next;
- cmd->next = conn->cmds_done;
- conn->cmds_done = cmd;
+ cmd->next = NULL;
+ if (conn->cmds_done) {
+ prev_cmd = conn->cmds_done;
+ while (prev_cmd->next)
+ prev_cmd = prev_cmd->next;
+ prev_cmd->next = cmd;
+ }
+ else
+ conn->cmds_done = cmd;
SET_NEXT_STATE (%.READY);
return 0;
diff --git a/lib/aio.c b/lib/aio.c
index c7764f8..105651f 100644
--- a/lib/aio.c
+++ b/lib/aio.c
@@ -158,3 +158,16 @@ nbd_unlocked_aio_command_completed (struct nbd_connection
*conn,
nbd_internal_name_of_nbd_cmd (type), strerror (error));
return -1;
}
+
+int64_t
+nbd_unlocked_aio_peek_command_completed (struct nbd_connection *conn)
+{
+ if (conn->cmds_done != NULL)
+ return conn->cmds_done->handle;
+
+ if (conn->cmds_in_flight != NULL || conn->cmds_to_issue != NULL)
+ set_error (0, "no in-flight command has completed yet");
+ else
+ set_error (0, "no commands are in flight");
+ return -1;
+}
--
2.20.1
Eric Blake
2019-May-22 21:29 UTC
[Libguestfs] [libnbd PATCH v3 4/7] disconnect: Allow shutdown during processing
Although most of our synchronous commands are fine doing a round-robin
for the first ready connection, and will leave the connection back in
the ready state on conclusion, shutdown is a different beast. Once
handshake has finished, we want a shutdown request to reach all
connections, even if it has to be queued behind other commands already
being processed on that connection.
In the future, we may want to consider a flags argument on whether to
wait for the connection to have no in-flight commands, and/or to force
a disconnect request to the front of the queue of commands to issue,
depending on whether the client wants a fast shutdown vs. completion
of outstanding requests to avoid data corruption. The NBD spec
recommends that NBD_CMD_DISC not be sent while any other commands are
in flight, but similarly that a server should not shut down due to
NBD_CMD_DISC until all other responses have been flushed. We may also
want to update the state machine to track that once NBD_CMD_DISC has
been sent, it is not valid to queue or send any further commands on
that connection, even if it is still up to receive replies to previous
in-flight commands.
---
lib/disconnect.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/lib/disconnect.c b/lib/disconnect.c
index 9706835..0b8fea0 100644
--- a/lib/disconnect.c
+++ b/lib/disconnect.c
@@ -31,7 +31,8 @@ nbd_unlocked_shutdown (struct nbd_handle *h)
size_t i;
for (i = 0; i < h->multi_conn; ++i) {
- if (nbd_unlocked_aio_is_ready (h->conns[i])) {
+ if (nbd_unlocked_aio_is_ready (h->conns[i]) ||
+ nbd_unlocked_aio_is_processing (h->conns[i])) {
if (nbd_unlocked_aio_disconnect (h->conns[i]) == -1)
return -1;
}
--
2.20.1
Eric Blake
2019-May-22 21:29 UTC
[Libguestfs] [libnbd PATCH v3 5/7] states: Split ISSUE_COMMAND.SEND_REQUEST
In order to handle reading an in-flight response while in the middle
of sending a second command, we'll need a way to jump back into the
middle of a command being sent. This is easier if the state that sets
wbuf/wlen is distinct from the state that reads into wbuf and alters
wlen, and also easier if we don't move the command to the in-flight
queue until after the writes finish (since the previous patch was
careful to queue new pending commands to the end of the list, the head
of the list is always the current command needing to be sent).
---
generator/generator | 14 ++++++++++
generator/states-issue-command.c | 45 ++++++++++++++++++++------------
2 files changed, 42 insertions(+), 17 deletions(-)
diff --git a/generator/generator b/generator/generator
index a1523e1..f8d59a0 100755
--- a/generator/generator
+++ b/generator/generator
@@ -638,12 +638,26 @@ and issue_command_state_machine = [
external_events = [ NotifyWrite, "" ];
};
+ State {
+ default_state with
+ name = "PREPARE_WRITE_PAYLOAD";
+ comment = "Prepare the write payload to send to the remote
server";
+ external_events = [];
+ };
+
State {
default_state with
name = "SEND_WRITE_PAYLOAD";
comment = "Sending the write payload to the remote server";
external_events = [ NotifyWrite, "" ];
};
+
+State {
+ default_state with
+ name = "FINISH";
+ comment = "Finish issuing a command";
+ external_events = [];
+ };
]
(* Receiving a reply from the server. *)
diff --git a/generator/states-issue-command.c b/generator/states-issue-command.c
index a57f40f..e24ea34 100644
--- a/generator/states-issue-command.c
+++ b/generator/states-issue-command.c
@@ -24,9 +24,6 @@
assert (conn->cmds_to_issue != NULL);
cmd = conn->cmds_to_issue;
- conn->cmds_to_issue = cmd->next;
- cmd->next = conn->cmds_in_flight;
- conn->cmds_in_flight = cmd;
conn->sbuf.request.magic = htobe32 (NBD_REQUEST_MAGIC);
conn->sbuf.request.flags = htobe16 (cmd->flags);
@@ -40,29 +37,43 @@
return 0;
ISSUE_COMMAND.SEND_REQUEST:
- struct command_in_flight *cmd;
-
switch (send_from_wbuf (conn)) {
case -1: SET_NEXT_STATE (%.DEAD); return -1;
- case 0:
- assert (conn->cmds_in_flight != NULL);
- cmd = conn->cmds_in_flight;
- assert (cmd->handle == be64toh (conn->sbuf.request.handle));
- if (cmd->type == NBD_CMD_WRITE) {
- conn->wbuf = cmd->data;
- conn->wlen = cmd->count;
- SET_NEXT_STATE (%SEND_WRITE_PAYLOAD);
- }
- else
- SET_NEXT_STATE (%.READY);
+ case 0: SET_NEXT_STATE (%PREPARE_WRITE_PAYLOAD);
}
return 0;
+ ISSUE_COMMAND.PREPARE_WRITE_PAYLOAD:
+ struct command_in_flight *cmd;
+
+ assert (conn->cmds_to_issue != NULL);
+ cmd = conn->cmds_to_issue;
+ assert (cmd->handle == be64toh (conn->sbuf.request.handle));
+ if (cmd->type == NBD_CMD_WRITE) {
+ conn->wbuf = cmd->data;
+ conn->wlen = cmd->count;
+ SET_NEXT_STATE (%SEND_WRITE_PAYLOAD);
+ }
+ else
+ SET_NEXT_STATE (%FINISH);
+ return 0;
+
ISSUE_COMMAND.SEND_WRITE_PAYLOAD:
switch (send_from_wbuf (conn)) {
case -1: SET_NEXT_STATE (%.DEAD); return -1;
- case 0: SET_NEXT_STATE (%.READY);
+ case 0: SET_NEXT_STATE (%FINISH);
}
return 0;
+ ISSUE_COMMAND.FINISH:
+ struct command_in_flight *cmd;
+
+ assert (conn->cmds_to_issue != NULL);
+ cmd = conn->cmds_to_issue;
+ conn->cmds_to_issue = cmd->next;
+ cmd->next = conn->cmds_in_flight;
+ conn->cmds_in_flight = cmd;
+ SET_NEXT_STATE (%.READY);
+ return 0;
+
} /* END STATE MACHINE */
--
2.20.1
Eric Blake
2019-May-22 21:29 UTC
[Libguestfs] [libnbd PATCH v3 6/7] states: Allow in-flight read while writing next command
As already noted in our state machine, a client that batches up a
large read followed by large writes, coupled with a server that only
processes commands in order, can result in deadlock (the server won't
read more until we unblock its ability to write out its reply to our
first command; but we aren't willing to read until we are done writing
out our second command). Break the deadlock by teaching the generator
that while we are in the middle of writing a command, we must remain
responsive to read_notify events; if the server has data for us to
read, we should consume that before jumping back into the middle of
our command issue. With the new overlapping execution, we also have to
rearrange the storage - a reply will scribble over conn->sbuf, but we
must not corrupt the bytes being sent during SEND_REQUEST, so we have
to split the reply storage to a separate part of conn.
---
generator/generator | 26 ++++++++++++++------
generator/states-issue-command.c | 42 +++++++++++++++++++++++++-------
lib/internal.h | 7 +++++-
3 files changed, 57 insertions(+), 18 deletions(-)
diff --git a/generator/generator b/generator/generator
index f8d59a0..e37c71b 100755
--- a/generator/generator
+++ b/generator/generator
@@ -621,12 +621,6 @@ and issue_command_state_machine = [
State {
default_state with
name = "START";
- (* XXX There's a possible deadlock here if a server cannot
- * handle multiple requests pipelined on a single connection.
- * We could try to issue a command and block, but reads might
- * be available. It should be possible to break this with
- * another state.
- *)
comment = "Begin issuing a command to the remote server";
external_events = [];
};
@@ -635,7 +629,15 @@ and issue_command_state_machine = [
default_state with
name = "SEND_REQUEST";
comment = "Sending a request to the remote server";
- external_events = [ NotifyWrite, "" ];
+ external_events = [ NotifyWrite, "";
+ NotifyRead, "PAUSE_SEND_REQUEST" ];
+ };
+
+ State {
+ default_state with
+ name = "PAUSE_SEND_REQUEST";
+ comment = "Interrupt send request to receive an earlier command's
reply";
+ external_events = [];
};
State {
@@ -649,7 +651,15 @@ and issue_command_state_machine = [
default_state with
name = "SEND_WRITE_PAYLOAD";
comment = "Sending the write payload to the remote server";
- external_events = [ NotifyWrite, "" ];
+ external_events = [ NotifyWrite, "";
+ NotifyRead, "PAUSE_WRITE_PAYLOAD" ];
+ };
+
+State {
+ default_state with
+ name = "PAUSE_WRITE_PAYLOAD";
+ comment = "Interrupt write payload to receive an earlier command's
reply";
+ external_events = [];
};
State {
diff --git a/generator/states-issue-command.c b/generator/states-issue-command.c
index e24ea34..5f00aa7 100644
--- a/generator/states-issue-command.c
+++ b/generator/states-issue-command.c
@@ -25,14 +25,23 @@
assert (conn->cmds_to_issue != NULL);
cmd = conn->cmds_to_issue;
- conn->sbuf.request.magic = htobe32 (NBD_REQUEST_MAGIC);
- conn->sbuf.request.flags = htobe16 (cmd->flags);
- conn->sbuf.request.type = htobe16 (cmd->type);
- conn->sbuf.request.handle = htobe64 (cmd->handle);
- conn->sbuf.request.offset = htobe64 (cmd->offset);
- conn->sbuf.request.count = htobe32 ((uint32_t) cmd->count);
- conn->wbuf = &conn->sbuf;
- conn->wlen = sizeof (conn->sbuf.request);
+ /* Were we interrupted by reading a reply to an earlier command? */
+ if (conn->wlen) {
+ if (conn->in_write_payload)
+ SET_NEXT_STATE(%SEND_WRITE_PAYLOAD);
+ else
+ SET_NEXT_STATE(%SEND_REQUEST);
+ return 0;
+ }
+
+ conn->request.magic = htobe32 (NBD_REQUEST_MAGIC);
+ conn->request.flags = htobe16 (cmd->flags);
+ conn->request.type = htobe16 (cmd->type);
+ conn->request.handle = htobe64 (cmd->handle);
+ conn->request.offset = htobe64 (cmd->offset);
+ conn->request.count = htobe32 ((uint32_t) cmd->count);
+ conn->wbuf = &conn->request;
+ conn->wlen = sizeof (conn->request);
SET_NEXT_STATE (%SEND_REQUEST);
return 0;
@@ -43,12 +52,19 @@
}
return 0;
+ ISSUE_COMMAND.PAUSE_SEND_REQUEST:
+ assert (conn->wlen);
+ assert (conn->cmds_to_issue != NULL);
+ conn->in_write_payload = false;
+ SET_NEXT_STATE (%^REPLY.START);
+ return 0;
+
ISSUE_COMMAND.PREPARE_WRITE_PAYLOAD:
struct command_in_flight *cmd;
assert (conn->cmds_to_issue != NULL);
cmd = conn->cmds_to_issue;
- assert (cmd->handle == be64toh (conn->sbuf.request.handle));
+ assert (cmd->handle == be64toh (conn->request.handle));
if (cmd->type == NBD_CMD_WRITE) {
conn->wbuf = cmd->data;
conn->wlen = cmd->count;
@@ -65,9 +81,17 @@
}
return 0;
+ ISSUE_COMMAND.PAUSE_WRITE_PAYLOAD:
+ assert (conn->wlen);
+ assert (conn->cmds_to_issue != NULL);
+ conn->in_write_payload = true;
+ SET_NEXT_STATE (%^REPLY.START);
+ return 0;
+
ISSUE_COMMAND.FINISH:
struct command_in_flight *cmd;
+ assert (!conn->wlen);
assert (conn->cmds_to_issue != NULL);
cmd = conn->cmds_to_issue;
conn->cmds_to_issue = cmd->next;
diff --git a/lib/internal.h b/lib/internal.h
index de9b8bc..13832d7 100644
--- a/lib/internal.h
+++ b/lib/internal.h
@@ -133,7 +133,6 @@ struct nbd_connection {
} payload;
} __attribute__((packed)) or;
struct nbd_export_name_option_reply export_name_reply;
- struct nbd_request request;
struct nbd_simple_reply simple_reply;
struct {
struct nbd_structured_reply structured_reply;
@@ -149,6 +148,12 @@ struct nbd_connection {
uint32_t nrqueries;
} sbuf;
+ /* Issuing a command must use a buffer separate from sbuf, for the
+ * case when we interrupt a request to service a reply.
+ */
+ struct nbd_request request;
+ bool in_write_payload;
+
/* When connecting, this stores the socket address. */
struct sockaddr_storage connaddr;
socklen_t connaddrlen;
--
2.20.1
Eric Blake
2019-May-22 21:29 UTC
[Libguestfs] [libnbd PATCH v3 7/7] examples: Add example to demonstrate just-fixed deadlock scenario
Added merely as an example to be run by hand, rather than automated
into 'make check', since nbdkit --filter=noparallel is quite new.
---
.gitignore | 1 +
examples/Makefile.am | 10 ++
examples/batched-read-write.c | 214 ++++++++++++++++++++++++++++++++++
3 files changed, 225 insertions(+)
create mode 100644 examples/batched-read-write.c
diff --git a/.gitignore b/.gitignore
index 66ff811..2ea7da0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -30,6 +30,7 @@ Makefile.in
/docs/libnbd.3
/docs/libnbd-api.3
/docs/libnbd-api.pod
+/examples/batched-read-write
/examples/threaded-reads-and-writes
/examples/simple-fetch-first-sector
/examples/simple-reads-and-writes
diff --git a/examples/Makefile.am b/examples/Makefile.am
index be3f21d..6825384 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -18,6 +18,7 @@
include $(top_srcdir)/subdir-rules.mk
noinst_PROGRAMS = \
+ batched-read-write \
simple-fetch-first-sector \
simple-reads-and-writes \
threaded-reads-and-writes
@@ -50,3 +51,12 @@ threaded_reads_and_writes_CFLAGS = \
threaded_reads_and_writes_LDADD = \
$(top_builddir)/lib/libnbd.la \
$(PTHREAD_LIBS)
+
+batched_read_write_SOURCES = \
+ batched-read-write.c
+batched_read_write_CPPFLAGS = \
+ -I$(top_srcdir)/include
+batched_read_write_CFLAGS = \
+ $(WARNINGS_CFLAGS)
+batched_read_write_LDADD = \
+ $(top_builddir)/lib/libnbd.la
diff --git a/examples/batched-read-write.c b/examples/batched-read-write.c
new file mode 100644
index 0000000..5dda2e2
--- /dev/null
+++ b/examples/batched-read-write.c
@@ -0,0 +1,214 @@
+/* This example can be copied, used and modified for any purpose
+ * without restrictions.
+ *
+ * Example usage with nbdkit:
+ *
+ * nbdkit -U - --filter=noparallel memory 2M \
+ * --run './batched-read-write $unixsocket'
+ *
+ * This will attempt to batch a large aio read request immediately
+ * followed by a large aio write request, prior to waiting for any
+ * command replies from the server. A naive client that does not check
+ * for available read data related to the first command while trying
+ * to write data for the second command, coupled with a server that
+ * only processes commands serially, would cause deadlock (both
+ * processes fill up their write buffers waiting for a reader); thus,
+ * this tests that libnbd is smart enough to always respond to replies
+ * for in-flight requests even when it has batched up other commands
+ * to write.
+ *
+ * To run it against a remote server over TCP:
+ *
+ * ./batched-read-write hostname port
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <poll.h>
+#include <time.h>
+#include <assert.h>
+#include <signal.h>
+
+#include <libnbd.h>
+
+/* The single NBD handle. */
+static struct nbd_handle *nbd;
+
+/* Buffers used for the test. */
+static char *in, *out;
+static int64_t packetsize;
+
+static int
+try_deadlock (void *arg)
+{
+ struct pollfd fds[1];
+ struct nbd_connection *conn;
+ size_t i;
+ int64_t handles[2], done;
+ size_t in_flight; /* counts number of requests in flight */
+ int dir, r;
+
+ /* The single thread "owns" the connection. */
+ conn = nbd_get_connection (nbd, 0);
+
+ /* Issue commands. */
+ in_flight = 0;
+ handles[0] = nbd_aio_pread (conn, in, packetsize, 0);
+ if (handles[0] == -1) {
+ fprintf (stderr, "%s\n", nbd_get_error ());
+ goto error;
+ }
+ in_flight++;
+ handles[1] = nbd_aio_pwrite (conn, out, packetsize, packetsize, 0);
+ if (handles[1] == -1) {
+ fprintf (stderr, "%s\n", nbd_get_error ());
+ goto error;
+ }
+ in_flight++;
+
+ /* Now wait for commands to retire, or for deadlock to occur */
+ while (in_flight > 0) {
+ if (nbd_aio_is_dead (conn) || nbd_aio_is_closed (conn)) {
+ fprintf (stderr, "connection is dead or closed\n");
+ goto error;
+ }
+
+ fds[0].fd = nbd_aio_get_fd (conn);
+ fds[0].events = 0;
+ fds[0].revents = 0;
+ dir = nbd_aio_get_direction (conn);
+ if ((dir & LIBNBD_AIO_DIRECTION_READ) != 0)
+ fds[0].events |= POLLIN;
+ if ((dir & LIBNBD_AIO_DIRECTION_WRITE) != 0)
+ fds[0].events |= POLLOUT;
+
+ if (poll (fds, 1, -1) == -1) {
+ perror ("poll");
+ goto error;
+ }
+
+ if ((dir & LIBNBD_AIO_DIRECTION_READ) != 0 &&
+ (fds[0].revents & POLLIN) != 0)
+ nbd_aio_notify_read (conn);
+ else if ((dir & LIBNBD_AIO_DIRECTION_WRITE) != 0 &&
+ (fds[0].revents & POLLOUT) != 0)
+ nbd_aio_notify_write (conn);
+
+ /* If a command is ready to retire, retire it. */
+ while ((done = nbd_aio_peek_command_completed (conn)) >= 0) {
+ for (i = 0; i < in_flight; ++i) {
+ if (handles[i] == done) {
+ r = nbd_aio_command_completed (conn, handles[i]);
+ if (r == -1) {
+ fprintf (stderr, "%s\n", nbd_get_error ());
+ goto error;
+ }
+ assert (r);
+ memmove (&handles[i], &handles[i+1],
+ sizeof (handles[0]) * (in_flight - i - 1));
+ break;
+ }
+ }
+ assert (i < in_flight);
+ in_flight--;
+ }
+ }
+
+ printf ("finished OK\n");
+
+ return 0;
+
+ error:
+ fprintf (stderr, "failed\n");
+ return -1;
+}
+
+static void
+alarm_handler (int sig)
+{
+ fprintf (stderr, "alarm fired; deadlock probably occurred\n");
+ exit (EXIT_FAILURE);
+}
+
+int
+main (int argc, char *argv[])
+{
+ int64_t exportsize;
+
+ if (argc < 2 || argc > 3) {
+ fprintf (stderr, "%s socket | hostname port\n", argv[0]);
+ exit (EXIT_FAILURE);
+ }
+
+ nbd = nbd_create ();
+ if (nbd == NULL) {
+ fprintf (stderr, "%s\n", nbd_get_error ());
+ exit (EXIT_FAILURE);
+ }
+
+ /* Connect synchronously as this is simpler. */
+ if (argc == 2) {
+ if (nbd_connect_unix (nbd, argv[1]) == -1) {
+ fprintf (stderr, "%s\n", nbd_get_error ());
+ exit (EXIT_FAILURE);
+ }
+ }
+ else {
+ if (nbd_connect_tcp (nbd, argv[1], argv[2]) == -1) {
+ fprintf (stderr, "%s\n", nbd_get_error ());
+ exit (EXIT_FAILURE);
+ }
+ }
+
+ if (nbd_read_only (nbd) == 1) {
+ fprintf (stderr, "%s: error: this NBD export is read-only\n",
argv[0]);
+ exit (EXIT_FAILURE);
+ }
+
+ exportsize = nbd_get_size (nbd);
+ if (exportsize == -1) {
+ fprintf (stderr, "%s\n", nbd_get_error ());
+ exit (EXIT_FAILURE);
+ }
+ packetsize = exportsize / 2;
+ if (packetsize > 2 * 1024 * 1024)
+ packetsize = 2 * 1024 * 1024;
+
+ in = malloc (packetsize);
+ out = malloc (packetsize);
+ if (!in || !out) {
+ fprintf (stderr, "insufficient memory\n");
+ exit (EXIT_FAILURE);
+ }
+
+ /* Attempt to be non-destructive, by writing what file already contains */
+ if (nbd_pread (nbd, out, packetsize, packetsize) == -1) {
+ fprintf (stderr, "sync read failed: %s\n", nbd_get_error ());
+ exit (EXIT_FAILURE);
+ }
+
+ /* When not debugging, set an alarm, in case this test deadlocks
+ * instead of succeeding
+ */
+ if (nbd_get_debug (nbd) < 1) {
+ signal (SIGALRM, alarm_handler);
+ alarm (10);
+ }
+
+ if (try_deadlock (NULL) == -1)
+ exit (EXIT_FAILURE);
+
+ if (nbd_shutdown (nbd) == -1) {
+ fprintf (stderr, "%s\n", nbd_get_error ());
+ exit (EXIT_FAILURE);
+ }
+
+ nbd_close (nbd);
+
+ return EXIT_SUCCESS;
+}
--
2.20.1
Richard W.M. Jones
2019-May-22 22:12 UTC
Re: [Libguestfs] [libnbd PATCH v3 1/7] lib: Refactor command_common() to do more common work
This patch is now fine, although unfortunately I think you'll find it won't apply as-is since I made a mechanical change of turning int64_t extent_id into void *extent_data. But other than that, ACK. Rich. -- Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones Read my programming and virtualization blog: http://rwmj.wordpress.com libguestfs lets you edit virtual machines. Supports shell scripting, bindings from many languages. http://libguestfs.org
Richard W.M. Jones
2019-May-22 22:14 UTC
Re: [Libguestfs] [libnbd PATCH v3 2/7] commands: Allow for a command queue
This one is fine, ACK. Rich. -- Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones Read my programming and virtualization blog: http://rwmj.wordpress.com virt-p2v converts physical machines to virtual machines. Boot with a live CD or over the network (PXE) and turn machines into KVM guests. http://libguestfs.org/virt-v2v
Richard W.M. Jones
2019-May-22 22:17 UTC
Re: [Libguestfs] [libnbd PATCH v3 3/7] commands: Expose FIFO ordering of server completions
On Wed, May 22, 2019 at 04:29:03PM -0500, Eric Blake wrote:> +int64_t > +nbd_unlocked_aio_peek_command_completed (struct nbd_connection *conn) > +{ > + if (conn->cmds_done != NULL) > + return conn->cmds_done->handle; > + > + if (conn->cmds_in_flight != NULL || conn->cmds_to_issue != NULL) > + set_error (0, "no in-flight command has completed yet"); > + else > + set_error (0, "no commands are in flight"); > + return -1; > +} > --There are two ways I think we could improve this. Either: (1) I believe that all handles are >= 1 (and if this isn't true, it's easy to achieve by starting the h->unique counter at 1). Return 0 to mean "no completed command". Of course "no commands in flight" is still an error. I think (1) is more elegant, but the alternative is: (2) Use the errno field (currently 0) to distinguish the two cases with a suitable errno value (EAGAIN maybe?) for the "no completed command" case. No need to post this patch again, ACK if you make this kind of change. Rich. -- Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones Read my programming and virtualization blog: http://rwmj.wordpress.com virt-df lists disk usage of guests without needing to install any software inside the virtual machine. Supports Linux and Windows. http://people.redhat.com/~rjones/virt-df/
Richard W.M. Jones
2019-May-22 22:20 UTC
Re: [Libguestfs] [libnbd PATCH v3 4/7] disconnect: Allow shutdown during processing
On Wed, May 22, 2019 at 04:29:04PM -0500, Eric Blake wrote:> Although most of our synchronous commands are fine doing a round-robin > for the first ready connection, and will leave the connection back in > the ready state on conclusion, shutdown is a different beast. Once > handshake has finished, we want a shutdown request to reach all > connections, even if it has to be queued behind other commands already > being processed on that connection. > > In the future, we may want to consider a flags argument on whether to > wait for the connection to have no in-flight commands, and/or to force > a disconnect request to the front of the queue of commands to issue, > depending on whether the client wants a fast shutdown vs. completion > of outstanding requests to avoid data corruption. The NBD spec > recommends that NBD_CMD_DISC not be sent while any other commands are > in flight, but similarly that a server should not shut down due to > NBD_CMD_DISC until all other responses have been flushed. We may also > want to update the state machine to track that once NBD_CMD_DISC has > been sent, it is not valid to queue or send any further commands on > that connection, even if it is still up to receive replies to previous > in-flight commands. > --- > lib/disconnect.c | 3 ++- > 1 file changed, 2 insertions(+), 1 deletion(-) > > diff --git a/lib/disconnect.c b/lib/disconnect.c > index 9706835..0b8fea0 100644 > --- a/lib/disconnect.c > +++ b/lib/disconnect.c > @@ -31,7 +31,8 @@ nbd_unlocked_shutdown (struct nbd_handle *h) > size_t i; > > for (i = 0; i < h->multi_conn; ++i) { > - if (nbd_unlocked_aio_is_ready (h->conns[i])) { > + if (nbd_unlocked_aio_is_ready (h->conns[i]) || > + nbd_unlocked_aio_is_processing (h->conns[i])) { > if (nbd_unlocked_aio_disconnect (h->conns[i]) == -1) > return -1; > }ACK Rich. -- Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones Read my programming and virtualization blog: http://rwmj.wordpress.com virt-p2v converts physical machines to virtual machines. Boot with a live CD or over the network (PXE) and turn machines into KVM guests. http://libguestfs.org/virt-v2v
Richard W.M. Jones
2019-May-22 22:20 UTC
Re: [Libguestfs] [libnbd PATCH v3 7/7] examples: Add example to demonstrate just-fixed deadlock scenario
I admit I'm a bit tired now, but this all seems plausible to me, so ACK series. Let's see what breaks ... Rich. -- Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones Read my programming and virtualization blog: http://rwmj.wordpress.com virt-df lists disk usage of guests without needing to install any software inside the virtual machine. Supports Linux and Windows. http://people.redhat.com/~rjones/virt-df/
Seemingly Similar Threads
- [libnbd PATCH v2 0/5] Avoid deadlock with in-flight commands
- [libnbd PATCH 0/3] Avoid deadlock with in-flight commands
- [PATCH libnbd] api: Get rid of nbd_connection.
- [libnbd PATCH 0/2] in_flight improvements
- [libnbd PATCH 0/6] new APIs: aio_in_flight, aio_FOO_notify