Eric Blake
2019-Jun-07 14:15 UTC
[Libguestfs] [nbdkit PATCH v2 0/2] Reduce network overhead with MSG_MORE/corking
This time around, the numbers are indeed looking better than in v1; and I like the interface better. Eric Blake (2): server: Prefer send() over write() server: Group related transmission send()s server/internal.h | 7 +++- server/connections.c | 51 +++++++++++++++++++++++++--- server/crypto.c | 11 ++++-- server/protocol-handshake-newstyle.c | 22 ++++++------ server/protocol-handshake-oldstyle.c | 2 +- server/protocol.c | 22 ++++++------ 6 files changed, 85 insertions(+), 30 deletions(-) -- 2.20.1
Eric Blake
2019-Jun-07 14:15 UTC
[Libguestfs] [nbdkit PATCH v2 1/2] server: Prefer send() over write()
The next patch wants to use the MSG_MORE flag of send() where available. In preparation for that, we need a flags parameter to conn->send (always 0 for now), as well as a decision on whether we can use send() or must stick with write() (think nbdkit -s). Signed-off-by: Eric Blake <eblake@redhat.com> --- server/internal.h | 3 +- server/connections.c | 45 ++++++++++++++++++++++++---- server/crypto.c | 4 +-- server/protocol-handshake-newstyle.c | 22 +++++++------- server/protocol-handshake-oldstyle.c | 2 +- server/protocol.c | 20 ++++++------- 6 files changed, 66 insertions(+), 30 deletions(-) diff --git a/server/internal.h b/server/internal.h index 2ee5e23..50525f3 100644 --- a/server/internal.h +++ b/server/internal.h @@ -143,7 +143,8 @@ typedef int (*connection_recv_function) (struct connection *, void *buf, size_t len) __attribute__((__nonnull__ (1, 2))); typedef int (*connection_send_function) (struct connection *, - const void *buf, size_t len) + const void *buf, size_t len, + int flags) __attribute__((__nonnull__ (1, 2))); typedef void (*connection_close_function) (struct connection *) __attribute__((__nonnull__ (1))); diff --git a/server/connections.c b/server/connections.c index b7d9a6a..1a749b6 100644 --- a/server/connections.c +++ b/server/connections.c @@ -38,6 +38,7 @@ #include <inttypes.h> #include <string.h> #include <unistd.h> +#include <sys/socket.h> #include "internal.h" @@ -50,7 +51,10 @@ static void free_connection (struct connection *conn); /* Don't call these raw socket functions directly. Use conn->recv etc. */ static int raw_recv (struct connection *, void *buf, size_t len); -static int raw_send (struct connection *, const void *buf, size_t len); +static int raw_send_socket (struct connection *, const void *buf, size_t len, + int flags); +static int raw_send_other (struct connection *, const void *buf, size_t len, + int flags); static void raw_close (struct connection *); int @@ -268,6 +272,8 @@ static struct connection * new_connection (int sockin, int sockout, int nworkers) { struct connection *conn; + int opt; + socklen_t optlen = sizeof opt; conn = calloc (1, sizeof *conn); if (conn == NULL) { @@ -285,7 +291,10 @@ new_connection (int sockin, int sockout, int nworkers) pthread_mutex_init (&conn->status_lock, NULL); conn->recv = raw_recv; - conn->send = raw_send; + if (getsockopt (sockout, SOL_SOCKET, SO_TYPE, &opt, &optlen) == 0) + conn->send = raw_send_socket; + else + conn->send = raw_send_other; conn->close = raw_close; return conn; @@ -320,11 +329,37 @@ free_connection (struct connection *conn) free (conn); } -/* Write buffer to conn->sockout and either succeed completely - * (returns 0) or fail (returns -1). +/* Write buffer to conn->sockout with send() and either succeed completely + * (returns 0) or fail (returns -1). flags is ignored for now. */ static int -raw_send (struct connection *conn, const void *vbuf, size_t len) +raw_send_socket (struct connection *conn, const void *vbuf, size_t len, + int flags) +{ + int sock = conn->sockout; + const char *buf = vbuf; + ssize_t r; + + while (len > 0) { + r = send (sock, buf, len, 0); + if (r == -1) { + if (errno == EINTR || errno == EAGAIN) + continue; + return -1; + } + buf += r; + len -= r; + } + + return 0; +} + +/* Write buffer to conn->sockout with write() and either succeed completely + * (returns 0) or fail (returns -1). flags is ignored. + */ +static int +raw_send_other (struct connection *conn, const void *vbuf, size_t len, + int flags) { int sock = conn->sockout; const char *buf = vbuf; diff --git a/server/crypto.c b/server/crypto.c index 978a843..3f87944 100644 --- a/server/crypto.c +++ b/server/crypto.c @@ -346,10 +346,10 @@ crypto_recv (struct connection *conn, void *vbuf, size_t len) } /* Write buffer to GnuTLS and either succeed completely - * (returns 0) or fail (returns -1). + * (returns 0) or fail (returns -1). flags is ignored for now. */ static int -crypto_send (struct connection *conn, const void *vbuf, size_t len) +crypto_send (struct connection *conn, const void *vbuf, size_t len, int flags) { gnutls_session_t session = conn->crypto_session; const char *buf = vbuf; diff --git a/server/protocol-handshake-newstyle.c b/server/protocol-handshake-newstyle.c index dac2032..6993c8e 100644 --- a/server/protocol-handshake-newstyle.c +++ b/server/protocol-handshake-newstyle.c @@ -63,7 +63,7 @@ send_newstyle_option_reply (struct connection *conn, if (conn->send (conn, &fixed_new_option_reply, - sizeof fixed_new_option_reply) == -1) { + sizeof fixed_new_option_reply, 0) == -1) { /* The protocol document says that the client is allowed to simply * drop the connection after sending NBD_OPT_ABORT, or may read * the reply. @@ -94,18 +94,18 @@ send_newstyle_option_reply_exportname (struct connection *conn, if (conn->send (conn, &fixed_new_option_reply, - sizeof fixed_new_option_reply) == -1) { + sizeof fixed_new_option_reply, 0) == -1) { nbdkit_error ("write: %s: %m", name_of_nbd_opt (option)); return -1; } len = htobe32 (name_len); - if (conn->send (conn, &len, sizeof len) == -1) { + if (conn->send (conn, &len, sizeof len, 0) == -1) { nbdkit_error ("write: %s: %s: %m", name_of_nbd_opt (option), "sending length"); return -1; } - if (conn->send (conn, exportname, name_len) == -1) { + if (conn->send (conn, exportname, name_len, 0) == -1) { nbdkit_error ("write: %s: %s: %m", name_of_nbd_opt (option), "sending export name"); return -1; @@ -132,8 +132,8 @@ send_newstyle_option_reply_info_export (struct connection *conn, if (conn->send (conn, &fixed_new_option_reply, - sizeof fixed_new_option_reply) == -1 || - conn->send (conn, &export, sizeof export) == -1) { + sizeof fixed_new_option_reply, 0) == -1 || + conn->send (conn, &export, sizeof export, 0) == -1) { nbdkit_error ("write: %s: %m", name_of_nbd_opt (option)); return -1; } @@ -161,9 +161,9 @@ send_newstyle_option_reply_meta_context (struct connection *conn, if (conn->send (conn, &fixed_new_option_reply, - sizeof fixed_new_option_reply) == -1 || - conn->send (conn, &context, sizeof context) == -1 || - conn->send (conn, name, namelen) == -1) { + sizeof fixed_new_option_reply, 0) == -1 || + conn->send (conn, &context, sizeof context, 0) == -1 || + conn->send (conn, name, namelen, 0) == -1) { nbdkit_error ("write: %s: %m", name_of_nbd_opt (option)); return -1; } @@ -292,7 +292,7 @@ negotiate_handshake_newstyle_options (struct connection *conn) &handshake_finish, (conn->cflags & NBD_FLAG_NO_ZEROES) ? offsetof (struct new_handshake_finish, zeroes) - : sizeof handshake_finish) == -1) { + : sizeof handshake_finish, 0) == -1) { nbdkit_error ("write: %s: %m", optname); return -1; } @@ -656,7 +656,7 @@ protocol_handshake_newstyle (struct connection *conn) handshake.version = htobe64 (NEW_VERSION); handshake.gflags = htobe16 (gflags); - if (conn->send (conn, &handshake, sizeof handshake) == -1) { + if (conn->send (conn, &handshake, sizeof handshake, 0) == -1) { nbdkit_error ("write: %s: %m", "sending newstyle handshake"); return -1; } diff --git a/server/protocol-handshake-oldstyle.c b/server/protocol-handshake-oldstyle.c index 8dc87f4..9fde1ca 100644 --- a/server/protocol-handshake-oldstyle.c +++ b/server/protocol-handshake-oldstyle.c @@ -84,7 +84,7 @@ protocol_handshake_oldstyle (struct connection *conn) handshake.gflags = htobe16 (gflags); handshake.eflags = htobe16 (eflags); - if (conn->send (conn, &handshake, sizeof handshake) == -1) { + if (conn->send (conn, &handshake, sizeof handshake, 0) == -1) { nbdkit_error ("write: %m"); return -1; } diff --git a/server/protocol.c b/server/protocol.c index 6d519e7..0e054ee 100644 --- a/server/protocol.c +++ b/server/protocol.c @@ -398,7 +398,7 @@ send_simple_reply (struct connection *conn, reply.handle = handle; reply.error = htobe32 (nbd_errno (error, false)); - r = conn->send (conn, &reply, sizeof reply); + r = conn->send (conn, &reply, sizeof reply, 0); if (r == -1) { nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd)); return connection_set_status (conn, -1); @@ -406,7 +406,7 @@ send_simple_reply (struct connection *conn, /* Send the read data buffer. */ if (cmd == NBD_CMD_READ && !error) { - r = conn->send (conn, buf, count); + r = conn->send (conn, buf, count, 0); if (r == -1) { nbdkit_error ("write data: %s: %m", name_of_nbd_cmd (cmd)); return connection_set_status (conn, -1); @@ -439,7 +439,7 @@ send_structured_reply_read (struct connection *conn, reply.type = htobe16 (NBD_REPLY_TYPE_OFFSET_DATA); reply.length = htobe32 (count + sizeof offset_data); - r = conn->send (conn, &reply, sizeof reply); + r = conn->send (conn, &reply, sizeof reply, 0); if (r == -1) { nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd)); return connection_set_status (conn, -1); @@ -447,13 +447,13 @@ send_structured_reply_read (struct connection *conn, /* Send the offset + read data buffer. */ offset_data.offset = htobe64 (offset); - r = conn->send (conn, &offset_data, sizeof offset_data); + r = conn->send (conn, &offset_data, sizeof offset_data, 0); if (r == -1) { nbdkit_error ("write data: %s: %m", name_of_nbd_cmd (cmd)); return connection_set_status (conn, -1); } - r = conn->send (conn, buf, count); + r = conn->send (conn, buf, count, 0); if (r == -1) { nbdkit_error ("write data: %s: %m", name_of_nbd_cmd (cmd)); return connection_set_status (conn, -1); @@ -573,7 +573,7 @@ send_structured_reply_block_status (struct connection *conn, reply.length = htobe32 (sizeof context_id + nr_blocks * sizeof (struct block_descriptor)); - r = conn->send (conn, &reply, sizeof reply); + r = conn->send (conn, &reply, sizeof reply, 0); if (r == -1) { nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd)); return connection_set_status (conn, -1); @@ -581,7 +581,7 @@ send_structured_reply_block_status (struct connection *conn, /* Send the base:allocation context ID. */ context_id = htobe32 (base_allocation_id); - r = conn->send (conn, &context_id, sizeof context_id); + r = conn->send (conn, &context_id, sizeof context_id, 0); if (r == -1) { nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd)); return connection_set_status (conn, -1); @@ -589,7 +589,7 @@ send_structured_reply_block_status (struct connection *conn, /* Send each block descriptor. */ for (i = 0; i < nr_blocks; ++i) { - r = conn->send (conn, &blocks[i], sizeof blocks[i]); + r = conn->send (conn, &blocks[i], sizeof blocks[i], 0); if (r == -1) { nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd)); return connection_set_status (conn, -1); @@ -615,7 +615,7 @@ send_structured_reply_error (struct connection *conn, reply.type = htobe16 (NBD_REPLY_TYPE_ERROR); reply.length = htobe32 (0 /* no human readable error */ + sizeof error_data); - r = conn->send (conn, &reply, sizeof reply); + r = conn->send (conn, &reply, sizeof reply, 0); if (r == -1) { nbdkit_error ("write error reply: %m"); return connection_set_status (conn, -1); @@ -624,7 +624,7 @@ send_structured_reply_error (struct connection *conn, /* Send the error. */ error_data.error = htobe32 (nbd_errno (error, flags & NBD_CMD_FLAG_DF)); error_data.len = htobe16 (0); - r = conn->send (conn, &error_data, sizeof error_data); + r = conn->send (conn, &error_data, sizeof error_data, 0); if (r == -1) { nbdkit_error ("write data: %s: %m", name_of_nbd_cmd (cmd)); return connection_set_status (conn, -1); -- 2.20.1
Eric Blake
2019-Jun-07 14:15 UTC
[Libguestfs] [nbdkit PATCH v2 2/2] server: Group related transmission send()s
We disabled Nagle's algorithm to allow less latency in our responses reaching the client; but as a side effect, it leads to more network overhead when we send a reply split across more than one write(). Take advantage of various means for grouping related writes (Linux' MSG_MORE for sockets, gnutls' corking for TLS) to send a larger packet, and adjust callers to pass in our internal SEND_MORE flag as a hint when to use the improvements. I tested with appropriate combinations from: $ nbdkit {-p 10810,-U -} \ {--tls=require --tls-verify-peer --tls-psk=./keys.psk,} memory size=64m \ --run './aio-parallel-load{-tls,} {$unixsocket,nbd://localhost:10810}' with the following IOPS measurements averaged over multiple runs: pre post gain unix plain: 802627.8 822944.1 2.53% unix tls: 121871.6 125538.0 3.01% tcp plain: 656400.1 685795.0 4.48% tcp tls: 114552.1 120197.3 4.93% which looks like an overall improvement, even though it is still close to the margins for being in the noise. Signed-off-by: Eric Blake <eblake@redhat.com> --- server/internal.h | 4 ++++ server/connections.c | 10 ++++++++-- server/crypto.c | 7 +++++++ server/protocol-handshake-newstyle.c | 10 +++++----- server/protocol.c | 16 +++++++++------- 5 files changed, 33 insertions(+), 14 deletions(-) diff --git a/server/internal.h b/server/internal.h index 50525f3..2281937 100644 --- a/server/internal.h +++ b/server/internal.h @@ -139,6 +139,10 @@ extern void change_user (void); /* connections.c */ struct connection; +enum { + SEND_MORE = 1, /* Hint to use MSG_MORE/corking to group send()s */ +}; + typedef int (*connection_recv_function) (struct connection *, void *buf, size_t len) __attribute__((__nonnull__ (1, 2))); diff --git a/server/connections.c b/server/connections.c index 1a749b6..f56590c 100644 --- a/server/connections.c +++ b/server/connections.c @@ -330,7 +330,8 @@ free_connection (struct connection *conn) } /* Write buffer to conn->sockout with send() and either succeed completely - * (returns 0) or fail (returns -1). flags is ignored for now. + * (returns 0) or fail (returns -1). flags may include SEND_MORE as a hint + * that this send will be followed by related data. */ static int raw_send_socket (struct connection *conn, const void *vbuf, size_t len, @@ -339,9 +340,14 @@ raw_send_socket (struct connection *conn, const void *vbuf, size_t len, int sock = conn->sockout; const char *buf = vbuf; ssize_t r; + int f = 0; +#ifdef MSG_MORE + if (flags & SEND_MORE) + f |= MSG_MORE; +#endif while (len > 0) { - r = send (sock, buf, len, 0); + r = send (sock, buf, len, f); if (r == -1) { if (errno == EINTR || errno == EAGAIN) continue; diff --git a/server/crypto.c b/server/crypto.c index 3f87944..3f3d96b 100644 --- a/server/crypto.c +++ b/server/crypto.c @@ -357,6 +357,9 @@ crypto_send (struct connection *conn, const void *vbuf, size_t len, int flags) assert (session != NULL); + if (flags & SEND_MORE) + gnutls_record_cork (session); + while (len > 0) { r = gnutls_record_send (session, buf, len); if (r < 0) { @@ -368,6 +371,10 @@ crypto_send (struct connection *conn, const void *vbuf, size_t len, int flags) len -= r; } + if (!(flags & SEND_MORE) && + gnutls_record_uncork (session, GNUTLS_RECORD_WAIT) < 0) + return -1; + return 0; } diff --git a/server/protocol-handshake-newstyle.c b/server/protocol-handshake-newstyle.c index 6993c8e..e0136de 100644 --- a/server/protocol-handshake-newstyle.c +++ b/server/protocol-handshake-newstyle.c @@ -94,13 +94,13 @@ send_newstyle_option_reply_exportname (struct connection *conn, if (conn->send (conn, &fixed_new_option_reply, - sizeof fixed_new_option_reply, 0) == -1) { + sizeof fixed_new_option_reply, SEND_MORE) == -1) { nbdkit_error ("write: %s: %m", name_of_nbd_opt (option)); return -1; } len = htobe32 (name_len); - if (conn->send (conn, &len, sizeof len, 0) == -1) { + if (conn->send (conn, &len, sizeof len, SEND_MORE) == -1) { nbdkit_error ("write: %s: %s: %m", name_of_nbd_opt (option), "sending length"); return -1; @@ -132,7 +132,7 @@ send_newstyle_option_reply_info_export (struct connection *conn, if (conn->send (conn, &fixed_new_option_reply, - sizeof fixed_new_option_reply, 0) == -1 || + sizeof fixed_new_option_reply, SEND_MORE) == -1 || conn->send (conn, &export, sizeof export, 0) == -1) { nbdkit_error ("write: %s: %m", name_of_nbd_opt (option)); return -1; @@ -161,8 +161,8 @@ send_newstyle_option_reply_meta_context (struct connection *conn, if (conn->send (conn, &fixed_new_option_reply, - sizeof fixed_new_option_reply, 0) == -1 || - conn->send (conn, &context, sizeof context, 0) == -1 || + sizeof fixed_new_option_reply, SEND_MORE) == -1 || + conn->send (conn, &context, sizeof context, SEND_MORE) == -1 || conn->send (conn, name, namelen, 0) == -1) { nbdkit_error ("write: %s: %m", name_of_nbd_opt (option)); return -1; diff --git a/server/protocol.c b/server/protocol.c index 0e054ee..5967622 100644 --- a/server/protocol.c +++ b/server/protocol.c @@ -393,12 +393,13 @@ send_simple_reply (struct connection *conn, ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&conn->write_lock); struct simple_reply reply; int r; + int f = (cmd == NBD_CMD_READ && !error) ? SEND_MORE : 0; reply.magic = htobe32 (NBD_SIMPLE_REPLY_MAGIC); reply.handle = handle; reply.error = htobe32 (nbd_errno (error, false)); - r = conn->send (conn, &reply, sizeof reply, 0); + r = conn->send (conn, &reply, sizeof reply, f); if (r == -1) { nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd)); return connection_set_status (conn, -1); @@ -439,7 +440,7 @@ send_structured_reply_read (struct connection *conn, reply.type = htobe16 (NBD_REPLY_TYPE_OFFSET_DATA); reply.length = htobe32 (count + sizeof offset_data); - r = conn->send (conn, &reply, sizeof reply, 0); + r = conn->send (conn, &reply, sizeof reply, SEND_MORE); if (r == -1) { nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd)); return connection_set_status (conn, -1); @@ -447,7 +448,7 @@ send_structured_reply_read (struct connection *conn, /* Send the offset + read data buffer. */ offset_data.offset = htobe64 (offset); - r = conn->send (conn, &offset_data, sizeof offset_data, 0); + r = conn->send (conn, &offset_data, sizeof offset_data, SEND_MORE); if (r == -1) { nbdkit_error ("write data: %s: %m", name_of_nbd_cmd (cmd)); return connection_set_status (conn, -1); @@ -573,7 +574,7 @@ send_structured_reply_block_status (struct connection *conn, reply.length = htobe32 (sizeof context_id + nr_blocks * sizeof (struct block_descriptor)); - r = conn->send (conn, &reply, sizeof reply, 0); + r = conn->send (conn, &reply, sizeof reply, SEND_MORE); if (r == -1) { nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd)); return connection_set_status (conn, -1); @@ -581,7 +582,7 @@ send_structured_reply_block_status (struct connection *conn, /* Send the base:allocation context ID. */ context_id = htobe32 (base_allocation_id); - r = conn->send (conn, &context_id, sizeof context_id, 0); + r = conn->send (conn, &context_id, sizeof context_id, SEND_MORE); if (r == -1) { nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd)); return connection_set_status (conn, -1); @@ -589,7 +590,8 @@ send_structured_reply_block_status (struct connection *conn, /* Send each block descriptor. */ for (i = 0; i < nr_blocks; ++i) { - r = conn->send (conn, &blocks[i], sizeof blocks[i], 0); + r = conn->send (conn, &blocks[i], sizeof blocks[i], + i == nr_blocks - 1 ? 0 : SEND_MORE); if (r == -1) { nbdkit_error ("write reply: %s: %m", name_of_nbd_cmd (cmd)); return connection_set_status (conn, -1); @@ -615,7 +617,7 @@ send_structured_reply_error (struct connection *conn, reply.type = htobe16 (NBD_REPLY_TYPE_ERROR); reply.length = htobe32 (0 /* no human readable error */ + sizeof error_data); - r = conn->send (conn, &reply, sizeof reply, 0); + r = conn->send (conn, &reply, sizeof reply, SEND_MORE); if (r == -1) { nbdkit_error ("write error reply: %m"); return connection_set_status (conn, -1); -- 2.20.1
Richard W.M. Jones
2019-Jun-07 14:48 UTC
Re: [Libguestfs] [nbdkit PATCH v2 2/2] server: Group related transmission send()s
I too prefer this approach instead of TCP_CORK. While the performance gains are modest, the patch looks like the right thing to do, so: ACK series Thanks, Rich. -- Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones Read my programming and virtualization blog: http://rwmj.wordpress.com virt-p2v converts physical machines to virtual machines. Boot with a live CD or over the network (PXE) and turn machines into KVM guests. http://libguestfs.org/virt-v2v
Eric Blake
2019-Jun-10 16:23 UTC
Re: [Libguestfs] [nbdkit PATCH v2 2/2] server: Group related transmission send()s
On 6/7/19 9:15 AM, Eric Blake wrote:> We disabled Nagle's algorithm to allow less latency in our responses > reaching the client; but as a side effect, it leads to more network > overhead when we send a reply split across more than one write(). > Take advantage of various means for grouping related writes (Linux' > MSG_MORE for sockets, gnutls' corking for TLS) to send a larger > packet, and adjust callers to pass in our internal SEND_MORE flag as a > hint when to use the improvements. I tested with appropriate > combinations from: > > $ nbdkit {-p 10810,-U -} \ > {--tls=require --tls-verify-peer --tls-psk=./keys.psk,} memory size=64m \ > --run './aio-parallel-load{-tls,} {$unixsocket,nbd://localhost:10810}' > > with the following IOPS measurements averaged over multiple runs: > > pre post gain > unix plain: 802627.8 822944.1 2.53% > unix tls: 121871.6 125538.0 3.01% > tcp plain: 656400.1 685795.0 4.48% > tcp tls: 114552.1 120197.3 4.93% > > which looks like an overall improvement, even though it is still close > to the margins for being in the noise. >> +++ b/server/crypto.c > @@ -357,6 +357,9 @@ crypto_send (struct connection *conn, const void *vbuf, size_t len, int flags) > > assert (session != NULL); > > + if (flags & SEND_MORE) > + gnutls_record_cork (session); > + > while (len > 0) { > r = gnutls_record_send (session, buf, len); > if (r < 0) { > @@ -368,6 +371,10 @@ crypto_send (struct connection *conn, const void *vbuf, size_t len, int flags) > len -= r; > } > > + if (!(flags & SEND_MORE) && > + gnutls_record_uncork (session, GNUTLS_RECORD_WAIT) < 0) > + return -1; > +Even though my numbers showed improvements, aio-parallel-load (which used a 64k buf) is not necessarily the most typical load pattern, and later threads (here and on qemu where I triggered actual pessimisation when using a sequence of commands doing 2M accesses) have pointed out that it's probably better to do this more like: crypto_send() { if (SEND_MORE) { cork send } else if (size < threshold) { send uncork } else { uncork send } } as gnutls has a very annoying habit of realloc'ing space regardless of the sizing of how much data is sent while corked, then slams the underlying socket with enough data that you WILL need to wait for POLLOUT, while the point of SEND_MORE is to optimize only the cases where the ENTIRE message is likely to fit in a single TCP segment. Once you have a very large NBD_CMD_READ, where the remaining data is so large that it's going to take CPU time to encrypt it and where you're going to split into multiple TCP segments no matter what, there's no longer a benefit to stalling the partial packet of the reply header in waiting for the payload. Or stated another way, if you have 40 bytes overhead per TCP segment (probably larger for TLS), and have a 26-byte NBD_REPLY_TYPE_ERROR message split into a 20-byte header and 6-byte payload, the savings for transmitting 20+40 + 6+40 vs. 26+40 are huge (37% savings). If you have NBD_REPLY_TYPE_OFFSET_DATA covering 512 bytes of NBD_CMD_READ reply, the savings are a little less (20+40 + 8+40 + 512+40 vs. 540+40, but still a modest 12%). But once you have NBD_REPLY_TYPE_OFFSET_DATA covering 64k bytes of NBD_CMD_READ, you're GOING to have multiple segments no matter what, and whether those segments split as 48/64k or as 64k/48, you get the same overhead. However, I don't yet have a good idea for what that threshold number should be (64k per the maximum TCP max-segment-size, or 1500 bytes per traditional TCP MTU, or somewhere in between? Is there a way to dynamically learn a good threshold, or is a compile-time one good enough? -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Maybe Matching Threads
- [PATCH nbdkit v2 0/3] server: Remove explicit connection parameter.
- [PATCH nbdkit 0/2] server: Split out NBD protocol code from connections code.
- [PATCH nbdkit 0/3] server: Remove explicit connection parameter.
- [nbdkit PATCH 0/2] Reduce network overhead with corking
- [PATCH nbdkit 0/4] common/protocol: Unify public <nbd-protocol.h>