On Tue, 9 Nov 2021, rapier wrote:

> So this isn't an issue as much as a weird situation I am not fully
> understanding. That said, if I can understand it, then it might be a
> benefit.
>
> In the function client_process_net_input in clientloop.c, if I increase
> the size of buf[SSH_IOBUFSZ] to 128k I'm seeing a pretty substantial
> performance improvement - mostly when using aes256-ctr.
>
> For example, with the command
>
> ./ssh HostB -caes256-ctr "cat /dev/zero" > /dev/null
>
> I'm seeing throughput of around 610MB/s on a 10Gb network.
>
> When I use an unmodified version I'll see 480MB/s.
>
> Same hosts, same command. The only difference is the size of buf in
> client_process_net_input.
>
> HostA is a Xeon X5675 @ 3GHz. HostB is an AMD Ryzen 7 5800X.
>
> My initial assumption is that, since HostA is CPU bound, reducing the
> number of reads has a significant impact. That said, a nearly 30%
> improvement seems excessive for that to be all that's going on.
> Additionally, I'm not seeing as much improvement using chachapoly. In
> that case, I'm only seeing about a 20% improvement: 310MB/s for stock
> and 375MB/s for the big buffer.
>
> Additionally, I'm only seeing the improvement when HostB is sending the
> data and HostA is receiving. When HostA (the Xeon) is sending
> (cat /dev/zero | ./ssh HostB "cat > /dev/null"), I'm seeing about
> 290MB/s with either version.
>
> I'm not suggesting any changes to the code. I'm just trying to
> understand what might be happening, as it could be the buffer size,
> something with the CPU architecture, the switch I'm using, the distro
> (HostA is Fedora, HostB is Ubuntu), etc. Any clues would be appreciated.

Maybe try this instead; it avoids the intermediate buffer and tries to make
larger reads directly to the channel buffer. I wrote it while trying to
investigate a report that re-enabling TCP Nagle improved performance, but I
couldn't replicate the reported problem.

(I wrote this a while ago, but it's only lightly tested.)

diff --git a/channels.c b/channels.c
index 4903ad1..d6b2024 100644
--- a/channels.c
+++ b/channels.c
@@ -1928,16 +1928,41 @@ channel_handle_rfd(struct ssh *ssh, Channel *c,
 	char buf[CHAN_RBUF];
 	ssize_t len;
 	int r;
+	size_t avail, rlen, maxlen;
 
 	if (c->rfd == -1 || !FD_ISSET(c->rfd, readset))
 		return 1;
 
+	/*
+	 * For "simple" channels (i.e. not datagram or filtered), try to
+	 * read up to a complete remote window of data directly to the
+	 * channel buffer.
+	 */
+	if (c->input_filter == NULL && !c->datagram) {
+		if (sshbuf_len(c->input) >= c->remote_window ||
+		    (avail = sshbuf_avail(c->input)) == 0)
+			return 1; /* shouldn't happen */
+		maxlen = c->remote_window - sshbuf_len(c->input);
+		if (maxlen > avail)
+			maxlen = avail;
+		if (maxlen > CHANNEL_MAX_READ)
+			maxlen = CHANNEL_MAX_READ;
+		if ((r = sshbuf_read(c->rfd, c->input, maxlen, &rlen)) != 0) {
+			debug2("channel %d: read failed rfd %d maxlen %zu: %s",
+			    c->self, c->rfd, maxlen, ssh_err(r));
+			goto rfail;
+		}
+		return 1;
+	}
+
 	len = read(c->rfd, buf, sizeof(buf));
 	if (len == -1 && (errno == EINTR || errno == EAGAIN))
 		return 1;
 	if (len <= 0) {
-		debug2("channel %d: read<=0 rfd %d len %zd",
-		    c->self, c->rfd, len);
+		debug2("channel %d: read<=0 rfd %d len %zd: %s",
+		    c->self, c->rfd, len,
+		    len == 0 ? "closed" : strerror(errno));
+ rfail:
 		if (c->type != SSH_CHANNEL_OPEN) {
 			debug2("channel %d: not open", c->self);
 			chan_mark_dead(ssh, c);
@@ -1955,8 +1980,7 @@ channel_handle_rfd(struct ssh *ssh, Channel *c,
 	} else if (c->datagram) {
 		if ((r = sshbuf_put_string(c->input, buf, len)) != 0)
 			fatal_fr(r, "channel %i: put datagram", c->self);
-	} else if ((r = sshbuf_put(c->input, buf, len)) != 0)
-		fatal_fr(r, "channel %i: put data", c->self);
+	}
 	return 1;
 }

diff --git a/channels.h b/channels.h
index 633adc3..db90c18 100644
--- a/channels.h
+++ b/channels.h
@@ -231,6 +231,9 @@ struct Channel {
 /* Read buffer size */
 #define CHAN_RBUF (16*1024)
 
+/* Maximum size for direct reads to buffers */
+#define CHANNEL_MAX_READ (CHAN_SES_WINDOW_DEFAULT*2)
+
 /* Maximum channel input buffer size */
 #define CHAN_INPUT_MAX (16*1024*1024)
 
diff --git a/sshbuf-io.c b/sshbuf-io.c
index 966f820..0b7628f 100644
--- a/sshbuf-io.c
+++ b/sshbuf-io.c
@@ -113,3 +113,39 @@ sshbuf_write_file(const char *path, struct sshbuf *buf)
 	return 0;
 }
 
+int
+sshbuf_read(int fd, struct sshbuf *buf, size_t maxlen, size_t *rlen)
+{
+	int r, oerrno;
+	size_t adjust;
+	ssize_t rr;
+	u_char *d;
+
+	if (rlen != NULL)
+		*rlen = 0;
+	if ((r = sshbuf_reserve(buf, maxlen, &d)) != 0)
+		return r;
+	rr = read(fd, d, maxlen);
+	oerrno = errno;
+
+	/* Adjust the buffer to include only what was actually read */
+	if (rr > 0 && (adjust = maxlen - rr) > 0) {
+		if ((r = sshbuf_consume_end(buf, adjust)) != 0) {
+			/* avoid returning uninitialised data to caller */
+			memset(d + rr, '\0', adjust);
+			return SSH_ERR_INTERNAL_ERROR; /* shouldn't happen */
+		}
+	}
+	if (rr < 0) {
+		errno = oerrno;
+		return SSH_ERR_SYSTEM_ERROR;
+	} else if (rr == 0) {
+		errno = EPIPE;
+		return SSH_ERR_SYSTEM_ERROR;
+	}
+	/* success */
+	if (rlen != NULL)
+		*rlen = (size_t)rr;
+	return 0;
+}
+
diff --git a/sshbuf.h b/sshbuf.h
index 2b77d15..1ee9266 100644
--- a/sshbuf.h
+++ b/sshbuf.h
@@ -310,6 +310,10 @@ int sshbuf_load_file(const char *, struct sshbuf **)
 int sshbuf_write_file(const char *path, struct sshbuf *buf)
     __attribute__((__nonnull__ (2)));
 
+/* Read up to maxlen bytes from a fd directly to a buffer */
+int sshbuf_read(int, struct sshbuf *, size_t, size_t *)
+    __attribute__((__nonnull__ (2)));
+
 /* Macros for decoding/encoding integers */
 #define PEEK_U64(p) \
 	(((u_int64_t)(((const u_char *)(p))[0]) << 56) | \
Thanks, I'll take a look. I actually got it up to 815MB/s and the 320MB/s
improvement is just too much for me to not poke at ruthlessly. My goal has
always been to get near line rate with full encryption.

Chris

On 11/9/21 6:12 PM, Damien Miller wrote:
> [...]
This patch seems to be faster, but I'm running into a weird issue.

I have your patch running on the server iztli10 on port 2205. If I run this
command:

dd if=/dev/urandom bs=1024 count=5000 | /opt/ssh/stock-8.7/bin/ssh -caes256-ctr -p 2205 iztli10 'cat > /dev/null' > foo

I end up with 2097152 bytes of nulls in 'foo'. I see this when running either
a stock 8.7 client or a patched client.

The patch didn't apply cleanly to 8.7, but I might not have integrated it
properly. What version of ssh did you generate this patch against?

Chris

On 11/9/21 6:12 PM, Damien Miller wrote:
> [...]
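For what it's worth, a quick way to confirm the symptom above independently
of ssh is a tiny check like the following (hypothetical; only the 'foo'
filename and the 2097152 figure come from the report above), which reports
how much of the output file is NUL bytes:

/* count_nuls.c - hypothetical check of the 'foo' output file */
#include <stdio.h>

int
main(void)
{
	FILE *f;
	long total = 0, nuls = 0;
	int ch;

	if ((f = fopen("foo", "rb")) == NULL) {
		perror("foo");
		return 1;
	}
	while ((ch = fgetc(f)) != EOF) {
		total++;
		if (ch == 0)
			nuls++;
	}
	fclose(f);
	/* If the file is all NULs, as reported, both counts should be 2097152. */
	printf("%ld bytes total, %ld NUL\n", total, nuls);
	return 0;
}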