Lucas Mulling
2025-Jun-03 18:58 UTC
OpenSSH server fails to terminate executed program under certain conditions
Hello, We started experiencing failures in all architectures in a libssh test after updating to OpenSSH 10.0p2. The test in question is torture_channel_exit_signal, relevant code below: rc = snprintf(request, sizeof(request), "cat"); [...] /* Make the request, read parts with close */ rc = ssh_channel_request_exec(channel, request); assert_ssh_return_code(session, rc); rc = ssh_channel_request_send_signal(channel, "TERM"); assert_ssh_return_code(session, rc); exit_status = ssh_channel_get_exit_state(channel, &exit_status, &exit_signal, &core_dumped); In debugging I've found that the test requests SIGTERM so fast that the server -- OpenSSH -- has not had yet time to properly setup the child, resulting in killpg from session_signal_req failing (see log bellow). OpenSSH was build with: ./configure \ --with-pam \ --with-privsep-path=/var/lib/empty \ --with-systemd --libexecdir=$(pwd) With an added log to: @@ -1508,6 +1513,8 @@ do_child(struct ssh *ssh, Session *s, const char *command) struct passwd *pw = s->pw; int r = 0; + debug2("do child"); + sshpkt_fmt_connection_id(ssh, remote_id, sizeof(remote_id)); /* remove keys from memory */ Relevant log snippet: debug3: send packet: type 99 debug3: receive packet: type 98 debug1: server_input_channel_req: channel 0 request signal reply 0 debug1: session_by_channel: session 0 channel 0 debug1: session_input_channel_req: session 0 req signal debug1: session_signal_req: signal TERM, killpg(458610, 15) debug1: temporarily_use_uid: 5001/9000 (e=5001/9000) debug1: restore_uid: (unprivileged) session_signal_req: status(0) session_signal_req: killpg(458610, 15): No such process debug2: do child Note that do_child is called after killpg is called, this means (in my understating) that the correct gid has not been setup for killpg to work. This would, probably, not happen in a real world use case due to network delays. Could you clarify if this behaviour is intended?
Damien Miller
2025-Jun-03 23:37 UTC
OpenSSH server fails to terminate executed program under certain conditions
On Tue, 3 Jun 2025, Lucas Mulling via openssh-unix-dev wrote:> Hello, > > We started experiencing failures in all architectures in a libssh test after > updating to OpenSSH 10.0p2. The test in question is > torture_channel_exit_signal, relevant code below: > > rc = snprintf(request, sizeof(request), "cat"); > [...] > /* Make the request, read parts with close */ > rc = ssh_channel_request_exec(channel, request); > assert_ssh_return_code(session, rc); > rc = ssh_channel_request_send_signal(channel, "TERM"); > assert_ssh_return_code(session, rc); > > exit_status = ssh_channel_get_exit_state(channel, > &exit_status, > &exit_signal, > &core_dumped); > > In debugging I've found that the test requests SIGTERM so fast that the server > -- OpenSSH -- has not had yet time to properly setup the child, resulting in > killpg from session_signal_req failing (see log bellow). > > OpenSSH was build with: > ./configure \ > --with-pam \ > --with-privsep-path=/var/lib/empty \ > --with-systemd --libexecdir=$(pwd) > > With an added log to: > @@ -1508,6 +1513,8 @@ do_child(struct ssh *ssh, Session *s, const char *command) > struct passwd *pw = s->pw; > int r = 0; > > + debug2("do child"); > + > sshpkt_fmt_connection_id(ssh, remote_id, sizeof(remote_id)); > > /* remove keys from memory */ > > Relevant log snippet: > debug3: send packet: type 99 > debug3: receive packet: type 98 > debug1: server_input_channel_req: channel 0 request signal reply 0 > debug1: session_by_channel: session 0 channel 0 > debug1: session_input_channel_req: session 0 req signal > debug1: session_signal_req: signal TERM, killpg(458610, 15) > debug1: temporarily_use_uid: 5001/9000 (e=5001/9000) > debug1: restore_uid: (unprivileged) > session_signal_req: status(0) > session_signal_req: killpg(458610, 15): No such process > debug2: do child > > Note that do_child is called after killpg is called, this means (in my > understating) that the correct gid has not been setup for killpg to > work. This would, probably, not happen in a real world use case due to > network delays. > > Could you clarify if this behaviour is intended?I'd describe the behaviour as "incidental" :) The session child process is started via fork() and there is a small window before it becomes a process group leader via setsid(). Your test is hitting that window. I think we could eliminate this window by adding an interlock between the parent and child. Does this work for you? diff --git a/session.c b/session.c index acd44c6..f1ca285 100644 --- a/session.c +++ b/session.c @@ -339,6 +339,23 @@ xauth_valid_string(const char *s) return 1; } +/* Wait for child to signal it is now a process group leader by closing fd */ +static void +wait_child_setsid(Session *s, int fd) +{ + char c; + + debug3("session %d: waiting for child to start", s->self); + while (read(fd, &c, 1) == -1) { + if (errno != EINTR && errno != EAGAIN && errno != EWOULDBLOCK) { + error("session %d: read ready status: %s", s->self, + strerror(errno)); + break; + } + } + debug3("session %d: child ready", s->self); +} + #define USE_PIPES 1 /* * This is called to fork and execute a command when we have no tty. This @@ -350,7 +367,7 @@ do_exec_no_pty(struct ssh *ssh, Session *s, const char *command) { pid_t pid; #ifdef USE_PIPES - int pin[2], pout[2], perr[2]; + int pin[2], pout[2], perr[2], ready[2]; if (s == NULL) fatal("do_exec_no_pty: no session"); @@ -362,20 +379,24 @@ do_exec_no_pty(struct ssh *ssh, Session *s, const char *command) } if (pipe(pout) == -1) { error_f("pipe out: %.100s", strerror(errno)); + close1: close(pin[0]); close(pin[1]); return -1; } if (pipe(perr) == -1) { error_f("pipe err: %.100s", strerror(errno)); - close(pin[0]); - close(pin[1]); + close2: close(pout[0]); close(pout[1]); - return -1; + goto close1; + } + if (pipe(ready) == -1) { + error_f("pipe ready: %.100s", strerror(errno)); + goto close2; } #else - int inout[2], err[2]; + int inout[2], err[2], ready[2]; if (s == NULL) fatal("do_exec_no_pty: no session"); @@ -387,10 +408,15 @@ do_exec_no_pty(struct ssh *ssh, Session *s, const char *command) } if (socketpair(AF_UNIX, SOCK_STREAM, 0, err) == -1) { error_f("socketpair #2: %.100s", strerror(errno)); + close1: close(inout[0]); close(inout[1]); return -1; } + if (socketpair(AF_UNIX, SOCK_STREAM, 0, ready) == -1) { + error_f("socketpair rdy: %.100s", strerror(errno)); + goto close1; + } #endif session_proctitle(s); @@ -406,22 +432,28 @@ do_exec_no_pty(struct ssh *ssh, Session *s, const char *command) close(pout[1]); close(perr[0]); close(perr[1]); + close(ready[0]); + close(ready[1]); #else close(inout[0]); close(inout[1]); close(err[0]); close(err[1]); + close(ready[0]); + close(ready[1]); #endif return -1; case 0: is_child = 1; + close(ready[0]); /* * Create a new session and process group since the 4.4BSD * setlogin() affects the entire process group. */ if (setsid() == -1) error("setsid failed: %.100s", strerror(errno)); + close(ready[1]); #ifdef USE_PIPES /* @@ -461,7 +493,6 @@ do_exec_no_pty(struct ssh *ssh, Session *s, const char *command) perror("dup2 stderr"); close(err[0]); #endif - /* Do processing for the child (exec command etc). */ do_child(ssh, s, command); /* NOTREACHED */ @@ -469,6 +500,10 @@ do_exec_no_pty(struct ssh *ssh, Session *s, const char *command) break; } + close(ready[1]); + wait_child_setsid(s, ready[0]); + close(ready[0]); + s->pid = pid; /* Set interactive/non-interactive mode. */ ssh_packet_set_interactive(ssh, s->display != NULL, @@ -508,12 +543,25 @@ do_exec_pty(struct ssh *ssh, Session *s, const char *command) { int fdout, ptyfd, ttyfd, ptymaster; pid_t pid; + int ready[2]; if (s == NULL) fatal("do_exec_pty: no session"); ptyfd = s->ptyfd; ttyfd = s->ttyfd; +#ifdef USE_PIPES + if (pipe(ready) == -1) { + error_f("pipe ready: %.100s", strerror(errno)); + return -1; + } +#else + if (socketpair(AF_UNIX, SOCK_STREAM, 0, ready) == -1) { + error_f("socketpair rdy: %.100s", strerror(errno)); + return -1; + } +#endif + /* * Create another descriptor of the pty master side for use as the * standard input. We could use the original descriptor, but this @@ -544,18 +592,22 @@ do_exec_pty(struct ssh *ssh, Session *s, const char *command) close(ptymaster); close(ttyfd); close(ptyfd); + close(ready[0]); + close(ready[1]); return -1; case 0: is_child = 1; close(fdout); close(ptymaster); + close(ready[0]); /* Close the master side of the pseudo tty. */ close(ptyfd); /* Make the pseudo tty our controlling tty. */ pty_make_controlling_tty(&ttyfd, s->tty); + close(ready[1]); /* Redirect stdin/stdout/stderr from the pseudo tty. */ if (dup2(ttyfd, 0) == -1) @@ -582,6 +634,10 @@ do_exec_pty(struct ssh *ssh, Session *s, const char *command) } s->pid = pid; + close(ready[1]); + wait_child_setsid(s, ready[0]); + close(ready[0]); + /* Parent. Close the slave side of the pseudo tty. */ close(ttyfd);