List moderators: sorry, I previously sent the message below while my Gmail client was still in HTML mode. It should be in plain-text mode now. As a follow-up to the message below: I caught an ssh mux process in the early stages of the problem and got a very different strace output. It is possible that the previous strace was taken after the problem had compounded on itself and reached "too many open files". Here's an strace of the ssh mux process when the problem starts, but is still in the early stages: select(80, [3 4 5 9], [], NULL, NULL) = 1 (in [4]) clock_gettime(0x7 /* CLOCK_??? */, {15504866, 382562195}) = 0 accept(4, {sa_family=AF_LOCAL, NULL}, [2]) = 80 getsockopt(80, SOL_SOCKET, SO_PEERCRED, {pid=20099, uid=1000, gid=1000}, [12]) = 0 getuid() = 1000 fcntl(80, F_SETFD, FD_CLOEXEC) = 0 fcntl(80, F_GETFL) = 0x2 (flags O_RDWR) fcntl(80, F_SETFL, O_RDWR|O_NONBLOCK) = 0 fcntl(80, F_GETFL) = 0x802 (flags O_RDWR|O_NONBLOCK) clock_gettime(0x7 /* CLOCK_??? */, {15504866, 382828691}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {15504866, 382855499}) = 0 select(81, [3 4 5 9 80], [80], NULL, NULL) = 2 (in [80], out [80]) clock_gettime(0x7 /* CLOCK_??? */, {15504866, 382924335}) = 0 read(80, "\0\0\0\10", 4) = 4 read(80, "\0\0\0\1\0\0\0\4", 8) = 8 write(80, "\0\0\0\10\0\0\0\1\0\0\0\4", 12) = 12 clock_gettime(0x7 /* CLOCK_??? */, {15504866, 383147628}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {15504866, 383176998}) = 0 select(81, [3 4 5 9 80], [], NULL, NULL) = 1 (in [80]) clock_gettime(0x7 /* CLOCK_??? */, {15504866, 383240708}) = 0 read(80, "\0\0\0\10", 4) = 4 read(80, "\20\0\0\4\0\0\0\0", 8) = 8 clock_gettime(0x7 /* CLOCK_??? */, {15504866, 383331936}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {15504866, 383356457}) = 0 select(81, [3 4 5 9 80], [80], NULL, NULL) = 1 (out [80]) clock_gettime(0x7 /* CLOCK_??? */, {15504866, 383416936}) = 0 write(80, "\0\0\0\f\200\0\0\5\0\0\0\0\0\0Gd", 16) = 16 clock_gettime(0x7 /* CLOCK_??? */, {15504866, 383472208}) = 0 clock_gettime(0x7 /* CLOCK_??? 
*/, {15504866, 383496660}) = 0 select(81, [3 4 5 9 80], [], NULL, NULL) = 1 (in [80]) clock_gettime(0x7 /* CLOCK_??? */, {15504866, 383556519}) = 0 read(80, "\0\0\0J", 4) = 4 read(80, "\20\0\0\2\0\0\0\1\0\0\0\0\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0~"..., 74) = 74 recvmsg(80, {msg_name(0)=NULL, msg_iov(1)=[{"\0", 1}], msg_controllen=24, {cmsg_len=20, cmsg_level=SOL_SOCKET, cmsg_type=SCM_RIGHTS, {81}}, msg_flags=0}, 0) = 1 recvmsg(80, {msg_name(0)=NULL, msg_iov(1)=[{"\0", 1}], msg_controllen=24, {cmsg_len=20, cmsg_level=SOL_SOCKET, cmsg_type=SCM_RIGHTS, {82}}, msg_flags=0}, 0) = 1 recvmsg(80, {msg_name(0)=NULL, msg_iov(1)=[{"\0", 1}], msg_controllen=24, {cmsg_len=20, cmsg_level=SOL_SOCKET, cmsg_type=SCM_RIGHTS, {83}}, msg_flags=0}, 0) = 1 ioctl(81, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or TCGETS, {B38400 opost isig icanon echo ...}) = 0 ioctl(81, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or TCGETS, {B38400 opost isig icanon echo ...}) = 0 ioctl(82, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or TCGETS, {B38400 opost isig icanon echo ...}) = 0 ioctl(83, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or TCGETS, {B38400 opost isig icanon echo ...}) = 0 fcntl(81, F_SETFD, FD_CLOEXEC) = 0 fcntl(82, F_SETFD, FD_CLOEXEC) = 0 fcntl(83, F_SETFD, FD_CLOEXEC) = 0 clock_gettime(0x7 /* CLOCK_??? */, {15504866, 383965138}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {15504866, 383994722}) = 0 select(84, [3 4 5 9], [3], NULL, NULL) = 1 (out [3]) clock_gettime(0x7 /* CLOCK_??? */, {15504866, 384069185}) = 0 write(3, "\0\0\0 \324\340\334\t\351p\fW^}\335\31m\2074\231\241\3\362\275)\33,r\211\26\370B"..., 52) = 52 clock_gettime(0x7 /* CLOCK_??? */, {15504866, 384139111}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {15504866, 384163755}) = 0 select(84, [3 4 5 9], [], NULL, NULL) = 1 (in [4]) clock_gettime(0x7 /* CLOCK_??? 
*/, {15504955, 618201367}) = 0 accept(4, {sa_family=AF_LOCAL, NULL}, [2]) = 84 getsockopt(84, SOL_SOCKET, SO_PEERCRED, {pid=21881, uid=1000, gid=1000}, [12]) = 0 getuid() = 1000 fcntl(84, F_SETFD, FD_CLOEXEC) = 0 fcntl(84, F_GETFL) = 0x2 (flags O_RDWR) fcntl(84, F_SETFL, O_RDWR|O_NONBLOCK) = 0 fcntl(84, F_GETFL) = 0x802 (flags O_RDWR|O_NONBLOCK) brk(0) = 0x7fa4ab18f000 brk(0x7fa4ab1b0000) = 0x7fa4ab1b0000 brk(0) = 0x7fa4ab1b0000 brk(0) = 0x7fa4ab1b0000 brk(0x7fa4ab1af000) = 0x7fa4ab1af000 brk(0) = 0x7fa4ab1af000 clock_gettime(0x7 /* CLOCK_??? */, {15504955, 618828564}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {15504955, 618871803}) = 0 select(85, [3 4 5 9 84], [84], NULL, NULL) = 2 (in [84], out [84]) clock_gettime(0x7 /* CLOCK_??? */, {15504955, 619030573}) = 0 read(84, "\0\0\0\10", 4) = 4 read(84, "\0\0\0\1\0\0\0\4", 8) = 8 write(84, "\0\0\0\10\0\0\0\1\0\0\0\4", 12) = 12 clock_gettime(0x7 /* CLOCK_??? */, {15504955, 619228330}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {15504955, 619273086}) = 0 select(85, [3 4 5 9 84], [], NULL, NULL) = 1 (in [84]) clock_gettime(0x7 /* CLOCK_??? */, {15504955, 619493236}) = 0 read(84, "\0\0\0\10", 4) = 4 read(84, "\20\0\0\4\0\0\0\0", 8) = 8 clock_gettime(0x7 /* CLOCK_??? */, {15504955, 619630299}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {15504955, 619671307}) = 0 select(85, [3 4 5 9 84], [84], NULL, NULL) = 1 (out [84]) clock_gettime(0x7 /* CLOCK_??? */, {15504955, 619801592}) = 0 write(84, "\0\0\0\f\200\0\0\5\0\0\0\0\0\0Gd", 16) = 16 clock_gettime(0x7 /* CLOCK_??? */, {15504955, 619900136}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {15504955, 619951778}) = 0 select(85, [3 4 5 9 84], [], NULL, NULL) = 1 (in [84]) clock_gettime(0x7 /* CLOCK_??? 
*/, {15504955, 620130640}) = 0 read(84, "\0\0\0J", 4) = 4 read(84, "\20\0\0\2\0\0\0\1\0\0\0\0\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0~"..., 74) = 74 recvmsg(84, 0x7ffe9921d480, 0) = -1 EAGAIN (Resource temporarily unavailable) poll([{fd=84, events=POLLIN}], 1, 4294967295) = 1 ([{fd=84, revents=POLLIN}]) recvmsg(84, {msg_name(0)=NULL, msg_iov(1)=[{"\0", 1}], msg_controllen=24, {cmsg_len=20, cmsg_level=SOL_SOCKET, cmsg_type=SCM_RIGHTS, {85}}, msg_flags=0}, 0) = 1 recvmsg(84, 0x7ffe9921d480, 0) = -1 EAGAIN (Resource temporarily unavailable) poll([{fd=84, events=POLLIN}], 1, 4294967295) = 1 ([{fd=84, revents=POLLIN}]) recvmsg(84, {msg_name(0)=NULL, msg_iov(1)=[{"\0", 1}], msg_controllen=24, {cmsg_len=20, cmsg_level=SOL_SOCKET, cmsg_type=SCM_RIGHTS, {86}}, msg_flags=0}, 0) = 1 recvmsg(84, 0x7ffe9921d480, 0) = -1 EAGAIN (Resource temporarily unavailable) poll([{fd=84, events=POLLIN}], 1, 4294967295) = 1 ([{fd=84, revents=POLLIN}]) recvmsg(84, {msg_name(0)=NULL, msg_iov(1)=[{"\0", 1}], msg_controllen=24, {cmsg_len=20, cmsg_level=SOL_SOCKET, cmsg_type=SCM_RIGHTS, {87}}, msg_flags=0}, 0) = 1 ioctl(85, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or TCGETS, {B38400 opost isig icanon echo ...}) = 0 ioctl(85, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or TCGETS, {B38400 opost isig icanon echo ...}) = 0 ioctl(86, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or TCGETS, {B38400 opost isig icanon echo ...}) = 0 ioctl(87, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or TCGETS, {B38400 opost isig icanon echo ...}) = 0 fcntl(85, F_SETFD, FD_CLOEXEC) = 0 fcntl(86, F_SETFD, FD_CLOEXEC) = 0 fcntl(87, F_SETFD, FD_CLOEXEC) = 0 clock_gettime(0x7 /* CLOCK_??? */, {15504955, 621065833}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {15504955, 621128532}) = 0 select(88, [3 4 5 9], [3], NULL, NULL) = 1 (out [3]) clock_gettime(0x7 /* CLOCK_??? 
*/, {15504955, 621248699}) = 0 write(3, "\0\0\0 \201d\201M\343eRg\347*\0232Q\25 \265\345\f(\304\224\272\265\246U\217\247\325"..., 52) = 52 clock_gettime(0x7 /* CLOCK_??? */, {15504955, 621355375}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {15504955, 621397659}) = 0 select(88, [3 4 5 9], [], NULL, NULL) = 1 (in [4]) clock_gettime(0x7 /* CLOCK_??? */, {15504957, 574394274}) = 0 accept(4, {sa_family=AF_LOCAL, NULL}, [2]) = 88 getsockopt(88, SOL_SOCKET, SO_PEERCRED, {pid=29906, uid=1000, gid=1000}, [12]) = 0 getuid() = 1000 fcntl(88, F_SETFD, FD_CLOEXEC) = 0 fcntl(88, F_GETFL) = 0x2 (flags O_RDWR) fcntl(88, F_SETFL, O_RDWR|O_NONBLOCK) = 0 fcntl(88, F_GETFL) = 0x802 (flags O_RDWR|O_NONBLOCK) clock_gettime(0x7 /* CLOCK_??? */, {15504957, 574787923}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {15504957, 574830224}) = 0 select(89, [3 4 5 9 88], [88], NULL, NULL) = 2 (in [88], out [88]) clock_gettime(0x7 /* CLOCK_??? */, {15504957, 574931974}) = 0 read(88, "\0\0\0\10", 4) = 4 read(88, "\0\0\0\1\0\0\0\4", 8) = 8 write(88, "\0\0\0\10\0\0\0\1\0\0\0\4", 12) = 12 clock_gettime(0x7 /* CLOCK_??? */, {15504957, 575318288}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {15504957, 575371615}) = 0 select(89, [3 4 5 9 88], [], NULL, NULL) = 1 (in [88]) clock_gettime(0x7 /* CLOCK_??? */, {15504957, 575450221}) = 0 read(88, "\0\0\0\10", 4) = 4 read(88, "\20\0\0\4\0\0\0\0", 8) = 8 clock_gettime(0x7 /* CLOCK_??? */, {15504957, 575575486}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {15504957, 575614725}) = 0 select(89, [3 4 5 9 88], [88], NULL, NULL) = 1 (out [88]) clock_gettime(0x7 /* CLOCK_??? */, {15504957, 575707881}) = 0 write(88, "\0\0\0\f\200\0\0\5\0\0\0\0\0\0Gd", 16) = 16 clock_gettime(0x7 /* CLOCK_??? */, {15504957, 575793033}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {15504957, 575834086}) = 0 select(89, [3 4 5 9 88], [], NULL, NULL) = 1 (in [88]) clock_gettime(0x7 /* CLOCK_??? 
*/, {15504957, 576980524}) = 0 read(88, "\0\0\0,", 4) = 4 read(88, "\20\0\0\2\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0~"..., 44) = 44 recvmsg(88, {msg_name(0)=NULL, msg_iov(1)=[{"\0", 1}], msg_controllen=24, {cmsg_len=20, cmsg_level=SOL_SOCKET, cmsg_type=SCM_RIGHTS, {89}}, msg_flags=0}, 0) = 1 recvmsg(88, {msg_name(0)=NULL, msg_iov(1)=[{"\0", 1}], msg_controllen=24, {cmsg_len=20, cmsg_level=SOL_SOCKET, cmsg_type=SCM_RIGHTS, {90}}, msg_flags=0}, 0) = 1 recvmsg(88, {msg_name(0)=NULL, msg_iov(1)=[{"\0", 1}], msg_controllen=24, {cmsg_len=20, cmsg_level=SOL_SOCKET, cmsg_type=SCM_RIGHTS, {91}}, msg_flags=0}, 0) = 1 ioctl(89, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or TCGETS, 0x7ffe9921d460) = -1 ENOTTY (Inappropriate ioctl for device) fcntl(89, F_GETFL) = 0x8802 (flags O_RDWR|O_NONBLOCK|O_LARGEFILE) ioctl(90, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or TCGETS, 0x7ffe9921d460) = -1 ENOTTY (Inappropriate ioctl for device) fcntl(90, F_GETFL) = 0x8002 (flags O_RDWR|O_LARGEFILE) fcntl(90, F_SETFL, O_RDWR|O_NONBLOCK|O_LARGEFILE) = 0 ioctl(91, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or TCGETS, 0x7ffe9921d460) = -1 ENOTTY (Inappropriate ioctl for device) fcntl(91, F_GETFL) = 0x8802 (flags O_RDWR|O_NONBLOCK|O_LARGEFILE) fcntl(89, F_SETFD, FD_CLOEXEC) = 0 fcntl(90, F_SETFD, FD_CLOEXEC) = 0 fcntl(91, F_SETFD, FD_CLOEXEC) = 0 clock_gettime(0x7 /* CLOCK_??? */, {15504957, 577707089}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {15504957, 577748657}) = 0 select(92, [3 4 5 9], [3], NULL, NULL) = 1 (out [3]) clock_gettime(0x7 /* CLOCK_??? */, {15504957, 577841298}) = 0 write(3, "\0\0\0 \370L#\214*\37f\343+>4\315S\352[\301\2758\302\301\344\206\235\373#q@\345"..., 52) = 52 clock_gettime(0x7 /* CLOCK_??? */, {15504957, 577942105}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {15504957, 577982422}) = 0 select(92, [3 4 5 9], [], NULL, NULL) = 1 (in [4]) clock_gettime(0x7 /* CLOCK_??? 
*/, {15504958, 417300261}) = 0 accept(4, {sa_family=AF_LOCAL, NULL}, [2]) = 92 getsockopt(92, SOL_SOCKET, SO_PEERCRED, {pid=33339, uid=1000, gid=1000}, [12]) = 0 getuid() = 1000 fcntl(92, F_SETFD, FD_CLOEXEC) = 0 fcntl(92, F_GETFL) = 0x2 (flags O_RDWR) fcntl(92, F_SETFL, O_RDWR|O_NONBLOCK) = 0 fcntl(92, F_GETFL) = 0x802 (flags O_RDWR|O_NONBLOCK) clock_gettime(0x7 /* CLOCK_??? */, {15504958, 417563912}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {15504958, 417591948}) = 0 select(93, [3 4 5 9 92], [92], NULL, NULL) = 2 (in [92], out [92]) clock_gettime(0x7 /* CLOCK_??? */, {15504958, 417663899}) = 0 read(92, "\0\0\0\10", 4) = 4 read(92, "\0\0\0\1\0\0\0\4", 8) = 8 write(92, "\0\0\0\10\0\0\0\1\0\0\0\4", 12) = 12 clock_gettime(0x7 /* CLOCK_??? */, {15504958, 417802399}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {15504958, 417829593}) = 0 select(93, [3 4 5 9 92], [], NULL, NULL) = 1 (in [92]) clock_gettime(0x7 /* CLOCK_??? */, {15504958, 417918397}) = 0 read(92, "\0\0\0\10", 4) = 4 read(92, "\20\0\0\4\0\0\0\0", 8) = 8 clock_gettime(0x7 /* CLOCK_??? */, {15504958, 418018539}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {15504958, 418046283}) = 0 select(93, [3 4 5 9 92], [92], NULL, NULL) = 1 (out [92]) clock_gettime(0x7 /* CLOCK_??? */, {15504958, 418128912}) = 0 write(92, "\0\0\0\f\200\0\0\5\0\0\0\0\0\0Gd", 16) = 16 clock_gettime(0x7 /* CLOCK_??? */, {15504958, 418235759}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {15504958, 418287367}) = 0 select(93, [3 4 5 9 92], [], NULL, NULL) = 1 (in [92]) clock_gettime(0x7 /* CLOCK_??? 
*/, {15504958, 418444113}) = 0 read(92, "\0\0\0,", 4) = 4 read(92, "\20\0\0\2\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0~"..., 44) = 44 recvmsg(92, {msg_name(0)=NULL, msg_iov(1)=[{"\0", 1}], msg_controllen=24, {cmsg_len=20, cmsg_level=SOL_SOCKET, cmsg_type=SCM_RIGHTS, {93}}, msg_flags=0}, 0) = 1 recvmsg(92, {msg_name(0)=NULL, msg_iov(1)=[{"\0", 1}], msg_controllen=24, {cmsg_len=20, cmsg_level=SOL_SOCKET, cmsg_type=SCM_RIGHTS, {94}}, msg_flags=0}, 0) = 1 recvmsg(92, {msg_name(0)=NULL, msg_iov(1)=[{"\0", 1}], msg_controllen=24, {cmsg_len=20, cmsg_level=SOL_SOCKET, cmsg_type=SCM_RIGHTS, {95}}, msg_flags=0}, 0) = 1 ioctl(93, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or TCGETS, 0x7ffe9921d460) = -1 ENOTTY (Inappropriate ioctl for device) fcntl(93, F_GETFL) = 0x8802 (flags O_RDWR|O_NONBLOCK|O_LARGEFILE) ioctl(94, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or TCGETS, 0x7ffe9921d460) = -1 ENOTTY (Inappropriate ioctl for device) fcntl(94, F_GETFL) = 0x8002 (flags O_RDWR|O_LARGEFILE) fcntl(94, F_SETFL, O_RDWR|O_NONBLOCK|O_LARGEFILE) = 0 ioctl(95, SNDCTL_TMR_TIMEBASE or SNDRV_TIMER_IOCTL_NEXT_DEVICE or TCGETS, 0x7ffe9921d460) = -1 ENOTTY (Inappropriate ioctl for device) fcntl(95, F_GETFL) = 0x8802 (flags O_RDWR|O_NONBLOCK|O_LARGEFILE) fcntl(93, F_SETFD, FD_CLOEXEC) = 0 fcntl(94, F_SETFD, FD_CLOEXEC) = 0 fcntl(95, F_SETFD, FD_CLOEXEC) = 0 clock_gettime(0x7 /* CLOCK_??? */, {15504958, 420023362}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {15504958, 420067236}) = 0 select(96, [3 4 5 9], [3], NULL, NULL) = 1 (out [3]) clock_gettime(0x7 /* CLOCK_??? */, {15504958, 420167386}) = 0 write(3, "\0\0\0 \221rL\31Z\230\302\362'}A.\256@P\373\222\336\2321\344I\310D\37\21\301\177"..., 52) = 52 clock_gettime(0x7 /* CLOCK_??? */, {15504958, 420282075}) = 0 clock_gettime(0x7 /* CLOCK_??? 
*/, {15504958, 420321699}) = 0 select(96, [3 4 5 9], [], NULL, NULL <detached ...> --- below was original message previously sent --- Hello ssh list, I'm a long-time user of OpenSSH, but relatively new to the concept of ssh multiplexing. I'm experiencing some issues and I haven't figured out how to troubleshoot them just yet. I would appreciate some help if possible. I'm using ssh as a communications mechanism to pass text-file-based messages between 2 hosts. There are programs on each side that send and receive these messages. When I found out about ssh multiplexing, I was excited to use it because we were seeing several hundred ssh connections going back and forth between the 2 hosts. When I tried ssh multiplexing, the message latency dropped dramatically, by 10 fold! However, now that this mechanism has been in use for a week, I'm starting to see some problems. First, this is the .ssh/config contents: Host * ControlPath ~/.ssh/cm-%r@%h:%p ControlMaster auto ControlPersist 10m Everything seems to work for a few days, but then ssh starts to hang, and we start seeing several hundred ssh processes all trying to send their messages but unable to. When I try to run ssh by hand, this is what I get: $ ssh -vvv boss@ui1 OpenSSH_6.6.1, OpenSSL 1.0.1e-fips 11 Feb 2013 debug1: Reading configuration data /var/lib/worker/.ssh/config debug1: /var/lib/worker/.ssh/config line 1: Applying options for * debug1: Reading configuration data /etc/ssh/ssh_config debug1: /etc/ssh/ssh_config line 56: Applying options for * debug1: auto-mux: Trying existing master And it hangs at that point indefinitely until Ctrl-C. 
At this point in time, we do see the ssh mux process still running: $ ps -eo pid,user,args | awk '$2=="worker" && $3=="ssh:" && $5=="[mux]" {print}' 29305 worker ssh: /var/lib/worker/.ssh/cm-boss@ui1:22 [mux] I tried to attach strace to the ssh mux process, and this is what I see when the problem is happening: select(1024, [3 5 9], [], NULL, {0, 11336}) = 0 (Timeout) clock_gettime(0x7 /* CLOCK_??? */, {17873813, 778030739}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {17873813, 778085461}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {17873813, 778109973}) = 0 select(1024, [3 4 5 9], [], NULL, NULL) = 1 (in [4]) clock_gettime(0x7 /* CLOCK_??? */, {17873813, 778186890}) = 0 accept(4, 0x7ffe26b34360, [128]) = -1 EMFILE (Too many open files) clock_gettime(0x7 /* CLOCK_??? */, {17873813, 778263743}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {17873813, 778298340}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {17873813, 778343707}) = 0 select(1024, [3 5 9], [], NULL, {1, 0}) = 0 (Timeout) clock_gettime(0x7 /* CLOCK_??? */, {17873814, 778457543}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {17873814, 778518096}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {17873814, 778546349}) = 0 select(1024, [3 4 5 9], [], NULL, NULL) = 1 (in [4]) clock_gettime(0x7 /* CLOCK_??? */, {17873814, 778627517}) = 0 accept(4, 0x7ffe26b34360, [128]) = -1 EMFILE (Too many open files) clock_gettime(0x7 /* CLOCK_??? */, {17873814, 778693493}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {17873814, 778725395}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {17873814, 778749417}) = 0 select(1024, [3 5 9], [], NULL, {1, 0}) = 0 (Timeout) clock_gettime(0x7 /* CLOCK_??? */, {17873815, 778904087}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {17873815, 778963540}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {17873815, 778988943}) = 0 select(1024, [3 4 5 9], [], NULL, NULL) = 1 (in [4]) clock_gettime(0x7 /* CLOCK_??? */, {17873815, 779072887}) = 0 accept(4, 0x7ffe26b34360, [128]) = -1 EMFILE (Too many open files) clock_gettime(0x7 /* CLOCK_??? 
*/, {17873815, 779158255}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {17873815, 779191597}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {17873815, 779216201}) = 0 select(1024, [3 5 9], [], NULL, {1, 0}) = 0 (Timeout) clock_gettime(0x7 /* CLOCK_??? */, {17873816, 779334945}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {17873816, 779393178}) = 0 clock_gettime(0x7 /* CLOCK_??? */, {17873816, 779418473}) = 0 Does this indicate an open-file limit for this user? Or is this something else? This is ulimit -a for that user: -bash-4.2$ ulimit -a core file size (blocks, -c) 0 data seg size (kbytes, -d) unlimited scheduling priority (-e) 0 file size (blocks, -f) unlimited pending signals (-i) 2062375 max locked memory (kbytes, -l) 64 max memory size (kbytes, -m) unlimited open files (-n) 1024 pipe size (512 bytes, -p) 8 POSIX message queues (bytes, -q) 819200 real-time priority (-r) 0 stack size (kbytes, -s) 8192 cpu time (seconds, -t) unlimited max user processes (-u) 4096 virtual memory (kbytes, -v) unlimited file locks (-x) unlimited Any advice on how to troubleshoot this further? Thanks in advance...