Hi,
We started working with rsync to sync data between remote sites.
We started seeing many stuck rsync processes.
Usually it happens in the ssh stage: ssh issues a "select" syscall
on fd #4 even though that descriptor is long gone.
Here is an example:
root@ptsl2171:/root# ps -efwww |grep ekrimer
ekrimer 28619 4979 0 Jan28 ? 00:00:00
/var/netstar//lib/build_0134_19/nbjobleader.out
/arch/projects/gesher/gesher_high
/a/nfs/iil/proj/mpgarch/arch_vpool_1/ekrimer/ambig/task_nhm /netbatch
/a/nfs/iil/proj/mpgarch/arch_vpool_1/ekrimer/ambig/task_nhm/##post_exec_
1.vpool_idc.7496709 /netbatch/##post_exec_1.vpool_idc.7496709
/a/nfs/iil/proj/mpgarch/arch_vpool_1/ekrimer/ambig/task_nhm/##post_exec_
1.vpool_idc.7496709 /netbatch/##post_exec_1.vpool_idc.7496709 0 batch
ptsl2171 BATCH 1138446228 1138472243 1.vpool_idc.7496709 19 ,cputime
soft = unlimited,cputime hard = unlimited,filesize soft unlimited,filesize hard
= unlimited,datasize soft = unlimited,datasize
hard = unlimited,stacksize soft = 8192,stacksize hard unlimited,coredumpsize
soft = 0,coredumpsize hard = unlimited,openfiles
soft = 1024,openfiles hard = 8192,descriptors soft = 1024,descriptors
hard = 8192,addressspace soft = unlimited,addressspace hard
unlimited,memorylocked soft = unlimited,memorylocked hard unlimited,maxproc soft
= 16384,maxproc hard = 16384,memoryuse soft unlimited,memoryuse hard = unlimited
null false false false 5 0
/nfs/site/proj/mpgarch/perf/tools/scripts/bin/arch_post.csh
/netbatch/ekrimer/task_nhm_296/runs
/nfs/site/proj/mpgarch/arch_vpool_1/ekrimer/ambig/results
ekrimer 28620 28619 0 Jan28 ? 00:00:00 /bin/csh -f
/nfs/site/proj/mpgarch/perf/tools/scripts/bin/arch_post.csh
/netbatch/ekrimer/task_nhm_296/runs
/nfs/site/proj/mpgarch/arch_vpool_1/ekrimer/ambig/results
ekrimer 28641 28620 0 Jan28 ? 00:00:00 /usr/intel/bin/rsync -e
ssh -azx --rsync-path=/usr/intel/bin/rsync
/netbatch/ekrimer/task_nhm_296/runs
rsync-mpgarch.iil.intel.com:/nfs/site/proj/mpgarch/arch_vpool_1/ekrimer/
ambig/results
ekrimer 28642 28641 0 Jan28 ? 00:00:00 ssh
rsync-mpgarch.iil.intel.com /usr/intel/bin/rsync --server -logDtprxz .
/nfs/site/proj/mpgarch/arch_vpool_1/ekrimer/ambig/results
root 7647 7606 0 20:22 pts/0 00:00:00 grep ekrimer
root@ptsl2171:/root# strace -p 28620
root@ptsl2171:/root# strace -p 28641
select(5, NULL, [4], NULL, {48, 20000} <unfinished ...>
root@ptsl2171:/root# ls -l /proc/28641/fd/5
lrwx------ 1 ekrimer arch 64 Jan 30 20:23 /proc/28641/fd/5
-> socket:[92942621]
root@ptsl2171:/root# strace -p 28642
select(4, [], [3], NULL, NULL <unfinished ...>
root@ptsl2171:/root# ls -l /proc/28642/fd/4
ls: /proc/28642/fd/4: No such file or directory
root@ptsl2171:/root# ls -l /proc/*/fd/* | grep 'socket:\[92942621\]'
ls: /proc/8035/fd/255: No such file or directory
ls: /proc/8035/fd/3: No such file or directory
ls: /proc/self/fd/255: No such file or directory
ls: /proc/self/fd/3: No such file or directory
lrwx------ 1 ekrimer arch 64 Jan 30 20:25 /proc/28641/fd/5
-> socket:[92942621]
Does anybody have an idea what the reason for that might be?
Thanks,
Oren Mark
Intel - Israel Engineering Computing
Unix Server Platforms
oren.mark@intel.com <mailto:oren.mark@intel.com>
(+) 972-4-865-5987
iNET: 465-5987
-------------- next part --------------
HTML attachment scrubbed and removed