To Whom It May Concern:

Below is a patch that I have used to eliminate the unexplained errors in the
rsync program. I was able to trace the problem to the order in which the
sigchld_handler and wait_process routines were executed. If sigchld_handler
executes first it retrieves the status that wait_process needs to indicate
proper rsync termination. The code below allows the sigchld_handler to save
the status of up to 10 child processes and make them available for use by
wait_process.

I hope I've done this right. Thanks for your help, JW.

Dave
----------------------------------------------------------------------------------
29a30,34
> struct pid_status {
> pid_t pid;
> int status;
> } pid_stat_table[10];
>
37c42,47
< while (waitpid(pid, status, WNOHANG) == 0) {
---
> pid_t waited_pid;
> int cnt;
>
> while ( 1 ) {
> waited_pid = waitpid(pid, status, WNOHANG);
> if ( waited_pid == 0 ) {
39a50
> } else break;
40a52,63
> if (( waited_pid == -1 ) && ( errno == ECHILD )) {
> /* status of requested child no longer available. Check */
> /* to see if it was processed by the sigchld_handler. */
> cnt = 0;
> while ( cnt < 10 ) {
> if ( pid == pid_stat_table[cnt].pid ) {
> *status = pid_stat_table[cnt].status;
> break;
> }
> cnt++;
> }
> }
795c818,833
< while (waitpid(-1, NULL, WNOHANG) > 0) ;
---
> int cnt = 0;
> pid_t pid = 0;
> int status = 0;
> while ( 1 ) {
> pid = waitpid(-1, &status, WNOHANG);
> cnt = 0;
> while ( cnt < 10 ) {
> if (pid_stat_table[cnt].pid == 0 ) {
> pid_stat_table[cnt].pid = pid;
> pid_stat_table[cnt].status = status;
> break;
> }
> cnt++;
> }
> if ( pid < 1 ) break;
> };
Dave:
I don't suppose you could upload this as a context diff? ("diff -c
<original> <new file>") My patch program (2.5.4) couldn't
get it to apply vs. the current CVS version of rsync.
-- Steve
JW,
I pushed everything to a Linux box and did the diff again, this time with the
-bur option.
It does look significantly different. I think I've got below what you are
looking for, so I am resending it to rsync@lists.samba.org.
I'm including two additional people as they have asked for the patch
information
as well.
The diff was performed on main.c (2.5.5 version from rsync.samba.org). I hope
everyone knows that my purpose in providing this information is so that
everyone can critique my code (to make it better) and we can eliminate this
problem for everyone.
Dave
-------------------------------------------------------------
--- main.c.orig Tue Sep 3 16:38:23 2002
+++ main.c Tue Sep 3 16:41:08 2002
@@ -27,6 +27,11 @@
extern int verbose;
+struct pid_status {
+ pid_t pid;
+ int status;
+ } pid_stat_table[10];
+
static void show_malloc_stats(void);
/****************************************************************************
@@ -34,9 +39,27 @@
****************************************************************************/
void wait_process(pid_t pid, int *status)
{
- while (waitpid(pid, status, WNOHANG) == 0) {
+ pid_t waited_pid;
+ int cnt;
+
+ while ( 1 ) {
+ waited_pid = waitpid(pid, status, WNOHANG);
+ if ( waited_pid == 0 ) {
msleep(20);
io_flush();
+ } else break;
+ }
+ if (( waited_pid == -1 ) && ( errno == ECHILD )) {
+ /* status of requested child no longer available. Check */
+ /* to see if it was processed by the sigchld_handler. */
+ cnt = 0;
+ while ( cnt < 10 ) {
+ if ( pid == pid_stat_table[cnt].pid ) {
+ *status = pid_stat_table[cnt].status;
+ break;
+ }
+ cnt++;
+ }
}
/* TODO: If the child exited on a signal, then log an
@@ -792,7 +815,22 @@
static RETSIGTYPE sigchld_handler(int UNUSED(val)) {
#ifdef WNOHANG
- while (waitpid(-1, NULL, WNOHANG) > 0) ;
+ int cnt = 0;
+ pid_t pid = 0;
+ int status = 0;
+ while ( 1 ) {
+ pid = waitpid(-1, &status, WNOHANG);
+ cnt = 0;
+ while ( cnt < 10 ) {
+ if (pid_stat_table[cnt].pid == 0 ) {
+ pid_stat_table[cnt].pid = pid;
+ pid_stat_table[cnt].status = status;
+ break;
+ }
+ cnt++;
+ }
+ if ( pid < 1 ) break;
+ };
#endif
}