Roger Pau Monne
2012-Jul-09 11:54 UTC
[PATCH] libxl: check pending child signals on libxl__ev_child_inuse
Since we might call libxl__ev_child_inuse from the callback of another event, and we might have pending signals from dead processes, update the processes status before returning to the caller. This was noticed because we processed POLLHUP before the child pipe, so we ended up trying to kill the child process from the PULLHUP callback, when the child was already dead. With this patch libxl__ev_child_inuse returns false, and we no longer try to kill an already dead child. Cc: Ian Jackson <ian.jackson@eu.citrix.com> Signed-off-by: Roger Pau Monne <roger.pau@citrix.com> --- tools/libxl/libxl_aoutils.c | 2 +- tools/libxl/libxl_bootloader.c | 2 +- tools/libxl/libxl_device.c | 6 ++-- tools/libxl/libxl_exec.c | 10 +++++++- tools/libxl/libxl_fork.c | 42 ++++++++++++++++++++++++++++++++++++++ tools/libxl/libxl_internal.h | 9 ++++--- tools/libxl/libxl_save_callout.c | 4 +- 7 files changed, 62 insertions(+), 13 deletions(-) diff --git a/tools/libxl/libxl_aoutils.c b/tools/libxl/libxl_aoutils.c index 99972a2..9239657 100644 --- a/tools/libxl/libxl_aoutils.c +++ b/tools/libxl/libxl_aoutils.c @@ -336,7 +336,7 @@ int libxl__openptys(libxl__openpty_state *op, out: if (sockets[0] >= 0) close(sockets[0]); libxl__carefd_close(for_child); - if (libxl__ev_child_inuse(&op->child)) { + if (libxl__ev_child_inuse(gc, &op->child)) { op->rc = rc; /* we will get a callback when the child dies */ return 0; diff --git a/tools/libxl/libxl_bootloader.c b/tools/libxl/libxl_bootloader.c index 7ba3502..50f91cc 100644 --- a/tools/libxl/libxl_bootloader.c +++ b/tools/libxl/libxl_bootloader.c @@ -286,7 +286,7 @@ static void bootloader_abort(libxl__egc *egc, libxl__datacopier_kill(&bl->keystrokes); libxl__datacopier_kill(&bl->display); - if (libxl__ev_child_inuse(&bl->child)) { + if (libxl__ev_child_inuse(gc, &bl->child)) { r = kill(bl->child.pid, SIGTERM); if (r) LOGE(WARN, "after failure, failed to kill bootloader [%lu]", (unsigned long)bl->child.pid); diff --git a/tools/libxl/libxl_device.c b/tools/libxl/libxl_device.c index 91a6ef4..5e63d57 100644 --- a/tools/libxl/libxl_device.c +++ b/tools/libxl/libxl_device.c @@ -829,7 +829,7 @@ static void device_hotplug(libxl__egc *egc, libxl__ao_device *aodev) abort(); } - assert(libxl__ev_child_inuse(&aodev->child)); + assert(libxl__ev_child_inuse(gc, &aodev->child)); return; @@ -845,7 +845,7 @@ static void device_hotplug_timeout_cb(libxl__egc *egc, libxl__ev_time *ev, libxl__ao_device *aodev = CONTAINER_OF(ev, *aodev, ev); STATE_AO_GC(aodev->ao); - assert(libxl__ev_child_inuse(&aodev->child)); + assert(libxl__ev_child_inuse(gc, &aodev->child)); LOG(DEBUG, "killing hotplug script %s because of timeout", aodev->what); if (kill(aodev->child.pid, SIGKILL)) { LOGEV(ERROR, errno, "unable to kill hotplug script %s [%ld]", @@ -915,7 +915,7 @@ static void device_hotplug_clean(libxl__gc *gc, libxl__ao_device *aodev) { /* Clean events and check reentrancy */ libxl__ev_time_deregister(gc, &aodev->ev); - assert(!libxl__ev_child_inuse(&aodev->child)); + assert(!libxl__ev_child_inuse(gc, &aodev->child)); } static void devices_remove_callback(libxl__egc *egc, libxl__ao_devices *aodevs, diff --git a/tools/libxl/libxl_exec.c b/tools/libxl/libxl_exec.c index 0477386..ac32f24 100644 --- a/tools/libxl/libxl_exec.c +++ b/tools/libxl/libxl_exec.c @@ -157,6 +157,12 @@ out: return rc ? SIGTERM : 0; } +int libxl__spawn_inuse(libxl__spawn_state *ss) +{ + STATE_AO_GC(ss->ao); + return libxl__ev_child_inuse(gc, &ss->mid); +} + int libxl__wait_for_offspring(libxl__gc *gc, uint32_t domid, uint32_t timeout, char *what, @@ -349,7 +355,7 @@ int libxl__spawn_spawn(libxl__egc *egc, libxl__spawn_state *ss) static void spawn_cleanup(libxl__gc *gc, libxl__spawn_state *ss) { - assert(!libxl__ev_child_inuse(&ss->mid)); + assert(!libxl__ev_child_inuse(gc, &ss->mid)); libxl__ev_time_deregister(gc, &ss->timeout); libxl__ev_xswatch_deregister(gc, &ss->xswatch); } @@ -361,7 +367,7 @@ static void spawn_detach(libxl__gc *gc, libxl__spawn_state *ss) { int r; - assert(libxl__ev_child_inuse(&ss->mid)); + assert(libxl__ev_child_inuse(gc, &ss->mid)); libxl__ev_time_deregister(gc, &ss->timeout); libxl__ev_xswatch_deregister(gc, &ss->xswatch); diff --git a/tools/libxl/libxl_fork.c b/tools/libxl/libxl_fork.c index 044ddad..a0abc87 100644 --- a/tools/libxl/libxl_fork.c +++ b/tools/libxl/libxl_fork.c @@ -366,6 +366,48 @@ pid_t libxl__ev_child_fork(libxl__gc *gc, libxl__ev_child *ch, return rc; } +/* + * We need to update the child status, since there might be pending + * events on the child pipe. + */ + +int libxl__ev_child_inuse(libxl__gc *gc, libxl__ev_child *childw_out) +{ + EGC_INIT_FROM_GC; + struct pollfd fds[1]; + int selfpipe, rc; + + CTX_LOCK; + selfpipe = libxl__fork_selfpipe_active(CTX); + if (selfpipe >= 0) { + fds[0].fd = selfpipe; + fds[0].events = POLLIN; + + CTX_UNLOCK; + do { + rc = poll(fds, 1, 0); + } while (rc < 0 && errno == EINTR); + CTX_LOCK; + + if (rc < 0) { + LIBXL__LOG_ERRNOVAL(CTX, LIBXL__LOG_ERROR, errno, "poll failed"); + rc = ERROR_FAIL; + goto out; + } + if (rc) { + int e = libxl__self_pipe_eatall(selfpipe); + if (e) LIBXL__EVENT_DISASTER(egc, "read sigchld pipe", e, 0); + libxl__fork_selfpipe_woken(egc); + } + } + rc = childw_out->pid >= 0; + +out: + CTX_UNLOCK; + EGC_FREE; + return rc; +} + void libxl_childproc_setmode(libxl_ctx *ctx, const libxl_childproc_hooks *hooks, void *user) { diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h index 376a521..b9ce6eb 100644 --- a/tools/libxl/libxl_internal.h +++ b/tools/libxl/libxl_internal.h @@ -703,8 +703,7 @@ _hidden pid_t libxl__ev_child_fork(libxl__gc *gc, libxl__ev_child *childw_out, libxl__ev_child_callback *death); static inline void libxl__ev_child_init(libxl__ev_child *childw_out) { childw_out->pid = -1; } -static inline int libxl__ev_child_inuse(libxl__ev_child *childw_out) - { return childw_out->pid >= 0; } +_hidden int libxl__ev_child_inuse(libxl__gc *gc, libxl__ev_child *childw_out); /* Useable (only) in the child to once more make the ctx useable for * xenstore operations. logs failure in the form "what: <error @@ -1136,8 +1135,7 @@ struct libxl__spawn_state { libxl__ev_xswatch xswatch; }; -static inline int libxl__spawn_inuse(libxl__spawn_state *ss) - { return libxl__ev_child_inuse(&ss->mid); } +_hidden int libxl__spawn_inuse(libxl__spawn_state *ss); /* * libxl_spawner_record_pid - Record given pid in xenstore @@ -1528,6 +1526,9 @@ _hidden void libxl__egc_cleanup(libxl__egc *egc); libxl__egc egc[1]; LIBXL_INIT_EGC(egc[0],ctx); \ EGC_GC +#define EGC_INIT_FROM_GC \ + libxl__egc egc[1]; LIBXL_INIT_EGC(egc[0],CTX); + #define EGC_FREE libxl__egc_cleanup(egc) diff --git a/tools/libxl/libxl_save_callout.c b/tools/libxl/libxl_save_callout.c index 078b7ee..685b417 100644 --- a/tools/libxl/libxl_save_callout.c +++ b/tools/libxl/libxl_save_callout.c @@ -242,7 +242,7 @@ static void helper_failed(libxl__egc *egc, libxl__save_helper_state *shs, libxl__ev_fd_deregister(gc, &shs->readable); - if (!libxl__ev_child_inuse(&shs->child)) { + if (!libxl__ev_child_inuse(gc, &shs->child)) { helper_done(egc, shs); return; } @@ -325,7 +325,7 @@ static void helper_done(libxl__egc *egc, libxl__save_helper_state *shs) libxl__ev_fd_deregister(gc, &shs->readable); libxl__carefd_close(shs->pipes[0]); shs->pipes[0] = 0; libxl__carefd_close(shs->pipes[1]); shs->pipes[1] = 0; - assert(!libxl__ev_child_inuse(&shs->child)); + assert(!libxl__ev_child_inuse(gc, &shs->child)); if (shs->toolstack_data_file) fclose(shs->toolstack_data_file); shs->egc = egc; -- 1.7.7.5 (Apple Git-26)
Roger Pau Monne
2012-Jul-09 12:22 UTC
Re: [PATCH] libxl: check pending child signals on libxl__ev_child_inuse
Roger Pau Monne wrote:> Since we might call libxl__ev_child_inuse from the callback of another > event, and we might have pending signals from dead processes, update > the processes status before returning to the caller. > > This was noticed because we processed POLLHUP before the child pipe, > so we ended up trying to kill the child process from the PULLHUP > callback, when the child was already dead. With this patch > libxl__ev_child_inuse returns false, and we no longer try to kill an > already dead child.This is not correct, since we might assert that the child is running after a fork, but if the execution time is very short it might be dead. A better solution might be to change the order in which events are processed, the following patch makes libxl process child events before anything else. 8<-------------------------------------------------------------------- [PATCH] libxl: check child signals before other events Check pending signals from child processes before processing other events, since the callbacks for other events might check the status of the child and try to kill them, when they might already be dead. Signed-off-by: Roger Pau Monne <roger.pau@citrix.com> --- tools/libxl/libxl_event.c | 30 +++++++++++++++++------------- 1 files changed, 17 insertions(+), 13 deletions(-) diff --git a/tools/libxl/libxl_event.c b/tools/libxl/libxl_event.c index 4b02cd7..fe1cf49 100644 --- a/tools/libxl/libxl_event.c +++ b/tools/libxl/libxl_event.c @@ -916,6 +916,23 @@ static void afterpoll_internal(libxl__egc *egc, libxl__poller *poller, * CTX->efds is more complicated; see below. */ + /* + * We should check for child signals before doing anything else, + * since callbacks for other events might check the child status. + */ + if (afterpoll_check_fd(poller,fds,nfds, poller->wakeup_pipe[0],POLLIN, 0)) { + int e = libxl__self_pipe_eatall(poller->wakeup_pipe[0]); + if (e) LIBXL__EVENT_DISASTER(egc, "read wakeup", e, 0); + } + + int selfpipe = libxl__fork_selfpipe_active(CTX); + if (selfpipe >= 0 && + afterpoll_check_fd(poller,fds,nfds, selfpipe, POLLIN, 0)) { + int e = libxl__self_pipe_eatall(selfpipe); + if (e) LIBXL__EVENT_DISASTER(egc, "read sigchld pipe", e, 0); + libxl__fork_selfpipe_woken(egc); + } + for (;;) { /* We restart our scan of fd events whenever we call a * callback function. This is necessary because such @@ -949,19 +966,6 @@ static void afterpoll_internal(libxl__egc *egc, libxl__poller *poller, *(rindices[i]) = INT_MAX; } - if (afterpoll_check_fd(poller,fds,nfds, poller->wakeup_pipe[0],POLLIN, 0)) { - int e = libxl__self_pipe_eatall(poller->wakeup_pipe[0]); - if (e) LIBXL__EVENT_DISASTER(egc, "read wakeup", e, 0); - } - - int selfpipe = libxl__fork_selfpipe_active(CTX); - if (selfpipe >= 0 && - afterpoll_check_fd(poller,fds,nfds, selfpipe, POLLIN, 0)) { - int e = libxl__self_pipe_eatall(selfpipe); - if (e) LIBXL__EVENT_DISASTER(egc, "read sigchld pipe", e, 0); - libxl__fork_selfpipe_woken(egc); - } - for (;;) { libxl__ev_time *etime = LIBXL_TAILQ_FIRST(&CTX->etimes); if (!etime) -- 1.7.7.5 (Apple Git-26)
Roger Pau Monne
2012-Jul-10 11:13 UTC
Re: [PATCH] libxl: check pending child signals on libxl__ev_child_inuse
Roger Pau Monne wrote:> Roger Pau Monne wrote: >> Since we might call libxl__ev_child_inuse from the callback of another >> event, and we might have pending signals from dead processes, update >> the processes status before returning to the caller. >> >> This was noticed because we processed POLLHUP before the child pipe, >> so we ended up trying to kill the child process from the PULLHUP >> callback, when the child was already dead. With this patch >> libxl__ev_child_inuse returns false, and we no longer try to kill an >> already dead child. > > This is not correct, since we might assert that the child is running > after a fork, but if the execution time is very short it might be dead. > > A better solution might be to change the order in which events are > processed, the following patch makes libxl process child events before > anything else. > > 8<-------------------------------------------------------------------- > > [PATCH] libxl: check child signals before other events > > Check pending signals from child processes before processing other > events, since the callbacks for other events might check the status of > the child and try to kill them, when they might already be dead.Since this is also prone to races, we should ignore POLLHUP on datacopier_readable and wait for the underlying process (the bootloader in this case) to exit. Linux probably doesn''t send a POLLHUP when the process on the other end of the pty dies, but NetBSD does so, and since we process the fd events first we should ignore this and expect that the process dies. 8<--------------------------------------------------------------------- Subject: [PATCH] libxl: react correctly to POLLHUP On datacopier_readable ignore PULLHUP, since we will shortly process the death of the underlying process. Cc: Ian Jackson <ian.jackson@eu.citrix.com> Signed-off-by: Roger Pau Monne <roger.pau@citrix.com> --- tools/libxl/libxl_aoutils.c | 8 ++++++++ 1 files changed, 8 insertions(+), 0 deletions(-) diff --git a/tools/libxl/libxl_aoutils.c b/tools/libxl/libxl_aoutils.c index 99972a2..fa07154 100644 --- a/tools/libxl/libxl_aoutils.c +++ b/tools/libxl/libxl_aoutils.c @@ -102,6 +102,14 @@ static void datacopier_readable(libxl__egc *egc, libxl__ev_fd *ev, libxl__datacopier_state *dc = CONTAINER_OF(ev, *dc, toread); STATE_AO_GC(dc->ao); + if (revents & POLLHUP) { + LOG(DEBUG, "received POLLHUP on %s during copy of %s, " + "waiting for process to die", + dc->readwhat, dc->copywhat); + libxl__ev_fd_deregister(gc, &dc->toread); + return; + } + if (revents & ~POLLIN) { LOG(ERROR, "unexpected poll event 0x%x (should be POLLIN)" " on %s during copy of %s", revents, dc->readwhat, dc->copywhat); -- 1.7.7.5 (Apple Git-26)
Ian Jackson
2012-Jul-17 14:27 UTC
Re: [PATCH] libxl: check pending child signals on libxl__ev_child_inuse
Roger Pau Monne writes ("Re: [Xen-devel] [PATCH] libxl: check pending child signals on libxl__ev_child_inuse"):> Subject: [PATCH] libxl: react correctly to POLLHUP > > On datacopier_readable ignore PULLHUP, since we will shortly process > the death of the underlying process.Again, I don''t think this is correct.> Since this is also prone to races, we should ignore POLLHUP on > datacopier_readable and wait for the underlying process (the bootloader > in this case) to exit. Linux probably doesn''t send a POLLHUP when the > process on the other end of the pty dies, but NetBSD does so, and since > we process the fd events first we should ignore this and expect that the > process dies.You are assuming that the POLLHUP necessarily means the bootloader process is going to die. But perhaps it has gone mad and isn''t going to do so. That''s why we need to send it a SIGKILL. Ian.