Richard W.M. Jones
2022-Jul-14 12:36 UTC
[Libguestfs] [PATCH common 4/4] mltools: Allow waiting for killed PIDs
Add a new, optional [?wait] parameter to On_exit.kill, allowing programs to wait for a number of seconds for the subprocess to exit. --- mltools/on_exit.ml | 30 +++++++++++++++++++++--------- mltools/on_exit.mli | 14 ++++++++++++-- 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/mltools/on_exit.ml b/mltools/on_exit.ml index e8353df..cdaa83c 100644 --- a/mltools/on_exit.ml +++ b/mltools/on_exit.ml @@ -24,10 +24,10 @@ open Unix open Printf type action - | Unlink of string (* filename *) - | Rm_rf of string (* directory *) - | Kill of int * int (* signal, pid *) - | Fn of (unit -> unit) (* generic function *) + | Unlink of string (* filename *) + | Rm_rf of string (* directory *) + | Kill of int * int * int (* wait, signal, pid *) + | Fn of (unit -> unit) (* generic function *) (* List of (priority, action). *) let actions = ref [] @@ -35,18 +35,30 @@ let actions = ref [] (* Perform a single exit action, printing any exception but * otherwise ignoring failures. *) -let do_action action +let rec do_action action try match action with | Unlink file -> Unix.unlink file | Rm_rf dir -> let cmd = sprintf "rm -rf %s" (Filename.quote dir) in ignore (Tools_utils.shell_command cmd) - | Kill (signal, pid) -> - kill pid signal + | Kill (wait, signal, pid) -> + do_kill ~wait ~signal ~pid | Fn f -> f () with exn -> debug "%s" (Printexc.to_string exn) +and do_kill ~wait ~signal ~pid + kill pid signal; + + let rec loop i + if i > 0 then ( + let pid', _ = waitpid [ WNOHANG ] pid in + if pid' = 0 then + loop (i-1) + ) + in + loop wait + (* Make sure the actions are performed only once. *) let done_actions = ref false @@ -106,6 +118,6 @@ let rm_rf ?(prio = 1000) dir register (); List.push_front (prio, Rm_rf dir) actions -let kill ?(prio = 1000) ?(signal = Sys.sigterm) pid +let kill ?(prio = 1000) ?(wait = 0) ?(signal = Sys.sigterm) pid register (); - List.push_front (prio, Kill (signal, pid)) actions + List.push_front (prio, Kill (wait, signal, pid)) actions diff --git a/mltools/on_exit.mli b/mltools/on_exit.mli index 910783e..dd35101 100644 --- a/mltools/on_exit.mli +++ b/mltools/on_exit.mli @@ -58,12 +58,22 @@ val unlink : ?prio:int -> string -> unit val rm_rf : ?prio:int -> string -> unit (** Recursively remove a temporary directory on exit (using [rm -rf]). *) -val kill : ?prio:int -> ?signal:int -> int -> unit +val kill : ?prio:int -> ?wait:int -> ?signal:int -> int -> unit (** Kill [PID] on exit. The signal sent defaults to [Sys.sigterm]. Use this with care since you can end up unintentionally killing another process if [PID] goes away or doesn't exist before the - program exits. *) + program exits. + + The optional [?wait] flag attempts to wait for a specified + number of seconds for the subprocess to go away. For example + using [~wait:5] will wait for up to 5 seconds. Since this + runs when virt-v2v is exiting, it is best to keep waiting times + as short as possible. Also there is no way to report errors + in the subprocess. If reliable cleanup of a subprocess is + required then this is not the correct place to do it. + + [?wait] defaults to [0] which means we do not try to wait. *) val register : unit -> unit (** Force this module to register its at_exit function and signal -- 2.37.0.rc2
Laszlo Ersek
2022-Jul-15 07:30 UTC
[Libguestfs] [PATCH common 4/4] mltools: Allow waiting for killed PIDs
On 07/14/22 14:36, Richard W.M. Jones wrote:> Add a new, optional [?wait] parameter to On_exit.kill, allowing > programs to wait for a number of seconds for the subprocess to exit. > --- > mltools/on_exit.ml | 30 +++++++++++++++++++++--------- > mltools/on_exit.mli | 14 ++++++++++++-- > 2 files changed, 33 insertions(+), 11 deletions(-) > > diff --git a/mltools/on_exit.ml b/mltools/on_exit.ml > index e8353df..cdaa83c 100644 > --- a/mltools/on_exit.ml > +++ b/mltools/on_exit.ml > @@ -24,10 +24,10 @@ open Unix > open Printf > > type action > - | Unlink of string (* filename *) > - | Rm_rf of string (* directory *) > - | Kill of int * int (* signal, pid *) > - | Fn of (unit -> unit) (* generic function *) > + | Unlink of string (* filename *) > + | Rm_rf of string (* directory *) > + | Kill of int * int * int (* wait, signal, pid *) > + | Fn of (unit -> unit) (* generic function *) > > (* List of (priority, action). *) > let actions = ref [] > @@ -35,18 +35,30 @@ let actions = ref [] > (* Perform a single exit action, printing any exception but > * otherwise ignoring failures. > *) > -let do_action action > +let rec do_action actionI'd slightly prefer (a) do_kill to be introduced either before do_action, or (b) for do_kill to be defined inside do_action, to using "let rec" here. I think I understand what "rec" does to scoping, but still, we don't have actual recursion here. (I've noticed this pattern in many places in the v2v projects, and it always confuses me -- it doesn't bring too much convenience IMO, so I'd rather restrict it to actual recursion.)> try > match action with > | Unlink file -> Unix.unlink file > | Rm_rf dir -> > let cmd = sprintf "rm -rf %s" (Filename.quote dir) in > ignore (Tools_utils.shell_command cmd) > - | Kill (signal, pid) -> > - kill pid signal > + | Kill (wait, signal, pid) -> > + do_kill ~wait ~signal ~pid > | Fn f -> f () > with exn -> debug "%s" (Printexc.to_string exn) > > +and do_kill ~wait ~signal ~pid > + kill pid signal; > + > + let rec loop i > + if i > 0 then ( > + let pid', _ = waitpid [ WNOHANG ] pid in > + if pid' = 0 then > + loop (i-1)Missing: "sleep 1;" before the tail-recursive call, I think.> + ) > + in > + loop wait > + > (* Make sure the actions are performed only once. *) > let done_actions = ref false > > @@ -106,6 +118,6 @@ let rm_rf ?(prio = 1000) dir > register (); > List.push_front (prio, Rm_rf dir) actions > > -let kill ?(prio = 1000) ?(signal = Sys.sigterm) pid > +let kill ?(prio = 1000) ?(wait = 0) ?(signal = Sys.sigterm) pid > register (); > - List.push_front (prio, Kill (signal, pid)) actions > + List.push_front (prio, Kill (wait, signal, pid)) actions > diff --git a/mltools/on_exit.mli b/mltools/on_exit.mli > index 910783e..dd35101 100644 > --- a/mltools/on_exit.mli > +++ b/mltools/on_exit.mli > @@ -58,12 +58,22 @@ val unlink : ?prio:int -> string -> unit > val rm_rf : ?prio:int -> string -> unit > (** Recursively remove a temporary directory on exit (using [rm -rf]). *) > > -val kill : ?prio:int -> ?signal:int -> int -> unit > +val kill : ?prio:int -> ?wait:int -> ?signal:int -> int -> unit > (** Kill [PID] on exit. The signal sent defaults to [Sys.sigterm]. > > Use this with care since you can end up unintentionally killing > another process if [PID] goes away or doesn't exist before the > - program exits. *) > + program exits. > + > + The optional [?wait] flag attempts to wait for a specified > + number of seconds for the subprocess to go away. For example > + using [~wait:5] will wait for up to 5 seconds. Since this > + runs when virt-v2v is exiting, it is best to keep waiting times > + as short as possible. Also there is no way to report errors > + in the subprocess. If reliable cleanup of a subprocess is > + required then this is not the correct place to do it. > + > + [?wait] defaults to [0] which means we do not try to wait. *) > > val register : unit -> unit > (** Force this module to register its at_exit function and signal >(please consider formatting *.mli before *.ml) I believe I take the opposite position on this; I'd rather wait forever. No subprocess is expected to hang, and we should leave no subprocess behind. If a subprocess hangs, the parent process should (apparently) hang forever too, and let users report bugs. That would also eliminate the question of *how* to wait for N seconds; we'd just drop WNOHANG from waitpid. But this is just my opinion; food for thought. :) Thanks, Laszlo