[Libguestfs] [PATCH common 4/4] mltools: Allow waiting for killed PIDs

Laszlo Ersek lersek at redhat.com
Fri Jul 15 07:30:38 UTC 2022


On 07/14/22 14:36, Richard W.M. Jones wrote:
> Add a new, optional [?wait] parameter to On_exit.kill, allowing
> programs to wait for a number of seconds for the subprocess to exit.
> ---
>  mltools/on_exit.ml  | 30 +++++++++++++++++++++---------
>  mltools/on_exit.mli | 14 ++++++++++++--
>  2 files changed, 33 insertions(+), 11 deletions(-)
> 
> diff --git a/mltools/on_exit.ml b/mltools/on_exit.ml
> index e8353df..cdaa83c 100644
> --- a/mltools/on_exit.ml
> +++ b/mltools/on_exit.ml
> @@ -24,10 +24,10 @@ open Unix
>  open Printf
>  
>  type action =
> -  | Unlink of string     (* filename *)
> -  | Rm_rf of string      (* directory *)
> -  | Kill of int * int    (* signal, pid *)
> -  | Fn of (unit -> unit) (* generic function *)
> +  | Unlink of string        (* filename *)
> +  | Rm_rf of string         (* directory *)
> +  | Kill of int * int * int (* wait, signal, pid *)
> +  | Fn of (unit -> unit)    (* generic function *)
>  
>  (* List of (priority, action). *)
>  let actions = ref []
> @@ -35,18 +35,30 @@ let actions = ref []
>  (* Perform a single exit action, printing any exception but
>   * otherwise ignoring failures.
>   *)
> -let do_action action =
> +let rec do_action action =

I'd slightly prefer (a) do_kill to be introduced either before
do_action, or (b) for do_kill to be defined inside do_action, to using
"let rec" here. I think I understand what "rec" does to scoping, but
still, we don't have actual recursion here.

(I've noticed this pattern in many places in the v2v projects, and it
always confuses me -- it doesn't bring too much convenience IMO, so I'd
rather restrict it to actual recursion.)

>    try
>      match action with
>      | Unlink file -> Unix.unlink file
>      | Rm_rf dir ->
>         let cmd = sprintf "rm -rf %s" (Filename.quote dir) in
>         ignore (Tools_utils.shell_command cmd)
> -    | Kill (signal, pid) ->
> -       kill pid signal
> +    | Kill (wait, signal, pid) ->
> +       do_kill ~wait ~signal ~pid
>      | Fn f -> f ()
>    with exn -> debug "%s" (Printexc.to_string exn)
>  
> +and do_kill ~wait ~signal ~pid =
> +  kill pid signal;
> +
> +  let rec loop i =
> +    if i > 0 then (
> +      let pid', _ = waitpid [ WNOHANG ] pid in
> +      if pid' = 0 then
> +        loop (i-1)

Missing: "sleep 1;" before the tail-recursive call, I think.

> +    )
> +  in
> +  loop wait
> +
>  (* Make sure the actions are performed only once. *)
>  let done_actions = ref false
>  
> @@ -106,6 +118,6 @@ let rm_rf ?(prio = 1000) dir =
>    register ();
>    List.push_front (prio, Rm_rf dir) actions
>  
> -let kill ?(prio = 1000) ?(signal = Sys.sigterm) pid =
> +let kill ?(prio = 1000) ?(wait = 0) ?(signal = Sys.sigterm) pid =
>    register ();
> -  List.push_front (prio, Kill (signal, pid)) actions
> +  List.push_front (prio, Kill (wait, signal, pid)) actions
> diff --git a/mltools/on_exit.mli b/mltools/on_exit.mli
> index 910783e..dd35101 100644
> --- a/mltools/on_exit.mli
> +++ b/mltools/on_exit.mli
> @@ -58,12 +58,22 @@ val unlink : ?prio:int -> string -> unit
>  val rm_rf : ?prio:int -> string -> unit
>  (** Recursively remove a temporary directory on exit (using [rm -rf]). *)
>  
> -val kill : ?prio:int -> ?signal:int -> int -> unit
> +val kill : ?prio:int -> ?wait:int -> ?signal:int -> int -> unit
>  (** Kill [PID] on exit.  The signal sent defaults to [Sys.sigterm].
>  
>      Use this with care since you can end up unintentionally killing
>      another process if [PID] goes away or doesn't exist before the
> -    program exits. *)
> +    program exits.
> +
> +    The optional [?wait] flag attempts to wait for a specified
> +    number of seconds for the subprocess to go away.  For example
> +    using [~wait:5] will wait for up to 5 seconds.   Since this
> +    runs when virt-v2v is exiting, it is best to keep waiting times
> +    as short as possible.  Also there is no way to report errors
> +    in the subprocess.  If reliable cleanup of a subprocess is
> +    required then this is not the correct place to do it.
> +
> +    [?wait] defaults to [0] which means we do not try to wait. *)
>  
>  val register : unit -> unit
>  (** Force this module to register its at_exit function and signal
> 

(please consider formatting *.mli before *.ml)

I believe I take the opposite position on this; I'd rather wait forever.
No subprocess is expected to hang, and we should leave no subprocess
behind. If a subprocess hangs, the parent process should (apparently)
hang forever too, and let users report bugs.

That would also eliminate the question of *how* to wait for N seconds;
we'd just drop WNOHANG from waitpid.

But this is just my opinion; food for thought. :)

Thanks,
Laszlo


More information about the Libguestfs mailing list