[Libguestfs] [PATCH common 4/4] mltools: Allow waiting for killed PIDs
Laszlo Ersek
lersek at redhat.com
Fri Jul 15 07:30:38 UTC 2022
On 07/14/22 14:36, Richard W.M. Jones wrote:
> Add a new, optional [?wait] parameter to On_exit.kill, allowing
> programs to wait for a number of seconds for the subprocess to exit.
> ---
> mltools/on_exit.ml | 30 +++++++++++++++++++++---------
> mltools/on_exit.mli | 14 ++++++++++++--
> 2 files changed, 33 insertions(+), 11 deletions(-)
>
> diff --git a/mltools/on_exit.ml b/mltools/on_exit.ml
> index e8353df..cdaa83c 100644
> --- a/mltools/on_exit.ml
> +++ b/mltools/on_exit.ml
> @@ -24,10 +24,10 @@ open Unix
> open Printf
>
> type action =
> - | Unlink of string (* filename *)
> - | Rm_rf of string (* directory *)
> - | Kill of int * int (* signal, pid *)
> - | Fn of (unit -> unit) (* generic function *)
> + | Unlink of string (* filename *)
> + | Rm_rf of string (* directory *)
> + | Kill of int * int * int (* wait, signal, pid *)
> + | Fn of (unit -> unit) (* generic function *)
>
> (* List of (priority, action). *)
> let actions = ref []
> @@ -35,18 +35,30 @@ let actions = ref []
> (* Perform a single exit action, printing any exception but
> * otherwise ignoring failures.
> *)
> -let do_action action =
> +let rec do_action action =
I'd slightly prefer (a) do_kill to be introduced either before
do_action, or (b) for do_kill to be defined inside do_action, to using
"let rec" here. I think I understand what "rec" does to scoping, but
still, we don't have actual recursion here.
(I've noticed this pattern in many places in the v2v projects, and it
always confuses me -- it doesn't bring too much convenience IMO, so I'd
rather restrict it to actual recursion.)
> try
> match action with
> | Unlink file -> Unix.unlink file
> | Rm_rf dir ->
> let cmd = sprintf "rm -rf %s" (Filename.quote dir) in
> ignore (Tools_utils.shell_command cmd)
> - | Kill (signal, pid) ->
> - kill pid signal
> + | Kill (wait, signal, pid) ->
> + do_kill ~wait ~signal ~pid
> | Fn f -> f ()
> with exn -> debug "%s" (Printexc.to_string exn)
>
> +and do_kill ~wait ~signal ~pid =
> + kill pid signal;
> +
> + let rec loop i =
> + if i > 0 then (
> + let pid', _ = waitpid [ WNOHANG ] pid in
> + if pid' = 0 then
> + loop (i-1)
Missing: "sleep 1;" before the tail-recursive call, I think.
> + )
> + in
> + loop wait
> +
> (* Make sure the actions are performed only once. *)
> let done_actions = ref false
>
> @@ -106,6 +118,6 @@ let rm_rf ?(prio = 1000) dir =
> register ();
> List.push_front (prio, Rm_rf dir) actions
>
> -let kill ?(prio = 1000) ?(signal = Sys.sigterm) pid =
> +let kill ?(prio = 1000) ?(wait = 0) ?(signal = Sys.sigterm) pid =
> register ();
> - List.push_front (prio, Kill (signal, pid)) actions
> + List.push_front (prio, Kill (wait, signal, pid)) actions
> diff --git a/mltools/on_exit.mli b/mltools/on_exit.mli
> index 910783e..dd35101 100644
> --- a/mltools/on_exit.mli
> +++ b/mltools/on_exit.mli
> @@ -58,12 +58,22 @@ val unlink : ?prio:int -> string -> unit
> val rm_rf : ?prio:int -> string -> unit
> (** Recursively remove a temporary directory on exit (using [rm -rf]). *)
>
> -val kill : ?prio:int -> ?signal:int -> int -> unit
> +val kill : ?prio:int -> ?wait:int -> ?signal:int -> int -> unit
> (** Kill [PID] on exit. The signal sent defaults to [Sys.sigterm].
>
> Use this with care since you can end up unintentionally killing
> another process if [PID] goes away or doesn't exist before the
> - program exits. *)
> + program exits.
> +
> + The optional [?wait] flag attempts to wait for a specified
> + number of seconds for the subprocess to go away. For example
> + using [~wait:5] will wait for up to 5 seconds. Since this
> + runs when virt-v2v is exiting, it is best to keep waiting times
> + as short as possible. Also there is no way to report errors
> + in the subprocess. If reliable cleanup of a subprocess is
> + required then this is not the correct place to do it.
> +
> + [?wait] defaults to [0] which means we do not try to wait. *)
>
> val register : unit -> unit
> (** Force this module to register its at_exit function and signal
>
(please consider formatting *.mli before *.ml)
I believe I take the opposite position on this; I'd rather wait forever.
No subprocess is expected to hang, and we should leave no subprocess
behind. If a subprocess hangs, the parent process should (apparently)
hang forever too, and let users report bugs.
That would also eliminate the question of *how* to wait for N seconds;
we'd just drop WNOHANG from waitpid.
But this is just my opinion; food for thought. :)
Thanks,
Laszlo
More information about the Libguestfs
mailing list