[Libguestfs] [PATCH libnbd v2 2/2] api: Implement local command with systemd socket activation.

Tue Oct 1 13:24:33 UTC 2019

On 9/30/19 11:32 AM, Richard W.M. Jones wrote:
> This adds new APIs for running a local NBD server and connecting to it
> using systemd socket activation (instead of stdin/stdout).
> 
> This includes interop tests against nbdkit and qemu-nbd which I
> believe are the only NBD servers supporting socket activation.  (If we
> find others then we can add more interop tests in future.)
> 
> The upstream spec for systemd socket activation is here:
> http://0pointer.de/blog/projects/socket-activation.html
> ---

> +++ b/generator/states-connect-socket-activation.c

> +/* This is baked into the systemd socket activation API. */
> +#define FIRST_SOCKET_ACTIVATION_FD 3
> +
> +/* Prepare environment for calling execvpe when doing systemd socket
> + * activation.  Takes the current environment and copies it.  Removes
> + * any existing LISTEN_PID or LISTEN_FDS and replaces them with new
> + * variables.  env[0] is "LISTEN_PID=..." which is filled in by
> + * CONNECT_SA.START, and env[1] is "LISTEN_FDS=1".
> + */

I know that getenv()/setenv()/putenv() tend to prefer sorted environ, 
but I also think that exec HAS to handle a hand-built environ that is 
not sorted, so you should be okay with this shortcut.

> +static char **
> +prepare_socket_activation_environment (void)
> +{
> +  char **env = NULL;
> +  char *p0 = NULL, *p1 = NULL;
> +  size_t i, len;
> +  void *vp;
> +
> +  p0 = strdup ("LISTEN_PID=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx");
> +  if (p0 == NULL)
> +    goto err;
> +  p1 = strdup ("LISTEN_FDS=1");
> +  if (p1 == NULL)
> +    goto err;
> +
> +  /* Copy the current environment. */
> +  env = nbd_internal_copy_string_list (environ);

POSIX says the external symbol 'environ' has to exist for linking 
purposes, but also states that no standard header is required to declare 
it.  You may want to add an 'extern char **environ;' line before this 
function for portability.  On the other hand, gnulib documents that on 
newer Mac OS, even that doesn't work, where the solution is '#define 
environ (*_NSGetEnviron())'.  I guess we'll deal with it when somebody 
actually reports compilation failure.

> +
> +  /* Remove any existing LISTEN_PID or LISTEN_FDS instances. */
> +  for (i = 2; env[i] != NULL; ++i) {
> +    if (strncmp (env[i], "LISTEN_PID=", 11) == 0 ||
> +        strncmp (env[i], "LISTEN_FDS=", 11) == 0) {
> +      memmove (&env[i], &env[i+1],
> +               sizeof (char *) * (nbd_internal_string_list_length (&env[i])));
> +      i--;
> +    }
> +  }

Lots of O(N) traversals of the list, but this probably isn't our hot 
spot, and so probably not worth optimizing further.

> +STATE_MACHINE {
> + CONNECT_SA.START:
> +#ifdef HAVE_EXECVPE
> +  size_t len;
> +  int s;
> +  struct sockaddr_un addr;
> +  char **env;
> +  pid_t pid;
> +  int flags;
> +
> +  assert (!h->sock);
> +  assert (h->argv);
> +  assert (h->argv[0]);
> +
> +  /* Use /tmp instead of TMPDIR because we must ensure the path is
> +   * short enough to store in the sockaddr_un.  On some platforms this
> +   * may cause problems so we may need to revisit it.  XXX
> +   */
> +  h->sa_tmpdir = strdup ("/tmp/libnbdXXXXXX");
> +  if (h->sa_tmpdir == NULL) {
> +    SET_NEXT_STATE (%.DEAD);
> +    set_error (errno, "strdup");
> +    return 0;
> +  }
> +  if (mkdtemp (h->sa_tmpdir) == NULL) {
> +    SET_NEXT_STATE (%.DEAD);
> +    set_error (errno, "mkdtemp");
> +    /* Avoid cleanup in nbd_close. */
> +    free (h->sa_tmpdir);
> +    h->sa_tmpdir = NULL;
> +    return 0;
> +  }
> +
> +  h->sa_sockpath = strdup ("/tmp/libnbdXXXXXX/sock");
> +  if (h->sa_sockpath == NULL) {
> +    SET_NEXT_STATE (%.DEAD);
> +    set_error (errno, "strdup");
> +    return 0;
> +  }
> +
> +  len = strlen (h->sa_tmpdir);
> +  memcpy (h->sa_sockpath, h->sa_tmpdir, len);

Is it worth using:

asprintf (&h->sa_sockpath, "%s/sock", h->sa_tmpdir);

for less code?  asprintf might not be standard, but we already require 
execvpe, which probably means asprintf is available.  But your 
open-coded variant works, too.

> +
> +  s = socket (AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0);
> +  if (s == -1) {
> +    SET_NEXT_STATE (%.DEAD);
> +    set_error (errno, "socket");
> +    return 0;
> +  }

I guess the child process can add O_NONBLOCK if they want it.

> +
> +  addr.sun_family = AF_UNIX;
> +  memcpy (addr.sun_path, h->sa_sockpath, strlen (h->sa_sockpath) + 1);
> +  if (bind (s, (struct sockaddr *) &addr, sizeof addr) == -1) {
> +    SET_NEXT_STATE (%.DEAD);
> +    set_error (errno, "bind: %s", h->sa_sockpath);
> +    close (s);
> +    return 0;
> +  }
> +
> +  if (listen (s, 1) == -1) {
> +    SET_NEXT_STATE (%.DEAD);
> +    set_error (errno, "listen");
> +    close (s);
> +    return 0;
> +  }
> +
> +  env = prepare_socket_activation_environment ();
> +  if (!env) {
> +    SET_NEXT_STATE (%.DEAD);
> +    close (s);
> +    return 0;
> +  }
> +
> +  pid = fork ();
> +  if (pid == -1) {
> +    SET_NEXT_STATE (%.DEAD);
> +    set_error (errno, "fork");
> +    close (s);
> +    nbd_internal_free_string_list (env);
> +    return 0;
> +  }
> +  if (pid == 0) {         /* child - run command */
> +    if (s != FIRST_SOCKET_ACTIVATION_FD) {
> +      dup2 (s, FIRST_SOCKET_ACTIVATION_FD);
> +      close (s);
> +    }
> +    else {
> +      /* We must unset CLOEXEC on the fd.  (dup2 above does this
> +       * implicitly because CLOEXEC is set on the fd, not on the
> +       * socket).
> +       */
> +      flags = fcntl (s, F_GETFD, 0);
> +      if (flags == -1) {
> +        nbd_internal_fork_safe_perror ("fcntl: F_GETFD");
> +        _exit (126);
> +      }
> +      if (fcntl (s, F_SETFD, flags & ~FD_CLOEXEC) == -1) {
> +        nbd_internal_fork_safe_perror ("fcntl: F_SETFD");
> +        _exit (126);
> +      }
> +    }
> +

Looks correct.

> +    char buf[32];
> +    const char *v =
> +      nbd_internal_fork_safe_itoa ((long) getpid (), buf, sizeof buf);
> +    strcpy (&env[0][11], v);

We're using the magic '11' in several places, maybe it's worth a #define 
to make it obvious it is strlen("LISTEN_PID=") ?

> +
> +    /* Restore SIGPIPE back to SIG_DFL. */
> +    signal (SIGPIPE, SIG_DFL);
> +
> +    execvpe (h->argv[0], h->argv, env);
> +    nbd_internal_fork_safe_perror (h->argv[0]);
> +    if (errno == ENOENT)
> +      _exit (127);
> +    else
> +      _exit (126);
> +  }
> +
> +  /* Parent. */
> +  close (s);
> +  nbd_internal_free_string_list (env);
> +  h->pid = pid;
> +
> +  h->connaddrlen = sizeof addr;
> +  memcpy (&h->connaddr, &addr, h->connaddrlen);
> +  SET_NEXT_STATE (%^CONNECT.START);
> +  return 0;
> +
> +#else /* !HAVE_EXECVPE */
> +  SET_NEXT_STATE (%.DEAD)
> +  set_error (ENOTSUP, "platform does not support socket activation");
> +  return 0;
> +#endif

We probably ought to add a matching nbd_supports_socket_activation() 
feature function.

Or, it would be possible to create a fallback for execvpe() on platforms 
that lack it by using execlpe() and our own path-walker utility 
function.  Can be done as a followup patch.  If we do that, then the 
mere presence of LIBNBD_HAVE_NBD_CONNECT_SA is witness enough of the 
functionality, rather than needing a runtime probe.

> +++ b/lib/connect.c

> +
> +int
> +nbd_unlocked_aio_connect_socket_activation (struct nbd_handle *h, char **argv)
> +{
> +  char **copy;
> +
> +  copy = nbd_internal_copy_string_list (argv);
> +  if (!copy) {
> +    set_error (errno, "copy_string_list");
> +    return -1;
> +  }
> +
> +  if (h->argv)
> +    nbd_internal_free_string_list (h->argv);

How can h->argv ever be previously set?

> +  h->argv = copy;
> +
> +  return nbd_internal_run (h, cmd_connect_sa);
> +}
> diff --git a/lib/handle.c b/lib/handle.c
> index 2af25fe..a7f2c79 100644
> --- a/lib/handle.c
> +++ b/lib/handle.c
> @@ -129,6 +129,16 @@ nbd_close (struct nbd_handle *h)
>     free_cmd_list (h->cmds_in_flight);
>     free_cmd_list (h->cmds_done);
>     nbd_internal_free_string_list (h->argv);
> +  if (h->sa_sockpath) {
> +    if (h->pid > 0)
> +      kill (h->pid, SIGTERM);
> +    unlink (h->sa_sockpath);
> +    free (h->sa_sockpath);
> +  }
> +  if (h->sa_tmpdir) {
> +    rmdir (h->sa_tmpdir);
> +    free (h->sa_tmpdir);
> +  }
>     free (h->unixsocket);
>     free (h->hostname);
>     free (h->port);

Somewhat pre-existing: we have a waitpid() here (good, so we don't hang 
on to a zombie process), but we are relying on the child process to 
gracefully go away (whether for connect_command when stdin closes, or 
for connect_sa on receipt of SIGTERM).  Do we need a retry loop that 
escalates to SIGKILL if the child process does not quickly respond to 
the initial condition?  On the other hand, the fact that our waitpid() 
blocks until the child changes status means that if a child ever wedges, 
the fact that we wedge too gives some visibility to the client that it's 
not libnbd's fault and that they need to get the bug fixed in their 
child process.

I think it is ready to push. We may still need further tweaks, but 
that's often the case.

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.           +1-919-301-3226
Virtualization:  qemu.org | libvirt.org