[libvirt] [PATCH] virNetSocketNewConnectUNIX: Use flocks when spawning a daemon

John Ferlan jferlan at redhat.com
Wed Apr 15 18:07:53 UTC 2015



On 04/02/2015 12:21 PM, Michal Privoznik wrote:
> https://bugzilla.redhat.com/show_bug.cgi?id=1200149
> 
> Even though we have a mutex mechanism so that two clients don't spawn
> two daemons, it's not strong enough. It can happen that while one
> client is spawning the daemon, the other one fails to connect.
> Basically two possible errors can happen:
> 
>   error: Failed to connect socket to '/home/mprivozn/.cache/libvirt/libvirt-sock': Connection refused
> 
> or:
> 
>   error: Failed to connect socket to '/home/mprivozn/.cache/libvirt/libvirt-sock': No such file or directory
> 
> The problem in both cases is, the daemon is only starting up, while we
> are trying to connect (and fail). We should postpone the connecting
> phase until the daemon is started (by the other thread that is
> spawning it). In order to do that, create a file lock 'libvirt-lock'
> in the directory where session daemon would create its socket. So even
> when called from multiple processes, spawning a daemon will serialize
> on the file lock. So only the first to come will spawn the daemon.
> 
> Signed-off-by: Michal Privoznik <mprivozn at redhat.com>
> ---
> 
> diff to v1:
> -hopefully this one is race free
> 
>  src/rpc/virnetsocket.c | 148 ++++++++++++-------------------------------------
>  1 file changed, 36 insertions(+), 112 deletions(-)
> 
> diff --git a/src/rpc/virnetsocket.c b/src/rpc/virnetsocket.c
> index 6b019cc..2b891ae 100644
> --- a/src/rpc/virnetsocket.c
> +++ b/src/rpc/virnetsocket.c
> @@ -123,7 +123,7 @@ VIR_ONCE_GLOBAL_INIT(virNetSocket)
>  
>  
>  #ifndef WIN32
> -static int virNetSocketForkDaemon(const char *binary, int passfd)
> +static int virNetSocketForkDaemon(const char *binary)
>  {
>      int ret;
>      virCommandPtr cmd = virCommandNewArgList(binary,
> @@ -136,10 +136,6 @@ static int virNetSocketForkDaemon(const char *binary, int passfd)
>      virCommandAddEnvPassBlockSUID(cmd, "XDG_RUNTIME_DIR", NULL);
>      virCommandClearCaps(cmd);
>      virCommandDaemonize(cmd);
> -    if (passfd) {
> -        virCommandPassFD(cmd, passfd, VIR_COMMAND_PASS_FD_CLOSE_PARENT);
> -        virCommandPassListenFDs(cmd);
> -    }
>      ret = virCommandRun(cmd, NULL);
>      virCommandFree(cmd);
>      return ret;
> @@ -543,10 +539,10 @@ int virNetSocketNewConnectUNIX(const char *path,
>                                 const char *binary,
>                                 virNetSocketPtr *retsock)
>  {
> -    char *binname = NULL;
> -    char *pidpath = NULL;
> -    int fd, passfd = -1;
> -    int pidfd = -1;
> +    char *lockpath = NULL;

Because we assign this to NULL...

> +    int lockfd = -1;
> +    int fd = -1;
> +    int retries = 100;
>      virSocketAddr localAddr;
>      virSocketAddr remoteAddr;
>  
> @@ -561,6 +557,22 @@ int virNetSocketNewConnectUNIX(const char *path,
>          return -1;
>      }
>  
> +    if (spawnDaemon) {
> +        if (virPidFileConstructPath(false, NULL, "libvirt-lock", &lockpath) < 0)
> +            goto error;
> +
> +        if ((lockfd = open(lockpath, O_RDWR | O_CREAT, 0600)) < 0 ||
> +            virSetCloseExec(lockfd) < 0) {
> +            virReportSystemError(errno, _("Unable to create lock '%s'"), lockpath);

And can jump to error right here

> +            goto error;
> +        }
> +
> +        if (virFileLock(lockfd, false, 0, 1, true) < 0) {
> +            virReportSystemError(errno, _("Unable to lock '%s'"), lockpath);
> +            goto error;
> +        }
> +    }
> +
>      if ((fd = socket(PF_UNIX, SOCK_STREAM, 0)) < 0) {
>          virReportSystemError(errno, "%s", _("Failed to create socket"));
>          goto error;
> @@ -574,108 +586,25 @@ int virNetSocketNewConnectUNIX(const char *path,
>      if (remoteAddr.data.un.sun_path[0] == '@')
>          remoteAddr.data.un.sun_path[0] = '\0';
>  
> - retry:
> -    if (connect(fd, &remoteAddr.data.sa, remoteAddr.len) < 0) {
> -        int status = 0;
> -        pid_t pid = 0;
> -
> -        if (!spawnDaemon) {
> +    while (retries &&
> +           connect(fd, &remoteAddr.data.sa, remoteAddr.len) < 0) {
> +        if (!(spawnDaemon && errno == ENOENT)) {
>              virReportSystemError(errno, _("Failed to connect socket to '%s'"),
>                                   path);
>              goto error;
>          }
>  
> -        if (!(binname = last_component(binary)) || binname[0] == '\0') {
> -            virReportError(VIR_ERR_INTERNAL_ERROR,
> -                           _("Cannot determine basename for binary '%s'"),
> -                           binary);
> -            goto error;
> -        }
> -
> -        if (virPidFileConstructPath(false, NULL, binname, &pidpath) < 0)
> -            goto error;
> -
> -        pidfd = virPidFileAcquirePath(pidpath, false, getpid());
> -        if (pidfd < 0) {
> -            /*
> -             * This can happen in a very rare case of two clients spawning two
> -             * daemons at the same time, and the error in the logs that gets
> -             * reset here can be a clue to some future debugging.
> -             */
> -            virResetLastError();
> -            spawnDaemon = false;
> -            goto retry;
> -        }
> -
> -        if ((passfd = socket(PF_UNIX, SOCK_STREAM, 0)) < 0) {
> -            virReportSystemError(errno, "%s", _("Failed to create socket"));
> +        if (virNetSocketForkDaemon(binary) < 0)
>              goto error;
> -        }
>  
> -        /*
> -         * We already even acquired the pidfile, so no one else should be using
> -         * @path right now.  So we're OK to unlink it and paying attention to
> -         * the return value makes no real sense here.  Only if it's not an
> -         * abstract socket, of course.
> -         */
> -        if (path[0] != '@')
> -            unlink(path);
> -
> -        /*
> -         * We have to fork() here, because umask() is set per-process, chmod()
> -         * is racy and fchmod() has undefined behaviour on sockets according to
> -         * POSIX, so it doesn't work outside Linux.
> -         */
> -        if ((pid = virFork()) < 0)
> -            goto error;
> -
> -        if (pid == 0) {
> -            umask(0077);
> -            if (bind(passfd, &remoteAddr.data.sa, remoteAddr.len) < 0)
> -                _exit(EXIT_FAILURE);
> -
> -            _exit(EXIT_SUCCESS);
> -        }
> +        retries--;
> +        usleep(5000);
> +    }
>  
> -        if (virProcessWait(pid, &status, false) < 0)
> -            goto error;
> -
> -        if (status != EXIT_SUCCESS) {
> -            /*
> -             * OK, so the child failed to bind() the socket.  This may mean that
> -             * another daemon was starting at the same time and succeeded with
> -             * its bind() (even though it should not happen because we using a
> -             * pidfile for the race).  So we'll try connecting again, but this
> -             * time without spawning the daemon.
> -             */
> -            spawnDaemon = false;
> -            virPidFileDeletePath(pidpath);
> -            VIR_FORCE_CLOSE(pidfd);
> -            VIR_FORCE_CLOSE(passfd);
> -            goto retry;
> -        }
> -
> -        if (listen(passfd, 0) < 0) {
> -            virReportSystemError(errno, "%s",
> -                                 _("Failed to listen on socket that's about "
> -                                   "to be passed to the daemon"));
> -            goto error;
> -        }
> -
> -        if (connect(fd, &remoteAddr.data.sa, remoteAddr.len) < 0) {
> -            virReportSystemError(errno, _("Failed to connect socket to '%s'"),
> -                                 path);
> -            goto error;
> -        }
> -
> -        /*
> -         * Do we need to eliminate the super-rare race here any more?  It would
> -         * need incorporating the following VIR_FORCE_CLOSE() into a
> -         * virCommandHook inside a virNetSocketForkDaemon().
> -         */
> -        VIR_FORCE_CLOSE(pidfd);
> -        if (virNetSocketForkDaemon(binary, passfd) < 0)
> -            goto error;
> +    if (lockfd) {
> +        unlink(lockpath);
> +        VIR_FORCE_CLOSE(lockfd);
> +        VIR_FREE(lockpath);
>      }
>  
>      localAddr.len = sizeof(localAddr.data);
> @@ -687,19 +616,14 @@ int virNetSocketNewConnectUNIX(const char *path,
>      if (!(*retsock = virNetSocketNew(&localAddr, &remoteAddr, true, fd, -1, 0)))
>          goto error;
>  
> -    VIR_FREE(pidpath);
> -
>      return 0;
>  
>   error:
> -    if (pidfd >= 0)
> -        virPidFileDeletePath(pidpath);
> -    VIR_FREE(pidpath);
> +    if (lockfd)
> +        unlink(lockpath);

Coverity complains about calling unlink with NULL:

(8) Event var_deref_model: 	Passing null pointer "lockpath" to "unlink",
which dereferences it.
Also see events:

> +    VIR_FREE(lockpath);
>      VIR_FORCE_CLOSE(fd);
> -    VIR_FORCE_CLOSE(passfd);
> -    VIR_FORCE_CLOSE(pidfd);
> -    if (spawnDaemon)
> -        unlink(path);
> +    VIR_FORCE_CLOSE(lockfd);
>      return -1;
>  }
>  #else
> 




More information about the libvir-list mailing list