[libvirt] [PATCH 3/3] qemu: Deal with stucked qemu on daemon startup

Daniel P. Berrange berrange at redhat.com
Tue Aug 23 12:42:11 UTC 2011


On Tue, Aug 16, 2011 at 06:39:12PM +0200, Michal Privoznik wrote:
> If libvirt daemon gets restarted and there is (at least) one
> unresponsive qemu, the startup procedure hangs up. This patch creates
> one thread per vm in which we try to reconnect to monitor. Therefore,
> blocking in one thread will not affect other APIs.
> ---
>  src/qemu/qemu_driver.c  |   23 +++---------
>  src/qemu/qemu_process.c |   87 ++++++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 85 insertions(+), 25 deletions(-)
> 
> diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
> index 421a98e..4574b6c 100644
> --- a/src/qemu/qemu_driver.c
> +++ b/src/qemu/qemu_driver.c
> @@ -143,26 +143,15 @@ qemuAutostartDomain(void *payload, const void *name ATTRIBUTE_UNUSED, void *opaq
>  
>      virDomainObjLock(vm);
>      virResetLastError();
> -    if (qemuDomainObjBeginJobWithDriver(data->driver, vm,
> -                                        QEMU_JOB_MODIFY) < 0) {
> +    if (vm->autostart &&
> +        !virDomainObjIsActive(vm) &&
> +        qemuDomainObjStart(data->conn, data->driver, vm,
> +                           false, false,
> +                           data->driver->autoStartBypassCache) < 0) {
>          err = virGetLastError();
> -        VIR_ERROR(_("Failed to start job on VM '%s': %s"),
> +        VIR_ERROR(_("Failed to autostart VM '%s': %s"),
>                    vm->def->name,
>                    err ? err->message : _("unknown error"));
> -    } else {
> -        if (vm->autostart &&
> -            !virDomainObjIsActive(vm) &&
> -            qemuDomainObjStart(data->conn, data->driver, vm,
> -                               false, false,
> -                               data->driver->autoStartBypassCache) < 0) {
> -            err = virGetLastError();
> -            VIR_ERROR(_("Failed to autostart VM '%s': %s"),
> -                      vm->def->name,
> -                      err ? err->message : _("unknown error"));
> -        }
> -
> -        if (qemuDomainObjEndJob(data->driver, vm) == 0)
> -            vm = NULL;
>      }

I'm not really seeing why this change is safe. You're removing the
job protection from the autostart code, but AFAICT, no where else
in the caller of this method is changed to acquire the job instead.

> diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
> index 21e73a5..1daf6ae 100644
> --- a/src/qemu/qemu_process.c
> +++ b/src/qemu/qemu_process.c
> @@ -820,6 +820,7 @@ qemuConnectMonitor(struct qemud_driver *driver, virDomainObjPtr vm)
>  {
>      qemuDomainObjPrivatePtr priv = vm->privateData;
>      int ret = -1;
> +    qemuMonitorPtr mon = NULL;
>  
>      if (virSecurityManagerSetSocketLabel(driver->securityManager, vm) < 0) {
>          VIR_ERROR(_("Failed to set security context for monitor for %s"),
> @@ -831,15 +832,29 @@ qemuConnectMonitor(struct qemud_driver *driver, virDomainObjPtr vm)
>       * deleted while the monitor is active */
>      virDomainObjRef(vm);
>  
> -    priv->mon = qemuMonitorOpen(vm,
> -                                priv->monConfig,
> -                                priv->monJSON,
> -                                &monitorCallbacks);
> +    ignore_value(virTimeMs(&priv->monStart));
> +    virDomainObjUnlock(vm);
> +    qemuDriverUnlock(driver);
> +
> +    mon = qemuMonitorOpen(vm,
> +                          priv->monConfig,
> +                          priv->monJSON,
> +                          &monitorCallbacks);
> +
> +    qemuDriverLock(driver);
> +    virDomainObjLock(vm);
> +    priv->monStart = 0;

I'm also not convinced it is safe to release the driver/domain
locks in this way. At the very *least* you need to be acquiring
an extra reference on the virDomainObjPtr, ideally using the
EnterMonitor/ExitMonitor methods, otherwise some other thread
may destroy the virDomainObjPtr while it is unlocked.

Also what is the 'monStart' field used for ?

>  
>      /* Safe to ignore value since ref count was incremented above */
> -    if (priv->mon == NULL)
> +    if (mon == NULL)
>          ignore_value(virDomainObjUnref(vm));
>  
> +    if (!virDomainObjIsActive(vm)) {
> +        qemuMonitorClose(mon);
> +        goto error;
> +    }
> +    priv->mon = mon;
> +
>      if (virSecurityManagerClearSocketLabel(driver->securityManager, vm) < 0) {
>          VIR_ERROR(_("Failed to clear security context for monitor for %s"),
>                    vm->def->name);
> @@ -2484,21 +2499,25 @@ qemuProcessRecoverJob(struct qemud_driver *driver,
>  struct qemuProcessReconnectData {
>      virConnectPtr conn;
>      struct qemud_driver *driver;
> +    void *payload;
>  };
>  /*
>   * Open an existing VM's monitor, re-detect VCPU threads
>   * and re-reserve the security labels in use
>   */
>  static void
> -qemuProcessReconnect(void *payload, const void *name ATTRIBUTE_UNUSED, void *opaque)
> +qemuProcessReconnect(void *opaque)
>  {
> -    virDomainObjPtr obj = payload;
>      struct qemuProcessReconnectData *data = opaque;
>      struct qemud_driver *driver = data->driver;
> +    virDomainObjPtr obj = data->payload;
>      qemuDomainObjPrivatePtr priv;
>      virConnectPtr conn = data->conn;
>      struct qemuDomainJobObj oldjob;
>  
> +    VIR_FREE(data);
> +
> +    qemuDriverLock(driver);
>      virDomainObjLock(obj);
>  
>      qemuDomainObjRestoreJob(obj, &oldjob);
> @@ -2572,12 +2591,15 @@ qemuProcessReconnect(void *payload, const void *name ATTRIBUTE_UNUSED, void *opa
>      if (virDomainObjUnref(obj) > 0)
>          virDomainObjUnlock(obj);
>  
> +    qemuDriverUnlock(driver);
> +
>      return;
>  
>  error:
>      if (!virDomainObjIsActive(obj)) {
>          if (virDomainObjUnref(obj) > 0)
>              virDomainObjUnlock(obj);
> +        qemuDriverUnlock(driver);
>          return;
>      }
>  
> @@ -2591,6 +2613,55 @@ error:
>          else
>              virDomainObjUnlock(obj);
>      }
> +    qemuDriverUnlock(driver);
> +}
> +
> +static void
> +qemuProcessReconnectHelper(void *payload,
> +                           const void *name ATTRIBUTE_UNUSED,
> +                           void *opaque)
> +{
> +    virThread thread;
> +    struct qemuProcessReconnectData *src = opaque;
> +    struct qemuProcessReconnectData *data;
> +
> +    if (VIR_ALLOC(data) < 0) {
> +        virReportOOMError();
> +        return;
> +    }
> +
> +    memcpy(data, src, sizeof(*data));
> +    data->payload = payload;
> +
> +    /* This iterator is called with driver being locked.
> +     * However, we need to unlock it so qemuProcessReconnect,
> +     * which will run in separate thread can lock it itself
> +     * (if needed). The whole idea behind: qemuProcessReconnect:
> +     * 1. lock driver
> +     * 2. just before monitor reconnect, do lightweight MonitorEnter
> +     *    (unlock VM & driver)
> +     * 3. Reconnect to monitor
> +     * 4. do lightweight MonitorExit (lock driver & VM)
> +     * 5. continue reconnect process
> +     * 6. unlock driver
> +     *
> +     * It is necessary to NOT hold driver lock for the entire run
> +     * of reconnect, otherwise we will get blocked if there is
> +     * unresponsive qemu.
> +     * However, iterating over hash table MUST be done on locked
> +     * driver.
> +     *
> +     * NB, we can't do normal MonitorEnter & MonitorExit because
> +     * these two lock the monitor lock, which does not exists in
> +     * this early phase.
> +     */
> +    qemuDriverUnlock(src->driver);
> +    if (virThreadCreate(&thread, true, qemuProcessReconnect, data) < 0) {
> +        qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s",
> +                        _("Could not create thread. QEMU initialization "
> +                          "might be incomplete"));
> +    }
> +    qemuDriverLock(src->driver);
>  }

This comment & code is not quite right. Since the 'qemuProcessReconnect'
function runs in a separate thread, there is no need to unlock the
src->driver here. Unlocking the driver is in fact dangerous, since this
means something can (accidentally or not) change the driver->domains
hash table, while we're in the middle of iterating over it.

If you remove these qemuDriverUnlock/Lock calls here, it simply means
that the thread you spawn will start, but immediately block waiting
for the lock.  When the thread that triggers reconnect finally
unlocks the driver later, all the threads will be able to start
executing.

The other unsafe thing here though, is that the main thread owns the
reference of virDomainObjPtr, and something could happily destroy
this reference, before the thread you spawn gets a chance to run.

So *before* spawning the thread, you need to start a job on the
virDomainObjPtr. The thread will then actually do the job work
and release the job when it is done.

So this code should look something more like

     virDomainObjPtr obj = payload;

     ...begin job on obj...
     if (virThreadCreate(&thread, true, qemuProcessReconnect, data) < 0) {
        qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                        _("Could not create thread to reconnect to domain");
        ...end job on obj...
        ...destroy the running guest PID if any..
     }  else {
        ...nothing. the thread ends the job...
     }


Regards,
Daniel
-- 
|: http://berrange.com      -o-    http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org              -o-             http://virt-manager.org :|
|: http://autobuild.org       -o-         http://search.cpan.org/~danberr/ :|
|: http://entangle-photo.org       -o-       http://live.gnome.org/gtk-vnc :|




More information about the libvir-list mailing list