[libvirt] [PATCH 3/3 v2] qemu: Deal with stucked qemu on daemon startup

Daniel P. Berrange berrange at redhat.com
Thu Aug 25 14:32:43 UTC 2011


On Tue, Aug 23, 2011 at 08:22:18PM +0200, Michal Privoznik wrote:
> If libvirt daemon gets restarted and there is (at least) one
> unresponsive qemu, the startup procedure hangs up. This patch creates
> one thread per vm in which we try to reconnect to monitor. Therefore,
> blocking in one thread will not affect other APIs.
> ---
>  src/qemu/qemu_driver.c  |   23 +++++----
>  src/qemu/qemu_process.c |  123 +++++++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 127 insertions(+), 19 deletions(-)
> 
> diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
> index c8dda73..8c2e356 100644
> --- a/src/qemu/qemu_driver.c
> +++ b/src/qemu/qemu_driver.c
> @@ -143,16 +143,18 @@ qemuAutostartDomain(void *payload, const void *name ATTRIBUTE_UNUSED, void *opaq
>  
>      virDomainObjLock(vm);
>      virResetLastError();
> -    if (qemuDomainObjBeginJobWithDriver(data->driver, vm,
> -                                        QEMU_JOB_MODIFY) < 0) {
> -        err = virGetLastError();
> -        VIR_ERROR(_("Failed to start job on VM '%s': %s"),
> -                  vm->def->name,
> -                  err ? err->message : _("unknown error"));
> -    } else {
> -        if (vm->autostart &&
> -            !virDomainObjIsActive(vm) &&
> -            qemuDomainObjStart(data->conn, data->driver, vm,
> +    if (vm->autostart &&
> +        !virDomainObjIsActive(vm)) {
> +        if (qemuDomainObjBeginJobWithDriver(data->driver, vm,
> +                                            QEMU_JOB_MODIFY) < 0) {
> +            err = virGetLastError();
> +            VIR_ERROR(_("Failed to start job on VM '%s': %s"),
> +                      vm->def->name,
> +                      err ? err->message : _("unknown error"));
> +            goto cleanup;
> +        }
> +
> +        if (qemuDomainObjStart(data->conn, data->driver, vm,
>                                 false, false,
>                                 data->driver->autoStartBypassCache) < 0) {
>              err = virGetLastError();
> @@ -165,6 +167,7 @@ qemuAutostartDomain(void *payload, const void *name ATTRIBUTE_UNUSED, void *opaq
>              vm = NULL;
>      }
>  
> +cleanup:
>      if (vm)
>          virDomainObjUnlock(vm);
>  }
> diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
> index 21e73a5..4cc8ae6 100644
> --- a/src/qemu/qemu_process.c
> +++ b/src/qemu/qemu_process.c
> @@ -820,6 +820,7 @@ qemuConnectMonitor(struct qemud_driver *driver, virDomainObjPtr vm)
>  {
>      qemuDomainObjPrivatePtr priv = vm->privateData;
>      int ret = -1;
> +    qemuMonitorPtr mon = NULL;
>  
>      if (virSecurityManagerSetSocketLabel(driver->securityManager, vm) < 0) {
>          VIR_ERROR(_("Failed to set security context for monitor for %s"),
> @@ -831,15 +832,29 @@ qemuConnectMonitor(struct qemud_driver *driver, virDomainObjPtr vm)
>       * deleted while the monitor is active */
>      virDomainObjRef(vm);
>  
> -    priv->mon = qemuMonitorOpen(vm,
> -                                priv->monConfig,
> -                                priv->monJSON,
> -                                &monitorCallbacks);
> +    ignore_value(virTimeMs(&priv->monStart));
> +    virDomainObjUnlock(vm);
> +    qemuDriverUnlock(driver);
> +
> +    mon = qemuMonitorOpen(vm,
> +                          priv->monConfig,
> +                          priv->monJSON,
> +                          &monitorCallbacks);
> +
> +    qemuDriverLock(driver);
> +    virDomainObjLock(vm);
> +    priv->monStart = 0;
>  
>      /* Safe to ignore value since ref count was incremented above */
> -    if (priv->mon == NULL)
> +    if (mon == NULL)
>          ignore_value(virDomainObjUnref(vm));
>  
> +    if (!virDomainObjIsActive(vm)) {
> +        qemuMonitorClose(mon);
> +        goto error;
> +    }
> +    priv->mon = mon;
> +
>      if (virSecurityManagerClearSocketLabel(driver->securityManager, vm) < 0) {
>          VIR_ERROR(_("Failed to clear security context for monitor for %s"),
>                    vm->def->name);
> @@ -2484,24 +2499,30 @@ qemuProcessRecoverJob(struct qemud_driver *driver,
>  struct qemuProcessReconnectData {
>      virConnectPtr conn;
>      struct qemud_driver *driver;
> +    void *payload;
> +    struct qemuDomainJobObj oldjob;
>  };
>  /*
>   * Open an existing VM's monitor, re-detect VCPU threads
>   * and re-reserve the security labels in use
>   */
>  static void
> -qemuProcessReconnect(void *payload, const void *name ATTRIBUTE_UNUSED, void *opaque)
> +qemuProcessReconnect(void *opaque)
>  {
> -    virDomainObjPtr obj = payload;
>      struct qemuProcessReconnectData *data = opaque;
>      struct qemud_driver *driver = data->driver;
> +    virDomainObjPtr obj = data->payload;
>      qemuDomainObjPrivatePtr priv;
>      virConnectPtr conn = data->conn;
>      struct qemuDomainJobObj oldjob;
>  
> +    memcpy(&oldjob, &data->oldjob, sizeof(oldjob));
> +
> +    VIR_FREE(data);
> +
> +    qemuDriverLock(driver);
>      virDomainObjLock(obj);
>  
> -    qemuDomainObjRestoreJob(obj, &oldjob);
>  
>      VIR_DEBUG("Reconnect monitor to %p '%s'", obj, obj->def->name);
>  
> @@ -2572,12 +2593,21 @@ qemuProcessReconnect(void *payload, const void *name ATTRIBUTE_UNUSED, void *opa
>      if (virDomainObjUnref(obj) > 0)
>          virDomainObjUnlock(obj);
>  
> +    if (qemuDomainObjEndJob(driver, obj) == 0)
> +        obj = NULL;
> +
> +    qemuDriverUnlock(driver);
> +
>      return;
>  
>  error:
> +    if (qemuDomainObjEndJob(driver, obj) == 0)
> +        obj = NULL;
> +
>      if (!virDomainObjIsActive(obj)) {
>          if (virDomainObjUnref(obj) > 0)
>              virDomainObjUnlock(obj);
> +        qemuDriverUnlock(driver);
>          return;
>      }
>  
> @@ -2591,6 +2621,81 @@ error:
>          else
>              virDomainObjUnlock(obj);
>      }
> +    qemuDriverUnlock(driver);
> +}
> +
> +static void
> +qemuProcessReconnectHelper(void *payload,
> +                           const void *name ATTRIBUTE_UNUSED,
> +                           void *opaque)
> +{
> +    virThread thread;
> +    struct qemuProcessReconnectData *src = opaque;
> +    struct qemuProcessReconnectData *data;
> +    virDomainObjPtr obj = payload;
> +
> +    if (VIR_ALLOC(data) < 0) {
> +        virReportOOMError();
> +        return;
> +    }
> +
> +    memcpy(data, src, sizeof(*data));
> +    data->payload = payload;
> +
> +    /* This iterator is called with driver being locked.
> +     * We create a separate thread to run qemuProcessReconnect in it.
> +     * However, qemuProcessReconnect needs to:
> +     * 1. lock driver
> +     * 2. just before monitor reconnect do lightweight MonitorEnter
> +     *    (increase VM refcount, unlock VM & driver)
> +     * 3. reconnect to monitor
> +     * 4. do lightweight MonitorExit (lock driver & VM)
> +     * 5. continue reconnect process
> +     * 6. EndJob
> +     * 7. unlock driver
> +     *
> +     * It is necessary to NOT hold driver lock for the entire run
> +     * of reconnect, otherwise we will get blocked if there is
> +     * unresponsive qemu.
> +     * However, iterating over hash table MUST be done on locked
> +     * driver.
> +     *
> +     * NB, we can't do normal MonitorEnter & MonitorExit because
> +     * these two lock the monitor lock, which does not exists in
> +     * this early phase.
> +     */
> +
> +    virDomainObjLock(obj);
> +
> +    qemuDomainObjRestoreJob(obj, &data->oldjob);
> +
> +    if (qemuDomainObjBeginJobWithDriver(src->driver, obj, QEMU_JOB_MODIFY) < 0)
> +        goto error;
> +
> +    if (virThreadCreate(&thread, true, qemuProcessReconnect, data) < 0) {
> +        qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s",
> +                        _("Could not create thread. QEMU initialization "
> +                          "might be incomplete"));
> +        if (qemuDomainObjEndJob(src->driver, obj) == 0) {
> +            obj = NULL;
> +        } else if (virDomainObjUnref(obj) > 0) {
> +           /* We can't spawn a thread and thus connect to monitor.
> +            * Kill qemu */
> +            qemuProcessStop(src->driver, obj, 0, VIR_DOMAIN_SHUTOFF_FAILED);
> +            if (!obj->persistent)
> +                virDomainRemoveInactive(&src->driver->domains, obj);
> +            else
> +                virDomainObjUnlock(obj);
> +        }
> +        goto error;
> +    }
> +
> +    virDomainObjUnlock(obj);
> +
> +    return;
> +
> +error:
> +    VIR_FREE(data);
>  }
>  
>  /**
> @@ -2603,7 +2708,7 @@ void
>  qemuProcessReconnectAll(virConnectPtr conn, struct qemud_driver *driver)
>  {
>      struct qemuProcessReconnectData data = {conn, driver};
> -    virHashForEach(driver->domains.objs, qemuProcessReconnect, &data);
> +    virHashForEach(driver->domains.objs, qemuProcessReconnectHelper, &data);
>  }
>  
>  int qemuProcessStart(virConnectPtr conn,

ACK

Daniel
-- 
|: http://berrange.com      -o-    http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org              -o-             http://virt-manager.org :|
|: http://autobuild.org       -o-         http://search.cpan.org/~danberr/ :|
|: http://entangle-photo.org       -o-       http://live.gnome.org/gtk-vnc :|




More information about the libvir-list mailing list