[libvirt] [PATCH 06/19] qemu: Recover from interrupted jobs

Jiri Denemark jdenemar at redhat.com
Thu Jul 7 23:34:11 UTC 2011


Detect and react to situations in which libvirtd was restarted or killed
while a job was active.
---
 src/qemu/qemu_domain.c  |   14 ++++++++
 src/qemu/qemu_domain.h  |    2 +
 src/qemu/qemu_process.c |   80 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 96 insertions(+), 0 deletions(-)

diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index 062ecc7..b26308e 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -142,6 +142,20 @@ qemuDomainObjResetAsyncJob(qemuDomainObjPrivatePtr priv)
     memset(&job->signalsData, 0, sizeof(job->signalsData));
 }
 
+/* Hand the job types of @obj over to @job (its other fields are zeroed),
+ * then call the job and async-job reset helpers on the domain's priv data. */
+void
+qemuDomainObjRestoreJob(virDomainObjPtr obj, struct qemuDomainJobObj *job)
+{
+    qemuDomainObjPrivatePtr domPriv = obj->privateData;
+
+    memset(job, 0, sizeof *job);
+    job->asyncJob = domPriv->job.asyncJob;
+    job->active = domPriv->job.active;
+    qemuDomainObjResetJob(domPriv);
+    qemuDomainObjResetAsyncJob(domPriv);
+}
+
 static void
 qemuDomainObjFreeJob(qemuDomainObjPrivatePtr priv)
 {
diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h
index 17d1356..49be3d2 100644
--- a/src/qemu/qemu_domain.h
+++ b/src/qemu/qemu_domain.h
@@ -177,6 +177,8 @@ void qemuDomainObjEndNestedJob(struct qemud_driver *driver,
 void qemuDomainObjSaveJob(struct qemud_driver *driver, virDomainObjPtr obj);
 void qemuDomainObjSetAsyncJobMask(virDomainObjPtr obj,
                                   unsigned long long allowedJobs);
+void qemuDomainObjRestoreJob(virDomainObjPtr obj,
+                             struct qemuDomainJobObj *job);
 void qemuDomainObjDiscardAsyncJob(struct qemud_driver *driver,
                                   virDomainObjPtr obj);
 
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 3ffde51..49625b5 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -2223,6 +2223,80 @@ qemuProcessUpdateState(struct qemud_driver *driver, virDomainObjPtr vm)
     return 0;
 }
 
+/* Clean up after the job in @job, interrupted when libvirtd went away.
+ * Returns 0, or -1 if the domain is inactive or was being destroyed. */
+static int
+qemuProcessRecoverJob(struct qemud_driver *driver,
+                      virDomainObjPtr vm,
+                      virConnectPtr conn,
+                      const struct qemuDomainJobObj *job)
+{
+    int reason;
+    virDomainState state = virDomainObjGetState(vm, &reason);
+
+    switch (job->asyncJob) {
+    case QEMU_ASYNC_JOB_MIGRATION_OUT:
+    case QEMU_ASYNC_JOB_MIGRATION_IN:
+        /* we don't know what to do yet */
+        break;
+
+    case QEMU_ASYNC_JOB_SAVE:
+    case QEMU_ASYNC_JOB_DUMP:
+        /* TODO cancel possibly running migrate operation; for now only resume
+         * the domain if it was paused as a result of the save/dump itself */
+        if (state == VIR_DOMAIN_PAUSED &&
+            ((job->asyncJob == QEMU_ASYNC_JOB_DUMP &&
+              reason == VIR_DOMAIN_PAUSED_DUMP) ||
+             (job->asyncJob == QEMU_ASYNC_JOB_SAVE &&
+              reason == VIR_DOMAIN_PAUSED_SAVE) ||
+             reason == VIR_DOMAIN_PAUSED_UNKNOWN)) {
+            if (qemuProcessStartCPUs(driver, vm, conn,
+                                     VIR_DOMAIN_RUNNING_UNPAUSED) < 0) {
+                VIR_WARN("Could not resume domain %s after save/dump",
+                         vm->def->name);
+            }
+        }
+        break;
+
+    case QEMU_ASYNC_JOB_NONE:
+    case QEMU_ASYNC_JOB_LAST:
+        break;
+    }
+
+    if (!virDomainObjIsActive(vm))
+        return -1;
+
+    switch (job->active) {
+    case QEMU_JOB_QUERY:
+        /* harmless */
+        break;
+
+    case QEMU_JOB_DESTROY:
+        VIR_DEBUG("Domain %s should have already been destroyed",
+                  vm->def->name);
+        return -1;
+
+    case QEMU_JOB_SUSPEND:
+        /* mostly harmless */
+        break;
+
+    case QEMU_JOB_MODIFY:
+        /* XXX depending on the command we may be in an inconsistent state and
+         * we should probably fall back to "monitor error" state and refuse to
+         * run further jobs on the domain (exact intent to be confirmed) */
+        break;
+
+    case QEMU_JOB_ASYNC:
+    case QEMU_JOB_ASYNC_NESTED:
+        /* async job was already handled above */
+    case QEMU_JOB_NONE:
+    case QEMU_JOB_LAST:
+        break;
+    }
+
+    return 0;
+}
+
 struct qemuProcessReconnectData {
     virConnectPtr conn;
     struct qemud_driver *driver;
@@ -2239,9 +2313,12 @@ qemuProcessReconnect(void *payload, const void *name ATTRIBUTE_UNUSED, void *opa
     struct qemud_driver *driver = data->driver;
     qemuDomainObjPrivatePtr priv;
     virConnectPtr conn = data->conn;
+    struct qemuDomainJobObj oldjob;
 
     virDomainObjLock(obj);
 
+    qemuDomainObjRestoreJob(obj, &oldjob);
+
     VIR_DEBUG("Reconnect monitor to %p '%s'", obj, obj->def->name);
 
     priv = obj->privateData;
@@ -2287,6 +2364,9 @@ qemuProcessReconnect(void *payload, const void *name ATTRIBUTE_UNUSED, void *opa
     if (qemuProcessFiltersInstantiate(conn, obj->def))
         goto error;
 
+    if (qemuProcessRecoverJob(driver, obj, conn, &oldjob) < 0)
+        goto error;
+
     priv->job.active = QEMU_JOB_NONE;
 
     /* update domain state XML with possibly updated state in virDomainObj */
-- 
1.7.6




More information about the libvir-list mailing list