[libvirt] [PATCH 5/5] qemu: Avoid dangling migration-out job when client dies

Jiri Denemark jdenemar at redhat.com
Mon Mar 19 16:19:00 UTC 2012


When a client which started non-p2p migration dies in a bad time, the
source libvirtd never clears the migration job and almost nothing can be
done with the domain without restarting the daemon. This patch makes use
of connection close callbacks and ensures that migration job is properly
discarded when the client disconnects.
---
 src/qemu/qemu_driver.c    |    4 +++
 src/qemu/qemu_migration.c |   66 +++++++++++++++++++++++++++++++++++++++++++++
 src/qemu/qemu_migration.h |    4 +++
 3 files changed, 74 insertions(+), 0 deletions(-)

diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
index 3a5ef09..c2622d3 100644
--- a/src/qemu/qemu_driver.c
+++ b/src/qemu/qemu_driver.c
@@ -8838,6 +8838,9 @@ qemuDomainMigrateBegin3(virDomainPtr domain,
          * This prevents any other APIs being invoked while migration is taking
          * place.
          */
+        if (qemuDriverCloseCallbackSet(driver, vm, domain->conn,
+                                       qemuMigrationCleanup) < 0)
+            goto endjob;
         if (qemuMigrationJobContinue(vm) == 0) {
             vm = NULL;
             qemuReportError(VIR_ERR_OPERATION_FAILED,
@@ -9069,6 +9072,7 @@ qemuDomainMigrateConfirm3(virDomainPtr domain,
         phase = QEMU_MIGRATION_PHASE_CONFIRM3;
 
     qemuMigrationJobStartPhase(driver, vm, phase);
+    qemuDriverCloseCallbackUnset(driver, vm, qemuMigrationCleanup);
 
     ret = qemuMigrationConfirm(driver, domain->conn, vm,
                                cookiein, cookieinlen,
diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c
index 4eb3bf4..e02ba86 100644
--- a/src/qemu/qemu_migration.c
+++ b/src/qemu/qemu_migration.c
@@ -1037,6 +1037,67 @@ qemuDomainMigrateGraphicsRelocate(struct qemud_driver *driver,
 }
 
 
+/* This is called for outgoing non-p2p migrations when a connection to the
+ * client which initiated the migration was closed and we are waiting for it
+ * to follow up with the next phase, that is, in between
+ * virDomainMigrateBegin3 and qemuDomainMigratePerform3 or
+ * qemuDomainMigratePerform3 and qemuDomainMigrateConfirm3.
+ */
+virDomainObjPtr
+qemuMigrationCleanup(struct qemud_driver *driver,
+                     virDomainObjPtr vm,
+                     virConnectPtr conn)
+{
+    qemuDomainObjPrivatePtr priv = vm->privateData;
+
+    VIR_DEBUG("vm=%s, conn=%p, asyncJob=%s, phase=%s",
+              vm->def->name, conn,
+              qemuDomainAsyncJobTypeToString(priv->job.asyncJob),
+              qemuDomainAsyncJobPhaseToString(priv->job.asyncJob,
+                                              priv->job.phase));
+
+    if (!qemuMigrationJobIsActive(vm, QEMU_ASYNC_JOB_MIGRATION_OUT))
+        goto cleanup;
+
+    VIR_DEBUG("The connection which started outgoing migration of domain %s"
+              " was closed; cancelling the migration",
+              vm->def->name);
+
+    switch ((enum qemuMigrationJobPhase) priv->job.phase) {
+    case QEMU_MIGRATION_PHASE_BEGIN3:
+        /* just forget we were about to migrate */
+        qemuDomainObjDiscardAsyncJob(driver, vm);
+        break;
+
+    case QEMU_MIGRATION_PHASE_PERFORM3_DONE:
+        VIR_WARN("Migration of domain %s finished but we don't know if the"
+                 " domain was successfully started on destination or not",
+                 vm->def->name);
+        /* clear the job and let higher levels decide what to do */
+        qemuDomainObjDiscardAsyncJob(driver, vm);
+        break;
+
+    case QEMU_MIGRATION_PHASE_PERFORM3:
+        /* cannot be seen without an active migration API; unreachable */
+    case QEMU_MIGRATION_PHASE_CONFIRM3:
+    case QEMU_MIGRATION_PHASE_CONFIRM3_CANCELLED:
+        /* all done; unreachable */
+    case QEMU_MIGRATION_PHASE_PREPARE:
+    case QEMU_MIGRATION_PHASE_FINISH2:
+    case QEMU_MIGRATION_PHASE_FINISH3:
+        /* incoming migration; unreachable */
+    case QEMU_MIGRATION_PHASE_PERFORM2:
+        /* single phase outgoing migration; unreachable */
+    case QEMU_MIGRATION_PHASE_NONE:
+    case QEMU_MIGRATION_PHASE_LAST:
+        /* unreachable */
+        ;
+    }
+
+cleanup:
+    return vm;
+}
+
 /* The caller is supposed to lock the vm and start a migration job. */
 char *qemuMigrationBegin(struct qemud_driver *driver,
                          virDomainObjPtr vm,
@@ -2547,6 +2608,7 @@ qemuMigrationPerformPhase(struct qemud_driver *driver,
     }
 
     qemuMigrationJobStartPhase(driver, vm, QEMU_MIGRATION_PHASE_PERFORM3);
+    qemuDriverCloseCallbackUnset(driver, vm, qemuMigrationCleanup);
 
     resume = virDomainObjGetState(vm, NULL) == VIR_DOMAIN_RUNNING;
     ret = doNativeMigrate(driver, vm, uri, cookiein, cookieinlen,
@@ -2577,6 +2639,10 @@ qemuMigrationPerformPhase(struct qemud_driver *driver,
 
     qemuMigrationJobSetPhase(driver, vm, QEMU_MIGRATION_PHASE_PERFORM3_DONE);
 
+    if (qemuDriverCloseCallbackSet(driver, vm, conn,
+                                   qemuMigrationCleanup) < 0)
+        goto endjob;
+
 endjob:
     if (ret < 0)
         refs = qemuMigrationJobFinish(driver, vm);
diff --git a/src/qemu/qemu_migration.h b/src/qemu/qemu_migration.h
index 41e4eac..5fab0ca 100644
--- a/src/qemu/qemu_migration.h
+++ b/src/qemu/qemu_migration.h
@@ -77,6 +77,10 @@ int qemuMigrationJobFinish(struct qemud_driver *driver, virDomainObjPtr obj)
 int qemuMigrationSetOffline(struct qemud_driver *driver,
                             virDomainObjPtr vm);
 
+virDomainObjPtr qemuMigrationCleanup(struct qemud_driver *driver,
+                                     virDomainObjPtr vm,
+                                     virConnectPtr conn);
+
 char *qemuMigrationBegin(struct qemud_driver *driver,
                          virDomainObjPtr vm,
                          const char *xmlin,
-- 
1.7.8.5




More information about the libvir-list mailing list