[libvirt PATCH 3/4] qemu: Remember failed post-copy migration in job

Jiri Denemark jdenemar at redhat.com
Thu Dec 15 14:37:43 UTC 2022


When post-copy migration fails, the domain stays running on the
destination with a VIR_DOMAIN_RUNNING_POSTCOPY_FAILED reason. Both the
state and the reason can later be rewritten in case the domain gets
paused for other reasons (such as an I/O error). Thus we need a separate
place to remember that the post-copy migration failed, so that we are
still able to resume the migration later.

https://bugzilla.redhat.com/show_bug.cgi?id=2111948

Signed-off-by: Jiri Denemark <jdenemar at redhat.com>
---
 src/conf/domain_conf.c    |  7 ++++++-
 src/conf/virdomainjob.c   |  1 +
 src/conf/virdomainjob.h   |  1 +
 src/qemu/qemu_domainjob.c |  9 +++++++++
 src/qemu/qemu_migration.c | 34 +++++++++++++++++++++++-----------
 src/qemu/qemu_process.c   | 15 +++++++++++++++
 6 files changed, 55 insertions(+), 12 deletions(-)

diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
index 9e2eea79e7..f83586c549 100644
--- a/src/conf/domain_conf.c
+++ b/src/conf/domain_conf.c
@@ -27874,8 +27874,13 @@ virDomainObjGetState(virDomainObj *dom, int *reason)
 
 bool
 virDomainObjIsFailedPostcopy(virDomainObj *dom,
-                             virDomainJobObj *job G_GNUC_UNUSED)
+                             virDomainJobObj *job)
 {
+    if (job && job->asyncPaused &&
+        (job->asyncJob == VIR_ASYNC_JOB_MIGRATION_IN ||
+         job->asyncJob == VIR_ASYNC_JOB_MIGRATION_OUT))
+        return true;
+
     return ((dom->state.state == VIR_DOMAIN_PAUSED &&
              dom->state.reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED) ||
             (dom->state.state == VIR_DOMAIN_RUNNING &&
diff --git a/src/conf/virdomainjob.c b/src/conf/virdomainjob.c
index 256b665a42..c4cbbe8f6d 100644
--- a/src/conf/virdomainjob.c
+++ b/src/conf/virdomainjob.c
@@ -174,6 +174,7 @@ virDomainObjResetAsyncJob(virDomainJobObj *job)
     job->asyncOwner = 0;
     g_clear_pointer(&job->asyncOwnerAPI, g_free);
     job->asyncStarted = 0;
+    job->asyncPaused = false;
     job->phase = 0;
     job->mask = VIR_JOB_DEFAULT_MASK;
     job->abortJob = false;
diff --git a/src/conf/virdomainjob.h b/src/conf/virdomainjob.h
index b1ac36a2fa..0d62bab287 100644
--- a/src/conf/virdomainjob.h
+++ b/src/conf/virdomainjob.h
@@ -176,6 +176,7 @@ struct _virDomainJobObj {
     unsigned long long asyncOwner;      /* Thread which set current async job */
     char *asyncOwnerAPI;                /* The API which owns the async job */
     unsigned long long asyncStarted;    /* When the current async job started */
+    bool asyncPaused;                   /* The async job is paused */
     int phase;                          /* Job phase (mainly for migrations) */
     unsigned long long mask;            /* Jobs allowed during async job */
     virDomainJobData *current;       /* async job progress data */
diff --git a/src/qemu/qemu_domainjob.c b/src/qemu/qemu_domainjob.c
index 8d958b9d21..27beb5229f 100644
--- a/src/qemu/qemu_domainjob.c
+++ b/src/qemu/qemu_domainjob.c
@@ -695,6 +695,8 @@ qemuDomainObjPrivateXMLFormatJob(virBuffer *buf,
     if (vm->job->asyncJob != VIR_ASYNC_JOB_NONE) {
         virBufferAsprintf(&attrBuf, " flags='0x%x'", vm->job->apiFlags);
         virBufferAsprintf(&attrBuf, " asyncStarted='%llu'", vm->job->asyncStarted);
+        if (vm->job->asyncPaused)
+            virBufferAddLit(&attrBuf, " asyncPaused='yes'");
     }
 
     if (vm->job->cb &&
@@ -732,6 +734,7 @@ qemuDomainObjPrivateXMLParseJob(virDomainObj *vm,
 
     if ((tmp = virXPathString("string(@async)", ctxt))) {
         int async;
+        virTristateBool paused;
 
         if ((async = virDomainAsyncJobTypeFromString(tmp)) < 0) {
             virReportError(VIR_ERR_INTERNAL_ERROR,
@@ -757,6 +760,12 @@ qemuDomainObjPrivateXMLParseJob(virDomainObj *vm,
                            _("Invalid async job start"));
             return -1;
         }
+
+        if (virXMLPropTristateBool(ctxt->node, "asyncPaused", VIR_XML_PROP_NONE,
+                                   &paused) < 0)
+            return -1;
+
+        vm->job->asyncPaused = paused == VIR_TRISTATE_BOOL_YES;
     }
 
     if (virXMLPropUInt(ctxt->node, "flags", 16, VIR_XML_PROP_NONE,
diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c
index 27a74795d6..f258e7d700 100644
--- a/src/qemu/qemu_migration.c
+++ b/src/qemu/qemu_migration.c
@@ -1666,17 +1666,19 @@ qemuMigrationSrcPostcopyFailed(virDomainObj *vm)
 
     state = virDomainObjGetState(vm, &reason);
 
-    VIR_DEBUG("%s/%s",
+    VIR_DEBUG("%s/%s, asyncPaused=%u",
               virDomainStateTypeToString(state),
-              virDomainStateReasonToString(state, reason));
+              virDomainStateReasonToString(state, reason),
+              vm->job->asyncPaused);
 
     if (state != VIR_DOMAIN_PAUSED ||
-        reason == VIR_DOMAIN_PAUSED_POSTCOPY_FAILED)
+        virDomainObjIsFailedPostcopy(vm, vm->job))
         return;
 
     VIR_WARN("Migration of domain %s failed during post-copy; "
              "leaving the domain paused", vm->def->name);
 
+    vm->job->asyncPaused = true;
     virDomainObjSetState(vm, VIR_DOMAIN_PAUSED,
                          VIR_DOMAIN_PAUSED_POSTCOPY_FAILED);
     event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_SUSPENDED,
@@ -1696,21 +1698,31 @@ qemuMigrationDstPostcopyFailed(virDomainObj *vm)
 
     state = virDomainObjGetState(vm, &reason);
 
-    VIR_DEBUG("%s/%s",
+    VIR_DEBUG("%s/%s, asyncPaused=%u",
               virDomainStateTypeToString(state),
-              virDomainStateReasonToString(state, reason));
+              virDomainStateReasonToString(state, reason),
+              vm->job->asyncPaused);
 
-    if (state != VIR_DOMAIN_RUNNING ||
-        reason == VIR_DOMAIN_RUNNING_POSTCOPY_FAILED)
+    if ((state != VIR_DOMAIN_RUNNING && state != VIR_DOMAIN_PAUSED) ||
+        virDomainObjIsFailedPostcopy(vm, vm->job))
         return;
 
     VIR_WARN("Incoming migration of domain '%s' failed during post-copy; "
              "leaving the domain running", vm->def->name);
 
-    virDomainObjSetState(vm, VIR_DOMAIN_RUNNING,
-                         VIR_DOMAIN_RUNNING_POSTCOPY_FAILED);
-    event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_RESUMED,
-                                              VIR_DOMAIN_EVENT_RESUMED_POSTCOPY_FAILED);
+    vm->job->asyncPaused = true;
+    if (state == VIR_DOMAIN_RUNNING) {
+        virDomainObjSetState(vm, VIR_DOMAIN_RUNNING,
+                             VIR_DOMAIN_RUNNING_POSTCOPY_FAILED);
+        event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_RESUMED,
+                                                  VIR_DOMAIN_EVENT_RESUMED_POSTCOPY_FAILED);
+    } else {
+        /* The domain was paused for other reasons (I/O error, ...) so we don't
+         * want to rewrite the original reason and just emit a postcopy-failed
+         * event. */
+        event = virDomainEventLifecycleNewFromObj(vm, VIR_DOMAIN_EVENT_SUSPENDED,
+                                                  VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY_FAILED);
+    }
     virObjectEventStateQueue(driver->domainEventState, event);
 }
 
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 6091c9f1a9..017a05d57e 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -712,6 +712,15 @@ qemuProcessHandleResume(qemuMonitor *mon G_GNUC_UNUSED,
                   vm->def->name, virDomainRunningReasonTypeToString(reason),
                   eventDetail);
 
+        /* When a domain is running in (failed) post-copy migration on the
+         * destination host, we need to make sure to set the appropriate reason
+         * here. */
+        if (virDomainObjIsPostcopy(vm, vm->job)) {
+            if (virDomainObjIsFailedPostcopy(vm, vm->job))
+                reason = VIR_DOMAIN_RUNNING_POSTCOPY_FAILED;
+            else
+                reason = VIR_DOMAIN_RUNNING_POSTCOPY;
+        }
         virDomainObjSetState(vm, VIR_DOMAIN_RUNNING, reason);
         event = virDomainEventLifecycleNewFromObj(vm,
                                                   VIR_DOMAIN_EVENT_RESUMED,
@@ -1491,6 +1500,7 @@ qemuProcessHandleMigrationStatus(qemuMonitor *mon G_GNUC_UNUSED,
                       vm->def->name,
                       virDomainStateTypeToString(state),
                       NULLSTR(virDomainStateReasonToString(state, reason)));
+            vm->job->asyncPaused = false;
             virDomainObjSetState(vm, state, reason);
             event = virDomainEventLifecycleNewFromObj(vm, eventType, eventDetail);
             qemuDomainSaveStatus(vm);
@@ -3420,6 +3430,7 @@ qemuProcessRestoreMigrationJob(virDomainObj *vm,
     job->privateData = g_steal_pointer(&vm->job->privateData);
     vm->job->privateData = jobPriv;
     vm->job->apiFlags = job->apiFlags;
+    vm->job->asyncPaused = job->asyncPaused;
 
     qemuDomainCleanupAdd(vm, qemuProcessCleanupMigrationJob);
 }
@@ -3645,6 +3656,7 @@ qemuProcessRecoverMigration(virQEMUDriver *driver,
         if (migStatus == VIR_DOMAIN_JOB_STATUS_POSTCOPY) {
             VIR_DEBUG("Post-copy migration of domain %s still running, it will be handled as unattended",
                       vm->def->name);
+            vm->job->asyncPaused = false;
             return 0;
         }
 
@@ -3653,6 +3665,9 @@ qemuProcessRecoverMigration(virQEMUDriver *driver,
                 qemuMigrationSrcPostcopyFailed(vm);
             else
                 qemuMigrationDstPostcopyFailed(vm);
+            /* Set the asyncPaused flag in case we're reconnecting to a domain
+             * started by an older libvirt. */
+            vm->job->asyncPaused = true;
             return 0;
         }
 
-- 
2.39.0



More information about the libvir-list mailing list