[libvirt] [PATCH REBASE 4/5] qemu: fix domain object wait to handle monitor errors

Nikolay Shirokovskiy nshirokovskiy at virtuozzo.com
Wed Apr 18 14:44:43 UTC 2018


Block job abort operation can not handle properly qemu crashes
when waiting for abort/pivot completion. Deadlock scenario is next:

- qemuDomainBlockJobAbort waits for pivot/abort completion
- qemu crashes, then qemuProcessBeginStopJob broadcasts for VM condition and
  then waits for job condition (taken by qemuDomainBlockJobAbort)
- qemuDomainBlockJobAbort awakes but nothing really changed, VM is still
  active (vm->def->id != -1) so thread starts waiting for completion again.
  Now two threads are in deadlock.

First let's add some condition besides domain active status so that waiting
thread can check it when awakes. Second let's move signalling to the place
where condition is set, namely monitor eof/error handlers. Having signalling
in qemuProcessBeginStopJob is not useful.

The patch copies monitor error to domain state because at time
waiting thread awakes there can be no monitor and it is useful to
report monitor error to user.

The patch has a drawback that on destroying a domain when waiting for
event from monitor we get not very convinient error message like
'EOF from monitor' from waiting API. On the other hand if qemu crashes
this is more useful then 'domain is not running'. The first case
will be addressed in another patch.

The patch also fixes other places where we wait for event from qemu.
Namely handling device removal and tray ejection. Other places which
used virDomainObjWait (dump, migration, save) were good because of
async jobs which allow concurrent destroy job.

Signed-off-by: Nikolay Shirokovskiy <nshirokovskiy at virtuozzo.com>
---
 src/conf/domain_conf.c    | 43 -------------------------------------------
 src/conf/domain_conf.h    |  3 ---
 src/libvirt_private.syms  |  2 --
 src/qemu/qemu_domain.c    | 45 +++++++++++++++++++++++++++++++++++++++++++++
 src/qemu/qemu_domain.h    |  5 ++++-
 src/qemu/qemu_driver.c    |  6 +++---
 src/qemu/qemu_hotplug.c   |  4 ++--
 src/qemu/qemu_migration.c | 12 ++++++------
 src/qemu/qemu_process.c   | 27 ++++++++++++++++++++++-----
 9 files changed, 82 insertions(+), 65 deletions(-)

diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c
index 5f1af91..e046008 100644
--- a/src/conf/domain_conf.c
+++ b/src/conf/domain_conf.c
@@ -3249,49 +3249,6 @@ virDomainObjBroadcast(virDomainObjPtr vm)
 }
 
 
-int
-virDomainObjWait(virDomainObjPtr vm)
-{
-    if (virCondWait(&vm->cond, &vm->parent.lock) < 0) {
-        virReportSystemError(errno, "%s",
-                             _("failed to wait for domain condition"));
-        return -1;
-    }
-
-    if (!virDomainObjIsActive(vm)) {
-        virReportError(VIR_ERR_OPERATION_FAILED, "%s",
-                       _("domain is not running"));
-        return -1;
-    }
-
-    return 0;
-}
-
-
-/**
- * Waits for domain condition to be triggered for a specific period of time.
- *
- * Returns:
- *  -1 in case of error
- *  0 on success
- *  1 on timeout
- */
-int
-virDomainObjWaitUntil(virDomainObjPtr vm,
-                      unsigned long long whenms)
-{
-    if (virCondWaitUntil(&vm->cond, &vm->parent.lock, whenms) < 0) {
-        if (errno != ETIMEDOUT) {
-            virReportSystemError(errno, "%s",
-                                 _("failed to wait for domain condition"));
-            return -1;
-        }
-        return 1;
-    }
-    return 0;
-}
-
-
 /*
  * Mark the current VM config as transient. Ensures transient hotplug
  * operations do not persist past shutdown.
diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h
index 122a051..b4d5bdb 100644
--- a/src/conf/domain_conf.h
+++ b/src/conf/domain_conf.h
@@ -2747,9 +2747,6 @@ bool virDomainObjTaint(virDomainObjPtr obj,
                        virDomainTaintFlags taint);
 
 void virDomainObjBroadcast(virDomainObjPtr vm);
-int virDomainObjWait(virDomainObjPtr vm);
-int virDomainObjWaitUntil(virDomainObjPtr vm,
-                          unsigned long long whenms);
 
 void virDomainPanicDefFree(virDomainPanicDefPtr panic);
 void virDomainResourceDefFree(virDomainResourceDefPtr resource);
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
index 6bbbf77..ed5498e 100644
--- a/src/libvirt_private.syms
+++ b/src/libvirt_private.syms
@@ -493,8 +493,6 @@ virDomainObjSetMetadata;
 virDomainObjSetState;
 virDomainObjTaint;
 virDomainObjUpdateModificationImpact;
-virDomainObjWait;
-virDomainObjWaitUntil;
 virDomainOSTypeFromString;
 virDomainOSTypeToString;
 virDomainParseMemory;
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index 672f08b..1f40ff1 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -1909,6 +1909,7 @@ qemuDomainObjPrivateFree(void *data)
     qemuDomainObjFreeJob(priv);
     VIR_FREE(priv->lockState);
     VIR_FREE(priv->origname);
+    virResetError(&priv->monError);
 
     virChrdevFree(priv->devs);
 
@@ -11881,3 +11882,47 @@ qemuProcessEventFree(struct qemuProcessEvent *event)
     }
     VIR_FREE(event);
 }
+
+
+/**
+ * Waits for domain condition to be triggered for a specific period of time.
+ * if @until is 0 then waits indefinetely.
+ *
+ * Returns:
+ *  -1 on error
+ *   0 on success
+ *   1 on timeout
+ */
+int
+qemuDomainObjWait(virDomainObjPtr vm, unsigned long long until)
+{
+    qemuDomainObjPrivatePtr priv = vm->privateData;
+    int rc;
+
+    if (until)
+        rc = virCondWaitUntil(&vm->cond, &vm->parent.lock, until);
+    else
+        rc = virCondWait(&vm->cond, &vm->parent.lock);
+
+    if (rc < 0) {
+        if (until && errno == ETIMEDOUT)
+            return 1;
+
+        virReportSystemError(errno, "%s",
+                             _("failed to wait for domain condition"));
+        return -1;
+    }
+
+    if (!virDomainObjIsActive(vm)) {
+        virReportError(VIR_ERR_OPERATION_FAILED, "%s",
+                       _("domain is not running"));
+        return -1;
+    }
+
+    if (priv->monError.code != VIR_ERR_OK) {
+        virSetError(&priv->monError);
+        return -1;
+    }
+
+    return 0;
+}
diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h
index 40d9a6f..494ed35 100644
--- a/src/qemu/qemu_domain.h
+++ b/src/qemu/qemu_domain.h
@@ -262,7 +262,7 @@ struct _qemuDomainObjPrivate {
     qemuMonitorPtr mon;
     virDomainChrSourceDefPtr monConfig;
     bool monJSON;
-    bool monError;
+    virError monError;
     unsigned long long monStart;
 
     qemuAgentPtr agent;
@@ -994,4 +994,7 @@ qemuDomainPrepareDiskSource(virDomainDiskDefPtr disk,
                             qemuDomainObjPrivatePtr priv,
                             virQEMUDriverConfigPtr cfg);
 
+int
+qemuDomainObjWait(virDomainObjPtr vm, unsigned long long until);
+
 #endif /* __QEMU_DOMAIN_H__ */
diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
index 0dd6032..03969d8 100644
--- a/src/qemu/qemu_driver.c
+++ b/src/qemu/qemu_driver.c
@@ -2727,7 +2727,7 @@ qemuDomainGetControlInfo(virDomainPtr dom,
 
     memset(info, 0, sizeof(*info));
 
-    if (priv->monError) {
+    if (priv->monError.code != VIR_ERR_OK) {
         info->state = VIR_DOMAIN_CONTROL_ERROR;
         info->details = VIR_DOMAIN_CONTROL_ERROR_REASON_MONITOR;
     } else if (priv->job.active) {
@@ -3726,7 +3726,7 @@ qemuDumpWaitForCompletion(virDomainObjPtr vm)
 
     VIR_DEBUG("Waiting for dump completion");
     while (!priv->job.dumpCompleted && !priv->job.abortJob) {
-        if (virDomainObjWait(vm) < 0)
+        if (qemuDomainObjWait(vm, 0) < 0)
             return -1;
     }
 
@@ -16924,7 +16924,7 @@ qemuDomainBlockJobAbort(virDomainPtr dom,
         qemuDomainDiskPrivatePtr diskPriv = QEMU_DOMAIN_DISK_PRIVATE(disk);
         qemuBlockJobUpdate(driver, vm, QEMU_ASYNC_JOB_NONE, disk, NULL);
         while (diskPriv->blockjob) {
-            if (virDomainObjWait(vm) < 0) {
+            if (qemuDomainObjWait(vm, 0) < 0) {
                 ret = -1;
                 goto endjob;
             }
diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c
index 8d3191f..527ce3f 100644
--- a/src/qemu/qemu_hotplug.c
+++ b/src/qemu/qemu_hotplug.c
@@ -206,7 +206,7 @@ qemuHotplugWaitForTrayEject(virQEMUDriverPtr driver,
         return -1;
 
     while (disk->tray_status != VIR_DOMAIN_DISK_TRAY_OPEN) {
-        if ((rc = virDomainObjWaitUntil(vm, now + CHANGE_MEDIA_TIMEOUT)) < 0)
+        if ((rc = qemuDomainObjWait(vm, now + CHANGE_MEDIA_TIMEOUT)) < 0)
             return -1;
 
         if (rc > 0) {
@@ -4602,7 +4602,7 @@ qemuDomainWaitForDeviceRemoval(virDomainObjPtr vm)
     until += qemuDomainRemoveDeviceWaitTime;
 
     while (priv->unplug.alias) {
-        if ((rc = virDomainObjWaitUntil(vm, until)) == 1)
+        if ((rc = qemuDomainObjWait(vm, until)) == 1)
             return 0;
 
         if (rc < 0) {
diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c
index 88b8253..82e1c7f 100644
--- a/src/qemu/qemu_migration.c
+++ b/src/qemu/qemu_migration.c
@@ -738,7 +738,7 @@ qemuMigrationSrcCancelDriveMirror(virQEMUDriverPtr driver,
         if (failed && !err)
             err = virSaveLastError();
 
-        if (virDomainObjWait(vm) < 0)
+        if (qemuDomainObjWait(vm, 0) < 0)
             goto cleanup;
     }
 
@@ -877,7 +877,7 @@ qemuMigrationSrcDriveMirror(virQEMUDriverPtr driver,
             goto cleanup;
         }
 
-        if (virDomainObjWait(vm) < 0)
+        if (qemuDomainObjWait(vm, 0) < 0)
             goto cleanup;
     }
 
@@ -1181,7 +1181,7 @@ qemuMigrationSrcWaitForSpice(virDomainObjPtr vm)
 
     VIR_DEBUG("Waiting for SPICE to finish migration");
     while (!priv->job.spiceMigrated && !priv->job.abortJob) {
-        if (virDomainObjWait(vm) < 0)
+        if (qemuDomainObjWait(vm, 0) < 0)
             return -1;
     }
     return 0;
@@ -1460,7 +1460,7 @@ qemuMigrationSrcWaitForCompletion(virQEMUDriverPtr driver,
             return rv;
 
         if (events) {
-            if (virDomainObjWait(vm) < 0) {
+            if (qemuDomainObjWait(vm, 0) < 0) {
                 jobInfo->status = QEMU_DOMAIN_JOB_STATUS_FAILED;
                 return -2;
             }
@@ -1513,7 +1513,7 @@ qemuMigrationDstWaitForCompletion(virQEMUDriverPtr driver,
 
     while ((rv = qemuMigrationAnyCompleted(driver, vm, asyncJob,
                                            NULL, flags)) != 1) {
-        if (rv < 0 || virDomainObjWait(vm) < 0)
+        if (rv < 0 || qemuDomainObjWait(vm, 0) < 0)
             return -1;
     }
 
@@ -3464,7 +3464,7 @@ qemuMigrationSrcRun(virQEMUDriverPtr driver,
     if (priv->monJSON) {
         while (virDomainObjGetState(vm, NULL) == VIR_DOMAIN_RUNNING) {
             priv->signalStop = true;
-            rc = virDomainObjWait(vm);
+            rc = qemuDomainObjWait(vm, 0);
             priv->signalStop = false;
             if (rc < 0)
                 goto error;
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 6a5262a..d76809e 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -268,6 +268,23 @@ qemuConnectAgent(virQEMUDriverPtr driver, virDomainObjPtr vm)
     return 0;
 }
 
+static void
+qemuProcessNotifyMonitorError(virDomainObjPtr vm,
+                              qemuMonitorPtr mon)
+{
+    qemuDomainObjPrivatePtr priv = vm->privateData;
+    virErrorPtr err = qemuMonitorLastError(mon);
+
+    virCopyError(err, &priv->monError);
+
+    /* set error code if due to OOM conditions we fail to set it before */
+    if (priv->monError.code == VIR_ERR_OK)
+        priv->monError.code = VIR_ERR_INTERNAL_ERROR;
+
+    /* Wake up anything waiting for events from monitor */
+    virDomainObjBroadcast(vm);
+    virFreeError(err);
+}
 
 /*
  * This is a callback registered with a qemuMonitorPtr instance,
@@ -286,6 +303,8 @@ qemuProcessHandleMonitorEOF(qemuMonitorPtr mon,
 
     virObjectLock(vm);
 
+    qemuProcessNotifyMonitorError(vm, mon);
+
     VIR_DEBUG("Received EOF on %p '%s'", vm, vm->def->name);
 
     priv = vm->privateData;
@@ -338,7 +357,8 @@ qemuProcessHandleMonitorError(qemuMonitorPtr mon ATTRIBUTE_UNUSED,
 
     virObjectLock(vm);
 
-    ((qemuDomainObjPrivatePtr) vm->privateData)->monError = true;
+    qemuProcessNotifyMonitorError(vm, mon);
+
     event = virDomainEventControlErrorNewFromObj(vm);
     qemuDomainEventQueue(driver, event);
 
@@ -5727,7 +5747,7 @@ qemuProcessPrepareDomain(virQEMUDriverPtr driver,
         goto cleanup;
 
     priv->monJSON = true;
-    priv->monError = false;
+    virResetError(&priv->monError);
     priv->monStart = 0;
     priv->gotShutdown = false;
 
@@ -6483,9 +6503,6 @@ qemuProcessBeginStopJob(virQEMUDriverPtr driver,
     if (qemuProcessKill(vm, killFlags) < 0)
         goto cleanup;
 
-    /* Wake up anything waiting on domain condition */
-    virDomainObjBroadcast(vm);
-
     if (qemuDomainObjBeginJob(driver, vm, job) < 0)
         goto cleanup;
 
-- 
1.8.3.1




More information about the libvir-list mailing list