[libvirt] [PATCH 3/3 v2] qemu: Deal with stucked qemu on daemon startup
Daniel P. Berrange
berrange at redhat.com
Thu Aug 25 14:32:43 UTC 2011
On Tue, Aug 23, 2011 at 08:22:18PM +0200, Michal Privoznik wrote:
> If libvirt daemon gets restarted and there is (at least) one
> unresponsive qemu, the startup procedure hangs up. This patch creates
> one thread per vm in which we try to reconnect to monitor. Therefore,
> blocking in one thread will not affect other APIs.
> ---
> src/qemu/qemu_driver.c | 23 +++++----
> src/qemu/qemu_process.c | 123 +++++++++++++++++++++++++++++++++++++++++++----
> 2 files changed, 127 insertions(+), 19 deletions(-)
>
> diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
> index c8dda73..8c2e356 100644
> --- a/src/qemu/qemu_driver.c
> +++ b/src/qemu/qemu_driver.c
> @@ -143,16 +143,18 @@ qemuAutostartDomain(void *payload, const void *name ATTRIBUTE_UNUSED, void *opaq
>
> virDomainObjLock(vm);
> virResetLastError();
> - if (qemuDomainObjBeginJobWithDriver(data->driver, vm,
> - QEMU_JOB_MODIFY) < 0) {
> - err = virGetLastError();
> - VIR_ERROR(_("Failed to start job on VM '%s': %s"),
> - vm->def->name,
> - err ? err->message : _("unknown error"));
> - } else {
> - if (vm->autostart &&
> - !virDomainObjIsActive(vm) &&
> - qemuDomainObjStart(data->conn, data->driver, vm,
> + if (vm->autostart &&
> + !virDomainObjIsActive(vm)) {
> + if (qemuDomainObjBeginJobWithDriver(data->driver, vm,
> + QEMU_JOB_MODIFY) < 0) {
> + err = virGetLastError();
> + VIR_ERROR(_("Failed to start job on VM '%s': %s"),
> + vm->def->name,
> + err ? err->message : _("unknown error"));
> + goto cleanup;
> + }
> +
> + if (qemuDomainObjStart(data->conn, data->driver, vm,
> false, false,
> data->driver->autoStartBypassCache) < 0) {
> err = virGetLastError();
> @@ -165,6 +167,7 @@ qemuAutostartDomain(void *payload, const void *name ATTRIBUTE_UNUSED, void *opaq
> vm = NULL;
> }
>
> +cleanup:
> if (vm)
> virDomainObjUnlock(vm);
> }
> diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
> index 21e73a5..4cc8ae6 100644
> --- a/src/qemu/qemu_process.c
> +++ b/src/qemu/qemu_process.c
> @@ -820,6 +820,7 @@ qemuConnectMonitor(struct qemud_driver *driver, virDomainObjPtr vm)
> {
> qemuDomainObjPrivatePtr priv = vm->privateData;
> int ret = -1;
> + qemuMonitorPtr mon = NULL;
>
> if (virSecurityManagerSetSocketLabel(driver->securityManager, vm) < 0) {
> VIR_ERROR(_("Failed to set security context for monitor for %s"),
> @@ -831,15 +832,29 @@ qemuConnectMonitor(struct qemud_driver *driver, virDomainObjPtr vm)
> * deleted while the monitor is active */
> virDomainObjRef(vm);
>
> - priv->mon = qemuMonitorOpen(vm,
> - priv->monConfig,
> - priv->monJSON,
> - &monitorCallbacks);
> + ignore_value(virTimeMs(&priv->monStart));
> + virDomainObjUnlock(vm);
> + qemuDriverUnlock(driver);
> +
> + mon = qemuMonitorOpen(vm,
> + priv->monConfig,
> + priv->monJSON,
> + &monitorCallbacks);
> +
> + qemuDriverLock(driver);
> + virDomainObjLock(vm);
> + priv->monStart = 0;
>
> /* Safe to ignore value since ref count was incremented above */
> - if (priv->mon == NULL)
> + if (mon == NULL)
> ignore_value(virDomainObjUnref(vm));
>
> + if (!virDomainObjIsActive(vm)) {
> + qemuMonitorClose(mon);
> + goto error;
> + }
> + priv->mon = mon;
> +
> if (virSecurityManagerClearSocketLabel(driver->securityManager, vm) < 0) {
> VIR_ERROR(_("Failed to clear security context for monitor for %s"),
> vm->def->name);
> @@ -2484,24 +2499,30 @@ qemuProcessRecoverJob(struct qemud_driver *driver,
> struct qemuProcessReconnectData {
> virConnectPtr conn;
> struct qemud_driver *driver;
> + void *payload;
> + struct qemuDomainJobObj oldjob;
> };
> /*
> * Open an existing VM's monitor, re-detect VCPU threads
> * and re-reserve the security labels in use
> */
> static void
> -qemuProcessReconnect(void *payload, const void *name ATTRIBUTE_UNUSED, void *opaque)
> +qemuProcessReconnect(void *opaque)
> {
> - virDomainObjPtr obj = payload;
> struct qemuProcessReconnectData *data = opaque;
> struct qemud_driver *driver = data->driver;
> + virDomainObjPtr obj = data->payload;
> qemuDomainObjPrivatePtr priv;
> virConnectPtr conn = data->conn;
> struct qemuDomainJobObj oldjob;
>
> + memcpy(&oldjob, &data->oldjob, sizeof(oldjob));
> +
> + VIR_FREE(data);
> +
> + qemuDriverLock(driver);
> virDomainObjLock(obj);
>
> - qemuDomainObjRestoreJob(obj, &oldjob);
>
> VIR_DEBUG("Reconnect monitor to %p '%s'", obj, obj->def->name);
>
> @@ -2572,12 +2593,21 @@ qemuProcessReconnect(void *payload, const void *name ATTRIBUTE_UNUSED, void *opa
> if (virDomainObjUnref(obj) > 0)
> virDomainObjUnlock(obj);
>
> + if (qemuDomainObjEndJob(driver, obj) == 0)
> + obj = NULL;
> +
> + qemuDriverUnlock(driver);
> +
> return;
>
> error:
> + if (qemuDomainObjEndJob(driver, obj) == 0)
> + obj = NULL;
> +
> if (!virDomainObjIsActive(obj)) {
> if (virDomainObjUnref(obj) > 0)
> virDomainObjUnlock(obj);
> + qemuDriverUnlock(driver);
> return;
> }
>
> @@ -2591,6 +2621,81 @@ error:
> else
> virDomainObjUnlock(obj);
> }
> + qemuDriverUnlock(driver);
> +}
> +
> +static void
> +qemuProcessReconnectHelper(void *payload,
> + const void *name ATTRIBUTE_UNUSED,
> + void *opaque)
> +{
> + virThread thread;
> + struct qemuProcessReconnectData *src = opaque;
> + struct qemuProcessReconnectData *data;
> + virDomainObjPtr obj = payload;
> +
> + if (VIR_ALLOC(data) < 0) {
> + virReportOOMError();
> + return;
> + }
> +
> + memcpy(data, src, sizeof(*data));
> + data->payload = payload;
> +
> + /* This iterator is called with driver being locked.
> + * We create a separate thread to run qemuProcessReconnect in it.
> + * However, qemuProcessReconnect needs to:
> + * 1. lock driver
> + * 2. just before monitor reconnect do lightweight MonitorEnter
> + * (increase VM refcount, unlock VM & driver)
> + * 3. reconnect to monitor
> + * 4. do lightweight MonitorExit (lock driver & VM)
> + * 5. continue reconnect process
> + * 6. EndJob
> + * 7. unlock driver
> + *
> + * It is necessary to NOT hold driver lock for the entire run
> + * of reconnect, otherwise we will get blocked if there is
> + * unresponsive qemu.
> + * However, iterating over hash table MUST be done on locked
> + * driver.
> + *
> + * NB, we can't do normal MonitorEnter & MonitorExit because
> + * these two lock the monitor lock, which does not exists in
> + * this early phase.
> + */
> +
> + virDomainObjLock(obj);
> +
> + qemuDomainObjRestoreJob(obj, &data->oldjob);
> +
> + if (qemuDomainObjBeginJobWithDriver(src->driver, obj, QEMU_JOB_MODIFY) < 0)
> + goto error;
> +
> + if (virThreadCreate(&thread, true, qemuProcessReconnect, data) < 0) {
> + qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s",
> + _("Could not create thread. QEMU initialization "
> + "might be incomplete"));
> + if (qemuDomainObjEndJob(src->driver, obj) == 0) {
> + obj = NULL;
> + } else if (virDomainObjUnref(obj) > 0) {
> + /* We can't spawn a thread and thus connect to monitor.
> + * Kill qemu */
> + qemuProcessStop(src->driver, obj, 0, VIR_DOMAIN_SHUTOFF_FAILED);
> + if (!obj->persistent)
> + virDomainRemoveInactive(&src->driver->domains, obj);
> + else
> + virDomainObjUnlock(obj);
> + }
> + goto error;
> + }
> +
> + virDomainObjUnlock(obj);
> +
> + return;
> +
> +error:
> + VIR_FREE(data);
> }
>
> /**
> @@ -2603,7 +2708,7 @@ void
> qemuProcessReconnectAll(virConnectPtr conn, struct qemud_driver *driver)
> {
> struct qemuProcessReconnectData data = {conn, driver};
> - virHashForEach(driver->domains.objs, qemuProcessReconnect, &data);
> + virHashForEach(driver->domains.objs, qemuProcessReconnectHelper, &data);
> }
>
> int qemuProcessStart(virConnectPtr conn,
ACK
Daniel
--
|: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org -o- http://virt-manager.org :|
|: http://autobuild.org -o- http://search.cpan.org/~danberr/ :|
|: http://entangle-photo.org -o- http://live.gnome.org/gtk-vnc :|
More information about the libvir-list
mailing list