[libvirt] [PATCH 3/3] qemu: Deal with stucked qemu on daemon startup
Michal Privoznik
mprivozn at redhat.com
Tue Aug 23 18:15:26 UTC 2011
On 23.08.2011 14:42, Daniel P. Berrange wrote:
> On Tue, Aug 16, 2011 at 06:39:12PM +0200, Michal Privoznik wrote:
>> If libvirt daemon gets restarted and there is (at least) one
>> unresponsive qemu, the startup procedure hangs up. This patch creates
>> one thread per vm in which we try to reconnect to monitor. Therefore,
>> blocking in one thread will not affect other APIs.
>> ---
>> src/qemu/qemu_driver.c | 23 +++---------
>> src/qemu/qemu_process.c | 87 ++++++++++++++++++++++++++++++++++++++++++----
>> 2 files changed, 85 insertions(+), 25 deletions(-)
>>
>> diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
>> index 421a98e..4574b6c 100644
>> --- a/src/qemu/qemu_driver.c
>> +++ b/src/qemu/qemu_driver.c
>> @@ -143,26 +143,15 @@ qemuAutostartDomain(void *payload, const void *name ATTRIBUTE_UNUSED, void *opaq
>>
>> virDomainObjLock(vm);
>> virResetLastError();
>> - if (qemuDomainObjBeginJobWithDriver(data->driver, vm,
>> - QEMU_JOB_MODIFY) < 0) {
>> + if (vm->autostart &&
>> + !virDomainObjIsActive(vm) &&
>> + qemuDomainObjStart(data->conn, data->driver, vm,
>> + false, false,
>> + data->driver->autoStartBypassCache) < 0) {
>> err = virGetLastError();
>> - VIR_ERROR(_("Failed to start job on VM '%s': %s"),
>> + VIR_ERROR(_("Failed to autostart VM '%s': %s"),
>> vm->def->name,
>> err ? err->message : _("unknown error"));
>> - } else {
>> - if (vm->autostart &&
>> - !virDomainObjIsActive(vm) &&
>> - qemuDomainObjStart(data->conn, data->driver, vm,
>> - false, false,
>> - data->driver->autoStartBypassCache) < 0) {
>> - err = virGetLastError();
>> - VIR_ERROR(_("Failed to autostart VM '%s': %s"),
>> - vm->def->name,
>> - err ? err->message : _("unknown error"));
>> - }
>> -
>> - if (qemuDomainObjEndJob(data->driver, vm) == 0)
>> - vm = NULL;
>> }
>
> I'm not really seeing why this change is safe. You're removing the
> job protection from the autostart code, but AFAICT, no where else
> in the caller of this method is changed to acquire the job instead.
>
>> diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
>> index 21e73a5..1daf6ae 100644
>> --- a/src/qemu/qemu_process.c
>> +++ b/src/qemu/qemu_process.c
>> @@ -820,6 +820,7 @@ qemuConnectMonitor(struct qemud_driver *driver, virDomainObjPtr vm)
>> {
>> qemuDomainObjPrivatePtr priv = vm->privateData;
>> int ret = -1;
>> + qemuMonitorPtr mon = NULL;
>>
>> if (virSecurityManagerSetSocketLabel(driver->securityManager, vm) < 0) {
>> VIR_ERROR(_("Failed to set security context for monitor for %s"),
>> @@ -831,15 +832,29 @@ qemuConnectMonitor(struct qemud_driver *driver, virDomainObjPtr vm)
>> * deleted while the monitor is active */
>> virDomainObjRef(vm);
>>
>> - priv->mon = qemuMonitorOpen(vm,
>> - priv->monConfig,
>> - priv->monJSON,
>> - &monitorCallbacks);
>> + ignore_value(virTimeMs(&priv->monStart));
>> + virDomainObjUnlock(vm);
>> + qemuDriverUnlock(driver);
>> +
>> + mon = qemuMonitorOpen(vm,
>> + priv->monConfig,
>> + priv->monJSON,
>> + &monitorCallbacks);
>> +
>> + qemuDriverLock(driver);
>> + virDomainObjLock(vm);
>> + priv->monStart = 0;
>
> I'm also not convinced it is safe to release the driver/domain
> locks in this way. At the very *least* you need to be acquiring
> an extra reference on the virDomainObjPtr, ideally using the
> EnterMonitor/ExitMonitor methods, otherwise some other thread
> may destroy the virDomainObjPtr while it is unlocked.
I cannot use EnterMonitor because it access priv->mon which does not
exist at this time.
>
> Also what is the 'monStart' field used for ?
For gathering statistics: virsh domcontrol <domain>
>
>>
>> /* Safe to ignore value since ref count was incremented above */
>> - if (priv->mon == NULL)
>> + if (mon == NULL)
>> ignore_value(virDomainObjUnref(vm));
>>
>> + if (!virDomainObjIsActive(vm)) {
>> + qemuMonitorClose(mon);
>> + goto error;
>> + }
>> + priv->mon = mon;
>> +
>> if (virSecurityManagerClearSocketLabel(driver->securityManager, vm) < 0) {
>> VIR_ERROR(_("Failed to clear security context for monitor for %s"),
>> vm->def->name);
>> @@ -2484,21 +2499,25 @@ qemuProcessRecoverJob(struct qemud_driver *driver,
>> struct qemuProcessReconnectData {
>> virConnectPtr conn;
>> struct qemud_driver *driver;
>> + void *payload;
>> };
>> /*
>> * Open an existing VM's monitor, re-detect VCPU threads
>> * and re-reserve the security labels in use
>> */
>> static void
>> -qemuProcessReconnect(void *payload, const void *name ATTRIBUTE_UNUSED, void *opaque)
>> +qemuProcessReconnect(void *opaque)
>> {
>> - virDomainObjPtr obj = payload;
>> struct qemuProcessReconnectData *data = opaque;
>> struct qemud_driver *driver = data->driver;
>> + virDomainObjPtr obj = data->payload;
>> qemuDomainObjPrivatePtr priv;
>> virConnectPtr conn = data->conn;
>> struct qemuDomainJobObj oldjob;
>>
>> + VIR_FREE(data);
>> +
>> + qemuDriverLock(driver);
>> virDomainObjLock(obj);
>>
>> qemuDomainObjRestoreJob(obj, &oldjob);
>> @@ -2572,12 +2591,15 @@ qemuProcessReconnect(void *payload, const void *name ATTRIBUTE_UNUSED, void *opa
>> if (virDomainObjUnref(obj) > 0)
>> virDomainObjUnlock(obj);
>>
>> + qemuDriverUnlock(driver);
>> +
>> return;
>>
>> error:
>> if (!virDomainObjIsActive(obj)) {
>> if (virDomainObjUnref(obj) > 0)
>> virDomainObjUnlock(obj);
>> + qemuDriverUnlock(driver);
>> return;
>> }
>>
>> @@ -2591,6 +2613,55 @@ error:
>> else
>> virDomainObjUnlock(obj);
>> }
>> + qemuDriverUnlock(driver);
>> +}
>> +
>> +static void
>> +qemuProcessReconnectHelper(void *payload,
>> + const void *name ATTRIBUTE_UNUSED,
>> + void *opaque)
>> +{
>> + virThread thread;
>> + struct qemuProcessReconnectData *src = opaque;
>> + struct qemuProcessReconnectData *data;
>> +
>> + if (VIR_ALLOC(data) < 0) {
>> + virReportOOMError();
>> + return;
>> + }
>> +
>> + memcpy(data, src, sizeof(*data));
>> + data->payload = payload;
>> +
>> + /* This iterator is called with driver being locked.
>> + * However, we need to unlock it so qemuProcessReconnect,
>> + * which will run in separate thread can lock it itself
>> + * (if needed). The whole idea behind: qemuProcessReconnect:
>> + * 1. lock driver
>> + * 2. just before monitor reconnect, do lightweight MonitorEnter
>> + * (unlock VM & driver)
>> + * 3. Reconnect to monitor
>> + * 4. do lightweight MonitorExit (lock driver & VM)
>> + * 5. continue reconnect process
>> + * 6. unlock driver
>> + *
>> + * It is necessary to NOT hold driver lock for the entire run
>> + * of reconnect, otherwise we will get blocked if there is
>> + * unresponsive qemu.
>> + * However, iterating over hash table MUST be done on locked
>> + * driver.
>> + *
>> + * NB, we can't do normal MonitorEnter & MonitorExit because
>> + * these two lock the monitor lock, which does not exists in
>> + * this early phase.
>> + */
>> + qemuDriverUnlock(src->driver);
>> + if (virThreadCreate(&thread, true, qemuProcessReconnect, data) < 0) {
>> + qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s",
>> + _("Could not create thread. QEMU initialization "
>> + "might be incomplete"));
>> + }
>> + qemuDriverLock(src->driver);
>> }
>
> This comment & code is not quite right. Since the 'qemuProcessReconnect'
> function runs in a separate thread, there is no need to unlock the
> src->driver here. Unlocking the driver is in fact dangerous, since this
> means something can (accidentally or not) change the driver->domains
> hash table, while we're in the middle of iterating over it.
>
> If you remove these qemuDriverUnlock/Lock calls here, it simply means
> that the thread you spawn will start, but immediately block waiting
> for the lock. When the thread that triggers reconnect finally
> unlocks the driver later, all the threads will be able to start
> executing.
>
> The other unsafe thing here though, is that the main thread owns the
> reference of virDomainObjPtr, and something could happily destroy
> this reference, before the thread you spawn gets a chance to run.
>
> So *before* spawning the thread, you need to start a job on the
> virDomainObjPtr. The thread will then actually do the job work
> and release the job when it is done.
>
> So this code should look something more like
>
> virDomainObjPtr obj = payload;
>
> ...begin job on obj...
> if (virThreadCreate(&thread, true, qemuProcessReconnect, data) < 0) {
> qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s",
> _("Could not create thread to reconnect to domain");
> ...end job on obj...
> ...destroy the running guest PID if any..
> } else {
> ...nothing. the thread ends the job...
> }
>
>
> Regards,
> Daniel
More information about the libvir-list
mailing list