[libvirt] [PATCH v3] qemu: fix deadlock if create qemuProcessReconnect thread failed

John Ferlan jferlan at redhat.com
Thu Sep 20 17:04:44 UTC 2018



On 09/19/2018 10:27 PM, Wang Yechao wrote:
> qemuProcessReconnectHelper has hold lock on doms, if create
> qemuProcessReconnect thread failed, calls qemuDomainRemoveInactiveJob
> will lead to deadlock.
> 
> Add a qemuDomainRemoveInactiveJobLocked, modify qemuProcessReconnectHelper
> to call it.
> 
> Signed-off-by: Wang Yechao <wang.yechao255 at zte.com.cn>
> 
> ---
> v2 patch:
> https://www.redhat.com/archives/libvir-list/2018-September/msg00635.html
> 
> Changes in v3:
>  - drop v2 patch, cause it is not good.
>  - split qemuDomainRemoveInactive into qemuDomainRemoveInactiveCommon
> and virDomainObjListRemove.
>  - create a qemuDomainRemoveInactiveLocked, consists of
> qemuDomainRemoveInactiveCommon and virDomainObjListRemoveLocked.
>  - create a qemuDomainRemoveInactiveJobLocked,  which copies
> qemuDomainRemoveInactiveJob except calling qemuDomainRemoveInactiveLocked
>  - Modify qemuProcessReconnectHelper to call qemuDomainRemoveInactiveJobLocked
> ---
>  src/qemu/qemu_domain.c  | 75 ++++++++++++++++++++++++++++++++++++++++---------
>  src/qemu/qemu_domain.h  |  9 ++++++
>  src/qemu/qemu_process.c |  2 +-
>  3 files changed, 71 insertions(+), 15 deletions(-)
> 

Sigh, I thought I was clear - multiple patches would have been
preferred. It's easier to review and shows the progression to the fix.
Hopefully the next series will split them up.  Each will have their own
commit message to describe what's happening.

> diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
> index 2fd8a2a..ef0d91d 100644
> --- a/src/qemu/qemu_domain.c
> +++ b/src/qemu/qemu_domain.c
> @@ -8343,24 +8343,13 @@ qemuDomainSnapshotDiscardAllMetadata(virQEMUDriverPtr driver,
>      return rem.err;
>  }
>  
> -
> -/**
> - * qemuDomainRemoveInactive:
> - *
> - * The caller must hold a lock to the vm.
> - */
>  void

This will be a "static void"

> -qemuDomainRemoveInactive(virQEMUDriverPtr driver,
> -                         virDomainObjPtr vm)
> +qemuDomainRemoveInactiveCommon(virQEMUDriverPtr driver,
> +                               virDomainObjPtr vm)
>  {
>      char *snapDir;
>      virQEMUDriverConfigPtr cfg;
>  
> -    if (vm->persistent) {
> -        /* Short-circuit, we don't want to remove a persistent domain */
> -        return;
> -    }
> -
>      cfg = virQEMUDriverGetConfig(driver);
>  
>      /* Remove any snapshot metadata prior to removing the domain */
> @@ -8379,9 +8368,47 @@ qemuDomainRemoveInactive(virQEMUDriverPtr driver,
>      }
>      qemuExtDevicesCleanupHost(driver, vm->def);
>  
> +    virObjectUnref(cfg);
> +}
> +

Two blank lines between functions

> +/**
> + * qemuDomainRemoveInactive:
> + *
> + * The caller must hold a lock to the vm.
> + */
> +void
> +qemuDomainRemoveInactive(virQEMUDriverPtr driver,
> +                         virDomainObjPtr vm)
> +{
> +    if (vm->persistent) {
> +        /* Short-circuit, we don't want to remove a persistent domain */
> +        return;
> +    }
> +
> +    qemuDomainRemoveInactiveCommon(driver, vm);
>      virDomainObjListRemove(driver->domains, vm);
>  

If you're going to have a blank line, then have it between the two
calls, but not after the last one.

> -    virObjectUnref(cfg);
> +}
> +

Two blank lines

> +/**
> + * qemuDomainRemoveInactiveLocked:
> + *
> + * The caller must hold a lock to the vm ,
> + * and hold a lock to the doms.

Change to:

 * The caller must hold a lock to the vm and must hold the
 * lock on driver->domains in order to call the remove obj
 * from locked list method.

> + */
> +
> +void

static void

> +qemuDomainRemoveInactiveLocked(virQEMUDriverPtr driver,
> +                               virDomainObjPtr vm)
> +{
> +    if (vm->persistent) {
> +        /* Short-circuit, we don't want to remove a persistent domain */
> +        return;
> +    }
> +
> +    qemuDomainRemoveInactiveCommon(driver, vm);
> +    virDomainObjListRemoveLocked(driver->domains, vm);
> +

Similar with respect to the extra blank line here.

>  }
>  
>  
> @@ -8407,6 +8434,26 @@ qemuDomainRemoveInactiveJob(virQEMUDriverPtr driver,
>          qemuDomainObjEndJob(driver, vm);
>  }
>  
> +/**
> + * qemuDomainRemoveInactiveJobLocked:
> + *
> + * Just like qemuDomainRemoveInactiveJob but it hold lock
> + * on 'doms'.

I would say:

 * Similar to qemuDomainRemoveInactiveJob, except that the caller must
 * also hold the lock @driver->domains

> + */
> +

Remove the blank line above

> +void
> +qemuDomainRemoveInactiveJobLocked(virQEMUDriverPtr driver,
> +                                  virDomainObjPtr vm)
> +{
> +    bool haveJob;
> +
> +    haveJob = qemuDomainObjBeginJob(driver, vm, QEMU_JOB_MODIFY) >= 0;
> +
> +    qemuDomainRemoveInactiveLocked(driver, vm);
> +
> +    if (haveJob)
> +        qemuDomainObjEndJob(driver, vm);
> +}
>  

Two blank lines

>  void
>  qemuDomainSetFakeReboot(virQEMUDriverPtr driver,
> diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h
> index 914c9a6..1d80621 100644
> --- a/src/qemu/qemu_domain.h
> +++ b/src/qemu/qemu_domain.h
> @@ -698,12 +698,21 @@ int qemuDomainSnapshotDiscardAll(void *payload,
>  int qemuDomainSnapshotDiscardAllMetadata(virQEMUDriverPtr driver,
>                                           virDomainObjPtr vm);
>  
> +void qemuDomainRemoveInactiveCommon(virQEMUDriverPtr driver,
> +                                    virDomainObjPtr vm);
> +

Nope - this is not external

>  void qemuDomainRemoveInactive(virQEMUDriverPtr driver,
>                                virDomainObjPtr vm);
>  
> +void qemuDomainRemoveInactiveLocked(virQEMUDriverPtr driver,
> +                                    virDomainObjPtr vm);
> +

and neither is this one

>  void qemuDomainRemoveInactiveJob(virQEMUDriverPtr driver,
>                                   virDomainObjPtr vm);
>  
> +void qemuDomainRemoveInactiveJobLocked(virQEMUDriverPtr driver,
> +                                       virDomainObjPtr vm);
> +
>  void qemuDomainSetFakeReboot(virQEMUDriverPtr driver,
>                               virDomainObjPtr vm,
>                               bool value);
> diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
> index 72a59de..ed24447 100644
> --- a/src/qemu/qemu_process.c
> +++ b/src/qemu/qemu_process.c
> @@ -8025,7 +8025,7 @@ qemuProcessReconnectHelper(virDomainObjPtr obj,
>           */
>          qemuProcessStop(src->driver, obj, VIR_DOMAIN_SHUTOFF_FAILED,
>                          QEMU_ASYNC_JOB_NONE, 0);
> -        qemuDomainRemoveInactiveJob(src->driver, obj);
> +        qemuDomainRemoveInactiveJobLocked(src->driver, obj);
>  
>          virDomainObjEndAPI(&obj);
>          virNWFilterUnlockFilterUpdates();
> 

In the long run the above hung will be the "only" thing for patch3.

John




More information about the libvir-list mailing list