[libvirt] [PATCH] Fix a deadlock in bi-directional p2p concurrent migration.

Daniel P. Berrange berrange at redhat.com
Tue Jul 20 10:53:43 UTC 2010


On Fri, Jul 16, 2010 at 09:38:10AM -0400, Chris Lalancette wrote:
> If you try to execute two concurrent migrations p2p
> from A->B and B->A, the two libvirtd's will deadlock
> trying to perform the migrations.  The reason for this is
> that in p2p migration, the libvirtd's are responsible for
> making the RPC Prepare, Migrate, and Finish calls.  However,
> they are currently holding the driver lock while doing so,
> which basically guarantees deadlock in this scenario.
> 
> This patch fixes the situation by adding
> qemuDomainObjEnterRemoteWithDriver and
> qemuDomainObjExitRemoteWithDriver helper methods.  The Enter
> take an additional object reference, then drops both the
> domain object lock and the driver lock.  The Exit takes
> both the driver and domain object lock, then drops the
> reference.  Adding calls to these Enter and Exit helpers
> around remote calls in the various migration methods
> seems to fix the problem for me in testing.
> 
> This should make the situation safe. The additional domain
> object reference ensures that the domain object won't disappear
> while this operation is happening.  The BeginJob that is called
> inside of qemudDomainMigratePerform ensures that we can't execute a
> second migrate (or shutdown, or save, etc) job while the
> migration is active.  Finally, the additional check on the state
> of the vm after we reacquire the locks ensures that we can't
> be surprised by an external event (domain crash, etc).
> 
> Signed-off-by: Chris Lalancette <clalance at redhat.com>
> ---
>  src/qemu/qemu_driver.c |   57 ++++++++++++++++++++++++++++++++++++++++++------
>  1 files changed, 50 insertions(+), 7 deletions(-)
> 
> diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
> index 5f2607c..1eff4bc 100644
> --- a/src/qemu/qemu_driver.c
> +++ b/src/qemu/qemu_driver.c
> @@ -531,6 +531,21 @@ static void qemuDomainObjExitMonitorWithDriver(struct qemud_driver *driver, virD
>      }
>  }
>  
> +static void qemuDomainObjEnterRemoteWithDriver(struct qemud_driver *driver,
> +                                               virDomainObjPtr obj)
> +{
> +    virDomainObjRef(obj);
> +    virDomainObjUnlock(obj);
> +    qemuDriverUnlock(driver);
> +}
> +
> +static void qemuDomainObjExitRemoteWithDriver(struct qemud_driver *driver,
> +                                              virDomainObjPtr obj)
> +{
> +    qemuDriverLock(driver);
> +    virDomainObjLock(obj);
> +    virDomainObjUnref(obj);
> +}
>  
>  static int qemuCgroupControllerActive(struct qemud_driver *driver,
>                                        int controller)
> @@ -10753,14 +10768,25 @@ static int doTunnelMigrate(virDomainPtr dom,
>          /* virStreamNew only fails on OOM, and it reports the error itself */
>          goto cleanup;
>  
> +    qemuDomainObjEnterRemoteWithDriver(driver, vm);
>      internalret = dconn->driver->domainMigratePrepareTunnel(dconn, st,
>                                                              flags, dname,
>                                                              resource, dom_xml);
> +    qemuDomainObjExitRemoteWithDriver(driver, vm);
>  
>      if (internalret < 0)
>          /* domainMigratePrepareTunnel sets the error for us */
>          goto cleanup;
>  
> +    /* the domain may have shutdown or crashed while we had the locks dropped
> +     * in qemuDomainObjEnterRemoteWithDriver, so check again
> +     */
> +    if (!virDomainObjIsActive(vm)) {
> +        qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s",
> +                        _("guest unexpectedly quit"));
> +        goto cleanup;
> +    }
> +
>      /*   3. start migration on source */
>      qemuDomainObjEnterMonitorWithDriver(driver, vm);
>      if (flags & VIR_MIGRATE_NON_SHARED_DISK)
> @@ -10833,8 +10859,10 @@ cancel:
>  
>  finish:
>      dname = dname ? dname : dom->name;
> +    qemuDomainObjEnterRemoteWithDriver(driver, vm);
>      ddomain = dconn->driver->domainMigrateFinish2
>          (dconn, dname, NULL, 0, uri, flags, retval);
> +    qemuDomainObjExitRemoteWithDriver(driver, vm);
>  
>  cleanup:
>      if (client_sock != -1)
> @@ -10873,19 +10901,32 @@ static int doNonTunnelMigrate(virDomainPtr dom,
>      virDomainPtr ddomain = NULL;
>      int retval = -1;
>      char *uri_out = NULL;
> +    int rc;
>  
> +    qemuDomainObjEnterRemoteWithDriver(driver, vm);
>      /* NB we don't pass 'uri' into this, since that's the libvirtd
>       * URI in this context - so we let dest pick it */
> -    if (dconn->driver->domainMigratePrepare2(dconn,
> -                                             NULL, /* cookie */
> -                                             0, /* cookielen */
> -                                             NULL, /* uri */
> -                                             &uri_out,
> -                                             flags, dname,
> -                                             resource, dom_xml) < 0)
> +    rc = dconn->driver->domainMigratePrepare2(dconn,
> +                                              NULL, /* cookie */
> +                                              0, /* cookielen */
> +                                              NULL, /* uri */
> +                                              &uri_out,
> +                                              flags, dname,
> +                                              resource, dom_xml);
> +    qemuDomainObjExitRemoteWithDriver(driver, vm);
> +    if (rc < 0)
>          /* domainMigratePrepare2 sets the error for us */
>          goto cleanup;
>  
> +    /* the domain may have shutdown or crashed while we had the locks dropped
> +     * in qemuDomainObjEnterRemoteWithDriver, so check again
> +     */
> +    if (!virDomainObjIsActive(vm)) {
> +        qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s",
> +                        _("guest unexpectedly quit"));
> +        goto cleanup;
> +    }
> +
>      if (uri_out == NULL) {
>          qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s",
>                          _("domainMigratePrepare2 did not set uri"));
> @@ -10899,8 +10940,10 @@ static int doNonTunnelMigrate(virDomainPtr dom,
>  
>  finish:
>      dname = dname ? dname : dom->name;
> +    qemuDomainObjEnterRemoteWithDriver(driver, vm);
>      ddomain = dconn->driver->domainMigrateFinish2
>          (dconn, dname, NULL, 0, uri_out, flags, retval);
> +    qemuDomainObjExitRemoteWithDriver(driver, vm);
>  
>      if (ddomain)
>          virUnrefDomain(ddomain);

ACK

Daniel
-- 
|: Red Hat, Engineering, London    -o-   http://people.redhat.com/berrange/ :|
|: http://libvirt.org -o- http://virt-manager.org -o- http://deltacloud.org :|
|: http://autobuild.org        -o-         http://search.cpan.org/~danberr/ :|
|: GnuPG: 7D3B9505  -o-   F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|




More information about the libvir-list mailing list