[libvirt] [PATCH v14] support offline migration

Jiri Denemark jdenemar at redhat.com
Fri Nov 16 12:59:15 UTC 2012


On Mon, Nov 12, 2012 at 09:57:29 +0800, liguang wrote:
> original migration did not aware of offline case,
> so, try to support offline migration quietly
> (did not disturb original migration) by pass
> VIR_MIGRATE_OFFLINE flag to migration APIs if only
> the domain is really inactive, and
> migration process will not puzzled by domain
> offline and exit unexpectedly.
> these changes did not take care of disk images the
> domain required, for them could be transferred by
> other APIs as suggested, then VIR_MIGRATE_OFFLINE
> must not combined with VIR_MIGRATE_NON_SHARED_*.
> and you must do a persistent migration at same time,
> do "virsh migrate --offline --persistent ...".
> 
> Signed-off-by: liguang <lig.fnst at cn.fujitsu.com>
> ---
>  include/libvirt/libvirt.h.in |    1 +
>  src/qemu/qemu_driver.c       |   16 +++---
>  src/qemu/qemu_migration.c    |  110 ++++++++++++++++++++++++++++--------------
>  src/qemu/qemu_migration.h    |    9 ++-
>  tools/virsh-domain.c         |    5 ++
>  tools/virsh.pod              |    5 +-
>  6 files changed, 97 insertions(+), 49 deletions(-)
> 
> diff --git a/include/libvirt/libvirt.h.in b/include/libvirt/libvirt.h.in
> index fe58c08..1e0500d 100644
> --- a/include/libvirt/libvirt.h.in
> +++ b/include/libvirt/libvirt.h.in
> @@ -1090,6 +1090,7 @@ typedef enum {
>                                                 * whole migration process; this will be used automatically
>                                                 * when supported */
>      VIR_MIGRATE_UNSAFE            = (1 << 9), /* force migration even if it is considered unsafe */
> +    VIR_MIGRATE_OFFLINE           = (1 << 10), /* offline migrate */
>  } virDomainMigrateFlags;
>  
>  /* Domain migration. */
> diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
> index 978af57..5f91688 100644
> --- a/src/qemu/qemu_driver.c
> +++ b/src/qemu/qemu_driver.c
> @@ -9594,7 +9594,7 @@ qemudDomainMigratePrepareTunnel(virConnectPtr dconn,
>  
>      ret = qemuMigrationPrepareTunnel(driver, dconn,
>                                       NULL, 0, NULL, NULL, /* No cookies in v2 */
> -                                     st, dname, dom_xml);
> +                                     st, dname, dom_xml, flags);
>  
>  cleanup:
>      qemuDriverUnlock(driver);
> @@ -9654,7 +9654,7 @@ qemudDomainMigratePrepare2(virConnectPtr dconn,
>      ret = qemuMigrationPrepareDirect(driver, dconn,
>                                       NULL, 0, NULL, NULL, /* No cookies */
>                                       uri_in, uri_out,
> -                                     dname, dom_xml);
> +                                     dname, dom_xml, flags);
>  
>  cleanup:
>      qemuDriverUnlock(driver);
> @@ -9796,7 +9796,7 @@ qemuDomainMigrateBegin3(virDomainPtr domain,
>          asyncJob = QEMU_ASYNC_JOB_NONE;
>      }
>  
> -    if (!virDomainObjIsActive(vm)) {
> +    if (!virDomainObjIsActive(vm) && !(flags & VIR_MIGRATE_OFFLINE)) {
>          virReportError(VIR_ERR_OPERATION_INVALID,
>                         "%s", _("domain is not running"));
>          goto endjob;
> @@ -9805,9 +9805,9 @@ qemuDomainMigrateBegin3(virDomainPtr domain,
>      /* Check if there is any ejected media.
>       * We don't want to require them on the destination.
>       */
> -
> -    if (qemuDomainCheckEjectableMedia(driver, vm, asyncJob) < 0)
> -        goto endjob;
> +    if (virDomainObjIsActive(vm) && (flags & VIR_MIGRATE_OFFLINE))
> +        if (qemuDomainCheckEjectableMedia(driver, vm, asyncJob) < 0)
> +            goto endjob;

This condition is wrong; you actually want to call
qemuDomainCheckEjectableMedia iff the migration is NOT offline, and in that
case we already know the domain is active. Thus

    if (!(flags & VIR_MIGRATE_OFFLINE) &&
        qemuDomainCheckEjectableMedia(driver, vm, asyncJob) < 0)
        goto endjob;

is how it should look.

>  
>      if (!(xml = qemuMigrationBegin(driver, vm, xmlin, dname,
>                                     cookieout, cookieoutlen,
> @@ -9891,7 +9891,7 @@ qemuDomainMigratePrepare3(virConnectPtr dconn,
>                                       cookiein, cookieinlen,
>                                       cookieout, cookieoutlen,
>                                       uri_in, uri_out,
> -                                     dname, dom_xml);
> +                                     dname, dom_xml, flags);
>  
>  cleanup:
>      qemuDriverUnlock(driver);
> @@ -9936,7 +9936,7 @@ qemuDomainMigratePrepareTunnel3(virConnectPtr dconn,
>      ret = qemuMigrationPrepareTunnel(driver, dconn,
>                                       cookiein, cookieinlen,
>                                       cookieout, cookieoutlen,
> -                                     st, dname, dom_xml);
> +                                     st, dname, dom_xml, flags);
>      qemuDriverUnlock(driver);
>  
>  cleanup:
> diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c
> index 5f8a9c5..2479114 100644
> --- a/src/qemu/qemu_migration.c
> +++ b/src/qemu/qemu_migration.c
> @@ -1443,6 +1443,20 @@ char *qemuMigrationBegin(struct qemud_driver *driver,
>                                  QEMU_MIGRATION_COOKIE_LOCKSTATE) < 0)
>          goto cleanup;
>  
> +    if (flags & VIR_MIGRATE_OFFLINE) {
> +        if (flags & (VIR_MIGRATE_NON_SHARED_DISK|

Just a nit, add space between "VIR_MIGRATE_NON_SHARED_DISK" and '|'.

> +                     VIR_MIGRATE_NON_SHARED_INC)) {
> +            virReportError(VIR_ERR_OPERATION_INVALID,
> +                           "%s", _("offline migration cannot handle non-shared storage"));
> +            goto cleanup;
> +        }
> +        if (!(flags & VIR_MIGRATE_PERSIST_DEST)) {
> +            virReportError(VIR_ERR_OPERATION_INVALID,
> +                           "%s", _("offline migration must be specified with the persistent flag set"));
> +            goto cleanup;
> +        }
> +    }
> +
>      if (xmlin) {
>          if (!(def = virDomainDefParseString(driver->caps, xmlin,
>                                              QEMU_EXPECTED_VIRT_TYPES,

And as I said in v13, please, wrap the long lines so that they fit in 80
characters.

> @@ -1500,7 +1514,8 @@ qemuMigrationPrepareAny(struct qemud_driver *driver,
>                          const char *dname,
>                          const char *dom_xml,
>                          const char *migrateFrom,
> -                        virStreamPtr st)
> +                        virStreamPtr st,
> +                        unsigned long flags)
>  {
>      virDomainDefPtr def = NULL;
>      virDomainObjPtr vm = NULL;
> @@ -1610,15 +1625,17 @@ qemuMigrationPrepareAny(struct qemud_driver *driver,
>      /* Start the QEMU daemon, with the same command-line arguments plus
>       * -incoming $migrateFrom
>       */
> -    if (qemuProcessStart(dconn, driver, vm, migrateFrom, dataFD[0], NULL, NULL,
> -                         VIR_NETDEV_VPORT_PROFILE_OP_MIGRATE_IN_START,
> -                         VIR_QEMU_PROCESS_START_PAUSED |
> -                         VIR_QEMU_PROCESS_START_AUTODESROY) < 0) {
> -        virDomainAuditStart(vm, "migrated", false);
> -        /* Note that we don't set an error here because qemuProcessStart
> -         * should have already done that.
> -         */
> -        goto endjob;
> +    if (!(flags & VIR_MIGRATE_OFFLINE)) {
> +        if (qemuProcessStart(dconn, driver, vm, migrateFrom, dataFD[0], NULL, NULL,
> +                             VIR_NETDEV_VPORT_PROFILE_OP_MIGRATE_IN_START,
> +                             VIR_QEMU_PROCESS_START_PAUSED |
> +                             VIR_QEMU_PROCESS_START_AUTODESROY) < 0) {
> +            virDomainAuditStart(vm, "migrated", false);
> +            /* Note that we don't set an error here because qemuProcessStart
> +             * should have already done that.
> +             */
> +            goto endjob;
> +        }
>      }
>  
>      if (tunnel) {
> @@ -1626,7 +1643,8 @@ qemuMigrationPrepareAny(struct qemud_driver *driver,
>              virReportSystemError(errno, "%s",
>                                   _("cannot pass pipe for tunnelled migration"));
>              virDomainAuditStart(vm, "migrated", false);
> -            qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED, 0);
> +            if (!(flags & VIR_MIGRATE_OFFLINE))
> +                qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED, 0);
>              goto endjob;
>          }
>          dataFD[1] = -1; /* 'st' owns the FD now & will close it */

Oh well, is there any reason why you changed the code in
qemuMigrationPrepareAny that you had in v13, which was mostly right (except
for using the migration cookie), to something like this? Just add

    if (flags & VIR_MIGRATE_OFFLINE) {
        ret = 0;
        goto done;
    }

below the "vm->def->id = -1;" line. And add the "done" label above
qemuDomainCleanupAdd() and make virDomainAuditStart and event creation
conditional for non-offline migrations only. It's also possible that we
actually need to bake the migration cookie (although with 0 flags), so we may
need to move the "done" label a few lines up and do some additional magic.
But I'm not 100% sure about that. And yes, I realize I messed this part up a
bit last time since I suggested placing "done" a few lines further down, which
would skip the call to qemuDomainCleanupAdd. Sorry about that.

> @@ -1709,7 +1727,8 @@ qemuMigrationPrepareTunnel(struct qemud_driver *driver,
>                             int *cookieoutlen,
>                             virStreamPtr st,
>                             const char *dname,
> -                           const char *dom_xml)
> +                           const char *dom_xml,
> +                           unsigned long flags)
>  {
>      int ret;
>  
> @@ -1723,7 +1742,7 @@ qemuMigrationPrepareTunnel(struct qemud_driver *driver,
>       */
>      ret = qemuMigrationPrepareAny(driver, dconn, cookiein, cookieinlen,
>                                    cookieout, cookieoutlen, dname, dom_xml,
> -                                  "stdio", st);
> +                                  "stdio", st, flags);
>      return ret;
>  }
>  
> @@ -1738,7 +1757,8 @@ qemuMigrationPrepareDirect(struct qemud_driver *driver,
>                             const char *uri_in,
>                             char **uri_out,
>                             const char *dname,
> -                           const char *dom_xml)
> +                           const char *dom_xml,
> +                           unsigned long flags)
>  {
>      static int port = 0;
>      int this_port;
> @@ -1834,7 +1854,7 @@ qemuMigrationPrepareDirect(struct qemud_driver *driver,
>  
>      ret = qemuMigrationPrepareAny(driver, dconn, cookiein, cookieinlen,
>                                    cookieout, cookieoutlen, dname, dom_xml,
> -                                  migrateFrom, NULL);
> +                                  migrateFrom, NULL, flags);
>  cleanup:
>      VIR_FREE(hostname);
>      if (ret != 0)
> @@ -2280,7 +2300,7 @@ cleanup:
>          orig_err = virSaveLastError();
>  
>      if (spec->fwdType != MIGRATION_FWD_DIRECT) {
> -        if (iothread && qemuMigrationStopTunnel(iothread, ret < 0) < 0)
> +       if (iothread && qemuMigrationStopTunnel(iothread, ret < 0) < 0)

Looks like an accidental change.

>              ret = -1;
>          VIR_FORCE_CLOSE(fd);
>      }
> @@ -2353,6 +2373,9 @@ static int doNativeMigrate(struct qemud_driver *driver,
>      if (!uribits)
>          return -1;
>  
> +    if (flags & VIR_MIGRATE_OFFLINE)
> +        return 0;
> +
>      if (qemuCapsGet(priv->caps, QEMU_CAPS_MIGRATE_QEMU_FD))
>          spec.destType = MIGRATION_DEST_CONNECT_HOST;
>      else

We should not be doing this kind of hack deep inside the Perform phase, which
should rather be skipped completely during offline migration.

> @@ -2665,10 +2688,18 @@ static int doPeer2PeerMigrate3(struct qemud_driver *driver,
>               uri, &uri_out, flags, dname, resource, dom_xml);
>          qemuDomainObjExitRemoteWithDriver(driver, vm);
>      }
> +
>      VIR_FREE(dom_xml);
> +
>      if (ret == -1)
>          goto cleanup;
>  
> +    if (flags & VIR_MIGRATE_OFFLINE) {
> +        cookieout = NULL;
> +        cookieoutlen = 0;

And these two lines seem to confirm that PrepareAny really should bake the
cookie (or at least set it to NULL with 0 len) instead of us doing this hack
here.

> +        goto finish;
> +    }
> +
>      if (!(flags & VIR_MIGRATE_TUNNELLED) &&
>          (uri_out == NULL)) {
>          virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
> @@ -2771,7 +2802,7 @@ finish:
>                   vm->def->name);
>  
>   cleanup:
> -    if (ddomain) {
> +    if (ddomain || (flags & VIR_MIGRATE_OFFLINE)) {
>          virObjectUnref(ddomain);
>          ret = 0;
>      } else {

As I said several times already, this code would just ignore any error that
might have happened so far. If we get here during offline migration with
ddomain == NULL, we would pretend everything went ok and return 0. It's
certainly possible I'm just missing something and my statement is not quite
right but so far you haven't said anything that would make me think so.
Please, either remove this change or explain in detail why it is needed and
why the thing I'm saying is wrong.

You said "or you'll get an error for ret = -1". Does that mean we can get here
with ddomain == NULL even though everything succeeded? If so it's
qemuMigrationFinish that needs to be fixed to properly return non-NULL when
offline migration succeeded. However, looking at the code qemuMigrationFinish
looks like it should already do so.

> @@ -2848,7 +2879,7 @@ static int doPeer2PeerMigrate(struct qemud_driver *driver,
>      }
>  
>      /* domain may have been stopped while we were talking to remote daemon */
> -    if (!virDomainObjIsActive(vm)) {
> +    if (!virDomainObjIsActive(vm) && !(flags & VIR_MIGRATE_OFFLINE)) {
>          virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
>                         _("guest unexpectedly quit"));
>          goto cleanup;
> @@ -2911,7 +2942,7 @@ qemuMigrationPerformJob(struct qemud_driver *driver,
>      if (qemuMigrationJobStart(driver, vm, QEMU_ASYNC_JOB_MIGRATION_OUT) < 0)
>          goto cleanup;
>  
> -    if (!virDomainObjIsActive(vm)) {
> +    if (!virDomainObjIsActive(vm) && !(flags & VIR_MIGRATE_OFFLINE)) {
>          virReportError(VIR_ERR_OPERATION_INVALID,
>                         "%s", _("domain is not running"));
>          goto endjob;
> @@ -3235,26 +3266,27 @@ qemuMigrationFinish(struct qemud_driver *driver,
>       * object, but if no, clean up the empty qemu process.
>       */
>      if (retcode == 0) {
> -        if (!virDomainObjIsActive(vm)) {
> +        if (!virDomainObjIsActive(vm) && !(flags & VIR_MIGRATE_OFFLINE)) {
>              virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
>                             _("guest unexpectedly quit"));
>              goto endjob;
>          }
>  
> -        if (qemuMigrationVPAssociatePortProfiles(vm->def) < 0) {
> -            qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED,
> -                            VIR_QEMU_PROCESS_STOP_MIGRATED);
> -            virDomainAuditStop(vm, "failed");
> -            event = virDomainEventNewFromObj(vm,
> -                                             VIR_DOMAIN_EVENT_STOPPED,
> -                                             VIR_DOMAIN_EVENT_STOPPED_FAILED);
> -            goto endjob;
> +        if (!(flags & VIR_MIGRATE_OFFLINE)) {
> +            if (qemuMigrationVPAssociatePortProfiles(vm->def) < 0) {
> +                qemuProcessStop(driver, vm, VIR_DOMAIN_SHUTOFF_FAILED,
> +                                VIR_QEMU_PROCESS_STOP_MIGRATED);
> +                virDomainAuditStop(vm, "failed");
> +                event = virDomainEventNewFromObj(vm,
> +                                                 VIR_DOMAIN_EVENT_STOPPED,
> +                                                 VIR_DOMAIN_EVENT_STOPPED_FAILED);
> +                goto endjob;
> +            }
> +            if (mig->network)
> +                if (qemuDomainMigrateOPDRelocate(driver, vm, mig) < 0)
> +                    VIR_WARN("unable to provide network data for relocation");
>          }
>  
> -        if (mig->network)
> -            if (qemuDomainMigrateOPDRelocate(driver, vm, mig) < 0)
> -                VIR_WARN("unable to provide network data for relocation");
> -
>          if (flags & VIR_MIGRATE_PERSIST_DEST) {
>              virDomainDefPtr vmdef;
>              if (vm->persistent)
> @@ -3302,7 +3334,7 @@ qemuMigrationFinish(struct qemud_driver *driver,
>              event = NULL;
>          }
>  
> -        if (!(flags & VIR_MIGRATE_PAUSED)) {
> +        if (!(flags & VIR_MIGRATE_PAUSED) && !(flags & VIR_MIGRATE_OFFLINE)) {
>              /* run 'cont' on the destination, which allows migration on qemu
>               * >= 0.10.6 to work properly.  This isn't strictly necessary on
>               * older qemu's, but it also doesn't hurt anything there
> @@ -3351,9 +3383,11 @@ qemuMigrationFinish(struct qemud_driver *driver,
>                                               VIR_DOMAIN_EVENT_SUSPENDED,
>                                               VIR_DOMAIN_EVENT_SUSPENDED_PAUSED);
>          }
> -        if (virDomainSaveStatus(driver->caps, driver->stateDir, vm) < 0) {
> -            VIR_WARN("Failed to save status on vm %s", vm->def->name);
> -            goto endjob;
> +        if (virDomainObjIsActive(vm)) {
> +            if (virDomainSaveStatus(driver->caps, driver->stateDir, vm) < 0) {
> +                VIR_WARN("Failed to save status on vm %s", vm->def->name);
> +                goto endjob;
> +            }
>          }

We should also skip generating the events. Or even better, we should emit
VIR_DOMAIN_EVENT_DEFINED for offline migration.

>  
>          /* Guest is successfully running, so cancel previous auto destroy */
> @@ -3420,6 +3454,9 @@ int qemuMigrationConfirm(struct qemud_driver *driver,
>      if (!(mig = qemuMigrationEatCookie(driver, vm, cookiein, cookieinlen, 0)))
>          return -1;
>  
> +    if (flags & VIR_MIGRATE_OFFLINE)
> +        goto done;
> +
>      /* Did the migration go as planned?  If yes, kill off the
>       * domain object, but if no, resume CPUs
>       */
> @@ -3455,6 +3492,7 @@ int qemuMigrationConfirm(struct qemud_driver *driver,
>          }
>      }
>  
> +done:
>      qemuMigrationCookieFree(mig);
>      rv = 0;
>  
> diff --git a/src/qemu/qemu_migration.h b/src/qemu/qemu_migration.h
> index 7a2269a..f2dc5aa 100644
> --- a/src/qemu/qemu_migration.h
> +++ b/src/qemu/qemu_migration.h
> @@ -36,7 +36,8 @@
>       VIR_MIGRATE_NON_SHARED_DISK |              \
>       VIR_MIGRATE_NON_SHARED_INC |               \
>       VIR_MIGRATE_CHANGE_PROTECTION |            \
> -     VIR_MIGRATE_UNSAFE)
> +     VIR_MIGRATE_UNSAFE |                       \
> +     VIR_MIGRATE_OFFLINE)
>  
>  enum qemuMigrationJobPhase {
>      QEMU_MIGRATION_PHASE_NONE = 0,
> @@ -97,7 +98,8 @@ int qemuMigrationPrepareTunnel(struct qemud_driver *driver,
>                                 int *cookieoutlen,
>                                 virStreamPtr st,
>                                 const char *dname,
> -                               const char *dom_xml);
> +                               const char *dom_xml,
> +                               unsigned long flags);
>  
>  int qemuMigrationPrepareDirect(struct qemud_driver *driver,
>                                 virConnectPtr dconn,
> @@ -108,7 +110,8 @@ int qemuMigrationPrepareDirect(struct qemud_driver *driver,
>                                 const char *uri_in,
>                                 char **uri_out,
>                                 const char *dname,
> -                               const char *dom_xml);
> +                               const char *dom_xml,
> +                               unsigned long flags);
>  
>  int qemuMigrationPerform(struct qemud_driver *driver,
>                           virConnectPtr conn,
> diff --git a/tools/virsh-domain.c b/tools/virsh-domain.c
> index 393b67b..39674ba 100644
> --- a/tools/virsh-domain.c
> +++ b/tools/virsh-domain.c
> @@ -6644,6 +6644,7 @@ static const vshCmdInfo info_migrate[] = {
>  
>  static const vshCmdOptDef opts_migrate[] = {
>      {"live", VSH_OT_BOOL, 0, N_("live migration")},
> +    {"offline", VSH_OT_BOOL, 0, N_("offline (domain's inactive) migration")},
>      {"p2p", VSH_OT_BOOL, 0, N_("peer-2-peer migration")},
>      {"direct", VSH_OT_BOOL, 0, N_("direct migration")},
>      {"tunneled", VSH_OT_ALIAS, 0, "tunnelled"},
> @@ -6729,6 +6730,10 @@ doMigrate(void *opaque)
>      if (vshCommandOptBool(cmd, "unsafe"))
>          flags |= VIR_MIGRATE_UNSAFE;
>  
> +    if (vshCommandOptBool(cmd, "offline")) {
> +        flags |= VIR_MIGRATE_OFFLINE;
> +    }
> +
>      if (xmlfile &&
>          virFileReadAll(xmlfile, 8192, &xml) < 0) {
>          vshError(ctl, _("file '%s' doesn't exist"), xmlfile);
> diff --git a/tools/virsh.pod b/tools/virsh.pod
> index e0c6b42..2545455 100644
> --- a/tools/virsh.pod
> +++ b/tools/virsh.pod
> @@ -1026,13 +1026,14 @@ I<--total> for only the total stats, I<start> for only the per-cpu
>  stats of the CPUs from I<start>, I<count> for only I<count> CPUs'
>  stats.
>  
> -=item B<migrate> [I<--live>] [I<--direct>] [I<--p2p> [I<--tunnelled>]]
> +=item B<migrate> [I<--live>] [I<--offline>] [I<--direct>] [I<--p2p> [I<--tunnelled>]]
>  [I<--persistent>] [I<--undefinesource>] [I<--suspend>] [I<--copy-storage-all>]
>  [I<--copy-storage-inc>] [I<--change-protection>] [I<--unsafe>] [I<--verbose>]
>  I<domain> I<desturi> [I<migrateuri>] [I<dname>]
>  [I<--timeout> B<seconds>] [I<--xml> B<file>]
>  
> -Migrate domain to another host.  Add I<--live> for live migration; I<--p2p>
> +Migrate domain to another host.  Add I<--live> for live migration;
> +I<--offline> for offline (domain's inactive) migration; I<--p2p>
>  for peer-2-peer migration; I<--direct> for direct migration; or I<--tunnelled>
>  for tunnelled migration.  I<--persistent> leaves the domain persistent on
>  destination host, I<--undefinesource> undefines the domain on the source host,

I'm afraid we need yet another version of this patch :-(

Jirka




More information about the libvir-list mailing list