[libvirt] [PATCH 4/6] qemu: snapshot: Refactor snapshot rollback on failure

John Ferlan jferlan at redhat.com
Sat Jan 7 18:37:12 UTC 2017



On 12/16/2016 11:24 AM, Peter Krempa wrote:
> The code at first changed the definition and then rolled it back in case
> of failure. This was ridiculous. Refactor the code so that the image in
> the definition is changed only when the snapshot is successful.
> 
> The refactor will also simplify further fix of image locking when doing
> snapshots.
> ---
>  src/qemu/qemu_driver.c | 352 ++++++++++++++++++++++++++++---------------------
>  1 file changed, 203 insertions(+), 149 deletions(-)
> 
> diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
> index 8477ed0dd..742b6ceda 100644
> --- a/src/qemu/qemu_driver.c
> +++ b/src/qemu/qemu_driver.c
> @@ -14156,75 +14156,186 @@ qemuDomainSnapshotPrepare(virConnectPtr conn,
>  }
> 
> 

[...]

> +/**
> + * qemuDomainSnapshotUpdateDiskSources:
> + * @dd: snapshot disk data object
> + * @persist: set to true if persistent config of the VM was changed
> + *
> + * Updates disk definition after a successful snapshot.
> + */
> +static void
> +qemuDomainSnapshotUpdateDiskSources(qemuDomainSnapshotDiskDataPtr dd,
> +                                    bool *persist)
> +{
> +    if (!dd->src)
> +        return;
> +
> +    /* storage driver access won'd be needed */
> +    if (dd->initialized)
> +        virStorageFileDeinit(dd->src);
> +
> +    dd->src->backingStore = dd->disk->src;
> +    dd->disk->src = dd->src;
> +    dd->src = NULL;

Your choice... You could VIR_STEAL_PTR * 2 too

> +
> +    if (dd->persistdisk) {
> +        dd->persistsrc->backingStore = dd->persistdisk->src;
> +        dd->persistdisk->src = dd->persistsrc;
> +        dd->persistsrc = NULL;

Ditto

> +        *persist = true;
> +    }
> +}
> +
> +

[...]

>  /* The domain is expected to be locked and active. */
>  static int
>  qemuDomainSnapshotCreateDiskActive(virQEMUDriverPtr driver,
> @@ -14324,6 +14376,8 @@ qemuDomainSnapshotCreateDiskActive(virQEMUDriverPtr driver,
>      bool persist = false;
>      bool reuse = (flags & VIR_DOMAIN_SNAPSHOT_CREATE_REUSE_EXT) != 0;
>      virQEMUDriverConfigPtr cfg = NULL;
> +    qemuDomainSnapshotDiskDataPtr diskdata = NULL;
> +    virErrorPtr orig_err = NULL;
> 
>      if (!virDomainObjIsActive(vm)) {
>          virReportError(VIR_ERR_OPERATION_INVALID,
> @@ -14341,81 +14395,81 @@ qemuDomainSnapshotCreateDiskActive(virQEMUDriverPtr driver,
>          return -1;
>      }
> 
> +    /* prepare a list of objects to use in the vm definition so that we don't
> +     * have to roll back later */
> +    if (!(diskdata = qemuDomainSnapshotDiskDataCollect(driver, vm, snap)))
> +        return -1;
> +

Coverity let me know actions could be leaked here.

>      cfg = virQEMUDriverGetConfig(driver);
> 
> -    /* No way to roll back if first disk succeeds but later disks
> -     * fail, unless we have transaction support.
> -     * Based on earlier qemuDomainSnapshotPrepare, all
> -     * disks in this list are now either SNAPSHOT_NO, or
> -     * SNAPSHOT_EXTERNAL with a valid file name and qcow2 format.  */
> +     /* Based on earlier qemuDomainSnapshotPrepare, all disks in this list are
> +      * now either SNAPSHOT_NO, or SNAPSHOT_EXTERNAL with a valid file name and
> +      * qcow2 format.  */

Since you're adjusting the comment anyway... can we fix the wording for
"SNAPSHOT_NO," too?

>      for (i = 0; i < snap->def->ndisks; i++) {
> -        virDomainDiskDefPtr persistDisk = NULL;
> -
>          if (snap->def->disks[i].snapshot == VIR_DOMAIN_SNAPSHOT_LOCATION_NONE)
>              continue;
> -        if (vm->newDef &&
> -            (persistDisk = virDomainDiskByName(vm->newDef,
> -                                               vm->def->disks[i]->dst,
> -                                               false)))
> -            persist = true;
> 
>          ret = qemuDomainSnapshotCreateSingleDiskActive(driver, vm,
> -                                                       &snap->def->disks[i],
> -                                                       vm->def->disks[i],
> -                                                       persistDisk, actions,
> -                                                       reuse, asyncJob);
> +                                                       &diskdata[i],
> +                                                       actions, reuse, asyncJob);
> +
> +        /* without transaction support the change can't be rolled back */
> +        if (!actions)
> +            qemuDomainSnapshotUpdateDiskSources(&diskdata[i], &persist);
> +
>          if (ret < 0)
> -            break;
> +            goto cleanup;
> 
>          do_transaction = true;
>      }
> -    if (actions) {
> -        if (ret == 0 && do_transaction) {
> -            if (qemuDomainObjEnterMonitorAsync(driver, vm, asyncJob) == 0) {
> -                ret = qemuMonitorTransaction(priv->mon, actions);
> -                if (qemuDomainObjExitMonitor(driver, vm) < 0)
> -                    ret = -1;
> -            } else {
> -                /* failed to enter monitor, clean stuff up and quit */
> -                ret = -1;
> -            }
> -        } else {
> -            VIR_DEBUG("no disks to snapshot, skipping 'transaction' command");
> +
> +    if (actions && do_transaction) {
> +        if (qemuDomainObjEnterMonitorAsync(driver, vm, asyncJob) < 0)
> +            goto cleanup;
> +
> +        ret = qemuMonitorTransaction(priv->mon, actions);
> +
> +        if (qemuDomainObjExitMonitor(driver, vm) < 0 || ret < 0) {
> +            ret = -1;
> +            goto cleanup;
>          }
> 
> -        virJSONValueFree(actions);
> +        for (i = 0; i < snap->def->ndisks; i++)
> +            qemuDomainSnapshotUpdateDiskSources(&diskdata[i], &persist);
> +    }
> 
> -        if (ret < 0) {
> -            /* Transaction failed; undo the changes to vm.  */
> -            bool need_unlink = !(flags & VIR_DOMAIN_SNAPSHOT_CREATE_REUSE_EXT);
> -            while (i-- > 0) {
> -                virDomainDiskDefPtr persistDisk = NULL;
> + cleanup:
> +    if (ret < 0) {
> +        orig_err = virSaveLastError();
> +        for (i = 0; i < snap->def->ndisks; i++) {
> +            if (!diskdata[i].src)
> +                continue;
> 
> -                if (snap->def->disks[i].snapshot ==
> -                    VIR_DOMAIN_SNAPSHOT_LOCATION_NONE)
> -                    continue;
> -                if (vm->newDef &&
> -                    (persistDisk = virDomainDiskByName(vm->newDef,
> -                                                       vm->def->disks[i]->dst,
> -                                                       false)))
> -                    persist = true;
> -
> -                qemuDomainSnapshotUndoSingleDiskActive(driver, vm,
> -                                                       vm->def->disks[i],
> -                                                       persistDisk,
> -                                                       need_unlink);
> -            }
> +            if (diskdata[i].prepared)
> +                qemuDomainDiskChainElementRevoke(driver, vm, diskdata[i].src);
> +
> +            if (diskdata[i].created &&
> +                virStorageFileUnlink(diskdata[i].src) < 0)
> +                VIR_WARN("Unable to remove just-created %s", diskdata[i].src->path);
>          }
>      }
> 
> -    if (ret == 0 || !virQEMUCapsGet(priv->qemuCaps, QEMU_CAPS_TRANSACTION)) {
> +    if (ret == 0 || !actions) {

Wouldn't we want to save the status based on transactions? Should this
be "if ((!actions && do_transaction) || (actions && ret == 0))".

    /* Either a partial update has happened without transaction
     * support or a successful usage of actions to perform all
     * updates occurred, thus we need to save our state/config */
    if (do_transaction && (!actions || (actions && ret == 0))) {
    }

Trying to avoid a possible write of the file if no transactions were
completed that could cause an erroneous failure since we did nothing.

>          if (virDomainSaveStatus(driver->xmlopt, cfg->stateDir, vm, driver->caps) < 0 ||
>              (persist && virDomainSaveConfig(cfg->configDir, driver->caps,
>                                              vm->newDef) < 0))
>              ret = -1;

I know this just follows the previous code, but at this point there's no
more undoing and returning -1 would indicate something went wrong.
Nothing quick comes to mind on the best way to handle that, so we can
continue with this. Still we have an inconsistency between actual,
Status, and possibly Config.  Which is worse?

ACK with at least the leak fixed... It'd be good to handle saving things
better.  Whether there's anything than be done if one of the save's fail
- is perhaps a challenge for another day.

John

>      }
> +
> +    qemuDomainSnapshotDiskDataFree(diskdata, snap->def->ndisks, driver, vm);
> +    virJSONValueFree(actions);
>      virObjectUnref(cfg);
> 
> +    if (orig_err) {
> +        virSetError(orig_err);
> +        virFreeError(orig_err);
> +    }
> +
FWIW: If (ret < 0 && do_transactions && !actions), then we actually try
to save what we did and that could fail as well compounding our misery.
Still makes me wonder now - do we not save on error... no worse than
saving on error and having a failure for that save and really not
knowing WTF the "real deal" is!

>      return ret;
>  }
> 




More information about the libvir-list mailing list