[libvirt] [PATCHv2 20/20] snapshot: qemu: Implement reverting of external snapshots

Eric Blake eblake at redhat.com
Tue Nov 6 23:37:37 UTC 2012


On 11/01/2012 10:22 AM, Peter Krempa wrote:
> This patch adds support for reverting of external snapshots. The support
> is somewhat limited yet (you can only revert to a snapshot that has no
> children or delete the children that would have their image chains
> invalidated).

Sounds like we need to work on the snapshot-create-and-revert operation;
I'll see if I can propose something for that.

> 
> While reverting an external snapshot, the domain has to be taken offline
> as there's no possiblity to start a migration from file on a running

s/possiblity/possibility/

> machine. This poses a few dificulties when the user has virt-viewer

s/dificulties/difficulties/

> attached as apropriate events need to be re-emited even if the machine

s/apropriate/appropriate/
s/re-emited/re-emitted/

> doesn't change states.

The need to start a new qemu process in order to revert to an external
checkpoint sounds like a case for the already-existing
VIR_DOMAIN_SNAPSHOT_REVERT_FORCE - if that flag is missing, then we
refuse to revert to a running state, just as we refuse to revert across
guest ABI changes; when the flag is present, the user is admitting that
spawning a new qemu process (and thus temporarily breaking the
virt-viewer connection) is acceptable.
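
Something along these lines, perhaps (just a sketch; the exact
condition and message wording would need more thought):

    if (virDomainObjIsActive(vm) &&
        (snap->def->state == VIR_DOMAIN_RUNNING ||
         snap->def->state == VIR_DOMAIN_PAUSED) &&
        !(flags & VIR_DOMAIN_SNAPSHOT_REVERT_FORCE)) {
        /* reverting externally means killing the current qemu and
         * spawning a new one, which breaks existing client connections */
        virReportError(VIR_ERR_SNAPSHOT_REVERT_RISKY, "%s",
                       _("revert to external snapshot requires "
                         "starting a new qemu process; retry with "
                         "the FORCE flag to accept that"));
        goto cleanup;
    }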

> ---
>  src/qemu/qemu_driver.c | 333 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 328 insertions(+), 5 deletions(-)
> 
> diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c
> index 46b7e3a..dd4c216 100644
> --- a/src/qemu/qemu_driver.c
> +++ b/src/qemu/qemu_driver.c
> @@ -12012,6 +12012,327 @@ qemuDomainSnapshotRevertInactive(struct qemud_driver *driver,
>      return ret > 0 ? -1 : ret;
>  }
> 
> +/* helper to create lifecycle for possible outcomes of reverting snapshots */
> +static int
> +qemuDomainSnapshotCreateEvent(struct qemud_driver *driver,
> +                              virDomainObjPtr vm,
> +                              virDomainState old_state,
> +                              virDomainState new_state)
> +{
> +    int reason = 0; /* generic unknown reason */
> +    int type = -1;
> +    virDomainEventPtr event;
> +
> +    switch (new_state) {
> +    case VIR_DOMAIN_RUNNING:
> +        switch (old_state) {
> +        case VIR_DOMAIN_NOSTATE:
> +        case VIR_DOMAIN_SHUTOFF:
> +        case VIR_DOMAIN_CRASHED:
> +        case VIR_DOMAIN_RUNNING:

Do we need an EVENT_STARTED even when we were previously running?  Or
is that only needed if we had to spawn a new qemu process (in which
case this function might need a bool parameter saying whether we are
reusing the existing qemu process or creating a new one)?
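
In other words, maybe something like this (the extra parameter and its
name are just an illustration of the idea, not a demand):

    static int
    qemuDomainSnapshotCreateEvent(struct qemud_driver *driver,
                                  virDomainObjPtr vm,
                                  virDomainState old_state,
                                  virDomainState new_state,
                                  bool started_new_process)
    {
    ...
        case VIR_DOMAIN_RUNNING:
            switch (old_state) {
            case VIR_DOMAIN_RUNNING:
                if (!started_new_process)
                    break;   /* same qemu process kept running, no event */
                /* fallthrough - a fresh process was spawned */
            case VIR_DOMAIN_NOSTATE:
            case VIR_DOMAIN_SHUTOFF:
            case VIR_DOMAIN_CRASHED:
            case VIR_DOMAIN_BLOCKED:
            case VIR_DOMAIN_SHUTDOWN:
                type = VIR_DOMAIN_EVENT_STARTED;
                reason = VIR_DOMAIN_EVENT_STARTED_FROM_SNAPSHOT;
                break;
    ...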

> +        case VIR_DOMAIN_BLOCKED:
> +        case VIR_DOMAIN_SHUTDOWN:
> +            type = VIR_DOMAIN_EVENT_STARTED;
> +            reason = VIR_DOMAIN_EVENT_STARTED_FROM_SNAPSHOT;
> +            break;
> +
> +        case VIR_DOMAIN_PMSUSPENDED:
> +        case VIR_DOMAIN_PAUSED:
> +            type = VIR_DOMAIN_EVENT_RESUMED;
> +            reason = VIR_DOMAIN_EVENT_RESUMED_FROM_SNAPSHOT;
> +            break;
> +
> +       case VIR_DOMAIN_LAST:
> +            /* no event */
> +            break;
> +        }
> +        break;
> +
> +    case VIR_DOMAIN_PAUSED:
> +        switch (old_state) {
> +        case VIR_DOMAIN_NOSTATE:
> +        case VIR_DOMAIN_SHUTOFF:
> +        case VIR_DOMAIN_CRASHED:
> +            /* the machine was started here, so we need an aditional event */

s/aditional/additional/

> +            event = virDomainEventNewFromObj(vm,
> +                                             VIR_DOMAIN_EVENT_STARTED,
> +                                             VIR_DOMAIN_EVENT_STARTED_FROM_SNAPSHOT);
> +            if (!event)
> +                goto no_memory;
> +            qemuDomainEventQueue(driver, event);
> +            /* fallthrough! */
> +        case VIR_DOMAIN_PMSUSPENDED:
> +        case VIR_DOMAIN_RUNNING:
> +        case VIR_DOMAIN_SHUTDOWN:
> +        case VIR_DOMAIN_BLOCKED:
> +            type = VIR_DOMAIN_EVENT_SUSPENDED;
> +            reason = VIR_DOMAIN_EVENT_SUSPENDED_FROM_SNAPSHOT;
> +            break;
> +
> +        case VIR_DOMAIN_PAUSED:
> +        case VIR_DOMAIN_LAST:
> +            /* no event */
> +            break;
> +        }
> +        break;
> +
> +    case VIR_DOMAIN_SHUTOFF:
> +        switch (old_state) {
> +        case VIR_DOMAIN_NOSTATE:
> +        case VIR_DOMAIN_BLOCKED:
> +        case VIR_DOMAIN_SHUTDOWN:
> +        case VIR_DOMAIN_PAUSED:
> +        case VIR_DOMAIN_RUNNING:
> +        case VIR_DOMAIN_PMSUSPENDED:
> +            type = VIR_DOMAIN_EVENT_STOPPED;
> +            reason = VIR_DOMAIN_EVENT_STOPPED_FROM_SNAPSHOT;
> +            break;
> +
> +        case VIR_DOMAIN_SHUTOFF:
> +        case VIR_DOMAIN_CRASHED:
> +        case VIR_DOMAIN_LAST:
> +            /* no event */
> +            break;
> +        }
> +        break;
> +
> +    case VIR_DOMAIN_PMSUSPENDED:
> +        switch (old_state) {
> +        case VIR_DOMAIN_NOSTATE:
> +        case VIR_DOMAIN_SHUTOFF:
> +        case VIR_DOMAIN_CRASHED:
> +            /* the machine was started here, so we need an aditional event */

s/aditional/additional/

> +            event = virDomainEventNewFromObj(vm,
> +                                             VIR_DOMAIN_EVENT_STARTED,
> +                                             VIR_DOMAIN_EVENT_STARTED_FROM_SNAPSHOT);
> +            if (!event)
> +                goto no_memory;
> +
> +            qemuDomainEventQueue(driver, event);
> +            /* fallthrough! */
> +        case VIR_DOMAIN_BLOCKED:
> +        case VIR_DOMAIN_SHUTDOWN:
> +        case VIR_DOMAIN_PAUSED:
> +        case VIR_DOMAIN_RUNNING:
> +            type = VIR_DOMAIN_EVENT_PMSUSPENDED;
> +            reason = VIR_DOMAIN_EVENT_PMSUSPENDED_FROM_SNAPSHOT;
> +            break;

Back to my questions on patch 17 about whether we even have the
capability of reverting to S3 state, and how we have to be careful
about the difference between S3 (the qemu process continues to exist)
and S4 (which looks like SHUTOFF except for how the guest will boot
next).

> +
> +        case VIR_DOMAIN_PMSUSPENDED:
> +        case VIR_DOMAIN_LAST:
> +            /* no event */
> +            break;
> +        }
> +        break;
> +
> +    /* fallthrough */
> +    case VIR_DOMAIN_NOSTATE:
> +    case VIR_DOMAIN_BLOCKED:
> +    case VIR_DOMAIN_SHUTDOWN:
> +    case VIR_DOMAIN_CRASHED:
> +    case VIR_DOMAIN_LAST:
> +        virReportError(VIR_ERR_INTERNAL_ERROR,
> +                       _("Suspicious domain state '%d' after snapshot"),
> +                       new_state);
> +        return -1;
> +    }
> +
> +    if (!(event = virDomainEventNewFromObj(vm, type, reason)))
> +        goto no_memory;
> +
> +    qemuDomainEventQueue(driver, event);
> +
> +    return 0;
> +
> +no_memory:
> +    virReportOOMError();
> +    return -1;
> +}

Is this helper function worth splitting into a separate commit, rather
than trying to add it and external reverts all at once?

> +
> +
> +/* The domain and driver is expected to be locked */
> +static int
> +qemuDomainSnapshotRevertExternal(virConnectPtr conn,
> +                                 struct qemud_driver *driver,
> +                                 virDomainObjPtr vm,
> +                                 virDomainSnapshotObjPtr snap,
> +                                 unsigned int flags)
> +{
> +    int ret = -1;
> +    int loadfd = -1;
> +
> +    virDomainDefPtr save_def = NULL;
> +    struct qemud_save_header header;
> +    virFileWrapperFdPtr wrapperFd = NULL;
> +
> +    virDomainState old_state = virDomainObjGetState(vm, NULL);
> +    virDomainDefPtr config = NULL;
> +
> +    struct qemu_snap_remove rem;
> +
> +    /* check if snapshot has children */
> +    if (snap->nchildren > 0 &&
> +        !(flags & VIR_DOMAIN_SNAPSHOT_REVERT_FORCE)) {
> +        virReportError(VIR_ERR_SNAPSHOT_REVERT_RISKY,
> +                       _("Can't revert to snapshot '%s'. "
> +                         "Snapshot has children."), snap->def->name);
> +        goto cleanup;
> +    }

This says that if the user _does_ pass FORCE, then we proceed with the
revert anyway, even though it invalidates the children.  I wonder if
instead we need a flag that says 'revert to this snapshot and
simultaneously delete its children'.
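
For that idea, roughly what I have in mind (the flag name is
hypothetical and would of course need a separate API addition):

    /* VIR_DOMAIN_SNAPSHOT_REVERT_DELETE_CHILDREN does not exist yet */
    if (snap->nchildren > 0 &&
        !(flags & VIR_DOMAIN_SNAPSHOT_REVERT_DELETE_CHILDREN)) {
        virReportError(VIR_ERR_SNAPSHOT_REVERT_RISKY,
                       _("Can't revert to snapshot '%s'; it has children "
                         "whose image chains would be invalidated"),
                       snap->def->name);
        goto cleanup;
    }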

Also, in the case of external snapshots, we have two plausible points of
where to revert to.  For symmetry with internal checkpoints, the default
with no flag has to be reverting to the point in time where the snapshot
was taken.  But for external snapshots, if we allow creation of snapshot
branches (again, back to my create-and-revert idea), then it makes sense
to revert to the named snapshot _and all subsequent changes_ tracked by
the external file; sounds like yet another flag worth adding.

> +
> +    if (vm->current_snapshot) {
> +        vm->current_snapshot->def->current = false;
> +        if (qemuDomainSnapshotWriteMetadata(vm, vm->current_snapshot,
> +                                            driver->snapshotDir) < 0)
> +            goto cleanup;
> +        vm->current_snapshot = NULL;
> +        /* XXX Should we restore vm->current_snapshot after this point
> +         * in the failure cases where we know there was no change?  */
> +    }
> +
> +    /* Prepare to copy the snapshot inactive xml as the config of this
> +     * domain.  Easiest way is by a round trip through xml.
> +     *
> +     * XXX Should domain snapshots track live xml rather
> +     * than inactive xml?  */
> +    snap->def->current = true;
> +    if (snap->def->dom) {
> +        char *xml;
> +        if (!(xml = qemuDomainDefFormatXML(driver,
> +                                           snap->def->dom,
> +                                           VIR_DOMAIN_XML_INACTIVE |
> +                                           VIR_DOMAIN_XML_SECURE |
> +                                           VIR_DOMAIN_XML_MIGRATABLE)))
> +            goto cleanup;
> +
> +        config = virDomainDefParseString(driver->caps, xml,
> +                                         QEMU_EXPECTED_VIRT_TYPES,
> +                                         VIR_DOMAIN_XML_INACTIVE);
> +        VIR_FREE(xml);
> +        if (!config)
> +            goto cleanup;
> +    } else {
> +        virReportError(VIR_ERR_INTERNAL_ERROR,
> +                       _("Domain definition not found in snapshot '%s'"),
> +                       snap->def->name);
> +        goto cleanup;
> +    }
> +
> +    /* try to open the memory save image if one exists */
> +    if (snap->def->memory == VIR_DOMAIN_SNAPSHOT_LOCATION_EXTERNAL) {
> +        if (!snap->def->file || !virFileExists(snap->def->file)) {
> +            virReportError(VIR_ERR_INTERNAL_ERROR,
> +                           _("Memory save file '%s' does not exist."),
> +                           snap->def->file);

NULL dereference if !snap->def->file.  Your error message needs to be
different for disk-only snapshots than for missing memory file for
external checkpoint.
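
Perhaps split the check, something like this (at least to avoid the
NULL dereference; the disk-only case probably deserves its own error
earlier on):

    if (snap->def->memory == VIR_DOMAIN_SNAPSHOT_LOCATION_EXTERNAL) {
        if (!snap->def->file) {
            virReportError(VIR_ERR_INTERNAL_ERROR,
                           _("Snapshot '%s' has no memory save image"),
                           snap->def->name);
            goto cleanup;
        }
        if (!virFileExists(snap->def->file)) {
            virReportError(VIR_ERR_INTERNAL_ERROR,
                           _("Memory save file '%s' does not exist."),
                           snap->def->file);
            goto cleanup;
        }
        /* ... open the save image as before ... */
    }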

> +            goto cleanup;
> +        }
> +
> +        /* open the save image file */
> +        if ((loadfd = qemuDomainSaveImageOpen(driver, snap->def->file,
> +                                              &save_def, &header,
> +                                              false, &wrapperFd, NULL,
> +                                              -1, false, false)) < 0)
> +            goto cleanup;
> +
> +        /* opening succeeded, there's a big probability the revert will work.
> +         * We can now get rid of the active qemu, if it runs */
> +    }
> +
> +    if (qemuDomainObjBeginJobWithDriver(driver, vm, QEMU_JOB_MODIFY) < 0)
> +        goto cleanup;
> +
> +    /* Now we destroy the (possibly) running qemu process.
> +     * Unfortunately, the guest can be restored only using incomming migration.

s/incomming/incoming/

> +     */
> +    if (virDomainObjIsActive(vm)) {
> +        qemuProcessStop(driver, vm,
> +                        VIR_DOMAIN_SHUTOFF_FROM_SNAPSHOT, 0);
> +        virDomainAuditStop(vm, "from-snapshot");
> +    }
> +
> +    /* remove snapshot descendatns, as we're invalidating the image chain */

s/descendatns/descendants/

> +
> +    rem.driver = driver;
> +    rem.vm = vm;
> +    rem.metadata_only = false;
> +    rem.err = 0;
> +    rem.current = false;
> +    virDomainSnapshotForEachDescendant(snap,
> +                                       qemuDomainSnapshotDiscardAll,
> +                                       &rem);
> +    if (rem.err < 0)
> +        goto endjob;
> +
> +    /* assign domain definition */
> +    virDomainObjAssignDef(vm, config, false);
> +    config = NULL;
> +
> +    /* wipe and re-create disk images */
> +    if (qemuDomainSnapshotCreateInactiveExternal(driver, vm, snap, true) < 0)
> +        goto endjob;

Oh, here I see where you are passing 'true' for the reuse parameter,
after I complained that patch 15 was always passing false.  But there
is a difference between the user passing
VIR_DOMAIN_SNAPSHOT_CREATE_REUSE_EXT (we should reuse the file exactly
as-is, under the assumption that the user created it as a wrapper
around the existing file with no contents of its own) and the revert
case (the wrapper is NOT thin, so it needs to be re-created or at least
wiped to be made thin again).  I'm wondering if this will need to be
made a tri-state parameter, or whether more work is needed here to
actually wipe things.
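
If it does end up as a tri-state, maybe something like this (names
invented on the spot, purely to show the idea):

    /* hypothetical replacement for the bool 'reuse' parameter */
    typedef enum {
        QEMU_SNAP_FILE_CREATE,    /* create the overlay from scratch */
        QEMU_SNAP_FILE_REUSE,     /* REUSE_EXT: trust the user's wrapper as-is */
        QEMU_SNAP_FILE_RECREATE,  /* revert: wipe/re-create to make the wrapper thin */
    } qemuSnapshotFileOp;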

> +
> +    /* save the config files */
> +    if (virDomainSaveConfig(driver->configDir, vm->def) < 0) {
> +        virReportError(VIR_ERR_OPERATION_INVALID, "%s",
> +                       _("Write to config file failed"));
> +        goto endjob;
> +    }
> +
> +    /* re-start the guest if necessary */
> +    if (!(flags & VIR_DOMAIN_SNAPSHOT_REVERT_STOPPED) &&
> +        (snap->def->state == VIR_DOMAIN_RUNNING ||
> +         snap->def->state == VIR_DOMAIN_PAUSED ||
> +         flags & VIR_DOMAIN_SNAPSHOT_REVERT_RUNNING ||
> +         flags & VIR_DOMAIN_SNAPSHOT_REVERT_PAUSED)) {
> +
> +        if (loadfd < 0) {
> +            virReportError(VIR_ERR_INVALID_ARG, "%s",
> +                           _("Can't revert snapshot to running state. "
> +                             "Memory image not found."));
> +            goto endjob;
> +        }

I'm wondering if some of these errors should be detected up front,
rather than only after we have already shut down a previously running
domain in preparation for loading the new state.
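
That is, compute the desired end state and validate it before the
qemuProcessStop() call, reusing your existing condition - something
like:

    /* check this before tearing down the existing process */
    bool revert_running =
        !(flags & VIR_DOMAIN_SNAPSHOT_REVERT_STOPPED) &&
        (snap->def->state == VIR_DOMAIN_RUNNING ||
         snap->def->state == VIR_DOMAIN_PAUSED ||
         flags & VIR_DOMAIN_SNAPSHOT_REVERT_RUNNING ||
         flags & VIR_DOMAIN_SNAPSHOT_REVERT_PAUSED);

    if (revert_running && loadfd < 0) {
        virReportError(VIR_ERR_INVALID_ARG, "%s",
                       _("Can't revert snapshot to running state. "
                         "Memory image not found."));
        goto cleanup;    /* the old qemu process is still intact */
    }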

> +
...
> @@ -12096,7 +12419,6 @@ static int qemuDomainRevertToSnapshot(virDomainSnapshotPtr snapshot,
>          }
>      }
> 
> -
>      if (vm->current_snapshot) {
>          vm->current_snapshot->def->current = false;
>          if (qemuDomainSnapshotWriteMetadata(vm, vm->current_snapshot,

Spurious whitespace change?

> @@ -12129,6 +12451,7 @@ static int qemuDomainRevertToSnapshot(virDomainSnapshotPtr snapshot,
>              goto cleanup;
>      }
> 
> +    /* Internal snapshot revert code starts below */
>      if (qemuDomainObjBeginJobWithDriver(driver, vm, QEMU_JOB_MODIFY) < 0)
>          goto cleanup;
> 

-- 
Eric Blake   eblake at redhat.com    +1-919-301-3266
Libvirt virtualization library http://libvirt.org
