[libvirt] [PATCH v1 09/21] qemu: Spawn qemu under mount namespace

Daniel P. Berrange berrange at redhat.com
Mon Dec 5 13:26:05 UTC 2016


On Thu, Nov 24, 2016 at 03:47:58PM +0100, Michal Privoznik wrote:
> Prime time. When it comes to spawning qemu process and
> relabelling all the devices it's going to touch, there's inherent
> race with other applications in the system (e.g. udev). Instead
> of trying convincing udev to not touch libvirt managed devices,
> we can create a separate mount namespace for the qemu, and mount
> our own /dev there. Of course this puts more work onto us as we
> have to maintain /dev files on each domain start and device
> hot(un-)plug. On the other hand, this enhances security also.
> 
> >From technical POV, on domain startup process the parent
> (libvirtd) creates:
> 
>   /var/lib/libvirt/qemu/$domain.dev
>   /var/lib/libvirt/qemu/$domain.devpts
> 
> The child (which is going to be qemu eventually) calls unshare()
> to create new mount namespace. From now on anything that child
> does is invisible to the parent. Child then mounts tmpfs on
> $domain.dev (so that it still sees original /dev from the host)
> and creates some devices (as explained in one of the previous
> patches). The devices have to be created exactly as they are in
> the host (including perms, seclabels, ACLs, ...). After that it
> moves $domain.dev mount to /dev.
> 
> What's the $domain.devpts mount there for then you ask? QEMU can
> create PTYs for some chardevs. And historically we exposed the
> host ends in our domain XML allowing users to connect to them.
> Therefore we must preserve devpts mount to be shared with the
> host's one.
> 
> To make this patch as small as possible, creating of devices
> configured for domain in question is implemented in next patches.

IIUC, this means that QEMU startup will be broken for any guests
that need host devices for disk, usb/pci passthrough, etc ?

It is nice to keep the work incremental though.  I wonder if there
is any value in doing everything *except* the MS_MOVE of
/var/lib/libvirt/qemu/$domain.dev -> /dev/. ie only turn it on once
all the code is in place ?


> Signed-off-by: Michal Privoznik <mprivozn at redhat.com>
> ---
>  src/qemu/qemu_domain.c  | 289 ++++++++++++++++++++++++++++++++++++++++++++++++
>  src/qemu/qemu_domain.h  |  10 ++
>  src/qemu/qemu_process.c |  13 +++
>  3 files changed, 312 insertions(+)
> 
> diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
> index 137d4d5..d6a1c29 100644
> --- a/src/qemu/qemu_domain.c
> +++ b/src/qemu/qemu_domain.c
> @@ -55,6 +55,9 @@
>  
>  #include <sys/time.h>
>  #include <fcntl.h>
> +#if defined(HAVE_SYS_MOUNT_H)
> +# include <sys/mount.h>
> +#endif
>  
>  #include <libxml/xpathInternals.h>
>  
> @@ -1627,6 +1630,9 @@ qemuDomainObjPrivateXMLFormat(virBufferPtr buf,
>                            virDomainChrTypeToString(priv->monConfig->type));
>      }
>  
> +    if (priv->containerized)
> +        virBufferAddLit(buf, "<containerized/>\n");

I wonder if it is worth explicitly listing the namespaces we enabled ?
eg

  <namespaces>
     <mount/>
  </namespaces>

so we're prepared to deal with us adding use of more private namespaces
in future ?

> +
>      qemuDomainObjPrivateXMLFormatVcpus(buf, vm->def);
>  
>      if (priv->qemuCaps) {
> @@ -1809,6 +1815,8 @@ qemuDomainObjPrivateXMLParse(xmlXPathContextPtr ctxt,
>          goto error;
>      }
>  
> +    priv->containerized = virXPathBoolean("count(./containerized) > 0", ctxt) > 0;
> +
>      if ((n = virXPathNodeSet("./vcpus/vcpu", ctxt, &nodes)) < 0)
>          goto error;
>  
> @@ -6653,3 +6661,284 @@ qemuDomainSupportsVideoVga(virDomainVideoDefPtr video,
>  
>      return true;
>  }
> +
> +
> +static int
> +qemuDomainCreateDevice(const char *device,
> +                       const char *path,
> +                       bool allow_noent)
> +{
> +    char *devicePath = NULL;
> +    struct stat sb;
> +    int ret = -1;
> +
> +    if (!STRPREFIX(device, "/dev")) {
> +        virReportError(VIR_ERR_INTERNAL_ERROR,
> +                       _("invalid device: %s"),
> +                       device);
> +        goto cleanup;
> +    }
> +
> +    if (virAsprintf(&devicePath, "%s/%s",
> +                    path, device + 4) < 0)
> +        goto cleanup;
> +
> +    if (stat(device, &sb) < 0) {
> +        if (errno == ENOENT && allow_noent) {
> +            /* Ignore non-existent device. */
> +            ret = 0;
> +            goto cleanup;
> +        }
> +
> +        virReportSystemError(errno, _("Unable to stat %s"), device);
> +        goto cleanup;
> +    }
> +
> +    if (virFileMakeParentPath(devicePath) < 0) {
> +        virReportSystemError(errno,
> +                             _("Unable to create %s"),
> +                             devicePath);
> +        goto cleanup;
> +    }
> +
> +    if (mknod(devicePath, sb.st_mode, sb.st_rdev) < 0) {
> +        virReportSystemError(errno,
> +                             _("Failed to make device %s"),
> +                             devicePath);
> +        goto cleanup;
> +    }
> +
> +    if (chown(devicePath, sb.st_uid, sb.st_gid) < 0) {
> +        virReportSystemError(errno,
> +                             _("Failed to chown device %s"),
> +                             devicePath);
> +        goto cleanup;
> +    }
> +
> +    if (virFileCopyACLs(device, devicePath) < 0 &&
> +        errno != ENOTSUP) {
> +        virReportSystemError(errno,
> +                             _("Failed to copy ACLs on device %s"),
> +                             devicePath);
> +        goto cleanup;
> +    }

Per my comments on earlier patches, I don't think we need to have
the chown or ACL copy. Just have the dev owned by root:root and
let our security drivers deal with the rest.

> +
> +    ret = 0;
> + cleanup:
> +    VIR_FREE(devicePath);
> +    return ret;
> +}
> +
> +
> +
> +static int
> +qemuDomainPopulateDevices(virQEMUDriverPtr driver,
> +                          virDomainObjPtr vm ATTRIBUTE_UNUSED,
> +                          const char *path)
> +{
> +    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
> +    const char *const *devices = (const char *const *) cfg->cgroupDeviceACL;
> +    size_t i;
> +    int ret = -1;
> +
> +    if (!devices)
> +        devices = defaultDeviceACL;
> +
> +    for (i = 0; devices[i]; i++) {
> +        if (qemuDomainCreateDevice(devices[i], path, true) < 0)
> +            goto cleanup;
> +    }
> +
> +    ret = 0;
> + cleanup:
> +    virObjectUnref(cfg);
> +    return ret;
> +}
> +
> +
> +static int
> +qemuDomainSetupDev(virQEMUDriverPtr driver,
> +                   virDomainObjPtr vm,
> +                   const char *path)
> +{
> +    char *mount_options = NULL;
> +    char *opts = NULL;
> +    int ret = -1;
> +
> +    VIR_DEBUG("Setting up /dev/ for domain %s", vm->def->name);
> +
> +    mount_options = virSecurityManagerGetMountOptions(driver->securityManager,
> +                                                      vm->def);
> +
> +    if (!mount_options &&
> +        VIR_STRDUP(mount_options, "") < 0)
> +        goto cleanup;
> +
> +    /*
> +     * tmpfs is limited to 64kb, since we only have device nodes in there
> +     * and don't want to DOS the entire OS RAM usage
> +     */
> +    if (virAsprintf(&opts,
> +                    "mode=755,size=65536%s", mount_options) < 0)
> +        goto cleanup;
> +
> +    if (virFileSetupDev(path, opts) < 0)
> +        goto cleanup;
> +
> +    if (qemuDomainPopulateDevices(driver, vm, path) < 0)
> +        goto cleanup;
> +
> +    ret = 0;
> + cleanup:
> +    VIR_FREE(opts);
> +    VIR_FREE(mount_options);
> +    return ret;
> +}
> +
> +
> +int
> +qemuDomainBuildNamespace(virQEMUDriverPtr driver,
> +                         virDomainObjPtr vm)
> +{
> +    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
> +    qemuDomainObjPrivatePtr priv = vm->privateData;
> +    const unsigned long mount_flags = MS_MOVE;
> +    char *devPath = NULL;
> +    char *devptsPath = NULL;
> +    int ret = -1;
> +
> +    if (!priv->containerized) {
> +        ret = 0;
> +        goto cleanup;
> +    }
> +
> +    if (virAsprintf(&devPath, "%s/%s.dev",
> +                    cfg->stateDir, vm->def->name) < 0 ||
> +        virAsprintf(&devptsPath, "%s/%s.devpts",
> +                    cfg->stateDir, vm->def->name) < 0)
> +        goto cleanup;
> +
> +    if (qemuDomainSetupDev(driver, vm, devPath) < 0)
> +        goto cleanup;
> +
> +    /* Save /dev/pts mount point because /dev/pts/NNN is exposed in our live
> +     * XML and other applications are supposed to be able to use it. */
> +    if (mount("/dev/pts", devptsPath, NULL, mount_flags, NULL) < 0) {
> +        virReportSystemError(errno, "%s",
> +                             _("Unable to move /dev/pts mount"));
> +        goto cleanup;
> +    }
> +
> +    if (mount(devPath, "/dev", NULL, mount_flags, NULL) < 0) {
> +        virReportSystemError(errno,
> +                             _("Failed to mount %s on /dev"),
> +                             devPath);
> +        goto cleanup;
> +    }
> +
> +    if (virFileMakePath("/dev/pts") < 0) {
> +        virReportSystemError(errno, "%s",
> +                             _("Cannot create /dev/pts"));
> +        goto cleanup;
> +    }
> +
> +    if (mount(devptsPath, "/dev/pts", NULL, mount_flags, NULL) < 0) {
> +        virReportSystemError(errno,
> +                             _("Failed to mount %s on /dev/pts"),
> +                             devptsPath);
> +        goto cleanup;
> +    }
> +
> +    if (virFileBindMountDevice("/dev/pts/ptmx", "/dev/ptmx") < 0)
> +        goto cleanup;
> +
> +    VIR_DEBUG("blaaah: %d", system("find /dev/ -ls > /tmp/blaaah"));
> +    VIR_DEBUG("blaaah: %d", system("echo >> /tmp/blaaah"));
> +    VIR_DEBUG("blaaah: %d", system("mount >> /tmp/blaaah"));

Heh, :-)


> +    ret = 0;
> + cleanup:
> +    virObjectUnref(cfg);
> +    VIR_FREE(devPath);
> +    return ret;
> +}
> +
> +
> +int
> +qemuDomainCreateNamespace(virQEMUDriverPtr driver,
> +                          virDomainObjPtr vm)
> +{
> +    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
> +    qemuDomainObjPrivatePtr priv = vm->privateData;
> +    int ret = -1;
> +    char *path = NULL;
> +
> +#if !defined(__linux__)
> +    /* Namespaces are Linux specific. On other platforms just
> +     * carry on with the old behaviour. */
> +    return 0;
> +#endif

This feels quite likely to create compiler warnings on non-linux
about unreachable code. It feels like we should just stub out
the entire method instead.

> +
> +    if (!virQEMUDriverIsPrivileged(driver)) {
> +        ret = 0;
> +        goto cleanup;
> +    }
> +
> +    if (virAsprintf(&path, "%s/%s.dev",
> +                    cfg->stateDir, vm->def->name) < 0)
> +        goto cleanup;
> +
> +    if (virFileMakePath(path) < 0) {
> +        virReportSystemError(errno,
> +                             _("Failed to create %s"),
> +                             path);
> +        goto cleanup;
> +    }
> +
> +    VIR_FREE(path);
> +    if (virAsprintf(&path, "%s/%s.devpts",
> +                    cfg->stateDir, vm->def->name) < 0)
> +        goto cleanup;
> +
> +    if (virFileMakePath(path) < 0) {
> +        virReportSystemError(errno,
> +                             _("Failed to create %s"),
> +                             path);
> +        goto cleanup;
> +    }
> +
> +    priv->containerized = true;
> +    ret = 0;
> + cleanup:
> +    VIR_FREE(path);
> +    virObjectUnref(cfg);
> +    return ret;
> +}
> +
> +
> +void
> +qemuDomainDeleteNamespace(virQEMUDriverPtr driver,
> +                          virDomainObjPtr vm)
> +{
> +    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
> +    qemuDomainObjPrivatePtr priv = vm->privateData;
> +    char *path;
> +
> +    if (!priv->containerized)
> +        return;
> +
> +    if (virAsprintf(&path, "%s/%s.dev",
> +                    cfg->stateDir, vm->def->name) < 0)
> +        goto cleanup;
> +
> +    virFileDeleteTree(path);
> +
> +    VIR_FREE(path);
> +    if (virAsprintf(&path, "%s/%s.devpts",
> +                    cfg->stateDir, vm->def->name) < 0)
> +        goto cleanup;
> +
> +    virFileDeleteTree(path);
> + cleanup:
> +    virObjectUnref(cfg);
> +    VIR_FREE(path);
> +}

This is running in the host namespace and custom /dev tmpfs
was only ever mounted in the QEMU private namespace, so
should be invisible to the host. As such, IIUC, these
directories should both be 100% empty. IOW, it seems that
we don't need virFileDeleteTree - plain rmdir should be
sufficient and safer against accidents.

> diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h
> index 7650ff3..9c34476 100644
> --- a/src/qemu/qemu_domain.h
> +++ b/src/qemu/qemu_domain.h
> @@ -170,6 +170,8 @@ typedef qemuDomainObjPrivate *qemuDomainObjPrivatePtr;
>  struct _qemuDomainObjPrivate {
>      struct qemuDomainJobObj job;
>  
> +    bool containerized;
> +
>      qemuMonitorPtr mon;
>      virDomainChrSourceDefPtr monConfig;
>      bool monJSON;
> @@ -785,4 +787,12 @@ int qemuDomainCheckMonitor(virQEMUDriverPtr driver,
>  bool qemuDomainSupportsVideoVga(virDomainVideoDefPtr video,
>                                  virQEMUCapsPtr qemuCaps);
>  
> +int qemuDomainBuildNamespace(virQEMUDriverPtr driver,
> +                             virDomainObjPtr vm);
> +
> +int qemuDomainCreateNamespace(virQEMUDriverPtr driver,
> +                              virDomainObjPtr vm);
> +
> +void qemuDomainDeleteNamespace(virQEMUDriverPtr driver,
> +                               virDomainObjPtr vm);
>  #endif /* __QEMU_DOMAIN_H__ */
> diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
> index 93a38e1..4f64dd1 100644
> --- a/src/qemu/qemu_process.c
> +++ b/src/qemu/qemu_process.c
> @@ -2661,6 +2661,12 @@ static int qemuProcessHook(void *data)
>      if (virSecurityManagerClearSocketLabel(h->driver->securityManager, h->vm->def) < 0)
>          goto cleanup;
>  
> +    if (virProcessSetupPrivateMountNS() < 0)
> +        goto cleanup;
> +
> +    if (qemuDomainBuildNamespace(h->driver, h->vm) < 0)
> +        goto cleanup;
> +
>      if (virDomainNumatuneGetMode(h->vm->def->numa, -1, &mode) == 0) {
>          if (mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT &&
>              h->cfg->cgroupControllers & (1 << VIR_CGROUP_CONTROLLER_CPUSET) &&
> @@ -5434,6 +5440,11 @@ qemuProcessLaunch(virConnectPtr conn,
>  
>      qemuDomainLogContextMarkPosition(logCtxt);
>  
> +    VIR_DEBUG("Building mount namespace");
> +
> +    if (qemuDomainCreateNamespace(driver, vm) < 0)
> +        goto cleanup;
> +
>      VIR_DEBUG("Clear emulator capabilities: %d",
>                cfg->clearEmulatorCapabilities);
>      if (cfg->clearEmulatorCapabilities)
> @@ -6210,6 +6221,8 @@ void qemuProcessStop(virQEMUDriverPtr driver,
>          }
>      }
>  
> +    qemuDomainDeleteNamespace(driver, vm);
> +
>      vm->taint = 0;
>      vm->pid = -1;
>      virDomainObjSetState(vm, VIR_DOMAIN_SHUTOFF, reason);

Regards,
Daniel
-- 
|: http://berrange.com      -o-    http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org              -o-             http://virt-manager.org :|
|: http://entangle-photo.org       -o-    http://search.cpan.org/~danberr/ :|




More information about the libvir-list mailing list