[libvirt] PATCH: 7/7: Allow replacing root filesystem

Daniel P. Berrange berrange at redhat.com
Tue Aug 5 16:24:29 UTC 2008


The LXC driver currently allows custom mount points to be set up inside the
container. This only works for non-root mount points. You cannot replace 
the entire root filesystem. This patch adds support for replacing the entire
root filesystem, thus allowing the use of LXC containers as a 'better chroot
than chroot'. Well, with one minor flaw - the Linux kernel currently has no
device namespace virtualization, so the admin inside the container can just
do a 'mknod' and access the real devices of the host. So for now this patch
doesn't make LXC containers secure, but a future kernel release will enable
it to be secure.

 lxc_container.c |  253 ++++++++++++++++++++++++++++++++++++++++++++++++--------
 util.c          |   12 +-
 2 files changed, 226 insertions(+), 39 deletions(-)

Daniel

diff -r eaa42985aed4 src/lxc_container.c
--- a/src/lxc_container.c	Tue Aug 05 16:50:59 2008 +0100
+++ b/src/lxc_container.c	Tue Aug 05 16:51:14 2008 +0100
@@ -1,10 +1,12 @@
 /*
  * Copyright IBM Corp. 2008
+ * Copyright Red Hat 2008
  *
  * lxc_container.c: file description
  *
  * Authors:
  *  David L. Leskovec <dlesko at linux.vnet.ibm.com>
+ *  Daniel P. Berrange <berrange at redhat.com>
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -28,10 +30,18 @@
 #include <fcntl.h>
 #include <limits.h>
 #include <stdlib.h>
+#include <stdio.h>
 #include <sys/ioctl.h>
 #include <sys/mount.h>
 #include <sys/wait.h>
 #include <unistd.h>
+#include <mntent.h>
+
+/* Yes, we want linux private one, for _syscall2() macro */
+#include <linux/unistd.h>
+
+/* For MS_MOVE */
+#include <linux/fs.h>
 
 #include "lxc_container.h"
 #include "util.h"
@@ -105,23 +115,15 @@
  *
  * Returns 0 on success or -1 in case of error
  */
-static int lxcContainerSetStdio(int control, const char *ttyPath)
+static int lxcContainerSetStdio(int control, int ttyfd)
 {
     int rc = -1;
-    int ttyfd;
     int open_max, i;
 
     if (setsid() < 0) {
         lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                  _("setsid failed: %s"), strerror(errno));
-        goto error_out;
-    }
-
-    ttyfd = open(ttyPath, O_RDWR|O_NOCTTY);
-    if (ttyfd < 0) {
-        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
-                 _("open(%s) failed: %s"), ttyPath, strerror(errno));
-        goto error_out;
+        goto cleanup;
     }
 
     if (ioctl(ttyfd, TIOCSCTTY, NULL) < 0) {
@@ -159,8 +161,6 @@
 
 cleanup:
     close(ttyfd);
-
-error_out:
     return rc;
 }
 
@@ -223,6 +223,7 @@
     return 0;
 }
 
+
 /**
  * lxcEnableInterfaces:
  * @vm: Pointer to vm structure
@@ -252,6 +253,20 @@
     return rc;
 }
 
+
+//_syscall2(int, pivot_root, char *, newroot, const char *, oldroot)
+extern int pivot_root(const char * new_root,const char * put_old);
+
+static int lxcContainerChildMountSort(const void *a, const void *b)
+{
+  const char **sa = (const char**)a; /* each argument points at an array */
+  const char **sb = (const char**)b; /* element, which is itself a string */
+
+  /* Deliberately reversed args - yields descending order, so a path
+     sorts before its own prefix and deepest children unmount first */
+  return strcmp(*sb, *sa);
+}
+
 /**
  * lxcChild:
  * @argv: Pointer to container arguments
@@ -269,8 +284,8 @@
     int rc = -1;
     lxc_child_argv_t *argv = data;
     virDomainDefPtr vmDef = argv->config;
-    virDomainFSDefPtr curMount;
-    int i;
+    virDomainFSDefPtr tmp, root = NULL;
+    int ttyfd, i;
 
     if (NULL == vmDef) {
         lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
@@ -278,36 +293,210 @@
         return -1;
     }
 
+#if 0
+    ttyfd = open(argv->ttyPath, O_RDWR|O_NOCTTY);
+    if (ttyfd < 0) {
+        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+                 _("open(%s) failed: %s"), argv->ttyPath, strerror(errno));
+        return -1;
+    }
+#endif
+
     /* handle the bind mounts first before doing anything else that may */
     /* then access those mounted dirs */
-    curMount = vmDef->fss;
-    for (i = 0; curMount; curMount = curMount->next) {
-        // XXX fix
-        if (curMount->type != VIR_DOMAIN_FS_TYPE_MOUNT)
+    for (tmp = vmDef->fss; tmp && !root; tmp = tmp->next) {
+        if (tmp->type != VIR_DOMAIN_FS_TYPE_MOUNT)
             continue;
-        rc = mount(curMount->src,
-                   curMount->dst,
-                   NULL,
-                   MS_BIND,
-                   NULL);
-        if (0 != rc) {
+        if (STREQ(tmp->dst, "/"))
+            root = tmp;
+    }
+
+    if (root) {
+        char *oldroot;
+        struct mntent *mntent;
+        char **mounts = NULL;
+        int nmounts = 0;
+        FILE *procmnt;
+        struct {
+            int maj;
+            int min;
+            const char *path;
+        } devs[] = {
+            { 1, 3, "/dev/null" },
+            { 1, 5, "/dev/zero" },
+            { 1, 7, "/dev/full" },
+            { 5, 1, "/dev/console" },
+        };
+
+        /* Got a FS mapped to /, we're going the pivot_root
+           approach to do a better-chroot-than-chroot */
+
+        /* this is based on this thread http://lkml.org/lkml/2008/3/5/29 */
+
+        /* First step is to ensure the new root itself is
+           a mount point */
+        if (mount(root->src, root->src, NULL, MS_BIND, NULL) < 0) {
             lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
-                     _("failed to mount %s at %s for container: %s"),
-                     curMount->src, curMount->dst, strerror(errno));
+                     _("failed to bind new root %s: %s"),
+                     root->src, strerror(errno));
+            return -1;
+        }
+
+        if (asprintf(&oldroot, "%s/.oldroot", root->src) < 0) {
+            lxcError(NULL, NULL, VIR_ERR_NO_MEMORY, NULL);
+            return -1;
+        }
+
+        if (virFileMakePath(oldroot) < 0) {
+            lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+                     _("failed to create %s: %s"),
+                     oldroot, strerror(errno));
+            return -1;
+        }
+
+        /* The old root directory will live at /.oldroot after
+         * this and will soon be unmounted completely */
+        if (pivot_root(root->src, oldroot) < 0) {
+            lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+                     _("failed to pivot root %s to %s: %s"),
+                     oldroot, root->src, strerror(errno));
+            return -1;
+        }
+
+        /* CWD is undefined after pivot_root, so go to / */
+        if (chdir("/") < 0) {
+            return -1;
+        }
+
+        if (virFileMakePath("/proc") < 0 ||
+            mount("none", "/proc", "proc", 0, NULL) < 0) {
+            lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+                     _("failed to mount /proc for container: %s"),
+                     strerror(errno));
+            return -1;
+        }
+        if (virFileMakePath("/dev") < 0 ||
+            mount("none", "/dev", "tmpfs", 0, NULL) < 0) {
+            lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+                     _("failed to mount /dev tmpfs for container: %s"),
+                     strerror(errno));
+            return -1;
+        }
+        /* Move old devpts into container, since we have to
+           connect to the master ptmx which was opened in
+           the parent.
+           XXX This sucks, we need to figure out how to get our
+           own private devpts for isolation
+        */
+        if (virFileMakePath("/dev/pts") < 0 ||
+            mount("/.oldroot/dev/pts", "/dev/pts", NULL,
+                  MS_MOVE, NULL) < 0) {
+            lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+                     _("failed to move /dev/pts into container: %s"),
+                     strerror(errno));
+            return -1;
+        }
+
+        /* Populate /dev/ with a few important bits */
+        umask(0);
+        for (i = 0 ; i < ARRAY_CARDINALITY(devs) ; i++) {
+            dev_t dev = makedev(devs[i].maj, devs[i].min);
+            if (mknod(devs[i].path,
+                      0777 | S_IFCHR,
+                      dev) < 0) {
+                lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+                         _("failed to make device %s: %s"),
+                         devs[i].path, strerror(errno));
+                return -1;
+            }
+        }
+        umask(0700);
+
+        /* Pull in rest of container's mounts */
+        for (tmp = vmDef->fss; tmp; tmp = tmp->next) {
+            char *src;
+            if (STREQ(tmp->dst, "/"))
+                continue;
+            // XXX fix
+            if (tmp->type != VIR_DOMAIN_FS_TYPE_MOUNT)
+                continue;
+
+            if (asprintf(&src, "/.oldroot/%s", tmp->src) < 0)
+                return -1;
+
+            if (virFileMakePath(tmp->dst) < 0 ||
+                mount(src, tmp->dst, NULL, MS_BIND, NULL) < 0) {
+                lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+                         _("failed to mount %s at %s for container: %s"),
+                         tmp->src, tmp->dst, strerror(errno));
+                return -1;
+            }
+            VIR_FREE(src);
+        }
+
+        if (!(procmnt = setmntent("/proc/mounts", "r"))) {
+            lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+                     _("failed to read /proc/mounts: %s"),
+                     strerror(errno));
+            return -1;
+        }
+        while ((mntent = getmntent(procmnt)) != NULL) {
+            if (!STRPREFIX(mntent->mnt_dir, "/.oldroot"))
+                continue;
+            if (VIR_REALLOC_N(mounts, nmounts+1) < 0)
+                return -1;
+            mounts[nmounts++] = strdup(mntent->mnt_dir);
+        }
+        endmntent(procmnt);
+
+        qsort(mounts, nmounts, sizeof(mounts[0]),
+              lxcContainerChildMountSort);
+
+        for (i = 0 ; i < nmounts ; i++) {
+            if (umount(mounts[i]) < 0) {
+                lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+                         _("failed to unmount %s: %s"),
+                         mounts[i], strerror(errno));
+                return -1;
+            }
+        }
+    } else {
+        /* Nothing mapped to /, we're using the main root,
+           but with extra stuff mapped in */
+        for (tmp = vmDef->fss; tmp; tmp = tmp->next) {
+            // XXX fix
+            if (tmp->type != VIR_DOMAIN_FS_TYPE_MOUNT)
+                continue;
+            rc = mount(tmp->src,
+                       tmp->dst,
+                       NULL,
+                       MS_BIND,
+                       NULL);
+            if (0 != rc) {
+                lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+                         _("failed to mount %s at %s for container: %s"),
+                         tmp->src, tmp->dst, strerror(errno));
+                return -1;
+            }
+        }
+
+        /* mount /proc */
+        if (mount("lxcproc", "/proc", "proc", 0, NULL) < 0) {
+            lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+                     _("failed to mount /proc for container: %s"),
+                     strerror(errno));
             return -1;
         }
     }
 
-    /* mount /proc */
-    rc = mount("lxcproc", "/proc", "proc", 0, NULL);
-    if (0 != rc) {
+    ttyfd = open(argv->ttyPath, O_RDWR|O_NOCTTY);
+    if (ttyfd < 0) {
         lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
-                 _("failed to mount /proc for container: %s"),
-                 strerror(errno));
+                 _("open(%s) failed: %s"), argv->ttyPath, strerror(errno));
         return -1;
     }
 
-    if (lxcContainerSetStdio(argv->monitor, argv->ttyPath) < 0)
+    if (lxcContainerSetStdio(argv->monitor, ttyfd) < 0)
         return -1;
 
     /* Wait for interface devices to show up */
diff -r eaa42985aed4 src/util.c
--- a/src/util.c	Tue Aug 05 16:50:59 2008 +0100
+++ b/src/util.c	Tue Aug 05 16:51:14 2008 +0100
@@ -524,13 +524,11 @@
     if (!(p = strrchr(parent, '/')))
         return EINVAL;
 
-    if (p == parent)
-        return EPERM;
-
-    *p = '\0';
-
-    if ((err = virFileMakePath(parent)))
-        return err;
+    if (p != parent) {
+        *p = '\0';
+        if ((err = virFileMakePath(parent)))
+            return err;
+    }
 
     if (mkdir(path, 0777) < 0 && errno != EEXIST)
         return errno;

-- 
|: Red Hat, Engineering, London   -o-   http://people.redhat.com/berrange/ :|
|: http://libvirt.org  -o-  http://virt-manager.org  -o-  http://ovirt.org :|
|: http://autobuild.org       -o-         http://search.cpan.org/~danberr/ :|
|: GnuPG: 7D3B9505  -o-  F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|




More information about the libvir-list mailing list