[libvirt] PATCH: 4/4: Add pivot_root support to container

Daniel P. Berrange berrange at redhat.com
Wed Aug 13 14:25:07 UTC 2008


This isn't really related to the others, except for the fact that it is part
of the LXC driver. This is a re-post of the patch I did for adding support
for pivot_root() in the container. This allows the entire container FS
to be separated from the parent OS. As noted previously, this is not currently
secure, since Linux has not yet gained device namespace support, but it is a
stepping stone toward the final solution we'll need. Since the last posting
I've added the explicit chmod() suggested by Jim, and the various other
bug fixes.

 lxc_container.c |  265 +++++++++++++++++++++++++++++++++++++++++++++++++-------
 util.c          |   12 +-
 2 files changed, 237 insertions(+), 40 deletions(-)


Daniel

diff -r 17f02fec7fe8 src/lxc_container.c
--- a/src/lxc_container.c	Wed Aug 13 14:39:28 2008 +0100
+++ b/src/lxc_container.c	Wed Aug 13 14:39:34 2008 +0100
@@ -1,10 +1,12 @@
 /*
  * Copyright IBM Corp. 2008
+ * Copyright Red Hat 2008
  *
  * lxc_container.c: file description
  *
  * Authors:
  *  David L. Leskovec <dlesko at linux.vnet.ibm.com>
+ *  Daniel P. Berrange <berrange at redhat.com>
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -26,10 +28,18 @@
 #include <fcntl.h>
 #include <limits.h>
 #include <stdlib.h>
+#include <stdio.h>
 #include <sys/ioctl.h>
 #include <sys/mount.h>
 #include <sys/wait.h>
 #include <unistd.h>
+#include <mntent.h>
+
+/* Yes, we want linux private one, for _syscall2() macro */
+#include <linux/unistd.h>
+
+/* For MS_MOVE */
+#include <linux/fs.h>
 
 #include "lxc_container.h"
 #include "util.h"
@@ -103,23 +113,15 @@
  *
  * Returns 0 on success or -1 in case of error
  */
-static int lxcContainerSetStdio(int control, const char *ttyPath)
+static int lxcContainerSetStdio(int control, int ttyfd)
 {
     int rc = -1;
-    int ttyfd;
     int open_max, i;
 
     if (setsid() < 0) {
         lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
                  _("setsid failed: %s"), strerror(errno));
-        goto error_out;
-    }
-
-    ttyfd = open(ttyPath, O_RDWR|O_NOCTTY);
-    if (ttyfd < 0) {
-        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
-                 _("open(%s) failed: %s"), ttyPath, strerror(errno));
-        goto error_out;
+        goto cleanup;
     }
 
     if (ioctl(ttyfd, TIOCSCTTY, NULL) < 0) {
@@ -156,9 +158,6 @@
     rc = 0;
 
 cleanup:
-    close(ttyfd);
-
-error_out:
     return rc;
 }
 
@@ -221,6 +220,7 @@
     return 0;
 }
 
+
 /**
  * lxcEnableInterfaces:
  * @vm: Pointer to vm structure
@@ -251,6 +251,20 @@
     return rc;
 }
 
+
+//_syscall2(int, pivot_root, char *, newroot, const char *, oldroot)
+extern int pivot_root(const char * new_root,const char * put_old);
+
+static int lxcContainerChildMountSort(const void *a, const void *b)
+{
+  const char **sa = (const char**)a;
+  const char **sb = (const char**)b;
+
+  /* Deliberately reversed args - descending strcmp() order sorts child
+     paths ahead of their parents, so deepest mounts get unmounted first */
+  return strcmp(*sb, *sa);
+}
+
 /**
  * lxcChild:
  * @argv: Pointer to container arguments
@@ -268,8 +282,8 @@
     int rc = -1;
     lxc_child_argv_t *argv = data;
     virDomainDefPtr vmDef = argv->config;
-    virDomainFSDefPtr curMount;
-    int i;
+    virDomainFSDefPtr tmp, root = NULL;
+    int ttyfd, i;
 
     if (NULL == vmDef) {
         lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
@@ -277,37 +291,222 @@
         return -1;
     }
 
+#if 0
+    ttyfd = open(argv->ttyPath, O_RDWR|O_NOCTTY);
+    if (ttyfd < 0) {
+        lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+                 _("open(%s) failed: %s"), argv->ttyPath, strerror(errno));
+        return -1;
+    }
+#endif
+
     /* handle the bind mounts first before doing anything else that may */
     /* then access those mounted dirs */
-    curMount = vmDef->fss;
-    for (i = 0; curMount; curMount = curMount->next) {
-        // XXX fix
-        if (curMount->type != VIR_DOMAIN_FS_TYPE_MOUNT)
+    for (tmp = vmDef->fss; tmp && !root; tmp = tmp->next) {
+        if (tmp->type != VIR_DOMAIN_FS_TYPE_MOUNT)
             continue;
-        rc = mount(curMount->src,
-                   curMount->dst,
-                   NULL,
-                   MS_BIND,
-                   NULL);
-        if (0 != rc) {
+        if (STREQ(tmp->dst, "/"))
+            root = tmp;
+    }
+
+    if (root) {
+        char *oldroot;
+        struct mntent *mntent;
+        char **mounts = NULL;
+        int nmounts = 0;
+        FILE *procmnt;
+        const struct {
+            int maj;
+            int min;
+            mode_t mode;
+            const char *path;
+        } devs[] = {
+            { 1, 3, 0666, "/dev/null" },
+            { 1, 5, 0666, "/dev/zero" },
+            { 1, 7, 0666, "/dev/full" },
+            { 5, 1, 0600, "/dev/console" },
+            { 1, 8, 0666, "/dev/random" },
+            { 1, 9, 0666, "/dev/urandom" },
+        };
+
+        /* Got a FS mapped to /, we're going the pivot_root
+           approach to do a better-chroot-than-chroot */
+
+        /* this is based on this thread http://lkml.org/lkml/2008/3/5/29 */
+
+        /* First step is to ensure the new root itself is
+           a mount point */
+        if (mount(root->src, root->src, NULL, MS_BIND, NULL) < 0) {
             lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
-                     _("failed to mount %s at %s for container: %s"),
-                     curMount->src, curMount->dst, strerror(errno));
+                     _("failed to bind new root %s: %s"),
+                     root->src, strerror(errno));
+            return -1;
+        }
+
+        if (asprintf(&oldroot, "%s/.oldroot", root->src) < 0) {
+            lxcError(NULL, NULL, VIR_ERR_NO_MEMORY, NULL);
+            return -1;
+        }
+
+        if (virFileMakePath(oldroot) < 0) {
+            lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+                     _("failed to create %s: %s"),
+                     oldroot, strerror(errno));
+            return -1;
+        }
+
+        /* The old root directory will live at /.oldroot after
+         * this and will soon be unmounted completely */
+        if (pivot_root(root->src, oldroot) < 0) {
+            lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+                     _("failed to pivot root %s to %s: %s"),
+                     oldroot, root->src, strerror(errno));
+            return -1;
+        }
+
+        /* CWD is undefined after pivot_root, so go to / */
+        if (chdir("/") < 0) {
+            return -1;
+        }
+
+        if (virFileMakePath("/proc") < 0 ||
+            mount("none", "/proc", "proc", 0, NULL) < 0) {
+            lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+                     _("failed to mount /proc for container: %s"),
+                     strerror(errno));
+            return -1;
+        }
+        if (virFileMakePath("/dev") < 0 ||
+            mount("none", "/dev", "tmpfs", 0, NULL) < 0) {
+            lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+                     _("failed to mount /dev tmpfs for container: %s"),
+                     strerror(errno));
+            return -1;
+        }
+        /* Move old devpts into container, since we have to
+           connect to the master ptmx which was opened in
+           the parent.
+           XXX This sucks, we need to figure out how to get our
+           own private devpts for isolation
+        */
+        if (virFileMakePath("/dev/pts") < 0 ||
+            mount("/.oldroot/dev/pts", "/dev/pts", NULL,
+                  MS_MOVE, NULL) < 0) {
+            lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+                     _("failed to move /dev/pts into container: %s"),
+                     strerror(errno));
+            return -1;
+        }
+
+        /* Populate /dev/ with a few important bits */
+        for (i = 0 ; i < ARRAY_CARDINALITY(devs) ; i++) {
+            dev_t dev = makedev(devs[i].maj, devs[i].min);
+            if (mknod(devs[i].path, 0, dev) < 0 ||
+                chmod(devs[i].path, devs[i].mode)) {
+                lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+                         _("failed to make device %s: %s"),
+                         devs[i].path, strerror(errno));
+                return -1;
+            }
+        }
+
+        /* Pull in rest of container's mounts */
+        for (tmp = vmDef->fss; tmp; tmp = tmp->next) {
+            char *src;
+            if (STREQ(tmp->dst, "/"))
+                continue;
+            // XXX fix
+            if (tmp->type != VIR_DOMAIN_FS_TYPE_MOUNT)
+                continue;
+
+            if (asprintf(&src, "/.oldroot/%s", tmp->src) < 0)
+                return -1;
+
+            if (virFileMakePath(tmp->dst) < 0 ||
+                mount(src, tmp->dst, NULL, MS_BIND, NULL) < 0) {
+                VIR_FREE(src);
+                lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+                         _("failed to mount %s at %s for container: %s"),
+                         tmp->src, tmp->dst, strerror(errno));
+                return -1;
+            }
+            VIR_FREE(src);
+        }
+
+        if (!(procmnt = setmntent("/proc/mounts", "r"))) {
+            lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+                     _("failed to read /proc/mounts: %s"),
+                     strerror(errno));
+            return -1;
+        }
+        while ((mntent = getmntent(procmnt)) != NULL) {
+            if (!STRPREFIX(mntent->mnt_dir, "/.oldroot"))
+                continue;
+            if (VIR_REALLOC_N(mounts, nmounts+1) < 0) {
+                endmntent(procmnt);
+                return -1;
+            }
+            if (!(mounts[nmounts++] = strdup(mntent->mnt_dir))) {
+                endmntent(procmnt);
+                return -1;
+            }
+        }
+        endmntent(procmnt);
+
+        qsort(mounts, nmounts, sizeof(mounts[0]),
+              lxcContainerChildMountSort);
+
+        for (i = 0 ; i < nmounts ; i++) {
+            if (umount(mounts[i]) < 0) {
+                lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+                         _("failed to unmount %s: %s"),
+                         mounts[i], strerror(errno));
+                return -1;
+            }
+            VIR_FREE(mounts[i]);
+        }
+        VIR_FREE(mounts);
+    } else {
+        /* Nothing mapped to /, we're using the main root,
+           but with extra stuff mapped in */
+        for (tmp = vmDef->fss; tmp; tmp = tmp->next) {
+            // XXX fix
+            if (tmp->type != VIR_DOMAIN_FS_TYPE_MOUNT)
+                continue;
+            rc = mount(tmp->src,
+                       tmp->dst,
+                       NULL,
+                       MS_BIND,
+                       NULL);
+            if (0 != rc) {
+                lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+                         _("failed to mount %s at %s for container: %s"),
+                         tmp->src, tmp->dst, strerror(errno));
+                return -1;
+            }
+        }
+
+        /* mount /proc */
+        if (mount("lxcproc", "/proc", "proc", 0, NULL) < 0) {
+            lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
+                     _("failed to mount /proc for container: %s"),
+                     strerror(errno));
             return -1;
         }
     }
 
-    /* mount /proc */
-    rc = mount("lxcproc", "/proc", "proc", 0, NULL);
-    if (0 != rc) {
+    ttyfd = open(argv->ttyPath, O_RDWR|O_NOCTTY);
+    if (ttyfd < 0) {
         lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR,
-                 _("failed to mount /proc for container: %s"),
-                 strerror(errno));
+                 _("open(%s) failed: %s"), argv->ttyPath, strerror(errno));
         return -1;
     }
 
-    if (lxcContainerSetStdio(argv->monitor, argv->ttyPath) < 0)
+    if (lxcContainerSetStdio(argv->monitor, ttyfd) < 0) {
+        close(ttyfd);
         return -1;
+    }
+    close(ttyfd);
 
     /* Wait for interface devices to show up */
     if (lxcContainerWaitForContinue(argv->monitor) < 0)
diff -r 17f02fec7fe8 src/util.c
--- a/src/util.c	Wed Aug 13 14:39:28 2008 +0100
+++ b/src/util.c	Wed Aug 13 14:39:34 2008 +0100
@@ -604,13 +604,11 @@
     if (!(p = strrchr(parent, '/')))
         return EINVAL;
 
-    if (p == parent)
-        return EPERM;
-
-    *p = '\0';
-
-    if ((err = virFileMakePath(parent)))
-        return err;
+    if (p != parent) {
+        *p = '\0';
+        if ((err = virFileMakePath(parent)))
+            return err;
+    }
 
     if (mkdir(path, 0777) < 0 && errno != EEXIST)
         return errno;

-- 
|: Red Hat, Engineering, London   -o-   http://people.redhat.com/berrange/ :|
|: http://libvirt.org  -o-  http://virt-manager.org  -o-  http://ovirt.org :|
|: http://autobuild.org       -o-         http://search.cpan.org/~danberr/ :|
|: GnuPG: 7D3B9505  -o-  F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|




More information about the libvir-list mailing list