[Libguestfs] [PATCH nbdkit] filters: Add copy-on-write filter.

Richard W.M. Jones rjones at redhat.com
Sun Jan 21 22:08:15 UTC 2018


Here's the patch (on top of the preceding one) which uses a bitmap
instead of SEEK_DATA.

Rich.

-- 
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
Read my programming and virtualization blog: http://rwmj.wordpress.com
libguestfs lets you edit virtual machines.  Supports shell scripting,
bindings from many languages.  http://libguestfs.org
-------------- next part --------------
>From e9f7ff5ea68f2a0391a3319cef1bf9e3f5581942 Mon Sep 17 00:00:00 2001
From: "Richard W.M. Jones" <rjones at redhat.com>
Date: Sun, 21 Jan 2018 21:52:26 +0000
Subject: [PATCH] filters: cow: Modify cow filter to use a bitmap.

---
 configure.ac            |   4 --
 filters/Makefile.am     |   5 +-
 filters/cow/Makefile.am |   3 -
 filters/cow/cow.c       | 179 +++++++++++++++++++++++++++++-------------------
 tests/Makefile.am       |   2 -
 5 files changed, 111 insertions(+), 82 deletions(-)

diff --git a/configure.ac b/configure.ac
index aa7f406..1091d27 100644
--- a/configure.ac
+++ b/configure.ac
@@ -483,10 +483,6 @@ AC_SUBST([VDDK_LIBS])
 AC_DEFINE_UNQUOTED([VDDK_LIBDIR],["$VDDK_LIBDIR"],[VDDK 'libDir'.])
 AM_CONDITIONAL([HAVE_VDDK],[test "x$VDDK_LIBS" != "x"])
 
-dnl Check for <linux/fs.h>, optional but needed for COW filter.
-AC_CHECK_HEADER([linux/fs.h], [have_linux_fs_h=yes])
-AM_CONDITIONAL([HAVE_COW_FILTER], [test "x$have_linux_fs_h" = "xyes"])
-
 dnl Produce output files.
 AC_CONFIG_HEADERS([config.h])
 AC_CONFIG_FILES([nbdkit],
diff --git a/filters/Makefile.am b/filters/Makefile.am
index 15a1995..7e6fe5a 100644
--- a/filters/Makefile.am
+++ b/filters/Makefile.am
@@ -31,10 +31,7 @@
 # SUCH DAMAGE.
 
 SUBDIRS = \
+	cow \
 	delay \
 	offset \
 	partition
-
-if HAVE_COW_FILTER
-SUBDIRS += cow
-endif
diff --git a/filters/cow/Makefile.am b/filters/cow/Makefile.am
index 5b8ae5a..31526d8 100644
--- a/filters/cow/Makefile.am
+++ b/filters/cow/Makefile.am
@@ -32,8 +32,6 @@
 
 EXTRA_DIST = nbdkit-cow-filter.pod
 
-if HAVE_COW_FILTER
-
 CLEANFILES = *~
 
 filterdir = $(libdir)/nbdkit/filters
@@ -62,4 +60,3 @@ nbdkit-cow-filter.1: nbdkit-cow-filter.pod
 	mv $@.t $@
 
 endif
-endif
diff --git a/filters/cow/cow.c b/filters/cow/cow.c
index 287c94e..2b023af 100644
--- a/filters/cow/cow.c
+++ b/filters/cow/cow.c
@@ -38,20 +38,22 @@
  * takes up no space.
  *
  * We confine all pread/pwrite operations to the filesystem block
- * size.  The blk_read and blk_write functions below always happen on
- * whole filesystem block boundaries.  A smaller-than-block-size
- * pwrite will turn into a read-modify-write of a whole block.  We
- * also assume that the plugin returns the same immutable data for
- * each pread call we make, and optimize on this basis.
+ * size.  The blk_* functions below only work on whole filesystem
+ * block boundaries.  A smaller-than-block-size pwrite will turn into
+ * a read-modify-write of a whole block.  We also assume that the
+ * plugin returns the same immutable data for each pread call we make,
+ * and optimize on this basis.
  *
- * When reading a block we first check the temporary file to see if
- * that file block is allocated or a hole.  If allocated, we return it
- * from the temporary file.  If a hole, we issue a pread to the
- * underlying plugin.
+ * A block bitmap is maintained in memory recording if each block in
+ * the temporary file is "allocated" (1) or "hole" (0).
+ *
+ * When reading a block we first check the bitmap to see if that file
+ * block is allocated or a hole.  If allocated, we return it from the
+ * temporary file.  If a hole, we issue a pread to the underlying
+ * plugin.
  *
  * When writing a block we unconditionally write the data to the
- * temporary file (allocating a block in that file if it wasn't
- * before).
+ * temporary file, setting the bit in the bitmap.
  *
  * No locking is needed for blk_* calls, but there is a potential
  * problem of multiple pwrite calls doing a read-modify-write cycle
@@ -75,18 +77,27 @@
 #include <errno.h>
 #include <sys/types.h>
 #include <sys/ioctl.h>
-#include <linux/fs.h>
 
 #include <nbdkit-filter.h>
 
+#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
+
 /* XXX See design comment above. */
 #define THREAD_MODEL NBDKIT_THREAD_MODEL_SERIALIZE_ALL_REQUESTS
 
+/* Size of a block in the overlay.  A 4K block size means that we need
+ * 32 MB of memory to store the bitmap for a 1 TB underlying image.
+ */
+#define BLKSIZE 4096
+
 /* The temporary overlay. */
 static int fd = -1;
 
-/* The filesystem block size. */
-static int blksize;
+/* Bitmap.  Bit 1 = allocated, 0 = hole. */
+static uint8_t *bitmap;
+
+/* Size of the bitmap in bytes. */
+static uint64_t bm_size;
 
 static void
 cow_load (void)
@@ -112,17 +123,6 @@ cow_load (void)
   }
 
   unlink (template);
-
-  if (ioctl (fd, FIGETBSZ, &blksize) == -1) {
-    nbdkit_error ("ioctl: FIGETBSZ: %m");
-    exit (EXIT_FAILURE);
-  }
-  if (blksize <= 0) {
-    nbdkit_error ("filesystem block size is < 0 or cannot be read");
-    exit (EXIT_FAILURE);
-  }
-
-  nbdkit_debug ("cow: filesystem block size: %d", blksize);
 }
 
 static void
@@ -147,6 +147,34 @@ cow_open (nbdkit_next_open *next, void *nxdata, int readonly)
   return &handle;
 }
 
+/* Allocate or resize the bitmap (one bit per BLKSIZE block; new bytes
+ * are zeroed, ie. "hole") and the overlay.  Returns 0, or -1 on error.
+ */
+static int
+blk_set_size (uint64_t new_size)
+{
+  uint8_t *new_bm;
+  const size_t old_bm_size = bm_size;
+  size_t new_bm_size = DIV_ROUND_UP (new_size, BLKSIZE*8);
+  /* realloc (p, 0) may free p and return NULL, so always ask for >= 1. */
+  new_bm = realloc (bitmap, new_bm_size > 0 ? new_bm_size : 1);
+  if (new_bm == NULL) {
+    nbdkit_error ("realloc: %m");
+    return -1;
+  }
+  bitmap = new_bm;
+  bm_size = new_bm_size;
+  if (old_bm_size < new_bm_size)
+    memset (&bitmap[old_bm_size], 0, new_bm_size-old_bm_size);
+
+  /* Print bm_size (uint64_t): a size_t need not match PRIu64. */
+  nbdkit_debug ("cow: bitmap resized to %" PRIu64 " bytes", bm_size);
+  if (ftruncate (fd, new_size) == -1) {
+    nbdkit_error ("ftruncate: %m");
+    return -1;
+  }
+  return 0;
+}
+
 /* Get the file size and ensure the overlay is the correct size. */
 static int64_t
 cow_get_size (struct nbdkit_next_ops *next_ops, void *nxdata,
@@ -158,11 +186,11 @@ cow_get_size (struct nbdkit_next_ops *next_ops, void *nxdata,
   if (size == -1)
     return -1;
 
-  if (ftruncate (fd, size) == -1)
-    return -1;
-
   nbdkit_debug ("cow: underlying file size: %" PRIi64, size);
 
+  if (blk_set_size (size))
+    return -1;
+
   return size;
 }
 
@@ -200,6 +228,36 @@ cow_can_flush (struct nbdkit_next_ops *next_ops, void *nxdata, void *handle)
   return 1;
 }
 
+/* Look up a block's bit in the bitmap: true = allocated, false = hole. */
+static bool
+blk_is_allocated (uint64_t blknum)
+{
+  const uint64_t byte = blknum >> 3;        /* index into the bitmap */
+  const unsigned mask = 1u << (blknum & 7); /* bit within that byte */
+
+  if (byte < bm_size)
+    return (bitmap[byte] & mask) != 0;
+
+  /* Blocks beyond the end of the bitmap are reported as holes. */
+  nbdkit_debug ("blk_is_allocated: block number is out of range");
+  return false;
+}
+
+/* Set a block's bit in the bitmap, recording it as allocated. */
+static void
+blk_set_allocated (uint64_t blknum)
+{
+  const uint64_t byte = blknum >> 3;        /* index into the bitmap */
+  const unsigned mask = 1u << (blknum & 7); /* bit within that byte */
+
+  if (byte < bm_size) {
+    bitmap[byte] |= mask;
+    return;
+  }
+  /* Out-of-range blocks are only logged; the bitmap is left unchanged. */
+  nbdkit_debug ("blk_set_allocated: block number is out of range");
+}
+
 /* These are the block operations.  They always read or write a single
  * whole block of size ‘blksize’.
  */
@@ -207,36 +265,17 @@ static int
 blk_read (struct nbdkit_next_ops *next_ops, void *nxdata,
           uint64_t blknum, uint8_t *block)
 {
-  off_t offset = blknum * blksize, roffset;
-  bool hole;
-
-  nbdkit_debug ("cow: blk_read block %" PRIu64 " (offset %" PRIu64 ")",
-                blknum, (uint64_t) offset);
+  off_t offset = blknum * BLKSIZE;
+  bool allocated = blk_is_allocated (blknum);
 
-  /* Find out if the current block contains data or is a hole. */
-  roffset = lseek (fd, offset, SEEK_DATA);
-  if (roffset == -1) {
-    /* Undocumented?  Anyway if SEEK_DATA returns ENXIO it means
-     * "there are no more data regions past the supplied offset", ie.
-     * we're in a hole.
-     */
-    if (errno == ENXIO)
-      hole = true;
-    else {
-      nbdkit_error ("lseek: SEEK_DATA: %m");
-      return -1;
-    }
-  }
-  else
-    hole = offset != roffset;
-
-  nbdkit_debug ("cow: block %" PRIu64 " is %s",
-                blknum, hole ? "a hole" : "allocated");
+  nbdkit_debug ("cow: blk_read block %" PRIu64 " (offset %" PRIu64 ") is %s",
+                blknum, (uint64_t) offset,
+                !allocated ? "a hole" : "allocated");
 
-  if (hole)                     /* Read underlying plugin. */
-    return next_ops->pread (nxdata, block, blksize, offset);
+  if (!allocated)               /* Read underlying plugin. */
+    return next_ops->pread (nxdata, block, BLKSIZE, offset);
   else {                        /* Read overlay. */
-    if (pread (fd, block, blksize, offset) == -1) {
+    if (pread (fd, block, BLKSIZE, offset) == -1) {
       nbdkit_error ("pread: %m");
       return -1;
     }
@@ -247,15 +286,17 @@ blk_read (struct nbdkit_next_ops *next_ops, void *nxdata,
 static int
 blk_write (uint64_t blknum, const uint8_t *block)
 {
-  off_t offset = blknum * blksize;
+  off_t offset = blknum * BLKSIZE;
 
   nbdkit_debug ("cow: blk_write block %" PRIu64 " (offset %" PRIu64 ")",
                 blknum, (uint64_t) offset);
 
-  if (pwrite (fd, block, blksize, offset) == -1) {
+  if (pwrite (fd, block, BLKSIZE, offset) == -1) {
     nbdkit_error ("pwrite: %m");
     return -1;
   }
+  blk_set_allocated (blknum);
+
   return 0;
 }
 
@@ -266,7 +307,7 @@ cow_pread (struct nbdkit_next_ops *next_ops, void *nxdata,
 {
   uint8_t *block;
 
-  block = malloc (blksize);
+  block = malloc (BLKSIZE);
   if (block == NULL) {
     nbdkit_error ("malloc: %m");
     return -1;
@@ -275,9 +316,9 @@ cow_pread (struct nbdkit_next_ops *next_ops, void *nxdata,
   while (count > 0) {
     uint64_t blknum, blkoffs, n;
 
-    blknum = offset / blksize;  /* block number */
-    blkoffs = offset % blksize; /* offset within the block */
-    n = blksize - blkoffs;      /* max bytes we can read from this block */
+    blknum = offset / BLKSIZE;  /* block number */
+    blkoffs = offset % BLKSIZE; /* offset within the block */
+    n = BLKSIZE - blkoffs;      /* max bytes we can read from this block */
     if (n > count)
       n = count;
 
@@ -304,7 +345,7 @@ cow_pwrite (struct nbdkit_next_ops *next_ops, void *nxdata,
 {
   uint8_t *block;
 
-  block = malloc (blksize);
+  block = malloc (BLKSIZE);
   if (block == NULL) {
     nbdkit_error ("malloc: %m");
     return -1;
@@ -313,9 +354,9 @@ cow_pwrite (struct nbdkit_next_ops *next_ops, void *nxdata,
   while (count > 0) {
     uint64_t blknum, blkoffs, n;
 
-    blknum = offset / blksize;  /* block number */
-    blkoffs = offset % blksize; /* offset within the block */
-    n = blksize - blkoffs;      /* max bytes we can read from this block */
+    blknum = offset / BLKSIZE;  /* block number */
+    blkoffs = offset % BLKSIZE; /* offset within the block */
+    n = BLKSIZE - blkoffs;      /* max bytes we can read from this block */
     if (n > count)
       n = count;
 
@@ -346,7 +387,7 @@ cow_zero (struct nbdkit_next_ops *next_ops, void *nxdata,
 {
   uint8_t *block;
 
-  block = malloc (blksize);
+  block = malloc (BLKSIZE);
   if (block == NULL) {
     nbdkit_error ("malloc: %m");
     return -1;
@@ -355,9 +396,9 @@ cow_zero (struct nbdkit_next_ops *next_ops, void *nxdata,
   while (count > 0) {
     uint64_t blknum, blkoffs, n;
 
-    blknum = offset / blksize;  /* block number */
-    blkoffs = offset % blksize; /* offset within the block */
-    n = blksize - blkoffs;      /* max bytes we can read from this block */
+    blknum = offset / BLKSIZE;  /* block number */
+    blkoffs = offset % BLKSIZE; /* offset within the block */
+    n = BLKSIZE - blkoffs;      /* max bytes we can read from this block */
     if (n > count)
       n = count;
 
diff --git a/tests/Makefile.am b/tests/Makefile.am
index ae22801..b073f22 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -413,9 +413,7 @@ endif HAVE_RUBY
 # Tests of filters.
 
 # cow filter test.
-if HAVE_COW_FILTER
 TESTS += test-cow.sh
-endif HAVE_COW_FILTER
 
 # delay filter test.
 check_PROGRAMS += test-delay
-- 
2.15.1



More information about the Libguestfs mailing list