[Libguestfs] [PATCH nbdkit] filters: Add copy-on-write filter.
Richard W.M. Jones
rjones at redhat.com
Sun Jan 21 22:08:15 UTC 2018
Here's the patch (on top of the preceding one) which uses a bitmap
instead of SEEK_DATA.
Rich.
--
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
Read my programming and virtualization blog: http://rwmj.wordpress.com
libguestfs lets you edit virtual machines. Supports shell scripting,
bindings from many languages. http://libguestfs.org
-------------- next part --------------
From e9f7ff5ea68f2a0391a3319cef1bf9e3f5581942 Mon Sep 17 00:00:00 2001
From: "Richard W.M. Jones" <rjones at redhat.com>
Date: Sun, 21 Jan 2018 21:52:26 +0000
Subject: [PATCH] filters: cow: Modify cow filter to use a bitmap.
---
configure.ac | 4 --
filters/Makefile.am | 5 +-
filters/cow/Makefile.am | 3 -
filters/cow/cow.c | 179 +++++++++++++++++++++++++++++-------------------
tests/Makefile.am | 2 -
5 files changed, 111 insertions(+), 82 deletions(-)
diff --git a/configure.ac b/configure.ac
index aa7f406..1091d27 100644
--- a/configure.ac
+++ b/configure.ac
@@ -483,10 +483,6 @@ AC_SUBST([VDDK_LIBS])
AC_DEFINE_UNQUOTED([VDDK_LIBDIR],["$VDDK_LIBDIR"],[VDDK 'libDir'.])
AM_CONDITIONAL([HAVE_VDDK],[test "x$VDDK_LIBS" != "x"])
-dnl Check for <linux/fs.h>, optional but needed for COW filter.
-AC_CHECK_HEADER([linux/fs.h], [have_linux_fs_h=yes])
-AM_CONDITIONAL([HAVE_COW_FILTER], [test "x$have_linux_fs_h" = "xyes"])
-
dnl Produce output files.
AC_CONFIG_HEADERS([config.h])
AC_CONFIG_FILES([nbdkit],
diff --git a/filters/Makefile.am b/filters/Makefile.am
index 15a1995..7e6fe5a 100644
--- a/filters/Makefile.am
+++ b/filters/Makefile.am
@@ -31,10 +31,7 @@
# SUCH DAMAGE.
SUBDIRS = \
+ cow \
delay \
offset \
partition
-
-if HAVE_COW_FILTER
-SUBDIRS += cow
-endif
diff --git a/filters/cow/Makefile.am b/filters/cow/Makefile.am
index 5b8ae5a..31526d8 100644
--- a/filters/cow/Makefile.am
+++ b/filters/cow/Makefile.am
@@ -32,8 +32,6 @@
EXTRA_DIST = nbdkit-cow-filter.pod
-if HAVE_COW_FILTER
-
CLEANFILES = *~
filterdir = $(libdir)/nbdkit/filters
@@ -62,4 +60,3 @@ nbdkit-cow-filter.1: nbdkit-cow-filter.pod
mv $@.t $@
endif
-endif
diff --git a/filters/cow/cow.c b/filters/cow/cow.c
index 287c94e..2b023af 100644
--- a/filters/cow/cow.c
+++ b/filters/cow/cow.c
@@ -38,20 +38,22 @@
* takes up no space.
*
* We confine all pread/pwrite operations to the filesystem block
- * size. The blk_read and blk_write functions below always happen on
- * whole filesystem block boundaries. A smaller-than-block-size
- * pwrite will turn into a read-modify-write of a whole block. We
- * also assume that the plugin returns the same immutable data for
- * each pread call we make, and optimize on this basis.
+ * size. The blk_* functions below only work on whole filesystem
+ * block boundaries. A smaller-than-block-size pwrite will turn into
+ * a read-modify-write of a whole block. We also assume that the
+ * plugin returns the same immutable data for each pread call we make,
+ * and optimize on this basis.
*
- * When reading a block we first check the temporary file to see if
- * that file block is allocated or a hole. If allocated, we return it
- * from the temporary file. If a hole, we issue a pread to the
- * underlying plugin.
+ * A block bitmap is maintained in memory recording if each block in
+ * the temporary file is "allocated" (1) or "hole" (0).
+ *
+ * When reading a block we first check the bitmap to see if that file
+ * block is allocated or a hole. If allocated, we return it from the
+ * temporary file. If a hole, we issue a pread to the underlying
+ * plugin.
*
* When writing a block we unconditionally write the data to the
- * temporary file (allocating a block in that file if it wasn't
- * before).
+ * temporary file, setting the bit in the bitmap.
*
* No locking is needed for blk_* calls, but there is a potential
* problem of multiple pwrite calls doing a read-modify-write cycle
@@ -75,18 +77,27 @@
#include <errno.h>
#include <sys/types.h>
#include <sys/ioctl.h>
-#include <linux/fs.h>
#include <nbdkit-filter.h>
+#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
+
/* XXX See design comment above. */
#define THREAD_MODEL NBDKIT_THREAD_MODEL_SERIALIZE_ALL_REQUESTS
+/* Size of a block in the overlay. A 4K block size means that we need
+ * 32 MB of memory to store the bitmap for a 1 TB underlying image.
+ */
+#define BLKSIZE 4096
+
/* The temporary overlay. */
static int fd = -1;
-/* The filesystem block size. */
-static int blksize;
+/* Bitmap. Bit 1 = allocated, 0 = hole. */
+static uint8_t *bitmap;
+
+/* Size of the bitmap in bytes. */
+static uint64_t bm_size;
static void
cow_load (void)
@@ -112,17 +123,6 @@ cow_load (void)
}
unlink (template);
-
- if (ioctl (fd, FIGETBSZ, &blksize) == -1) {
- nbdkit_error ("ioctl: FIGETBSZ: %m");
- exit (EXIT_FAILURE);
- }
- if (blksize <= 0) {
- nbdkit_error ("filesystem block size is < 0 or cannot be read");
- exit (EXIT_FAILURE);
- }
-
- nbdkit_debug ("cow: filesystem block size: %d", blksize);
}
static void
@@ -147,6 +147,34 @@ cow_open (nbdkit_next_open *next, void *nxdata, int readonly)
return &handle;
}
+/* Allocate or resize the overlay file and bitmap. */
+static int
+blk_set_size (uint64_t new_size)
+{
+ uint8_t *new_bm;
+ const size_t old_bm_size = bm_size;
+ size_t new_bm_size = DIV_ROUND_UP (new_size, BLKSIZE*8);
+
+ new_bm = realloc (bitmap, new_bm_size);
+ if (new_bm == NULL) {
+ nbdkit_error ("realloc: %m");
+ return -1;
+ }
+ bitmap = new_bm;
+ bm_size = new_bm_size;
+ if (old_bm_size < new_bm_size)
+ memset (&bitmap[old_bm_size], 0, new_bm_size-old_bm_size);
+
+ nbdkit_debug ("cow: bitmap resized to %" PRIu64 " bytes", new_bm_size);
+
+ if (ftruncate (fd, new_size) == -1) {
+ nbdkit_error ("ftruncate: %m");
+ return -1;
+ }
+
+ return 0;
+}
+
/* Get the file size and ensure the overlay is the correct size. */
static int64_t
cow_get_size (struct nbdkit_next_ops *next_ops, void *nxdata,
@@ -158,11 +186,11 @@ cow_get_size (struct nbdkit_next_ops *next_ops, void *nxdata,
if (size == -1)
return -1;
- if (ftruncate (fd, size) == -1)
- return -1;
-
nbdkit_debug ("cow: underlying file size: %" PRIi64, size);
+ if (blk_set_size (size))
+ return -1;
+
return size;
}
@@ -200,6 +228,36 @@ cow_can_flush (struct nbdkit_next_ops *next_ops, void *nxdata, void *handle)
return 1;
}
+/* Return true if the block is allocated. Consults the bitmap. */
+static bool
+blk_is_allocated (uint64_t blknum)
+{
+ uint64_t bm_offset = blknum / 8;
+ uint64_t bm_bit = blknum % 8;
+
+ if (bm_offset >= bm_size) {
+ nbdkit_debug ("blk_is_allocated: block number is out of range");
+ return false;
+ }
+
+ return bitmap[bm_offset] & (1 << bm_bit);
+}
+
+/* Mark a block as allocated. */
+static void
+blk_set_allocated (uint64_t blknum)
+{
+ uint64_t bm_offset = blknum / 8;
+ uint64_t bm_bit = blknum % 8;
+
+ if (bm_offset >= bm_size) {
+ nbdkit_debug ("blk_set_allocated: block number is out of range");
+ return;
+ }
+
+ bitmap[bm_offset] |= 1 << bm_bit;
+}
+
/* These are the block operations. They always read or write a single
* whole block of size 'blksize'.
*/
@@ -207,36 +265,17 @@ static int
blk_read (struct nbdkit_next_ops *next_ops, void *nxdata,
uint64_t blknum, uint8_t *block)
{
- off_t offset = blknum * blksize, roffset;
- bool hole;
-
- nbdkit_debug ("cow: blk_read block %" PRIu64 " (offset %" PRIu64 ")",
- blknum, (uint64_t) offset);
+ off_t offset = blknum * BLKSIZE;
+ bool allocated = blk_is_allocated (blknum);
- /* Find out if the current block contains data or is a hole. */
- roffset = lseek (fd, offset, SEEK_DATA);
- if (roffset == -1) {
- /* Undocumented? Anyway if SEEK_DATA returns ENXIO it means
- * "there are no more data regions past the supplied offset", ie.
- * we're in a hole.
- */
- if (errno == ENXIO)
- hole = true;
- else {
- nbdkit_error ("lseek: SEEK_DATA: %m");
- return -1;
- }
- }
- else
- hole = offset != roffset;
-
- nbdkit_debug ("cow: block %" PRIu64 " is %s",
- blknum, hole ? "a hole" : "allocated");
+ nbdkit_debug ("cow: blk_read block %" PRIu64 " (offset %" PRIu64 ") is %s",
+ blknum, (uint64_t) offset,
+ !allocated ? "a hole" : "allocated");
- if (hole) /* Read underlying plugin. */
- return next_ops->pread (nxdata, block, blksize, offset);
+ if (!allocated) /* Read underlying plugin. */
+ return next_ops->pread (nxdata, block, BLKSIZE, offset);
else { /* Read overlay. */
- if (pread (fd, block, blksize, offset) == -1) {
+ if (pread (fd, block, BLKSIZE, offset) == -1) {
nbdkit_error ("pread: %m");
return -1;
}
@@ -247,15 +286,17 @@ blk_read (struct nbdkit_next_ops *next_ops, void *nxdata,
static int
blk_write (uint64_t blknum, const uint8_t *block)
{
- off_t offset = blknum * blksize;
+ off_t offset = blknum * BLKSIZE;
nbdkit_debug ("cow: blk_write block %" PRIu64 " (offset %" PRIu64 ")",
blknum, (uint64_t) offset);
- if (pwrite (fd, block, blksize, offset) == -1) {
+ if (pwrite (fd, block, BLKSIZE, offset) == -1) {
nbdkit_error ("pwrite: %m");
return -1;
}
+ blk_set_allocated (blknum);
+
return 0;
}
@@ -266,7 +307,7 @@ cow_pread (struct nbdkit_next_ops *next_ops, void *nxdata,
{
uint8_t *block;
- block = malloc (blksize);
+ block = malloc (BLKSIZE);
if (block == NULL) {
nbdkit_error ("malloc: %m");
return -1;
@@ -275,9 +316,9 @@ cow_pread (struct nbdkit_next_ops *next_ops, void *nxdata,
while (count > 0) {
uint64_t blknum, blkoffs, n;
- blknum = offset / blksize; /* block number */
- blkoffs = offset % blksize; /* offset within the block */
- n = blksize - blkoffs; /* max bytes we can read from this block */
+ blknum = offset / BLKSIZE; /* block number */
+ blkoffs = offset % BLKSIZE; /* offset within the block */
+ n = BLKSIZE - blkoffs; /* max bytes we can read from this block */
if (n > count)
n = count;
@@ -304,7 +345,7 @@ cow_pwrite (struct nbdkit_next_ops *next_ops, void *nxdata,
{
uint8_t *block;
- block = malloc (blksize);
+ block = malloc (BLKSIZE);
if (block == NULL) {
nbdkit_error ("malloc: %m");
return -1;
@@ -313,9 +354,9 @@ cow_pwrite (struct nbdkit_next_ops *next_ops, void *nxdata,
while (count > 0) {
uint64_t blknum, blkoffs, n;
- blknum = offset / blksize; /* block number */
- blkoffs = offset % blksize; /* offset within the block */
- n = blksize - blkoffs; /* max bytes we can read from this block */
+ blknum = offset / BLKSIZE; /* block number */
+ blkoffs = offset % BLKSIZE; /* offset within the block */
+ n = BLKSIZE - blkoffs; /* max bytes we can read from this block */
if (n > count)
n = count;
@@ -346,7 +387,7 @@ cow_zero (struct nbdkit_next_ops *next_ops, void *nxdata,
{
uint8_t *block;
- block = malloc (blksize);
+ block = malloc (BLKSIZE);
if (block == NULL) {
nbdkit_error ("malloc: %m");
return -1;
@@ -355,9 +396,9 @@ cow_zero (struct nbdkit_next_ops *next_ops, void *nxdata,
while (count > 0) {
uint64_t blknum, blkoffs, n;
- blknum = offset / blksize; /* block number */
- blkoffs = offset % blksize; /* offset within the block */
- n = blksize - blkoffs; /* max bytes we can read from this block */
+ blknum = offset / BLKSIZE; /* block number */
+ blkoffs = offset % BLKSIZE; /* offset within the block */
+ n = BLKSIZE - blkoffs; /* max bytes we can read from this block */
if (n > count)
n = count;
diff --git a/tests/Makefile.am b/tests/Makefile.am
index ae22801..b073f22 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -413,9 +413,7 @@ endif HAVE_RUBY
# Tests of filters.
# cow filter test.
-if HAVE_COW_FILTER
TESTS += test-cow.sh
-endif HAVE_COW_FILTER
# delay filter test.
check_PROGRAMS += test-delay
--
2.15.1
More information about the Libguestfs
mailing list