[Libguestfs] [PATCH nbdkit] cache, cow: Do not round size down.

Richard W.M. Jones rjones at redhat.com
Wed Feb 17 12:57:08 UTC 2021


Both the cache and cow filters have the annoying behaviour that the
size of the underlying plugin is rounded down to the block size used
by the filters (ie. 4K).  This is especially visible for single sector
disk images:

$ ./nbdkit memory size=512 --filter=cow --run 'nbdinfo --size $uri'
0
$ ./nbdkit memory size=512 --filter=cache --run 'nbdinfo --size $uri'
0

but affects all users of these filters somewhat randomly and
unexpectedly.  (For example I had a GPT partitioned disk image from a
customer which did not have a 4K aligned size, which was unexpectedly
"corrupted" when I tried to open it using the cow filter - the reason
for the corruption was the backup partition table at the end of the
disk being truncated.)

Getting rid of this assumption is awkward: the general idea is that we
round up the size of the backing file / bitmap by a full block.  But
whenever we are asked to read or write to the underlying plugin we
handle the tail case carefully.

This also tests various corner cases.
---
 filters/cache/nbdkit-cache-filter.pod |  9 +---
 filters/cow/nbdkit-cow-filter.pod     | 11 +----
 tests/Makefile.am                     |  4 ++
 filters/cache/blk.c                   | 54 +++++++++++++++++++---
 filters/cache/cache.c                 | 12 ++---
 filters/cow/blk.c                     | 49 +++++++++++++++++---
 filters/cow/cow.c                     | 23 +++++++---
 tests/test-cache-unaligned.sh         | 66 +++++++++++++++++++++++++++
 tests/test-cow-unaligned.sh           | 56 +++++++++++++++++++++++
 9 files changed, 239 insertions(+), 45 deletions(-)

diff --git a/filters/cache/nbdkit-cache-filter.pod b/filters/cache/nbdkit-cache-filter.pod
index 3601dcd5..34fd0b29 100644
--- a/filters/cache/nbdkit-cache-filter.pod
+++ b/filters/cache/nbdkit-cache-filter.pod
@@ -25,12 +25,6 @@ does not have effective caching, or (with C<cache=unsafe>) to defeat
 flush requests from the client (which is unsafe and can cause data
 loss, as the name suggests).
 
-Note that the use of this filter rounds the image size down to a
-multiple of the caching granularity (the larger of 4096 or the
-C<f_bsize> field of L<fstatvfs(3)>), to ease the implementation. If
-you need to round the image size up instead to access the last few
-bytes, combine this filter with L<nbdkit-truncate-filter(1)>.
-
 This filter only caches image contents.  To cache image metadata, use
 L<nbdkit-cacheextents-filter(1)> between this filter and the plugin.
 To accelerate sequential reads, use L<nbdkit-readahead-filter(1)>
@@ -168,7 +162,6 @@ L<nbdkit(1)>,
 L<nbdkit-file-plugin(1)>,
 L<nbdkit-cacheextents-filter(1)>,
 L<nbdkit-readahead-filter(1)>,
-L<nbdkit-truncate-filter(1)>,
 L<nbdkit-filter(3)>,
 L<qemu-img(1)>.
 
@@ -180,4 +173,4 @@ Richard W.M. Jones
 
 =head1 COPYRIGHT
 
-Copyright (C) 2018-2020 Red Hat Inc.
+Copyright (C) 2018-2021 Red Hat Inc.
diff --git a/filters/cow/nbdkit-cow-filter.pod b/filters/cow/nbdkit-cow-filter.pod
index 4d5ae856..64df3fbd 100644
--- a/filters/cow/nbdkit-cow-filter.pod
+++ b/filters/cow/nbdkit-cow-filter.pod
@@ -31,14 +31,6 @@ that away.  It also allows us to create diffs, see below.
 The plugin is opened read-only (as if the I<-r> flag was passed), but
 you should B<not> pass the I<-r> flag to nbdkit.
 
-=item *
-
-Note that the use of this filter rounds the image size down to a
-multiple of the caching granularity (4096), to ease the
-implementation. If you need to round the image size up instead to
-access the last few bytes, combine this filter with
-L<nbdkit-truncate-filter(1)>.
-
 =back
 
 Limitations of the filter include:
@@ -140,7 +132,6 @@ C<nbdkit-cow-filter> first appeared in nbdkit 1.2.
 
 L<nbdkit(1)>,
 L<nbdkit-file-plugin(1)>,
-L<nbdkit-truncate-filter(1)>,
 L<nbdkit-xz-filter(1)>,
 L<nbdkit-filter(3)>,
 L<qemu-img(1)>.
@@ -153,4 +144,4 @@ Richard W.M. Jones
 
 =head1 COPYRIGHT
 
-Copyright (C) 2018 Red Hat Inc.
+Copyright (C) 2018-2021 Red Hat Inc.
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 42e842b2..70898f20 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -1353,11 +1353,13 @@ TESTS += \
 	test-cache.sh \
 	test-cache-on-read.sh \
 	test-cache-max-size.sh \
+	test-cache-unaligned.sh \
 	$(NULL)
 EXTRA_DIST += \
 	test-cache.sh \
 	test-cache-on-read.sh \
 	test-cache-max-size.sh \
+	test-cache-unaligned.sh \
 	$(NULL)
 
 # cacheextents filter test.
@@ -1380,6 +1382,7 @@ TESTS += \
 	test-cow.sh \
 	test-cow-extents1.sh \
 	test-cow-extents2.sh \
+	test-cow-unaligned.sh \
 	$(NULL)
 endif
 TESTS += test-cow-null.sh
@@ -1388,6 +1391,7 @@ EXTRA_DIST += \
 	test-cow-extents1.sh \
 	test-cow-extents2.sh \
 	test-cow-null.sh \
+	test-cow-unaligned.sh \
 	$(NULL)
 
 # delay filter tests.
diff --git a/filters/cache/blk.c b/filters/cache/blk.c
index dc1bcc57..b9af6909 100644
--- a/filters/cache/blk.c
+++ b/filters/cache/blk.c
@@ -1,5 +1,5 @@
 /* nbdkit
- * Copyright (C) 2018-2020 Red Hat Inc.
+ * Copyright (C) 2018-2021 Red Hat Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
@@ -54,6 +54,7 @@
 
 #include "bitmap.h"
 #include "minmax.h"
+#include "rounding.h"
 #include "utils.h"
 
 #include "cache.h"
@@ -165,18 +166,25 @@ blk_free (void)
   lru_free ();
 }
 
+/* Because blk_set_size is called before the other blk_* functions
+ * this should be set to the true size before we need it.
+ */
+static uint64_t size = 0;
+
 int
 blk_set_size (uint64_t new_size)
 {
-  if (bitmap_resize (&bm, new_size) == -1)
+  size = new_size;
+
+  if (bitmap_resize (&bm, size) == -1)
     return -1;
 
-  if (ftruncate (fd, new_size) == -1) {
+  if (ftruncate (fd, ROUND_UP (size, blksize)) == -1) {
     nbdkit_error ("ftruncate: %m");
     return -1;
   }
 
-  if (lru_set_size (new_size) == -1)
+  if (lru_set_size (size) == -1)
     return -1;
 
   return 0;
@@ -199,9 +207,22 @@ blk_read (struct nbdkit_next_ops *next_ops, void *nxdata,
                 "unknown");
 
   if (state == BLOCK_NOT_CACHED) { /* Read underlying plugin. */
-    if (next_ops->pread (nxdata, block, blksize, offset, 0, err) == -1)
+    unsigned n = blksize, tail = 0;
+
+    if (offset + n > size) {
+      tail = offset + n - size;
+      n -= tail;
+    }
+
+    if (next_ops->pread (nxdata, block, n, offset, 0, err) == -1)
       return -1;
 
+    /* Normally we're reading whole blocks, but at the very end of the
+     * file we might read a partial block.  Deal with that case by
+     * zeroing the tail.
+     */
+    memset (block + n, 0, tail);
+
     /* If cache-on-read, copy the block to the cache. */
     if (cache_on_read) {
       nbdkit_debug ("cache: cache-on-read block %" PRIu64
@@ -247,9 +268,22 @@ blk_cache (struct nbdkit_next_ops *next_ops, void *nxdata,
 
   if (state == BLOCK_NOT_CACHED) {
     /* Read underlying plugin, copy to cache regardless of cache-on-read. */
-    if (next_ops->pread (nxdata, block, blksize, offset, 0, err) == -1)
+    unsigned n = blksize, tail = 0;
+
+    if (offset + n > size) {
+      tail = offset + n - size;
+      n -= tail;
+    }
+
+    if (next_ops->pread (nxdata, block, n, offset, 0, err) == -1)
       return -1;
 
+    /* Normally we're reading whole blocks, but at the very end of the
+     * file we might read a partial block.  Deal with that case by
+     * zeroing the tail.
+     */
+    memset (block + n, 0, tail);
+
     nbdkit_debug ("cache: cache block %" PRIu64 " (offset %" PRIu64 ")",
                   blknum, (uint64_t) offset);
 
@@ -281,6 +315,12 @@ blk_writethrough (struct nbdkit_next_ops *next_ops, void *nxdata,
                   int *err)
 {
   off_t offset = blknum * blksize;
+  unsigned n = blksize, tail = 0;
+
+  if (offset + n > size) {
+    tail = offset + n - size;
+    n -= tail;
+  }
 
   reclaim (fd, &bm);
 
@@ -293,7 +333,7 @@ blk_writethrough (struct nbdkit_next_ops *next_ops, void *nxdata,
     return -1;
   }
 
-  if (next_ops->pwrite (nxdata, block, blksize, offset, flags, err) == -1)
+  if (next_ops->pwrite (nxdata, block, n, offset, flags, err) == -1)
     return -1;
 
   bitmap_set_blk (&bm, blknum, BLOCK_CLEAN);
diff --git a/filters/cache/cache.c b/filters/cache/cache.c
index 91dcc43d..b979f256 100644
--- a/filters/cache/cache.c
+++ b/filters/cache/cache.c
@@ -1,5 +1,5 @@
 /* nbdkit
- * Copyright (C) 2018-2020 Red Hat Inc.
+ * Copyright (C) 2018-2021 Red Hat Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
@@ -201,9 +201,7 @@ cache_config_complete (nbdkit_next_config_complete *next, void *nxdata)
   return next (nxdata);
 }
 
-/* Get the file size; round it down to cache granularity before
- * setting cache size.
- */
+/* Get the file size, set the cache size. */
 static int64_t
 cache_get_size (struct nbdkit_next_ops *next_ops, void *nxdata,
                 void *handle)
@@ -216,7 +214,6 @@ cache_get_size (struct nbdkit_next_ops *next_ops, void *nxdata,
     return -1;
 
   nbdkit_debug ("cache: underlying file size: %" PRIi64, size);
-  size = ROUND_DOWN (size, blksize);
 
   ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
   r = blk_set_size (size);
@@ -226,8 +223,9 @@ cache_get_size (struct nbdkit_next_ops *next_ops, void *nxdata,
   return size;
 }
 
-/* Force an early call to cache_get_size, consequently truncating the
- * cache to the correct size.
+/* Force an early call to cache_get_size because we have to set the
+ * backing file size and bitmap size before any other read or write
+ * calls.
  */
 static int
 cache_prepare (struct nbdkit_next_ops *next_ops, void *nxdata,
diff --git a/filters/cow/blk.c b/filters/cow/blk.c
index ae0db1fe..51ba91c4 100644
--- a/filters/cow/blk.c
+++ b/filters/cow/blk.c
@@ -94,6 +94,7 @@
 
 #include "bitmap.h"
 #include "fdatasync.h"
+#include "rounding.h"
 #include "pread.h"
 #include "pwrite.h"
 #include "utils.h"
@@ -176,14 +177,21 @@ blk_free (void)
   bitmap_free (&bm);
 }
 
+/* Because blk_set_size is called before the other blk_* functions
+ * this should be set to the true size before we need it.
+ */
+static uint64_t size = 0;
+
 /* Allocate or resize the overlay file and bitmap. */
 int
 blk_set_size (uint64_t new_size)
 {
-  if (bitmap_resize (&bm, new_size) == -1)
+  size = new_size;
+
+  if (bitmap_resize (&bm, size) == -1)
     return -1;
 
-  if (ftruncate (fd, new_size) == -1) {
+  if (ftruncate (fd, ROUND_UP (size, BLKSIZE)) == -1) {
     nbdkit_error ("ftruncate: %m");
     return -1;
   }
@@ -216,8 +224,24 @@ blk_read (struct nbdkit_next_ops *next_ops, void *nxdata,
   nbdkit_debug ("cow: blk_read block %" PRIu64 " (offset %" PRIu64 ") is %s",
                 blknum, (uint64_t) offset, state_to_string (state));
 
-  if (state == BLOCK_NOT_ALLOCATED) /* Read underlying plugin. */
-    return next_ops->pread (nxdata, block, BLKSIZE, offset, 0, err);
+  if (state == BLOCK_NOT_ALLOCATED) { /* Read underlying plugin. */
+    unsigned n = BLKSIZE, tail = 0;
+
+    if (offset + n > size) {
+      tail = offset + n - size;
+      n -= tail;
+    }
+
+    if (next_ops->pread (nxdata, block, n, offset, 0, err) == -1)
+      return -1;
+
+    /* Normally we're reading whole blocks, but at the very end of the
+     * file we might read a partial block.  Deal with that case by
+     * zeroing the tail.
+     */
+    memset (block + n, 0, tail);
+    return 0;
+  }
   else if (state == BLOCK_ALLOCATED) { /* Read overlay. */
     if (pread (fd, block, BLKSIZE, offset) == -1) {
       *err = errno;
@@ -238,6 +262,12 @@ blk_cache (struct nbdkit_next_ops *next_ops, void *nxdata,
 {
   off_t offset = blknum * BLKSIZE;
   enum bm_entry state = bitmap_get_blk (&bm, blknum, BLOCK_NOT_ALLOCATED);
+  unsigned n = BLKSIZE, tail = 0;
+
+  if (offset + n > size) {
+    tail = offset + n - size;
+    n -= tail;
+  }
 
   nbdkit_debug ("cow: blk_cache block %" PRIu64 " (offset %" PRIu64 ") is %s",
                 blknum, (uint64_t) offset, state_to_string (state));
@@ -258,9 +288,16 @@ blk_cache (struct nbdkit_next_ops *next_ops, void *nxdata,
   if (mode == BLK_CACHE_IGNORE)
     return 0;
   if (mode == BLK_CACHE_PASSTHROUGH)
-    return next_ops->cache (nxdata, BLKSIZE, offset, 0, err);
-  if (next_ops->pread (nxdata, block, BLKSIZE, offset, 0, err) == -1)
+    return next_ops->cache (nxdata, n, offset, 0, err);
+
+  if (next_ops->pread (nxdata, block, n, offset, 0, err) == -1)
     return -1;
+  /* Normally we're reading whole blocks, but at the very end of the
+   * file we might read a partial block.  Deal with that case by
+   * zeroing the tail.
+   */
+  memset (block + n, 0, tail);
+
   if (mode == BLK_CACHE_COW) {
     if (pwrite (fd, block, BLKSIZE, offset) == -1) {
       *err = errno;
diff --git a/filters/cow/cow.c b/filters/cow/cow.c
index 75d3b907..93e10f24 100644
--- a/filters/cow/cow.c
+++ b/filters/cow/cow.c
@@ -1,5 +1,5 @@
 /* nbdkit
- * Copyright (C) 2018-2020 Red Hat Inc.
+ * Copyright (C) 2018-2021 Red Hat Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
@@ -103,9 +103,7 @@ cow_open (nbdkit_next_open *next, void *nxdata,
   return NBDKIT_HANDLE_NOT_NEEDED;
 }
 
-/* Get the file size; round it down to overlay granularity before
- * setting overlay size.
- */
+/* Get the file size, set the cache size. */
 static int64_t
 cow_get_size (struct nbdkit_next_ops *next_ops, void *nxdata,
               void *handle)
@@ -118,7 +116,6 @@ cow_get_size (struct nbdkit_next_ops *next_ops, void *nxdata,
     return -1;
 
   nbdkit_debug ("cow: underlying file size: %" PRIi64, size);
-  size = ROUND_DOWN (size, BLKSIZE);
 
   ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
   r = blk_set_size (size);
@@ -128,8 +125,9 @@ cow_get_size (struct nbdkit_next_ops *next_ops, void *nxdata,
   return size;
 }
 
-/* Force an early call to cow_get_size, consequently truncating the
- * overlay to the correct size.
+/* Force an early call to cow_get_size because we have to set the
+ * backing file size and bitmap size before any other read or write
+ * calls.
  */
 static int
 cow_prepare (struct nbdkit_next_ops *next_ops, void *nxdata,
@@ -627,6 +625,7 @@ cow_extents (struct nbdkit_next_ops *next_ops, void *nxdata,
       uint64_t range_offset = offset;
       uint32_t range_count = 0;
       size_t i;
+      int64_t size;
 
       /* Asking the plugin for a single block of extents is not
        * efficient for some plugins (eg. VDDK) so ask for as much data
@@ -643,6 +642,16 @@ cow_extents (struct nbdkit_next_ops *next_ops, void *nxdata,
         if (present) break;
       }
 
+      /* Don't ask for extent data beyond the end of the plugin. */
+      size = next_ops->get_size (nxdata);
+      if (size == -1)
+        return -1;
+
+      if (range_offset + range_count > size) {
+        unsigned tail = range_offset + range_count - size;
+        range_count -= tail;
+      }
+
       CLEANUP_EXTENTS_FREE struct nbdkit_extents *extents2 =
         nbdkit_extents_full (next_ops, nxdata,
                              range_count, range_offset, flags, err);
diff --git a/tests/test-cache-unaligned.sh b/tests/test-cache-unaligned.sh
new file mode 100755
index 00000000..80b3d128
--- /dev/null
+++ b/tests/test-cache-unaligned.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+# nbdkit
+# Copyright (C) 2018-2021 Red Hat Inc.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# * Neither the name of Red Hat nor the names of its contributors may be
+# used to endorse or promote products derived from this software without
+# specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+
+source ./functions.sh
+set -e
+set -x
+
+requires_filter cache
+requires_nbdsh_uri
+
+sock=$(mktemp -u /tmp/nbdkit-test-sock.XXXXXX)
+files="cache-unaligned.img $sock cache-unaligned.pid"
+rm -f $files
+cleanup_fn rm -f $files
+
+# Create an empty base image which is not a multiple of 4K.
+truncate -s 130000 cache-unaligned.img
+
+# Run nbdkit with the caching filter.
+start_nbdkit -P cache-unaligned.pid -U $sock --filter=cache \
+             file cache-unaligned.img
+
+nbdsh --connect "nbd+unix://?socket=$sock" \
+      -c '
+# Write some pattern data to the overlay and check it reads back OK.
+buf = b"abcdefghijklm" * 10000
+h.pwrite(buf, 0)
+buf2 = h.pread(130000, 0)
+assert buf == buf2
+
+# Flushing should write through to the underlying file.
+h.flush()
+
+with open("cache-unaligned.img", "rb") as file:
+    buf2 = file.read(130000)
+    assert buf == buf2
+'
diff --git a/tests/test-cow-unaligned.sh b/tests/test-cow-unaligned.sh
new file mode 100755
index 00000000..d89b84b9
--- /dev/null
+++ b/tests/test-cow-unaligned.sh
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+# nbdkit
+# Copyright (C) 2018-2021 Red Hat Inc.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# * Neither the name of Red Hat nor the names of its contributors may be
+# used to endorse or promote products derived from this software without
+# specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+
+source ./functions.sh
+set -e
+set -x
+
+requires_filter cow
+requires_nbdsh_uri
+
+sock=$(mktemp -u /tmp/nbdkit-test-sock.XXXXXX)
+files="$sock cow-unaligned.pid"
+rm -f $files
+cleanup_fn rm -f $files
+
+# Run nbdkit with the cow filter and a size which is not a multiple of
+# the cow filter block size (4K).
+start_nbdkit -P cow-unaligned.pid -U $sock --filter=cow memory 130000
+
+nbdsh --connect "nbd+unix://?socket=$sock" \
+      -c '
+# Write some pattern data to the overlay and check it reads back OK.
+buf = b"abcdefghijklm" * 10000
+h.pwrite(buf, 0)
+buf2 = h.pread(130000, 0)
+assert buf == buf2
+'
-- 
2.29.0.rc2




More information about the Libguestfs mailing list