[Libguestfs] [nbdkit PATCH v2 2/2] cache, cow: Reduce use of bounce-buffer

Eric Blake eblake at redhat.com
Mon May 13 21:40:22 UTC 2019


Although the time spent in memcpy/memset probably pales in comparison
to time spent in socket I/O, it's still worth reducing the
number of times we have to use a bounce buffer when we already
have aligned data.
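
As an aside for reviewers: the "already aligned" test added below uses
a bit trick: for a power-of-two block size, OR-ing count and offset
together yields a value that is aligned iff both inputs are, so one
IS_ALIGNED check covers both at once.  A minimal standalone sketch,
assuming the usual power-of-two style of definition from
common/include/isaligned.h:

  #include <assert.h>
  #include <stdint.h>

  /* For n a power of two, (v & (n-1)) == 0 tests alignment. */
  #define IS_ALIGNED(v, n) (((v) & ((n) - 1)) == 0)

  int
  main (void)
  {
    uint64_t blksize = 4096;
    assert (IS_ALIGNED (8192 | 4096, blksize));  /* both aligned */
    assert (!IS_ALIGNED (8192 | 4100, blksize)); /* offset unaligned */
    assert (!IS_ALIGNED (100 | 8192, blksize));  /* count unaligned */
    return 0;
  }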

Note that blocksize, cache, and cow all do block fragmentation and
bounce-buffer alignment; this patch brings the logic in cache and cow
(which were copied from one another) more closely in line with what is in
blocksize (written independently), in case a future change to nbdkit
allows plugins/filters to report their preferred alignment and block
sizes and we refactor the actual fragmentation/alignment logic to
nbdkit proper. However, there are still some intentional differences:
the blocksize filter utilizes the largest aligned chunk possible,
while cache and cow still fragment the aligned portion all the way
down to blksize (even if it has more overhead); conversely, blocksize
serializes all requests to share a single bounce buffer, while we have
a per-request bounce buffer.
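
For review purposes, the shape that cache and cow now share is an
unaligned head (bounce buffer), a run of whole aligned blocks passed
directly through the caller's buffer, and an unaligned tail (bounce
buffer again).  A minimal standalone sketch of just the fragmentation
arithmetic (BLKSIZE and the sample request are arbitrary illustration
values, not taken from the patch):

  #include <inttypes.h>
  #include <stdio.h>

  #define BLKSIZE 4096
  #define MIN(a, b) ((a) < (b) ? (a) : (b))

  /* Show how [offset, offset+count) splits into an unaligned head,
   * whole aligned blocks, and an unaligned tail. */
  static void
  fragment (uint64_t offset, uint64_t count)
  {
    uint64_t blkoffs = offset % BLKSIZE;

    if (blkoffs) {                      /* unaligned head */
      uint64_t n = MIN (BLKSIZE - blkoffs, count);
      printf ("head: %" PRIu64 " bytes via bounce buffer\n", n);
      count -= n;
    }
    printf ("body: %" PRIu64 " block(s) direct to caller's buffer\n",
            count / BLKSIZE);           /* aligned body */
    count %= BLKSIZE;
    if (count)                          /* unaligned tail */
      printf ("tail: %" PRIu64 " bytes via bounce buffer\n", count);
  }

  int
  main (void)
  {
    fragment (4100, 10000);  /* head 4092, body 1 block, tail 1812 */
    return 0;
  }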

Not addressed here: we have no way (yet) for a plugin to report
preferred alignment (for example, a plugin using O_DIRECT may insist
on 4k alignment not only for the offset being read, but also for the
offset of the buffer that the read results are placed into). When we
always used the bounce buffer, we were more likely to have an aligned
buffer by luck (although not guaranteed, since we only used malloc
rather than posix_memalign); now that we write the middle of the
request directly into the caller's buffer, we are less likely to match
alignment when the overall request was unaligned.  But
ideally, a fix for that would be at the nbdkit level, where every
plugin records its preferred alignments, and where nbdkit then
overallocates and arranges that .pread and .pwrite use the middle of
that buffer such that 'offset % alignment == buf % alignment'.
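
A hypothetical sketch of that overallocation (the helper name is made
up and is not part of this patch or of nbdkit): allocate alignment
extra bytes, then hand the plugin a pointer whose misalignment matches
the request offset:

  #include <stdint.h>
  #include <stdlib.h>

  /* Return p with p % alignment == offset % alignment, usable for
   * count bytes; caller frees *to_free.  alignment must be a power
   * of two.  Hypothetical; nbdkit does not do this today.
   */
  void *
  alloc_offset_aligned (size_t count, uint64_t offset,
                        size_t alignment, void **to_free)
  {
    uint8_t *base = malloc (count + alignment);
    if (base == NULL)
      return NULL;
    *to_free = base;
    size_t want = offset % alignment;
    size_t have = (uintptr_t) base % alignment;
    return base + (want - have + alignment) % alignment;
  }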

Signed-off-by: Eric Blake <eblake at redhat.com>
---
 filters/cache/cache.c | 173 +++++++++++++++++++++++++++++-----------
 filters/cow/cow.c     | 180 +++++++++++++++++++++++++++++++-----------
 2 files changed, 263 insertions(+), 90 deletions(-)

diff --git a/filters/cache/cache.c b/filters/cache/cache.c
index 19ce555..360458f 100644
--- a/filters/cache/cache.c
+++ b/filters/cache/cache.c
@@ -1,5 +1,5 @@
 /* nbdkit
- * Copyright (C) 2018 Red Hat Inc.
+ * Copyright (C) 2018-2019 Red Hat Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
@@ -58,6 +58,8 @@
 #include "cache.h"
 #include "blk.h"
 #include "reclaim.h"
+#include "isaligned.h"
+#include "minmax.h"

 #define THREAD_MODEL NBDKIT_THREAD_MODEL_PARALLEL

@@ -231,43 +233,68 @@ cache_pread (struct nbdkit_next_ops *next_ops, void *nxdata,
              uint32_t flags, int *err)
 {
   CLEANUP_FREE uint8_t *block = NULL;
+  uint64_t blknum, blkoffs;
+  int r;

   assert (!flags);
-  block = malloc (blksize);
-  if (block == NULL) {
-    *err = errno;
-    nbdkit_error ("malloc: %m");
-    return -1;
+  if (!IS_ALIGNED (count | offset, blksize)) {
+    block = malloc (blksize);
+    if (block == NULL) {
+      *err = errno;
+      nbdkit_error ("malloc: %m");
+      return -1;
+    }
   }

+  blknum = offset / blksize;  /* block number */
+  blkoffs = offset % blksize; /* offset within the block */
+
+  /* Unaligned head */
+  if (blkoffs) {
+    uint64_t n = MIN (blksize - blkoffs, count);
+
+    assert (block);
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    r = blk_read (next_ops, nxdata, blknum, block, err);
+    if (r == -1)
+      return -1;
+
+    memcpy (buf, &block[blkoffs], n);
+
+    buf += n;
+    count -= n;
+    offset += n;
+    blknum++;
+  }
+
+  /* Aligned body */
   /* XXX This breaks up large read requests into smaller ones, which
    * is a problem for plugins which have a large, fixed per-request
    * overhead (hello, curl).  We should try to keep large requests
    * together as much as possible, but that requires us to be much
    * smarter here.
    */
-  while (count > 0) {
-    uint64_t blknum, blkoffs, n;
-    int r;
+  while (count >= blksize) {
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    r = blk_read (next_ops, nxdata, blknum, buf, err);
+    if (r == -1)
+      return -1;

-    blknum = offset / blksize;  /* block number */
-    blkoffs = offset % blksize; /* offset within the block */
-    n = blksize - blkoffs;      /* max bytes we can read from this block */
-    if (n > count)
-      n = count;
+    buf += blksize;
+    count -= blksize;
+    offset += blksize;
+    blknum++;
+  }

-    {
-      ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
-      r = blk_read (next_ops, nxdata, blknum, block, err);
-    }
+  /* Unaligned tail */
+  if (count) {
+    assert (block);
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    r = blk_read (next_ops, nxdata, blknum, block, err);
     if (r == -1)
       return -1;

-    memcpy (buf, &block[blkoffs], n);
-
-    buf += n;
-    count -= n;
-    offset += n;
+    memcpy (buf, block, count);
   }

   return 0;
@@ -280,13 +307,17 @@ cache_pwrite (struct nbdkit_next_ops *next_ops, void *nxdata,
               uint32_t flags, int *err)
 {
   CLEANUP_FREE uint8_t *block = NULL;
+  uint64_t blknum, blkoffs;
+  int r;
   bool need_flush = false;

-  block = malloc (blksize);
-  if (block == NULL) {
-    *err = errno;
-    nbdkit_error ("malloc: %m");
-    return -1;
+  if (!IS_ALIGNED (count | offset, blksize)) {
+    block = malloc (blksize);
+    if (block == NULL) {
+      *err = errno;
+      nbdkit_error ("malloc: %m");
+      return -1;
+    }
   }

   if ((flags & NBDKIT_FLAG_FUA) &&
@@ -294,19 +325,18 @@ cache_pwrite (struct nbdkit_next_ops *next_ops, void *nxdata,
     flags &= ~NBDKIT_FLAG_FUA;
     need_flush = true;
   }
-  while (count > 0) {
-    uint64_t blknum, blkoffs, n;
-    int r;

-    blknum = offset / blksize;  /* block number */
-    blkoffs = offset % blksize; /* offset within the block */
-    n = blksize - blkoffs;      /* max bytes we can read from this block */
-    if (n > count)
-      n = count;
+  blknum = offset / blksize;  /* block number */
+  blkoffs = offset % blksize; /* offset within the block */
+
+  /* Unaligned head */
+  if (blkoffs) {
+    uint64_t n = MIN (blksize - blkoffs, count);

     /* Do a read-modify-write operation on the current block.
      * Hold the lock over the whole operation.
      */
+    assert (block);
     ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
     r = blk_read (next_ops, nxdata, blknum, block, err);
     if (r != -1) {
@@ -319,6 +349,33 @@ cache_pwrite (struct nbdkit_next_ops *next_ops, void *nxdata,
     buf += n;
     count -= n;
     offset += n;
+    blknum++;
+  }
+
+  /* Aligned body */
+  while (count >= blksize) {
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    r = blk_write (next_ops, nxdata, blknum, buf, flags, err);
+    if (r == -1)
+      return -1;
+
+    buf += blksize;
+    count -= blksize;
+    offset += blksize;
+    blknum++;
+  }
+
+  /* Unaligned tail */
+  if (count) {
+    assert (block);
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    r = blk_read (next_ops, nxdata, blknum, block, err);
+    if (r != -1) {
+      memcpy (block, buf, count);
+      r = blk_write (next_ops, nxdata, blknum, block, flags, err);
+    }
+    if (r == -1)
+      return -1;
   }

   if (need_flush)
@@ -333,6 +390,8 @@ cache_zero (struct nbdkit_next_ops *next_ops, void *nxdata,
             int *err)
 {
   CLEANUP_FREE uint8_t *block = NULL;
+  uint64_t blknum, blkoffs;
+  int r;
   bool need_flush = false;

   block = malloc (blksize);
@@ -348,15 +407,13 @@ cache_zero (struct nbdkit_next_ops *next_ops, void *nxdata,
     flags &= ~NBDKIT_FLAG_FUA;
     need_flush = true;
   }
-  while (count > 0) {
-    uint64_t blknum, blkoffs, n;
-    int r;

-    blknum = offset / blksize;  /* block number */
-    blkoffs = offset % blksize; /* offset within the block */
-    n = blksize - blkoffs;      /* max bytes we can read from this block */
-    if (n > count)
-      n = count;
+  blknum = offset / blksize;  /* block number */
+  blkoffs = offset % blksize; /* offset within the block */
+
+  /* Unaligned head */
+  if (blkoffs) {
+    uint64_t n = MIN (blksize - blkoffs, count);

     /* Do a read-modify-write operation on the current block.
      * Hold the lock over the whole operation.
@@ -372,6 +429,34 @@ cache_zero (struct nbdkit_next_ops *next_ops, void *nxdata,

     count -= n;
     offset += n;
+    blknum++;
+  }
+
+  /* Aligned body */
+  if (count >= blksize)
+    memset (block, 0, blksize);
+  while (count >= blksize) {
+    /* Intentional that we do not use next_ops->zero */
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    r = blk_write (next_ops, nxdata, blknum, block, flags, err);
+    if (r == -1)
+      return -1;
+
+    count -= blksize;
+    offset += blksize;
+    blknum++;
+  }
+
+  /* Unaligned tail */
+  if (count) {
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    r = blk_read (next_ops, nxdata, blknum, block, err);
+    if (r != -1) {
+      memset (block, 0, count);
+      r = blk_write (next_ops, nxdata, blknum, block, flags, err);
+    }
+    if (r == -1)
+      return -1;
   }

   if (need_flush)
diff --git a/filters/cow/cow.c b/filters/cow/cow.c
index 35ad718..1ce5893 100644
--- a/filters/cow/cow.c
+++ b/filters/cow/cow.c
@@ -1,5 +1,5 @@
 /* nbdkit
- * Copyright (C) 2018 Red Hat Inc.
+ * Copyright (C) 2018-2019 Red Hat Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
@@ -47,6 +47,8 @@
 #include "cleanup.h"

 #include "blk.h"
+#include "isaligned.h"
+#include "minmax.h"

 #define THREAD_MODEL NBDKIT_THREAD_MODEL_PARALLEL

@@ -155,28 +157,28 @@ cow_pread (struct nbdkit_next_ops *next_ops, void *nxdata,
            uint32_t flags, int *err)
 {
   CLEANUP_FREE uint8_t *block = NULL;
-
-  block = malloc (BLKSIZE);
-  if (block == NULL) {
-    *err = errno;
-    nbdkit_error ("malloc: %m");
-    return -1;
-  }
-
-  while (count > 0) {
-    uint64_t blknum, blkoffs, n;
-    int r;
-
-    blknum = offset / BLKSIZE;  /* block number */
-    blkoffs = offset % BLKSIZE; /* offset within the block */
-    n = BLKSIZE - blkoffs;      /* max bytes we can read from this block */
-    if (n > count)
-      n = count;
-
-    {
-      ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
-      r = blk_read (next_ops, nxdata, blknum, block, err);
+  uint64_t blknum, blkoffs;
+  int r;
+
+  if (!IS_ALIGNED (count | offset, BLKSIZE)) {
+    block = malloc (BLKSIZE);
+    if (block == NULL) {
+      *err = errno;
+      nbdkit_error ("malloc: %m");
+      return -1;
     }
+  }
+
+  blknum = offset / BLKSIZE;  /* block number */
+  blkoffs = offset % BLKSIZE; /* offset within the block */
+
+  /* Unaligned head */
+  if (blkoffs) {
+    uint64_t n = MIN (BLKSIZE - blkoffs, count);
+
+    assert (block);
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    r = blk_read (next_ops, nxdata, blknum, block, err);
     if (r == -1)
       return -1;

@@ -185,6 +187,37 @@ cow_pread (struct nbdkit_next_ops *next_ops, void *nxdata,
     buf += n;
     count -= n;
     offset += n;
+    blknum++;
+  }
+
+  /* Aligned body */
+  /* XXX This breaks up large read requests into smaller ones, which
+   * is a problem for plugins which have a large, fixed per-request
+   * overhead (hello, curl).  We should try to keep large requests
+   * together as much as possible, but that requires us to be much
+   * smarter here.
+   */
+  while (count >= BLKSIZE) {
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    r = blk_read (next_ops, nxdata, blknum, buf, err);
+    if (r == -1)
+      return -1;
+
+    buf += BLKSIZE;
+    count -= BLKSIZE;
+    offset += BLKSIZE;
+    blknum++;
+  }
+
+  /* Unaligned tail */
+  if (count) {
+    assert (block);
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    r = blk_read (next_ops, nxdata, blknum, block, err);
+    if (r == -1)
+      return -1;
+
+    memcpy (buf, block, count);
   }

   return 0;
@@ -197,27 +230,29 @@ cow_pwrite (struct nbdkit_next_ops *next_ops, void *nxdata,
             uint32_t flags, int *err)
 {
   CLEANUP_FREE uint8_t *block = NULL;
+  uint64_t blknum, blkoffs;
+  int r;

-  block = malloc (BLKSIZE);
-  if (block == NULL) {
-    *err = errno;
-    nbdkit_error ("malloc: %m");
-    return -1;
+  if (!IS_ALIGNED (count | offset, BLKSIZE)) {
+    block = malloc (BLKSIZE);
+    if (block == NULL) {
+      *err = errno;
+      nbdkit_error ("malloc: %m");
+      return -1;
+    }
   }

-  while (count > 0) {
-    uint64_t blknum, blkoffs, n;
-    int r;
+  blknum = offset / BLKSIZE;  /* block number */
+  blkoffs = offset % BLKSIZE; /* offset within the block */

-    blknum = offset / BLKSIZE;  /* block number */
-    blkoffs = offset % BLKSIZE; /* offset within the block */
-    n = BLKSIZE - blkoffs;      /* max bytes we can read from this block */
-    if (n > count)
-      n = count;
+  /* Unaligned head */
+  if (blkoffs) {
+    uint64_t n = MIN (BLKSIZE - blkoffs, count);

     /* Do a read-modify-write operation on the current block.
      * Hold the lock over the whole operation.
      */
+    assert (block);
     ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
     r = blk_read (next_ops, nxdata, blknum, block, err);
     if (r != -1) {
@@ -230,6 +265,33 @@ cow_pwrite (struct nbdkit_next_ops *next_ops, void *nxdata,
     buf += n;
     count -= n;
     offset += n;
+    blknum++;
+  }
+
+  /* Aligned body */
+  while (count >= BLKSIZE) {
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    r = blk_write (blknum, buf, err);
+    if (r == -1)
+      return -1;
+
+    buf += BLKSIZE;
+    count -= BLKSIZE;
+    offset += BLKSIZE;
+    blknum++;
+  }
+
+  /* Unaligned tail */
+  if (count) {
+    assert (block);
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    r = blk_read (next_ops, nxdata, blknum, block, err);
+    if (r != -1) {
+      memcpy (block, buf, count);
+      r = blk_write (blknum, block, err);
+    }
+    if (r == -1)
+      return -1;
   }

   if (flags & NBDKIT_FLAG_FUA)
@@ -244,6 +306,8 @@ cow_zero (struct nbdkit_next_ops *next_ops, void *nxdata,
           int *err)
 {
   CLEANUP_FREE uint8_t *block = NULL;
+  uint64_t blknum, blkoffs;
+  int r;

   block = malloc (BLKSIZE);
   if (block == NULL) {
@@ -252,21 +316,15 @@ cow_zero (struct nbdkit_next_ops *next_ops, void *nxdata,
     return -1;
   }

-  while (count > 0) {
-    uint64_t blknum, blkoffs, n;
-    int r;
+  blknum = offset / BLKSIZE;  /* block number */
+  blkoffs = offset % BLKSIZE; /* offset within the block */

-    blknum = offset / BLKSIZE;  /* block number */
-    blkoffs = offset % BLKSIZE; /* offset within the block */
-    n = BLKSIZE - blkoffs;      /* max bytes we can read from this block */
-    if (n > count)
-      n = count;
+  /* Unaligned head */
+  if (blkoffs) {
+    uint64_t n = MIN (BLKSIZE - blkoffs, count);

     /* Do a read-modify-write operation on the current block.
      * Hold the lock over the whole operation.
-     *
-     * XXX There is the possibility of optimizing this: ONLY if we are
-     * writing a whole, aligned block, then use FALLOC_FL_ZERO_RANGE.
      */
     ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
     r = blk_read (next_ops, nxdata, blknum, block, err);
@@ -279,6 +337,36 @@ cow_zero (struct nbdkit_next_ops *next_ops, void *nxdata,

     count -= n;
     offset += n;
+    blknum++;
+  }
+
+  /* Aligned body */
+  if (count >= BLKSIZE)
+    memset (block, 0, BLKSIZE);
+  while (count >= BLKSIZE) {
+    /* XXX There is the possibility of optimizing this: since this loop is
+     * writing a whole, aligned block, we should use FALLOC_FL_ZERO_RANGE.
+     */
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    r = blk_write (blknum, block, err);
+    if (r == -1)
+      return -1;
+
+    count -= BLKSIZE;
+    offset += BLKSIZE;
+    blknum++;
+  }
+
+  /* Unaligned tail */
+  if (count) {
+    ACQUIRE_LOCK_FOR_CURRENT_SCOPE (&lock);
+    r = blk_read (next_ops, nxdata, blknum, block, err);
+    if (r != -1) {
+      memset (block, 0, count);
+      r = blk_write (blknum, block, err);
+    }
+    if (r == -1)
+      return -1;
   }

   if (flags & NBDKIT_FLAG_FUA)
-- 
2.20.1



