[Libguestfs] [PATCH nbdkit] file: Implement cache=none and fadvise=normal|random|sequential.

Richard W.M. Jones rjones at redhat.com
Fri Aug 7 11:31:14 UTC 2020


You can use these flags as described in the manual page to optimize
access patterns, and to get better behaviour with the page cache in
some scenarios.

For my testing I used the cachedel and cachestats utilities written by
Julius Plenz (https://github.com/Feh/nocache).  I started with a 32 GB
file of random data on a machine with about 32 GB of RAM.  At the
beginning of the test I evicted the file from the page cache:

$ cachedel /var/tmp/random
$ cachestats /var/tmp/random
pages in cache: 0/8388608 (0.0%)  [filesize=33554432.0K, pagesize=4K]

Performing a normal sequential copy of the file to /dev/null shows
that the file is almost entirely pulled into page cache (thus evicting
useful programs and data):

$ free -m; time ./nbdkit file /var/tmp/random --run 'qemu-img convert -n -p -m 16 -W $nbd "json:{\"file.driver\":\"null-co\",\"file.size\":\"1E\"}"' ; free -m ; cachestats /var/tmp/random
              total        used        free      shared  buff/cache   available
Mem:          32083        1193       27816           1        3073       30435
Swap:         16135          16       16119
    (100.00/100%)

real	0m12.437s
user	0m2.005s
sys	0m31.339s
              total        used        free      shared  buff/cache   available
Mem:          32083        1190         313           1       30578       30433
Swap:         16135          16       16119
pages in cache: 7053276/8388608 (84.1%)  [filesize=33554432.0K, pagesize=4K]

Now we repeat the test using fadvise=sequential cache=none:

$ cachedel /var/tmp/random
$ cachestats /var/tmp/random
pages in cache: 106/8388608 (0.0%)  [filesize=33554432.0K, pagesize=4K]

$ free -m; time ./nbdkit file /var/tmp/random fadvise=sequential cache=none --run 'qemu-img convert -n -p -m 16 -W $nbd "json:{\"file.driver\":\"null-co\",\"file.size\":\"1E\"}"' ; free -m ; cachestats /var/tmp/random
              total        used        free      shared  buff/cache   available
Mem:          32083        1188       27928           1        2966       30440
Swap:         16135          16       16119
    (100.00/100%)

real	0m13.107s
user	0m2.051s
sys	0m37.556s
              total        used        free      shared  buff/cache   available
Mem:          32083        1196       27861           1        3024       30429
Swap:         16135          16       16119
pages in cache: 14533/8388608 (0.2%)  [filesize=33554432.0K, pagesize=4K]

In this case the file largely avoids being pulled into the page cache,
and we do not evict useful stuff.

Notice that the test takes slightly longer to run.  This is expected
because page cache eviction happens synchronously.  I expect the cost
when doing sequential writes to be higher.  Linus outlined a technique
to do this without the overhead, but unfortunately it is considerably
more complex and dangerous than I am comfortable adding to the file
plugin:

http://lkml.iu.edu/hypermail/linux/kernel/1005.2/01845.html
http://lkml.iu.edu/hypermail/linux/kernel/1005.2/01953.html

(See also scary warnings in the sync_file_range man page)
---
 plugins/file/nbdkit-file-plugin.pod | 44 ++++++++++++++
 plugins/file/file.c                 | 90 +++++++++++++++++++++++++++++
 2 files changed, 134 insertions(+)

diff --git a/plugins/file/nbdkit-file-plugin.pod b/plugins/file/nbdkit-file-plugin.pod
index dac673ae..0d61b312 100644
--- a/plugins/file/nbdkit-file-plugin.pod
+++ b/plugins/file/nbdkit-file-plugin.pod
@@ -5,6 +5,7 @@ nbdkit-file-plugin - nbdkit file plugin
 =head1 SYNOPSIS
 
  nbdkit file [file=]FILENAME
+             [cache=default|none] [fadvise=normal|random|sequential]
 
 =head1 DESCRIPTION
 
@@ -17,6 +18,28 @@ It serves the named C<FILENAME> over NBD.  Local block devices
 
 =over 4
 
+=item B<cache=default>
+
+=item B<cache=none>
+
+Using C<cache=none> tries to prevent the kernel from keeping parts of
+the file that have already been read or written in the page cache.
+
+=item B<fadvise=normal>
+
+=item B<fadvise=random>
+
+=item B<fadvise=sequential>
+
+This optional flag hints to the kernel that you will access the file
+normally, or in a random order, or sequentially.  The exact behaviour
+depends on your operating system, but for Linux using C<normal> causes
+the kernel to read-ahead, C<sequential> causes the kernel to
+read-ahead twice as much as C<normal>, and C<random> turns off
+read-ahead.
+
+The default is C<normal>.
+
 =item [B<file=>]FILENAME
 
 Serve the file named C<FILENAME>.  A local block device name can also
@@ -31,6 +54,27 @@ See L<nbdkit(1)/Magic parameters>.
 
 =head1 NOTES
 
+=head2 Optimizing for random or sequential access
+
+If you know in advance that the NBD client will access the file
+randomly or only sequentially then you can hint that to the kernel
+using:
+
+ nbdkit file disk.img fadvise=random
+ nbdkit file disk.img fadvise=sequential
+
+As described in the L</PARAMETERS> section above, on Linux this
+disables or increases the amount of read-ahead that the kernel does.
+
+=head2 Reducing evictions from the page cache
+
+If the file is very large and you know the client will only
+read/write the file sequentially one time (eg for making a single copy
+or backup) then this will stop other processes' data from being evicted
+from the page cache:
+
+ nbdkit file disk.img fadvise=sequential cache=none
+
 =head2 Files on tmpfs
 
 If you want to expose a file that resides on a file system known to
diff --git a/plugins/file/file.c b/plugins/file/file.c
index 076e7531..a8e37cd9 100644
--- a/plugins/file/file.c
+++ b/plugins/file/file.c
@@ -66,6 +66,18 @@
 
 static char *filename = NULL;
 
+/* posix_fadvise mode: -1 = don't set it, or POSIX_FADV_*. */
+static int fadvise_mode =
+#if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_NORMAL)
+  POSIX_FADV_NORMAL
+#else
+  -1
+#endif
+  ;
+
+/* cache mode: cache_none = try to evict pages after each read/write. */
+static enum { cache_default, cache_none } cache_mode = cache_default;
+
 /* Any callbacks using lseek must be protected by this lock. */
 static pthread_mutex_t lseek_lock = PTHREAD_MUTEX_INITIALIZER;
 
@@ -97,6 +109,46 @@ file_config (const char *key, const char *value)
     if (!filename)
       return -1;
   }
+  else if (strcmp (key, "fadvise") == 0) {
+    /* As this is a hint, if the kernel doesn't support the feature
+     * ignore the parameter.
+     */
+    if (strcmp (value, "normal") == 0) {
+#if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_NORMAL)
+      fadvise_mode = POSIX_FADV_NORMAL;
+#else
+      fadvise_mode = -1;
+#endif
+    }
+    else if (strcmp (value, "random") == 0) {
+#if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_RANDOM)
+      fadvise_mode = POSIX_FADV_RANDOM;
+#else
+      fadvise_mode = -1;
+#endif
+    }
+    else if (strcmp (value, "sequential") == 0) {
+#if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_SEQUENTIAL)
+      fadvise_mode = POSIX_FADV_SEQUENTIAL;
+#else
+      fadvise_mode = -1;
+#endif
+    }
+    else {
+      nbdkit_error ("unknown fadvise mode: %s", value);
+      return -1;
+    }
+  }
+  else if (strcmp (key, "cache") == 0) {
+    if (strcmp (value, "default") == 0)
+      cache_mode = cache_default;
+    else if (strcmp (value, "none") == 0)
+      cache_mode = cache_none;
+    else {
+      nbdkit_error ("unknown cache mode: %s", value);
+      return -1;
+    }
+  }
   else if (strcmp (key, "rdelay") == 0 ||
            strcmp (key, "wdelay") == 0) {
     nbdkit_error ("add --filter=delay on the command line");
@@ -188,6 +240,17 @@ file_open (int readonly)
     return NULL;
   }
 
+  if (fadvise_mode != -1) {
+    /* This is a hint so we ignore failures. */
+#ifdef HAVE_POSIX_FADVISE
+    int r = posix_fadvise (h->fd, 0, 0, fadvise_mode);
+    if (r == -1)
+      nbdkit_debug ("posix_fadvise: %s: %m (ignored)", filename);
+#else
+    nbdkit_debug ("fadvise is not supported");
+#endif
+  }
+
   h->is_block_device = S_ISBLK (statbuf.st_mode);
   h->sector_size = 4096;  /* Start with safe guess */
 
@@ -329,6 +392,10 @@ file_pread (void *handle, void *buf, uint32_t count, uint64_t offset,
             uint32_t flags)
 {
   struct handle *h = handle;
+#if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_DONTNEED)
+  uint32_t orig_count = count;
+  uint64_t orig_offset = offset;
+#endif
 
   while (count > 0) {
     ssize_t r = pread (h->fd, buf, count, offset);
@@ -345,6 +412,12 @@ file_pread (void *handle, void *buf, uint32_t count, uint64_t offset,
     offset += r;
   }
 
+#ifdef HAVE_POSIX_FADVISE
+  /* On Linux this will evict the pages we just read from the page cache. */
+  if (cache_mode == cache_none)
+    posix_fadvise (h->fd, orig_offset, orig_count, POSIX_FADV_DONTNEED);
+#endif
+
   return 0;
 }
 
@@ -355,6 +428,17 @@ file_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset,
 {
   struct handle *h = handle;
 
+#if defined (HAVE_POSIX_FADVISE) && defined (POSIX_FADV_DONTNEED)
+  uint32_t orig_count = count;
+  uint64_t orig_offset = offset;
+
+  /* If cache=none we want to force pages we have just written to the
+   * file to be flushed to disk so we can immediately evict them from
+   * the page cache.
+   */
+  if (cache_mode == cache_none) flags |= NBDKIT_FLAG_FUA;
+#endif
+
   while (count > 0) {
     ssize_t r = pwrite (h->fd, buf, count, offset);
     if (r == -1) {
@@ -369,6 +453,12 @@ file_pwrite (void *handle, const void *buf, uint32_t count, uint64_t offset,
   if ((flags & NBDKIT_FLAG_FUA) && file_flush (handle, 0) == -1)
     return -1;
 
+#ifdef HAVE_POSIX_FADVISE
+  /* On Linux this will evict the pages we just wrote from the page cache. */
+  if (cache_mode == cache_none)
+    posix_fadvise (h->fd, orig_offset, orig_count, POSIX_FADV_DONTNEED);
+#endif
+
   return 0;
 }
 
-- 
2.27.0




More information about the Libguestfs mailing list