[Libguestfs] [PATCH nbdkit v2 4/4] cache: Implement cache-max-size and method of reclaiming space from the cache.

Eric Blake eblake at redhat.com
Thu Jan 3 03:39:10 UTC 2019


On 1/1/19 8:33 AM, Richard W.M. Jones wrote:
> The original plan was to have a background thread doing the reclaim.
> However that cannot work given the design of filters, because a
> background thread cannot access the next_ops struct which is only
> available during requests.
> 
> Therefore we spread the work over the request threads.  Each blk_*
> function checks whether there is work to do, and if there is will
> reclaim up to two blocks from the cache (thus ensuring that we always
> make forward progress, since each blk_* function can only add at most
> one block to the cache).
> ---
>  filters/cache/nbdkit-cache-filter.pod |  71 +++++++++++
>  filters/cache/cache.h                 |  13 ++
>  filters/cache/blk.c                   | 163 ++++++++++++++++++++++++++
>  filters/cache/cache.c                 |  56 +++++++++
>  filters/cache/lru.c                   |   7 +-
>  TODO                                  |  11 +-
>  tests/Makefile.am                     |   4 +-
>  tests/test-cache-max-size.sh          |  96 +++++++++++++++
>  8 files changed, 409 insertions(+), 12 deletions(-)
> 

> +=back
> +
> +The way this works is once the size of the cache exceeds
> +S<C<SIZE> ✕ the high threshold>, the filter works to reduce the size
> +of the cache until it is less than S<C<SIZE> ✕ the low threshold>.
> +Once the size is below the low threshold, no more reclaim work is done
> +until the size exceeds the high threshold again.
> +
> +The default thresholds are high 95% and low 80%.  The thresholds are
> +expressed as percentages of C<cache-max-size>, and may be greater than
> +100.

Do the thresholds support fractional percentages, like 90.5, or must
they be integral?  Should we enforce that the low threshold is strictly
less than the high threshold?  And what does a threshold > 100 really
mean — that we can never reach the high threshold?

> +/* Do we support reclaiming cache blocks? */
> +#ifdef FALLOC_FL_PUNCH_HOLE
> +#define HAVE_CACHE_RECLAIM 1
> +#else
> +#undef HAVE_CACHE_RECLAIM
> +#endif
> +

Should the man page mention that the maximum cache size can only be
enforced with kernel support?  Do we want to go further and probe
whether FALLOC_FL_PUNCH_HOLE actually works on $TMPDIR?  (Even if the
kernel supports punching holes, some filesystems don't support it.)


> @@ -74,6 +76,31 @@ enum bm_entry {
>    BLOCK_DIRTY = 3,
>  };
>  
> +#ifdef HAVE_CACHE_RECLAIM
> +/* If we are currently reclaiming blocks from the cache.
> + *
> + * The state machine starts in the NOT_RECLAIMING state.  When the
> + * size of the cache exceeds the high threshold, we move to
> + * RECLAIMING_LRU.  Once we have exhausted all LRU blocks, we move to
> + * RECLAIMING_ANY (reclaiming any blocks).
> + *
> + * If at any time the size of the cache goes below the low threshold
> + * we move back to the NOT_RECLAIMING state.
> + *
> + * reclaim_blk is the last block that we will looked at.

s/will //


> +
> +#ifdef HAVE_CACHE_RECLAIM
> +
> +static void reclaim_one (void);
> +static void reclaim_lru (void);
> +static void reclaim_any (void);
> +static void reclaim_block (void);
> +
> +/* See if we need to reclaim blocks. */
> +static void
> +reclaim (void)
> +{
> +  struct stat statbuf;
> +  uint64_t cache_allocated;
> +
> +  /* If the user didn't set cache-max-size, do nothing. */
> +  if (max_size == -1) return;
> +
> +  /* Check the allocated size of the cache. */
> +  if (fstat (fd, &statbuf) == -1) {
> +    nbdkit_debug ("cache: fstat: %m");
> +    return;
> +  }
> +  cache_allocated = statbuf.st_blocks * UINT64_C(512);

Is that calculation always going to be accurate, even as more disks and
file systems move towards a 4k minimum access size?

> +
> +  if (reclaiming) {
> +    /* Keep reclaiming until the cache size drops below the low threshold. */
> +    if (cache_allocated < max_size * lo_thresh / 100) {
> +      nbdkit_debug ("cache: stop reclaiming");
> +      reclaiming = NOT_RECLAIMING;
> +      return;
> +    }
> +  }
> +  else {
> +    if (cache_allocated < max_size * hi_thresh / 100)
> +      return;
> +
> +    /* Start reclaiming if the cache size goes over the high threshold. */
> +    nbdkit_debug ("cache: start reclaiming");
> +    reclaiming = RECLAIMING_LRU;
> +  }
> +
> +  /* Reclaim up to 2 cache blocks. */
> +  reclaim_one ();
> +  reclaim_one ();
> +}
> +
> +/* Reclaim a single cache block. */
> +static void
> +reclaim_one (void)
> +{
> +  assert (reclaiming);
> +
> +  if (reclaiming == RECLAIMING_LRU)
> +    reclaim_lru ();
> +  else
> +    reclaim_any ();
> +}
> +
> +static void
> +reclaim_lru (void)
> +{
> +  uint64_t old_reclaim_blk;
> +
> +  /* Find the next block in the cache. */
> +  reclaim_blk = bitmap_next (&bm, reclaim_blk+1);
> +  old_reclaim_blk = reclaim_blk;
> +
> +  /* Search for an LRU block after this one. */
> +  do {
> +    if (! lru_has_been_recently_accessed (reclaim_blk)) {
> +      reclaim_block ();
> +      return;
> +    }
> +
> +    reclaim_blk = bitmap_next (&bm, reclaim_blk+1);
> +    if (reclaim_blk == -1)    /* wrap around */
> +      reclaim_blk = bitmap_next (&bm, 0);
> +  } while (reclaim_blk >= 0 && old_reclaim_blk != reclaim_blk);
> +
> +  if (old_reclaim_blk == reclaim_blk) {
> +    /* Run out of LRU blocks, so start reclaiming any block in the cache. */
> +    nbdkit_debug ("cache: reclaiming any blocks");
> +    reclaiming = RECLAIMING_ANY;
> +    reclaim_one ();
> +    return;
> +  }
> +}
> +
> +static void
> +reclaim_any (void)
> +{
> +  /* Find the next block in the cache. */
> +  reclaim_blk = bitmap_next (&bm, reclaim_blk+1);
> +  if (reclaim_blk == -1)        /* wrap around */
> +    reclaim_blk = bitmap_next (&bm, 0);
> +
> +  reclaim_block ();
> +}
> +
> +static void
> +reclaim_block (void)
> +{
> +  if (reclaim_blk == -1) {
> +    nbdkit_debug ("cache: run out of blocks to reclaim!");
> +    return;
> +  }
> +
> +  nbdkit_debug ("cache: reclaiming block %" PRIu64, reclaim_blk);

Where does this prioritize clean blocks over dirty blocks?  Or is the
.pod change inaccurate given later changes to implementation?

> +#ifdef FALLOC_FL_PUNCH_HOLE
> +  if (fallocate (fd, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE,
> +                 reclaim_blk * BLKSIZE, BLKSIZE) == -1) {
> +    nbdkit_error ("cache: reclaiming cache blocks: "
> +                  "fallocate: FALLOC_FL_PUNCH_HOLE: %m");
> +    return;
> +  }
> +#else
> +#error "no implementation for punching holes"

This #error should be unreachable, given the definition of
HAVE_CACHE_RECLAIM based on FALLOC_FL_PUNCH_HOLE.

> +#endif
> +
> +  bitmap_set_blk (&bm, reclaim_blk, BLOCK_NOT_CACHED);
> +}
> +
> +#else /* !HAVE_CACHE_RECLAIM */
> +static void
> +reclaim (void)
> +{
> +  /* nothing */
> +}
> +#endif /* !HAVE_CACHE_RECLAIM */
> diff --git a/filters/cache/cache.c b/filters/cache/cache.c
> index a14f789..67dde23 100644
> --- a/filters/cache/cache.c
> +++ b/filters/cache/cache.c
> @@ -65,6 +65,8 @@
>  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
>  
>  enum cache_mode cache_mode = CACHE_MODE_WRITEBACK;
> +int64_t max_size = -1;
> +int hi_thresh = 95, lo_thresh = 80;
>  bool cache_on_read = false;
>  
>  static int cache_flush (struct nbdkit_next_ops *next_ops, void *nxdata, void *handle, uint32_t flags, int *err);
> @@ -105,6 +107,44 @@ cache_config (nbdkit_next_config *next, void *nxdata,
>        return -1;
>      }
>    }
> +#ifdef HAVE_CACHE_RECLAIM
> +  else if (strcmp (key, "cache-max-size") == 0) {
> +    int64_t r;
> +
> +    r = nbdkit_parse_size (value);
> +    if (r == -1)
> +      return -1;
> +    /* We set a lower limit for the cache size just to keep out of
> +     * trouble.
> +     */
> +    if (r < 1024*1024) {
> +      nbdkit_error ("cache-max-size is too small");
> +      return -1;
> +    }
> +    max_size = r;
> +    return 0;
> +  }
> +  else if (strcmp (key, "cache-high-threshold") == 0) {
> +    if (sscanf (value, "%d", &hi_thresh) != 1) {
> +      nbdkit_error ("invalid cache-high-threshold parameter: %s", value);
> +      return -1;
> +    }
> +    if (hi_thresh <= 0) {
> +      nbdkit_error ("cache-high-threshold must be greater than zero");
> +    }
> +    return 0;
> +  }
> +  else if (strcmp (key, "cache-low-threshold") == 0) {
> +    if (sscanf (value, "%d", &lo_thresh) != 1) {
> +      nbdkit_error ("invalid cache-low-threshold parameter: %s", value);
> +      return -1;
> +    }
> +    if (lo_thresh <= 0) {
> +      nbdkit_error ("cache-low-threshold must be greater than zero");
> +    }
> +    return 0;
> +  }
> +#endif

There is no #else, so on platforms that can't punch holes, this silently
passes cache-max-size and friends on to the next filter layer, where it
will probably result in some odd error about being unrecognized. Would
it be better to have an explicit #else branch here that specifically
handles (and rejects) these parameters where we can't implement them?

> +++ b/tests/test-cache-max-size.sh

> +
> +# Check that this is a Linux-like system supporting /proc/pid/fd.
> +if ! test -d /proc/self/fd; then
> +    echo "$0: not a Linux-like system supporting /proc/pid/fd"

Would this read any better if you used /proc/\$pid/fd or /proc/PID/fd in
the error message?  Especially since...

> +    exit 77
> +fi
> +
> +# Test that qemu-io works
> +if ! qemu-io --help >/dev/null; then
> +    echo "$0: missing or broken qemu-io"
> +    exit 77
> +fi
> +
> +# Need the stat command from coreutils.
> +if ! stat --version >/dev/null; then
> +    echo "$0: missing or broken stat command"
> +    exit 77
> +fi
> +
> +d=cache-max-size.d
> +rm -rf $d
> +mkdir -p $d
> +cleanup_fn rm -rf $d
> +
> +# Create a cache directory.
> +mkdir $d/cache
> +TMPDIR=$d/cache
> +export TMPDIR
> +
> +# Create an empty base image.
> +truncate -s 1G $d/cache-max-size.img
> +
> +# Run nbdkit with the caching filter and a low size limit to ensure
> +# that the reclaim code is exercised.
> +start_nbdkit -P $d/cache-max-size.pid -U $d/cache-max-size.sock \
> +             --filter=cache \
> +             file $d/cache-max-size.img \
> +             cache-max-size=10M cache-on-read=true
> +
> +# Write > 10M to the plugin.
> +qemu-io -f raw "nbd+unix://?socket=$d/cache-max-size.sock" \
> +        -c "w -P 0 0 10M" \
> +        -c "w -P 0 20M 10M" \
> +        -c "r 20M 10M" \
> +        -c "r 30M 10M" \
> +        -c "w -P 0 40M 10M"
> +
> +# We can't use ‘du’ on the cache directory because the cache file is
> +# deleted by the filter, and so is only accessible via /proc/PID/fd.

...you use PID here to make it obvious it is a placeholder?

> +# Get the /proc link to the cache file, and the size of it in bytes.
> +fddir="/proc/$( cat $d/cache-max-size.pid )/fd"
> +ls -l $fddir
> +fd="$fddir/$( ls -l $fddir | grep $TMPDIR | head -1 | awk '{print $9}' )"
> +stat -L $fd
> +size=$(( $(stat -L -c '%b * %B' $fd) ))

Wow — that's quite a bit of work.  But it looks reasonable, sticks to
tools likely to already be present for other tests, and avoids dragging
in lsof or other less-common dependencies.

> +
> +if [ "$size" -gt $(( 11 * 1024 * 1024 )) ]; then
> +    echo "$0: cache size is larger than 10M (actual size: $size bytes)"
> +    exit 1
> +fi
> 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.           +1-919-301-3226
Virtualization:  qemu.org | libvirt.org

-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 488 bytes
Desc: OpenPGP digital signature
URL: <http://listman.redhat.com/archives/libguestfs/attachments/20190102/e91a2b7b/attachment.sig>


More information about the Libguestfs mailing list