[Libguestfs] [PATCH nbdkit v2 4/4] cache: Implement cache-max-size and method of reclaiming space from the cache.
Eric Blake
eblake at redhat.com
Thu Jan 3 03:39:10 UTC 2019
On 1/1/19 8:33 AM, Richard W.M. Jones wrote:
> The original plan was to have a background thread doing the reclaim.
> However that cannot work given the design of filters, because a
> background thread cannot access the next_ops struct which is only
> available during requests.
>
> Therefore we spread the work over the request threads. Each blk_*
> function checks whether there is work to do, and if there is will
> reclaim up to two blocks from the cache (thus ensuring that we always
> make forward progress, since each blk_* function can only add at most
> one block to the cache).
> ---
> filters/cache/nbdkit-cache-filter.pod | 71 +++++++++++
> filters/cache/cache.h | 13 ++
> filters/cache/blk.c | 163 ++++++++++++++++++++++++++
> filters/cache/cache.c | 56 +++++++++
> filters/cache/lru.c | 7 +-
> TODO | 11 +-
> tests/Makefile.am | 4 +-
> tests/test-cache-max-size.sh | 96 +++++++++++++++
> 8 files changed, 409 insertions(+), 12 deletions(-)
>
> +=back
> +
> +The way this works is once the size of the cache exceeds
> +S<C<SIZE> ✕ the high threshold>, the filter works to reduce the size
> +of the cache until it is less than S<C<SIZE> ✕ the low threshold>.
> +Once the size is below the low threshold, no more reclaim work is done
> +until the size exceeds the high threshold again.
> +
> +The default thresholds are high 95% and low 80%. The thresholds are
> +expressed as percentages of C<cache-max-size>, and may be greater than
> +100.
Do the thresholds support fractional percentages, like 90.5, or must
they be integral? Should we enforce that the low threshold is less than
the high threshold? And what does a threshold greater than 100 really
mean — that the high threshold can never be reached?
> +/* Do we support reclaiming cache blocks? */
> +#ifdef FALLOC_FL_PUNCH_HOLE
> +#define HAVE_CACHE_RECLAIM 1
> +#else
> +#undef HAVE_CACHE_RECLAIM
> +#endif
> +
Should the man page mention that max cache size can only be enforced
with kernel support? Do we want to go further and probe whether
FALLOC_FL_PUNCH_HOLE works on $TMPDIR? (Even if the kernel supports
punching holes, there are some filesystems that don't).
> @@ -74,6 +76,31 @@ enum bm_entry {
> BLOCK_DIRTY = 3,
> };
>
> +#ifdef HAVE_CACHE_RECLAIM
> +/* If we are currently reclaiming blocks from the cache.
> + *
> + * The state machine starts in the NOT_RECLAIMING state. When the
> + * size of the cache exceeds the high threshold, we move to
> + * RECLAIMING_LRU. Once we have exhausted all LRU blocks, we move to
> + * RECLAIMING_ANY (reclaiming any blocks).
> + *
> + * If at any time the size of the cache goes below the low threshold
> + * we move back to the NOT_RECLAIMING state.
> + *
> + * reclaim_blk is the last block that we will looked at.
s/will //
> +
> +#ifdef HAVE_CACHE_RECLAIM
> +
> +static void reclaim_one (void);
> +static void reclaim_lru (void);
> +static void reclaim_any (void);
> +static void reclaim_block (void);
> +
> +/* See if we need to reclaim blocks. */
> +static void
> +reclaim (void)
> +{
> + struct stat statbuf;
> + uint64_t cache_allocated;
> +
> + /* If the user didn't set cache-max-size, do nothing. */
> + if (max_size == -1) return;
> +
> + /* Check the allocated size of the cache. */
> + if (fstat (fd, &statbuf) == -1) {
> + nbdkit_debug ("cache: fstat: %m");
> + return;
> + }
> + cache_allocated = statbuf.st_blocks * UINT64_C(512);
Is that calculation always going to be accurate, even as more
disks/file systems move towards a 4k minimum access size?
> +
> + if (reclaiming) {
> + /* Keep reclaiming until the cache size drops below the low threshold. */
> + if (cache_allocated < max_size * lo_thresh / 100) {
> + nbdkit_debug ("cache: stop reclaiming");
> + reclaiming = NOT_RECLAIMING;
> + return;
> + }
> + }
> + else {
> + if (cache_allocated < max_size * hi_thresh / 100)
> + return;
> +
> + /* Start reclaiming if the cache size goes over the high threshold. */
> + nbdkit_debug ("cache: start reclaiming");
> + reclaiming = RECLAIMING_LRU;
> + }
> +
> + /* Reclaim up to 2 cache blocks. */
> + reclaim_one ();
> + reclaim_one ();
> +}
> +
> +/* Reclaim a single cache block. */
> +static void
> +reclaim_one (void)
> +{
> + assert (reclaiming);
> +
> + if (reclaiming == RECLAIMING_LRU)
> + reclaim_lru ();
> + else
> + reclaim_any ();
> +}
> +
> +static void
> +reclaim_lru (void)
> +{
> + uint64_t old_reclaim_blk;
> +
> + /* Find the next block in the cache. */
> + reclaim_blk = bitmap_next (&bm, reclaim_blk+1);
> + old_reclaim_blk = reclaim_blk;
> +
> + /* Search for an LRU block after this one. */
> + do {
> + if (! lru_has_been_recently_accessed (reclaim_blk)) {
> + reclaim_block ();
> + return;
> + }
> +
> + reclaim_blk = bitmap_next (&bm, reclaim_blk+1);
> + if (reclaim_blk == -1) /* wrap around */
> + reclaim_blk = bitmap_next (&bm, 0);
> + } while (reclaim_blk >= 0 && old_reclaim_blk != reclaim_blk);
> +
> + if (old_reclaim_blk == reclaim_blk) {
> + /* Run out of LRU blocks, so start reclaiming any block in the cache. */
> + nbdkit_debug ("cache: reclaiming any blocks");
> + reclaiming = RECLAIMING_ANY;
> + reclaim_one ();
> + return;
> + }
> +}
> +
> +static void
> +reclaim_any (void)
> +{
> + /* Find the next block in the cache. */
> + reclaim_blk = bitmap_next (&bm, reclaim_blk+1);
> + if (reclaim_blk == -1) /* wrap around */
> + reclaim_blk = bitmap_next (&bm, 0);
> +
> + reclaim_block ();
> +}
> +
> +static void
> +reclaim_block (void)
> +{
> + if (reclaim_blk == -1) {
> + nbdkit_debug ("cache: run out of blocks to reclaim!");
> + return;
> + }
> +
> + nbdkit_debug ("cache: reclaiming block %" PRIu64, reclaim_blk);
Where does this prioritize clean blocks over dirty blocks? Or is the
.pod change inaccurate given later changes to implementation?
> +#ifdef FALLOC_FL_PUNCH_HOLE
> + if (fallocate (fd, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE,
> + reclaim_blk * BLKSIZE, BLKSIZE) == -1) {
> + nbdkit_error ("cache: reclaiming cache blocks: "
> + "fallocate: FALLOC_FL_PUNCH_HOLE: %m");
> + return;
> + }
> +#else
> +#error "no implementation for punching holes"
This #error should be unreachable, given the definition of
HAVE_CACHE_RECLAIM based on FALLOC_FL_PUNCH_HOLE.
> +#endif
> +
> + bitmap_set_blk (&bm, reclaim_blk, BLOCK_NOT_CACHED);
> +}
> +
> +#else /* !HAVE_CACHE_RECLAIM */
> +static void
> +reclaim (void)
> +{
> + /* nothing */
> +}
> +#endif /* !HAVE_CACHE_RECLAIM */
> diff --git a/filters/cache/cache.c b/filters/cache/cache.c
> index a14f789..67dde23 100644
> --- a/filters/cache/cache.c
> +++ b/filters/cache/cache.c
> @@ -65,6 +65,8 @@
> static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
>
> enum cache_mode cache_mode = CACHE_MODE_WRITEBACK;
> +int64_t max_size = -1;
> +int hi_thresh = 95, lo_thresh = 80;
> bool cache_on_read = false;
>
> static int cache_flush (struct nbdkit_next_ops *next_ops, void *nxdata, void *handle, uint32_t flags, int *err);
> @@ -105,6 +107,44 @@ cache_config (nbdkit_next_config *next, void *nxdata,
> return -1;
> }
> }
> +#ifdef HAVE_CACHE_RECLAIM
> + else if (strcmp (key, "cache-max-size") == 0) {
> + int64_t r;
> +
> + r = nbdkit_parse_size (value);
> + if (r == -1)
> + return -1;
> + /* We set a lower limit for the cache size just to keep out of
> + * trouble.
> + */
> + if (r < 1024*1024) {
> + nbdkit_error ("cache-max-size is too small");
> + return -1;
> + }
> + max_size = r;
> + return 0;
> + }
> + else if (strcmp (key, "cache-high-threshold") == 0) {
> + if (sscanf (value, "%d", &hi_thresh) != 1) {
> + nbdkit_error ("invalid cache-high-threshold parameter: %s", value);
> + return -1;
> + }
> + if (hi_thresh <= 0) {
> + nbdkit_error ("cache-high-threshold must be greater than zero");
> + }
> + return 0;
> + }
> + else if (strcmp (key, "cache-low-threshold") == 0) {
> + if (sscanf (value, "%d", &lo_thresh) != 1) {
> + nbdkit_error ("invalid cache-low-threshold parameter: %s", value);
> + return -1;
> + }
> + if (lo_thresh <= 0) {
> + nbdkit_error ("cache-low-threshold must be greater than zero");
> + }
> + return 0;
> + }
> +#endif
There is no #else, so on platforms that can't punch holes, this silently
passes cache-max-size and friends on to the next filter layer, where it
will probably result in some odd error about being unrecognized. Would
it be better to have an explicit #else branch here that specifically
handles (and rejects) these parameters where we can't implement them?
> +++ b/tests/test-cache-max-size.sh
> +
> +# Check that this is a Linux-like system supporting /proc/pid/fd.
> +if ! test -d /proc/self/fd; then
> + echo "$0: not a Linux-like system supporting /proc/pid/fd"
Would this read any better if you used /proc/\$pid/fd or /proc/PID/fd in
the error message? Especially since...
> + exit 77
> +fi
> +
> +# Test that qemu-io works
> +if ! qemu-io --help >/dev/null; then
> + echo "$0: missing or broken qemu-io"
> + exit 77
> +fi
> +
> +# Need the stat command from coreutils.
> +if ! stat --version >/dev/null; then
> + echo "$0: missing or broken stat command"
> + exit 77
> +fi
> +
> +d=cache-max-size.d
> +rm -rf $d
> +mkdir -p $d
> +cleanup_fn rm -rf $d
> +
> +# Create a cache directory.
> +mkdir $d/cache
> +TMPDIR=$d/cache
> +export TMPDIR
> +
> +# Create an empty base image.
> +truncate -s 1G $d/cache-max-size.img
> +
> +# Run nbdkit with the caching filter and a low size limit to ensure
> +# that the reclaim code is exercised.
> +start_nbdkit -P $d/cache-max-size.pid -U $d/cache-max-size.sock \
> + --filter=cache \
> + file $d/cache-max-size.img \
> + cache-max-size=10M cache-on-read=true
> +
> +# Write > 10M to the plugin.
> +qemu-io -f raw "nbd+unix://?socket=$d/cache-max-size.sock" \
> + -c "w -P 0 0 10M" \
> + -c "w -P 0 20M 10M" \
> + -c "r 20M 10M" \
> + -c "r 30M 10M" \
> + -c "w -P 0 40M 10M"
> +
> +# We can't use ‘du’ on the cache directory because the cache file is
> +# deleted by the filter, and so is only accessible via /proc/PID/fd.
...you use PID here to make it obvious it is a placeholder?
> +# Get the /proc link to the cache file, and the size of it in bytes.
> +fddir="/proc/$( cat $d/cache-max-size.pid )/fd"
> +ls -l $fddir
> +fd="$fddir/$( ls -l $fddir | grep $TMPDIR | head -1 | awk '{print $9}' )"
> +stat -L $fd
> +size=$(( $(stat -L -c '%b * %B' $fd) ))
Wow - that's quite a bit of work. But it looks reasonable and sticks to
tools likely to already be present for other tests, and avoids dragging
in lsof or other less-common dependencies.
> +
> +if [ "$size" -gt $(( 11 * 1024 * 1024 )) ]; then
> + echo "$0: cache size is larger than 10M (actual size: $size bytes)"
> + exit 1
> +fi
>
--
Eric Blake, Principal Software Engineer
Red Hat, Inc. +1-919-301-3226
Virtualization: qemu.org | libvirt.org
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 488 bytes
Desc: OpenPGP digital signature
URL: <http://listman.redhat.com/archives/libguestfs/attachments/20190102/e91a2b7b/attachment.sig>
More information about the Libguestfs
mailing list