[Linux-cachefs] [PATCH 2/2] Suspend/resume culling based on recently released file/block counts

Steve Dickson SteveD at redhat.com
Wed Jan 27 16:03:46 UTC 2016



On 01/25/2016 11:49 AM, David Howells wrote:
> Newer kernels include the counts of objects and blocks unpinned since the
> status was last read over the control device fd.  The unpinning is usually
> through relinquishment by the netfs.
> 
> If these fields are present, extract them and use them to work out whether
> there can be anything to cull.  The fields are reset when read and POLLIN
> is signalled when the file counter is increased from 0.
> 
> If there is nothing new to cull, scanning to build a cull table is
> suspended until more stuff is available.  If the new fields are present,
> the release counts decide when to resume - otherwise we sleep for 30s and
> try again.
> 
> The thresholds at which culling should be resumed can be set by adding:
> 
> 	resume_thresholds <block> <file>
> 
> to the config file.  The cull scan is retried once enough cumulative
> unpinning has happened that either threshold is reached.  The block
> threshold watches the cumulative i_blocks for unpinned cache inodes and the
> file threshold watches the number of cache inodes unpinned.  If either is
> "-" then that threshold is disabled.  The default is 5 files and no block
> threshold.
> 
> Further:
> 
>  (*) Some of the variable names are changed to be more obvious as to what
>      they do.
> 
>  (*) If time-based culling is used, then SIGALRM must be blocked between
>      the did-it-happen check and the call to poll.  SIGINT, SIGTERM and
>      SIGIO are already so blocked.
> 
>  (*) All the decision making logic is moved into the cachefilesd() function
>      rather than being spread out.
> 
> Signed-off-by: David Howells <dhowells at redhat.com>
Reviewed-by: Steve Dickson <steved at redhat.com>

steved.

> ---
> 
>  README             |   13 ++
>  cachefilesd.c      |  275 +++++++++++++++++++++++++++++++++++++---------------
>  cachefilesd.conf.5 |   10 ++
>  3 files changed, 219 insertions(+), 79 deletions(-)
> 
> diff --git a/README b/README
> index 8446a3b..6ed7de2 100644
> --- a/README
> +++ b/README
> @@ -114,6 +114,19 @@ set up cache ready for use.  The following script commands are available:
>  	entries.  The permissible values are between 12 and 20, the latter
>  	indicating 1048576 entries.  The default is 12.
>  
> + (*) resume_thresholds <blocks> <files>
> +
> +	Scanning to refill the cull table is suspended when all the objects in
> +	a cache are pinned by a live network filesystem in the kernel and
> +	there's nothing available to cull.  Scanning is resumed when the kernel
> +	releases sufficient objects that either the number of objects released
> +	reaches the files parameter here or the cumulative i_blocks values
> +	reach the blocks parameter.  Either threshold can be disabled by
> +	specifying it as "-".
> +
> +	The default is to ignore the block threshold and to resume when five or
> +	more files have been released.
> +
>   (*) debug <mask>
>  
>  	Specify a numeric bitmask to control debugging in the kernel module.
> diff --git a/cachefilesd.c b/cachefilesd.c
> index eaa1bb0..affd9f9 100644
> --- a/cachefilesd.c
> +++ b/cachefilesd.c
> @@ -46,6 +46,7 @@
>  #include <dirent.h>
>  #include <time.h>
>  #include <poll.h>
> +#include <limits.h>
>  #include <sys/inotify.h>
>  #include <sys/time.h>
>  #include <sys/vfs.h>
> @@ -85,8 +86,8 @@ static int nobjects = 1;
>  static int nopendir;
>  
>  /* current scan point */
> -static struct object *scan = &root;
> -static bool jumpstart_scan;
> +static struct object *scan_cursor;
> +static bool scan_signalled, stop_signalled, reap_signalled;
>  
>  /* ranked order of cullable objects
>   * - we have two tables: one we're building and one that's full of ready to be
> @@ -99,7 +100,10 @@ static struct object **cullready;
>  static unsigned nr_in_build_table;
>  static unsigned nr_in_ready_table;
>  static int ncullable;
> -static bool cull_delayed;
> +static bool kernel_wants_cull;
> +static bool have_nr_releases;
> +static unsigned long long f_released_since_last_scan;
> +static unsigned long long b_released_since_last_scan;
>  
>  
>  static const char *configfile = "/etc/cachefilesd.conf";
> @@ -108,11 +112,13 @@ static const char *procfile = "/proc/fs/cachefiles";
>  static const char *pidfile = "/var/run/cachefilesd.pid";
>  static char *cacheroot, *graveyardpath;
>  
> +static bool culling_disabled;
>  static bool xnolog, xopenedlog;
>  static int xdebug;
> -static bool nocull, stop, reap, cull;
>  static int graveyardfd;
>  static unsigned long long brun, bcull, bstop, frun, fcull, fstop;
> +static unsigned long long b_resume_threshold = ULLONG_MAX;
> +static unsigned long long f_resume_threshold = 5;
>  
>  #define cachefd 3
>  
> @@ -212,7 +218,8 @@ static void reap_graveyard_aux(const char *dirname);
>  static void read_cache_state(void);
>  static int is_object_in_use(const char *filename);
>  static void cull_file(const char *filename);
> -static void build_cull_table(void);
> +static void begin_building_cull_table(void);
> +static bool build_cull_table(void);
>  static void decant_cull_table(void);
>  static void insert_into_cull_table(struct object *object);
>  static void put_object(struct object *object);
> @@ -228,7 +235,7 @@ static void cull_objects(void);
>   */
>  static void sigterm(int sig)
>  {
> -	stop = true;
> +	stop_signalled = true;
>  }
>  
>  /*****************************************************************************/
> @@ -237,7 +244,7 @@ static void sigterm(int sig)
>   */
>  static void sigio(int sig)
>  {
> -	reap = true;
> +	reap_signalled = true;
>  }
>  
>  /*****************************************************************************/
> @@ -246,8 +253,7 @@ static void sigio(int sig)
>   */
>  static void sigalrm(int sig)
>  {
> -	jumpstart_scan = true;
> -	cull_delayed = false;
> +	scan_signalled = true;
>  }
>  
>  /*****************************************************************************/
> @@ -312,7 +318,7 @@ int main(int argc, char *argv[])
>  
>  		case 'N':
>  			/* disable culling */
> -			nocull = true;
> +			culling_disabled = true;
>  			break;
>  
>  		case 'f':
> @@ -420,7 +426,7 @@ int main(int argc, char *argv[])
>  		/*  allow culling to be disabled */
>  		if (memcmp(cp, "nocull", 6) == 0 &&
>  		    (!cp[6] || isspace(cp[6]))) {
> -			nocull = true;
> +			culling_disabled = true;
>  		}
>  
>  		/* note the cull table size command */
> @@ -439,6 +445,40 @@ int main(int argc, char *argv[])
>  			continue;
>  		}
>  
> +		/* Note the suspension resume released file count thresholds
> +		 * ("-" to disable a threshold).
> +		 */
> +		if (memcmp(cp, "resume_thresholds", 18) == 0 && isspace(cp[18])) {
> +			unsigned long long b_thresh, f_thresh;
> +			char *sp;
> +
> +			for (sp = cp + 18; isspace(*sp); sp++) {;}
> +
> +			if (*sp == '-') {
> +				sp++;
> +				b_thresh = ULLONG_MAX;
> +			} else {
> +				b_thresh = strtoul(sp, &sp, 10);
> +			}
> +
> +			if (!*sp || !isspace(*sp) || b_thresh == 0)
> +				cfgerror("Invalid resume threshold (blocks)");
> +			for (; isspace(*sp); sp++) {;}
> +
> +			if (*sp == '-') {
> +				sp++;
> +				f_thresh = ULLONG_MAX;
> +			} else {
> +				f_thresh = strtoul(sp, &sp, 10);
> +				if (*sp || f_thresh == 0)
> +					cfgerror("Invalid resume threshold (files)");
> +			}
> +
> +			b_resume_threshold = b_thresh;
> +			f_resume_threshold = f_thresh;
> +			continue;
> +		}
> +
>  		/* note the dir command */
>  		if (memcmp(cp, "dir", 3) == 0 && isspace(cp[3])) {
>  			char *sp;
> @@ -479,7 +519,7 @@ int main(int argc, char *argv[])
>  		oserror("Unable to close %s", configfile);
>  
>  	/* allocate the cull tables */
> -	if (!nocull) {
> +	if (!culling_disabled) {
>  		cullbuild = calloc(culltable_size, sizeof(cullbuild[0]));
>  		if (!cullbuild)
>  			oserror("calloc");
> @@ -583,6 +623,8 @@ static void open_cache(void)
>  static void cachefilesd(void)
>  {
>  	sigset_t sigs, osigs;
> +	bool scanning_suspended = false;
> +	bool scan_in_progress = false;
>  
>  	struct pollfd pollfds[1] = {
>  		[0] = {
> @@ -596,13 +638,14 @@ static void cachefilesd(void)
>  	/* open the cache directories */
>  	open_cache();
>  
> -	/* we need to disable I/O and termination signals so they're only
> -	 * caught at appropriate times
> +	/* We need to be able to disable signals that we need to check for
> +	 * before calling poll so that we don't race and miss something.
>  	 */
>  	sigemptyset(&sigs);
>  	sigaddset(&sigs, SIGIO);
>  	sigaddset(&sigs, SIGINT);
>  	sigaddset(&sigs, SIGTERM);
> +	sigaddset(&sigs, SIGALRM);
>  
>  	signal(SIGTERM, sigterm);
>  	signal(SIGINT, sigterm);
> @@ -610,16 +653,81 @@ static void cachefilesd(void)
>  	/* check the graveyard for graves */
>  	reap_graveyard();
>  
> -	while (!stop) {
> +	while (!stop_signalled) {
> +		bool do_cull = false;
> +
> +		debug(3, "Loop %sbuild=%d ready=%d susp=%u scan=%u",
> +		      culling_disabled ? "NOCULL " : "",
> +		      nr_in_build_table, nr_in_ready_table,
> +		      scanning_suspended, scan_in_progress);
> +		
>  		read_cache_state();
>  
> -		/* sleep without racing on reap and cull with the signal
> -		 * handlers */
> -		if (!scan && !reap && !(cull && !cull_delayed)) {
> +		if (!culling_disabled) {
> +			/* Determine if we're going to need to start a new scan
> +			 * to refill the cull table.  We want to do this if the
> +			 * secondary cull table is less than half full - but
> +			 * overriding that, we don't want to do this if we know
> +			 * there's insufficient cullables to make it worth
> +			 * while.
> +			 */
> +			if (!scan_in_progress) {
> +				bool begin_scan = false;
> +
> +				debug(1, "Consider scan %d/%d",
> +				      nr_in_build_table, culltable_size / 2);
> +
> +				if (nr_in_build_table < culltable_size / 2) {
> +					debug(1, "Want to scan");
> +					begin_scan = true;
> +				}
> +
> +				if (begin_scan && scanning_suspended) {
> +					debug(1, "Scanning suspended");
> +					if (have_nr_releases) {
> +						if (f_released_since_last_scan <
> +						    f_resume_threshold &&
> +						    b_released_since_last_scan <
> +						    b_resume_threshold)
> +							begin_scan = false;
> +					} else {
> +						begin_scan = scan_signalled;
> +					}
> +				}
> +
> +				if (begin_scan) {
> +					debug(1, "Beginning a scan");
> +					begin_building_cull_table();
> +					scan_in_progress = true;
> +					scanning_suspended = false;
> +					scan_signalled = false;
> +					f_released_since_last_scan = 0;
> +					b_released_since_last_scan = 0;
> +				}
> +			}
> +
> +			/* Determine if there's anything we can actually cull yet if
> +			 * the kernel is calling for space.
> +			 */
> +			if (kernel_wants_cull) {
> +				debug(1, "Want to cull");
> +				if (nr_in_ready_table > 0)
> +					do_cull = true;
> +			}
> +		}
> +
> +		/* We block the signals across the checks for reap, cull and
> +		 * scan initiation before polling so that we sleep without
> +		 * racing against the signal handlers.
> +		 */
> +		if (!scan_in_progress && !reap_signalled && !do_cull) {
>  			if (sigprocmask(SIG_BLOCK, &sigs, &osigs) < 0)
>  				oserror("Unable to block signals");
>  
> -			if (!reap && !stop && !jumpstart_scan) {
> +			if (!reap_signalled &&
> +			    !stop_signalled &&
> +			    !scan_signalled) {
> +				debug(1, "Poll");
>  				if (ppoll(pollfds, 1, NULL, &osigs) < 0 &&
>  				    errno != EINTR)
>  					oserror("Unable to suspend process");
> @@ -627,37 +735,44 @@ static void cachefilesd(void)
>  
>  			if (sigprocmask(SIG_UNBLOCK, &sigs, NULL) < 0)
>  				oserror("Unable to unblock signals");
> -
> -			read_cache_state();
> +			continue;
>  		}
>  
> -		if (nocull) {
> -			cull = false;
> -		} else {
> -			if (jumpstart_scan) {
> -				jumpstart_scan = false;
> -				if (!stop && !scan) {
> -					debug(1, "Refilling cull table");
> -					root.usage++;
> -					scan = &root;
> +		if (!culling_disabled) {
> +			if (do_cull)
> +				cull_objects();
> +
> +			if (scan_in_progress) {
> +				scan_in_progress = build_cull_table();
> +				if (!scan_in_progress) {
> +					/* Scan complete.
> +					 *
> +					 * If the scan didn't produce a full
> +					 * table then don't repeat the scan
> +					 * until something gets released by the
> +					 * kernel.
> +					 */
> +					if (nr_in_build_table < culltable_size) {
> +						debug(1, "Suspend scanning");
> +						scanning_suspended = true;
> +						if (!have_nr_releases) {
> +							signal(SIGALRM, sigalrm);
> +							alarm(30);
> +						}
> +					}
>  				}
>  			}
>  
> -			if (cull) {
> -				if (nr_in_ready_table > 0)
> -					cull_objects();
> -				else if (nr_in_build_table == 0 && !cull_delayed)
> -					jumpstart_scan = true;
> +			if (!scan_in_progress) {
> +				if (nr_in_ready_table <= culltable_size / 2 + 2 &&
> +				    nr_in_build_table > 0) {
> +					debug(1, "Decant");
> +					decant_cull_table();
> +				}
>  			}
> -
> -			if (scan)
> -				build_cull_table();
> -
> -			if (!scan && nr_in_ready_table == 0 && nr_in_build_table > 0)
> -				decant_cull_table();
>  		}
>  
> -		if (reap)
> +		if (reap_signalled)
>  			reap_graveyard();
>  	}
>  
> @@ -672,7 +787,7 @@ static void cachefilesd(void)
>  static void reap_graveyard(void)
>  {
>  	/* set a one-shot notification to catch more graves appearing */
> -	reap = false;
> +	reap_signalled = false;
>  	signal(SIGIO, sigio);
>  	if (fcntl(graveyardfd, F_NOTIFY, DN_CREATE) < 0)
>  		oserror("unable to set notification on graveyard");
> @@ -763,6 +878,8 @@ static void read_cache_state(void)
>  		oserror("Unable to read cache state");
>  	buffer[n] = '\0';
>  
> +	debug(3, "KERNEL: %s", buffer);
> +	
>  	tok = buffer;
>  	do {
>  		next = strpbrk(tok, " \t");
> @@ -777,20 +894,27 @@ static void read_cache_state(void)
>  			continue;
>  		}
>  
> -		if (strcmp(tok, "cull") == 0)
> -			cull = (strtoul(arg, NULL, 0) != 0);
> -		else if (strcmp(tok, "brun") == 0)
> +		if (strcmp(tok, "cull") == 0) {
> +			kernel_wants_cull = (strtoul(arg, NULL, 0) != 0);
> +		} else if (strcmp(tok, "brun") == 0) {
>  			brun = strtoull(arg, NULL, 16);
> -		else if (strcmp(tok, "bcull") == 0)
> +		} else if (strcmp(tok, "bcull") == 0) {
>  			bcull = strtoull(arg, NULL, 16);
> -		else if (strcmp(tok, "bstop") == 0)
> +		} else if (strcmp(tok, "bstop") == 0) {
>  			bstop = strtoull(arg, NULL, 16);
> -		else if (strcmp(tok, "frun") == 0)
> +		} else if (strcmp(tok, "frun") == 0) {
>  			frun = strtoull(arg, NULL, 16);
> -		else if (strcmp(tok, "fcull") == 0)
> +		} else if (strcmp(tok, "fcull") == 0) {
>  			fcull = strtoull(arg, NULL, 16);
> -		else if (strcmp(tok, "fstop") == 0)
> +		} else if (strcmp(tok, "fstop") == 0) {
>  			fstop = strtoull(arg, NULL, 16);
> +		} else if (strcmp(tok, "breleased") == 0) {
> +			b_released_since_last_scan += strtoull(arg, NULL, 16);
> +			have_nr_releases = true;
> +		} else if (strcmp(tok, "freleased") == 0) {
> +			f_released_since_last_scan += strtoull(arg, NULL, 16);
> +			have_nr_releases = true;
> +		}
>  
>  	} while ((tok = next));
>  }
> @@ -1123,9 +1247,21 @@ static void insert_into_cull_table(struct object *object)
>  
>  /*****************************************************************************/
>  /*
> - * do the next step in building up the cull table
> + * Begin a scan to build a cull table.
>   */
> -static void build_cull_table(void)
> +static void begin_building_cull_table(void)
> +{
> +	debug(1, "Refilling cull table");
> +	root.usage++;
> +	scan_cursor = &root;
> +}
> +
> +/*****************************************************************************/
> +/*
> + * Do the next step in building up the cull table.  Returns false upon
> + * completion of a scan.
> + */
> +static bool build_cull_table(void)
>  {
>  	struct dirent dirent, *de;
>  	struct object *curr, *child;
> @@ -1133,7 +1269,7 @@ static void build_cull_table(void)
>  	unsigned loop;
>  	int fd;
>  
> -	curr = scan;
> +	curr = scan_cursor;
>  
>  	if (!curr->dir) {
>  		curr->empty = true;
> @@ -1289,10 +1425,10 @@ next:
>  		debug(2, "- descend");
>  
>  		child->new = false;
> -		scan = child;
> +		scan_cursor = child;
>  
>  		debug(2, "<-- build_cull_table({%s})", curr->name);
> -		return;
> +		return true;
>  
>  	default:
>  		error("Unexpected type");
> @@ -1335,15 +1471,13 @@ dir_read_complete:
>  		}
>  	}
>  
> -	scan = curr->parent;
> -	if (!scan) {
> +	scan_cursor = curr->parent;
> +	if (!scan_cursor)
>  		debug(1, "Scan complete");
> -		decant_cull_table();
> -	}
>  
>  	debug(2, "<-- build_cull_table({%s})", curr->name);
>  	put_object(curr);
> -	return;
> +	return scan_cursor != NULL;
>  
>  	/* delete unexpected objects that we've found */
>  found_unexpected_object:
> @@ -1361,17 +1495,9 @@ static void decant_cull_table(void)
>  {
>  	unsigned loop, avail, copy, leave, space, n;
>  
> -	if (scan)
> +	if (scan_cursor)
>  		error("Can't decant cull table whilst scanning");
>  
> -	/* if nothing there, scan again in a short while */
> -	if (nr_in_build_table == 0) {
> -		cull_delayed = true;
> -		signal(SIGALRM, sigalrm);
> -		alarm(30);
> -		return;
> -	}
> -
>  	/* mark the new entries cullable */
>  	for (loop = 0; loop < nr_in_build_table; loop++) {
>  		if (!cullbuild[loop]->cullable) {
> @@ -1504,13 +1630,4 @@ static void cull_objects(void)
>  		cull_object(cullready[nr_in_ready_table - 1]);
>  		cullready[--nr_in_ready_table] = (void *)(0x6b000000 | __LINE__);
>  	}
> -
> -	/* must start refilling the cull table */
> -	if (!scan && nr_in_build_table < culltable_size / 2 + 2) {
> -		decant_cull_table();
> -
> -		debug(1, "Refilling cull table");
> -		root.usage++;
> -		scan = &root;
> -	}
>  }
> diff --git a/cachefilesd.conf.5 b/cachefilesd.conf.5
> index 028105e..b108bdc 100644
> --- a/cachefilesd.conf.5
> +++ b/cachefilesd.conf.5
> @@ -75,6 +75,16 @@ amount of a systems resources, which may be undesirable.  Supplying this option
>  disables all culling activity.  The cache will keep building up to the limits
>  set and won't be shrunk, except by the removal of out-dated cache files.
>  .TP
> +.B resume_thresholds <blocks> <files>
> +This command specifies the number of blocks or files that the kernel should let
> +go of before the daemon should resume from culling table scan suspension.
> +.IP
> +Scanning to refill the cull table is suspended when all the objects in a cache
> +are pinned by a live network filesystem in the kernel and there's nothing to
> +cull.
> +.IP
> +Either value can be "-" to indicate that this threshold should be ignored.
> +.TP
>  .B debug <mask>
>  This command specifies a numeric bitmask to control debugging in the kernel
>  module.  The default is zero (all off).  The following values can be OR'd into
> 




More information about the Linux-cachefs mailing list