[Cluster-devel] [GFS2 PATCH 2/2 v2] gfs2: change gfs2 readdir cookie

Wed Aug 26 10:58:24 UTC 2015

Hi,

On 25/08/15 22:10, Benjamin Marzinski wrote:
> gfs2 currently returns 31 bits of filename hash as a cookie that readdir
> uses for an offset into the directory.  When there are a large number of
> directory entries, the likelihood of a collision goes up way too
> quickly.  GFS2 will now return cookies that are guaranteed unique for a
> while, and then fail back to using 30 bits of filename hash.
> Specifically, the directory leaf blocks are divided up into chunks based
> on the minimum size of a gfs2 directory entry (48 bytes). Each entry's
> cookie is based off the chunk where it starts, in the linked list of
> leaf blocks that it hashes to (there are 131072 hash buckets). Directory
> entries will have unique names until they take reach chunk 8192.
> Assuming the largest filenames possible, and the least efficient spacing
> possible, this new method will still be able to return unique names when
> the previous method has statistically more than a 99% chance of a
> collision.  The non-unique names it fails back to are guaranteed to not
> collide with the unique names.
>
> unique cookies will be in this format:
> - 1 bit "0" to make sure the the returned cookie is positive
> - 17 bits for the hash table index
> - 1 bit for the mode "0"
> - 13 bits for the offset
>
> non-unique cookies will be in this format:
> - 1 bit "0" to make sure the the returned cookie is positive
> - 17 bits for the hash table index
> - 1 bit for the mode "1"
> - 13 more bits of the name hash
>
> Another benefit of location based cookies, is that once a directory's
> exhash table is fully extended (so that multiple hash table indexs do
> not use the same leaf blocks), gfs2 can skip sorting the directory
> entries until it reaches the non-unique ones, and then it only needs to
> sort these. This provides a significant speed up for directory reads of
> very large directories.
>
> The only issue is that for these cookies to continue to point to the
> correct entry as files are added and removed from the directory, gfs2
> must keep the entries at the same offset in the leaf block when they are
> split (see my previous patch). This means that until all the nodes in a
> cluster are running with code that will split the directory leaf blocks
> this way, none of the nodes can use the new cookie code. To deal with
> this, gfs2 now has the mount option loccookie, which, if set, will make
> it return these new location based cookies.  This option must not be set
> until all nodes in the cluster are at least running this version of the
> kernel code, and you have guaranteed that there are no outstanding
> cookies required by other software, such as NFS.
>
> gfs2 uses some of the extra space at the end of the gfs2_dirent
> structure to store the calculated readdir cookies. This keeps us from
> needing to allocate a seperate array to hold these values.  gfs2
> recomputes the cookie stored in de_cookie for every readdir call.  The
> time it takes to do so is small, and if gfs2 expected this value to be
> saved on disk, the new code wouldn't work correctly on filesystems
> created with an earlier version of gfs2.
>
> One issue with adding de_cookie to the union in the gfs2_dirent
> structure is that it caused the union to align itself to a 4 byte
> boundary, instead of its previous 2 byte boundary. This changed the
> offset of de_rahead. To solve that, I pulled de_rahead out of the union,
> since it does not need to be there.
>
> Signed-off-by: Benjamin Marzinski <bmarzins at redhat.com>
> ---
>   fs/gfs2/dir.c                    | 91 +++++++++++++++++++++++++++++++---------
>   fs/gfs2/incore.h                 |  3 ++
>   fs/gfs2/ops_fstype.c             |  3 ++
>   fs/gfs2/super.c                  | 12 ++++++
>   include/uapi/linux/gfs2_ondisk.h |  9 ++--
>   5 files changed, 95 insertions(+), 23 deletions(-)
>
> diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
> index a894557..fa5bf4b 100644
> --- a/fs/gfs2/dir.c
> +++ b/fs/gfs2/dir.c
> @@ -82,6 +82,8 @@
>   
>   #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
>   #define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
> +#define GFS2_HASH_INDEX_MASK 0xffffc000
> +#define GFS2_USE_HASH_FLAG 0x2000
>   
>   struct qstr gfs2_qdot __read_mostly;
>   struct qstr gfs2_qdotdot __read_mostly;
> @@ -1218,10 +1220,10 @@ static int compare_dents(const void *a, const void *b)
>   	int ret = 0;
>   
>   	dent_a = *(const struct gfs2_dirent **)a;
> -	hash_a = be32_to_cpu(dent_a->de_hash);
> +	hash_a = dent_a->de_cookie;
>   
Does the cookie need endianess conversion? I'd suggest using sparse to 
check that you've not missed any of these.

>   	dent_b = *(const struct gfs2_dirent **)b;
> -	hash_b = be32_to_cpu(dent_b->de_hash);
> +	hash_b = dent_b->de_cookie;
>   
>   	if (hash_a > hash_b)
>   		ret = 1;
> @@ -1259,19 +1261,20 @@ static int compare_dents(const void *a, const void *b)
>    */
>   
>   static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx,
> -			   const struct gfs2_dirent **darr, u32 entries,
> -			   int *copied)
> +			   struct gfs2_dirent **darr, u32 entries,
> +			   u32 sort_start, int *copied)
>   {
>   	const struct gfs2_dirent *dent, *dent_next;
>   	u64 off, off_next;
>   	unsigned int x, y;
>   	int run = 0;
>   
> -	sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
> +	if (sort_start < entries)
> +		sort(&darr[sort_start], entries - sort_start,
> +		     sizeof(struct gfs2_dirent *), compare_dents, NULL);
>   
>   	dent_next = darr[0];
> -	off_next = be32_to_cpu(dent_next->de_hash);
> -	off_next = gfs2_disk_hash2offset(off_next);
> +	off_next = dent_next->de_cookie;
>   
>   	for (x = 0, y = 1; x < entries; x++, y++) {
>   		dent = dent_next;
> @@ -1279,8 +1282,7 @@ static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx,
>   
>   		if (y < entries) {
>   			dent_next = darr[y];
> -			off_next = be32_to_cpu(dent_next->de_hash);
> -			off_next = gfs2_disk_hash2offset(off_next);
> +			off_next = dent_next->de_cookie;
>   
>   			if (off < ctx->pos)
>   				continue;
> @@ -1327,6 +1329,40 @@ static void *gfs2_alloc_sort_buffer(unsigned size)
>   	return ptr;
>   }
>   
> +
> +static int gfs2_set_cookies(struct gfs2_sbd *sdp, struct buffer_head *bh,
> +			    unsigned leaf_nr, struct gfs2_dirent **darr,
> +			    unsigned entries)
> +{
> +	int sort_id = -1;
> +	int i;
> +	
> +	for (i = 0; i < entries; i++) {
> +		unsigned offset;
> +
> +		darr[i]->de_cookie = be32_to_cpu(darr[i]->de_hash);
> +		darr[i]->de_cookie = gfs2_disk_hash2offset(darr[i]->de_cookie);
> +
> +		if (!sdp->sd_args.ar_loccookie)
> +			continue;
> +		offset = (char *)(darr[i]) -
> +			 (bh->b_data + gfs2_dirent_offset(bh->b_data));
> +		offset /= GFS2_MIN_DIRENT_SIZE;
> +		offset += leaf_nr * sdp->sd_max_dents_per_leaf;
> +		if (offset >= GFS2_USE_HASH_FLAG ||
> +		    leaf_nr >= GFS2_USE_HASH_FLAG) {
> +			darr[i]->de_cookie |= GFS2_USE_HASH_FLAG;
> +			if (sort_id < 0)
> +				sort_id = i;
> +			continue;
> +		}
> +		darr[i]->de_cookie &= GFS2_HASH_INDEX_MASK;
> +		darr[i]->de_cookie |= offset;
> +	}
> +	return sort_id;
> +}	
> +
> +
>   static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
>   			      int *copied, unsigned *depth,
>   			      u64 leaf_no)
> @@ -1336,12 +1372,11 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
>   	struct buffer_head *bh;
>   	struct gfs2_leaf *lf;
>   	unsigned entries = 0, entries2 = 0;
> -	unsigned leaves = 0;
> -	const struct gfs2_dirent **darr, *dent;
> +	unsigned leaves = 0, leaf = 0, offset, sort_offset;
> +	struct gfs2_dirent **darr, *dent;
>   	struct dirent_gather g;
>   	struct buffer_head **larr;
> -	int leaf = 0;
> -	int error, i;
> +	int error, i, need_sort = 0, sort_id;
>   	u64 lfn = leaf_no;
>   
>   	do {
> @@ -1357,6 +1392,11 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
>   		brelse(bh);
>   	} while(lfn);
>   
> +	if (*depth < GFS2_DIR_MAX_DEPTH || !sdp->sd_args.ar_loccookie) {
> +		need_sort = 1;
> +		sort_offset = 0;
> +	}
> +
>   	if (!entries)
>   		return 0;
>   
> @@ -1370,8 +1410,8 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
>   	larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *));
>   	if (!larr)
>   		goto out;
> -	darr = (const struct gfs2_dirent **)(larr + leaves);
> -	g.pdent = darr;
> +	darr = (struct gfs2_dirent **)(larr + leaves);
> +	g.pdent = (const struct gfs2_dirent **)darr;
>   	g.offset = 0;
>   	lfn = leaf_no;
>   
> @@ -1382,6 +1422,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
>   		lf = (struct gfs2_leaf *)bh->b_data;
>   		lfn = be64_to_cpu(lf->lf_next);
>   		if (lf->lf_entries) {
> +			offset = g.offset;
>   			entries2 += be16_to_cpu(lf->lf_entries);
>   			dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
>   						gfs2_dirent_gather, NULL, &g);
> @@ -1399,17 +1440,26 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
>   				goto out_free;
>   			}
>   			error = 0;
> +			sort_id = gfs2_set_cookies(sdp, bh, leaf, &darr[offset],
> +						   be16_to_cpu(lf->lf_entries));
> +			if (!need_sort && sort_id >= 0) {
> +				need_sort = 1;
> +				sort_offset = offset + sort_id;
> +			}
>   			larr[leaf++] = bh;
>   		} else {
> +			larr[leaf++] = NULL;
>   			brelse(bh);
>   		}
>   	} while(lfn);
>   
>   	BUG_ON(entries2 != entries);
> -	error = do_filldir_main(ip, ctx, darr, entries, copied);
> +	error = do_filldir_main(ip, ctx, darr, entries, need_sort ?
> +				sort_offset : entries, copied);
>   out_free:
>   	for(i = 0; i < leaf; i++)
> -		brelse(larr[i]);
> +		if (larr[i])
> +			brelse(larr[i]);
>   	kvfree(larr);
>   out:
>   	return error;
> @@ -1515,7 +1565,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
>   	struct gfs2_inode *dip = GFS2_I(inode);
>   	struct gfs2_sbd *sdp = GFS2_SB(inode);
>   	struct dirent_gather g;
> -	const struct gfs2_dirent **darr, *dent;
> +	struct gfs2_dirent **darr, *dent;
>   	struct buffer_head *dibh;
>   	int copied = 0;
>   	int error;
> @@ -1539,7 +1589,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
>   	/* 96 is max number of dirents which can be stuffed into an inode */
>   	darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_NOFS);
>   	if (darr) {
> -		g.pdent = darr;
> +		g.pdent = (const struct gfs2_dirent **)darr;
>   		g.offset = 0;
>   		dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size,
>   					gfs2_dirent_gather, NULL, &g);
> @@ -1556,8 +1606,9 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
>   			error = -EIO;
>   			goto out;
>   		}
> +		gfs2_set_cookies(sdp, dibh, 0, darr, dip->i_entries);
>   		error = do_filldir_main(dip, ctx, darr,
> -					dip->i_entries, &copied);
> +					dip->i_entries, 0, &copied);
>   out:
>   		kfree(darr);
>   	}
> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index e300f74..25cadee 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -559,6 +559,8 @@ struct gfs2_args {
>   	unsigned int ar_errors:2;               /* errors=withdraw | panic */
>   	unsigned int ar_nobarrier:1;            /* do not send barriers */
>   	unsigned int ar_rgrplvb:1;		/* use lvbs for rgrp info */
> +	unsigned int ar_loccookie;		/* use location based readdir
> +						   cookies */
>   	int ar_commit;				/* Commit interval */
>   	int ar_statfs_quantum;			/* The fast statfs interval */
>   	int ar_quota_quantum;			/* The quota interval */
> @@ -686,6 +688,7 @@ struct gfs2_sbd {
>   	u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1];
>   	u32 sd_max_jheight; /* Max height of journaled file's meta tree */
>   	u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1];
> +	u32 sd_max_dents_per_leaf; /* Max number of dirents in a leaf block */
>   
>   	struct gfs2_args sd_args;	/* Mount arguments */
>   	struct gfs2_tune sd_tune;	/* Filesystem tuning structure */
> diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
> index 1e3a93f..638c6f5 100644
> --- a/fs/gfs2/ops_fstype.c
> +++ b/fs/gfs2/ops_fstype.c
> @@ -352,6 +352,9 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
>   	sdp->sd_jheightsize[x] = ~0;
>   	gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
>   
> +	sdp->sd_max_dents_per_leaf = (sdp->sd_sb.sb_bsize -
> +				      sizeof(struct gfs2_leaf)) /
> +				     GFS2_MIN_DIRENT_SIZE;
>   	return 0;
>   }
>   
> diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
> index 2982445..e194b2b 100644
> --- a/fs/gfs2/super.c
> +++ b/fs/gfs2/super.c
> @@ -83,6 +83,8 @@ enum {
>   	Opt_nobarrier,
>   	Opt_rgrplvb,
>   	Opt_norgrplvb,
> +	Opt_loccookie,
> +	Opt_noloccookie,
>   	Opt_error,
>   };
>   
> @@ -122,6 +124,8 @@ static const match_table_t tokens = {
>   	{Opt_nobarrier, "nobarrier"},
>   	{Opt_rgrplvb, "rgrplvb"},
>   	{Opt_norgrplvb, "norgrplvb"},
> +	{Opt_loccookie, "loccookie"},
> +	{Opt_noloccookie, "noloccookie"},
>   	{Opt_error, NULL}
>   };
>   
> @@ -278,6 +282,12 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
>   		case Opt_norgrplvb:
>   			args->ar_rgrplvb = 0;
>   			break;
> +		case Opt_loccookie:
> +			args->ar_loccookie = 1;
> +			break;
> +		case Opt_noloccookie:
> +			args->ar_loccookie = 0;
> +			break;
Hmm. Is there some way we can make it the default, without needing to do 
this? The on-disk format should still be backwards compatible I think? 
Do we need any fsck/gfs2-utils updates too?

>   		case Opt_error:
>   		default:
>   			pr_warn("invalid mount option: %s\n", o);
> @@ -1419,6 +1429,8 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
>   		seq_puts(s, ",demote_interface_used");
>   	if (args->ar_rgrplvb)
>   		seq_puts(s, ",rgrplvb");
> +	if (args->ar_loccookie)
> +		seq_puts(s, ",loccookie");
>   	return 0;
>   }
>   
> diff --git a/include/uapi/linux/gfs2_ondisk.h b/include/uapi/linux/gfs2_ondisk.h
> index 1a763ea..7c4be77 100644
> --- a/include/uapi/linux/gfs2_ondisk.h
> +++ b/include/uapi/linux/gfs2_ondisk.h
> @@ -297,6 +297,8 @@ struct gfs2_dinode {
>   
>   #define GFS2_FNAMESIZE		255
>   #define GFS2_DIRENT_SIZE(name_len) ((sizeof(struct gfs2_dirent) + (name_len) + 7) & ~7)
> +#define GFS2_MIN_DIRENT_SIZE (GFS2_DIRENT_SIZE(1))
> +
>   
>   struct gfs2_dirent {
>   	struct gfs2_inum de_inum;
> @@ -304,11 +306,12 @@ struct gfs2_dirent {
>   	__be16 de_rec_len;
>   	__be16 de_name_len;
>   	__be16 de_type;
> +	__be16 de_rahead;
>   	union {
> -		__u8 __pad[14];
> +		__u8 __pad[12];
>   		struct {
> -			__be16 de_rahead;
> -			__u8 pad2[12];
> +			__u32 de_cookie; /* ondisk value not used */
This is an ondisk structure so it should be __be32 to match the rest of 
the structure.

> +			__u8 pad3[8];
>   		};
>   	};
>   };

Otherwise looks good. It will need plenty of testing though with lots of 
corner cases to ensure that nothing has been missed,

Steve.