[Cluster-devel] [PATCH 2/2] gfs2: change gfs2 readdir cookie

Sat Jul 18 04:40:36 UTC 2015

gfs2 currently returns 31 bits of filename hash as a cookie that readdir
uses for an offset into the directory.  When there are a large number of
directory entries, the likelihood of a collision goes up way too
quickly.  GFS2 will now return cookies that are guaranteed unique for a
while, and then fall back to using 30 bits of filename hash.
Specifically, the directory leaf blocks are divided up into chunks based
on the minimum size of a gfs2 directory entry (48 bytes). Each entry's
cookie is based off the chunk where it starts, in the linked list of
leaf blocks that it hashes to (there are 131072 hash buckets). Directory
entries will have unique cookies until they reach chunk 8192.
Assuming the largest filenames possible, and the least efficient spacing
possible, this new method will still be able to return unique cookies when
the previous method has statistically more than a 99% chance of a
collision.  The non-unique cookies it falls back to are guaranteed to not
collide with the unique cookies.

unique cookies will be in this format:
- 1 bit "0" to make sure the returned cookie is positive
- 17 bits for the hash table index
- 1 bit for the mode "0"
- 13 bits for the offset

non-unique cookies will be in this format:
- 1 bit "0" to make sure the returned cookie is positive
- 17 bits for the hash table index
- 1 bit for the mode "1"
- 13 more bits of the name hash

Another benefit of location based cookies is that once a directory's
exhash table is fully extended, so that multiple hash table indexes do
not share the same leaf blocks, gfs2 no longer needs to sort the
directory entries until it reaches the non-unique ones, and then it only
needs to sort these. This provides a significant speed up for directory
reads of very large directories.

The only issue is that for these cookies to continue to point to the
correct entry as files are added and removed from the directory, gfs2
must keep the entries at the same offset in the leaf block when they are
split (see my previous patch). This means that until all the nodes in a
cluster are running with code that will split the directory leaf blocks
this way, none of the nodes can use the new cookie code. To deal with
this, gfs2 now has the mount option loccookie, which, if set, will make
it return these new location based cookies.  This option must not be set
until all nodes in the cluster are at least running this version of the
kernel code, and you have guaranteed that there are no outstanding
cookies required by other software, such as NFS.

Signed-off-by: Benjamin Marzinski <bmarzins at redhat.com>
---
 fs/gfs2/dir.c                    | 120 ++++++++++++++++++++++++++++++---------
 fs/gfs2/incore.h                 |   3 +
 fs/gfs2/ops_fstype.c             |   3 +
 fs/gfs2/super.c                  |  12 ++++
 include/uapi/linux/gfs2_ondisk.h |   2 +
 5 files changed, 114 insertions(+), 26 deletions(-)

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index a894557..7c2ccca 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -82,6 +82,8 @@
 
 #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
 #define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
+#define GFS2_HASH_INDEX_MASK 0xffffc000	/* AND with a cookie to clear its low 14 bits */
+#define GFS2_USE_HASH_FLAG 0x2000	/* bit 13: set when the cookie keeps name-hash bits */
 
 struct qstr gfs2_qdot __read_mostly;
 struct qstr gfs2_qdotdot __read_mostly;
@@ -474,8 +476,13 @@ static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
 	return 0;
 }
 
+struct dirent_cookie {	/* a gathered dirent paired with its readdir cookie */
+	const struct gfs2_dirent *dent;	/* stored by gfs2_dirent_gather() */
+	u32 cookie;	/* set by gfs2_set_cookies(); sort key in compare_dents() */
+};
+
 struct dirent_gather {
-	const struct gfs2_dirent **pdent;
+	struct dirent_cookie *pdent;
 	unsigned offset;
 };
 
@@ -485,7 +492,7 @@ static int gfs2_dirent_gather(const struct gfs2_dirent *dent,
 {
 	struct dirent_gather *g = opaque;
 	if (!gfs2_dirent_sentinel(dent)) {
-		g->pdent[g->offset++] = dent;
+		g->pdent[g->offset++].dent = dent;
 	}
 	return 0;
 }
@@ -1217,11 +1224,11 @@ static int compare_dents(const void *a, const void *b)
 	u32 hash_a, hash_b;
 	int ret = 0;
 
-	dent_a = *(const struct gfs2_dirent **)a;
-	hash_a = be32_to_cpu(dent_a->de_hash);
+	dent_a = ((const struct dirent_cookie *)a)->dent;
+	hash_a = ((const struct dirent_cookie *)a)->cookie;
 
-	dent_b = *(const struct gfs2_dirent **)b;
-	hash_b = be32_to_cpu(dent_b->de_hash);
+	dent_b = ((const struct dirent_cookie *)b)->dent;
+	hash_b = ((const struct dirent_cookie *)b)->cookie;
 
 	if (hash_a > hash_b)
 		ret = 1;
@@ -1259,28 +1266,28 @@ static int compare_dents(const void *a, const void *b)
  */
 
 static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx,
-			   const struct gfs2_dirent **darr, u32 entries,
-			   int *copied)
+			   struct dirent_cookie *darr, u32 entries,
+			   u32 sort_start, int *copied)
 {
 	const struct gfs2_dirent *dent, *dent_next;
 	u64 off, off_next;
 	unsigned int x, y;
 	int run = 0;
 
-	sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
+	if (sort_start < entries)
+		sort(&darr[sort_start], entries - sort_start, sizeof(darr[0]),
+		     compare_dents, NULL);
 
-	dent_next = darr[0];
-	off_next = be32_to_cpu(dent_next->de_hash);
-	off_next = gfs2_disk_hash2offset(off_next);
+	dent_next = darr[0].dent;
+	off_next = darr[0].cookie;
 
 	for (x = 0, y = 1; x < entries; x++, y++) {
 		dent = dent_next;
 		off = off_next;
 
 		if (y < entries) {
-			dent_next = darr[y];
-			off_next = be32_to_cpu(dent_next->de_hash);
-			off_next = gfs2_disk_hash2offset(off_next);
+			dent_next = darr[y].dent;
+			off_next = darr[y].cookie;
 
 			if (off < ctx->pos)
 				continue;
@@ -1327,6 +1334,36 @@ static void *gfs2_alloc_sort_buffer(unsigned size)
 	return ptr;
 }
 
+/* Set .cookie for each of the @entries elements of @darr; dents are expected to lie in @bh. */
+static void gfs2_set_cookies(struct gfs2_sbd *sdp, struct buffer_head *bh,
+			     unsigned leaf_nr, struct dirent_cookie *darr,
+			     unsigned entries)
+{
+	int i;
+	/* each cookie starts as the hash-based value; it may be rewritten below */
+	for (i = 0; i < entries; i++) {
+		unsigned offset;
+		/* default cookie: de_hash run through gfs2_disk_hash2offset() */
+		darr[i].cookie = be32_to_cpu(darr[i].dent->de_hash);
+		darr[i].cookie = gfs2_disk_hash2offset(darr[i].cookie);
+		/* location-based cookies are only built when ar_loccookie is set */
+		if (!sdp->sd_args.ar_loccookie)
+			continue;
+		offset = (char *)(darr[i].dent) -
+			 (bh->b_data + gfs2_dirent_offset(bh->b_data));	/* bytes past b_data + gfs2_dirent_offset() */
+		offset = offset / GFS2_MIN_DIRENT_SIZE;	/* bytes -> GFS2_MIN_DIRENT_SIZE chunks */
+		offset += leaf_nr * sdp->sd_max_dents_per_leaf;	/* add leaf_nr leaves' worth of chunks */
+		if (offset >= GFS2_USE_HASH_FLAG ||
+		    leaf_nr >= GFS2_USE_HASH_FLAG) {
+			darr[i].cookie |= GFS2_USE_HASH_FLAG;	/* too large: keep hash bits, mark with flag */
+			continue;
+		}
+		darr[i].cookie &= GFS2_HASH_INDEX_MASK;	/* keep only the bits above the low 14 */
+		darr[i].cookie |= offset;	/* chunk number (< 0x2000) goes in the low bits */
+	}
+}	
+
+
 static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
 			      int *copied, unsigned *depth,
 			      u64 leaf_no)
@@ -1336,12 +1373,12 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
 	struct buffer_head *bh;
 	struct gfs2_leaf *lf;
 	unsigned entries = 0, entries2 = 0;
-	unsigned leaves = 0;
-	const struct gfs2_dirent **darr, *dent;
+	unsigned leaves = 0, leaf = 0, offset, sort_offset;
+	struct dirent_cookie *darr;
+	const struct gfs2_dirent *dent;
 	struct dirent_gather g;
 	struct buffer_head **larr;
-	int leaf = 0;
-	int error, i;
+	int error, i, need_sort = 0;
 	u64 lfn = leaf_no;
 
 	do {
@@ -1357,6 +1394,11 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
 		brelse(bh);
 	} while(lfn);
 
+	if (*depth < GFS2_DIR_MAX_DEPTH || !sdp->sd_args.ar_loccookie) {
+		need_sort = 1;
+		sort_offset = 0;
+	}
+
 	if (!entries)
 		return 0;
 
@@ -1367,10 +1409,13 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
 	 * 99 is the maximum number of entries that can fit in a single
 	 * leaf block.
 	 */
-	larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *));
+	larr = gfs2_alloc_sort_buffer(leaves * sizeof(void *));
 	if (!larr)
 		goto out;
-	darr = (const struct gfs2_dirent **)(larr + leaves);
+	darr = gfs2_alloc_sort_buffer((entries + 99) *
+				      sizeof(struct dirent_cookie));
+	if (!darr)
+		goto out_larr;
 	g.pdent = darr;
 	g.offset = 0;
 	lfn = leaf_no;
@@ -1382,6 +1427,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
 		lf = (struct gfs2_leaf *)bh->b_data;
 		lfn = be64_to_cpu(lf->lf_next);
 		if (lf->lf_entries) {
+			offset = g.offset;
 			entries2 += be16_to_cpu(lf->lf_entries);
 			dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
 						gfs2_dirent_gather, NULL, &g);
@@ -1399,17 +1445,37 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
 				goto out_free;
 			}
 			error = 0;
+			gfs2_set_cookies(sdp, bh, leaf, &darr[offset],
+					 be16_to_cpu(lf->lf_entries));
+			if (!need_sort &&
+			    (darr[entries2 - 1].cookie & GFS2_USE_HASH_FLAG)) {
+				need_sort = 1;
+				for (i = offset; i < entries2; i++) {
+					if (darr[i].cookie & GFS2_USE_HASH_FLAG)
+						break;
+				}
+				sort_offset = i;
+			}
 			larr[leaf++] = bh;
 		} else {
+			larr[leaf++] = NULL;
 			brelse(bh);
 		}
 	} while(lfn);
 
 	BUG_ON(entries2 != entries);
-	error = do_filldir_main(ip, ctx, darr, entries, copied);
+	if (!need_sort)
+		error = do_filldir_main(ip, ctx, darr, entries, entries,
+					copied);
+	else
+		error = do_filldir_main(ip, ctx, darr, entries, sort_offset,
+					copied);
 out_free:
 	for(i = 0; i < leaf; i++)
-		brelse(larr[i]);
+		if (larr[i])
+			brelse(larr[i]);
+	kvfree(darr);
+out_larr:
 	kvfree(larr);
 out:
 	return error;
@@ -1515,7 +1581,8 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
 	struct gfs2_inode *dip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct dirent_gather g;
-	const struct gfs2_dirent **darr, *dent;
+	struct dirent_cookie *darr;
+	const struct gfs2_dirent *dent;
 	struct buffer_head *dibh;
 	int copied = 0;
 	int error;
@@ -1537,7 +1604,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
 
 	error = -ENOMEM;
 	/* 96 is max number of dirents which can be stuffed into an inode */
-	darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_NOFS);
+	darr = kmalloc(96 * sizeof(struct dirent_cookie), GFP_NOFS);
 	if (darr) {
 		g.pdent = darr;
 		g.offset = 0;
@@ -1556,8 +1623,9 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
 			error = -EIO;
 			goto out;
 		}
+		gfs2_set_cookies(sdp, dibh, 0, darr, dip->i_entries);
 		error = do_filldir_main(dip, ctx, darr,
-					dip->i_entries, &copied);
+					dip->i_entries, 0, &copied);
 out:
 		kfree(darr);
 	}
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index e300f74..25cadee 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -559,6 +559,8 @@ struct gfs2_args {
 	unsigned int ar_errors:2;               /* errors=withdraw | panic */
 	unsigned int ar_nobarrier:1;            /* do not send barriers */
 	unsigned int ar_rgrplvb:1;		/* use lvbs for rgrp info */
+	unsigned int ar_loccookie;		/* use location based readdir
+						   cookies */
 	int ar_commit;				/* Commit interval */
 	int ar_statfs_quantum;			/* The fast statfs interval */
 	int ar_quota_quantum;			/* The quota interval */
@@ -686,6 +688,7 @@ struct gfs2_sbd {
 	u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1];
 	u32 sd_max_jheight; /* Max height of journaled file's meta tree */
 	u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1];
+	u32 sd_max_dents_per_leaf; /* Max number of dirents in a leaf block */
 
 	struct gfs2_args sd_args;	/* Mount arguments */
 	struct gfs2_tune sd_tune;	/* Filesystem tuning structure */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1e3a93f..638c6f5 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -352,6 +352,9 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
 	sdp->sd_jheightsize[x] = ~0;
 	gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
 
+	sdp->sd_max_dents_per_leaf = (sdp->sd_sb.sb_bsize -
+				      sizeof(struct gfs2_leaf)) /
+				     GFS2_MIN_DIRENT_SIZE;
 	return 0;
 }
 
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 2982445..e194b2b 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -83,6 +83,8 @@ enum {
 	Opt_nobarrier,
 	Opt_rgrplvb,
 	Opt_norgrplvb,
+	Opt_loccookie,
+	Opt_noloccookie,
 	Opt_error,
 };
 
@@ -122,6 +124,8 @@ static const match_table_t tokens = {
 	{Opt_nobarrier, "nobarrier"},
 	{Opt_rgrplvb, "rgrplvb"},
 	{Opt_norgrplvb, "norgrplvb"},
+	{Opt_loccookie, "loccookie"},
+	{Opt_noloccookie, "noloccookie"},
 	{Opt_error, NULL}
 };
 
@@ -278,6 +282,12 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
 		case Opt_norgrplvb:
 			args->ar_rgrplvb = 0;
 			break;
+		case Opt_loccookie:
+			args->ar_loccookie = 1;
+			break;
+		case Opt_noloccookie:
+			args->ar_loccookie = 0;
+			break;
 		case Opt_error:
 		default:
 			pr_warn("invalid mount option: %s\n", o);
@@ -1419,6 +1429,8 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
 		seq_puts(s, ",demote_interface_used");
 	if (args->ar_rgrplvb)
 		seq_puts(s, ",rgrplvb");
+	if (args->ar_loccookie)
+		seq_puts(s, ",loccookie");
 	return 0;
 }
 
diff --git a/include/uapi/linux/gfs2_ondisk.h b/include/uapi/linux/gfs2_ondisk.h
index 1a763ea..54f0025 100644
--- a/include/uapi/linux/gfs2_ondisk.h
+++ b/include/uapi/linux/gfs2_ondisk.h
@@ -297,6 +297,8 @@ struct gfs2_dinode {
 
 #define GFS2_FNAMESIZE		255
 #define GFS2_DIRENT_SIZE(name_len) ((sizeof(struct gfs2_dirent) + (name_len) + 7) & ~7)
+#define GFS2_MIN_DIRENT_SIZE (GFS2_DIRENT_SIZE(1))
+
 
 struct gfs2_dirent {
 	struct gfs2_inum de_inum;
-- 
1.8.3.1