[Cluster-devel] [GFS2 PATCH v2] gfs2: fast dealloc for exhash directories

Bob Peterson rpeterso at redhat.com
Mon Mar 22 14:14:45 UTC 2021


Before this patch, whenever a directory was deleted, gfs2 called function
gfs2_dir_exhash_dealloc (renamed __gfs2_dir_exhash_dealloc by this patch)
to deallocate the directory's leaf blocks. But that function never knew
whether any given leaf block had leaf continuation, aka "next", blocks, so
it read every single leaf block in, only to determine in 99% of the cases
that there were none. Reading in all the leaf blocks was very slow.

This patch adds a new disk flag that indicates whether a directory is
clean of any "next leaf" blocks. If so, it takes an optimized path that
just deletes the leaf blocks and zeroes out the hash table.

It would seem to make more sense to have the new bit indicate when a
directory contains "next leaf" blocks rather than the inverse, but we
need to treat file systems created by older versions of gfs2 as if
they have "next leaf" blocks.

Signed-off-by: Bob Peterson <rpeterso at redhat.com>
---
 fs/gfs2/dir.c                    | 168 ++++++++++++++++++++++++++++++++++++---
 fs/gfs2/inode.c                  |   2 +-
 include/uapi/linux/gfs2_ondisk.h |  15 ++++
 3 files changed, 173 insertions(+), 12 deletions(-)

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index c0f2875c946c..44875fbdfc64 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -69,6 +69,7 @@
 #include "meta_io.h"
 #include "quota.h"
 #include "rgrp.h"
+#include "super.h"
 #include "trans.h"
 #include "bmap.h"
 #include "util.h"
@@ -1761,6 +1762,11 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
 		return error;
 	gfs2_trans_add_meta(ip->i_gl, bh);
 	gfs2_add_inode_blocks(&ip->i_inode, 1);
+	/*
+	 * This dinode now has a "next leaf" so we need to deallocate it the
+	 * old, slow way.
+	 */
+	ip->i_diskflags &= ~GFS2_DIF_NO_NEXT_LEAF;
 	gfs2_dinode_out(ip, bh->b_data);
 	brelse(bh);
 	return 0;
@@ -2083,17 +2089,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
 	return error;
 }
 
-/**
- * gfs2_dir_exhash_dealloc - free all the leaf blocks in a directory
- * @dip: the directory
- *
- * Dealloc all on-disk directory leaves to FREEMETA state
- * Change on-disk inode type to "regular file"
- *
- * Returns: errno
- */
-
-int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
+static int __gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
 {
 	struct buffer_head *bh;
 	struct gfs2_leaf *leaf;
@@ -2140,6 +2136,156 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
 	return error;
 }
 
+/**
+ * leaves_in_this_rgd - count the leaf blocks in the index with the same rgrp
+ * @rgd: the target resource group structure
+ * @lp: the in-core directory hash table
+ * @hsize: the hash table size
+ * Returns: leaf blocks within @rgd, counting runs of equal pointers once
+ */
+static int leaves_in_this_rgd(struct gfs2_rgrpd *rgd, __be64 *lp, u32 hsize)
+{
+	u64 leaf_no, prev_leaf = 0;
+	int leaves_in_rgd = 0;
+	u32 index;
+
+	for (index = 0; index < hsize; index++) {
+		leaf_no = be64_to_cpu(lp[index]);
+		/* Skip empty slots and repeats of the previous pointer. */
+		if (!leaf_no || leaf_no == prev_leaf)
+			continue;
+		prev_leaf = leaf_no;
+		if (rgrp_contains_block(rgd, leaf_no))
+			leaves_in_rgd++;
+	}
+	return leaves_in_rgd;
+}
+
+/*
+ * Free the leaf blocks of a directory that has no "next leaf" blocks without
+ * reading them in. Each pass frees the leaves of one resource group in its
+ * own transaction and zeroes their in-core hash table slots, so that later
+ * passes skip them. Returns 0, or the errno of the step that failed.
+ */
+static int dir_exhash_fast_dealloc(struct gfs2_inode *dip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+	struct gfs2_rgrpd *rgd;
+	struct gfs2_holder gh;
+	struct buffer_head *dibh;
+	u32 hsize, index = 0, next_index = 0;
+	__be64 *lp;
+	u64 leaf_no, freed_leaf;
+	int freed_blocks, ret;
+
+	hsize = BIT(dip->i_depth);
+	lp = gfs2_dir_get_hash_table(dip);
+	if (IS_ERR(lp))
+		return PTR_ERR(lp);
+	ret = gfs2_quota_hold(dip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
+	if (ret)
+		return ret;
+
+new_rgrp:
+	rgd = NULL;
+	freed_leaf = 0;
+	gfs2_holder_mark_uninitialized(&gh);
+	freed_blocks = 0;
+	while (index < hsize) {
+		leaf_no = be64_to_cpu(lp[index]);
+		if (!leaf_no)
+			goto skip_dups;
+		if (rgd) {
+			if (!rgrp_contains_block(rgd, leaf_no)) {
+				if (!next_index)
+					next_index = index;
+				goto skip_dups;
+			}
+		} else {
+			rgd = gfs2_blk2rgrpd(sdp, leaf_no, true);
+			if (!rgd) {
+				fs_err(sdp, "Error: rgrp for block 0x%llx not found in dir 0x%llx\n",
+				       (unsigned long long)leaf_no,
+				       (unsigned long long)dip->i_no_addr);
+				gfs2_consist_inode(dip);
+				ret = -EIO;
+				goto out;
+			}
+			ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
+						 LM_FLAG_NODE_SCOPE, &gh);
+			if (ret)
+				goto out;
+			ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
+					       RES_QUOTA + rgd->rd_length,
+					       leaves_in_this_rgd(rgd, lp, hsize));
+			if (ret) {
+				gfs2_glock_dq_uninit(&gh);
+				goto out;
+			}
+		}
+		__gfs2_free_blocks(dip, rgd, leaf_no, 1, 1);
+		freed_blocks++;
+		freed_leaf = leaf_no;
+skip_dups:
+		while (index < hsize && be64_to_cpu(lp[index]) == leaf_no) {
+			if (leaf_no == freed_leaf)
+				lp[index] = 0;
+			index++;
+		}
+	}
+	if (current->journal_info) {
+		gfs2_statfs_change(sdp, 0, freed_blocks, 0);
+		gfs2_quota_change(dip, -(s64)freed_blocks, dip->i_inode.i_uid,
+				  dip->i_inode.i_gid);
+		gfs2_add_inode_blocks(&dip->i_inode, -freed_blocks);
+		ret = gfs2_meta_inode_buffer(dip, &dibh);
+		if (ret) {
+			fs_err(sdp, "Error: Unable to read dinode 0x%llx\n",
+			       (unsigned long long)dip->i_no_addr);
+			gfs2_consist_inode(dip);
+			gfs2_trans_end(sdp);
+			gfs2_glock_dq_uninit(&gh);
+			goto out;
+		}
+		gfs2_trans_add_meta(dip->i_gl, dibh);
+		/*
+		 * On the last dealloc, make this a regular file in case we
+		 * crash. (We don't want to free these blocks a second time.)
+		 */
+		if (!next_index)
+			dip->i_inode.i_mode = S_IFREG;
+		gfs2_dinode_out(dip, dibh->b_data);
+		brelse(dibh);
+		gfs2_trans_end(sdp);
+		gfs2_glock_dq_uninit(&gh);
+	}
+	if (next_index) {
+		index = next_index;
+		next_index = 0;
+		goto new_rgrp;
+	}
+out:
+	gfs2_quota_unhold(dip);
+	return ret;
+}
+
+/**
+ * gfs2_dir_exhash_dealloc - release all the leaf blocks of a directory
+ * @dip: the directory
+ *
+ * Frees the on-disk leaves (to FREEMETA state) and turns the on-disk inode
+ * into a "regular file", via the fast path if GFS2_DIF_NO_NEXT_LEAF is set.
+ *
+ * Returns: errno
+ */
+
+int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
+{
+	bool slow_path = !(dip->i_diskflags & GFS2_DIF_NO_NEXT_LEAF);
+
+	return slow_path ? __gfs2_dir_exhash_dealloc(dip) :
+			   dir_exhash_fast_dealloc(dip);
+}
+
 /**
  * gfs2_diradd_alloc_required - find if adding entry will require an allocation
  * @ip: the file being written to
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index c9775d5c6594..1b7dcdf789ee 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -700,7 +700,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 		break;
 	case S_IFDIR:
 		ip->i_diskflags |= (dip->i_diskflags & GFS2_DIF_INHERIT_JDATA);
-		ip->i_diskflags |= GFS2_DIF_JDATA;
+		ip->i_diskflags |= GFS2_DIF_JDATA | GFS2_DIF_NO_NEXT_LEAF;
 		ip->i_entries = 2;
 		break;
 	}
diff --git a/include/uapi/linux/gfs2_ondisk.h b/include/uapi/linux/gfs2_ondisk.h
index 6ec4291bcc7a..ef793871a84b 100644
--- a/include/uapi/linux/gfs2_ondisk.h
+++ b/include/uapi/linux/gfs2_ondisk.h
@@ -258,6 +258,21 @@ enum {
 #define GFS2_DIF_SYNC			0x00000100
 #define GFS2_DIF_SYSTEM			0x00000200 /* New in gfs2 */
 #define GFS2_DIF_TOPDIR			0x00000400 /* New in gfs2 */
+/*
+ * GFS2_DIF_NO_NEXT_LEAF is set on a directory dinode when none of its leaf
+ * blocks has a "next leaf" block (lf_next). For EXHASH directories, when a
+ * leaf holds the maximum possible number of dirents, gfs2 allocates a next
+ * leaf block and chains it on, such that the formerly terminating leaf points
+ * to another leaf with the same hash. We need to indicate which directories
+ * have none (as opposed to which ones have them) so that file systems created
+ * by older gfs2 kernels will not be treated as having no next leaf blocks.
+ *
+ * The reason we distinguish directories having "next leaf" versus none is
+ * to optimize performance of rmdir operations. When we have "next leaf"
+ * somewhere--anywhere--we need to read in all the leaf blocks to find it when
+ * deallocating. When we don't, we can just deallocate the leaf blocks.
+ */
+#define GFS2_DIF_NO_NEXT_LEAF		0x00000800 /* No next leaf */
 #define GFS2_DIF_TRUNC_IN_PROG		0x20000000 /* New in gfs2 */
 #define GFS2_DIF_INHERIT_DIRECTIO	0x40000000 /* only in gfs1 */
 #define GFS2_DIF_INHERIT_JDATA		0x80000000




More information about the Cluster-devel mailing list