[Cluster-devel] [GFS2 PATCH 2/2] GFS2: Take advantage of new EXSH glock mode for rgrps

Wed Apr 18 19:25:55 UTC 2018

Hi,

On 18/04/18 17:58, Bob Peterson wrote:
> This patch switches rgrp locking to EXSH mode so that it's only
> taken when the rgrp is added to an active transaction, or when
> blocks are being reserved. As soon as the transaction is ended,
> the rgrp exclusivity is released. This allows for rgrp sharing.
Adding the rwsem looks like a good plan. I'm not sure that I understand 
why you need to lock the rgrp exclusively for the whole transaction and 
have added the new list. Do we really need to save the pid, or is that 
just there for debugging?

Steve.

> Signed-off-by: Bob Peterson <rpeterso at redhat.com>
> ---
>   fs/gfs2/bmap.c   |  2 +-
>   fs/gfs2/dir.c    |  2 +-
>   fs/gfs2/glops.c  |  3 +-
>   fs/gfs2/incore.h |  4 +++
>   fs/gfs2/inode.c  |  4 +--
>   fs/gfs2/rgrp.c   | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
>   fs/gfs2/rgrp.h   |  5 ++++
>   fs/gfs2/super.c  |  2 +-
>   fs/gfs2/trans.c  | 16 +++++++++++
>   fs/gfs2/xattr.c  |  6 ++--
>   10 files changed, 114 insertions(+), 14 deletions(-)
>
> diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
> index 0590e93494f7..f5fe9784d937 100644
> --- a/fs/gfs2/bmap.c
> +++ b/fs/gfs2/bmap.c
> @@ -1107,7 +1107,7 @@ static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
>   				ret = -EIO;
>   				goto out;
>   			}
> -			ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
> +			ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXSH,
>   						 0, rd_gh);
>   			if (ret)
>   				goto out;
> diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
> index d9fb0ad6cc30..29dc1a81f725 100644
> --- a/fs/gfs2/dir.c
> +++ b/fs/gfs2/dir.c
> @@ -2020,7 +2020,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
>   		l_blocks++;
>   	}
>   
> -	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
> +	gfs2_rlist_alloc(&rlist, LM_ST_EXSH);
>   
>   	for (x = 0; x < rlist.rl_rgrps; x++) {
>   		struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl);
> diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
> index d8782a7a1e7d..4e478ec0da2f 100644
> --- a/fs/gfs2/glops.c
> +++ b/fs/gfs2/glops.c
> @@ -89,6 +89,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
>   	memset(&tr, 0, sizeof(tr));
>   	INIT_LIST_HEAD(&tr.tr_buf);
>   	INIT_LIST_HEAD(&tr.tr_databuf);
> +	INIT_LIST_HEAD(&tr.tr_rgrps);
>   	tr.tr_revokes = atomic_read(&gl->gl_ail_count);
>   
>   	if (!tr.tr_revokes)
> @@ -157,7 +158,7 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
>   
>   	if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
>   		return;
> -	GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
> +	GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXSH);
>   
>   	gfs2_log_flush(sdp, gl, GFS2_LOG_HEAD_FLUSH_NORMAL |
>   		       GFS2_LFC_RGRP_GO_SYNC);
> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index 58b3ed8531ee..db430dacafa3 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -100,7 +100,10 @@ struct gfs2_rgrpd {
>   #define GFS2_RDF_PREFERRED	0x80000000 /* This rgrp is preferred */
>   #define GFS2_RDF_MASK		0xf0000000 /* mask for internal flags */
>   	spinlock_t rd_rsspin;           /* protects reservation related vars */
> +	struct rw_semaphore rd_sem;	/* ensures local rgrp exclusivity */
>   	struct rb_root rd_rstree;       /* multi-block reservation tree */
> +	struct list_head rd_trans;	/* rgrp is queued to a transaction */
> +	pid_t rd_expid;			/* rgrp locked EX by this pid */
>   };
>   
>   struct gfs2_rbm {
> @@ -499,6 +502,7 @@ struct gfs2_trans {
>   	unsigned int tr_first;
>   	struct list_head tr_ail1_list;
>   	struct list_head tr_ail2_list;
> +	struct list_head tr_rgrps;
>   };
>   
>   struct gfs2_journal_extent {
> diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
> index 8700eb815638..9ec100b3c3dd 100644
> --- a/fs/gfs2/inode.c
> +++ b/fs/gfs2/inode.c
> @@ -1121,7 +1121,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
>   	if (!rgd)
>   		goto out_inodes;
>   
> -	gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
> +	gfs2_holder_init(rgd->rd_gl, LM_ST_EXSH, 0, ghs + 2);
>   
>   
>   	error = gfs2_glock_nq(ghs); /* parent */
> @@ -1409,7 +1409,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
>   		 */
>   		nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr, 1);
>   		if (nrgd)
> -			gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++);
> +			gfs2_holder_init(nrgd->rd_gl, LM_ST_EXSH, 0, ghs + num_gh++);
>   	}
>   
>   	for (x = 0; x < num_gh; x++) {
> diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
> index 8b683917a27e..3b086f729b76 100644
> --- a/fs/gfs2/rgrp.c
> +++ b/fs/gfs2/rgrp.c
> @@ -93,6 +93,7 @@ static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
>   	unsigned int buflen = bi->bi_len;
>   	const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
>   
> +	BUG_ON(rbm->rgd->rd_expid != pid_nr(task_pid(current)));
>   	byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
>   	end = bi->bi_bh->b_data + bi->bi_offset + buflen;
>   
> @@ -904,6 +905,8 @@ static int read_rindex_entry(struct gfs2_inode *ip)
>   	rgd->rd_data = be32_to_cpu(buf.ri_data);
>   	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
>   	spin_lock_init(&rgd->rd_rsspin);
> +	init_rwsem(&rgd->rd_sem);
> +	INIT_LIST_HEAD(&rgd->rd_trans);
>   
>   	error = compute_bitstructs(rgd);
>   	if (error)
> @@ -1399,11 +1402,12 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
>   
>   	while (1) {
>   
> -		ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
> +		ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXSH, 0, &gh);
>   		if (ret)
>   			goto out;
>   
>   		if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) {
> +			rgrp_lockex(rgd);
>   			/* Trim each bitmap in the rgrp */
>   			for (x = 0; x < rgd->rd_length; x++) {
>   				struct gfs2_bitmap *bi = rgd->rd_bits + x;
> @@ -1411,11 +1415,13 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
>   						rgd->rd_data0, NULL, bi, minlen,
>   						&amt);
>   				if (ret) {
> +					rgrp_unlockex(rgd);
>   					gfs2_glock_dq_uninit(&gh);
>   					goto out;
>   				}
>   				trimmed += amt;
>   			}
> +			rgrp_unlockex(rgd);
>   
>   			/* Mark rgrp as having been trimmed */
>   			ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0);
> @@ -1768,6 +1774,17 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
>    *
>    * Returns: 0 if no error
>    *          The inode, if one has been found, in inode.
> + *
> + * We must be careful to avoid deadlock here:
> + *
> + * All transactions expect: sd_log_flush_lock followed by rgrp ex (if neeeded),
> + * but try_rgrp_unlink takes sd_log_flush_lock outside a transaction and
> + * therefore must not have the rgrp ex already held. To avoid deadlock, we
> + * drop the rgrp ex lock before taking the log_flush_lock, then reacquire it
> + * to protect our call to gfs2_rbm_find.
> + *
> + * Also note that rgrp_unlockex must come AFTER the caller does gfs2_rs_deltree
> + * because rgrp ex needs to be held before making reservations.
>    */
>   
>   static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
> @@ -1781,7 +1798,12 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
>   	struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
>   
>   	while (1) {
> +		/* As explained above, we need to drop the rgrp ex lock and
> +		 * reacquire it after we get for sd_log_flush_lock.
> +		 */
> +		rgrp_unlockex(rgd);
>   		down_write(&sdp->sd_log_flush_lock);
> +		rgrp_lockex(rgd);
>   		error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
>   				      true);
>   		up_write(&sdp->sd_log_flush_lock);
> @@ -1959,6 +1981,45 @@ static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
>   	return 0;
>   }
>   
> +/**
> + * rgrp_lockex - gain exclusive access to an rgrp locked in EXSH
> + * @gl: the glock
> + *
> + * This function waits for exclusive access to an rgrp's glock that is held in
> + * EXSH, by way of its rwsem.
> + *
> + * If the rwsem is already held, we don't need to wait, but we may need to
> + * queue the rgrp to a transaction if we have one, on the assumption that the
> + * rwsem may have been locked prior to starting the transaction.
> + */
> +void rgrp_lockex(struct gfs2_rgrpd *rgd)
> +{
> +	struct gfs2_trans *tr = current->journal_info;
> +
> +	BUG_ON(!gfs2_glock_is_held_exsh(rgd->rd_gl));
> +	if (rgd->rd_expid != pid_nr(task_pid(current))) {
> +		down_write(&rgd->rd_sem);
> +		BUG_ON(rgd->rd_expid != 0);
> +		rgd->rd_expid = pid_nr(task_pid(current));
> +	}
> +	/**
> +	 * Check if this rgrp needs to be queued to the current transaction.
> +	 * We need to do this because trans_add_meta may be called several
> +	 * times for the same rgrp.
> +	 */
> +	if (tr && list_empty(&rgd->rd_trans))
> +		list_add_tail(&rgd->rd_trans, &tr->tr_rgrps);
> +}
> +
> +void rgrp_unlockex(struct gfs2_rgrpd *rgd)
> +{
> +	BUG_ON(!gfs2_glock_is_held_exsh(rgd->rd_gl));
> +	if (!list_empty(&rgd->rd_trans))
> +		list_del_init(&rgd->rd_trans);
> +	rgd->rd_expid = 0;
> +	up_write(&rgd->rd_sem);
> +}
> +
>   /**
>    * gfs2_inplace_reserve - Reserve space in the filesystem
>    * @ip: the inode to reserve space for
> @@ -2019,20 +2080,24 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
>   					goto next_rgrp;
>   			}
>   			error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
> -						   LM_ST_EXCLUSIVE, flags,
> +						   LM_ST_EXSH, flags,
>   						   &rs->rs_rgd_gh);
>   			if (unlikely(error))
>   				return error;
> +			rgrp_lockex(rs->rs_rbm.rgd);
>   			if (!gfs2_rs_active(rs) && (loops < 2) &&
>   			    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
>   				goto skip_rgrp;
>   			if (sdp->sd_args.ar_rgrplvb) {
>   				error = update_rgrp_lvb(rs->rs_rbm.rgd);
>   				if (unlikely(error)) {
> +					rgrp_unlockex(rs->rs_rbm.rgd);
>   					gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
>   					return error;
>   				}
>   			}
> +		} else {
> +			rgrp_lockex(rs->rs_rbm.rgd);
>   		}
>   
>   		/* Skip unuseable resource groups */
> @@ -2058,18 +2123,25 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
>   		     rs->rs_rbm.rgd->rd_free_clone >= ap->min_target)) {
>   			ip->i_rgd = rs->rs_rbm.rgd;
>   			ap->allowed = ip->i_rgd->rd_free_clone;
> +			rgrp_unlockex(rs->rs_rbm.rgd);
>   			return 0;
>   		}
>   check_rgrp:
>   		/* Check for unlinked inodes which can be reclaimed */
> -		if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
> +		if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK) {
> +			/* Drop reservation if we couldn't use reserved rgrp */
> +			if (gfs2_rs_active(rs))
> +				gfs2_rs_deltree(rs);
>   			try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked,
>   					ip->i_no_addr);
> +		}
>   skip_rgrp:
>   		/* Drop reservation, if we couldn't use reserved rgrp */
>   		if (gfs2_rs_active(rs))
>   			gfs2_rs_deltree(rs);
>   
> +		rgrp_unlockex(rs->rs_rbm.rgd);
> +
>   		/* Unlock rgrp if required */
>   		if (!rg_locked)
>   			gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
> @@ -2227,10 +2299,10 @@ void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
>   
>   	if (rgd == NULL)
>   		return;
> -	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
> +	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u t:%d\n",
>   		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
>   		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
> -		       rgd->rd_reserved, rgd->rd_extfail_pt);
> +		       rgd->rd_reserved, rgd->rd_extfail_pt, rgd->rd_expid);
>   	spin_lock(&rgd->rd_rsspin);
>   	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
>   		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
> @@ -2341,6 +2413,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
>   	int error;
>   
>   	gfs2_set_alloc_start(&rbm, ip, dinode);
> +	rgrp_lockex(rbm.rgd);
>   	error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false);
>   
>   	if (error == -ENOSPC) {
> @@ -2350,6 +2423,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
>   
>   	/* Since all blocks are reserved in advance, this shouldn't happen */
>   	if (error) {
> +		rgrp_unlockex(rbm.rgd);
>   		fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
>   			(unsigned long long)ip->i_no_addr, error, *nblocks,
>   			test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
> diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
> index e90478e2f545..c48c50c37dc7 100644
> --- a/fs/gfs2/rgrp.h
> +++ b/fs/gfs2/rgrp.h
> @@ -76,6 +76,8 @@ extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
>   				   struct buffer_head *bh,
>   				   const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
>   extern int gfs2_fitrim(struct file *filp, void __user *argp);
> +extern void gfs2_rgrp_lock_ex(struct gfs2_holder *gh);
> +extern void gfs2_rgrp_unlock_ex(struct gfs2_holder *gh);
>   
>   /* This is how to tell if a reservation is in the rgrp tree: */
>   static inline bool gfs2_rs_active(const struct gfs2_blkreserv *rs)
> @@ -91,4 +93,7 @@ static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
>   }
>   
>   extern void check_and_update_goal(struct gfs2_inode *ip);
> +extern void rgrp_lockex(struct gfs2_rgrpd *rgd);
> +extern void rgrp_unlockex(struct gfs2_rgrpd *rgd);
> +
>   #endif /* __RGRP_DOT_H__ */
> diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
> index cf5c7f3080d2..e11db05183c7 100644
> --- a/fs/gfs2/super.c
> +++ b/fs/gfs2/super.c
> @@ -1507,7 +1507,7 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
>   		goto out_qs;
>   	}
>   
> -	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
> +	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXSH, 0, &gh);
>   	if (error)
>   		goto out_qs;
>   
> diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
> index c75cacaa349b..6e7a69a22abc 100644
> --- a/fs/gfs2/trans.c
> +++ b/fs/gfs2/trans.c
> @@ -20,6 +20,7 @@
>   #include "gfs2.h"
>   #include "incore.h"
>   #include "glock.h"
> +#include "glops.h"
>   #include "inode.h"
>   #include "log.h"
>   #include "lops.h"
> @@ -56,6 +57,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
>   						   sizeof(u64));
>   	INIT_LIST_HEAD(&tr->tr_databuf);
>   	INIT_LIST_HEAD(&tr->tr_buf);
> +	INIT_LIST_HEAD(&tr->tr_rgrps);
>   
>   	sb_start_intwrite(sdp->sd_vfs);
>   
> @@ -94,6 +96,17 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
>   
>   	current->journal_info = NULL;
>   
> +	/**
> +	 * During the transaction, we may have queued several rgrps, and in so
> +	 * doing, locked them ex. Before we end the transaction, we must unlock
> +	 * them all.
> +	 */
> +	while (!list_empty(&tr->tr_rgrps)) {
> +		struct gfs2_rgrpd *rgd = list_first_entry(&tr->tr_rgrps,
> +							  struct gfs2_rgrpd,
> +							  rd_trans);
> +		rgrp_unlockex(rgd);
> +	}
>   	if (!test_bit(TR_TOUCHED, &tr->tr_flags)) {
>   		gfs2_log_release(sdp, tr->tr_reserved);
>   		if (alloced) {
> @@ -209,6 +222,9 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
>   	struct gfs2_trans *tr = current->journal_info;
>   	enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state);
>   
> +	if (gl->gl_ops == &gfs2_rgrp_glops)
> +		rgrp_lockex(gl->gl_object);
> +
>   	lock_buffer(bh);
>   	if (buffer_pinned(bh)) {
>   		set_bit(TR_TOUCHED, &tr->tr_flags);
> diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
> index f2bce1e0f6fb..e2b21519dee5 100644
> --- a/fs/gfs2/xattr.c
> +++ b/fs/gfs2/xattr.c
> @@ -262,7 +262,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
>   		return -EIO;
>   	}
>   
> -	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
> +	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXSH, 0, &rg_gh);
>   	if (error)
>   		return error;
>   
> @@ -1314,7 +1314,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
>   	else
>   		goto out;
>   
> -	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
> +	gfs2_rlist_alloc(&rlist, LM_ST_EXSH);
>   
>   	for (x = 0; x < rlist.rl_rgrps; x++) {
>   		struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl);
> @@ -1397,7 +1397,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
>   		return -EIO;
>   	}
>   
> -	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
> +	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXSH, 0, &gh);
>   	if (error)
>   		return error;
>