[Cluster-devel] [GFS2 PATCH 1/3] GFS2: Set of distributed preferences for rgrps

Steven Whitehouse swhiteho at redhat.com
Mon Oct 27 10:27:10 UTC 2014


Hi,

On 24/10/14 18:49, Bob Peterson wrote:
> This patch tries to use the journal numbers to evenly distribute
> which node prefers which resource group for block allocations. This
> is to help performance.
> ---
>   fs/gfs2/incore.h |  1 +
>   fs/gfs2/rgrp.c   | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
>   2 files changed, 62 insertions(+), 5 deletions(-)
>
> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index 39e7e99..1b89918 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -97,6 +97,7 @@ struct gfs2_rgrpd {
>   #define GFS2_RDF_CHECK		0x10000000 /* check for unlinked inodes */
>   #define GFS2_RDF_UPTODATE	0x20000000 /* rg is up to date */
>   #define GFS2_RDF_ERROR		0x40000000 /* error in rg */
> +#define GFS2_RDF_PREFERRED	0x80000000 /* This rgrp is preferred */
>   #define GFS2_RDF_MASK		0xf0000000 /* mask for internal flags */
>   	spinlock_t rd_rsspin;           /* protects reservation related vars */
>   	struct rb_root rd_rstree;       /* multi-block reservation tree */
> diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
> index 7474c41..f65e56b 100644
> --- a/fs/gfs2/rgrp.c
> +++ b/fs/gfs2/rgrp.c
> @@ -936,7 +936,7 @@ static int read_rindex_entry(struct gfs2_inode *ip)
>   	rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize;
>   	rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * bsize) - 1;
>   	rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
> -	rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
> +	rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
>   	if (rgd->rd_data > sdp->sd_max_rg_data)
>   		sdp->sd_max_rg_data = rgd->rd_data;
>   	spin_lock(&sdp->sd_rindex_spin);
> @@ -955,6 +955,36 @@ fail:
>   }
>   
>   /**
> + * set_rgrp_preferences - Run all the rgrps, selecting some we prefer to use
> + * @sdp: the GFS2 superblock
> + *
> + * The purpose of this function is to select a subset of the resource groups
> + * and mark them as PREFERRED. We do it in such a way that each node prefers
> + * to use a unique set of rgrps to minimize glock contention.
> + */
> +static void set_rgrp_preferences(struct gfs2_sbd *sdp)
> +{
> +	struct gfs2_rgrpd *rgd, *first;
> +	int i;
> +
> +	/* Skip an initial number of rgrps, based on this node's journal ID.
> +	   That should start each node out on its own set. */
> +	rgd = gfs2_rgrpd_get_first(sdp);
> +	for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++)
> +		rgd = gfs2_rgrpd_get_next(rgd);
> +	first = rgd;
> +
> +	do {
> +		rgd->rd_flags |= GFS2_RDF_PREFERRED;
> +		for (i = 0; i < sdp->sd_journals; i++) {
> +			rgd = gfs2_rgrpd_get_next(rgd);
> +			if (rgd == first)
> +				break;
> +		}
> +	} while (rgd != first);
> +}
> +
> +/**
>    * gfs2_ri_update - Pull in a new resource index from the disk
>    * @ip: pointer to the rindex inode
>    *
> @@ -973,6 +1003,8 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
>   	if (error < 0)
>   		return error;
>   
> +	set_rgrp_preferences(sdp);
> +
>   	sdp->sd_rindex_uptodate = 1;
>   	return 0;
>   }
> @@ -1891,6 +1923,25 @@ static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *b
>   }
>   
>   /**
> + * fast_to_acquire - determine if a resource group will be fast to acquire
> + *
> + * If this is one of our preferred rgrps, it should be quicker to acquire,
> + * because we tried to set ourselves up as dlm lock master.
> + */
> +static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
> +{
> +	struct gfs2_glock *gl = rgd->rd_gl;
> +
> +	if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
> +	    !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
> +	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
> +		return 1;
> +	if (rgd->rd_flags & GFS2_RDF_PREFERRED)
> +		return 1;
> +	return 0;
> +}
> +
> +/**
>    * gfs2_inplace_reserve - Reserve space in the filesystem
>    * @ip: the inode to reserve space for
>    * @ap: the allocation parameters
> @@ -1932,10 +1983,15 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a
>   			rg_locked = 0;
>   			if (skip && skip--)
>   				goto next_rgrp;
> -			if (!gfs2_rs_active(rs) && (loops < 2) &&
> -			     gfs2_rgrp_used_recently(rs, 1000) &&
> -			     gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
> -				goto next_rgrp;
> +			if (!gfs2_rs_active(rs)) {
> +				if (loops == 0 &&
> +				    !fast_to_acquire(rs->rs_rbm.rgd))
> +					goto next_rgrp;
> +				if ((loops < 3) &&
> +				    gfs2_rgrp_used_recently(rs, 1000) &&
> +				    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
> +					goto next_rgrp;
> +			}
This makes no sense: we end the outer loop when loops == 3, so these 
conditions would be applied on every pass, which is not what we want. We 
must always end up doing a search of every rgrp in the worst case, so 
that if there is some space left somewhere, we will eventually find it.
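
To illustrate, something like the below would keep the final pass 
unconditional (the outer loop gives up once loops reaches 3, so the 
passes are loops = 0, 1 and 2):

			if (!gfs2_rs_active(rs)) {
				/* On the first pass, only consider rgrps
				   which should be fast to acquire */
				if (loops == 0 &&
				    !fast_to_acquire(rs->rs_rbm.rgd))
					goto next_rgrp;
				/* Skip congested rgrps on the first two
				   passes; the final pass (loops == 2) must
				   try every rgrp, so that any free space
				   which remains is eventually found */
				if ((loops < 2) &&
				    gfs2_rgrp_used_recently(rs, 1000) &&
				    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
					goto next_rgrp;
			}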

Definitely better wrt figuring out which rgrps to prefer, but I'm not 
yet convinced about this logic. The whole point of the congestion logic 
is to figure out, ahead of time, whether it will take a long time to 
access that rgrp, so it seems that something is not quite right here, 
otherwise there would be no need to bypass it like this. The 
fast_to_acquire() logic should at least be merged into the 
gfs2_rgrp_congested() logic, possibly by just reducing the threshold at 
which congestion is measured.
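
For example (untested, and the factor is only a guess), rather than 
being bypassed altogether, gfs2_rgrp_congested() could simply widen its 
variance threshold for preferred rgrps, just before the final 
comparison:

	/* Hypothetical: tolerate more apparent congestion on rgrps
	   which this node prefers, since we expect to be (or become)
	   the dlm lock master for them */
	if (rgd->rd_flags & GFS2_RDF_PREFERRED)
		var *= 2;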

It might be useful to introduce a tracepoint for when we reject an rgrp 
during allocation, with a reason as to why it was rejected, so that it 
is easier to see what's going on here.
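
Something along these lines in trace_gfs2.h would do (the event name 
and reason encoding are just for illustration):

TRACE_EVENT(gfs2_rgrp_reject,

	TP_PROTO(const struct gfs2_rgrpd *rgd, int reason),

	TP_ARGS(rgd, reason),

	TP_STRUCT__entry(
		__field(dev_t,	dev)
		__field(u64,	addr)
		__field(int,	reason)
	),

	TP_fast_assign(
		__entry->dev	= rgd->rd_sbd->sd_vfs->s_dev;
		__entry->addr	= rgd->rd_addr;
		__entry->reason	= reason;
	),

	TP_printk("%u,%u rgrp:%llu reject reason:%d",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  (unsigned long long)__entry->addr, __entry->reason)
);

with each goto next_rgrp site passing a distinct reason code, so that 
skips due to congestion can be told apart from skips due to 
fast_to_acquire().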

Steve.

>   			error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
>   						   LM_ST_EXCLUSIVE, flags,
>   						   &rs->rs_rgd_gh);



