[Cluster-devel] [PATCH v3] GFS2: Add a next-resource-group pointer to resource groups

Andrew Price anprice at redhat.com
Mon Feb 13 18:06:51 UTC 2017


I should have mentioned: v3 just adds a check for MS_RDONLY and only 
updates the rg_skip value if the fs is rw.

Andy

On 13/02/17 17:59, Andrew Price wrote:
> Add a new rg_skip field to struct gfs2_rgrp, replacing __pad. The
> rg_skip field has the following meaning:
>
> - If rg_skip is zero, it is considered unset and not useful.
> - If rg_skip is non-zero, its value will be the number of blocks between
>   this rgrp's address and the next rgrp's address. This can be used as a
>   hint by fsck.gfs2 when rebuilding a bad rindex, for example.
>
> When gfs2_rgrp_bh_get() reads a resource group header and finds rg_skip
> to be 0 it will attempt to set it to the difference between its rd_addr
> and the rd_addr of the next resource group.
>
> The only special case is the final rgrp, which always has a rg_skip of
> 0. It is not set to a special value (like -1) because, when the
> filesystem is grown, the rgrp will no longer be the final one and it
> will then need to have its rg_skip field set. The overhead of this
> special case is a gfs2_rgrpd_get_next() call each time
> gfs2_rgrp_bh_get() is called for the final resource group.
>
> For the other resource groups, if the rg_skip field is 0, it is set
> appropriately and then the only overhead becomes the rgd->rg_skip == 0
> comparison in gfs2_rgrp_bh_get().
>
> Before this patch, gfs2_rgrp_out() zeroes the __pad field explicitly, so
> the rg_skip field can get set back to 0 in cases where nodes with and
> without this patch are mixed in a cluster. In some cases, the field may
> bounce between being set by one node and then zeroed by another which
> may harm performance slightly, e.g. when two nodes create many small
> files. In testing this situation is rare but it becomes more likely as
> the filesystem fills up and there are fewer resource groups to choose
> from. The problem goes away when all nodes are running with this patch.
> Dipping into the space currently occupied by the rg_reserved field would
> have resulted in the same problem as it is also explicitly zeroed, so
> unfortunately there is no other way around it.
>
> Signed-off-by: Andrew Price <anprice at redhat.com>
> ---
>  fs/gfs2/incore.h                 |  1 +
>  fs/gfs2/rgrp.c                   | 27 ++++++++++++++++++++++++++-
>  include/uapi/linux/gfs2_ondisk.h |  5 ++++-
>  3 files changed, 31 insertions(+), 2 deletions(-)
>
> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index a6a3389..2c03287 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -88,6 +88,7 @@ struct gfs2_rgrpd {
>  	u32 rd_reserved;                /* number of blocks reserved */
>  	u32 rd_free_clone;
>  	u32 rd_dinodes;
> +	u32 rd_skip;                    /* Distance to the next rgrp in fs blocks */
>  	u64 rd_igeneration;
>  	struct gfs2_bitmap *rd_bits;
>  	struct gfs2_sbd *rd_sbd;
> diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
> index 86ccc015..aaf435d 100644
> --- a/fs/gfs2/rgrp.c
> +++ b/fs/gfs2/rgrp.c
> @@ -1049,6 +1049,7 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
>  	rgd->rd_flags |= rg_flags;
>  	rgd->rd_free = be32_to_cpu(str->rg_free);
>  	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
> +	rgd->rd_skip = be32_to_cpu(str->rg_skip);
>  	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
>  }
>
> @@ -1059,7 +1060,7 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
>  	str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
>  	str->rg_free = cpu_to_be32(rgd->rd_free);
>  	str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
> -	str->__pad = cpu_to_be32(0);
> +	str->rg_skip = cpu_to_be32(rgd->rd_skip);
>  	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
>  	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
>  }
> @@ -1119,6 +1120,28 @@ static u32 count_unlinked(struct gfs2_rgrpd *rgd)
>  	return count;
>  }
>
> +/**
> + * Set the rg_next field if this isn't the final rgrp.
> + */
> +static void gfs2_rgrp_set_skip(struct gfs2_rgrpd *rgd)
> +{
> +	struct gfs2_sbd *sdp = rgd->rd_sbd;
> +	struct buffer_head *bh = rgd->rd_bits[0].bi_bh;
> +	struct gfs2_rgrpd *next = gfs2_rgrpd_get_next(rgd);
> +
> +	if (next == NULL || next->rd_addr <= rgd->rd_addr)
> +		return;
> +
> +	if (gfs2_trans_begin(sdp, RES_RG_HDR, 0) != 0)
> +		return;
> +
> +	rgd->rd_skip = next->rd_addr - rgd->rd_addr;
> +	gfs2_trans_add_meta(rgd->rd_gl, bh);
> +	gfs2_rgrp_out(rgd, bh->b_data);
> +	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, bh->b_data);
> +	gfs2_trans_end(sdp);
> +	return;
> +}
>
>  /**
>   * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
> @@ -1184,6 +1207,8 @@ static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
>  		if (rgd->rd_rgl->rl_unlinked == 0)
>  			rgd->rd_flags &= ~GFS2_RDF_CHECK;
>  	}
> +	if (rgd->rd_skip == 0 && !(sdp->sd_vfs->s_flags & MS_RDONLY))
> +		gfs2_rgrp_set_skip(rgd);
>  	return 0;
>
>  fail:
> diff --git a/include/uapi/linux/gfs2_ondisk.h b/include/uapi/linux/gfs2_ondisk.h
> index 7c4be77..0064381f 100644
> --- a/include/uapi/linux/gfs2_ondisk.h
> +++ b/include/uapi/linux/gfs2_ondisk.h
> @@ -186,7 +186,10 @@ struct gfs2_rgrp {
>  	__be32 rg_flags;
>  	__be32 rg_free;
>  	__be32 rg_dinodes;
> -	__be32 __pad;
> +	union {
> +		__be32 __pad;
> +		__be32 rg_skip; /* Distance to the next rgrp in fs blocks */
> +	};
>  	__be64 rg_igeneration;
>
>  	__u8 rg_reserved[80]; /* Several fields from gfs1 now reserved */
>




More information about the Cluster-devel mailing list