[Cluster-devel] [PATCH v3] GFS2: Add a next-resource-group pointer to resource groups
Andrew Price
anprice at redhat.com
Mon Feb 13 18:06:51 UTC 2017
I should have mentioned: v3 just adds a check for MS_RDONLY and only
updates the rg_skip value if the fs is rw.
Andy
On 13/02/17 17:59, Andrew Price wrote:
> Add a new rg_skip field to struct gfs2_rgrp, replacing __pad. The
> rg_skip field has the following meaning:
>
> - If rg_skip is zero, it is considered unset and not useful.
> - If rg_skip is non-zero, its value will be the number of blocks between
> this rgrp's address and the next rgrp's address. This can be used as a
> hint by fsck.gfs2 when rebuilding a bad rindex, for example.
>
> When gfs2_rgrp_bh_get() reads a resource group header and finds rg_skip
> to be 0, it will attempt to set it to the difference between its rd_addr
> and the rd_addr of the next resource group.
>
> The only special case is the final rgrp, which always has a rg_skip of
> 0. It is not set to a special value (like -1) because, when the
> filesystem is grown, the rgrp will no longer be the final one and it
> will then need to have its rg_skip field set. The overhead of this
> special case is a gfs2_rgrpd_get_next() call each time
> gfs2_rgrp_bh_get() is called for the final resource group.
>
> For the other resource groups, if the rg_skip field is 0, it is set
> appropriately and then the only overhead becomes the rgd->rd_skip == 0
> comparison in gfs2_rgrp_bh_get().
>
> Before this patch, gfs2_rgrp_out() zeroes the __pad field explicitly, so
> the rg_skip field can get set back to 0 in cases where nodes with and
> without this patch are mixed in a cluster. In some cases, the field may
> bounce between being set by one node and then zeroed by another which
> may harm performance slightly, e.g. when two nodes create many small
> files. In testing, this situation is rare, but it becomes more likely as
> the filesystem fills up and there are fewer resource groups to choose
> from. The problem goes away when all nodes are running with this patch.
> Dipping into the space currently occupied by the rg_reserved field would
> have resulted in the same problem as it is also explicitly zeroed, so
> unfortunately there is no other way around it.
>
> Signed-off-by: Andrew Price <anprice at redhat.com>
> ---
> fs/gfs2/incore.h | 1 +
> fs/gfs2/rgrp.c | 27 ++++++++++++++++++++++++++-
> include/uapi/linux/gfs2_ondisk.h | 5 ++++-
> 3 files changed, 31 insertions(+), 2 deletions(-)
>
> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index a6a3389..2c03287 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -88,6 +88,7 @@ struct gfs2_rgrpd {
> u32 rd_reserved; /* number of blocks reserved */
> u32 rd_free_clone;
> u32 rd_dinodes;
> + u32 rd_skip; /* Distance to the next rgrp in fs blocks */
> u64 rd_igeneration;
> struct gfs2_bitmap *rd_bits;
> struct gfs2_sbd *rd_sbd;
> diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
> index 86ccc015..aaf435d 100644
> --- a/fs/gfs2/rgrp.c
> +++ b/fs/gfs2/rgrp.c
> @@ -1049,6 +1049,7 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
> rgd->rd_flags |= rg_flags;
> rgd->rd_free = be32_to_cpu(str->rg_free);
> rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
> + rgd->rd_skip = be32_to_cpu(str->rg_skip);
> rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
> }
>
> @@ -1059,7 +1060,7 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
> str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
> str->rg_free = cpu_to_be32(rgd->rd_free);
> str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
> - str->__pad = cpu_to_be32(0);
> + str->rg_skip = cpu_to_be32(rgd->rd_skip);
> str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
> memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
> }
> @@ -1119,6 +1120,28 @@ static u32 count_unlinked(struct gfs2_rgrpd *rgd)
> return count;
> }
>
> +/**
> + * Set the rg_skip field if this isn't the final rgrp.
> + */
> +static void gfs2_rgrp_set_skip(struct gfs2_rgrpd *rgd)
> +{
> + struct gfs2_sbd *sdp = rgd->rd_sbd;
> + struct buffer_head *bh = rgd->rd_bits[0].bi_bh;
> + struct gfs2_rgrpd *next = gfs2_rgrpd_get_next(rgd);
> +
> + if (next == NULL || next->rd_addr <= rgd->rd_addr)
> + return;
> +
> + if (gfs2_trans_begin(sdp, RES_RG_HDR, 0) != 0)
> + return;
> +
> + rgd->rd_skip = next->rd_addr - rgd->rd_addr;
> + gfs2_trans_add_meta(rgd->rd_gl, bh);
> + gfs2_rgrp_out(rgd, bh->b_data);
> + gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, bh->b_data);
> + gfs2_trans_end(sdp);
> + return;
> +}
>
> /**
> * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps
> @@ -1184,6 +1207,8 @@ static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
> if (rgd->rd_rgl->rl_unlinked == 0)
> rgd->rd_flags &= ~GFS2_RDF_CHECK;
> }
> + if (rgd->rd_skip == 0 && !(sdp->sd_vfs->s_flags & MS_RDONLY))
> + gfs2_rgrp_set_skip(rgd);
> return 0;
>
> fail:
> diff --git a/include/uapi/linux/gfs2_ondisk.h b/include/uapi/linux/gfs2_ondisk.h
> index 7c4be77..0064381f 100644
> --- a/include/uapi/linux/gfs2_ondisk.h
> +++ b/include/uapi/linux/gfs2_ondisk.h
> @@ -186,7 +186,10 @@ struct gfs2_rgrp {
> __be32 rg_flags;
> __be32 rg_free;
> __be32 rg_dinodes;
> - __be32 __pad;
> + union {
> + __be32 __pad;
> + __be32 rg_skip; /* Distance to the next rgrp in fs blocks */
> + };
> __be64 rg_igeneration;
>
> __u8 rg_reserved[80]; /* Several fields from gfs1 now reserved */
>
More information about the Cluster-devel
mailing list