[Cluster-devel] [PATCH 04/14] mkfs.gfs2: Fix the resource group layout strategy, again
Andrew Price
anprice at redhat.com
Thu Apr 3 15:12:37 UTC 2014
The previous attempt at improving the resource group layout and
alignment fell somewhat short of the mark and left some issues, such as
the possibility of leaving a small resource group at the end of the
device which gfs2_grow could wrongly use as the file system's resource
group size.
The core of this patch is the new lgfs2_rgrps_plan() function which
calculates a sensible resource group size given a target maximum (which
we now default to 2GB instead of 256MB). In order to avoid leaving a gap
or a small rgrp at the end of the device, we adjust the rgrp length down
until a further adjustment would leave a gap, then apply a constant
adjustment to the size of a subset of the resource groups.
The rest of the patch aims to clean up libgfs2's resource group API and
give more control to the application rather than storing a lot of the
resource group layout parameters in the lgfs2_rgrps_t. This should make
it easier to use the same functions in gfs2_grow, fsck.gfs2 and any
other tools which might need to manipulate resource groups.
Signed-off-by: Andrew Price <anprice at redhat.com>
---
gfs2/libgfs2/libgfs2.h | 16 ++--
gfs2/libgfs2/rgrp.c | 206 ++++++++++++++++++++++++++++++++++---------------
gfs2/mkfs/main_mkfs.c | 104 +++++++++++++++++--------
3 files changed, 218 insertions(+), 108 deletions(-)
diff --git a/gfs2/libgfs2/libgfs2.h b/gfs2/libgfs2/libgfs2.h
index ce51e8c..24947c2 100644
--- a/gfs2/libgfs2/libgfs2.h
+++ b/gfs2/libgfs2/libgfs2.h
@@ -186,19 +186,16 @@ struct rgrp_tree {
struct gfs2_buffer_head **bh;
};
-struct lgfs2_rgrp_align {
- uint64_t base;
- uint64_t offset;
-};
-
typedef struct rgrp_tree *lgfs2_rgrp_t;
typedef struct _lgfs2_rgrps *lgfs2_rgrps_t;
-extern lgfs2_rgrps_t lgfs2_rgrps_init(unsigned bsize, uint64_t start, uint64_t devlen, uint32_t rglen, struct lgfs2_rgrp_align *al);
+extern lgfs2_rgrps_t lgfs2_rgrps_init(unsigned bsize, uint64_t devlen, uint64_t align, uint64_t offset);
+extern uint64_t lgfs2_rgrp_align_addr(const lgfs2_rgrps_t rgs, uint64_t addr);
+extern uint32_t lgfs2_rgrp_align_len(const lgfs2_rgrps_t rgs, uint32_t len);
extern unsigned lgfs2_rgsize_for_data(uint64_t blksreq, unsigned bsize);
-extern lgfs2_rgrp_t lgfs2_rgrp_append(lgfs2_rgrps_t rgs, uint32_t rglen, int expand);
-extern int lgfs2_rgrp_write(int fd, lgfs2_rgrp_t rg, unsigned bsize);
-extern int lgfs2_rgrps_end(lgfs2_rgrps_t rgs);
+extern uint32_t lgfs2_rgrps_plan(const lgfs2_rgrps_t rgs, uint64_t space, uint32_t tgtsize);
+extern lgfs2_rgrp_t lgfs2_rgrp_append(lgfs2_rgrps_t rgs, uint64_t addr, uint32_t rglen, uint64_t *nextaddr);
+extern int lgfs2_rgrp_write(lgfs2_rgrps_t rgs, int fd, lgfs2_rgrp_t rg);
extern struct gfs2_rindex *lgfs2_rgrp_index(lgfs2_rgrp_t rg);
// Temporary function to aid API migration
extern struct osi_node *lgfs2_rgrps_root(lgfs2_rgrps_t rgs) __attribute__((deprecated));
@@ -350,7 +347,6 @@ struct metapath {
#define GFS2_EXP_MIN_RGSIZE (1)
#define GFS2_MIN_RGSIZE (32)
-/* Look at this! Why can't we go bigger than 2GB? */
#define GFS2_MAX_RGSIZE (2048)
/* meta.c */
diff --git a/gfs2/libgfs2/rgrp.c b/gfs2/libgfs2/rgrp.c
index 0752772..1242385 100644
--- a/gfs2/libgfs2/rgrp.c
+++ b/gfs2/libgfs2/rgrp.c
@@ -223,6 +223,11 @@ void gfs2_rgrp_free(struct osi_root *rgrp_tree)
}
}
+struct rgplan {
+ uint32_t num;
+ uint32_t len;
+};
+
/**
* This structure is defined in libgfs2.h as an opaque type. It stores the
* constants and context required for creating resource groups from any point
@@ -230,17 +235,11 @@ void gfs2_rgrp_free(struct osi_root *rgrp_tree)
*/
struct _lgfs2_rgrps {
struct osi_root root;
- uint64_t nextaddr;
+ struct rgplan plan[2];
unsigned bsize;
unsigned long align;
unsigned long align_off;
- unsigned long curr_offset;
- uint64_t maxrgsz;
- uint64_t minrgsz;
uint64_t devlen;
- uint64_t count;
- uint64_t blks_total;
- uint32_t rgsize;
};
static uint64_t align_block(const uint64_t base, const uint64_t align)
@@ -251,29 +250,121 @@ static uint64_t align_block(const uint64_t base, const uint64_t align)
}
/**
+ * Calculate the aligned block address of a resource group.
+ * rgs: The resource groups handle
+ * base: The base address of the first resource group address, in blocks
+ * Returns the aligned address of the first resource group.
+ */
+uint64_t lgfs2_rgrp_align_addr(const lgfs2_rgrps_t rgs, uint64_t addr)
+{
+ return align_block(addr, rgs->align);
+}
+
+/**
+ * Calculate the aligned relative address of the next resource group (and thus
+ * the aligned length of this one).
+ * rgs: The resource groups handle
+ * base: The base length of the current resource group, in blocks
+ * Returns the length of the resource group (the aligned relative address of
+ * the next one)
+ */
+uint32_t lgfs2_rgrp_align_len(const lgfs2_rgrps_t rgs, uint32_t len)
+{
+ return align_block(len, rgs->align) + rgs->align_off;
+}
+
+/**
+ * Plan the sizes of resource groups for remaining free space, based on a
+ * target maximum size. In order to make best use of the space while keeping
+ * the resource groups aligned appropriately we need to either reduce the
+ * length of every resource group or of a subset of the resource groups, so
+ * we're left with either one or two resource group sizes. We keep track of
+ * both of these and the numbers of each size of resource group inside the
+ * resource groups descriptor.
+ * rgs: The resource groups descriptor
+ * space: The number of remaining blocks to be allocated
+ * tgtsize: The target resource group size in blocks
+ * Returns the larger of the calculated resource group sizes or 0 if the
+ * smaller would be less than GFS2_MIN_RGSIZE.
+ */
+uint32_t lgfs2_rgrps_plan(const lgfs2_rgrps_t rgs, uint64_t space, uint32_t tgtsize)
+{
+ uint32_t maxlen = (GFS2_MAX_RGSIZE << 20) / rgs->bsize;
+ uint32_t minlen = (GFS2_MIN_RGSIZE << 20) / rgs->bsize;
+
+ /* Apps should already have checked that the rg size is <=
+ GFS2_MAX_RGSIZE but just in case alignment pushes it over we clamp
+ it back down while calculating the initial rgrp length. */
+ do {
+ rgs->plan[0].len = lgfs2_rgrp_align_len(rgs, tgtsize);
+ tgtsize -= (rgs->align + 1);
+ } while (rgs->plan[0].len > maxlen);
+
+ rgs->plan[0].num = space / rgs->plan[0].len;
+
+ if ((space - (rgs->plan[0].num * rgs->plan[0].len)) > rgs->align) {
+ unsigned adj = (rgs->align > 0) ? rgs->align : 1;
+
+ /* Spread the adjustment required to fit a new rgrp at the end
+ over all of the rgrps so that we don't end with a single
+ tiny one. */
+ while (((rgs->plan[0].len - adj) * (rgs->plan[0].num + 1)) >= space)
+ rgs->plan[0].len -= adj;
+
+ /* We've adjusted the size of the rgrps down as far as we can
+ without leaving a large gap at the end of the device now,
+ but we still need to reduce the size of some rgrps in order
+ to make everything fit, so we use the second rgplan to
+ specify a second length for a subset of the resource groups.
+ If plan[0].len already divides the space with no remainder,
+ plan[1].num will stay 0 and it won't be used. */
+ rgs->plan[1].len = rgs->plan[0].len - adj;
+ rgs->plan[1].num = 0;
+
+ while (((rgs->plan[0].len * rgs->plan[0].num) +
+ (rgs->plan[1].len * rgs->plan[1].num)) > space) {
+ /* Total number of rgrps stays constant now. We just
+ need to shift some weight around */
+ rgs->plan[0].num--;
+ rgs->plan[1].num++;
+ }
+ }
+
+ /* Once we've reached this point,
+ (plan[0].num * plan[0].len) + (plan[1].num * plan[1].len)
+ will be less than one adjustment smaller than 'space'. */
+
+ if (rgs->plan[0].len < minlen)
+ return 0;
+
+ return rgs->plan[0].len;
+}
+
+/**
* Create and initialise an empty set of resource groups
* bsize: The block size of the fs
- * start: The block address of the first resource group
* devlen: The length of the device, in fs blocks
- * rglen: Default rg size, in blocks
- * al: The required alignment of the resource groups
+ * align: The required stripe alignment of the resource groups. Must be a multiple of 'offset'.
+ * offset: The required stripe offset of the resource groups
* Returns an initialised lgfs2_rgrps_t or NULL if unsuccessful with errno set
*/
-lgfs2_rgrps_t lgfs2_rgrps_init(unsigned bsize, uint64_t start, uint64_t devlen, uint32_t rglen, struct lgfs2_rgrp_align *al)
+lgfs2_rgrps_t lgfs2_rgrps_init(unsigned bsize, uint64_t devlen, uint64_t align, uint64_t offset)
{
- lgfs2_rgrps_t rgs = calloc(1, sizeof(*rgs));
+ lgfs2_rgrps_t rgs;
+
+ errno = EINVAL;
+ if (offset != 0 && (align % offset) != 0)
+ return NULL;
+
+ rgs = calloc(1, sizeof(*rgs));
if (rgs == NULL)
return NULL;
rgs->bsize = bsize;
- rgs->maxrgsz = (GFS2_MAX_RGSIZE << 20) / bsize;
- rgs->minrgsz = (GFS2_MIN_RGSIZE << 20) / bsize;
- rgs->rgsize = rglen;
rgs->devlen = devlen;
- rgs->align = al->base;
- rgs->align_off = al->offset;
+ rgs->align = align;
+ rgs->align_off = offset;
memset(&rgs->root, 0, sizeof(rgs->root));
- rgs->nextaddr = align_block(start, rgs->align);
return rgs;
}
@@ -287,14 +378,6 @@ struct gfs2_rindex *lgfs2_rgrp_index(lgfs2_rgrp_t rg)
}
/**
- * Return non-zero if there is space left for more resource groups or zero if not
- */
-int lgfs2_rgrps_end(lgfs2_rgrps_t rgs)
-{
- return (rgs->nextaddr == 0);
-}
-
-/**
* Returns the total resource group size, in blocks, required to give blksreq data blocks
*/
unsigned lgfs2_rgsize_for_data(uint64_t blksreq, unsigned bsize)
@@ -316,43 +399,38 @@ struct osi_node *lgfs2_rgrps_root(lgfs2_rgrps_t rgs)
/**
* Create a new resource group after the last resource group in a set.
* rgs: The set of resource groups
- * rglen: The required length of the resource group. If its is 0 the default rgsize
- * passed to lgfs2_rgrps_init() is used.
- * expand: Whether to expand the resource group when alignment would leave a gap.
- * Returns the new resource group on success or NULL on failure.
+ * addr: The address at which to place this resource group
+ * rglen: The required length of the resource group, in fs blocks.
+ * Returns the new resource group on success or NULL on failure with errno set.
+ * If errno is ENOSPC on a NULL return from this function, it could be
+ * interpreted as 'finished' unless you expected there to be enough space on
+ * the device for the resource group.
*/
-lgfs2_rgrp_t lgfs2_rgrp_append(lgfs2_rgrps_t rgs, uint32_t rglen, int expand)
+lgfs2_rgrp_t lgfs2_rgrp_append(lgfs2_rgrps_t rgs, uint64_t addr, uint32_t rglen, uint64_t *nextaddr)
{
int err = 0;
- lgfs2_rgrp_t rg = rgrp_insert(&rgs->root, rgs->nextaddr);
- if (rg == NULL)
- return NULL;
-
- rgs->curr_offset += rgs->align_off;
- if (rgs->curr_offset >= rgs->align)
- rgs->curr_offset = 0;
-
- if (rgs->rgsize > rglen)
- rglen = rgs->rgsize;
-
- rgs->nextaddr = align_block(rg->ri.ri_addr + rgs->rgsize, rgs->align) + rgs->curr_offset;
- /* Use up gap left by alignment if possible */
- if (expand && ((rgs->nextaddr - rg->ri.ri_addr) <= rgs->maxrgsz))
- rglen = rgs->nextaddr - rg->ri.ri_addr;
-
- if ((rgs->nextaddr + rgs->rgsize) > rgs->devlen) {
- /* Squeeze the last 1 or 2 rgs into the remaining space */
- if ((rgs->nextaddr < rgs->devlen) && ((rgs->devlen - rgs->nextaddr) >= rgs->minrgsz)) {
- rgs->rgsize = rgs->devlen - rgs->nextaddr;
+ lgfs2_rgrp_t rg;
+
+ if (rglen == 0) {
+ if (rgs->plan[0].num > 0) {
+ rglen = rgs->plan[0].len;
+ rgs->plan[0].num--;
+ } else if (rgs->plan[1].num > 0) {
+ rglen = rgs->plan[1].len;
+ rgs->plan[1].num--;
} else {
- if (rgs->devlen - rg->ri.ri_addr <= rgs->maxrgsz)
- rglen = rgs->devlen - rg->ri.ri_addr;
- else
- rglen = rgs->maxrgsz;
- /* This is the last rg */
- rgs->nextaddr = 0;
+ errno = ENOSPC;
+ return NULL;
}
}
+ if (addr + rglen > rgs->devlen) {
+ errno = ENOSPC;
+ return NULL;
+ }
+
+ rg = rgrp_insert(&rgs->root, addr);
+ if (rg == NULL)
+ return NULL;
rg->ri.ri_length = rgblocks2bitblocks(rgs->bsize, rglen, &rg->ri.ri_data);
rg->ri.ri_data0 = rg->ri.ri_addr + rg->ri.ri_length;
@@ -361,12 +439,12 @@ lgfs2_rgrp_t lgfs2_rgrp_append(lgfs2_rgrps_t rgs, uint32_t rglen, int expand)
rg->rg.rg_header.mh_type = GFS2_METATYPE_RG;
rg->rg.rg_header.mh_format = GFS2_FORMAT_RG;
rg->rg.rg_free = rg->ri.ri_data;
-
err = gfs2_compute_bitstructs(rgs->bsize, rg);
if (err != 0)
return NULL;
- rgs->blks_total += rg->ri.ri_data;
- rgs->count++;
+
+ if (nextaddr)
+ *nextaddr = rg->ri.ri_addr + rglen;
return rg;
}
@@ -374,10 +452,10 @@ lgfs2_rgrp_t lgfs2_rgrp_append(lgfs2_rgrps_t rgs, uint32_t rglen, int expand)
* Write a resource group to a file descriptor.
* Returns 0 on success or non-zero on failure with errno set
*/
-int lgfs2_rgrp_write(int fd, lgfs2_rgrp_t rg, unsigned bsize)
+int lgfs2_rgrp_write(const lgfs2_rgrps_t rgs, int fd, const lgfs2_rgrp_t rg)
{
ssize_t ret = 0;
- size_t len = rg->ri.ri_length * bsize;
+ size_t len = rg->ri.ri_length * rgs->bsize;
unsigned int i;
const struct gfs2_meta_header bmh = {
.mh_magic = GFS2_MAGIC,
@@ -390,9 +468,9 @@ int lgfs2_rgrp_write(int fd, lgfs2_rgrp_t rg, unsigned bsize)
gfs2_rgrp_out(&rg->rg, buff);
for (i = 1; i < rg->ri.ri_length; i++)
- gfs2_meta_header_out(&bmh, buff + (i * bsize));
+ gfs2_meta_header_out(&bmh, buff + (i * rgs->bsize));
- ret = pwrite(fd, buff, len, rg->ri.ri_addr * bsize);
+ ret = pwrite(fd, buff, len, rg->ri.ri_addr * rgs->bsize);
if (ret != len) {
free(buff);
return -1;
diff --git a/gfs2/mkfs/main_mkfs.c b/gfs2/mkfs/main_mkfs.c
index 40f4766..ae82c9f 100644
--- a/gfs2/mkfs/main_mkfs.c
+++ b/gfs2/mkfs/main_mkfs.c
@@ -160,7 +160,7 @@ static void opts_init(struct mkfs_opts *opts)
opts->bsize = GFS2_DEFAULT_BSIZE;
opts->jsize = GFS2_DEFAULT_JSIZE;
opts->qcsize = GFS2_DEFAULT_QCSIZE;
- opts->rgsize = GFS2_DEFAULT_RGSIZE;
+ opts->rgsize = GFS2_MAX_RGSIZE;
opts->lockproto = "lock_dlm";
opts->locktable = "";
opts->confirm = 1;
@@ -583,9 +583,9 @@ static void warn_of_destruction(const char *path)
static lgfs2_rgrps_t rgs_init(struct mkfs_opts *opts, struct gfs2_sbd *sdp)
{
- uint64_t rgsize = (opts->rgsize << 20) / sdp->bsize;
- struct lgfs2_rgrp_align align = {.base = 0, .offset = 0};
lgfs2_rgrps_t rgs;
+ uint64_t al_base = 0;
+ uint64_t al_off = 0;
if (opts->align && opts->got_sunit) {
if ((opts->sunit % sdp->bsize) != 0) {
@@ -597,18 +597,18 @@ static lgfs2_rgrps_t rgs_init(struct mkfs_opts *opts, struct gfs2_sbd *sdp)
opts->swidth, opts->sunit);
exit(1);
} else {
- align.base = opts->swidth / sdp->bsize;
- align.offset = opts->sunit / sdp->bsize;
+ al_base = opts->swidth / sdp->bsize;
+ al_off = opts->sunit / sdp->bsize;
}
} else if (opts->align) {
if ((opts->dev.minimum_io_size > opts->dev.physical_sector_size) &&
(opts->dev.optimal_io_size > opts->dev.physical_sector_size)) {
- align.base = opts->dev.optimal_io_size / sdp->bsize;
- align.offset = opts->dev.minimum_io_size / sdp->bsize;
+ al_base = opts->dev.optimal_io_size / sdp->bsize;
+ al_off = opts->dev.minimum_io_size / sdp->bsize;
}
}
- rgs = lgfs2_rgrps_init(sdp->bsize, sdp->sb_addr + 1, sdp->device.length, rgsize, &align);
+ rgs = lgfs2_rgrps_init(sdp->bsize, sdp->device.length, al_base, al_off);
if (rgs == NULL) {
perror(_("Could not initialise resource groups"));
exit(-1);
@@ -617,7 +617,7 @@ static lgfs2_rgrps_t rgs_init(struct mkfs_opts *opts, struct gfs2_sbd *sdp)
if (opts->debug) {
printf(" rgrp align = ");
if (opts->align)
- printf("%lu+%lu blocks\n", align.base, align.offset);
+ printf("%lu+%lu blocks\n", al_base, al_off);
else
printf("(disabled)\n");
}
@@ -625,36 +625,71 @@ static lgfs2_rgrps_t rgs_init(struct mkfs_opts *opts, struct gfs2_sbd *sdp)
return rgs;
}
-static uint64_t place_rgrps(struct gfs2_sbd *sdp, lgfs2_rgrps_t rgs, struct mkfs_opts *opts)
+static int place_rgrp(struct gfs2_sbd *sdp, lgfs2_rgrps_t rgs, uint64_t rgaddr, uint32_t len, uint64_t *next)
{
int err = 0;
lgfs2_rgrp_t rg = NULL;
struct gfs2_rindex *ri = NULL;
- while (!lgfs2_rgrps_end(rgs)) {
- rg = lgfs2_rgrp_append(rgs, 0, !opts->got_rgsize);
- if (rg == NULL) {
- perror(_("Failed to create resource group"));
- return 0;
- }
- err = lgfs2_rgrp_write(sdp->device_fd, rg, sdp->bsize);
- if (err != 0) {
- perror(_("Failed to write resource group"));
- return 0;
- }
- ri = lgfs2_rgrp_index(rg);
- if (opts->debug) {
- gfs2_rindex_print(ri);
- printf("\n");
- }
- sdp->blks_total += ri->ri_data;
- sdp->rgrps++;
+ rg = lgfs2_rgrp_append(rgs, rgaddr, len, next);
+ if (rg == NULL) {
+ if (errno == ENOSPC)
+ return 1;
+ perror(_("Failed to create resource group"));
+ return -1;
+ }
+ err = lgfs2_rgrp_write(rgs, sdp->device_fd, rg);
+ if (err != 0) {
+ perror(_("Failed to write resource group"));
+ return -1;
+ }
+ ri = lgfs2_rgrp_index(rg);
+ if (sdp->debug) {
+ gfs2_rindex_print(ri);
+ printf("\n");
}
+ sdp->blks_total += ri->ri_data;
+ sdp->fssize = ri->ri_data0 + ri->ri_data;
+ sdp->rgrps++;
+ return 0;
+}
- if (ri == NULL)
- return 0;
- else
- return ri->ri_data0 + ri->ri_data;
+static int place_rgrps(struct gfs2_sbd *sdp, lgfs2_rgrps_t rgs, struct mkfs_opts *opts)
+{
+ uint64_t jfsize = lgfs2_space_for_data(sdp, sdp->bsize, opts->jsize << 20);
+ uint32_t jrgsize = lgfs2_rgsize_for_data(jfsize, sdp->bsize);
+ uint64_t rgaddr = lgfs2_rgrp_align_addr(rgs, sdp->sb_addr + 1);
+ uint32_t rgsize = lgfs2_rgrps_plan(rgs, sdp->device.length - rgaddr, ((opts->rgsize << 20) / sdp->bsize));
+ unsigned j;
+
+ if (rgsize >= jrgsize)
+ jrgsize = rgsize;
+
+ if (rgsize < ((GFS2_MIN_RGSIZE << 20) / sdp->bsize)) {
+ fprintf(stderr, _("Resource group size is too small\n"));
+ return -1;
+ } else if (rgsize < ((GFS2_DEFAULT_RGSIZE << 20) / sdp->bsize)) {
+ fprintf(stderr, _("Warning: small resource group size could impact performance\n"));
+ }
+
+ for (j = 0; j < opts->journals; j++) {
+ int result = place_rgrp(sdp, rgs, rgaddr, jrgsize, NULL);
+ if (result != 0)
+ return result;
+ rgaddr = rgaddr + jrgsize;
+ }
+
+ if (rgsize != jrgsize)
+ lgfs2_rgrps_plan(rgs, sdp->device.length - rgaddr, ((opts->rgsize << 20) / sdp->bsize));
+
+ while (1) {
+ int result = place_rgrp(sdp, rgs, rgaddr, 0, &rgaddr);
+ if (result < 0)
+ return result;
+ if (result > 0)
+ break; /* Done */
+ }
+ return 0;
}
static void sbd_init(struct gfs2_sbd *sdp, struct mkfs_opts *opts, unsigned bsize)
@@ -668,6 +703,7 @@ static void sbd_init(struct gfs2_sbd *sdp, struct mkfs_opts *opts, unsigned bsiz
sdp->md.journals = opts->journals;
sdp->device_fd = opts->dev.fd;
sdp->bsize = bsize;
+ sdp->debug = opts->debug;
if (compute_constants(sdp)) {
perror(_("Failed to compute file system constants"));
@@ -812,8 +848,8 @@ void main_mkfs(int argc, char *argv[])
if (!S_ISREG(opts.dev.stat.st_mode) && opts.discard)
discard_blocks(opts.dev.fd, opts.dev.size, opts.debug);
- sbd.fssize = place_rgrps(&sbd, rgs, &opts);
- if (sbd.fssize == 0) {
+ error = place_rgrps(&sbd, rgs, &opts);
+ if (error) {
fprintf(stderr, _("Failed to build resource groups\n"));
exit(1);
}
--
1.8.5.3
More information about the Cluster-devel
mailing list