[Cluster-devel] [GFS2 Patch] GFS2: Reduce file fragmentation

Steven Whitehouse swhiteho at redhat.com
Tue Jul 3 09:18:33 UTC 2012


Hi,

Some comments...

On Mon, 2012-07-02 at 11:34 -0400, Bob Peterson wrote:
> Hi,
> 
> Here is my long-awaited GFS2 file defragmentation patch for upstream:
> 
> Description:
> 
> This patch reduces GFS2 file fragmentation by pre-reserving blocks.
> 
> Comments:
> 
> Before the patch, if I invoke a script that does simultaneous dd's to
> a GFS2 file system, I end up with severe file fragmentation.
> The script looks like this:
> 
> #!/bin/bash
> dd if=/dev/zero of=/mnt/gfs2/10Ga bs=1M count=10240 &
> dd if=/dev/zero of=/mnt/gfs2/10Gb bs=1M count=10240 &
> dd if=/dev/zero of=/mnt/gfs2/10Gc bs=1M count=10240 &
> dd if=/dev/zero of=/mnt/gfs2/10Gd bs=1M count=10240 &
> dd if=/dev/zero of=/mnt/gfs2/10Ge bs=1M count=10240 &
> dd if=/dev/zero of=/mnt/gfs2/10Gf bs=1M count=10240 &
> dd if=/dev/zero of=/mnt/gfs2/10Gg bs=1M count=10240 &
> dd if=/dev/zero of=/mnt/gfs2/10Gh bs=1M count=10240 &
> dd if=/dev/zero of=/mnt/gfs2/10Gi bs=1M count=10240 &
> dd if=/dev/zero of=/mnt/gfs2/10Gj bs=1M count=10240 &
> 
> After the script finishes, I run this command to print file fragmentation:
> for i in a b c d e f g h i j ; do filefrag /mnt/gfs2/10G$i ; done
> 
> With the stock upstream GFS2 it showed:
> /mnt/gfs2/10Ga: 3950 extents found
> /mnt/gfs2/10Gb: 3701 extents found
> /mnt/gfs2/10Gc: 3719 extents found
> /mnt/gfs2/10Gd: 1999 extents found
> /mnt/gfs2/10Ge: 4399 extents found
> /mnt/gfs2/10Gf: 2092 extents found
> /mnt/gfs2/10Gg: 3274 extents found
> /mnt/gfs2/10Gh: 3786 extents found
> /mnt/gfs2/10Gi: 3761 extents found
> /mnt/gfs2/10Gj: 4446 extents found
> 
> With the defrag patch below it showed:
> 
> /mnt/gfs2/10Ga: 239 extents found
> /mnt/gfs2/10Gb: 246 extents found
> /mnt/gfs2/10Gc: 251 extents found
> /mnt/gfs2/10Gd: 242 extents found
> /mnt/gfs2/10Ge: 279 extents found
> /mnt/gfs2/10Gf: 276 extents found
> /mnt/gfs2/10Gg: 257 extents found
> /mnt/gfs2/10Gh: 254 extents found
> /mnt/gfs2/10Gi: 249 extents found
> /mnt/gfs2/10Gj: 261 extents found
> 
> Also, performance is improved in some use cases. Bear in mind this
> is a very slow local SATA hard drive, but:
> 
> Stock  : 421482496 bytes (421 MB) copied, 140.8 s, 2.9 MB/s
> Patched: 428494848 bytes (428 MB) copied, 103.65 s, 4.1 MB/s
> 
> (If you're wondering why only 421MB copied, the device
> is small and it filled up during the test in both versions.)
> 
> Regards,
> 
> Bob Peterson
> Red Hat File Systems
> 
> Signed-off-by: Bob Peterson <rpeterso at redhat.com> 
> ---
> diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
> index 6d957a8..7026811 100644
> --- a/fs/gfs2/bmap.c
> +++ b/fs/gfs2/bmap.c
> @@ -773,6 +773,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
>  	else
>  		goto out; /* Nothing to do */
>  
> +	/* Make sure the reservation rgrp is in the rlist too, because
> +	   we need to free it. */
> +	if (gfs2_rs_active(ip))
> +		gfs2_rlist_add(ip, &rlist, ip->i_res->rs_start);
> +
I don't understand this... why do we need to add the reservation to the
rlist here? It would be better to just drop it if we start to deallocate
blocks, since we'll probably want to start again in a different place if
we then begin allocating again.
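
In other words I'd expect something closer to the following -- an
untested sketch, and it assumes dropping the reservation at this point
can be made safe, which ties in with my locking comments further down:

	/* Deallocation invalidates whatever allocation pattern we were
	   following, so just throw the reservation away rather than
	   adding its rgrp to the rlist in order to free it later. */
	if (gfs2_rs_active(ip))
		gfs2_rs_treedel(ip, true);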

>  	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
>  
>  	for (x = 0; x < rlist.rl_rgrps; x++) {
> @@ -785,6 +790,9 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
>  	if (error)
>  		goto out_rlist;
>  
> +	if (gfs2_rs_active(ip)) /* needs to be done with the rgrp glock held */
> +		gfs2_rs_treedel(ip, true);
> +
>  	error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
>  				 RES_INDIRECT + RES_STATFS + RES_QUOTA,
>  				 revokes);
> diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
> index 6fbf3cb..95c5a00 100644
> --- a/fs/gfs2/file.c
> +++ b/fs/gfs2/file.c
> @@ -383,6 +383,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
>  	if (ret)
>  		return ret;
>  
> +	atomic_set(&ip->i_res->rs_sizehint,
> +		   PAGE_CACHE_SIZE / sdp->sd_sb.sb_bsize);
> +
>  	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
>  	ret = gfs2_glock_nq(&gh);
>  	if (ret)
> @@ -571,22 +574,23 @@ fail:
>  
>  static int gfs2_release(struct inode *inode, struct file *file)
>  {
> -	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
> -	struct gfs2_file *fp;
>  	struct gfs2_inode *ip = GFS2_I(inode);
>  
> -	fp = file->private_data;
> +	kfree(file->private_data);
>  	file->private_data = NULL;
>  
> -	if ((file->f_mode & FMODE_WRITE) && ip->i_res &&
> -	    (atomic_read(&inode->i_writecount) == 1))
> -		gfs2_rs_delete(ip);
> -
> -	if (gfs2_assert_warn(sdp, fp))
> -		return -EIO;
> -
> -	kfree(fp);
> +	if ((file->f_mode & FMODE_WRITE) && gfs2_rs_active(ip) &&
> +	    (atomic_read(&inode->i_writecount) == 1)) {
> +		struct gfs2_holder gh;
> +		struct gfs2_rgrpd *rgd = ip->i_res->rs_rgd;
> +		int error;
>  
> +		error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh);
> +		if (!error) {
> +			gfs2_rs_delete(ip);
> +			gfs2_glock_dq_uninit(&gh);
> +		}
> +	}
>  	return 0;
>  }

Why do we need to take an exclusive cluster lock here? Reservations are
local things, so we should be able to drop them without taking cluster
locks.
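
The pre-patch code simply called gfs2_rs_delete() here without any rgrp
lock, and I'd expect that shape to still work -- a rough sketch,
assuming gfs2_rs_delete() can take care of unlinking the reservation
from the tree under rd_rstree_mutex on its own:

	kfree(file->private_data);
	file->private_data = NULL;

	if ((file->f_mode & FMODE_WRITE) && ip->i_res &&
	    (atomic_read(&inode->i_writecount) == 1))
		gfs2_rs_delete(ip);
	return 0;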

>  
> @@ -662,14 +666,18 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
>  				   unsigned long nr_segs, loff_t pos)
>  {
>  	struct file *file = iocb->ki_filp;
> +	size_t writesize = iov_length(iov, nr_segs);
>  	struct dentry *dentry = file->f_dentry;
>  	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
> +	struct gfs2_sbd *sdp;
>  	int ret;
>  
> +	sdp = GFS2_SB(file->f_mapping->host);
>  	ret = gfs2_rs_alloc(ip);
>  	if (ret)
>  		return ret;
>  
> +	atomic_set(&ip->i_res->rs_sizehint, writesize / sdp->sd_sb.sb_bsize);
>  	if (file->f_flags & O_APPEND) {
>  		struct gfs2_holder gh;
>  
> @@ -795,6 +803,8 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
>  	if (unlikely(error))
>  		goto out_uninit;
>  
> +	atomic_set(&ip->i_res->rs_sizehint, len / sdp->sd_sb.sb_bsize);
> +
>  	while (len > 0) {
>  		if (len < bytes)
>  			bytes = len;
> @@ -803,10 +813,6 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
>  			offset += bytes;
>  			continue;
>  		}
> -		error = gfs2_rindex_update(sdp);
> -		if (error)
> -			goto out_unlock;
> -
>  		error = gfs2_quota_lock_check(ip);
>  		if (error)
>  			goto out_unlock;
> diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
> index dc73070..de114c4 100644
> --- a/fs/gfs2/incore.h
> +++ b/fs/gfs2/incore.h
> @@ -84,6 +84,7 @@ struct gfs2_rgrpd {
>  	u32 rd_data;			/* num of data blocks in rgrp */
>  	u32 rd_bitbytes;		/* number of bytes in data bitmaps */
>  	u32 rd_free;
> +	u32 rd_reserved;                /* number of blocks reserved */
>  	u32 rd_free_clone;
>  	u32 rd_dinodes;
>  	u64 rd_igeneration;
> @@ -91,11 +92,15 @@ struct gfs2_rgrpd {
>  	struct gfs2_sbd *rd_sbd;
>  	struct gfs2_rgrp_lvb *rd_rgl;
>  	u32 rd_last_alloc;
> +	u32 rd_next_rsrv;               /* preferred next reservation */
>  	u32 rd_flags;
>  #define GFS2_RDF_CHECK		0x10000000 /* check for unlinked inodes */
>  #define GFS2_RDF_UPTODATE	0x20000000 /* rg is up to date */
>  #define GFS2_RDF_ERROR		0x40000000 /* error in rg */
>  #define GFS2_RDF_MASK		0xf0000000 /* mask for internal flags */
> +	struct mutex rd_rstree_mutex;   /* protects the reservation tree */
I think it ought to protect more than just the reservation tree. We
should probably be using it to protect the rest of the rgrp fields too,
so let's just call it rd_mutex.

> +	struct rb_root rd_rstree;       /* multi-block reservation tree */
> +	u32 rd_rs_cnt;                  /* count of current reservations */
>  };
>  
>  enum gfs2_state_bits {
> @@ -233,6 +238,41 @@ struct gfs2_holder {
>  	unsigned long gh_ip;
>  };
>  
> +/* Resource group multi-block reservation, in order of appearance:
> +
> +   Step 1. Function prepares to write, allocates a mb, sets the size hint.
> +   Step 2. User calls inplace_reserve to target an rgrp, sets the rgrp info
> +   Step 3. Function get_local_rgrp locks the rgrp, determines which bits to use
> +   Step 4. Bits are assigned from the rgrp based on either the reservation
> +           or wherever it can.
> +*/
> +
> +struct gfs2_blkreserv {
> +	/* components used during write (step 1): */
> +	struct gfs2_inode *rs_ip;     /* pointer to the inode */
This doesn't hold a reference on the inode, so what prevents the inode
from vanishing? It would be better to arrange to drop the reservations
by scanning the inodes, rather than the rgrps, so that we don't need to
store this pointer. Either that or find a way to allow the reservations
to be dropped later when the inode is disposed of, and just mark them
invalid when disposing of the rgrps.
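
For example, return_all_reservations() could simply detach each
reservation and mark it dead, leaving the inode to clean up the rest
whenever it next looks -- a rough, untested sketch; the idea is that
clearing rs_start is enough for gfs2_rs_active() to report the
reservation as gone:

	while ((n = rb_first(&rgd->rd_rstree))) {
		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
		rb_erase(&rs->rs_node, &rgd->rd_rstree);
		/* gfs2_rs_active() now sees this reservation as inactive */
		rs->rs_free = 0;
		rs->rs_start = 0;
	}

That way the rs_ip back pointer isn't needed at all.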

> +	atomic_t rs_sizehint;         /* hint of the write size */
> +
> +	/* components used during inplace_reserve (step 2): */
> +	u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
> +
> +	/* components used during get_local_rgrp (step 3): */
> +	struct gfs2_rgrpd *rs_rgd;    /* pointer to the gfs2_rgrpd */
> +	struct gfs2_holder rs_rgd_gh; /* Filled in by get_local_rgrp */
> +	struct rb_node rs_node;       /* link to other block reservations */
> +
> +	/* components used during block searches and assignments (step 4): */
> +	struct gfs2_bitmap *rs_bi;    /* bitmap for the current allocation */
> +	u64 rs_start;                 /* absolute fs block address */
> +	u32 rs_biblk;                 /* start block relative to the bi */
> +	u32 rs_blks;                  /* number of blocks reserved */
> +	u32 rs_free;                  /* how many blocks are still free */
> +
> +	/* ancillary quota stuff */
> +	struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
> +	struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
> +	unsigned int rs_qa_qd_num;
> +};
> +
>  enum {
>  	GLF_LOCK			= 1,
>  	GLF_DEMOTE			= 3,
> @@ -290,16 +330,6 @@ struct gfs2_glock {
>  
>  #define GFS2_MIN_LVB_SIZE 32	/* Min size of LVB that gfs2 supports */
>  
> -struct gfs2_blkreserv {
> -	u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
> -	struct gfs2_holder rs_rgd_gh; /* Filled in by gfs2_inplace_reserve() */
> -
> -	/* ancillary quota stuff */
> -	struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
> -	struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
> -	unsigned int rs_qa_qd_num;
> -};
> -
>  enum {
>  	GIF_INVALID		= 0,
>  	GIF_QD_LOCKED		= 1,
> @@ -307,7 +337,6 @@ enum {
>  	GIF_SW_PAGED		= 3,
>  };
>  
> -
>  struct gfs2_inode {
>  	struct inode i_inode;
>  	u64 i_no_addr;
> @@ -318,7 +347,7 @@ struct gfs2_inode {
>  	struct gfs2_glock *i_gl; /* Move into i_gh? */
>  	struct gfs2_holder i_iopen_gh;
>  	struct gfs2_holder i_gh; /* for prepare/commit_write only */
> -	struct gfs2_blkreserv *i_res; /* resource group block reservation */
> +	struct gfs2_blkreserv *i_res; /* rgrp multi-block reservation */
>  	struct gfs2_rgrpd *i_rgd;
>  	u64 i_goal;	/* goal block for allocations */
>  	struct rw_semaphore i_rw_mutex;
> diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
> index 2b035e0..12c7b6a 100644
> --- a/fs/gfs2/inode.c
> +++ b/fs/gfs2/inode.c
> @@ -521,6 +521,9 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
>  	int error;
>  
>  	munge_mode_uid_gid(dip, &mode, &uid, &gid);
> +	error = gfs2_rindex_update(sdp);
> +	if (error)
> +		return error;
>  
>  	error = gfs2_quota_lock(dip, uid, gid);
>  	if (error)
> @@ -551,6 +554,10 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
>  	struct buffer_head *dibh;
>  	int error;
>  
> +	error = gfs2_rindex_update(sdp);
> +	if (error)
> +		return error;
> +
>  	error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
>  	if (error)
>  		goto fail;
> @@ -596,7 +603,8 @@ fail_end_trans:
>  	gfs2_trans_end(sdp);
>  
>  fail_ipreserv:
> -	gfs2_inplace_release(dip);
> +	if (alloc_required)
> +		gfs2_inplace_release(dip);
>  
>  fail_quota_locks:
>  	gfs2_quota_unlock(dip);
> @@ -738,6 +746,7 @@ fail_gunlock:
>  		iput(inode);
>  	}
>  fail:
> +	gfs2_rs_delete(dip);
>  	if (bh)
>  		brelse(bh);
>  	return error;
> diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
> index e53d0a1..4afb7f0 100644
> --- a/fs/gfs2/rgrp.c
> +++ b/fs/gfs2/rgrp.c
> @@ -35,6 +35,9 @@
>  #define BFITNOENT ((u32)~0)
>  #define NO_BLOCK ((u64)~0)
>  
> +#define RSRV_CONTENTION_FACTOR 2
> +#define RGRP_RSRV_MAX_CONTENDERS 4
> +
>  #if BITS_PER_LONG == 32
>  #define LBITMASK   (0x55555555UL)
>  #define LBITSKIP55 (0x55555555UL)
> @@ -178,6 +181,60 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
>  }
>  
>  /**
> + * rs_cmp - multi-block reservation range compare
> + * @blk: absolute file system block number of the new reservation
> + * @len: number of blocks in the new reservation
> + * @rs: existing reservation to compare against
> + *
> + * returns: 1 if the block range is beyond the reach of the reservation
> + *         -1 if the block range is before the start of the reservation
> + *          0 if the block range overlaps with the reservation
> + */
> +static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
> +{
> +	if (blk > rs->rs_start + rs->rs_blks - 1)
> +		return 1;
> +	if (blk + len - 1 < rs->rs_start)
> +		return -1;
> +	return 0;
> +}
> +
> +/**
> + * rs_find - Find a rgrp multi-block reservation that contains a given block
> + * @rgd: The rgrp
> + * @rgblk: The block we're looking for, relative to the rgrp
> + */
> +static struct gfs2_blkreserv *rs_find(struct gfs2_rgrpd *rgd, u32 rgblk)
> +{
> +	struct rb_node **newn;
> +	int rc;
> +	u64 fsblk = rgblk + rgd->rd_data0;
> +
> +	mutex_lock(&rgd->rd_rstree_mutex);
> +	newn = &rgd->rd_rstree.rb_node;
> +	while (*newn) {
> +		struct gfs2_blkreserv *cur =
> +			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
> +		rc = rs_cmp(fsblk, 1, cur);
> +		if (rc < 0)
> +			newn = &((*newn)->rb_left);
> +		else if (rc > 0)
> +			newn = &((*newn)->rb_right);
> +		else {
> +			mutex_unlock(&rgd->rd_rstree_mutex);
> +			return cur;
> +		}
> +	}
> +	mutex_unlock(&rgd->rd_rstree_mutex);
> +	return NULL;
> +}
> +
> +static inline u32 gfs2_bi2rgd_blk(struct gfs2_bitmap *bi, u32 blk)
> +{
> +	return (bi->bi_start * GFS2_NBBY) + blk;
> +}
> +
> +/**
>   * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
>   *       a block in a given allocation state.
>   * @buf: the buffer that holds the bitmaps
> @@ -435,8 +492,67 @@ int gfs2_rs_alloc(struct gfs2_inode *ip)
>  	return error;
>  }
>  
> +static void dump_rs(struct seq_file *seq, struct gfs2_blkreserv *rs)
> +{
> +	gfs2_print_dbg(seq, "@=%llu, rs_start=%llu, rs_biblk=%u, rs_blks=%u, "
> +	       "rs_free=%u\n", rs->rs_rgd->rd_addr, rs->rs_start, rs->rs_biblk,
> +	       rs->rs_blks, rs->rs_free);
> +}
> +
> +/**
> + * gfs2_rs_treedel - remove a multi-block reservation from the rgd tree
> + * @ip: The inode for this reservation
> + * @lock: we need to lock the rgrp
> + *
> + */
> +void gfs2_rs_treedel(struct gfs2_inode *ip, bool lock)
> +{
> +	struct gfs2_blkreserv *rs = ip->i_res;
> +	struct gfs2_rgrpd *rgd;
> +
> +	if (!gfs2_rs_active(ip))
> +		return;
> +
> +	rgd = rs->rs_rgd;
> +	/* We can't do this: The reason is that when the rgrp is invalidated,
> +	   it's in the "middle" of acquiring the glock, but the HOLDER bit
> +	   isn't set yet:
> +	   BUG_ON(!gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl));*/
> +	trace_gfs2_rs(ip, rs, 0, TRACE_RS_TREEDEL);
> +
> +	if (lock)
> +		mutex_lock(&rgd->rd_rstree_mutex);
> +	/* Now that we have the mutex, check again if the tree is empty just
> +	   in case someone else deleted this entry. */
> +	if (!RB_EMPTY_ROOT(&rgd->rd_rstree))
> +		rb_erase(&rs->rs_node, &rgd->rd_rstree);
> +	BUG_ON(!rgd->rd_rs_cnt);
> +	rgd->rd_rs_cnt--;
> +
> +	if (rs->rs_free) {
> +		/* return reserved blocks to the rgrp and the ip */
> +		BUG_ON(rs->rs_rgd->rd_reserved < rs->rs_free);
> +		rs->rs_rgd->rd_reserved -= rs->rs_free;
> +		rs->rs_free = 0;
> +		clear_bit(GBF_FULL, &rs->rs_bi->bi_flags);
> +		smp_mb__after_clear_bit();
> +	}
> +	if (lock)
> +		mutex_unlock(&rgd->rd_rstree_mutex);
> +	/* We can't change any of the step 1 or step 2 components of the rs.
> +	   E.g. We can't set rs_rgd to NULL because the rgd glock is held and
> +	   dequeued through this pointer.
> +	   Can't: atomic_set(&rs->rs_sizehint, 0);
> +	   Can't: rs->rs_requested = 0;
> +	   Can't: rs->rs_rgd = NULL;*/
> +	rs->rs_bi = NULL;
> +	rs->rs_start = 0;
> +	rs->rs_biblk = 0;
> +	rs->rs_blks = 0;
> +}
> +
>  /**
> - * gfs2_rs_delete - delete a reservation
> + * gfs2_rs_delete - delete a multi-block reservation
>   * @ip: The inode for this reservation
>   *
>   */
> @@ -444,12 +560,40 @@ void gfs2_rs_delete(struct gfs2_inode *ip)
>  {
>  	down_write(&ip->i_rw_mutex);
>  	if (ip->i_res) {
> +		if (gfs2_rs_active(ip))
> +			gfs2_rs_treedel(ip, true);
> +		trace_gfs2_rs(ip, ip->i_res, ip->i_res->rs_start,
> +			      TRACE_RS_DELETE);
> +		BUG_ON(ip->i_res->rs_free);
>  		kmem_cache_free(gfs2_rsrv_cachep, ip->i_res);
>  		ip->i_res = NULL;
>  	}
>  	up_write(&ip->i_rw_mutex);
>  }
>  
> +/**
> + * return_all_reservations - return all reserved blocks back to the rgrp.
> + * @rgd: the rgrp that needs its space back
> + *
> + * We previously reserved a bunch of blocks for allocation. Now we need to
> + * give them back. This leave the reservation structures in tact, but removes
> + * all of their corresponding "no-fly zones".
> + */
> +static void return_all_reservations(struct gfs2_rgrpd *rgd)
> +{
> +	struct rb_node *n;
> +	struct gfs2_blkreserv *rs;
> +	struct gfs2_inode *ip;
> +
> +	mutex_lock(&rgd->rd_rstree_mutex);
> +	while ((n = rb_first(&rgd->rd_rstree))) {
> +		rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
> +		ip = rs->rs_ip;
> +		gfs2_rs_treedel(ip, false);
> +	}
> +	mutex_unlock(&rgd->rd_rstree_mutex);
> +}
> +
>  void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
>  {
>  	struct rb_node *n;
> @@ -472,6 +616,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
>  
>  		gfs2_free_clones(rgd);
>  		kfree(rgd->rd_bits);
> +		return_all_reservations(rgd);
>  		kmem_cache_free(gfs2_rgrpd_cachep, rgd);
>  	}
>  }
> @@ -649,6 +794,7 @@ static int read_rindex_entry(struct gfs2_inode *ip)
>  	rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
>  	rgd->rd_data = be32_to_cpu(buf.ri_data);
>  	rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
> +	mutex_init(&rgd->rd_rstree_mutex);
>  
>  	error = compute_bitstructs(rgd);
>  	if (error)
> @@ -1115,29 +1261,231 @@ out:
>  }
>  
>  /**
> + * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
> + * @bi: the bitmap with the blocks
> + * @ip: the inode structure
> + * @biblk: the 32-bit block number relative to the start of the bitmap
> + * @amount: the number of blocks to reserve
> + *
> + * Returns: NULL - reservation was already taken, so not inserted
> + *          pointer to the inserted reservation
> + */
> +static struct gfs2_blkreserv *rs_insert(struct gfs2_bitmap *bi,
> +				       struct gfs2_inode *ip, u32 biblk,
> +				       int amount)
> +{
> +	struct rb_node **newn, *parent = NULL;
> +	int rc;
> +	struct gfs2_blkreserv *rs = ip->i_res;
> +	struct gfs2_rgrpd *rgd = rs->rs_rgd;
> +	u64 fsblock = gfs2_bi2rgd_blk(bi, biblk) + rgd->rd_data0;
> +
> +	mutex_lock(&rgd->rd_rstree_mutex);
> +	newn = &rgd->rd_rstree.rb_node;
> +	BUG_ON(!ip->i_res);
> +	BUG_ON(gfs2_rs_active(ip));
> +	/* Figure out where to put new node */
> +	/*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/
> +	while (*newn) {
> +		struct gfs2_blkreserv *cur =
> +			rb_entry(*newn, struct gfs2_blkreserv, rs_node);
> +
> +		parent = *newn;
> +		rc = rs_cmp(fsblock, amount, cur);
> +		if (rc > 0)
> +			newn = &((*newn)->rb_right);
> +		else if (rc < 0)
> +			newn = &((*newn)->rb_left);
> +		else {
> +			mutex_unlock(&rgd->rd_rstree_mutex);
> +			return NULL; /* reservation already in use */
> +		}
> +	}
> +
> +	/* Do our reservation work */
> +	rs = ip->i_res;
> +	rs->rs_start = fsblock;
> +	rs->rs_blks = amount;
> +	rs->rs_free = amount;
> +	rs->rs_biblk = biblk;
> +	rs->rs_bi = bi;
> +	rs->rs_ip = ip;
> +	rb_link_node(&rs->rs_node, parent, newn);
> +	rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
> +
> +	/* Do our inode accounting for the reservation */
> +	/*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/
> +
> +	/* Do our rgrp accounting for the reservation */
> +	rgd->rd_reserved += amount; /* blocks reserved */
> +	rgd->rd_next_rsrv = rs->rs_start + rs->rs_blks;
> +	rgd->rd_rs_cnt++; /* number of in-tree reservations */
> +	if (!rgrp_contains_block(rgd, rgd->rd_next_rsrv))
> +		rgd->rd_next_rsrv = 0;
> +	mutex_unlock(&rgd->rd_rstree_mutex);
> +	trace_gfs2_rs(ip, rs, fsblock, TRACE_RS_INSERT);
> +	return rs;
> +}
> +
> +/**
> + * unclaimed_blocks - return number of blocks that aren't spoken for
> + */
> +static u32 unclaimed_blocks(struct gfs2_rgrpd *rgd)
> +{
> +	return rgd->rd_free_clone - rgd->rd_reserved;
> +}
> +
> +/**
> + * rg_mblk_search - find a group of multiple free blocks
> + * @rgd: the resource group descriptor
> + * @rs: the block reservation
> + * @ip: pointer to the inode for which we're reserving blocks
> + *
> + * This is very similar to rgblk_search, except we're looking for whole
> + * 64-bit words that represent a chunk of 32 free blocks. I'm only focusing
> + * on aligned dwords for speed's sake.
> + *
> + * Returns: 0 if successful or BFITNOENT if there isn't enough free space
> + */
> +
> +static int rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
> +{
> +	struct gfs2_bitmap *bi = rgd->rd_bits;
> +	const u32 length = rgd->rd_length;
> +	u32 blk;
> +	unsigned int buf, x, search_bytes;
> +	u8 *buffer = NULL;
> +	u8 *ptr, *end, *nonzero;
> +	u32 goal, rsv_bytes;
> +	struct gfs2_blkreserv *rs;
> +	u32 best_rs_bytes, unclaimed;
> +	int best_rs_blocks;
> +
> +	/* Find bitmap block that contains bits for goal block */
> +	if (rgd->rd_next_rsrv) {
> +		goal = rgd->rd_next_rsrv - rgd->rd_data0;
> +		for (buf = 0; buf < length; buf++) {
> +			bi = rgd->rd_bits + buf;
> +			/* Convert scope of "goal" from rgrp-wide to within
> +			   found bit block */
> +			if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) {
> +				goal -= bi->bi_start * GFS2_NBBY;
> +				goto do_search;
> +			}
> +		}
> +	}
> +	buf = 0;
> +	goal = 0;
> +
I'm not sure I understand where this goal block has come from... the
goal block should be ip->i_goal in the normal case.
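
I'd expect the same pattern we already use at allocation time, i.e.
something along these lines:

	if (rgrp_contains_block(rgd, ip->i_goal))
		goal = ip->i_goal - rgd->rd_data0;
	else
		goal = rgd->rd_last_alloc;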

> +do_search:
> +	/* We multiply the size hint by the number of active reservations.
> +	   That makes heavily contended resource groups grab bigger
> +	   reservations so they don't bump into one another. */
This doesn't tell you whether the rgrp is contended or not. It is
possible to have multiple reservations in an rgrp which are all being
used locally by inodes that are filling up very slowly, with no
contention on the rgrp at all.

> +	best_rs_blocks = max(atomic_read(&ip->i_res->rs_sizehint),
> +			     (int)(RGRP_RSRV_MINBLKS * rgd->rd_length));
> +	best_rs_bytes = (best_rs_blocks *
> +			 (1 + (RSRV_CONTENTION_FACTOR * rgd->rd_rs_cnt))) /
> +		GFS2_NBBY; /* 1 + is for our not-yet-created reservation */
> +	best_rs_bytes = ALIGN(best_rs_bytes, sizeof(u64));
> +	unclaimed = unclaimed_blocks(rgd);
> +	if (best_rs_bytes * GFS2_NBBY > unclaimed)
> +		best_rs_bytes = unclaimed >> GFS2_BIT_SIZE;
> +
> +	for (x = 0; x <= length; x++) {
> +		bi = rgd->rd_bits + buf;
> +
> +		if (test_bit(GBF_FULL, &bi->bi_flags))
> +			goto skip;
> +
> +		WARN_ON(!buffer_uptodate(bi->bi_bh));
> +		if (bi->bi_clone)
> +			buffer = bi->bi_clone + bi->bi_offset;
> +		else
> +			buffer = bi->bi_bh->b_data + bi->bi_offset;
> +
> +		/* We have to keep the reservations aligned on u64 boundaries
> +		   otherwise we could get situations where a byte can't be
> +		   used because it's after a reservation, but a free bit still
> +		   is within the reservation's area. */
I'm not sure I understand. If there are free blocks after an existing
inode, then the reservation must start at that point and continue until
it runs into something preventing it from extending further. Finding
aligned blocks is ok, but only if we've already exhausted the
possibility of adding on to an existing extent first.

> +		ptr = buffer + ALIGN(goal >> GFS2_BIT_SIZE, sizeof(u64));
> +		end = (buffer + bi->bi_len);
> +		while (ptr < end) {
> +			rsv_bytes = 0;
> +			if ((ptr + best_rs_bytes) <= end)
> +				search_bytes = best_rs_bytes;
> +			else
> +				search_bytes = end - ptr;
> +			BUG_ON(!search_bytes);
> +			nonzero = memchr_inv(ptr, 0, search_bytes);
> +			/* If the lot is all zeroes, reserve the whole size. If
> +			   there's enough zeroes to satisfy the request, use
> +			   what we can. If there's not enough, keep looking. */
Why not use gfs2_bitfit to search for the first free block and then see
how long the extent is? That seems to make more sense than writing
something new from scratch, bearing in mind that we've already spent a
lot of time optimising it.
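
Roughly what I have in mind -- untested, and ignoring the wrap-around
handling for the moment:

	blk = gfs2_bitfit(buffer, bi->bi_len, goal, GFS2_BLKST_FREE);
	if (blk != BFITNOENT) {
		u32 extlen = 1;

		/* measure how far the free run extends */
		while (blk + extlen < bi->bi_len * GFS2_NBBY &&
		       gfs2_testbit(rgd, buffer, bi->bi_len,
				    blk + extlen) == GFS2_BLKST_FREE)
			extlen++;
		if (extlen >= ip->i_res->rs_requested)
			rs = rs_insert(bi, ip, blk, extlen);
	}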

> +			if (nonzero == NULL)
> +				rsv_bytes = search_bytes;
> +			else if ((nonzero - ptr) * GFS2_NBBY >=
> +				 ip->i_res->rs_requested)
> +				rsv_bytes = (nonzero - ptr);
> +
> +			if (rsv_bytes) {
> +				blk = ((ptr - buffer) * GFS2_NBBY);
> +				BUG_ON(blk >= bi->bi_len * GFS2_NBBY);
> +				rs = rs_insert(bi, ip, blk,
> +					       rsv_bytes * GFS2_NBBY);
> +				if (IS_ERR(rs))
> +					return PTR_ERR(rs);
> +				if (rs)
> +					return 0;
> +			}
> +			ptr += ALIGN(search_bytes, sizeof(u64));
> +		}
> +skip:
> +		/* Try next bitmap block (wrap back to rgrp header
> +		   if at end) */
> +		buf++;
> +		buf %= length;
> +		goal = 0;
> +	}
> +
> +	return BFITNOENT;
> +}
> +
> +/**
>   * try_rgrp_fit - See if a given reservation will fit in a given RG
>   * @rgd: the RG data
>   * @ip: the inode
>   *
>   * If there's room for the requested blocks to be allocated from the RG:
> + * This will try to get a multi-block reservation first, and if that doesn't
> + * fit, it will take what it can.
>   *
>   * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
>   */
>  
> -static int try_rgrp_fit(const struct gfs2_rgrpd *rgd, const struct gfs2_inode *ip)
> +static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
>  {
>  	const struct gfs2_blkreserv *rs = ip->i_res;
>  
>  	if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR))
>  		return 0;
> -	if (rgd->rd_free_clone >= rs->rs_requested)
> +	/* Look for a multi-block reservation. We only do this for files,
> +	   not directories or other types. The problem with directories is
> +	   that items are often created and deleted in the directory in the
> +	   same breath, which can create "holes". In other words, we could get
> +	   into situations where two or more blocks are reserved, then used,
> +	   then one or more of the earlier blocks is freed. When we delete the
> +	   reservation, the rs_free will be off due to the hole, so the rgrp's
> +	   rg_free count can get off. Other non-files, like symlinks and
> +	   char or block devices, are very small in size, so it doesn't make
> +	   sense to reserve multiple blocks for them. */
> +	if (S_ISREG(ip->i_inode.i_mode) &&
> +	    unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS) {
> +		if (rg_mblk_search(rgd, ip) != BFITNOENT)
> +			return 1;
> +	}
> +	if (unclaimed_blocks(rgd) >= rs->rs_requested)
>  		return 1;
> -	return 0;
> -}
>  
> -static inline u32 gfs2_bi2rgd_blk(struct gfs2_bitmap *bi, u32 blk)
> -{
> -	return (bi->bi_start * GFS2_NBBY) + blk;
> +	return 0;
>  }
>  
>  /**
> @@ -1217,7 +1565,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
>  int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
>  {
>  	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
> -	struct gfs2_rgrpd *rgd, *begin = NULL;
> +	struct gfs2_rgrpd *begin = NULL;
>  	struct gfs2_blkreserv *rs = ip->i_res;
>  	int error = 0, rg_locked, flags = LM_FLAG_TRY;
>  	u64 last_unlinked = NO_BLOCK;
> @@ -1225,32 +1573,40 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
>  
>  	if (sdp->sd_args.ar_rgrplvb)
>  		flags |= GL_SKIP;
> -	rs = ip->i_res;
>  	rs->rs_requested = requested;
>  	if (gfs2_assert_warn(sdp, requested)) {
>  		error = -EINVAL;
>  		goto out;
>  	}
> -
> -	if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal))
> -		rgd = begin = ip->i_rgd;
> -	else
> -		rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
> -
> -	if (rgd == NULL)
> +	if (gfs2_rs_active(ip)) {
> +		begin = rs->rs_rgd;
> +		flags = 0; /* Yoda: Do or do not. There is no try */
> +	} else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) {
> +		rs->rs_rgd = begin = ip->i_rgd;
> +	} else {
> +		rs->rs_rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
> +	}
> +	if (rs->rs_rgd == NULL)
>  		return -EBADSLT;
>  
>  	while (loops < 3) {
>  		rg_locked = 0;
>  
> -		if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
> +		if (gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl)) {
>  			rg_locked = 1;
>  			error = 0;
> +		} else if (!loops && !gfs2_rs_active(ip) &&
> +			   rs->rs_rgd->rd_rs_cnt > RGRP_RSRV_MAX_CONTENDERS) {
> +			/* If the rgrp already is maxed out for contenders,
> +			   we can eliminate it as a "first pass" without even
> +			   requesting the rgrp glock. */
> +			error = GLR_TRYFAILED;
>  		} else {
> -			error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
> -						   flags, &rs->rs_rgd_gh);
> +			error = gfs2_glock_nq_init(rs->rs_rgd->rd_gl,
> +						   LM_ST_EXCLUSIVE, flags,
> +						   &rs->rs_rgd_gh);
>  			if (!error && sdp->sd_args.ar_rgrplvb) {
> -				error = update_rgrp_lvb(rgd);
> +				error = update_rgrp_lvb(rs->rs_rgd);
>  				if (error) {
>  					gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
>  					return error;
> @@ -1259,24 +1615,36 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
>  		}
>  		switch (error) {
>  		case 0:
> -			if (try_rgrp_fit(rgd, ip)) {
> +			if (gfs2_rs_active(ip)) {
> +				if (unclaimed_blocks(rs->rs_rgd) +
> +				    rs->rs_free >= rs->rs_requested) {
> +					ip->i_rgd = rs->rs_rgd;
> +					return 0;
> +				}
> +				/* We have a multi-block reservation, but the
> +				   rgrp doesn't have enough free blocks to
> +				   satisfy the request. Free the reservation
> +				   and look for a suitable rgrp. */
> +				gfs2_rs_treedel(ip, true);
> +			}
> +			if (try_rgrp_fit(rs->rs_rgd, ip)) {
>  				if (sdp->sd_args.ar_rgrplvb)
> -					gfs2_rgrp_bh_get(rgd);
> -				ip->i_rgd = rgd;
> +					gfs2_rgrp_bh_get(rs->rs_rgd);
> +				ip->i_rgd = rs->rs_rgd;
>  				return 0;
>  			}
> -			if (rgd->rd_flags & GFS2_RDF_CHECK) {
> +			if (rs->rs_rgd->rd_flags & GFS2_RDF_CHECK) {
>  				if (sdp->sd_args.ar_rgrplvb)
> -					gfs2_rgrp_bh_get(rgd);
> -				try_rgrp_unlink(rgd, &last_unlinked,
> +					gfs2_rgrp_bh_get(rs->rs_rgd);
> +				try_rgrp_unlink(rs->rs_rgd, &last_unlinked,
>  						ip->i_no_addr);
>  			}
>  			if (!rg_locked)
>  				gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
>  			/* fall through */
>  		case GLR_TRYFAILED:
> -			rgd = gfs2_rgrpd_get_next(rgd);
> -			if (rgd != begin) /* If we didn't wrap */
> +			rs->rs_rgd = gfs2_rgrpd_get_next(rs->rs_rgd);
> +			if (rs->rs_rgd != begin) /* If we didn't wrap */
>  				break;
>  
>  			flags &= ~LM_FLAG_TRY;
> @@ -1314,6 +1682,12 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
>  {
>  	struct gfs2_blkreserv *rs = ip->i_res;
>  
> +	if (!rs)
> +		return;
> +
> +	if (gfs2_rs_active(ip) && !rs->rs_free)
> +		gfs2_rs_treedel(ip, true);
> +
>  	if (rs->rs_rgd_gh.gh_gl)
>  		gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
>  	rs->rs_requested = 0;
> @@ -1412,7 +1786,27 @@ do_search:
>  		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
>  			buffer = bi->bi_clone + bi->bi_offset;
>  
> -		biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state);
> +		while (1) {
> +			struct gfs2_blkreserv *rs;
> +			u32 rgblk;
> +
> +			biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state);
> +			if (biblk == BFITNOENT)
> +				break;
> +			/* Check if this block is reserved() */
> +			rgblk = gfs2_bi2rgd_blk(bi, biblk);
> +			rs = rs_find(rgd, rgblk);
> +			if (rs == NULL)
> +				break;
> +
> +			BUG_ON(rs->rs_bi != bi);
> +			biblk = BFITNOENT;
> +			/* This should jump to the first block after the
> +			   reservation. */
> +			goal = rs->rs_biblk + rs->rs_free;
> +			if (goal >= bi->bi_len * GFS2_NBBY)
> +				break;
> +		}
>  		if (biblk != BFITNOENT)
>  			break;
>  
> @@ -1448,8 +1842,9 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
>  			     u32 blk, bool dinode, unsigned int *n)
>  {
>  	const unsigned int elen = *n;
> -	u32 goal;
> +	u32 goal, rgblk;
>  	const u8 *buffer = NULL;
> +	struct gfs2_blkreserv *rs;
>  
>  	*n = 0;
>  	buffer = bi->bi_bh->b_data + bi->bi_offset;
> @@ -1462,6 +1857,10 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
>  		goal++;
>  		if (goal >= (bi->bi_len * GFS2_NBBY))
>  			break;
> +		rgblk = gfs2_bi2rgd_blk(bi, goal);
> +		rs = rs_find(rgd, rgblk);
> +		if (rs) /* Oops, we bumped into someone's reservation */
> +			break;
>  		if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
>  		    GFS2_BLKST_FREE)
>  			break;
> @@ -1538,11 +1937,20 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
>  int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
>  {
>  	const struct gfs2_rgrpd *rgd = gl->gl_object;
> +	struct gfs2_blkreserv *trs;
> +	const struct rb_node *n;
> +
>  	if (rgd == NULL)
>  		return 0;
> -	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
> +	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u\n",
>  		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
> -		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
> +		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
> +		       rgd->rd_reserved);
> +	gfs2_print_dbg(seq, "All reservations:\n");
This doesn't match the format for this file
> +	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
> +		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
> +		dump_rs(seq, trs);
Neither does dump_rs
> +	}
>  	return 0;
>  }
>  
> @@ -1557,10 +1965,74 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
>  }
>  
>  /**
> + * claim_reserved_blks - Claim previously reserved blocks
> + * @ip: the inode that's claiming the reservation
> + * @dinode: 1 if this block is a dinode block, otherwise data block
> + * @nblocks: desired extent length
> + *
> + * Lay claim to previously allocated block reservation blocks.
> + * Returns: Starting block number of the blocks claimed.
> + * Sets *nblocks to the actual extent length allocated.
> + */
> +static u64 claim_reserved_blks(struct gfs2_inode *ip, bool dinode,
> +			       unsigned int *nblocks)
> +{
> +	struct gfs2_blkreserv *rs = ip->i_res;
> +	struct gfs2_rgrpd *rgd = rs->rs_rgd;
> +	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
> +	struct gfs2_bitmap *bi;
> +	u32 biblk;
> +	u64 start_block = 0, fsblock;
> +	const unsigned int elen = *nblocks;
> +
> +	/*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/
> +	gfs2_assert_withdraw(sdp, rgd);
> +	/*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/
> +	bi = rs->rs_bi;
> +	gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
> +
> +	for (*nblocks = 0; *nblocks < elen && rs->rs_free; (*nblocks)++) {
> +		biblk = rs->rs_biblk;
> +
> +		fsblock = gfs2_bi2rgd_blk(bi, biblk) + rgd->rd_data0;
> +		/* Check if the next free block is not contiguous with
> +		   the others in the extent. */
> +		if (start_block && fsblock != start_block + (*nblocks))
> +			break;
> +
> +		rs->rs_biblk++;
> +		rs->rs_free--;
> +		/* Make sure the bitmap hasn't changed */
> +		gfs2_setbit(rgd, bi->bi_clone, bi, biblk, GFS2_BLKST_USED);
> +
> +		BUG_ON(!rgd->rd_reserved);
> +		rgd->rd_reserved--;
> +		if (!start_block) {
> +			start_block = fsblock;
> +			dinode = false;
> +		}
> +		trace_gfs2_rs(ip, rs, fsblock, TRACE_RS_CLAIM);
> +	}
> +
> +	if (!rs->rs_free) {
> +		struct gfs2_rgrpd *rgd = ip->i_res->rs_rgd;
> +
> +		gfs2_rs_treedel(ip, true);
> +		/* -nblocks because we haven't returned to do the math yet.
> +		   I'm doing the math backwards to prevent negative numbers,
> +		   but think of it as:
> +		   if (unclaimed_blocks(rgd) - *nblocks >= RGRP_RSRV_MINBLKS */
> +		if (unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS + *nblocks)
> +			rg_mblk_search(rgd, ip);
> +	}
> +	return start_block;
> +}
> +
> +/**
>   * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
>   * @ip: the inode to allocate the block for
>   * @bn: Used to return the starting block number
> - * @ndata: requested number of blocks/extent length (value/result)
> + * @nblocks: requested number of blocks/extent length (value/result)
>   * @dinode: 1 if we're allocating a dinode block, else 0
>   * @generation: the generation number of the inode
>   *
> @@ -1585,20 +2057,34 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
>  	if (ip->i_res->rs_requested == 0)
>  		return -ECANCELED;
>  
> -	rgd = ip->i_rgd;
> -
> -	if (!dinode && rgrp_contains_block(rgd, ip->i_goal))
> -		goal = ip->i_goal - rgd->rd_data0;
> -	else
> -		goal = rgd->rd_last_alloc;
> -
> -	blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi);
> +	/* Check if we have a multi-block reservation, and if so, claim the
> +	   next free block from it. */
> +	if (gfs2_rs_active(ip)) {
> +		BUG_ON(!ip->i_res->rs_free);
> +		rgd = ip->i_res->rs_rgd;
> +		block = claim_reserved_blks(ip, dinode, nblocks);
> +	} else {
> +		rgd = ip->i_rgd;
>  
> -	/* Since all blocks are reserved in advance, this shouldn't happen */
> -	if (blk == BFITNOENT)
> -		goto rgrp_error;
> +		if (!dinode && rgrp_contains_block(rgd, ip->i_goal))
> +			goal = ip->i_goal - rgd->rd_data0;
> +		else
> +			goal = rgd->rd_last_alloc;
> +
> +		blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi);
> +
> +		/* Since all blocks are reserved in advance, this shouldn't
> +		   happen */
> +		if (blk == BFITNOENT) {
> +			printk(KERN_WARNING "BFITNOENT, nblocks=%u\n",
> +			       *nblocks);
> +			printk(KERN_WARNING "FULL=%d\n",
> +			       test_bit(GBF_FULL, &rgd->rd_bits->bi_flags));
> +			goto rgrp_error;
> +		}
>  
> -	block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks);
> +		block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks);
> +	}
>  	ndata = *nblocks;
>  	if (dinode)
>  		ndata--;
> @@ -1615,8 +2101,10 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
>  			brelse(dibh);
>  		}
>  	}
> -	if (rgd->rd_free < *nblocks)
> +	if (rgd->rd_free < *nblocks) {
> +		printk(KERN_WARNING "nblocks=%u\n", *nblocks);
>  		goto rgrp_error;
> +	}
>  
>  	rgd->rd_free -= *nblocks;
>  	if (dinode) {
> @@ -1672,6 +2160,9 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta)
>  		return;
>  	trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
>  	rgd->rd_free += blen;
> +	if (bstart < rgd->rd_next_rsrv)
> +		rgd->rd_next_rsrv = bstart;
> +
>  	rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
>  	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
>  	gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
> diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
> index 5d8314d..5dd06d9 100644
> --- a/fs/gfs2/rgrp.h
> +++ b/fs/gfs2/rgrp.h
> @@ -13,6 +13,14 @@
>  #include <linux/slab.h>
>  #include <linux/uaccess.h>
>  
> +/* Since each block in the file system is represented by two bits in the
> + * bitmap, one 64-bit word in the bitmap will represent 32 blocks.
> + * By reserving 32 blocks at a time, we can optimize / shortcut how we search
> + * through the bitmaps by looking a word at a time.
> + */
> +#define RGRP_RSRV_MINBYTES 8
> +#define RGRP_RSRV_MINBLKS ((u32)(RGRP_RSRV_MINBYTES * GFS2_NBBY))
> +
>  struct gfs2_rgrpd;
>  struct gfs2_sbd;
>  struct gfs2_holder;
> @@ -29,6 +37,8 @@ extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
>  extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh);
>  extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
>  
> +extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
> +
>  extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested);
>  extern void gfs2_inplace_release(struct gfs2_inode *ip);
>  
> @@ -36,6 +46,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
>  			     bool dinode, u64 *generation);
>  
>  extern int gfs2_rs_alloc(struct gfs2_inode *ip);
> +extern void gfs2_rs_treedel(struct gfs2_inode *ip, bool lock);
>  extern void gfs2_rs_delete(struct gfs2_inode *ip);
>  extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta);
>  extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
> @@ -62,7 +73,7 @@ extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
>  				   const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
>  extern int gfs2_fitrim(struct file *filp, void __user *argp);
>  
> -/* This is how to tell if a reservation is "inplace" reserved: */
> +/* This is how to tell if a multi-block reservation is "inplace" reserved: */
>  static inline int gfs2_mb_reserved(struct gfs2_inode *ip)
>  {
>  	if (ip->i_res && ip->i_res->rs_requested)
> @@ -70,4 +81,12 @@ static inline int gfs2_mb_reserved(struct gfs2_inode *ip)
>  	return 0;
>  }
>  
> +/* This is how to tell if a multi-block reservation is in the rgrp tree: */
> +static inline int gfs2_rs_active(struct gfs2_inode *ip)
> +{
> +	if (ip->i_res && ip->i_res->rs_start)
> +		return 1;
> +	return 0;
> +}
> +
>  #endif /* __RGRP_DOT_H__ */
> diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
> index 7880687..5651463 100644
> --- a/fs/gfs2/super.c
> +++ b/fs/gfs2/super.c
> @@ -1420,6 +1420,10 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
>  		return -EIO;
>  	}
>  
> +	error = gfs2_rindex_update(sdp);
> +	if (error)
> +		return error;
> +
>  	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
>  	if (error)
>  		return error;
> @@ -1550,6 +1554,9 @@ out_truncate:
>  
>  out_unlock:
>  	/* Error path for case 1 */
> +	if (gfs2_rs_active(ip))
> +		gfs2_rs_treedel(ip, true);
> +
>  	if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
>  		gfs2_glock_dq(&ip->i_iopen_gh);
>  	gfs2_holder_uninit(&ip->i_iopen_gh);
> diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
> index 1b8b815..46d4a1a 100644
> --- a/fs/gfs2/trace_gfs2.h
> +++ b/fs/gfs2/trace_gfs2.h
> @@ -31,6 +31,17 @@
>  			    { GFS2_BLKST_DINODE, "dinode" },	\
>  			    { GFS2_BLKST_UNLINKED, "unlinked" })
>  
> +#define TRACE_RS_DELETE  0
> +#define TRACE_RS_TREEDEL 1
> +#define TRACE_RS_INSERT  2
> +#define TRACE_RS_CLAIM   3
> +
> +#define rs_func_name(x) __print_symbolic(x,	\
> +					 { 0, "del " },	\
> +					 { 1, "tdel" },	\
> +					 { 2, "ins " },	\
> +					 { 3, "clm " })
> +
>  #define show_glock_flags(flags) __print_flags(flags, "",	\
>  	{(1UL << GLF_LOCK),			"l" },		\
>  	{(1UL << GLF_DEMOTE),			"D" },		\
> @@ -470,6 +481,7 @@ TRACE_EVENT(gfs2_block_alloc,
>  		__field(	u8,	block_state		)
>  		__field(        u64,	rd_addr			)
>  		__field(        u32,	rd_free_clone		)
> +		__field(	u32,	rd_reserved		)
>  	),
>  
>  	TP_fast_assign(
> @@ -480,16 +492,64 @@ TRACE_EVENT(gfs2_block_alloc,
>  		__entry->block_state	= block_state;
>  		__entry->rd_addr	= rgd->rd_addr;
>  		__entry->rd_free_clone	= rgd->rd_free_clone;
> +		__entry->rd_reserved	= rgd->rd_reserved;
>  	),
>  
> -	TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u",
> +	TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u rr%lu",
Missing : after rr

>  		  MAJOR(__entry->dev), MINOR(__entry->dev),
>  		  (unsigned long long)__entry->inum,
>  		  (unsigned long long)__entry->start,
>  		  (unsigned long)__entry->len,
>  		  block_state_name(__entry->block_state),
>  		  (unsigned long long)__entry->rd_addr,
> -		  __entry->rd_free_clone)
> +		  __entry->rd_free_clone, (unsigned long)__entry->rd_reserved)
> +);
> +
> +/* Keep track of multi-block reservations as they are allocated/freed */
> +TRACE_EVENT(gfs2_rs,
> +
> +	TP_PROTO(const struct gfs2_inode *ip, const struct gfs2_blkreserv *rs,
> +		 u64 claimed, u8 func),
> +
> +	TP_ARGS(ip, rs, claimed, func),
> +
> +	TP_STRUCT__entry(
> +		__field(        dev_t,  dev                     )
> +		__field(	u64,	rd_addr			)
> +		__field(	u32,	rd_free_clone		)
> +		__field(	u32,	rd_reserved		)
> +		__field(	u64,	ip_no_addr		)
> +		__field(	u64,	rs_start		)
> +		__field(	u32,	rs_blks			)
> +		__field(	u64,	claimed			)
> +		__field(	u32,	free			)
> +		__field(	u8,	func			)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->dev		= ip->i_gl->gl_sbd->sd_vfs->s_dev;
> +		__entry->rd_addr	= rs->rs_rgd ? rs->rs_rgd->rd_addr : 0;
> +		__entry->rd_free_clone	= rs->rs_rgd ? rs->rs_rgd->rd_free_clone : 0;
> +		__entry->rd_reserved	= rs->rs_rgd ? rs->rs_rgd->rd_reserved : 0;
> +		__entry->ip_no_addr	= ip->i_no_addr;
> +		__entry->rs_start	= rs->rs_start;
> +		__entry->rs_blks	= rs->rs_blks;
> +		__entry->claimed	= claimed;
> +		__entry->free		= rs->rs_free;
> +		__entry->func		= func;
> +	),
> +
> +	TP_printk("%u,%u rg:%llu rf:%lu rr:%lu bmap %llu rs:%llu blks:%lu %s "
> +		  "c:%llu f:%lu",
This format is incorrect for a bmap tracepoint - it needs to match the
common fields for the other bmap tracepoints.
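
I'd expect it to begin the same way gfs2_block_alloc does, something
along these lines (a rough sketch only -- the "resv" label is just
illustrative):

	TP_printk("%u,%u bmap %llu resv %llu/%lu %s rg:%llu rf:%u rr:%u",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  (unsigned long long)__entry->ip_no_addr,
		  (unsigned long long)__entry->rs_start,
		  (unsigned long)__entry->rs_blks,
		  rs_func_name(__entry->func),
		  (unsigned long long)__entry->rd_addr,
		  __entry->rd_free_clone,
		  __entry->rd_reserved)

so that the device, inode and block range fields line up with the other
bmap tracepoints.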

> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  (unsigned long long)__entry->rd_addr,
> +		  (unsigned long)__entry->rd_free_clone,
> +		  (unsigned long)__entry->rd_reserved,
> +		  (unsigned long long)__entry->ip_no_addr,
> +		  (unsigned long long)__entry->rs_start,
> +		  (unsigned long)__entry->rs_blks,
> +		  rs_func_name(__entry->func),
> +		  __entry->claimed, (unsigned long)__entry->free)
>  );
>  
>  #endif /* _TRACE_GFS2_H */
> diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
> index 523c0de..27a0b4a 100644
> --- a/fs/gfs2/xattr.c
> +++ b/fs/gfs2/xattr.c
> @@ -327,6 +327,10 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
>  {
>  	int error;
>  
> +	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
> +	if (error)
> +		return error;
> +
>  	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
>  	if (error)
>  		goto out_alloc;
> @@ -710,6 +714,10 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
>  	struct buffer_head *dibh;
>  	int error;
>  
> +	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
> +	if (error)
> +		return error;
> +
>  	error = gfs2_quota_lock_check(ip);
>  	if (error)
>  		return error;
> @@ -1483,6 +1491,10 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
>  {
>  	int error;
>  
> +	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
> +	if (error)
> +		return error;
> +
>  	error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
>  	if (error)
>  		return error;
> 




