[Cluster-devel] [PATCH] GFS2: fallocate support
Steven Whitehouse
swhiteho at redhat.com
Mon Aug 23 15:29:39 UTC 2010
Hi,
Now in the -nmw git tree. Thanks,
Steve.
On Fri, 2010-08-20 at 00:21 -0500, Benjamin Marzinski wrote:
> This patch adds support for fallocate to gfs2. Since gfs2 does not support
> uninitialized data blocks, it must write out zeros to all the blocks. However,
> since it does not need to lock any pages to read from, gfs2 can write out the
> zero blocks much more efficiently. On a moderately full filesystem, fallocate
> works around 5 times faster on average. The fallocate call also allows gfs2 to
> add blocks to the file without changing the filesize, which will make it
> possible for gfs2 to preallocate space for the rindex file, so that gfs2 can
> grow a completely full filesystem.
>
> Signed-off-by: Benjamin Marzinski <bmarzins at redhat.com>
> ---
> fs/gfs2/aops.c | 4
> fs/gfs2/incore.h | 1
> fs/gfs2/inode.h | 2
> fs/gfs2/ops_inode.c | 254 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> fs/gfs2/rgrp.c | 12 ++
> fs/gfs2/trans.h | 1
> 6 files changed, 272 insertions(+), 2 deletions(-)
>
> Index: gfs2-2.6-nmw/fs/gfs2/aops.c
> ===================================================================
> --- gfs2-2.6-nmw.orig/fs/gfs2/aops.c
> +++ gfs2-2.6-nmw/fs/gfs2/aops.c
> @@ -36,8 +36,8 @@
> #include "glops.h"
>
>
> -static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
> - unsigned int from, unsigned int to)
> +void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
> + unsigned int from, unsigned int to)
> {
> struct buffer_head *head = page_buffers(page);
> unsigned int bsize = head->b_size;
> Index: gfs2-2.6-nmw/fs/gfs2/inode.h
> ===================================================================
> --- gfs2-2.6-nmw.orig/fs/gfs2/inode.h
> +++ gfs2-2.6-nmw/fs/gfs2/inode.h
> @@ -19,6 +19,8 @@ extern int gfs2_releasepage(struct page
> extern int gfs2_internal_read(struct gfs2_inode *ip,
> struct file_ra_state *ra_state,
> char *buf, loff_t *pos, unsigned size);
> +extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
> + unsigned int from, unsigned int to);
> extern void gfs2_set_aops(struct inode *inode);
>
> static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
> Index: gfs2-2.6-nmw/fs/gfs2/ops_inode.c
> ===================================================================
> --- gfs2-2.6-nmw.orig/fs/gfs2/ops_inode.c
> +++ gfs2-2.6-nmw/fs/gfs2/ops_inode.c
> @@ -18,6 +18,8 @@
> #include <linux/gfs2_ondisk.h>
> #include <linux/crc32.h>
> #include <linux/fiemap.h>
> +#include <linux/swap.h>
> +#include <linux/falloc.h>
> #include <asm/uaccess.h>
>
> #include "gfs2.h"
> @@ -1277,6 +1279,257 @@ static int gfs2_removexattr(struct dentr
> return ret;
> }
>
> +static void empty_write_end(struct page *page, unsigned from,
> + unsigned to)
> +{
> + struct gfs2_inode *ip = GFS2_I(page->mapping->host);
> +
> + page_zero_new_buffers(page, from, to);
> + flush_dcache_page(page);
> + mark_page_accessed(page);
> +
> + if (!gfs2_is_writeback(ip))
> + gfs2_page_add_databufs(ip, page, from, to);
> +
> + block_commit_write(page, from, to);
> +}
> +
> +
> +static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
> +{
> + unsigned start, end, next;
> + struct buffer_head *bh, *head;
> + int error;
> +
> + if (!page_has_buffers(page)) {
> + error = block_prepare_write(page, from, to, gfs2_block_map);
> + if (unlikely(error))
> + return error;
> +
> + empty_write_end(page, from, to);
> + return 0;
> + }
> +
> + bh = head = page_buffers(page);
> + next = end = 0;
> + while (next < from) {
> + next += bh->b_size;
> + bh = bh->b_this_page;
> + }
> + start = next;
> + do {
> + next += bh->b_size;
> + if (buffer_mapped(bh)) {
> + if (end) {
> + error = block_prepare_write(page, start, end,
> + gfs2_block_map);
> + if (unlikely(error))
> + return error;
> + empty_write_end(page, start, end);
> + end = 0;
> + }
> + start = next;
> + }
> + else
> + end = next;
> + bh = bh->b_this_page;
> + } while (next < to);
> +
> + if (end) {
> + error = block_prepare_write(page, start, end, gfs2_block_map);
> + if (unlikely(error))
> + return error;
> + empty_write_end(page, start, end);
> + }
> +
> + return 0;
> +}
> +
> +static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
> + int mode)
> +{
> + struct gfs2_inode *ip = GFS2_I(inode);
> + struct buffer_head *dibh;
> + int error;
> + u64 start = offset >> PAGE_CACHE_SHIFT;
> + unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
> + u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
> + pgoff_t curr;
> + struct page *page;
> + unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
> + unsigned int from, to;
> +
> + if (!end_offset)
> + end_offset = PAGE_CACHE_SIZE;
> +
> + error = gfs2_meta_inode_buffer(ip, &dibh);
> + if (unlikely(error))
> + goto out;
> +
> + gfs2_trans_add_bh(ip->i_gl, dibh, 1);
> +
> + if (gfs2_is_stuffed(ip)) {
> + error = gfs2_unstuff_dinode(ip, NULL);
> + if (unlikely(error))
> + goto out;
> + }
> +
> + curr = start;
> + offset = start << PAGE_CACHE_SHIFT;
> + from = start_offset;
> + to = PAGE_CACHE_SIZE;
> + while (curr <= end) {
> + page = grab_cache_page_write_begin(inode->i_mapping, curr,
> + AOP_FLAG_NOFS);
> + if (unlikely(!page)) {
> + error = -ENOMEM;
> + goto out;
> + }
> +
> + if (curr == end)
> + to = end_offset;
> + error = write_empty_blocks(page, from, to);
> + if (!error && offset + to > inode->i_size &&
> + !(mode & FALLOC_FL_KEEP_SIZE)) {
> + i_size_write(inode, offset + to);
> + }
> + unlock_page(page);
> + page_cache_release(page);
> + if (error)
> + goto out;
> + curr++;
> + offset += PAGE_CACHE_SIZE;
> + from = 0;
> + }
> +
> + gfs2_dinode_out(ip, dibh->b_data);
> + mark_inode_dirty(inode);
> +
> + brelse(dibh);
> +
> +out:
> + return error;
> +}
> +
> +static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
> + unsigned int *data_blocks, unsigned int *ind_blocks)
> +{
> + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
> + unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
> + unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
> +
> + for (tmp = max_data; tmp > sdp->sd_diptrs;) {
> + tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
> + max_data -= tmp;
> + }
> + /* This calculation isn't the exact reverse of gfs2_write_calc_reserv,
> + so it might end up with fewer data blocks */
> + if (max_data <= *data_blocks)
> + return;
> + *data_blocks = max_data;
> + *ind_blocks = max_blocks - max_data;
> + *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
> + if (*len > max) {
> + *len = max;
> + gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
> + }
> +}
> +
> +static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset,
> + loff_t len)
> +{
> + struct gfs2_sbd *sdp = GFS2_SB(inode);
> + struct gfs2_inode *ip = GFS2_I(inode);
> + unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
> + loff_t bytes, max_bytes;
> + struct gfs2_alloc *al;
> + int error;
> + loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
> + next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
> +
> + offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
> + sdp->sd_sb.sb_bsize_shift;
> +
> + len = next - offset;
> + bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
> + if (!bytes)
> + bytes = UINT_MAX;
> +
> + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
> + error = gfs2_glock_nq(&ip->i_gh);
> + if (unlikely(error))
> + goto out_uninit;
> +
> + if (!gfs2_write_alloc_required(ip, offset, len))
> + goto out_unlock;
> +
> + while (len > 0) {
> + if (len < bytes)
> + bytes = len;
> + al = gfs2_alloc_get(ip);
> + if (!al) {
> + error = -ENOMEM;
> + goto out_unlock;
> + }
> +
> + error = gfs2_quota_lock_check(ip);
> + if (error)
> + goto out_alloc_put;
> +
> +retry:
> + gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
> +
> + al->al_requested = data_blocks + ind_blocks;
> + error = gfs2_inplace_reserve(ip);
> + if (error) {
> + if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
> + bytes >>= 1;
> + goto retry;
> + }
> + goto out_qunlock;
> + }
> + max_bytes = bytes;
> + calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
> + al->al_requested = data_blocks + ind_blocks;
> +
> + rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
> + RES_RG_HDR + ip->i_alloc->al_rgd->rd_length;
> + if (gfs2_is_jdata(ip))
> + rblocks += data_blocks ? data_blocks : 1;
> +
> + error = gfs2_trans_begin(sdp, rblocks,
> + PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
> + if (error)
> + goto out_trans_fail;
> +
> + error = fallocate_chunk(inode, offset, max_bytes, mode);
> + gfs2_trans_end(sdp);
> +
> + if (error)
> + goto out_trans_fail;
> +
> + len -= max_bytes;
> + offset += max_bytes;
> + gfs2_inplace_release(ip);
> + gfs2_quota_unlock(ip);
> + gfs2_alloc_put(ip);
> + }
> + goto out_unlock;
> +
> +out_trans_fail:
> + gfs2_inplace_release(ip);
> +out_qunlock:
> + gfs2_quota_unlock(ip);
> +out_alloc_put:
> + gfs2_alloc_put(ip);
> +out_unlock:
> + gfs2_glock_dq(&ip->i_gh);
> +out_uninit:
> + gfs2_holder_uninit(&ip->i_gh);
> + return error;
> +}
> +
> +
> static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
> u64 start, u64 len)
> {
> @@ -1327,6 +1580,7 @@ const struct inode_operations gfs2_file_
> .getxattr = gfs2_getxattr,
> .listxattr = gfs2_listxattr,
> .removexattr = gfs2_removexattr,
> + .fallocate = gfs2_fallocate,
> .fiemap = gfs2_fiemap,
> };
>
> Index: gfs2-2.6-nmw/fs/gfs2/incore.h
> ===================================================================
> --- gfs2-2.6-nmw.orig/fs/gfs2/incore.h
> +++ gfs2-2.6-nmw/fs/gfs2/incore.h
> @@ -571,6 +571,7 @@ struct gfs2_sbd {
> struct list_head sd_rindex_mru_list;
> struct gfs2_rgrpd *sd_rindex_forward;
> unsigned int sd_rgrps;
> + unsigned int sd_max_rg_data;
>
> /* Journal index stuff */
>
> Index: gfs2-2.6-nmw/fs/gfs2/rgrp.c
> ===================================================================
> --- gfs2-2.6-nmw.orig/fs/gfs2/rgrp.c
> +++ gfs2-2.6-nmw/fs/gfs2/rgrp.c
> @@ -589,6 +589,8 @@ static int gfs2_ri_update(struct gfs2_in
> struct inode *inode = &ip->i_inode;
> struct file_ra_state ra_state;
> u64 rgrp_count = i_size_read(inode);
> + struct gfs2_rgrpd *rgd;
> + unsigned int max_data = 0;
> int error;
>
> do_div(rgrp_count, sizeof(struct gfs2_rindex));
> @@ -603,6 +605,10 @@ static int gfs2_ri_update(struct gfs2_in
> }
> }
>
> + list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
> + if (rgd->rd_data > max_data)
> + max_data = rgd->rd_data;
> + sdp->sd_max_rg_data = max_data;
> sdp->sd_rindex_uptodate = 1;
> return 0;
> }
> @@ -622,6 +628,8 @@ static int gfs2_ri_update_special(struct
> struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
> struct inode *inode = &ip->i_inode;
> struct file_ra_state ra_state;
> + struct gfs2_rgrpd *rgd;
> + unsigned int max_data = 0;
> int error;
>
> file_ra_state_init(&ra_state, inode->i_mapping);
> @@ -636,6 +644,10 @@ static int gfs2_ri_update_special(struct
> return error;
> }
> }
> + list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
> + if (rgd->rd_data > max_data)
> + max_data = rgd->rd_data;
> + sdp->sd_max_rg_data = max_data;
>
> sdp->sd_rindex_uptodate = 1;
> return 0;
> Index: gfs2-2.6-nmw/fs/gfs2/trans.h
> ===================================================================
> --- gfs2-2.6-nmw.orig/fs/gfs2/trans.h
> +++ gfs2-2.6-nmw/fs/gfs2/trans.h
> @@ -20,6 +20,7 @@ struct gfs2_glock;
> #define RES_JDATA 1
> #define RES_DATA 1
> #define RES_LEAF 1
> +#define RES_RG_HDR 1
> #define RES_RG_BIT 2
> #define RES_EATTR 1
> #define RES_STATFS 1
>
More information about the Cluster-devel mailing list