[Cluster-devel] [PATCH] GFS2: Various gfs2_logd improvements
Steven Whitehouse
swhiteho at redhat.com
Tue Apr 20 08:55:22 UTC 2010
Hi,
Generally looks good, but I'd like to retain the commit= mount option
for setting the log flush timer.
Steve.
On Mon, 2010-04-19 at 15:46 -0500, Benjamin Marzinski wrote:
> This patch contains various tweaks to how log flushes and active item writeback
> work. gfs2_logd is now managed by a waitqueue, and gfs2_log_reserve now waits
> for gfs2_logd to do the log flushing. Multiple functions were rewritten to
> remove the need to call gfs2_log_lock(). Instead of using one test to see if
> gfs2_logd had work to do, there are now separate tests to check if there
> are too many buffers in the incore log or if there are too many items on the
> active items list.
>
> Signed-off-by: Benjamin Marzinski <bmarzins at redhat.com>
> ---
> fs/gfs2/incore.h | 10 +--
> fs/gfs2/log.c | 149 +++++++++++++++++++++++++++++----------------------
> fs/gfs2/log.h | 1
> fs/gfs2/lops.c | 2
> fs/gfs2/meta_io.c | 1
> fs/gfs2/ops_fstype.c | 15 ++---
> fs/gfs2/super.c | 5 -
> fs/gfs2/sys.c | 4 -
> fs/gfs2/trans.c | 18 ++++++
> 9 files changed, 120 insertions(+), 85 deletions(-)
>
> Index: gfs2-2.6-nmw/fs/gfs2/incore.h
> ===================================================================
> --- gfs2-2.6-nmw.orig/fs/gfs2/incore.h
> +++ gfs2-2.6-nmw/fs/gfs2/incore.h
> @@ -439,9 +439,6 @@ struct gfs2_args {
> struct gfs2_tune {
> spinlock_t gt_spin;
>
> - unsigned int gt_incore_log_blocks;
> - unsigned int gt_log_flush_secs;
> -
> unsigned int gt_logd_secs;
>
> unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
> @@ -618,6 +615,7 @@ struct gfs2_sbd {
> unsigned int sd_log_commited_databuf;
> int sd_log_commited_revoke;
>
> + atomic_t sd_log_pinned;
> unsigned int sd_log_num_buf;
> unsigned int sd_log_num_revoke;
> unsigned int sd_log_num_rg;
> @@ -629,15 +627,17 @@ struct gfs2_sbd {
> struct list_head sd_log_le_databuf;
> struct list_head sd_log_le_ordered;
>
> + atomic_t sd_log_thresh1;
> + atomic_t sd_log_thresh2;
> atomic_t sd_log_blks_free;
> - struct mutex sd_log_reserve_mutex;
> + wait_queue_head_t sd_log_waitq;
> + wait_queue_head_t sd_logd_waitq;
>
> u64 sd_log_sequence;
> unsigned int sd_log_head;
> unsigned int sd_log_tail;
> int sd_log_idle;
>
> - unsigned long sd_log_flush_time;
> struct rw_semaphore sd_log_flush_lock;
> atomic_t sd_log_in_flight;
> wait_queue_head_t sd_log_flush_wait;
> Index: gfs2-2.6-nmw/fs/gfs2/log.c
> ===================================================================
> --- gfs2-2.6-nmw.orig/fs/gfs2/log.c
> +++ gfs2-2.6-nmw/fs/gfs2/log.c
> @@ -168,7 +168,7 @@ static int gfs2_ail1_empty_one(struct gf
> return list_empty(&ai->ai_ail1_list);
> }
>
> -static void gfs2_ail1_start(struct gfs2_sbd *sdp, int flags)
> +static void gfs2_ail1_start(struct gfs2_sbd *sdp)
> {
> struct list_head *head;
> u64 sync_gen;
> @@ -189,14 +189,7 @@ static void gfs2_ail1_start(struct gfs2_
> first_ai->ai_sync_gen = sync_gen;
> gfs2_ail1_start_one(sdp, first_ai); /* This may drop log lock */
>
> - if (flags & DIO_ALL)
> - first = NULL;
> -
> while(!done) {
> - if (first && (head->prev != first ||
> - gfs2_ail1_empty_one(sdp, first_ai, 0)))
> - break;
> -
> done = 1;
> list_for_each_entry_safe_reverse(ai, tmp, head, ai_list) {
> if (ai->ai_sync_gen >= sync_gen)
> @@ -290,58 +283,57 @@ static void ail2_empty(struct gfs2_sbd *
> * flush time, so we ensure that we have just enough free blocks at all
> * times to avoid running out during a log flush.
> *
> + * We no longer flush the log here, instead we wake up logd to do that
> + * for us. To avoid the thundering herd and to ensure that we deal fairly
> + * with queued waiters, we use an exclusive wait. This means that when we
> + * get woken with enough journal space to get our reservation, we need to
> + * wake the next waiter on the list.
> + *
> * Returns: errno
> */
>
> int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
> {
> - unsigned int try = 0;
> unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize);
> + unsigned wanted = blks + reserved_blks;
> + DEFINE_WAIT(wait);
> + int did_wait = 0;
> + unsigned int free_blocks;
>
> if (gfs2_assert_warn(sdp, blks) ||
> gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
> return -EINVAL;
> -
> - mutex_lock(&sdp->sd_log_reserve_mutex);
> - gfs2_log_lock(sdp);
> - while(atomic_read(&sdp->sd_log_blks_free) <= (blks + reserved_blks)) {
> - gfs2_log_unlock(sdp);
> - gfs2_ail1_empty(sdp, 0);
> - gfs2_log_flush(sdp, NULL);
> -
> - if (try++)
> - gfs2_ail1_start(sdp, 0);
> - gfs2_log_lock(sdp);
> - }
> - atomic_sub(blks, &sdp->sd_log_blks_free);
> +retry:
> + free_blocks = atomic_read(&sdp->sd_log_blks_free);
> + if (unlikely(free_blocks <= wanted)) {
> + do {
> + wake_up(&sdp->sd_logd_waitq);
> + prepare_to_wait_exclusive(&sdp->sd_log_waitq, &wait,
> + TASK_UNINTERRUPTIBLE);
> + did_wait = 1;
> + if (atomic_read(&sdp->sd_log_blks_free) <= wanted)
> + io_schedule();
> + free_blocks = atomic_read(&sdp->sd_log_blks_free);
> + } while(free_blocks <= wanted);
> + finish_wait(&sdp->sd_log_waitq, &wait);
> + }
> + if (atomic_cmpxchg(&sdp->sd_log_blks_free, free_blocks,
> + free_blocks - blks) != free_blocks)
> + goto retry;
> trace_gfs2_log_blocks(sdp, -blks);
> - gfs2_log_unlock(sdp);
> - mutex_unlock(&sdp->sd_log_reserve_mutex);
> +
> + /*
> + * If we waited, then so might others, wake them up _after_ we get
> + * our share of the log.
> + */
> + if (unlikely(did_wait))
> + wake_up(&sdp->sd_log_waitq);
>
> down_read(&sdp->sd_log_flush_lock);
>
> return 0;
> }
>
> -/**
> - * gfs2_log_release - Release a given number of log blocks
> - * @sdp: The GFS2 superblock
> - * @blks: The number of blocks
> - *
> - */
> -
> -void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
> -{
> -
> - gfs2_log_lock(sdp);
> - atomic_add(blks, &sdp->sd_log_blks_free);
> - trace_gfs2_log_blocks(sdp, blks);
> - gfs2_assert_withdraw(sdp,
> - atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
> - gfs2_log_unlock(sdp);
> - up_read(&sdp->sd_log_flush_lock);
> -}
> -
> static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
> {
> struct gfs2_journal_extent *je;
> @@ -559,11 +551,10 @@ static void log_pull_tail(struct gfs2_sb
>
> ail2_empty(sdp, new_tail);
>
> - gfs2_log_lock(sdp);
> atomic_add(dist, &sdp->sd_log_blks_free);
> trace_gfs2_log_blocks(sdp, dist);
> - gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks);
> - gfs2_log_unlock(sdp);
> + gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
> + sdp->sd_jdesc->jd_blocks);
>
> sdp->sd_log_tail = new_tail;
> }
> @@ -822,6 +813,13 @@ static void buf_lo_incore_commit(struct
> * @sdp: the filesystem
> * @tr: the transaction
> *
> + * We wake up gfs2_logd if the number of pinned blocks exceed thresh1
> + * or the total number of used blocks (pinned blocks plus AIL blocks)
> + * is greater than thresh2.
> + *
> + * At mount time thresh1 is 1/3rd of journal size, thresh2 is 2/3rd of
> + * journal size.
> + *
> * Returns: errno
> */
>
> @@ -832,10 +830,10 @@ void gfs2_log_commit(struct gfs2_sbd *sd
>
> up_read(&sdp->sd_log_flush_lock);
>
> - gfs2_log_lock(sdp);
> - if (sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks))
> - wake_up_process(sdp->sd_logd_process);
> - gfs2_log_unlock(sdp);
> + if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) ||
> + ((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) >
> + atomic_read(&sdp->sd_log_thresh2)))
> + wake_up(&sdp->sd_logd_waitq);
> }
>
> /**
> @@ -882,13 +880,23 @@ void gfs2_meta_syncfs(struct gfs2_sbd *s
> {
> gfs2_log_flush(sdp, NULL);
> for (;;) {
> - gfs2_ail1_start(sdp, DIO_ALL);
> + gfs2_ail1_start(sdp);
> if (gfs2_ail1_empty(sdp, DIO_ALL))
> break;
> msleep(10);
> }
> }
>
> +static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp)
> +{
> + return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1));
> +}
> +
> +static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp)
> +{
> + unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free);
> + return used_blocks >= atomic_read(&sdp->sd_log_thresh2);
> +}
>
> /**
> * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
> @@ -901,28 +909,43 @@ void gfs2_meta_syncfs(struct gfs2_sbd *s
> int gfs2_logd(void *data)
> {
> struct gfs2_sbd *sdp = data;
> - unsigned long t;
> - int need_flush;
> + unsigned long t = 1;
> + DEFINE_WAIT(wait);
> + unsigned preflush;
>
> while (!kthread_should_stop()) {
> - /* Advance the log tail */
>
> - t = sdp->sd_log_flush_time +
> - gfs2_tune_get(sdp, gt_log_flush_secs) * HZ;
> + preflush = atomic_read(&sdp->sd_log_pinned);
> + if (gfs2_jrnl_flush_reqd(sdp) || t == 0) {
> + gfs2_ail1_empty(sdp, DIO_ALL);
> + gfs2_log_flush(sdp, NULL);
> + gfs2_ail1_empty(sdp, DIO_ALL);
> + }
>
> - gfs2_ail1_empty(sdp, DIO_ALL);
> - gfs2_log_lock(sdp);
> - need_flush = sdp->sd_log_num_buf > gfs2_tune_get(sdp, gt_incore_log_blocks);
> - gfs2_log_unlock(sdp);
> - if (need_flush || time_after_eq(jiffies, t)) {
> + if (gfs2_ail_flush_reqd(sdp)) {
> + gfs2_ail1_start(sdp);
> + io_schedule();
> + gfs2_ail1_empty(sdp, 0);
> gfs2_log_flush(sdp, NULL);
> - sdp->sd_log_flush_time = jiffies;
> + gfs2_ail1_empty(sdp, DIO_ALL);
> }
>
> + wake_up(&sdp->sd_log_waitq);
> t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
> if (freezing(current))
> refrigerator();
> - schedule_timeout_interruptible(t);
> +
> + do {
> + prepare_to_wait(&sdp->sd_logd_waitq, &wait,
> + TASK_UNINTERRUPTIBLE);
> + if (!gfs2_ail_flush_reqd(sdp) &&
> + !gfs2_jrnl_flush_reqd(sdp) &&
> + !kthread_should_stop())
> + t = schedule_timeout(t);
> + } while(t && !gfs2_ail_flush_reqd(sdp) &&
> + !gfs2_jrnl_flush_reqd(sdp) &&
> + !kthread_should_stop());
> + finish_wait(&sdp->sd_logd_waitq, &wait);
> }
>
> return 0;
> Index: gfs2-2.6-nmw/fs/gfs2/log.h
> ===================================================================
> --- gfs2-2.6-nmw.orig/fs/gfs2/log.h
> +++ gfs2-2.6-nmw/fs/gfs2/log.h
> @@ -51,7 +51,6 @@ unsigned int gfs2_struct2blk(struct gfs2
> unsigned int ssize);
>
> int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
> -void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
> void gfs2_log_incr_head(struct gfs2_sbd *sdp);
>
> struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
> Index: gfs2-2.6-nmw/fs/gfs2/lops.c
> ===================================================================
> --- gfs2-2.6-nmw.orig/fs/gfs2/lops.c
> +++ gfs2-2.6-nmw/fs/gfs2/lops.c
> @@ -54,6 +54,7 @@ static void gfs2_pin(struct gfs2_sbd *sd
> if (bd->bd_ail)
> list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
> get_bh(bh);
> + atomic_inc(&sdp->sd_log_pinned);
> trace_gfs2_pin(bd, 1);
> }
>
> @@ -94,6 +95,7 @@ static void gfs2_unpin(struct gfs2_sbd *
> trace_gfs2_pin(bd, 0);
> gfs2_log_unlock(sdp);
> unlock_buffer(bh);
> + atomic_dec(&sdp->sd_log_pinned);
> }
>
>
> Index: gfs2-2.6-nmw/fs/gfs2/meta_io.c
> ===================================================================
> --- gfs2-2.6-nmw.orig/fs/gfs2/meta_io.c
> +++ gfs2-2.6-nmw/fs/gfs2/meta_io.c
> @@ -313,6 +313,7 @@ void gfs2_remove_from_journal(struct buf
> struct gfs2_bufdata *bd = bh->b_private;
>
> if (test_clear_buffer_pinned(bh)) {
> + atomic_dec(&sdp->sd_log_pinned);
> list_del_init(&bd->bd_le.le_list);
> if (meta) {
> gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
> Index: gfs2-2.6-nmw/fs/gfs2/ops_fstype.c
> ===================================================================
> --- gfs2-2.6-nmw.orig/fs/gfs2/ops_fstype.c
> +++ gfs2-2.6-nmw/fs/gfs2/ops_fstype.c
> @@ -57,8 +57,7 @@ static void gfs2_tune_init(struct gfs2_t
> {
> spin_lock_init(&gt->gt_spin);
>
> - gt->gt_incore_log_blocks = 1024;
> - gt->gt_logd_secs = 1;
> + gt->gt_logd_secs = 30;
> gt->gt_quota_simul_sync = 64;
> gt->gt_quota_warn_period = 10;
> gt->gt_quota_scale_num = 1;
> @@ -101,14 +100,15 @@ static struct gfs2_sbd *init_sbd(struct
> spin_lock_init(&sdp->sd_trunc_lock);
>
> spin_lock_init(&sdp->sd_log_lock);
> -
> + atomic_set(&sdp->sd_log_pinned, 0);
> INIT_LIST_HEAD(&sdp->sd_log_le_buf);
> INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
> INIT_LIST_HEAD(&sdp->sd_log_le_rg);
> INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
> INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
>
> - mutex_init(&sdp->sd_log_reserve_mutex);
> + init_waitqueue_head(&sdp->sd_log_waitq);
> + init_waitqueue_head(&sdp->sd_logd_waitq);
> INIT_LIST_HEAD(&sdp->sd_ail1_list);
> INIT_LIST_HEAD(&sdp->sd_ail2_list);
>
> @@ -733,6 +733,8 @@ static int init_journal(struct gfs2_sbd
> if (sdp->sd_args.ar_spectator) {
> sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
> atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
> + atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5);
> + atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
> } else {
> if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) {
> fs_err(sdp, "can't mount journal #%u\n",
> @@ -770,6 +772,8 @@ static int init_journal(struct gfs2_sbd
> goto fail_jinode_gh;
> }
> atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
> + atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5);
> + atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
>
> /* Map the extents for this journal's blocks */
> map_journal_extents(sdp);
> @@ -951,8 +955,6 @@ static int init_threads(struct gfs2_sbd
> if (undo)
> goto fail_quotad;
>
> - sdp->sd_log_flush_time = jiffies;
> -
> p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
> error = IS_ERR(p);
> if (error) {
> @@ -1160,7 +1162,6 @@ static int fill_super(struct super_block
> GFS2_BASIC_BLOCK_SHIFT;
> sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
>
> - sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit;
> sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum;
> if (sdp->sd_args.ar_statfs_quantum) {
> sdp->sd_tune.gt_statfs_slow = 0;
> Index: gfs2-2.6-nmw/fs/gfs2/sys.c
> ===================================================================
> --- gfs2-2.6-nmw.orig/fs/gfs2/sys.c
> +++ gfs2-2.6-nmw/fs/gfs2/sys.c
> @@ -469,8 +469,6 @@ static ssize_t name##_store(struct gfs2_
> } \
> TUNE_ATTR_2(name, name##_store)
>
> -TUNE_ATTR(incore_log_blocks, 0);
> -TUNE_ATTR(log_flush_secs, 0);
> TUNE_ATTR(quota_warn_period, 0);
> TUNE_ATTR(quota_quantum, 0);
> TUNE_ATTR(max_readahead, 0);
> @@ -482,8 +480,6 @@ TUNE_ATTR(statfs_quantum, 1);
> TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
>
> static struct attribute *tune_attrs[] = {
> - &tune_attr_incore_log_blocks.attr,
> - &tune_attr_log_flush_secs.attr,
> &tune_attr_quota_warn_period.attr,
> &tune_attr_quota_quantum.attr,
> &tune_attr_max_readahead.attr,
> Index: gfs2-2.6-nmw/fs/gfs2/trans.c
> ===================================================================
> --- gfs2-2.6-nmw.orig/fs/gfs2/trans.c
> +++ gfs2-2.6-nmw/fs/gfs2/trans.c
> @@ -23,6 +23,7 @@
> #include "meta_io.h"
> #include "trans.h"
> #include "util.h"
> +#include "trace_gfs2.h"
>
> int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
> unsigned int revokes)
> @@ -75,6 +76,23 @@ fail_holder_uninit:
> return error;
> }
>
> +/**
> + * gfs2_log_release - Release a given number of log blocks
> + * @sdp: The GFS2 superblock
> + * @blks: The number of blocks
> + *
> + */
> +
> +static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
> +{
> +
> + atomic_add(blks, &sdp->sd_log_blks_free);
> + trace_gfs2_log_blocks(sdp, blks);
> + gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
> + sdp->sd_jdesc->jd_blocks);
> + up_read(&sdp->sd_log_flush_lock);
> +}
> +
> void gfs2_trans_end(struct gfs2_sbd *sdp)
> {
> struct gfs2_trans *tr = current->journal_info;
> Index: gfs2-2.6-nmw/fs/gfs2/super.c
> ===================================================================
> --- gfs2-2.6-nmw.orig/fs/gfs2/super.c
> +++ gfs2-2.6-nmw/fs/gfs2/super.c
> @@ -1113,7 +1113,6 @@ static int gfs2_remount_fs(struct super_
> int error;
>
> spin_lock(&gt->gt_spin);
> - args.ar_commit = gt->gt_log_flush_secs;
> args.ar_quota_quantum = gt->gt_quota_quantum;
> if (gt->gt_statfs_slow)
> args.ar_statfs_quantum = 0;
> @@ -1160,7 +1159,6 @@ static int gfs2_remount_fs(struct super_
> else
> clear_bit(SDF_NOBARRIERS, &sdp->sd_flags);
> spin_lock(&gt->gt_spin);
> - gt->gt_log_flush_secs = args.ar_commit;
> gt->gt_quota_quantum = args.ar_quota_quantum;
> if (args.ar_statfs_quantum) {
> gt->gt_statfs_slow = 0;
> @@ -1305,9 +1303,6 @@ static int gfs2_show_options(struct seq_
> }
> if (args->ar_discard)
> seq_printf(s, ",discard");
> - val = sdp->sd_tune.gt_log_flush_secs;
> - if (val != 60)
> - seq_printf(s, ",commit=%d", val);
> val = sdp->sd_tune.gt_statfs_quantum;
> if (val != 30)
> seq_printf(s, ",statfs_quantum=%d", val);
>