[Cluster-devel] [PATCH][GFS2] Bouncing locks in a cluster is slow in GFS2

Bob Peterson rpeterso at redhat.com
Wed Jan 26 20:22:18 UTC 2011


Hi,

This patch is a performance improvement for GFS2 in a clustered
environment.  It makes the glock minimum hold time self-adjusting:
the hold time is lengthened (by GL_GLOCK_HOLD_INCR, up to
GL_GLOCK_MAX_HOLD) whenever a holder has to wait more than a second
for a lock, and shortened (by GL_GLOCK_HOLD_DECR, down to
GL_GLOCK_MIN_HOLD) whenever the glock is demoted to a state other
than its target, replacing the fixed per-type go_min_hold_time.

Regards,

Bob Peterson
Red Hat File Systems

Signed-off-by: Bob Peterson <rpeterso at redhat.com> 

Bouncing locks in a cluster is slow in GFS2
---
 fs/gfs2/glock.c  |   89 ++++++++++++++++++++++++++++++++++++++++--------------
 fs/gfs2/glock.h  |    6 ++++
 fs/gfs2/glops.c  |    2 -
 fs/gfs2/incore.h |    2 +-
 4 files changed, 73 insertions(+), 26 deletions(-)

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index c75d499..117d8e2 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -58,7 +58,6 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
 static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
 
 static struct dentry *gfs2_root;
-static struct workqueue_struct *glock_workqueue;
 struct workqueue_struct *gfs2_delete_workqueue;
 static LIST_HEAD(lru_list);
 static atomic_t lru_count = ATOMIC_INIT(0);
@@ -67,9 +66,23 @@ static DEFINE_SPINLOCK(lru_lock);
 #define GFS2_GL_HASH_SHIFT      15
 #define GFS2_GL_HASH_SIZE       (1 << GFS2_GL_HASH_SHIFT)
 #define GFS2_GL_HASH_MASK       (GFS2_GL_HASH_SIZE - 1)
+#define GL_WORKQUEUES            0x2
+#define GL_WQ_MASK               0x1
 
 static struct hlist_bl_head gl_hash_table[GFS2_GL_HASH_SIZE];
 static struct dentry *gfs2_root;
+static struct workqueue_struct *glock_workqueue[GL_WORKQUEUES];
+
+static inline int qwork(struct gfs2_glock *gl, unsigned long delay)
+{
+	struct workqueue_struct *wq;
+
+	wq = glock_workqueue[gl->gl_name.ln_type & GL_WQ_MASK];
+
+	if (gl->gl_name.ln_type != LM_TYPE_INODE)
+		delay = 0;
+	return queue_delayed_work(wq, &gl->gl_work, delay);
+}
 
 /**
  * gl_hash() - Turn glock number into hash bucket number
@@ -407,6 +420,10 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
 	if (held1 && held2 && list_empty(&gl->gl_holders))
 		clear_bit(GLF_QUEUED, &gl->gl_flags);
 
+	if (new_state != gl->gl_target)
+		/* shorten our minimum hold time */
+		gl->gl_hold_time = max(gl->gl_hold_time - GL_GLOCK_HOLD_DECR,
+				       GL_GLOCK_MIN_HOLD);
 	gl->gl_state = new_state;
 	gl->gl_tchange = jiffies;
 }
@@ -550,7 +567,7 @@ __acquires(&gl->gl_spin)
 		GLOCK_BUG_ON(gl, ret);
 	} else { /* lock_nolock */
 		finish_xmote(gl, target);
-		if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+		if (qwork(gl, 0) == 0)
 			gfs2_glock_put(gl);
 	}
 
@@ -623,7 +640,7 @@ out_sched:
 	clear_bit(GLF_LOCK, &gl->gl_flags);
 	smp_mb__after_clear_bit();
 	gfs2_glock_hold(gl);
-	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+	if (qwork(gl, 0) == 0)
 		gfs2_glock_put_nolock(gl);
 	return;
 
@@ -670,15 +687,14 @@ static void glock_work_func(struct work_struct *work)
 	    gl->gl_state != LM_ST_UNLOCKED &&
 	    gl->gl_demote_state != LM_ST_EXCLUSIVE) {
 		unsigned long holdtime, now = jiffies;
-		holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
+		holdtime = gl->gl_tchange + gl->gl_hold_time;
 		if (time_before(now, holdtime))
 			delay = holdtime - now;
 		set_bit(delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE, &gl->gl_flags);
 	}
 	run_queue(gl, 0);
 	spin_unlock(&gl->gl_spin);
-	if (!delay ||
-	    queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
+	if (!delay || qwork(gl, delay) == 0)
 		gfs2_glock_put(gl);
 	if (drop_ref)
 		gfs2_glock_put(gl);
@@ -741,6 +757,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	gl->gl_tchange = jiffies;
 	gl->gl_object = NULL;
 	gl->gl_sbd = sdp;
+	gl->gl_hold_time = GL_GLOCK_DFT_HOLD;
 	INIT_DELAYED_WORK(&gl->gl_work, glock_work_func);
 	INIT_WORK(&gl->gl_delete, delete_work_func);
 
@@ -852,8 +869,15 @@ static int gfs2_glock_demote_wait(void *word)
 
 static void wait_on_holder(struct gfs2_holder *gh)
 {
+	unsigned long time1 = jiffies;
+
 	might_sleep();
 	wait_on_bit(&gh->gh_iflags, HIF_WAIT, gfs2_glock_holder_wait, TASK_UNINTERRUPTIBLE);
+	if (time_after(jiffies, time1 + HZ)) /* have we waited > a second? */
+		/* Lengthen the minimum hold time. */
+		gh->gh_gl->gl_hold_time = min(gh->gh_gl->gl_hold_time +
+					      GL_GLOCK_HOLD_INCR,
+					      GL_GLOCK_MAX_HOLD);
 }
 
 static void wait_on_demote(struct gfs2_glock *gl)
@@ -1087,8 +1111,8 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
 	gfs2_glock_hold(gl);
 	if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
 	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
-		delay = gl->gl_ops->go_min_hold_time;
-	if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
+		delay = gl->gl_hold_time;
+	if (qwork(gl, delay) == 0)
 		gfs2_glock_put(gl);
 }
 
@@ -1270,18 +1294,18 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
 	unsigned long now = jiffies;
 
 	gfs2_glock_hold(gl);
-	holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
+	holdtime = gl->gl_tchange + gl->gl_hold_time;
 	if (test_bit(GLF_QUEUED, &gl->gl_flags)) {
 		if (time_before(now, holdtime))
 			delay = holdtime - now;
 		if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
-			delay = gl->gl_ops->go_min_hold_time;
+			delay = gl->gl_hold_time;
 	}
 
 	spin_lock(&gl->gl_spin);
 	handle_callback(gl, state, delay);
 	spin_unlock(&gl->gl_spin);
-	if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
+	if (qwork(gl, delay) == 0)
 		gfs2_glock_put(gl);
 }
 
@@ -1343,7 +1367,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
 	set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
 	smp_wmb();
 	gfs2_glock_hold(gl);
-	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+	if (qwork(gl, 0) == 0)
 		gfs2_glock_put(gl);
 }
 
@@ -1379,7 +1403,7 @@ static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_m
 			}
 			clear_bit(GLF_LOCK, &gl->gl_flags);
 			smp_mb__after_clear_bit();
-			if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+			if (qwork(gl, 0) == 0)
 				gfs2_glock_put_nolock(gl);
 			spin_unlock(&gl->gl_spin);
 			spin_lock(&lru_lock);
@@ -1447,7 +1471,7 @@ static void thaw_glock(struct gfs2_glock *gl)
 		return;
 	set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
 	gfs2_glock_hold(gl);
-	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+	if (qwork(gl, 0) == 0)
 		gfs2_glock_put(gl);
 }
 
@@ -1471,7 +1495,7 @@ static void clear_glock(struct gfs2_glock *gl)
 		handle_callback(gl, LM_ST_UNLOCKED, 0);
 	spin_unlock(&gl->gl_spin);
 	gfs2_glock_hold(gl);
-	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+	if (qwork(gl, 0) == 0)
 		gfs2_glock_put(gl);
 }
 
@@ -1510,8 +1534,11 @@ static void dump_glock_func(struct gfs2_glock *gl)
 
 void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
 {
+	unsigned int x;
+
 	glock_hash_walk(clear_glock, sdp);
-	flush_workqueue(glock_workqueue);
+	for (x = 0; x < GL_WORKQUEUES; x++)
+		flush_workqueue(glock_workqueue[x]);
 	wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0);
 	glock_hash_walk(dump_glock_func, sdp);
 }
@@ -1658,7 +1685,7 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
 	dtime *= 1000000/HZ; /* demote time in uSec */
 	if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
 		dtime = 0;
-	gfs2_print_dbg(seq, "G:  s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d r:%d\n",
+	gfs2_print_dbg(seq, "G:  s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d r:%d m:%ld\n",
 		  state2str(gl->gl_state),
 		  gl->gl_name.ln_type,
 		  (unsigned long long)gl->gl_name.ln_number,
@@ -1666,7 +1693,7 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
 		  state2str(gl->gl_target),
 		  state2str(gl->gl_demote_state), dtime,
 		  atomic_read(&gl->gl_ail_count),
-		  atomic_read(&gl->gl_ref));
+		  atomic_read(&gl->gl_ref), gl->gl_hold_time);
 
 	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
 		error = dump_holder(seq, gh);
@@ -1685,19 +1712,32 @@ out:
 int __init gfs2_glock_init(void)
 {
 	unsigned i;
+	char qn[32];
+
 	for(i = 0; i < GFS2_GL_HASH_SIZE; i++) {
 		INIT_HLIST_BL_HEAD(&gl_hash_table[i]);
 	}
 
-	glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
+	for (i = 0; i < GL_WORKQUEUES; i++) {
+		sprintf(qn, "gfs2workq%d", i);
+		glock_workqueue[i] = alloc_workqueue(qn, WQ_MEM_RECLAIM |
 					  WQ_HIGHPRI | WQ_FREEZEABLE, 0);
-	if (IS_ERR(glock_workqueue))
-		return PTR_ERR(glock_workqueue);
+		if (IS_ERR(glock_workqueue[i])) {
+			int error = PTR_ERR(glock_workqueue[i]);
+
+			while (i > 0) {
+				i--;
+				destroy_workqueue(glock_workqueue[i]);
+			}
+			return error;
+		}
+	}
 	gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
 						WQ_MEM_RECLAIM | WQ_FREEZEABLE,
 						0);
 	if (IS_ERR(gfs2_delete_workqueue)) {
-		destroy_workqueue(glock_workqueue);
+		for (i = 0; i < GL_WORKQUEUES; i++)
+			destroy_workqueue(glock_workqueue[i]);
 		return PTR_ERR(gfs2_delete_workqueue);
 	}
 
@@ -1708,9 +1748,12 @@ int __init gfs2_glock_init(void)
 
 void gfs2_glock_exit(void)
 {
+	int i;
+
 	unregister_shrinker(&glock_shrinker);
-	destroy_workqueue(glock_workqueue);
 	destroy_workqueue(gfs2_delete_workqueue);
+	for (i = 0; i < GL_WORKQUEUES; i++)
+		destroy_workqueue(glock_workqueue[i]);
 }
 
 static inline struct gfs2_glock *glock_hash_chain(unsigned hash)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index afa8bfe..3233add 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -113,6 +113,12 @@ enum {
 
 #define GLR_TRYFAILED		13
 
+#define GL_GLOCK_MAX_HOLD        (long)(HZ / 5)
+#define GL_GLOCK_DFT_HOLD        (long)(HZ / 5)
+#define GL_GLOCK_MIN_HOLD        (long)(0)
+#define GL_GLOCK_HOLD_INCR       (long)(HZ / 20)
+#define GL_GLOCK_HOLD_DECR       (long)(HZ / 40)
+
 struct lm_lockops {
 	const char *lm_proto_name;
 	int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index ac5fac9..bba125e 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -399,7 +399,6 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
 	.go_lock = inode_go_lock,
 	.go_dump = inode_go_dump,
 	.go_type = LM_TYPE_INODE,
-	.go_min_hold_time = HZ / 5,
 	.go_flags = GLOF_ASPACE,
 };
 
@@ -410,7 +409,6 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
 	.go_unlock = rgrp_go_unlock,
 	.go_dump = gfs2_rgrp_dump,
 	.go_type = LM_TYPE_RGRP,
-	.go_min_hold_time = HZ / 5,
 	.go_flags = GLOF_ASPACE,
 };
 
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 720c1e6..f21f075 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -163,7 +163,6 @@ struct gfs2_glock_operations {
 	int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
 	void (*go_callback) (struct gfs2_glock *gl);
 	const int go_type;
-	const unsigned long go_min_hold_time;
 	const unsigned long go_flags;
 #define GLOF_ASPACE 1
 };
@@ -237,6 +236,7 @@ struct gfs2_glock {
 	struct delayed_work gl_work;
 	struct work_struct gl_delete;
 	struct rcu_head gl_rcu;
+	long gl_hold_time;
 };
 
 #define GFS2_MIN_LVB_SIZE 32	/* Min size of LVB that gfs2 supports */




More information about the Cluster-devel mailing list