[Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...

jbrassow at sourceware.org
Tue Apr 3 18:23:03 UTC 2007


CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL45
Changes by:	jbrassow at sourceware.org	2007-04-03 19:23:01

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c dm-cmirror-xfr.h 

Log message:
	Bug 234539: multiple streams of I/O can cause system to lock up
	
	This bug provoked an audit of the communications exchange, locking,
	and memory allocations/stack usage.
	
	Communication fixes include:
	1) Added sequence numbers to ensure that replies from the server
	correctly correspond to client requests.  It was found that if
	a client timed out waiting for the server to respond, it would
	resend the request.  However, the server may simply have been
	too busy to respond in a timely fashion; it then ends up
	answering both the original request and the resent one, leaving
	the client and server out of sync with respect to log requests.
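
	The scheme, in rough outline (xmit() and recv_reply() are
	stand-ins for the real socket calls; the actual field is lr_seq
	in struct log_request, visible in the dm-cmirror-client.c hunk
	below):
	
		static int do_request(struct log_request *rq, uint32_t seq)
		{
			struct log_request reply;
	
			rq->lr_seq = seq;
		resend:
			xmit(rq);		/* every (re)send carries the same seq */
		listen:
			if (recv_reply(&reply) == -ETIMEDOUT)
				goto resend;	/* server may just be slow */
			if (reply.lr_seq < seq)
				goto listen;	/* stale reply to an earlier send */
			return 0;		/* seq matches: this is our answer */
		}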
	
	Locking fixes include:
	1) A semaphore was being "up"ed twice in some cases, rendering
	the lock ineffective (see the sketches after this list).
	
	2) A spin lock controlling region status lists was being held
	across blocking operations - sometimes causing deadlocks.  The
	spin lock was changed to a per-log lock, and some logging
	operations were restructured to better suit the way locking
	needed to be done.  A side-effect of this fix is a 20%
	improvement in write performance.
	
	3) The log list protection lock needed to change from a spin lock
	to a semaphore to allow blocking operations.
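
	Schematically (generic names, not the exact driver source):
	
		/* (1) The double-"up": the error path releases the semaphore
		 * and then falls into the common exit, which releases it
		 * again. */
		down(&sem);
		if (do_work()) {
			up(&sem);
			goto out;
		}
		/* ... more work ... */
	out:
		up(&sem);		/* BUG: error path already upped it */
	
		/* (2) The per-log lock pattern: detach pending work under the
		 * spin lock, drop the lock, then make the blocking server
		 * calls against the private list (as cluster_flush does in
		 * the hunks below). */
		spin_lock(&lc->state_lock);
		list_splice_init(&lc->mark_waiting, &mark);
		spin_unlock(&lc->state_lock);
		/* ... blocking consult_server() calls on "mark" ... */
	
		/* (3) The log list lock becomes a semaphore, because sleeping
		 * is legal while holding a semaphore but not a spin lock. */
		down(&log_list_lock);	/* was: spin_lock(&log_list_lock) */
		list_for_each_entry(lc, &log_list_head, log_list)
			update_log(lc);	/* may now block safely */
		up(&log_list_lock);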
	
	Memory allocation fixes include:
	1) The wrong flags to kmalloc could cause deadlock.  Use GFP_NOFS
	instead of GFP_KERNEL (see the sketches after this list).
	
	2) Mempools needed larger reserves to survive low-memory
	conditions.
	
	3) The server now allocates its communication structure instead
	of keeping it on the stack.  This reduces the likelihood of
	stack corruption.
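
	In schematic form (the exact call sites are in the hunks below):
	
		/* (1) On the write-out path, a GFP_KERNEL allocation may wait
		 * on filesystem writeback to reclaim memory - writeback that
		 * can itself be stuck behind this mirror log: */
		lr = kmalloc(sizeof(struct log_request), GFP_NOFS);
	
		/* (2) Deeper mempool reserves (20 -> 500 on the client,
		 * 100 -> 1000 on the server): */
		region_state_pool = mempool_create(500, region_state_alloc,
						   region_state_free, NULL);
	
		/* (3) One long-lived request buffer, kmalloc'ed on first use,
		 * replaces a struct log_request on the server's stack: */
		static struct log_request *lr = NULL;
		if (unlikely(!lr))
			lr = kmalloc(sizeof(*lr), GFP_KERNEL);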

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.41.2.1&r2=1.1.2.41.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.12&r2=1.1.2.12.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.26.2.1&r2=1.1.2.26.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.2.2.1&r2=1.1.2.2.2.2

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/03/22 22:34:44	1.1.2.41.2.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/03 18:23:01	1.1.2.41.2.2
@@ -28,20 +28,16 @@
 #include "dm-cmirror-server.h"
 #include "dm-cmirror-cman.h"
 
-spinlock_t log_list_lock;
+DECLARE_MUTEX(log_list_lock);
 LIST_HEAD(log_list_head);
 
 struct region_state {
-	struct log_c *rs_lc;
+	int rs_mark_logged;
 	region_t rs_region;
 	struct list_head rs_list;
 };
 
 static mempool_t *region_state_pool = NULL;
-static spinlock_t region_state_lock;
-static int clear_region_count=0;
-static struct list_head clear_region_list;
-static struct list_head marked_region_list;
 
 static int shutting_down=0;
 static atomic_t suspend_client;
@@ -145,15 +141,7 @@
 	memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size);
 	lc->sync_count = (sync == NOSYNC) ? region_count : 0;
 
-	lc->recovering_bits = vmalloc(bitset_size);
-	if (!lc->recovering_bits) {
-		DMWARN("couldn't allocate sync bitset");
-		vfree(lc->sync_bits);
-		vfree(lc->clean_bits);
-		kfree(lc);
-		return -ENOMEM;
-	}
-	memset(lc->recovering_bits, 0, bitset_size);
+	lc->recovering_region = (uint64_t)-1;
 	lc->sync_search = 0;
 	log->context = lc;
 	return 0;
@@ -164,7 +152,6 @@
 	struct log_c *lc = (struct log_c *) log->context;
 	vfree(lc->clean_bits);
 	vfree(lc->sync_bits);
-	vfree(lc->recovering_bits);
 	kfree(lc);
 }
 
@@ -321,8 +308,9 @@
 
 	request_count++;
 
-	lr = kmalloc(sizeof(struct log_request), GFP_KERNEL);
+	lr = kmalloc(sizeof(struct log_request), GFP_NOFS);
 	if(!lr){
+		BUG();
 		error = -ENOMEM;
 		*retry = 1;
 		goto fail;
@@ -404,15 +392,15 @@
 	}
     
 	if (seq != lr->lr_seq) {
-		DMERR("Message sequence number mismatch: %d/%d",
+		DMDEBUG("Message sequence number mismatch: %d/%d",
 		      seq, lr->lr_seq);
 		if (seq > lr->lr_seq) {
-			DMERR(" Skipping.  Listening again for response to %s",
+			DMDEBUG(" Skipping.  Listening again for response to %s",
 			      RQ_STRING(type));
 			memset(lr, 0, sizeof(struct log_request));
 			goto rerecv;
 		}
-		DMERR(" Must try to resend request, %s", RQ_STRING(type));
+		DMERR(" Seq# mismatch: Must try to resend request, %s", RQ_STRING(type));
 		error = -EBADE;
 		*retry = 1;
 		seq++;
@@ -509,91 +497,43 @@
 			new_server = 1;
 		}
 
-		spin_lock(&region_state_lock);
+		spin_lock(&lc->state_lock);
 		if(new_server && 
-		   (!list_empty(&clear_region_list) ||
-		    !list_empty(&marked_region_list))){
+		   !list_empty(&lc->mark_logged)){
 			int i=0;
-			struct region_state *tmp_rs;
+			LIST_HEAD(mark);
 
 			DMINFO("Clean-up required due to new server");
-			DMINFO(" - Wiping clear region list");
-			list_for_each_entry_safe(rs, tmp_rs,
-						 &clear_region_list, rs_list){
-				/* Remove only those associated with referenced log */
-				if (rs->rs_lc != lc)
-					continue;
-				i++;
-				list_del_init(&rs->rs_list);
-				mempool_free(rs, region_state_pool);
-			}
-			clear_region_count -= i;
-			DMINFO(" - %d clear region requests wiped", i);
-			i=0;
 			DMINFO(" - Resending all mark region requests");
-			list_for_each_entry(rs, &marked_region_list, rs_list){
-				/* Resend only those associated with referenced log */
-				if (rs->rs_lc != lc)
-					continue;
+			list_splice_init(&lc->mark_logged, &mark);
+
+			spin_unlock(&lc->state_lock);
+
+			list_for_each_entry(rs, &mark, rs_list){
 				do {
 					retry = 0;
-					i++;
-					rtn = _consult_server(rs->rs_lc, rs->rs_region,
+					rtn = _consult_server(lc, rs->rs_region,
 							      LRT_MARK_REGION, NULL, &retry);
 					if (lc->server_id == 0xDEAD) {
-						spin_unlock(&region_state_lock);
 						goto election;
 					}
 				} while(retry);
+				i++;
 			}
+
+			spin_lock(&lc->state_lock);
+			list_splice_init(&mark, &lc->mark_logged);
+
 			DMINFO(" - %d mark region requests resent", i);
 			DMINFO("Clean-up complete");
-			if(type == LRT_MARK_REGION){
-				/* we just handled all marks */
-				DMWARN("Mark request ignored.\n");
-				spin_unlock(&region_state_lock);
-				goto out;
-			} else {
-				DMINFO("Continuing request type, %d (%s)", type,
-				       RQ_STRING(type));
-			}
+			DMINFO("Continuing request type, %d (%s)", type,
+			       RQ_STRING(type));
 			new_server = 0;
 		}
-
-		rs = NULL;
-
-		if(!list_empty(&clear_region_list)){
-			rs = list_entry(clear_region_list.next,
-					struct region_state, rs_list);
-			list_del_init(&rs->rs_list);
-			clear_region_count--;
-		}
-
-		spin_unlock(&region_state_lock);
-		
-		/* ATTENTION -- it may be possible to remove a clear region **
-		** request from the list.  Then, have a mark region happen  **
-		** while we are here.  If the clear region request fails, it**
-		** would be re-added - perhaps prematurely clearing the bit */
+		spin_unlock(&lc->state_lock);
 		
-		if(rs && !rs->rs_lc->log_dev_failed){
-			_consult_server(rs->rs_lc, rs->rs_region,
-					LRT_CLEAR_REGION, NULL, &retry);
-
-			if(retry){
-				spin_lock(&region_state_lock);
-				list_add(&rs->rs_list, &clear_region_list);
-				clear_region_count++;
-				spin_unlock(&region_state_lock);
-
-			} else {
-				mempool_free(rs, region_state_pool);
-			}
-		}
 		retry = 0;
-		
 		rtn = _consult_server(lc, region, type, result, &retry);
-		schedule();
 	} while(retry);
 out:
 	up(&consult_server_lock);
@@ -640,7 +580,7 @@
 	atomic_set(&lc->in_sync, -1);
 	lc->uuid_ref = 1;
 
-	spin_lock(&log_list_lock);
+	down(&log_list_lock);
 	list_for_each_entry(tmp_lc, &log_list_head, log_list){
 		if(!strncmp(tmp_lc->uuid, lc->uuid, MAX_NAME_LEN)){
 			lc->uuid_ref = (lc->uuid_ref > tmp_lc->uuid_ref) ?
@@ -649,12 +589,16 @@
 	}
 
 	list_add(&lc->log_list, &log_list_head);
-	spin_unlock(&log_list_lock);
+	up(&log_list_lock);
 	DMDEBUG("Creating %s (%d)",
 	       lc->uuid + (strlen(lc->uuid) - 8),
 	       lc->uuid_ref);
 
 	INIT_LIST_HEAD(&lc->region_users);
+	INIT_LIST_HEAD(&lc->clear_waiting);
+	INIT_LIST_HEAD(&lc->mark_waiting);
+	INIT_LIST_HEAD(&lc->mark_logged);
+	spin_lock_init(&lc->state_lock);
 
 	lc->server_id = 0xDEAD;
 
@@ -761,31 +705,44 @@
 	       lc->uuid + (strlen(lc->uuid) - 8),
 	       lc->uuid_ref);
 
-	if (!list_empty(&clear_region_list))
-		DMINFO("Leaving while clear region requests remain.");
-
-	spin_lock(&log_list_lock);
+	down(&log_list_lock);
 	list_del_init(&lc->log_list);
-	spin_unlock(&log_list_lock);
+	up(&log_list_lock);
 
 	if ((lc->server_id == my_id) && !atomic_read(&lc->suspended))
 		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 
 	sock_release(lc->client_sock);
 
-	spin_lock(&region_state_lock);
+	spin_lock(&lc->state_lock);
 
-	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list) {
-		if (lc == rs->rs_lc)
+	if (!list_empty(&lc->clear_waiting)) {
+		DMINFO("Clear requests remain at cluster log deactivation");
+		list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) {
 			list_del_init(&rs->rs_list);
+			DMINFO(" - Ignoring clear request: %Lu", rs->rs_region);
+			mempool_free(rs, region_state_pool);
+		}
 	}
 
-	list_for_each_entry_safe(rs, tmp_rs, &marked_region_list, rs_list) {
-		if (lc == rs->rs_lc)
-			list_del_init(&rs->rs_list);
+	if (!list_empty(&lc->mark_waiting)) {
+		DMERR("Pending mark requests remain at cluster_dtr");
+		BUG();
+	}
+
+	if (!list_empty(&lc->mark_logged)) {
+		DMERR("Mark requests remain at cluster log deactivation");
+		/*
+		 * Should I BUG() this?
+		 * No.  In the worst case, they will get cleaned up later
+		 */
+	}
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list) {
+		list_del_init(&rs->rs_list);
+		mempool_free(rs, region_state_pool);
 	}
 
-	spin_unlock(&region_state_lock);
+	spin_unlock(&lc->state_lock);
 
 	if (lc->log_dev)
 		disk_dtr(log);
@@ -803,19 +760,27 @@
 
 static int cluster_postsuspend(struct dirty_log *log)
 {
+	struct region_state *rs, *tmp_rs;
 	struct log_c *lc = (struct log_c *) log->context;
 
-	while (1) {
-		spin_lock(&region_state_lock);
-		if (list_empty(&clear_region_list)) {
-			spin_unlock(&region_state_lock);
-			break;
-		}
-		spin_unlock(&region_state_lock);
+	spin_lock(&lc->state_lock);
+	if (!list_empty(&lc->mark_waiting)) {
+		DMERR("Mark requests remain at postsuspend!");
+		BUG();
+	}
 
-		/* Just an unnessesary call to clear out regions */
-		consult_server(lc, 0, LRT_IN_SYNC, NULL);
+	if (!list_empty(&lc->clear_waiting)) {
+		DMERR("Clear requests remain at postsuspend!");
+
+		list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) {
+			list_del_init(&rs->rs_list);
+			DMERR(" - Ignoring clear request: %Lu", rs->rs_region);
+			mempool_free(rs, region_state_pool);
+		}
 	}
+
+	spin_unlock(&lc->state_lock);
+
 	atomic_set(&lc->suspended, 1);
 	if(lc->server_id == my_id) {
 		while (1) {
@@ -903,103 +868,162 @@
 
 static int cluster_flush(struct dirty_log *log)
 {
+	int r = 0;
+	int clear_count = 0;
+	int mark_count = 0;
 	struct log_c *lc = (struct log_c *) log->context;
+	struct region_state *rs, *tmp_rs;
+	LIST_HEAD(mark);
+	LIST_HEAD(clear);
+
+	/*
+	 * It should never be a problem to temporarily have
+	 * the mark requests in limbo.  The only functions
+	 * that call cluster_flush are rh_update_states and
+	 * do_writes, and they are in the same thread as
+	 * those changing the region states
+	 */
+	spin_lock(&lc->state_lock);
+	list_splice_init(&lc->clear_waiting, &clear);
+	list_splice_init(&lc->mark_waiting, &mark);
+	spin_unlock(&lc->state_lock);
+
+	list_for_each_entry_safe(rs, tmp_rs, &clear, rs_list) {
+		/* don't really care if LRT_CLEAR_REGION fails */
+		consult_server(lc, rs->rs_region, LRT_CLEAR_REGION, NULL);
+		list_del_init(&rs->rs_list);
+		mempool_free(rs, region_state_pool);
+		clear_count++;
+	}
+
+	list_for_each_entry_safe(rs, tmp_rs, &mark, rs_list) {
+		while (1) {
+			r = consult_server(lc, rs->rs_region,
+					   LRT_MARK_REGION, NULL);
+			if (!r)
+				break;
+
+			if (r == -EBUSY) {
+				DMDEBUG("Delaying mark to region %Lu, due to recovery",
+					rs->rs_region);
+				set_current_state(TASK_INTERRUPTIBLE);
+				schedule_timeout(HZ/2);
+				continue;
+			}
 
-	/* FIXME:  flush all clear_region requests to server */
-	return (lc->log_dev_failed) ? -EIO : 0;
+			if (r == -EIO)
+				goto fail;
+
+			DMWARN("unable to get server (%u) to mark region (%Lu)",
+			       lc->server_id, rs->rs_region);
+			DMWARN("Reason :: %d", r);
+		}
+		mark_count++;
+	}
+
+	/* No flush work? */
+	if (!clear_count && !mark_count)
+		return 0;
+
+	spin_lock(&lc->state_lock);
+	list_splice_init(&mark, &lc->mark_logged);
+	spin_unlock(&lc->state_lock);
+
+	while ((r = consult_server(lc, 0, LRT_FLUSH, NULL))) {
+		if (r == -EBUSY) {
+			DMDEBUG("Delaying flush due to recovery");
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(HZ/2);
+			continue;
+		}
+
+		if (r == -EIO)
+			break;
+	}
+
+fail:
+	if (r) {
+		DMERR("Log flush failure: %d%s", r,
+		      (r == -EIO) ? " -EIO" : "");
+		dm_table_event(lc->ti->table);
+		lc->log_dev_failed = 1;
+	}
+
+	return r;
 }
 
 static void cluster_mark_region(struct dirty_log *log, region_t region)
 {
-	int error = 0;
 	struct region_state *rs, *tmp_rs, *rs_new;
 	struct log_c *lc = (struct log_c *) log->context;
 
-	rs_new = mempool_alloc(region_state_pool, GFP_KERNEL);
+	spin_lock(&lc->state_lock);
 
-	memset(rs_new, 0, sizeof(struct region_state));
 
-	spin_lock(&region_state_lock);
-	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list){
-		if(lc == rs->rs_lc && region == rs->rs_region){
-			/*
-			DMDEBUG("Mark pre-empting clear (%Lu/%s)",
-				region, lc->uuid + (strlen(lc->uuid) - 8));
-			*/
+	/*
+	 * It is possible for the following in the mirror code:
+	 *  0) Mark is already logged for a region
+	 *  1) rh_dec, sets region state to RH_CLEAN (asynchronous)
+	 *  2) rh_update_states (DOESN'T FLUSH!!!, bug #235040)
+	 *  3) do_writes, tries to mark region
+	 *
+	 * The following shouldn't have to be handled b/c of the flush
+	 *  0) Region finishes recovery
+	 *  1) rh_update_states clears region (DOES FLUSH)
+	 *  2) do_writes, tries to mark region
+	 *
+	 * This can lead to this next case being valid.
+	 */
+	list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) {
+		if (region == rs->rs_region) {
+			if (!rs->rs_mark_logged) {
+				DMERR("Moving region(%Lu/%s) from clear_waiting -> mark_waiting",
+				      region, lc->uuid + (strlen(lc->uuid) - 8));
+			}
 			list_del_init(&rs->rs_list);
-			list_add(&rs->rs_list, &marked_region_list);
-			clear_region_count--;
-			spin_unlock(&region_state_lock);
-			if (rs_new)
-				mempool_free(rs_new, region_state_pool);
-
-			return;
+			list_add(&rs->rs_list,
+				 (rs->rs_mark_logged) ?
+				 &lc->mark_logged : &lc->mark_waiting);
+			goto out;
 		}
 	}
+
 	/*
-	 * In the mirroring code, it is possible for a write
-	 * to complete and call rh_dec - putting the region on
-	 * the clear_region list.  However, before the actual
-	 * clear request is issued to the log (rh_update_states)
-	 * another mark happens.  So, we check for and remove
-	 * duplicates.
+	 * It is possible for the following in the mirror code:
+	 *  0) Mark is already logged for a region
+	 *  1) rh_update_states
+	 *  2) rh_dec, sets region state to RH_CLEAN (asynchronous)
+	 *  3) do_writes, tries to mark region
+	 *
+	 * This can lead to this next case being valid.
 	 */
-	list_for_each_entry(rs, &marked_region_list, rs_list){
-		if(lc == rs->rs_lc && region == rs->rs_region){
-#ifdef DEBUG
-			DMINFO("Double mark on region ("
-			       SECTOR_FORMAT ")", region);
-#endif
-			spin_unlock(&region_state_lock);
-			if (rs_new)
-				mempool_free(rs_new, region_state_pool);
-
-			return;
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list){
+		if (region == rs->rs_region) {
+			goto out;
 		}
 	}
 
-	if(!rs_new){
-		DMERR("Unable to allocate region_state for mark.");
-		BUG();
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list){
+		if (region == rs->rs_region) {
+			DMERR("Mark already waiting (%Lu/%s)",
+			      region, lc->uuid + (strlen(lc->uuid) - 8));
+			BUG();
+		}
 	}
+	spin_unlock(&lc->state_lock);
 
-	rs_new->rs_lc = lc;
+	rs_new = mempool_alloc(region_state_pool, GFP_NOFS);
+	BUG_ON(!rs_new);
+	memset(rs_new, 0, sizeof(struct region_state));
+
+	spin_lock(&lc->state_lock);
+	rs_new->rs_mark_logged = 1;
 	rs_new->rs_region = region;
 	INIT_LIST_HEAD(&rs_new->rs_list);
-	list_add(&rs_new->rs_list, &marked_region_list);
-
-	spin_unlock(&region_state_lock);
-
-	if (!lc->log_dev_failed) {
-		while((error = consult_server(lc, region, LRT_MARK_REGION, NULL))){
-			if (error == -EBUSY) {
-				/* Remote recovering delay and try again */
-				DMDEBUG("Delaying mark to region %Lu, due to recovery",
-					region);
-				set_current_state(TASK_INTERRUPTIBLE);
-				schedule_timeout(HZ/2);
-				continue;
-			}
-
-			if (error == -EIO) {
-				lc->log_dev_failed = 1;
-				break;
-			}
-			DMWARN("unable to get server (%u) to mark region (%Lu)",
-			       lc->server_id, region);
-			DMWARN("Reason :: %d", error);
-		}
+	list_add(&rs_new->rs_list, &lc->mark_waiting);
+out:
+	spin_unlock(&lc->state_lock);
 
-		if (lc->log_dev_failed) {
-			dm_table_event(lc->ti->table);
-			/*
-			  DMERR("Write failed on mirror log device, %s",
-			  lc->log_dev->name);
-			  if (!atomic_read(&lc->suspended))
-			  wait_for_completion(&lc->failure_completion);
-			*/
-		}
-	}
 	return;
 }
 
@@ -1008,53 +1032,48 @@
 	struct log_c *lc = (struct log_c *) log->context;
 	struct region_state *rs, *tmp_rs, *rs_new;
 
-	rs_new = mempool_alloc(region_state_pool, GFP_ATOMIC);
+	spin_lock(&lc->state_lock);
 
-	memset(rs_new, 0, sizeof(struct region_state));
+	/* Should find match in this list, or no lists at all */
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list){
+		if(region == rs->rs_region){
+			list_del_init(&rs->rs_list);
+			list_add(&rs->rs_list, &lc->clear_waiting);
+			goto out;
+		}
+	}
 
-	spin_lock(&region_state_lock);
 
-	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list){
-		if(lc == rs->rs_lc && region == rs->rs_region){
-			DMINFO("%d) Double clear on region ("
-			      SECTOR_FORMAT ")", __LINE__, region);
-			spin_unlock(&region_state_lock);
-			if (rs_new)
-				mempool_free(rs_new, region_state_pool);
-			return;
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list){
+		if(region == rs->rs_region){
+			DMERR("Clear pre-empting mark (%Lu/%s)",
+			       region, lc->uuid + (strlen(lc->uuid) - 8));
+			BUG();
 		}
 	}
 
-	list_for_each_entry_safe(rs, tmp_rs, &marked_region_list, rs_list){
-		if(lc == rs->rs_lc && region == rs->rs_region){
-			list_del_init(&rs->rs_list);
-			list_add(&rs->rs_list, &clear_region_list);
-			clear_region_count++;
-			spin_unlock(&region_state_lock);
-			if (rs_new)
-				mempool_free(rs_new, region_state_pool);
-			return;
+	list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list){
+		if(region == rs->rs_region){
+			DMERR("%d) Double clear on region ("
+			      SECTOR_FORMAT ")", __LINE__, region);
+			BUG();
 		}
 	}
-
-	/* We can get here because we my be doing resync_work, and therefore, **
+	/* We can get here because we may be doing resync_work, and therefore,**
 	** clearing without ever marking..................................... */
 
-	if(!rs_new){
-		DMERR("Unable to allocate region_state for clear.");
-		BUG();
-	}
+	/* Don't need to spin_unlock, because allocation is non-blocking */
+	rs_new = mempool_alloc(region_state_pool, GFP_ATOMIC);
+	BUG_ON(!rs_new);
+	memset(rs_new, 0, sizeof(struct region_state));
 
-	rs_new->rs_lc = lc;
 	rs_new->rs_region = region;
 	INIT_LIST_HEAD(&rs_new->rs_list);
-	list_add(&rs_new->rs_list, &clear_region_list);
-	clear_region_count++;
-	if(!(clear_region_count & 0x7F)){
-		DMINFO("clear_region_count :: %d", clear_region_count);
-	}
+	list_add(&rs_new->rs_list, &lc->clear_waiting);
+
+out:
+	spin_unlock(&lc->state_lock);
 
-	spin_unlock(&region_state_lock);
 	return;
 }
 
@@ -1122,27 +1141,6 @@
 
 	switch(status){
 	case STATUSTYPE_INFO:
-/*
-		spin_lock(&region_state_lock);
-		i = clear_region_count;
-		list_for_each_entry(rs, &marked_region_list, rs_list){
-			j++;
-		}
-		spin_unlock(&region_state_lock);
-
-		DMINFO("CLIENT OUTPUT::");
-		DMINFO("  My ID            : %u", my_id);
-		DMINFO("  Server ID        : %u", lc->server_id);
-
-		DMINFO("  In-sync          : %s", (atomic_read(&lc->in_sync)>0)?
-		       "YES" : "NO");
-		DMINFO("  Regions marked   : %d", j);
-		DMINFO("  Regions clearing : %d", i);
-
-		if(lc->server_id == my_id){
-			print_server_status(lc);
-		}
-*/
 		if(lc->sync != DEFAULTSYNC)
 			arg_count++;
 
@@ -1195,11 +1193,11 @@
 
 	atomic_set(&suspend_client, 1);
 
-	spin_lock(&log_list_lock);
+	down(&log_list_lock);
 	list_for_each_entry(lc, &log_list_head, log_list) {
 		atomic_set(&lc->in_sync, 0);
 	}
-	spin_unlock(&log_list_lock);
+	up(&log_list_lock);
 
 	/*
 	if (likely(!shutting_down))
@@ -1221,7 +1219,12 @@
 	global_nodeids = nodeids;
 	global_count = count;
 
-	kcl_get_node_by_nodeid(0, &node);
+	for (i = 0; kcl_get_node_by_nodeid(0, &node); i++) {
+		if (i > 10)
+			BUG();
+		else
+			DMERR("Bad call to kcl_get_node_by_nodeid");
+	}
 	my_id = node.node_id;
 
 	/* Wait for any outstanding starts to complete */
@@ -1233,7 +1236,7 @@
 	switch(type){
 	case SERVICE_NODE_LEAVE:
 	case SERVICE_NODE_FAILED:
-		spin_lock(&log_list_lock);
+		down(&log_list_lock);
 		list_for_each_entry(lc, &log_list_head, log_list){
 			for(i=0, server = 0xDEAD; i < count; i++){
 				if(lc->server_id == nodeids[i]){
@@ -1243,7 +1246,7 @@
 			/* ATTENTION -- need locking around this ? */
 			lc->server_id = server;
 		}
-		spin_unlock(&log_list_lock);
+		up(&log_list_lock);
 
 		break;
 	case SERVICE_NODE_JOIN:
@@ -1279,10 +1282,8 @@
 
 	down(&cmirror_register_lock);
 
-	if (mirror_set_count++) {
-		up(&cmirror_register_lock);
+	if (mirror_set_count++)
 		goto out;
-	}
 
 	r = kcl_register_service("clustered_log", 13, SERVICE_LEVEL_GDLM, &clog_ops,
 				 1, NULL, &local_id);
@@ -1383,12 +1384,7 @@
         DMINFO("dm-cmirror %s (built %s %s) installed",
                CMIRROR_RELEASE_NAME, __DATE__, __TIME__);
 
-	INIT_LIST_HEAD(&clear_region_list);
-	INIT_LIST_HEAD(&marked_region_list);
-
-	spin_lock_init(&region_state_lock);
-	spin_lock_init(&log_list_lock);
-	region_state_pool = mempool_create(20, region_state_alloc,
+	region_state_pool = mempool_create(500, region_state_alloc,
 					   region_state_free, NULL);
 	if(!region_state_pool){
 		DMWARN("couldn't create region state pool");
@@ -1424,6 +1420,8 @@
 	}
 	dm_unregister_dirty_log_type(&_clustered_core_type);
 	dm_unregister_dirty_log_type(&_clustered_disk_type);
+        DMINFO("dm-cmirror %s (built %s %s) removed",
+               CMIRROR_RELEASE_NAME, __DATE__, __TIME__);
 }
 
 module_init(cluster_dirty_log_init);
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/02/21 17:14:44	1.1.2.12
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/04/03 18:23:01	1.1.2.12.2.1
@@ -97,7 +97,7 @@
 	unsigned bitset_uint32_count;
 	uint32_t *clean_bits;
 	uint32_t *sync_bits;
-	uint32_t *recovering_bits;	/* FIXME: this seems excessive */
+	uint64_t recovering_region;
 
 	int sync_pass;          /* number of passes attempting to resync */
 	int sync_search;
@@ -134,7 +134,12 @@
 	atomic_t in_sync;  /* like sync_count, except all or nothing */
 
 	struct list_head log_list;
-	struct list_head region_users;
+	struct list_head region_users;  /* Used by Server */
+
+	spinlock_t state_lock;
+	struct list_head clear_waiting;
+	struct list_head mark_waiting;
+	struct list_head mark_logged;
 
 	uint32_t server_id;
 	struct socket *client_sock;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/03/22 22:34:44	1.1.2.26.2.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/03 18:23:01	1.1.2.26.2.2
@@ -47,7 +47,7 @@
 static atomic_t _do_requests;
 
 static int debug_disk_write = 0;
-extern spinlock_t log_list_lock;
+extern struct semaphore log_list_lock;
 extern struct list_head log_list_head;
 
 static void *region_user_alloc(int gfp_mask, void *pool_data){
@@ -225,6 +225,11 @@
 
 static int _core_get_resync_work(struct log_c *lc, region_t *region)
 {
+	if (lc->recovering_region != (uint64_t)-1) {
+		DMDEBUG("Someone is already recovering (%Lu)", lc->recovering_region);
+		return 0;
+	}
+
 	if (lc->sync_search >= lc->region_count) {
 		/*
 		 * FIXME: pvmove is not supported yet, but when it is,
@@ -237,18 +242,16 @@
 			return 0;
 		}
 	}
-	do {
-		*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
-						  lc->region_count,
-						  lc->sync_search);
-		lc->sync_search = *region + 1;
-
-		if (*region >= lc->region_count)
-			return 0;
+	*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
+					  lc->region_count,
+					  lc->sync_search);
+	lc->sync_search = *region + 1;
 
-	} while (log_test_bit(lc->recovering_bits, *region));
+	if (*region >= lc->region_count)
+		return 0;
 
-	log_set_bit(lc, lc->recovering_bits, *region);
+	lc->recovering_region = *region;
+	DMDEBUG("Assigning recovery work: %Lu", *region);
 	return 1;
 }
 
@@ -371,7 +374,7 @@
 			bad_count++;
 			log_clear_bit(lc, lc->sync_bits, ru->ru_region);
 			if (ru->ru_rw == RU_RECOVER) {
-				log_clear_bit(lc, lc->recovering_bits, ru->ru_region);
+				lc->recovering_region = (uint64_t)-1;
 			}
 			list_del(&ru->ru_list);
 			mempool_free(ru, region_user_pool);
@@ -506,10 +509,9 @@
 
 static int server_mark_region(struct log_c *lc, struct log_request *lr, uint32_t who)
 {
-	int r = 0;
 	struct region_user *ru, *new;
 
-	new = mempool_alloc(region_user_pool, GFP_KERNEL);
+	new = mempool_alloc(region_user_pool, GFP_NOFS);
 	if(!new){
 		return -ENOMEM;
 	}
@@ -519,21 +521,13 @@
     
 	if (!(ru = find_ru_by_region(lc, lr->u.lr_region))) {
 		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
-		r = write_bits(lc);
-
 		list_add(&new->ru_list, &lc->region_users);
-		if (!r) {
-			lc->touched = 0;
-			lc->log_dev_failed = 0;
-		} else {
-			lc->log_dev_failed = 1;
-		}
 	} else if (ru->ru_rw == RU_RECOVER) {
-		DMINFO("Attempt to mark a region " SECTOR_FORMAT 
+		DMDEBUG("Attempt to mark a region " SECTOR_FORMAT
 		      "/%s which is being recovered.",
 		       lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
-		DMINFO("Current recoverer: %u", ru->ru_nodeid);
-		DMINFO("Mark requester   : %u", who);
+		DMDEBUG("Current recoverer: %u", ru->ru_nodeid);
+		DMDEBUG("Mark requester   : %u", who);
 
 		mempool_free(new, region_user_pool);
 		return -EBUSY;
@@ -547,7 +541,7 @@
 		mempool_free(new, region_user_pool);
 	}
 
-	return (lc->log_dev_failed) ? -EIO : 0;
+	return 0;
 }
 
 
@@ -567,28 +561,34 @@
 
 	if(!find_ru_by_region(lc, lr->u.lr_region)){
 		log_set_bit(lc, lc->clean_bits, lr->u.lr_region);
-		write_bits(lc);
-		/*
-		if (write_bits(lc))
-			DMERR("Write bits failed on mirror log device, %s",
-			      lc->log_dev->name);
-		*/
 	}
 	return 0;
 }
 
 
+static int server_flush(struct log_c *lc)
+{
+	int r = 0;
+
+	r = write_bits(lc);
+	if (!r) {
+		lc->touched = 0;
+		lc->log_dev_failed = 0;
+	} else {
+		lc->log_dev_failed = 1;
+	}
+
+	return (lc->log_dev_failed) ? -EIO : 0;
+}
+
+
 static int server_get_resync_work(struct log_c *lc, struct log_request *lr, uint32_t who)
 {
 	struct region_user *new;
 
-/* We now have the ability to use remote_recovering
-	if (my_id != who)
-		return 0;
-*/
-
-	new = mempool_alloc(region_user_pool, GFP_KERNEL);
+	new = mempool_alloc(region_user_pool, GFP_NOFS);
 	if(!new){
+		lr->u.lr_int_rtn = 0;
 		return -ENOMEM;
 	}
 	
@@ -610,9 +610,15 @@
 		return -EINVAL;
 	}
 
-	log_clear_bit(lc, lc->recovering_bits, lr->u.lr_region);
-
 	if (success) {
+		if (lr->u.lr_region != lc->recovering_region) {
+			DMERR("Told to clear recovery on wrong region %Lu/%Lu",
+			      lr->u.lr_region, lc->recovering_region);
+			return -EINVAL;
+		}
+
+		lc->recovering_region = (uint64_t)-1;
+
 		/* We could receive multiple identical request due to network failure */
 		if(!log_test_bit(lc->sync_bits, lr->u.lr_region)) {
 			log_set_bit(lc, lc->sync_bits, lr->u.lr_region);
@@ -650,7 +656,7 @@
 static struct log_c *get_log_context(char *uuid, int uuid_ref){
 	struct log_c *lc, *r = NULL;
 
-	spin_lock(&log_list_lock);
+	down(&log_list_lock);
 	list_for_each_entry(lc, &log_list_head, log_list){
 		if (!strncmp(lc->uuid, uuid, MAX_NAME_LEN) &&
 		    (uuid_ref == lc->uuid_ref)) {
@@ -660,7 +666,7 @@
 				r = lc;
 		}
 	}
-	spin_unlock(&log_list_lock);
+	up(&log_list_lock);
 
 	return r;
 }
@@ -838,6 +844,7 @@
  * Returns: 0 on success, -1 on error
  */
 static int process_log_request(struct socket *sock){
+	static struct log_request *lr = NULL;
 	int error;
 	uint32_t nodeid;
 	struct msghdr msg;
@@ -845,9 +852,13 @@
 	struct sockaddr_in saddr_in;
 	mm_segment_t fs;
 	struct log_c *lc;
-	struct log_request lr; /* ATTENTION -- could be too much on the stack */
 
-	memset(&lr, 0, sizeof(struct log_request));
+	if (unlikely(!lr))
+		lr = kmalloc(sizeof(*lr), GFP_KERNEL);
+	if (!lr)
+		return -1;
+
+	memset(lr, 0, sizeof(struct log_request));
 	memset(&saddr_in, 0, sizeof(saddr_in));
 		
 	msg.msg_control = NULL;
@@ -858,7 +869,7 @@
 	msg.msg_name = &saddr_in;
 	msg.msg_namelen = sizeof(saddr_in);
 	iov.iov_len = sizeof(struct log_request);
-	iov.iov_base = &lr;
+	iov.iov_base = lr;
 		
 	fs = get_fs();
 	set_fs(get_ds());
@@ -871,14 +882,14 @@
 		if(error < sizeof(struct log_request)){
 			DMERR("Cluster mirror log server received incomplete message.");
 		}
-		lc = get_log_context(lr.lr_uuid, lr.lr_uuid_ref);
+		lc = get_log_context(lr->lr_uuid, lr->lr_uuid_ref);
 
-		if(lr.lr_type == LRT_ELECTION ||
-		   lr.lr_type == LRT_SELECTION ||
-		   lr.lr_type == LRT_MASTER_ASSIGN ||
-		   lr.lr_type == LRT_MASTER_LEAVING){
+		if(lr->lr_type == LRT_ELECTION ||
+		   lr->lr_type == LRT_SELECTION ||
+		   lr->lr_type == LRT_MASTER_ASSIGN ||
+		   lr->lr_type == LRT_MASTER_LEAVING){
 			uint32_t old = (lc)?lc->server_id: 0xDEAD;
-			if(process_election(&lr, lc, &saddr_in)){
+			if(process_election(lr, lc, &saddr_in)){
 				DMERR("Election processing failed.");
 				return -1;
 			}
@@ -896,12 +907,12 @@
 		}
 
 		if(!lc){
-			lr.u.lr_int_rtn = -ENXIO;
+			lr->u.lr_int_rtn = -ENXIO;
 			goto reply;
 		}
 
 		if (lc->server_id != my_id) {
-			lr.u.lr_int_rtn = -ENXIO;
+			lr->u.lr_int_rtn = -ENXIO;
 			goto reply;
 		}
 
@@ -911,23 +922,23 @@
 			DMDEBUG("Getting request while server (%u) is suspended:", my_id);
 			DMDEBUG(" - Requester :: %u", nodeid);
 			DMDEBUG(" - log uuid  :: %s", lc->uuid + (strlen(lc->uuid) - 8));
-			DMDEBUG(" - req type  :: %s", RQ_STRING(lr.lr_type));
+			DMDEBUG(" - req type  :: %s", RQ_STRING(lr->lr_type));
 			*/
 			if (my_id != nodeid) {
-				lr.u.lr_int_rtn = -ENXIO;
+				lr->u.lr_int_rtn = -ENXIO;
 				goto reply;
 			}
 		}			
 
-		switch(lr.lr_type){
+		switch(lr->lr_type){
 		case LRT_IS_CLEAN:
-			error = server_is_clean(lc, &lr);
+			error = server_is_clean(lc, lr);
 			break;
 		case LRT_IS_REMOTE_RECOVERING:
-			error = server_is_remote_recovering(lc, &lr);
+			error = server_is_remote_recovering(lc, lr);
 			break;
 		case LRT_IN_SYNC:
-			error = server_in_sync(lc, &lr);
+			error = server_in_sync(lc, lr);
 			break;
 		case LRT_MARK_REGION:
 			if(!(nodeid = 
@@ -935,8 +946,8 @@
 				error = -ENXIO;
 				break;
 			}
-			error = server_mark_region(lc, &lr, nodeid);
-			lr.u.lr_int_rtn = 0;
+			error = server_mark_region(lc, lr, nodeid);
+			lr->u.lr_int_rtn = 0;
 			break;
 		case LRT_CLEAR_REGION:
 			if(!(nodeid = 
@@ -944,7 +955,10 @@
 				error = -ENXIO;
 				break;
 			}
-			error = server_clear_region(lc, &lr, nodeid);
+			error = server_clear_region(lc, lr, nodeid);
+			break;
+		case LRT_FLUSH:
+			error = server_flush(lc);
 			break;
 		case LRT_GET_RESYNC_WORK:
 			if(!(nodeid = 
@@ -952,14 +966,14 @@
 				error = -ENXIO;
 				break;
 			}
-			error = server_get_resync_work(lc, &lr, nodeid);
+			error = server_get_resync_work(lc, lr, nodeid);
 			break;
 		case LRT_COMPLETE_RESYNC_WORK:
-			error = server_complete_resync_work(lc, &lr, lr.u.lr_int_rtn);
-			lr.u.lr_int_rtn = 0;
+			error = server_complete_resync_work(lc, lr, lr->u.lr_int_rtn);
+			lr->u.lr_int_rtn = 0;
 			break;
 		case LRT_GET_SYNC_COUNT:
-			error = server_get_sync_count(lc, &lr);
+			error = server_get_sync_count(lc, lr);
 			break;
 		default:
 			DMWARN("unknown request type received");
@@ -971,15 +985,15 @@
 		if(error){
 /*
 			DMWARN("Error (%d) while processing request (%s)",
-			       error, RQ_STRING(lr.lr_type));
+			       error, RQ_STRING(lr->lr_type));
 */
-			lr.u.lr_int_rtn = error;
+			lr->u.lr_int_rtn = error;
 		}
 	reply:
     
 		/* Why do we need to reset this? */
 		iov.iov_len = sizeof(struct log_request);
-		iov.iov_base = &lr;
+		iov.iov_base = lr;
 		msg.msg_name = &saddr_in;
 		msg.msg_namelen = sizeof(saddr_in);
 
@@ -991,7 +1005,7 @@
 		set_fs(fs);
 		if(error < 0){
 			DMWARN("unable to sendmsg to client (type = %s, error = %d)",
-			       RQ_STRING(lr.lr_type), error);
+			       RQ_STRING(lr->lr_type), error);
 			return error;
 		}
 	} else if(error == -EAGAIN || error == -ETIMEDOUT){
@@ -1052,7 +1066,7 @@
 			if (atomic_read(&restart_event_type) == SERVICE_NODE_FAILED)
 				DMINFO("A cluster mirror log member has failed.");
 			
-			spin_lock(&log_list_lock);
+			down(&log_list_lock);
 			list_for_each_entry(lc, &log_list_head, log_list){
 				if(lc->server_id == my_id){
 					if (atomic_read(&lc->suspended)) {
@@ -1062,7 +1076,7 @@
 					}
 				}
 			}
-			spin_unlock(&log_list_lock);
+			up(&log_list_lock);
 
 			break;
 		default:
@@ -1150,7 +1164,7 @@
 int start_server(void /* log_devices ? */){
 	int error;
 
-	region_user_pool = mempool_create(100, region_user_alloc,
+	region_user_pool = mempool_create(1000, region_user_alloc,
 					  region_user_free, NULL);
 	if(!region_user_pool){
 		DMWARN("unable to allocate region user pool for server");
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/03/22 22:34:44	1.1.2.2.2.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/04/03 18:23:01	1.1.2.2.2.2
@@ -14,14 +14,15 @@
 #define LRT_IN_SYNC             	3
 #define LRT_MARK_REGION         	4
 #define LRT_CLEAR_REGION        	5
-#define LRT_GET_RESYNC_WORK     	6
-#define LRT_COMPLETE_RESYNC_WORK        7
-#define LRT_GET_SYNC_COUNT      	8
-
-#define LRT_ELECTION			9
-#define LRT_SELECTION			10
-#define LRT_MASTER_ASSIGN		11
-#define LRT_MASTER_LEAVING		12
+#define LRT_FLUSH                       6
+#define LRT_GET_RESYNC_WORK     	7
+#define LRT_COMPLETE_RESYNC_WORK        8
+#define LRT_GET_SYNC_COUNT      	9
+
+#define LRT_ELECTION			10
+#define LRT_SELECTION			11
+#define LRT_MASTER_ASSIGN		12
+#define LRT_MASTER_LEAVING		13
 
 #define CLUSTER_LOG_PORT 51005
 
@@ -29,6 +30,7 @@
 	((x) == LRT_IS_CLEAN) ? "LRT_IS_CLEAN": \
 	((x) == LRT_IN_SYNC) ? "LRT_IN_SYNC": \
 	((x) == LRT_MARK_REGION) ? "LRT_MARK_REGION": \
+	((x) == LRT_FLUSH) ? "LRT_FLUSH": \
 	((x) == LRT_GET_RESYNC_WORK) ? "LRT_GET_RESYNC_WORK": \
 	((x) == LRT_GET_SYNC_COUNT) ? "LRT_GET_SYNC_COUNT": \
 	((x) == LRT_CLEAR_REGION) ? "LRT_CLEAR_REGION": \



