[Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...

Thu Sep 13 15:24:21 UTC 2007

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-09-13 15:24:20

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	Bug 257881: Flush/recovery collision leads to deadlock after leg ...
	
	The procedure for coordinating nominal I/O and recovery I/O, was to
	either:
	1) delay a flush which contained a mark to a region being recovered
	2) skip over regions that are currently marked when assigning recovery
	
	This bug has to do with the way #1 was implemented.
	
	The following scenario would trigger it:
	1) node1 is assigned recovery on region X
	2) node1 also does a mark (write) on region Y
	3) node2 attempts to mark region X
	**) any flush issued here will delay waiting for recovery to complete on X
	4) node1 needs to perform the flush before it can get on with completing
	recovery - but it can't flush, so everyone is delayed *forever*.
	
	The fix was to allow flushes from nodes that are not attempting to mark
	regions that are being recovered.  In the example above, node1 should be
	allowed to complete the flush because it is not trying to write to the
	same region that is being recovered.  node2 would be correctly delayed.
	Since node1 can complete the flush, it can also complete the recovery -
	thus allowing things to proceed.
	
	This bug only affects mirrors that are not in-sync and are doing I/O.
	This bug can occur whether there are device/machine failures or not.
	This bug is most easily reproduced with a number of mirrors, but would
	be possible with just one.
	
	I've also fixed up some debugging output so it is more consistent and
	easier to follow the flow of events.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.50&r2=1.1.2.51
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.36&r2=1.1.2.37

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/08/23 16:51:39	1.1.2.50
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/09/13 15:24:20	1.1.2.51
@@ -946,7 +946,8 @@
 
 	while ((r = consult_server(lc, 0, LRT_FLUSH, NULL))) {
 		if (r == -EBUSY) {
-			DMDEBUG("Delaying flush due to recovery");
+			DMDEBUG("Delaying flush due to recovery (%s)",
+				lc->uuid + (strlen(lc->uuid) - 8));
 			set_current_state(TASK_INTERRUPTIBLE);
 			schedule_timeout(HZ);
 			continue;
@@ -1110,8 +1111,8 @@
 	}
 
 	if (rtn)
-		DMDEBUG("Client received resync work: %Lu/%s",
-			*region, lc->uuid + (strlen(lc->uuid) - 8));
+		DMDEBUG("Received recovery work from %u: %Lu/%s",
+			lc->server_id, *region, lc->uuid + (strlen(lc->uuid) - 8));
 
 	/*
 	 * Check for bug 235039
@@ -1137,12 +1138,19 @@
 	region_t success_tmp = success;
 	struct log_c *lc = (struct log_c *) log->context;
 
+	if (success)
+		DMDEBUG("Client finishing recovery: %Lu/%s",
+			region, lc->uuid + (strlen(lc->uuid) - 8));
+	else
+		DMDEBUG("Notifying server(%u) of sync change: %Lu/%s",
+			lc->server_id, region,
+			lc->uuid + (strlen(lc->uuid) - 8));
 	for (i = 0; i < 5; i++) {
 		if (!consult_server(lc, region,
 				    LRT_COMPLETE_RESYNC_WORK, &success_tmp))
 			break;
 		success_tmp = success;
-		DMWARN("Unable to notify server of completed resync work");
+		DMWARN("Unable to notify server of sync state change");
 	}
 	return;
 }
@@ -1203,6 +1211,7 @@
 		DMDEBUG("LOG INFO:");
 		DMDEBUG("  uuid: %s", lc->uuid);
 		DMDEBUG("  uuid_ref    : %d", lc->uuid_ref);
+		DMDEBUG("  log type    : %s", (lc->log_dev) ? "disk" : "core");
 		DMDEBUG(" ?region_count: %Lu", lc->region_count);
 		DMDEBUG(" ?sync_count  : %Lu", lc->sync_count);
 		DMDEBUG(" ?sync_search : %d", lc->sync_search);
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/07/11 16:18:03	1.1.2.36
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/09/13 15:24:20	1.1.2.37
@@ -549,10 +549,11 @@
 }
 
 
-static int server_flush(struct log_c *lc)
+static int server_flush(struct log_c *lc, uint32_t who)
 {
 	int r = 0;
 	int count = 0;
+	int do_flush = 1;
 	struct region_user *ru, *marker = NULL, *recoverer = NULL;
 
 	if (lc->recovering_region != (uint64_t)-1) {
@@ -560,22 +561,28 @@
 			if (ru->ru_region == lc->recovering_region) {
 				if (ru->ru_rw == RU_RECOVER)
 					recoverer = ru;
-				else
+				else if (ru->ru_nodeid == who) {
+					do_flush = 0;
 					marker = ru;
+				} else
+					marker = ru;
+
 				count++;
 			}
 
 		if (marker && recoverer) {
-			DMDEBUG("Flush/recovery collision (%Lu/%s): count = %d, marker = %u, recoverer = %u",
+			DMDEBUG("Flush/recovery collision on %Lu/%s: count = %d, marker = %u, recoverer = %u",
 				marker->ru_region, lc->uuid + (strlen(lc->uuid) - 8),
 				count, marker->ru_nodeid, recoverer->ru_nodeid);
-			/*
-			DMDEBUG("  sync_bit: %s, clean_bit: %s",
-				log_test_bit(lc->sync_bits, lc->recovering_region) ? "set" : "unset",
-				log_test_bit(lc->clean_bits, lc->recovering_region) ? "set" : "unset");
-			*/
-
-			return -EBUSY;
+			DMDEBUG("  Count     = %d", count);
+			DMDEBUG("  Marker    = %u", marker->ru_nodeid);
+			DMDEBUG("  Recoverer = %u", recoverer->ru_nodeid);
+			DMDEBUG("  Flusher   = %u", who);
+			if (!do_flush) {
+				DMDEBUG("Blocking flush");
+				return -EBUSY;
+			} else
+				DMDEBUG("Allowing flush");
 		}
 	}
 
@@ -650,8 +657,8 @@
 	new->ru_region = *region;
 	new->ru_rw = RU_RECOVER;
 	list_add(&new->ru_list, &lc->region_users);
-	DMDEBUG("Assigning recovery work to %u/%s: %Lu",
-		who, lc->uuid + (strlen(lc->uuid) - 8), new->ru_region);
+	DMDEBUG("Assigning recovery work to %u: %Lu/%s",
+		who, new->ru_region, lc->uuid + (strlen(lc->uuid) - 8));
 
 	return 0;
 }
@@ -680,7 +687,8 @@
 		}
 		lc->sync_pass = 0;
 
-		DMDEBUG("Resync work completed: %Lu", lr->u.lr_region);
+		DMDEBUG("Resync work completed by %u: %Lu/%s",
+			who, lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
 		return 0;
 	}
 
@@ -1077,7 +1085,12 @@
 			error = server_clear_region(lc, lr, nodeid);
 			break;
 		case LRT_FLUSH:
-			error = server_flush(lc);
+			if(!(nodeid = 
+			     ipaddr_to_nodeid((struct sockaddr *)msg.msg_name))){
+				error = -ENXIO;
+				break;
+			}
+			error = server_flush(lc, nodeid);
 			break;
 		case LRT_GET_RESYNC_WORK:
 			if(!(nodeid =