[Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...

jbrassow at sourceware.org jbrassow at sourceware.org
Fri Sep 21 20:07:38 UTC 2007


CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-09-21 20:07:37

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 
	                    dm-cmirror-xfr.h 

Log message:
	Bug 291521: Cluster mirror can become out-of-sync if nominal I/O overla...
	
	It is insufficient to simply delay flush requests that have marks
	pending to a recovering region.  Although a collision between nominal
	I/O and resync I/O can be avoided this way, the state of the region
	changes from RH_NOSYNC to RH_CLEAN in the mean time.  The machine
	being delayed will think the region is still in the RH_NOSYNC state
	and only write to the primary device... leaving the other mirror
	devices out-of-sync.
	
	We must delay writes to remotely recovering regions before the state
	of the region is determined and cached in the region caching code...
	The entry point for this already exists in 'is_remote_recovering'.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.51&r2=1.1.2.52
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.37&r2=1.1.2.38
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.6&r2=1.1.2.7

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/09/13 15:24:20	1.1.2.51
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/09/21 20:07:37	1.1.2.52
@@ -858,7 +858,15 @@
 
 static int cluster_is_remote_recovering(struct dirty_log *log, region_t region)
 {
-	return 0;
+	int rtn;
+	struct log_c *lc = (struct log_c *) log->context;
+ 	 
+	if (atomic_read(&lc->in_sync) == 1) {
+		return 0;
+	}
+
+	rtn = consult_server(lc, region, LRT_IS_REMOTE_RECOVERING, NULL);
+	return rtn;
 }
 
 static int cluster_in_sync(struct dirty_log *log, region_t region, int block)
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/09/13 15:24:20	1.1.2.37
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/09/21 20:07:37	1.1.2.38
@@ -444,6 +444,24 @@
 	return 0;
 }
 
+static int server_is_remote_recovering(struct log_c *lc, struct log_request *lr)
+{
+	uint64_t high, low;
+
+	high = lc->sync_search + 10;
+	low = (lc->recovering_region != (uint64_t)-1) ?
+		lc->recovering_region :
+		lc->sync_search;
+	if ((lr->u.lr_region >= low) && (lr->u.lr_region <= high)) {
+		DMDEBUG("Remote recovery conflict: %Lu/%s",
+			lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
+		lr->u.lr_int_rtn = 1;
+	} else
+		lr->u.lr_int_rtn = 0;
+
+	return 0;
+}
+
 static int server_in_sync(struct log_c *lc, struct log_request *lr)
 {
 	if (lr->u.lr_region > lc->region_count) {
@@ -485,51 +503,28 @@
 		list_add(&new->ru_list, &lc->region_users);
 	} else if (ru->ru_rw == RU_RECOVER) {
 		/*
-		 * The flush will block if a write conflicts with a
-		 * recovering region.  In the meantime, we add this
-		 * entry to the tail of the list so the recovery
-		 * gets cleared first.
-		 */
-		DMDEBUG("Attempt to mark a region " SECTOR_FORMAT
-		      "/%s which is being recovered.",
-		       lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
-		DMDEBUG("Current recoverer: %u", ru->ru_nodeid);
-		DMDEBUG("Mark requester   : %u", who);
-		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
-		list_add_tail(&new->ru_list, &lc->region_users);
+		 * A mark that happens to a region in recovery
+		 * means certain corruption.
+		 */
+		DMERR("Mark attempted to recovering region by %u: %Lu/%s",
+		      who, lr->u.lr_region,
+		      lc->uuid + (strlen(lc->uuid) - 8));
+		DMERR("  lc->recovering_region = %Lu", lc->recovering_region);
+		DMERR("  ru->ru_rw             = %d", ru->ru_rw);
+		DMERR("  ru->ru_nodeid         = %u", ru->ru_nodeid);
+		DMERR("  ru->ru_region         = %Lu", ru->ru_region);
+		BUG();
 	} else {
 		list_add(&new->ru_list, &ru->ru_list);
 	}
 
-	/*
-	if (!(ru = find_ru_by_region(lc, lr->u.lr_region))) {
-		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
-		list_add(&new->ru_list, &lc->region_users);
-	} else if (ru->ru_rw == RU_RECOVER) {
-		DMDEBUG("Attempt to mark a region " SECTOR_FORMAT
-		      "/%s which is being recovered.",
-		       lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
-		DMDEBUG("Current recoverer: %u", ru->ru_nodeid);
-		DMDEBUG("Mark requester   : %u", who);
-		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
-		list_add_tail(&new->ru_list, &lc->region_users);
-	} else if (!find_ru(lc, who, lr->u.lr_region)) {
-		list_add(&new->ru_list, &ru->ru_list);
-	} else {
-		DMWARN("Attempt to mark a already marked region (%u,"
-		       SECTOR_FORMAT
-		       "/%s)",
-		       who, lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
-		mempool_free(new, region_user_pool);
-	}
-	*/
-
 	return 0;
 }
 
 
 static int server_clear_region(struct log_c *lc, struct log_request *lr, uint32_t who)
 {
+	int check_bug = 0;
 	struct region_user *ru;
 
 	ru = find_ru(lc, who, lr->u.lr_region);
@@ -538,13 +533,22 @@
 		       who, lr->u.lr_region);
 		return -EINVAL;
 	} else {
+		if (lc->recovering_region == lr->u.lr_region) {
+			lc->recovering_region = (uint64_t)-1;
+			check_bug = 1;
+		}
 		list_del(&ru->ru_list);
 		mempool_free(ru, region_user_pool);
 	}
 
-	if(!find_ru_by_region(lc, lr->u.lr_region)){
+	if (!find_ru_by_region(lc, lr->u.lr_region)) {
 		log_set_bit(lc, lc->clean_bits, lr->u.lr_region);
+	} else if (check_bug) {
+		DMERR("Multiple marks exist on a region being recovered: %Lu/%s",
+		      lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
+		BUG();
 	}
+		
 	return 0;
 }
 
@@ -552,37 +556,17 @@
 static int server_flush(struct log_c *lc, uint32_t who)
 {
 	int r = 0;
-	int count = 0;
-	int do_flush = 1;
-	struct region_user *ru, *marker = NULL, *recoverer = NULL;
+	struct region_user *ru;
 
 	if (lc->recovering_region != (uint64_t)-1) {
-		list_for_each_entry(ru, &lc->region_users, ru_list)
-			if (ru->ru_region == lc->recovering_region) {
-				if (ru->ru_rw == RU_RECOVER)
-					recoverer = ru;
-				else if (ru->ru_nodeid == who) {
-					do_flush = 0;
-					marker = ru;
-				} else
-					marker = ru;
-
-				count++;
-			}
-
-		if (marker && recoverer) {
-			DMDEBUG("Flush/recovery collision on %Lu/%s: count = %d, marker = %u, recoverer = %u",
-				marker->ru_region, lc->uuid + (strlen(lc->uuid) - 8),
-				count, marker->ru_nodeid, recoverer->ru_nodeid);
-			DMDEBUG("  Count     = %d", count);
-			DMDEBUG("  Marker    = %u", marker->ru_nodeid);
-			DMDEBUG("  Recoverer = %u", recoverer->ru_nodeid);
-			DMDEBUG("  Flusher   = %u", who);
-			if (!do_flush) {
-				DMDEBUG("Blocking flush");
-				return -EBUSY;
-			} else
-				DMDEBUG("Allowing flush");
+		list_for_each_entry(ru, &lc->region_users, ru_list) {
+			if ((ru->ru_region == lc->recovering_region) &&
+			    (ru->ru_rw != RU_RECOVER)) {
+				DMERR("Flush attempted to recovering region by %u: %Lu/%s",
+				      who, lc->recovering_region,
+				      lc->uuid + (strlen(lc->uuid) - 8));
+				BUG();
+			}
 		}
 	}
 
@@ -601,7 +585,6 @@
 static int server_get_resync_work(struct log_c *lc, struct log_request *lr, uint32_t who)
 {
 	struct region_user *new;
-	int sync_search, conflict = 0;
 	region_t *region = &(lr->u.lr_region_rtn);
 
 	lr->u.lr_int_rtn = 0; /* Default to no work */
@@ -625,19 +608,13 @@
 		}
 	}
 
-	for (sync_search = lc->sync_search;
-	     sync_search < lc->region_count;
-	     sync_search = (*region + 1)) {
-		*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
-						  lc->region_count,
-						  sync_search);
-		if (find_ru_by_region(lc, *region)) {
-			conflict = 1;
-			DMDEBUG("Recovery blocked by outstanding write on region %Lu/%s",
-				*region, lc->uuid + (strlen(lc->uuid) - 8));
-		} else {
-			break;
-		}
+	*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
+					  lc->region_count,
+					  lc->sync_search);
+	if (find_ru_by_region(lc, *region)) {
+		DMDEBUG("Recovery blocked by outstanding write on region %Lu/%s",
+			*region, lc->uuid + (strlen(lc->uuid) - 8));
+		return 0;
 	}
 
 	if (*region >= lc->region_count)
@@ -647,8 +624,7 @@
 	if (!new)
 		return -ENOMEM;
 
-	if (!conflict)
-		lc->sync_search = *region + 1;
+	lc->sync_search = *region + 1;
 
 	lc->recovering_region = *region;
 
@@ -678,13 +654,18 @@
 			return -EINVAL;
 		}
 
-		lc->recovering_region = (uint64_t)-1;
-
 		/* We could receive multiple identical request due to network failure */
-		if(!log_test_bit(lc->sync_bits, lr->u.lr_region)) {
+		if (!log_test_bit(lc->sync_bits, lr->u.lr_region)) {
 			log_set_bit(lc, lc->sync_bits, lr->u.lr_region);
 			lc->sync_count++;
 		}
+
+		/*
+		 * We will: 
+		 * lc->recovering_region = (uint64_t)-1;
+		 * in clear_region so we can do extra validation
+		 */
+
 		lc->sync_pass = 0;
 
 		DMDEBUG("Resync work completed by %u: %Lu/%s",
@@ -1064,6 +1045,9 @@
 		case LRT_IS_CLEAN:
 			error = server_is_clean(lc, lr);
 			break;
+		case LRT_IS_REMOTE_RECOVERING:
+			error = server_is_remote_recovering(lc, lr);
+			break;
 		case LRT_IN_SYNC:
 			error = server_in_sync(lc, lr);
 			break;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/07/11 16:18:03	1.1.2.6
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/09/21 20:07:37	1.1.2.7
@@ -10,18 +10,19 @@
 #define MAX_NAME_LEN 128
 
 #define LRT_IS_CLEAN			1
-#define LRT_IN_SYNC             	2
-#define LRT_MARK_REGION         	3
-#define LRT_CLEAR_REGION        	4
-#define LRT_FLUSH                       5
-#define LRT_GET_RESYNC_WORK     	6
-#define LRT_COMPLETE_RESYNC_WORK        7
-#define LRT_GET_SYNC_COUNT      	8
-
-#define LRT_ELECTION			9
-#define LRT_SELECTION			10
-#define LRT_MASTER_ASSIGN		11
-#define LRT_MASTER_LEAVING		12
+#define LRT_IS_REMOTE_RECOVERING	2
+#define LRT_IN_SYNC             	3
+#define LRT_MARK_REGION         	4
+#define LRT_CLEAR_REGION        	5
+#define LRT_FLUSH                       6
+#define LRT_GET_RESYNC_WORK     	7
+#define LRT_COMPLETE_RESYNC_WORK        8
+#define LRT_GET_SYNC_COUNT      	9
+
+#define LRT_ELECTION			10
+#define LRT_SELECTION			11
+#define LRT_MASTER_ASSIGN		12
+#define LRT_MASTER_LEAVING		13
 
 #define CLUSTER_LOG_PORT 51005
 




More information about the Cluster-devel mailing list