[Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...

jbrassow at sourceware.org jbrassow at sourceware.org
Tue Apr 24 20:10:22 UTC 2007


CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL45
Changes by:	jbrassow at sourceware.org	2007-04-24 21:10:20

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 
	                    dm-cmirror-server.h dm-cmirror-xfr.h 

Log message:
	Bug 199433: NULL pointer dereference in cman:process_messages for cmirro...
	- While this isn't a complete fix for 199433, it addresses what is
	most likely the cause of the error: cluster mirrors were steadily
	leaking memory every time they were deactivated.
	
	Bug 237028: cmirror recovery deadlock due to machine failure + primary l...
	- If resync work is still outstanding when the server is told to
	suspend, delay the suspend briefly to let that work finish.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.41.2.4&r2=1.1.2.41.2.5
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.26.2.6&r2=1.1.2.26.2.7
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.h.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.2.8.1&r2=1.1.2.2.8.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.2.2.2&r2=1.1.2.2.2.3

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/10 07:13:15	1.1.2.41.2.4
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/24 20:10:20	1.1.2.41.2.5
@@ -379,7 +379,8 @@
 
 	if(len <= 0){
 		/* ATTENTION -- what do we do with this ? */
-		DMWARN("Error while listening for server response: %d", len);
+		DMWARN("Error listening for server(%u) response for %s: %d",
+		       lc->server_id, lc->uuid + (strlen(lc->uuid) - 8), len);
 		error = len;
 		*retry = 1;
 		seq++;
@@ -767,6 +768,7 @@
 
 static int cluster_postsuspend(struct dirty_log *log)
 {
+	int i;
 	struct region_state *rs, *tmp_rs;
 	struct log_c *lc = (struct log_c *) log->context;
 
@@ -788,10 +790,20 @@
 
 	spin_unlock(&lc->state_lock);
 
+	if(lc->server_id == my_id) {
+		for (i = 0; server_busy(lc) && (i < 10); i++) {
+			DMDEBUG("Server for %s still busy, waiting for others",
+				lc->uuid + (strlen(lc->uuid) - 8));
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(HZ*2);
+		}
+	}
+
 	atomic_set(&lc->suspended, 1);
 	if(lc->server_id == my_id) {
 		while (1) {
-			DMDEBUG("Telling everyone I'm suspending");
+			DMDEBUG("Telling everyone I'm suspending (%s)",
+				lc->uuid + (strlen(lc->uuid) - 8));
 			consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 
 			down(&consult_server_lock);
@@ -799,13 +811,15 @@
 			up(&consult_server_lock);
 
 			if ((my_id && (lc->server_id == my_id))) {
-				DMDEBUG("Delaying suspend, work to be done.");
+				DMDEBUG("Delaying suspend, work to be done (%s)",
+					lc->uuid + (strlen(lc->uuid) - 8));
 				atomic_set(&lc->suspended, 0);
 				set_current_state(TASK_INTERRUPTIBLE);
 				schedule_timeout(HZ*2);
 				atomic_set(&lc->suspended, 1);
 			} else {
-				DMDEBUG("Suspending now");
+				DMDEBUG("Suspending now (%s)",
+					lc->uuid + (strlen(lc->uuid) - 8));
 				break;
 			}
 		}
@@ -1196,6 +1210,16 @@
 
 	switch(status){
 	case STATUSTYPE_INFO:
+		DMDEBUG("LOG INFO:");
+		DMDEBUG("  uuid: %s", lc->uuid);
+		DMDEBUG("  uuid_ref    : %d", lc->uuid_ref);
+		DMDEBUG(" ?region_count: %Lu", lc->region_count);
+		DMDEBUG(" ?sync_count  : %Lu", lc->sync_count);
+		DMDEBUG(" ?sync_search : %d", lc->sync_search);
+		DMDEBUG("  in_sync     : %s", (atomic_read(&lc->in_sync)) ? "YES" : "NO");
+		DMDEBUG("  server_id   : %u", lc->server_id);
+		DMDEBUG("  server_valid: %s",
+			((lc->server_id != 0xDEAD) && lc->server_valid) ? "YES" : "NO");
 		if(lc->sync != DEFAULTSYNC)
 			arg_count++;
 
@@ -1254,11 +1278,6 @@
 	}
 	up(&log_list_lock);
 
-	/*
-	if (likely(!shutting_down))
-		suspend_server();
-	*/
-
 	return 0;
 }
 
@@ -1311,9 +1330,7 @@
 		BUG();
 		break;
 	}
-	/*
-	resume_server();
-	*/
+
 	return 0;
 }
 
@@ -1452,6 +1469,7 @@
 	r = dm_register_dirty_log_type(&_clustered_core_type);
 	if (r) {
 		DMWARN("couldn't register clustered_core dirty log type");
+		mempool_destroy(region_state_pool);
 		return r;
 	}
 
@@ -1459,6 +1477,7 @@
 	if (r) {
 		DMWARN("couldn't register clustered_disk dirty log type");
 		dm_unregister_dirty_log_type(&_clustered_core_type);
+		mempool_destroy(region_state_pool);
 		return r;
 	}
 
@@ -1475,6 +1494,7 @@
 	}
 	dm_unregister_dirty_log_type(&_clustered_core_type);
 	dm_unregister_dirty_log_type(&_clustered_disk_type);
+	mempool_destroy(region_state_pool);
         DMINFO("dm-cmirror %s (built %s %s) removed",
                CMIRROR_RELEASE_NAME, __DATE__, __TIME__);
 }
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/10 18:10:42	1.1.2.26.2.6
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/24 20:10:20	1.1.2.26.2.7
@@ -42,8 +42,6 @@
 static atomic_t server_run;
 static struct completion server_completion;
 
-static wait_queue_head_t _suspend_queue;
-static atomic_t _suspend;
 static atomic_t _do_requests;
 
 static int debug_disk_write = 0;
@@ -706,8 +704,8 @@
 	}
 
 	if (!ru) {
-		DMERR("Unable to find region to be marked out-of-sync: %Lu/%s/%u",
-		      lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8), who);
+		DMDEBUG("Unable to find region to be marked out-of-sync: %Lu/%s/%u",
+			lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8), who);
 		/*
 		 * This is a valid case, when the following happens:
 		 * 1) a region is recovering and has waiting writes
@@ -798,6 +796,12 @@
 	uint32_t lowest, next;
 	uint32_t node_count=global_count, *nodeids=global_nodeids;
 
+	DMDEBUG("%s(%d): (%s)", RQ_STRING(lr->lr_type), lr->lr_type,
+		(lc) ? lc->uuid + (strlen(lc->uuid) - 8) : "none");
+	DMDEBUG("  starter     : %u", lr->u.lr_starter);
+	DMDEBUG("  co-ordinator: %u", lr->u.lr_coordinator);
+	DMDEBUG("  node_count  : %d", lr->u.lr_node_count);
+
 	/* Record the starter's port number so we can get back to him */
 	if((lr->u.lr_starter == my_id) && (!lr->u.lr_node_count)){
 		lr->u.lr_starter_port = saddr->sin_port;
@@ -1175,12 +1179,12 @@
 
 	complete(&server_completion);
   
+	DMDEBUG("cluster_log_serverd ready for work");
 	for(;;){
 		if(!atomic_read(&server_run)){
 			break;
 		}
 
-		suspend_on(&_suspend_queue, atomic_read(&_suspend));
 		switch(atomic_read(&restart_event_type)){
 		case SERVICE_NODE_LEAVE:
 			/* ATTENTION -- may wish to check if regions **
@@ -1206,6 +1210,9 @@
 			up(&log_list_lock);
 
 			break;
+		case SERVICE_NODE_JOIN:
+			DMDEBUG("Node joining");
+			break;
 		default:
 			/* Someone has joined, or there is no event */
 			break;
@@ -1227,6 +1234,7 @@
 		schedule();
 	}
 
+	DMDEBUG("Closing socket on server side");
 	sock_release(sock);
 	complete(&server_completion);
 	return 0;
@@ -1244,8 +1252,6 @@
 void print_server_status(struct log_c *lc){
 	int i;
 
-	atomic_set(&_suspend, 1);
-
 	DMINFO("SERVER OUTPUT::");
 
 	DMINFO("  Live nodes        :: %d", global_count);
@@ -1267,11 +1273,18 @@
 	i = print_zero_bits((unsigned char *)lc->sync_bits, 0, lc->bitset_uint32_count);
 	DMINFO("  Total = %d", i);
 
-	atomic_set(&_suspend, 0);
-	wake_up_all(&_suspend_queue);
 }
 */
 
+int server_busy(struct log_c *lc)
+{
+	if (!list_empty(&lc->region_users) ||
+	    (lc->recovering_region != (uint64_t)-1))
+		return 1;
+	else
+		return 0;
+}
+
 int server_free_region_users(struct log_c *lc)
 {
 	int i = 0;
@@ -1287,18 +1300,6 @@
 	return 0;
 }
 
-
-int suspend_server(void){
-	atomic_set(&_suspend, 1);
-	return 0;
-}
-
-int resume_server(void){
-	atomic_set(&_suspend, 0);
-	wake_up_all(&_suspend_queue);
-	return 0;
-}
-
 int resume_server_requests(void) {
 	atomic_set(&_do_requests, 1);
 	return 0;
@@ -1307,6 +1308,7 @@
 int start_server(void /* log_devices ? */){
 	int error;
 
+	DMDEBUG("start_server called");
 	region_user_pool = mempool_create(1000, region_user_alloc,
 					  region_user_free, NULL);
 	if(!region_user_pool){
@@ -1314,20 +1316,20 @@
 		return -ENOMEM;
 	}
 
-	init_waitqueue_head(&_suspend_queue);
-
 	atomic_set(&_do_requests, 0);
 	atomic_set(&server_run, 1);
 	init_completion(&server_completion);
 
 	error = kernel_thread(cluster_log_serverd, NULL, 0);
 	if(error < 0){
+		mempool_destroy(region_user_pool);
 		DMWARN("failed to start kernel thread.");
 		return error;
 	}
 	wait_for_completion(&server_completion);
 
 	if(!atomic_read(&server_run)){
+		mempool_destroy(region_user_pool);
 		DMWARN("Cluster mirror log server thread failed to start");
 		return -1;
 	}
@@ -1337,9 +1339,17 @@
 
 
 void stop_server(void){
+	DMDEBUG("stop_server called");
 	atomic_set(&server_run, 0);
 
 	wait_for_completion(&server_completion);
+	down(&log_list_lock);
+	if (!list_empty(&log_list_head)) {
+		DMERR("Log elements remain at cluster log server shutdown");
+	}
+	up(&log_list_lock);
+	mempool_destroy(region_user_pool);
+
 	dm_io_put(32);
 }
 /*
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h	2007/04/10 07:13:15	1.1.2.2.8.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h	2007/04/24 20:10:20	1.1.2.2.8.2
@@ -7,7 +7,7 @@
 #ifndef __DM_CMIRROR_SERVER_H__
 #define __DM_CMIRROR_SERVER_H__
 
-int suspend_server(void);
+int server_busy(struct log_c *lc);
 int resume_server(void);
 int resume_server_requests(void);
 int start_server(void);
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/04/03 18:23:01	1.1.2.2.2.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/04/24 20:10:20	1.1.2.2.2.3
@@ -30,14 +30,15 @@
 	((x) == LRT_IS_CLEAN) ? "LRT_IS_CLEAN": \
 	((x) == LRT_IN_SYNC) ? "LRT_IN_SYNC": \
 	((x) == LRT_MARK_REGION) ? "LRT_MARK_REGION": \
+	((x) == LRT_CLEAR_REGION) ? "LRT_CLEAR_REGION": \
 	((x) == LRT_FLUSH) ? "LRT_FLUSH": \
 	((x) == LRT_GET_RESYNC_WORK) ? "LRT_GET_RESYNC_WORK": \
-	((x) == LRT_GET_SYNC_COUNT) ? "LRT_GET_SYNC_COUNT": \
-	((x) == LRT_CLEAR_REGION) ? "LRT_CLEAR_REGION": \
 	((x) == LRT_COMPLETE_RESYNC_WORK) ? "LRT_COMPLETE_RESYNC_WORK": \
-	((x) == LRT_MASTER_LEAVING) ? "LRT_MASTER_LEAVING": \
+	((x) == LRT_GET_SYNC_COUNT) ? "LRT_GET_SYNC_COUNT": \
 	((x) == LRT_ELECTION) ? "LRT_ELECTION": \
-	((x) == LRT_SELECTION) ? "LRT_SELECTION": "UNKNOWN"
+	((x) == LRT_SELECTION) ? "LRT_SELECTION": \
+	((x) == LRT_MASTER_ASSIGN) ? "LRT_MASTER_ASSIGN": \
+	((x) == LRT_MASTER_LEAVING) ? "LRT_MASTER_LEAVING" : "UNKNOWN"
 
 struct log_request {
 	int lr_type;




More information about the Cluster-devel mailing list