[Cluster-devel] cluster/group/daemon app.c cpg.c gd_internal.h ...

teigland at sourceware.org teigland at sourceware.org
Tue Sep 26 19:17:22 UTC 2006


CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	teigland at sourceware.org	2006-09-26 19:17:21

Modified files:
	group/daemon   : app.c cpg.c gd_internal.h joinleave.c main.c 

Log message:
	Add debugging in four areas to help us know more quickly when something
	might be wrong at the cpg level:
	- log when cpg flow control is on, and when it goes back off
	- log when we're waiting to receive a cpg event for our own join
	- when we're in a FOO_STOP_WAIT or FOO_START_WAIT state, log how
	many more cpg messages we're waiting to receive before moving on
	to the next state
	- save the event id of the last cpg message we sent, and clear that
	value when we receive that message back (if the value is non-zero it
	is written to the debug log when someone runs group_tool; it is not
	shown in the group_tool output itself)

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/app.c.diff?cvsroot=cluster&r1=1.45&r2=1.46
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/cpg.c.diff?cvsroot=cluster&r1=1.31&r2=1.32
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/gd_internal.h.diff?cvsroot=cluster&r1=1.40&r2=1.41
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/joinleave.c.diff?cvsroot=cluster&r1=1.17&r2=1.18
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/main.c.diff?cvsroot=cluster&r1=1.44&r2=1.45

--- cluster/group/daemon/app.c	2006/09/15 20:07:15	1.45
+++ cluster/group/daemon/app.c	2006/09/26 19:17:20	1.46
@@ -692,6 +692,7 @@
 	msg_bswap_out(&msg);
 
 	log_group(g, "send stopped");
+	g->app->sent_event_id = ev->id;
 	return send_message_groupd(g, &msg, sizeof(msg));
 }
 
@@ -710,6 +711,7 @@
 	msg_bswap_out(&msg);
 
 	log_group(g, "send started");
+	g->app->sent_event_id = ev->id;
 	return send_message_groupd(g, &msg, sizeof(msg));
 }
 
@@ -788,7 +790,6 @@
 	}
 }
 
-#if 0
 static int count_nodes_not_stopped(app_t *a)
 {
 	node_t *node;
@@ -800,7 +801,6 @@
 	}
 	return i;
 }
-#endif
 
 int event_state_begin(app_t *a)
 {
@@ -853,7 +853,7 @@
 	event_t *ev = a->current_event;
 	node_t *node, *n;
 	struct nodeid *id;
-	int rv = 0, do_start = 0;
+	int rv = 0, do_start = 0, count;
 
 	if (!(event_state_stopping(a) || event_state_starting(a)))
 		log_group(g, "process_current_event %llx %d %s",
@@ -904,10 +904,9 @@
 		break;
 
 	case EST_JOIN_STOP_WAIT:
-		/*
 		count = count_nodes_not_stopped(a);
-		log_group(g, "waiting for %d more nodes to be stopped", count);
-		*/
+		log_group(g, "waiting for %d more stopped messages "
+			  "before JOIN_ALL_STOPPED", count);
 		break;
 
 	case EST_JOIN_ALL_STOPPED:
@@ -939,10 +938,9 @@
 		break;
 
 	case EST_LEAVE_STOP_WAIT:
-		/*
 		count = count_nodes_not_stopped(a);
-		log_group(g, "waiting for %d more nodes to be stopped", count);
-		*/
+		log_group(g, "waiting for %d more stopped messages "
+			  "before LEAVE_ALL_STOPPED", count);
 		break;
 
 	case EST_LEAVE_ALL_STOPPED:
@@ -993,10 +991,9 @@
 		break;
 
 	case EST_FAIL_STOP_WAIT:
-		/*
 		count = count_nodes_not_stopped(a);
-		log_group(g, "waiting for %d more nodes to be stopped", count);
-		*/
+		log_group(g, "waiting for %d more stopped messages "
+			  "before FAIL_ALL_STOPPED", count);
 		break;
 
 	case EST_FAIL_ALL_STOPPED:
@@ -1470,8 +1467,11 @@
 		}
 
 		if (ev) {
+			a->need_first_event = 0;
 			a->current_event = ev;
 			rv = process_current_event(g);
+		} else if (a->need_first_event) {
+			log_group(g, "waiting for our own cpg join event");
 		}
 	}
  out:
--- cluster/group/daemon/cpg.c	2006/09/08 23:14:56	1.31
+++ cluster/group/daemon/cpg.c	2006/09/26 19:17:20	1.32
@@ -20,6 +20,7 @@
 static int			saved_left_count;
 static cpg_handle_t		saved_handle;
 static struct cpg_name		saved_name;
+static int			message_flow_control_on;
 
 
 static node_t *find_group_node(group_t *g, int nodeid)
@@ -246,6 +247,9 @@
 		  msg_type(msg->ms_type));
 	*/
 
+	if (nodeid == our_nodeid && g->app->sent_event_id == msg->ms_event_id)
+		g->app->sent_event_id = 0;
+
 	save = malloc(sizeof(struct save_msg));
 	memset(save, 0, sizeof(struct save_msg));
 	save->nodeid = nodeid;
@@ -375,6 +379,7 @@
 	cpg_error_t error;
 	cpg_handle_t handle;
 	int found = 0;
+	cpg_flow_control_state_t flow_control_state;
 
 	if (ci == groupd_ci) {
 		handle = groupd_handle;
@@ -404,6 +409,18 @@
 		return;
 	}
 
+	error = cpg_flow_control_state_get(handle, &flow_control_state);
+	if (error != CPG_OK)
+		log_error(g, "cpg_flow_control_state_get %d", error);
+	else if (flow_control_state == CPG_FLOW_CONTROL_ENABLED) {
+		message_flow_control_on = 1;
+		log_debug("flow control on");
+	} else {
+		if (message_flow_control_on)
+			log_debug("flow control off");
+		message_flow_control_on = 0;
+	}
+
 	if (got_confchg)
 		process_confchg();
 }
--- cluster/group/daemon/gd_internal.h	2006/09/15 18:20:36	1.40
+++ cluster/group/daemon/gd_internal.h	2006/09/26 19:17:21	1.41
@@ -180,6 +180,8 @@
 	struct list_head	events;
 	event_t			*current_event;
 	group_t			*g;
+	uint64_t		sent_event_id; /* for debugging */
+	int			need_first_event; /* for debugging */
 };
 
 #define MSG_APP_STOPPED        1
--- cluster/group/daemon/joinleave.c	2006/06/28 22:16:36	1.17
+++ cluster/group/daemon/joinleave.c	2006/09/26 19:17:21	1.18
@@ -63,6 +63,7 @@
 	a = malloc(sizeof(app_t));
 	memset(a, 0, sizeof(app_t));
 
+	a->need_first_event = 1;
 	INIT_LIST_HEAD(&a->nodes);
 	INIT_LIST_HEAD(&a->events);
 	a->g = g;
--- cluster/group/daemon/main.c	2006/09/15 18:20:36	1.44
+++ cluster/group/daemon/main.c	2006/09/26 19:17:21	1.45
@@ -462,6 +462,10 @@
 			else
 				data->event_local_status = -1;
 		}
+
+		if (g->app->sent_event_id)
+			log_group(g, "sent_event_id %llx",
+				  g->app->sent_event_id);
 	}
 
 	data->member_count = g->app->node_count;




More information about the Cluster-devel mailing list