[Cluster-devel] cluster/group/daemon app.c cpg.c gd_internal.h ...
teigland at sourceware.org
teigland at sourceware.org
Tue Sep 26 19:17:22 UTC 2006
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: teigland at sourceware.org 2006-09-26 19:17:21
Modified files:
group/daemon : app.c cpg.c gd_internal.h joinleave.c main.c
Log message:
Add debug logging in four areas so we can tell more quickly when something
might be wrong at the cpg level:
- log when cpg flow control turns on (and when it turns back off)
- log when we're waiting to receive a cpg event for our own join
- when we're in a FOO_STOP_WAIT or FOO_START_WAIT state, log how
many more cpg messages we're waiting to receive before moving on
to the next state
- save the event id of the last cpg message we sent, and clear that
value when we receive that message back (a non-zero value is written
to the debug log when someone runs group_tool; it is not shown in
the group_tool output)
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/app.c.diff?cvsroot=cluster&r1=1.45&r2=1.46
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/cpg.c.diff?cvsroot=cluster&r1=1.31&r2=1.32
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/gd_internal.h.diff?cvsroot=cluster&r1=1.40&r2=1.41
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/joinleave.c.diff?cvsroot=cluster&r1=1.17&r2=1.18
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/main.c.diff?cvsroot=cluster&r1=1.44&r2=1.45
--- cluster/group/daemon/app.c 2006/09/15 20:07:15 1.45
+++ cluster/group/daemon/app.c 2006/09/26 19:17:20 1.46
@@ -692,6 +692,7 @@
msg_bswap_out(&msg);
log_group(g, "send stopped");
+ g->app->sent_event_id = ev->id;
return send_message_groupd(g, &msg, sizeof(msg));
}
@@ -710,6 +711,7 @@
msg_bswap_out(&msg);
log_group(g, "send started");
+ g->app->sent_event_id = ev->id;
return send_message_groupd(g, &msg, sizeof(msg));
}
@@ -788,7 +790,6 @@
}
}
-#if 0
static int count_nodes_not_stopped(app_t *a)
{
node_t *node;
@@ -800,7 +801,6 @@
}
return i;
}
-#endif
int event_state_begin(app_t *a)
{
@@ -853,7 +853,7 @@
event_t *ev = a->current_event;
node_t *node, *n;
struct nodeid *id;
- int rv = 0, do_start = 0;
+ int rv = 0, do_start = 0, count;
if (!(event_state_stopping(a) || event_state_starting(a)))
log_group(g, "process_current_event %llx %d %s",
@@ -904,10 +904,9 @@
break;
case EST_JOIN_STOP_WAIT:
- /*
count = count_nodes_not_stopped(a);
- log_group(g, "waiting for %d more nodes to be stopped", count);
- */
+ log_group(g, "waiting for %d more stopped messages "
+ "before JOIN_ALL_STOPPED", count);
break;
case EST_JOIN_ALL_STOPPED:
@@ -939,10 +938,9 @@
break;
case EST_LEAVE_STOP_WAIT:
- /*
count = count_nodes_not_stopped(a);
- log_group(g, "waiting for %d more nodes to be stopped", count);
- */
+ log_group(g, "waiting for %d more stopped messages "
+ "before LEAVE_ALL_STOPPED", count);
break;
case EST_LEAVE_ALL_STOPPED:
@@ -993,10 +991,9 @@
break;
case EST_FAIL_STOP_WAIT:
- /*
count = count_nodes_not_stopped(a);
- log_group(g, "waiting for %d more nodes to be stopped", count);
- */
+ log_group(g, "waiting for %d more stopped messages "
+ "before FAIL_ALL_STOPPED", count);
break;
case EST_FAIL_ALL_STOPPED:
@@ -1470,8 +1467,11 @@
}
if (ev) {
+ a->need_first_event = 0;
a->current_event = ev;
rv = process_current_event(g);
+ } else if (a->need_first_event) {
+ log_group(g, "waiting for our own cpg join event");
}
}
out:
--- cluster/group/daemon/cpg.c 2006/09/08 23:14:56 1.31
+++ cluster/group/daemon/cpg.c 2006/09/26 19:17:20 1.32
@@ -20,6 +20,7 @@
static int saved_left_count;
static cpg_handle_t saved_handle;
static struct cpg_name saved_name;
+static int message_flow_control_on;
static node_t *find_group_node(group_t *g, int nodeid)
@@ -246,6 +247,9 @@
msg_type(msg->ms_type));
*/
+ if (nodeid == our_nodeid && g->app->sent_event_id == msg->ms_event_id)
+ g->app->sent_event_id = 0;
+
save = malloc(sizeof(struct save_msg));
memset(save, 0, sizeof(struct save_msg));
save->nodeid = nodeid;
@@ -375,6 +379,7 @@
cpg_error_t error;
cpg_handle_t handle;
int found = 0;
+ cpg_flow_control_state_t flow_control_state;
if (ci == groupd_ci) {
handle = groupd_handle;
@@ -404,6 +409,18 @@
return;
}
+ error = cpg_flow_control_state_get(handle, &flow_control_state);
+ if (error != CPG_OK)
+ log_error(g, "cpg_flow_control_state_get %d", error);
+ else if (flow_control_state == CPG_FLOW_CONTROL_ENABLED) {
+ message_flow_control_on = 1;
+ log_debug("flow control on");
+ } else {
+ if (message_flow_control_on)
+ log_debug("flow control off");
+ message_flow_control_on = 0;
+ }
+
if (got_confchg)
process_confchg();
}
--- cluster/group/daemon/gd_internal.h 2006/09/15 18:20:36 1.40
+++ cluster/group/daemon/gd_internal.h 2006/09/26 19:17:21 1.41
@@ -180,6 +180,8 @@
struct list_head events;
event_t *current_event;
group_t *g;
+ uint64_t sent_event_id; /* for debugging */
+ int need_first_event; /* for debugging */
};
#define MSG_APP_STOPPED 1
--- cluster/group/daemon/joinleave.c 2006/06/28 22:16:36 1.17
+++ cluster/group/daemon/joinleave.c 2006/09/26 19:17:21 1.18
@@ -63,6 +63,7 @@
a = malloc(sizeof(app_t));
memset(a, 0, sizeof(app_t));
+ a->need_first_event = 1;
INIT_LIST_HEAD(&a->nodes);
INIT_LIST_HEAD(&a->events);
a->g = g;
--- cluster/group/daemon/main.c 2006/09/15 18:20:36 1.44
+++ cluster/group/daemon/main.c 2006/09/26 19:17:21 1.45
@@ -462,6 +462,10 @@
else
data->event_local_status = -1;
}
+
+ if (g->app->sent_event_id)
+ log_group(g, "sent_event_id %llx",
+ g->app->sent_event_id);
}
data->member_count = g->app->node_count;
More information about the Cluster-devel
mailing list