[Cluster-devel] cluster/group/daemon app.c gd_internal.h

Fri Feb 9 16:03:25 UTC 2007

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL5
Changes by:	teigland at sourceware.org	2007-02-09 16:03:24

Modified files:
	group/daemon   : app.c gd_internal.h 

Log message:
	If the only two groups were two dlm lockspaces, then during recovery
	the first would detect the all_nodes_all_stopped condition and move
	on to the starting state.  The second would then never get a chance
	to detect all_nodes_all_stopped, because the event state of the first
	was no longer FAIL_ALL_STOPPED.  Use a separate per-event flag
	(fail_all_stopped) to record that the all-stopped state has been
	reached, instead of relying on the event state.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/app.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.52.2.2&r2=1.52.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/gd_internal.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.44.2.2&r2=1.44.2.3

--- cluster/group/daemon/app.c	2007/01/05 18:50:01	1.52.2.2
+++ cluster/group/daemon/app.c	2007/02/09 16:03:24	1.52.2.3
@@ -403,20 +403,14 @@
 	list_for_each_entry(re, &rs->entries, list) {
 		ev = re->group->app->current_event;
 
-#if 0
-		/* if we're not in the group yet, skip it */
-		if (ev &&
-		    ev->state == EST_JOIN_STOP_WAIT &&
-		    is_our_join(ev)) {
-			log_group(re->group, "skip all_stopped check for rs %d",
-				  rs->nodeid);
-			continue;
-		}
-#endif
+		/* we need to use ev->fail_all_stopped instead of checking
+		   ev->state == FAIL_ALL_STOPPED because if two groups are at
+		   the low level, one will detect all_levels_all_stopped first
+		   and then immediately move on to starting before the other,
+		   also checking all_levels_all_stopped, can see it's in
+		   FAIL_ALL_STOPPED */
 
-		if (ev &&
-		    is_recovery_event(ev) &&
-		    ev->state == EST_FAIL_ALL_STOPPED)
+		if (ev && is_recovery_event(ev) && ev->fail_all_stopped)
 			continue;
 		else
 			return 0;
@@ -574,6 +568,7 @@
 	ev = create_event(g);
 	ev->nodeid = nodeid;
 	ev->state = EST_FAIL_BEGIN;
+	ev->fail_all_stopped = 0;
 	ev->id = make_event_id(g, EST_FAIL_BEGIN, nodeid);
 
 	log_group(g, "queue recover event for nodeid %d", nodeid);
@@ -1116,6 +1111,7 @@
 		break;
 
 	case EST_FAIL_ALL_STOPPED:
+		ev->fail_all_stopped = 1;
 
 		/* when recovering for failed nodes, we immediately stop all
 		   apps the node was involved with but wait for quorum before
@@ -1500,6 +1496,7 @@
 
 		if (ev->state > EST_FAIL_ALL_STOPPED) {
 			ev->state = EST_FAIL_BEGIN;
+			ev->fail_all_stopped = 0;
 			clear_all_nodes_stopped(a);
 		} else if (event_state_stopping(a)) {
 			mark_node_stopped(a, rev->nodeid);
--- cluster/group/daemon/gd_internal.h	2007/01/05 18:50:01	1.44.2.2
+++ cluster/group/daemon/gd_internal.h	2007/02/09 16:03:24	1.44.2.3
@@ -150,6 +150,7 @@
 	uint64_t		id;
 	struct list_head	extended;
 	int			start_app_before_pending_rev;
+	int			fail_all_stopped;
 };
 
 /*