[Cluster-devel] cluster/group/daemon app.c gd_internal.h
teigland at sourceware.org
teigland at sourceware.org
Fri Feb 9 16:03:25 UTC 2007
CVSROOT: /cvs/cluster
Module name: cluster
Branch: RHEL5
Changes by: teigland at sourceware.org 2007-02-09 16:03:24
Modified files:
group/daemon : app.c gd_internal.h
Log message:
If the only two groups were two dlm lockspaces, then during recovery
the first would detect the all_nodes_all_stopped condition and move
on to the starting state; the second would then never get a chance to
detect the all_nodes_all_stopped state, since the event state of the
first was no longer EST_FAIL_ALL_STOPPED. Use a separate flag to
indicate that the all-stopped state has been reached, instead of
relying on the event state.
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/app.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.52.2.2&r2=1.52.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/daemon/gd_internal.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.44.2.2&r2=1.44.2.3
--- cluster/group/daemon/app.c 2007/01/05 18:50:01 1.52.2.2
+++ cluster/group/daemon/app.c 2007/02/09 16:03:24 1.52.2.3
@@ -403,20 +403,14 @@
list_for_each_entry(re, &rs->entries, list) {
ev = re->group->app->current_event;
-#if 0
- /* if we're not in the group yet, skip it */
- if (ev &&
- ev->state == EST_JOIN_STOP_WAIT &&
- is_our_join(ev)) {
- log_group(re->group, "skip all_stopped check for rs %d",
- rs->nodeid);
- continue;
- }
-#endif
+ /* we need to use ev->fail_all_stopped instead of checking
+ ev->state == FAIL_ALL_STOPPED because if two groups are at
+ the low level, one will detect all_levels_all_stopped first
+ and then immediately move on to starting before the other,
+ also checking all_levels_all_stopped, can see it's in
+ FAIL_ALL_STOPPED */
- if (ev &&
- is_recovery_event(ev) &&
- ev->state == EST_FAIL_ALL_STOPPED)
+ if (ev && is_recovery_event(ev) && ev->fail_all_stopped)
continue;
else
return 0;
@@ -574,6 +568,7 @@
ev = create_event(g);
ev->nodeid = nodeid;
ev->state = EST_FAIL_BEGIN;
+ ev->fail_all_stopped = 0;
ev->id = make_event_id(g, EST_FAIL_BEGIN, nodeid);
log_group(g, "queue recover event for nodeid %d", nodeid);
@@ -1116,6 +1111,7 @@
break;
case EST_FAIL_ALL_STOPPED:
+ ev->fail_all_stopped = 1;
/* when recovering for failed nodes, we immediately stop all
apps the node was involved with but wait for quorum before
@@ -1500,6 +1496,7 @@
if (ev->state > EST_FAIL_ALL_STOPPED) {
ev->state = EST_FAIL_BEGIN;
+ ev->fail_all_stopped = 0;
clear_all_nodes_stopped(a);
} else if (event_state_stopping(a)) {
mark_node_stopped(a, rev->nodeid);
--- cluster/group/daemon/gd_internal.h 2007/01/05 18:50:01 1.44.2.2
+++ cluster/group/daemon/gd_internal.h 2007/02/09 16:03:24 1.44.2.3
@@ -150,6 +150,7 @@
uint64_t id;
struct list_head extended;
int start_app_before_pending_rev;
+ int fail_all_stopped;
};
/*
More information about the Cluster-devel
mailing list