[Cluster-devel] cluster/rgmanager include/resgroup.h include/r ...
lhh at sourceware.org
lhh at sourceware.org
Fri Feb 1 15:15:04 UTC 2008
CVSROOT: /cvs/cluster
Module name: cluster
Branch: RHEL5
Changes by: lhh at sourceware.org 2008-02-01 15:15:03
Modified files:
rgmanager/include: resgroup.h restart_counter.h
rgmanager/src/daemons: groups.c restart_counter.c rg_state.c
slang_event.c
rgmanager/src/resources: default_event_script.sl
Log message:
Allow restart counters to work with central_processing; #400211 / #431130
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/resgroup.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.15.2.9&r2=1.15.2.10
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/restart_counter.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.1.2.2&r2=1.1.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.25.2.14&r2=1.25.2.15
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/restart_counter.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.1.2.1&r2=1.1.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.24.2.17&r2=1.24.2.18
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/slang_event.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.3.2.1&r2=1.3.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/default_event_script.sl.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.1.2.3&r2=1.1.2.4
--- cluster/rgmanager/include/resgroup.h 2007/12/18 17:52:56 1.15.2.9
+++ cluster/rgmanager/include/resgroup.h 2008/02/01 15:15:02 1.15.2.10
@@ -135,6 +135,7 @@
int svc_fail(char *svcName);
int svc_migrate(char *svcName, int target);
int check_restart(char *svcName);
+int add_restart(char *svcName);
int rt_enqueue_request(const char *resgroupname, int request,
msgctx_t *resp_ctx,
--- cluster/rgmanager/include/restart_counter.h 2007/12/18 17:52:56 1.1.2.2
+++ cluster/rgmanager/include/restart_counter.h 2008/02/01 15:15:02 1.1.2.3
@@ -25,6 +25,7 @@
int restart_add(restart_counter_t arg);
int restart_clear(restart_counter_t arg);
int restart_count(restart_counter_t arg);
+int restart_threshold_exceeded(restart_counter_t arg);
restart_counter_t restart_init(time_t expire_timeout, int max_restarts);
int restart_cleanup(restart_counter_t arg);
--- cluster/rgmanager/src/daemons/groups.c 2007/12/18 17:52:56 1.25.2.14
+++ cluster/rgmanager/src/daemons/groups.c 2008/02/01 15:15:02 1.25.2.15
@@ -1787,7 +1787,7 @@
int
-check_restart(char *rg_name)
+add_restart(char *rg_name)
{
resource_node_t *node;
int ret = 1;
@@ -1796,11 +1796,24 @@
node = node_by_ref(&_tree, rg_name);
if (node) {
ret = restart_add(node->rn_restart_counter);
- if (ret) {
- /* Clear it out - caller is about
- to relocate the service anyway */
- restart_clear(node->rn_restart_counter);
- }
+ }
+ pthread_rwlock_unlock(&resource_lock);
+
+ return ret;
+}
+
+
+int
+check_restart(char *rg_name)
+{
+ resource_node_t *node;
+ int ret = 0;
+
+ pthread_rwlock_rdlock(&resource_lock);
+ node = node_by_ref(&_tree, rg_name);
+ if (node) {
+ printf("%s %p\n", rg_name, node->rn_restart_counter);
+ ret = restart_threshold_exceeded(node->rn_restart_counter);
}
pthread_rwlock_unlock(&resource_lock);
--- cluster/rgmanager/src/daemons/restart_counter.c 2007/11/26 21:46:27 1.1.2.1
+++ cluster/rgmanager/src/daemons/restart_counter.c 2008/02/01 15:15:02 1.1.2.2
@@ -46,6 +46,10 @@
#define VALIDATE(arg, ret) \
do { \
+ if (!arg) {\
+ errno = EINVAL; \
+ return ret; \
+ } \
if (((restart_info_t *)arg)->magic != RESTART_INFO_MAGIC) {\
errno = EINVAL; \
return ret; \
@@ -97,6 +101,21 @@
}
+int
+restart_threshold_exceeded(restart_counter_t arg)
+{
+ restart_info_t *restarts = (restart_info_t *)arg;
+ time_t now;
+
+ VALIDATE(arg, -1);
+ now = time(NULL);
+ restart_timer_purge(arg, now);
+ if (restarts->restart_count >= restarts->max_restarts)
+ return 1;
+ return 0;
+}
+
+
/* Add a restart entry to the list. Returns 1 if restart
count is exceeded */
int
@@ -127,7 +146,7 @@
/* Check and remove old entries */
restart_timer_purge(restarts, t);
- if (restarts->restart_count > restarts->max_restarts)
+ if (restarts->restart_count >= restarts->max_restarts)
return 1;
return 0;
@@ -170,6 +189,7 @@
info->expire_timeout = expire_timeout;
info->max_restarts = max_restarts;
info->restart_count = 0;
+ info->restart_nodes = NULL;
return (void *)info;
}
--- cluster/rgmanager/src/daemons/rg_state.c 2008/01/25 18:09:24 1.24.2.17
+++ cluster/rgmanager/src/daemons/rg_state.c 2008/02/01 15:15:02 1.24.2.18
@@ -678,7 +678,6 @@
clulog(LOG_NOTICE,
"Recovering failed service %s\n",
svcName);
- svcStatus->rs_state = RG_STATE_STOPPED;
/* Start! */
ret = 1;
break;
@@ -789,13 +788,16 @@
/* LOCK HELD if we get here */
svcStatus.rs_owner = my_id();
- svcStatus.rs_state = RG_STATE_STARTING;
svcStatus.rs_transition = (uint64_t)time(NULL);
- if (req == RG_START_RECOVER)
+ if (svcStatus.rs_state == RG_STATE_RECOVER) {
+ add_restart(svcName);
svcStatus.rs_restarts++;
- else
+ } else {
svcStatus.rs_restarts = 0;
+ }
+
+ svcStatus.rs_state = RG_STATE_STARTING;
if (set_rg_state(svcName, &svcStatus) < 0) {
clulog(LOG_ERR,
@@ -1248,7 +1250,7 @@
{
struct dlm_lksb lockp;
rg_state_t svcStatus;
- int ret;
+ int ret = 0;
int old_state;
if (!rg_quorate()) {
@@ -1291,6 +1293,18 @@
old_state = svcStatus.rs_state;
+ if (old_state == RG_STATE_RECOVER) {
+ clulog(LOG_DEBUG, "%s is clean; skipping double-stop\n",
+ svcName);
+ svcStatus.rs_state = newstate;
+
+ if (set_rg_state(svcName, &svcStatus) != 0) {
+ rg_unlock(&lockp);
+ clulog(LOG_ERR, "#52: Failed changing RG status\n");
+ return RG_EFAIL;
+ }
+ }
+
clulog(LOG_NOTICE, "Stopping service %s\n", svcName);
if (recover)
--- cluster/rgmanager/src/daemons/slang_event.c 2007/12/18 17:52:56 1.3.2.1
+++ cluster/rgmanager/src/daemons/slang_event.c 2008/02/01 15:15:02 1.3.2.2
@@ -80,6 +80,7 @@
_node_clean = 0,
_service_owner = 0,
_service_last_owner = 0,
+ _service_restarts_exceeded = 0,
_user_request = 0,
_user_arg1 = 0,
_user_arg2 = 0,
@@ -123,6 +124,8 @@
MAKE_VARIABLE("service_owner", &_service_owner,SLANG_INT_TYPE, 1),
MAKE_VARIABLE("service_last_owner", &_service_last_owner,
SLANG_INT_TYPE, 1),
+ MAKE_VARIABLE("service_restarts_exceeded", &_service_restarts_exceeded,
+ SLANG_INT_TYPE, 1),
/* User event information */
MAKE_VARIABLE("user_request", &_user_request, SLANG_INT_TYPE,1),
@@ -226,6 +229,7 @@
sl_service_status(char *svcName)
{
rg_state_t svcStatus;
+ int restarts_exceeded = 0;
char *state_str;
if (get_service_state_internal(svcName, &svcStatus) < 0) {
@@ -236,6 +240,15 @@
return;
}
+ restarts_exceeded = check_restart(svcName);
+ if (SLang_push_integer(restarts_exceeded) < 0) {
+ SLang_verror(SL_RunTime_Error,
+ "%s: Failed to push restarts_exceeded %s",
+ __FUNCTION__,
+ svcName);
+ return;
+ }
+
if (SLang_push_integer(svcStatus.rs_restarts) < 0) {
SLang_verror(SL_RunTime_Error,
"%s: Failed to push restarts for %s",
@@ -1085,6 +1098,7 @@
_service_state = (char *)rg_state_str(state);
_service_owner = owner;
_service_last_owner = last_owner;
+ _service_restarts_exceeded = check_restart(name);
switch(state) {
case RG_STATE_DISABLED:
@@ -1102,6 +1116,7 @@
_service_state = 0;
_service_owner = 0;
_service_last_owner = 0;
+ _service_restarts_exceeded = 0;
return ret;
}
--- cluster/rgmanager/src/resources/default_event_script.sl 2007/12/19 21:33:26 1.1.2.3
+++ cluster/rgmanager/src/resources/default_event_script.sl 2008/02/01 15:15:03 1.1.2.4
@@ -154,7 +154,8 @@
debug("Recovering",
" Service: ", service_name,
" Last owner: ", service_last_owner,
- " Policy: ", policy);
+ " Policy: ", policy,
+ " RTE: ", service_restarts_exceeded);
if (policy == "disable") {
() = service_stop(service_name, 1);
@@ -162,13 +163,17 @@
}
nodes = allowed_nodes(service_name);
- if (policy == "restart") {
- tmp = union(service_last_owner, nodes);
+ if (policy == "restart" and service_restarts_exceeded == 0) {
+ nodes = union(service_last_owner, nodes);
} else {
% relocate
tmp = subtract(nodes, service_last_owner);
- nodes = tmp;
- tmp = union(nodes, service_last_owner);
+ if (length(tmp) == 0) {
+ () = service_stop(service_name,0);
+ return;
+ }
+
+ nodes = union(tmp, service_last_owner);
}
()=move_or_start(service_name, nodes);
More information about the Cluster-devel
mailing list