[Cluster-devel] cluster/rgmanager include/resgroup.h include/r ...

Fri Feb 1 15:15:04 UTC 2008

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL5
Changes by:	lhh at sourceware.org	2008-02-01 15:15:03

Modified files:
	rgmanager/include: resgroup.h restart_counter.h 
	rgmanager/src/daemons: groups.c restart_counter.c rg_state.c 
	                       slang_event.c 
	rgmanager/src/resources: default_event_script.sl 

Log message:
	Allow restart counters to work with central_processing; #400211 / #431130

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/resgroup.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.15.2.9&r2=1.15.2.10
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/restart_counter.h.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.1.2.2&r2=1.1.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.25.2.14&r2=1.25.2.15
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/restart_counter.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.1.2.1&r2=1.1.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.24.2.17&r2=1.24.2.18
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/slang_event.c.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.3.2.1&r2=1.3.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/default_event_script.sl.diff?cvsroot=cluster&only_with_tag=RHEL5&r1=1.1.2.3&r2=1.1.2.4

--- cluster/rgmanager/include/resgroup.h	2007/12/18 17:52:56	1.15.2.9
+++ cluster/rgmanager/include/resgroup.h	2008/02/01 15:15:02	1.15.2.10
@@ -135,6 +135,7 @@
 int svc_fail(char *svcName);
 int svc_migrate(char *svcName, int target);
 int check_restart(char *svcName);
+int add_restart(char *svcName);
 
 int rt_enqueue_request(const char *resgroupname, int request,
 		       msgctx_t *resp_ctx,
--- cluster/rgmanager/include/restart_counter.h	2007/12/18 17:52:56	1.1.2.2
+++ cluster/rgmanager/include/restart_counter.h	2008/02/01 15:15:02	1.1.2.3
@@ -25,6 +25,7 @@
 int restart_add(restart_counter_t arg);
 int restart_clear(restart_counter_t arg);
 int restart_count(restart_counter_t arg);
+int restart_threshold_exceeded(restart_counter_t arg);
 restart_counter_t restart_init(time_t expire_timeout, int max_restarts);
 int restart_cleanup(restart_counter_t arg);
 
--- cluster/rgmanager/src/daemons/groups.c	2007/12/18 17:52:56	1.25.2.14
+++ cluster/rgmanager/src/daemons/groups.c	2008/02/01 15:15:02	1.25.2.15
@@ -1787,7 +1787,7 @@
 
 
 int
-check_restart(char *rg_name)
+add_restart(char *rg_name)
 {
 	resource_node_t *node;
 	int ret = 1;
@@ -1796,11 +1796,24 @@
 	node = node_by_ref(&_tree, rg_name);
 	if (node) {
 		ret = restart_add(node->rn_restart_counter);
-		if (ret) {
-			/* Clear it out - caller is about 
-			   to relocate the service anyway */
-			restart_clear(node->rn_restart_counter);
-		}
+	}
+	pthread_rwlock_unlock(&resource_lock);
+
+	return ret;
+}
+
+
+int
+check_restart(char *rg_name)
+{
+	resource_node_t *node;
+	int ret = 0;
+
+	pthread_rwlock_rdlock(&resource_lock);
+	node = node_by_ref(&_tree, rg_name);
+	if (node) {
+		printf("%s %p\n", rg_name, node->rn_restart_counter);
+		ret = restart_threshold_exceeded(node->rn_restart_counter);
 	}
 	pthread_rwlock_unlock(&resource_lock);
 
--- cluster/rgmanager/src/daemons/restart_counter.c	2007/11/26 21:46:27	1.1.2.1
+++ cluster/rgmanager/src/daemons/restart_counter.c	2008/02/01 15:15:02	1.1.2.2
@@ -46,6 +46,10 @@
 
 #define VALIDATE(arg, ret) \
 do { \
+	if (!arg) {\
+		errno = EINVAL; \
+		return ret; \
+	} \
 	if (((restart_info_t *)arg)->magic != RESTART_INFO_MAGIC) {\
 		errno = EINVAL; \
 		return ret; \
@@ -97,6 +101,21 @@
 }
 
 
+int
+restart_threshold_exceeded(restart_counter_t arg)
+{
+	restart_info_t *restarts = (restart_info_t *)arg;
+	time_t now;
+
+	VALIDATE(arg, -1);
+	now = time(NULL);
+	restart_timer_purge(arg, now);
+	if (restarts->restart_count >= restarts->max_restarts)
+		return 1;
+	return 0;
+}
+
+
 /* Add a restart entry to the list.  Returns 1 if restart
    count is exceeded */
 int
@@ -127,7 +146,7 @@
 	/* Check and remove old entries */
 	restart_timer_purge(restarts, t);
 
-	if (restarts->restart_count > restarts->max_restarts)
+	if (restarts->restart_count >= restarts->max_restarts)
 		return 1;
 
 	return 0;
@@ -170,6 +189,7 @@
 	info->expire_timeout = expire_timeout;
 	info->max_restarts = max_restarts;
 	info->restart_count = 0;
+	info->restart_nodes = NULL;
 
 	return (void *)info;
 }
--- cluster/rgmanager/src/daemons/rg_state.c	2008/01/25 18:09:24	1.24.2.17
+++ cluster/rgmanager/src/daemons/rg_state.c	2008/02/01 15:15:02	1.24.2.18
@@ -678,7 +678,6 @@
 			clulog(LOG_NOTICE,
 			       "Recovering failed service %s\n",
 			       svcName);
-			svcStatus->rs_state = RG_STATE_STOPPED;
 			/* Start! */
 			ret = 1;
 			break;
@@ -789,13 +788,16 @@
 	/* LOCK HELD if we get here */
 
 	svcStatus.rs_owner = my_id();
-	svcStatus.rs_state = RG_STATE_STARTING;
 	svcStatus.rs_transition = (uint64_t)time(NULL);
 
-	if (req == RG_START_RECOVER)
+	if (svcStatus.rs_state == RG_STATE_RECOVER) {
+		add_restart(svcName);
 		svcStatus.rs_restarts++;
-	else
+	} else {
 		svcStatus.rs_restarts = 0;
+	}
+
+	svcStatus.rs_state = RG_STATE_STARTING;
 
 	if (set_rg_state(svcName, &svcStatus) < 0) {
 		clulog(LOG_ERR,
@@ -1248,7 +1250,7 @@
 {
 	struct dlm_lksb lockp;
 	rg_state_t svcStatus;
-	int ret;
+	int ret = 0;
 	int old_state;
 
 	if (!rg_quorate()) {
@@ -1291,6 +1293,18 @@
 
 	old_state = svcStatus.rs_state;
 
+	if (old_state == RG_STATE_RECOVER) {
+		clulog(LOG_DEBUG, "%s is clean; skipping double-stop\n",
+		       svcName);
+		svcStatus.rs_state = newstate;
+
+		if (set_rg_state(svcName, &svcStatus) != 0) {
+			rg_unlock(&lockp);
+			clulog(LOG_ERR, "#52: Failed changing RG status\n");
+			return RG_EFAIL;
+		}
+	} 
+
 	clulog(LOG_NOTICE, "Stopping service %s\n", svcName);
 
 	if (recover) 
--- cluster/rgmanager/src/daemons/slang_event.c	2007/12/18 17:52:56	1.3.2.1
+++ cluster/rgmanager/src/daemons/slang_event.c	2008/02/01 15:15:02	1.3.2.2
@@ -80,6 +80,7 @@
    _node_clean = 0,
    _service_owner = 0,
    _service_last_owner = 0,
+   _service_restarts_exceeded = 0,
    _user_request = 0,
    _user_arg1 = 0,
    _user_arg2 = 0,
@@ -123,6 +124,8 @@
 	MAKE_VARIABLE("service_owner",	&_service_owner,SLANG_INT_TYPE, 1),
 	MAKE_VARIABLE("service_last_owner", &_service_last_owner,
 		      					SLANG_INT_TYPE, 1),
+	MAKE_VARIABLE("service_restarts_exceeded", &_service_restarts_exceeded,
+		      					SLANG_INT_TYPE, 1),
 
 	/* User event information */
 	MAKE_VARIABLE("user_request",	&_user_request,	SLANG_INT_TYPE,1),
@@ -226,6 +229,7 @@
 sl_service_status(char *svcName)
 {
 	rg_state_t svcStatus;
+	int restarts_exceeded = 0;
 	char *state_str;
 
 	if (get_service_state_internal(svcName, &svcStatus) < 0) {
@@ -236,6 +240,15 @@
 		return;
 	}
 
+	restarts_exceeded = check_restart(svcName);
+	if (SLang_push_integer(restarts_exceeded) < 0) {
+		SLang_verror(SL_RunTime_Error,
+			     "%s: Failed to push restarts_exceeded %s",
+			     __FUNCTION__,
+			     svcName);
+		return;
+	}
+
 	if (SLang_push_integer(svcStatus.rs_restarts) < 0) {
 		SLang_verror(SL_RunTime_Error,
 			     "%s: Failed to push restarts for %s",
@@ -1085,6 +1098,7 @@
 	_service_state = (char *)rg_state_str(state);
 	_service_owner = owner;
 	_service_last_owner = last_owner;
+	_service_restarts_exceeded = check_restart(name);
 
 	switch(state) {
 	case RG_STATE_DISABLED:
@@ -1102,6 +1116,7 @@
 	_service_state = 0;
 	_service_owner = 0;
 	_service_last_owner = 0;
+	_service_restarts_exceeded = 0;
 
 	return ret;
 }
--- cluster/rgmanager/src/resources/default_event_script.sl	2007/12/19 21:33:26	1.1.2.3
+++ cluster/rgmanager/src/resources/default_event_script.sl	2008/02/01 15:15:03	1.1.2.4
@@ -154,7 +154,8 @@
 		debug("Recovering",
 		      " Service: ", service_name,
 		      " Last owner: ", service_last_owner,
-		      " Policy: ", policy);
+		      " Policy: ", policy,
+		      " RTE: ", service_restarts_exceeded);
 
 		if (policy == "disable") {
 			() = service_stop(service_name, 1);
@@ -162,13 +163,17 @@
 		}
 
 		nodes = allowed_nodes(service_name);
-		if (policy == "restart") {
-			tmp = union(service_last_owner, nodes);
+		if (policy == "restart" and service_restarts_exceeded == 0) {
+			nodes = union(service_last_owner, nodes);
 		} else {
 			% relocate 
 			tmp = subtract(nodes, service_last_owner);
-			nodes = tmp;
-			tmp = union(nodes, service_last_owner);
+			if (length(tmp) == 0) {
+				() = service_stop(service_name,0);
+				return;
+			}
+
+			nodes = union(tmp, service_last_owner);
 		}
 
 		()=move_or_start(service_name, nodes);