[Cluster-devel] cluster/rgmanager include/resgroup.h src/daemo ...
lhh at sourceware.org
Wed Nov 14 19:03:39 UTC 2007
CVSROOT: /cvs/cluster
Module name: cluster
Branch: RHEL4
Changes by: lhh at sourceware.org 2007-11-14 19:03:37
Modified files:
rgmanager/include: resgroup.h
rgmanager/src/daemons: groups.c rg_state.c
Log message:
Fix #360401 - hang forever during shutdown due to a previous service boot problem
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/include/resgroup.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.3.2.9&r2=1.3.2.10
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.8.2.21&r2=1.8.2.22
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.4.2.21&r2=1.4.2.22
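The core of the rg_state.c change is structural: svc_start() takes over the exclusive-resource check (under the cluster lock), and its scattered early returns become a single-exit goto pattern so the exclusive mutex and the cluster lock are released on every path. A minimal sketch of that pattern, with hypothetical take_cluster_lock()/release_cluster_lock()/check_state() helpers standing in for rg_lock()/rg_unlock()/get_rg_state():

#include <pthread.h>

static pthread_mutex_t exclusive_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Hypothetical stand-ins for rg_lock()/rg_unlock() and the state check */
extern int take_cluster_lock(const char *name, void **lockp);
extern void release_cluster_lock(const char *name, void *lockp);
extern int check_state(const char *name);

int
start_service(const char *name, int need_check)
{
        int ret = -1;                   /* assume failure; set 0 on success */
        void *lockp = NULL;

        if (need_check)
                pthread_mutex_lock(&exclusive_mutex);

        if (take_cluster_lock(name, &lockp) < 0)
                goto out_nolock;        /* lock never taken; skip its unlock */

        if (check_state(name) != 0)
                goto out_unlock;

        /* ... the actual start work happens here ... */
        ret = 0;

out_unlock:
        release_cluster_lock(name, lockp);
out_nolock:
        if (need_check)
                pthread_mutex_unlock(&exclusive_mutex);
        return ret;
}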
--- cluster/rgmanager/include/resgroup.h 2007/01/03 21:08:17 1.3.2.9
+++ cluster/rgmanager/include/resgroup.h 2007/11/14 19:03:37 1.3.2.10
@@ -156,6 +156,7 @@
cluster_member_list_t *member_list(void);
uint64_t my_id(void);
+#define RG_ERELO -9 /* Operation cannot complete here */
#define RG_ENODEDEATH -8 /* Processing node died */
#define RG_ERUN -7 /* Service is running already */
#define RG_EAGAIN -6 /* Try again */
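The new RG_ERELO code lets svc_start() tell its caller that this node cannot host the service right now (an exclusive service is already running here), so the caller should relocate rather than fail outright; handle_start_req() below maps it straight to the relocate path. Condensed into a sketch (the handle_relocate_req() prototype here is approximate, and the mapping mirrors the switch added further down):

#include <stdint.h>

#define RG_ERELO -9     /* from resgroup.h: operation cannot complete here */

extern int svc_start(char *svcName, int req);
extern int handle_relocate_req(char *svcName, int request,
                               int preferred_target, uint64_t *new_owner);

int
start_or_relocate(char *svcName, int req, uint64_t *new_owner)
{
        int ret = svc_start(svcName, req);

        if (ret == RG_ERELO)
                /* Exclusive conflict on this node; try the other members */
                return handle_relocate_req(svcName, req, -1, new_owner);
        return ret;
}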
--- cluster/rgmanager/src/daemons/groups.c 2007/09/28 15:14:52 1.8.2.21
+++ cluster/rgmanager/src/daemons/groups.c 2007/11/14 19:03:37 1.8.2.22
@@ -192,7 +192,8 @@
}
if (st.rs_state != RG_STATE_STARTED &&
- st.rs_state != RG_STATE_STARTING)
+ st.rs_state != RG_STATE_STARTING &&
+ st.rs_state != RG_STATE_STOPPING)
continue;
if (mp->cm_id != st.rs_owner)
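This hunk widens the owner test so a service caught in RG_STATE_STOPPING still counts as occupying its node; a service wedged mid-stop after a failed boot was previously invisible here, which appears to be part of the #360401 shutdown hang. The test condenses to a predicate like this (a sketch meant to compile against resgroup.h, where the RG_STATE_* constants live):

/* Sketch: a service still ties up its owner while it is started,
   starting, or (after this fix) stopping. */
static int
rg_occupies_owner(int rs_state)
{
        return rs_state == RG_STATE_STARTED ||
               rs_state == RG_STATE_STARTING ||
               rs_state == RG_STATE_STOPPING;
}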
--- cluster/rgmanager/src/daemons/rg_state.c 2007/06/28 11:54:50 1.4.2.21
+++ cluster/rgmanager/src/daemons/rg_state.c 2007/11/14 19:03:37 1.4.2.22
@@ -41,10 +41,13 @@
int set_rg_state(char *servicename, rg_state_t *svcblk);
int get_rg_state(char *servicename, rg_state_t *svcblk);
void get_recovery_policy(char *rg_name, char *buf, size_t buflen);
-int have_exclusive_resources();
+int have_exclusive_resources(void);
int check_exclusive_resources(cluster_member_list_t *membership, char *svcName);
+pthread_mutex_t exclusive_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+
uint64_t
next_node_id(cluster_member_list_t *membership, uint64_t me)
{
@@ -446,6 +449,7 @@
break;
case RG_STATE_DISABLED:
+ ret = 2;
case RG_STATE_UNINITIALIZED:
if (req == RG_DISABLE) {
clulog(LOG_NOTICE,
@@ -536,7 +540,7 @@
}
clulog(LOG_NOTICE,
- "Starting stopped service%s\n",
+ "Starting stopped service %s\n",
svcName);
ret = 1;
break;
@@ -557,7 +561,7 @@
snprintf(query,
sizeof(query),
"/cluster/clusternodes/clusternode[@nodeid=\"%d\"]/@name",
- svcStatus->rs_owner);
+ (int)svcStatus->rs_owner);
ccs_get(fd, query, &nodename);
ccs_disconnect(fd);
}
@@ -650,42 +654,61 @@
svc_start(char *svcName, int req)
{
void *lockp = NULL;
- int ret;
+ int ret, xret;
rg_state_t svcStatus;
+ int need_check = have_exclusive_resources();
+ cluster_member_list_t *membership;
+
+ if (need_check)
+ pthread_mutex_lock(&exclusive_mutex);
+
+ ret = RG_EFAIL;
if (rg_lock(svcName, &lockp) < 0) {
clulog(LOG_ERR, "#45: Unable to obtain cluster lock: %s\n",
strerror(errno));
- return FAIL;
+ goto out_nolock;
}
if (get_rg_state(svcName, &svcStatus) != 0) {
- rg_unlock(svcName, lockp);
clulog(LOG_ERR, "#46: Failed getting status for RG %s\n",
svcName);
- return FAIL;
+ goto out_unlock;
+ }
+
+ if (need_check) {
+ membership = member_list();
+ xret = check_exclusive_resources(membership, svcName);
+ cml_free(membership);
+ if (xret != 0) {
+ if (xret > 0)
+ /* Exc. service running */
+ ret = RG_ERELO;
+ else
+ /* XXX */
+ ret = RG_ENOSERVICE;
+ goto out_unlock;
+ }
}
/* LOCK HELD */
switch (svc_advise_start(&svcStatus, svcName, req)) {
case 0: /* Don't start service, return FAIL */
- rg_unlock(svcName, lockp);
- return FAIL;
+ goto out_unlock;
case 2: /* Don't start service, return 0 */
- rg_unlock(svcName, lockp);
- return 0;
+ ret = 0;
+ goto out_unlock;
case 3:
- rg_unlock(svcName, lockp);
- return RG_EAGAIN;
+ ret = RG_EAGAIN;
+ goto out_unlock;
case 4:
- rg_unlock(svcName, lockp);
- return RG_ERUN;
+ ret = RG_ERUN;
+ goto out_unlock;
default:
break;
}
/* LOCK HELD if we get here */
-
svcStatus.rs_owner = my_id();
svcStatus.rs_state = RG_STATE_STARTING;
svcStatus.rs_transition = (uint64_t)time(NULL);
@@ -699,10 +722,17 @@
clulog(LOG_ERR,
"#47: Failed changing service status\n");
rg_unlock(svcName, lockp);
- return FAIL;
+ goto out_unlock;
}
rg_unlock(svcName, lockp);
+
+ /* release excl. mutex during start */
+ if (need_check) {
+ /* Also clear need_check so we don't double-unlock */
+ pthread_mutex_unlock(&exclusive_mutex);
+ need_check = 0;
+ }
ret = group_op(svcName, RG_START);
ret = !!ret; /* Either it worked or it didn't. Ignore all the
@@ -711,17 +741,17 @@
if (rg_lock(svcName, &lockp) < 0) {
clulog(LOG_ERR, "#74: Unable to obtain cluster lock: %s\n",
strerror(errno));
- return FAIL;
+ ret = RG_EFAIL;
+ goto out_nolock;
}
svcStatus.rs_state = RG_STATE_STARTED;
if (set_rg_state(svcName, &svcStatus) != 0) {
clulog(LOG_ERR,
"#75: Failed changing service status\n");
- rg_unlock(svcName, lockp);
- return FAIL;
+ ret = RG_EFAIL;
+ goto out_unlock;
}
- rg_unlock(svcName, lockp);
if (ret == 0)
clulog(LOG_NOTICE,
@@ -732,6 +762,11 @@
"#68: Failed to start %s; return value: %d\n",
svcName, ret);
+out_unlock:
+ rg_unlock(svcName, lockp);
+out_nolock:
+ if (need_check)
+ pthread_mutex_unlock(&exclusive_mutex);
return ret;
}
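Worth noting in the rewritten svc_start(): exclusive_mutex is held only around the conflict check and the transition to RG_STATE_STARTING, then dropped (with need_check cleared so the shared out_nolock path cannot unlock it a second time) before group_op() runs the actual, potentially slow, resource start. The idiom in isolation, with hypothetical exclusive_conflict()/run_start() stand-ins:

#include <pthread.h>

static pthread_mutex_t excl = PTHREAD_MUTEX_INITIALIZER;

/* Hypothetical stand-ins for the exclusivity check and the slow start */
extern int exclusive_conflict(const char *svc);
extern int run_start(const char *svc);

int
start_narrow(const char *svc, int need_check)
{
        if (need_check) {
                pthread_mutex_lock(&excl);
                if (exclusive_conflict(svc)) {
                        pthread_mutex_unlock(&excl);
                        return -9;      /* RG_ERELO: cannot run here */
                }
                /* Drop the mutex before the slow part and clear the flag
                   so a shared exit path cannot unlock it twice. */
                pthread_mutex_unlock(&excl);
                need_check = 0;
        }
        return run_start(svc);          /* may block; no mutex held */
}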
@@ -1115,7 +1150,7 @@
{
cluster_member_list_t *allowed_nodes, *backup = NULL;
uint64_t target = preferred_target, me = my_id();
- int ret, x;
+ int ret, x, tried = 0;
/*
* Stop the service - if we haven't already done so.
@@ -1181,6 +1216,7 @@
* It's legal to start the service on the given
* node. Try to do so.
*/
+ ++tried;
if (relocate_service(svcName, request, target) == 0) {
*new_owner = target;
/*
@@ -1211,9 +1247,12 @@
if (target == me)
goto exhausted;
+ ++tried;
+
+ /* Each node gets one try */
+ memb_mark_down(allowed_nodes, target);
switch (relocate_service(svcName, request, target)) {
case RG_EFAIL:
- memb_mark_down(allowed_nodes, target);
continue;
case RG_EABORT:
svc_report_failure(svcName);
@@ -1228,9 +1267,10 @@
(uint32_t)(target&0xffffffff), request);
return 0;
case 0:
- *new_owner = target;
clulog(LOG_NOTICE, "Service %s is now running "
"on member %d\n", svcName, (int)target);
+ case RG_ERUN:
+ *new_owner = target;
cml_free(allowed_nodes);
return 0;
default:
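Two behavioral tweaks in this loop: memb_mark_down() now runs before relocate_service(), so each candidate node gets exactly one try regardless of how the attempt ends, and the new tried counter later suppresses the "#70" warning when nothing was actually attempted. The loop shape, sketched with a hypothetical best_target() picker (prototypes approximate):

#include <stdint.h>

typedef struct cluster_member_list cluster_member_list_t;      /* opaque here */

extern void memb_mark_down(cluster_member_list_t *nodes, uint64_t target);
extern int relocate_service(char *svcName, int request, uint64_t target);
extern uint64_t best_target(cluster_member_list_t *nodes);      /* hypothetical */

int
try_each_node_once(char *svcName, int request,
                   cluster_member_list_t *nodes, uint64_t *new_owner)
{
        uint64_t target;
        int tried = 0;

        while ((target = best_target(nodes)) != 0) {
                memb_mark_down(nodes, target);  /* one try per node, up front */
                ++tried;
                if (relocate_service(svcName, request, target) == 0) {
                        *new_owner = target;
                        return 0;
                }
        }
        return tried ? -1 : -2;         /* hypothetical: -2 = nothing attempted */
}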
@@ -1254,9 +1294,10 @@
*/
exhausted:
if (!rg_locked()) {
- clulog(LOG_WARNING,
- "#70: Attempting to restart service %s locally.\n",
- svcName);
+ if (tried)
+ clulog(LOG_WARNING,
+ "#70: Attempting to restart service %s locally.\n",
+ svcName);
if (svc_start(svcName, RG_START_RECOVER) == 0) {
*new_owner = me;
return FAIL;
@@ -1275,9 +1316,9 @@
int
handle_fd_start_req(char *svcName, int request, uint64_t *new_owner)
{
- cluster_member_list_t *allowed_nodes, *backup = NULL;
+ cluster_member_list_t *allowed_nodes;
uint64_t target, me = my_id();
- int ret, x;
+ int ret;
allowed_nodes = member_list();
@@ -1326,7 +1367,6 @@
}
-pthread_mutex_t exclusive_mutex = PTHREAD_MUTEX_INITIALIZER;
/**
* handle_start_req - Handle a generic start request from a user or during
* service manager boot.
@@ -1342,7 +1382,6 @@
{
int ret, tolerance = FOD_BEST;
cluster_member_list_t *membership = member_list();
- int need_check = have_exclusive_resources();
/*
* When a service request is from a user application (eg, clusvcadm),
@@ -1358,18 +1397,6 @@
cml_free(membership);
return FAIL;
}
- if (need_check) {
- pthread_mutex_lock(&exclusive_mutex);
- ret = check_exclusive_resources(membership, svcName);
- if (ret != 0) {
- cml_free(membership);
- pthread_mutex_unlock(&exclusive_mutex);
- if (ret > 0)
- goto relocate;
- else
- return FAIL;
- }
- }
cml_free(membership);
/*
@@ -1377,25 +1404,22 @@
* mask here - so that we can try all nodes if necessary.
*/
ret = svc_start(svcName, req);
- if (need_check)
- pthread_mutex_unlock(&exclusive_mutex);
-
- /*
- If services are locked, return the error
- */
- if (ret == RG_EAGAIN || ret == RG_ERUN)
+ switch(ret) {
+ case RG_ERELO:
+ goto relocate;
+
+ case RG_EAGAIN:
+ /* If services are locked, return the error */
+ case RG_ENOSERVICE:
+ /* service doesn't exist? */
+ case RG_ERUN:
+ /* If service is already running, return that value */
return ret;
- /*
- * If we succeeded, then we're done.
- */
- if (ret == SUCCESS) {
+ case SUCCESS:
+ /* If we succeeded, then we're done. */
*new_owner = my_id();
- return SUCCESS;
- }
-
- /* Already running? */
- if (ret == NO) {
+ case NO:
return SUCCESS;
}
@@ -1418,13 +1442,13 @@
return RG_EABORT;
}
-relocate:
/*
* OK, it failed to start - but succeeded to stop. Now,
* we should relocate the service.
*/
clulog(LOG_WARNING, "#71: Relocating failed service %s\n",
svcName);
+relocate:
ret = handle_relocate_req(svcName, RG_START_RECOVER, -1, new_owner);
/* If we leave the service stopped, instead of disabled, someone
@@ -1456,7 +1480,6 @@
int x;
uint64_t me = my_id();
cluster_member_list_t *membership = member_list();
- int need_check = have_exclusive_resources();
/* XXX ok, so we need to say "should I start this if I was the
only cluster member online */
@@ -1477,23 +1500,29 @@
cml_free(membership);
return FAIL;
}
- if (need_check) {
- pthread_mutex_lock(&exclusive_mutex);
- if (check_exclusive_resources(membership, svcName) != 0) {
- pthread_mutex_unlock(&exclusive_mutex);
- cml_free(membership);
- return FAIL;
- }
- }
cml_free(membership);
x = svc_start(svcName, req);
- if (need_check)
- pthread_mutex_unlock(&exclusive_mutex);
- if (x == 0)
- return 0;
- if (x == RG_ERUN)
- return RG_ERUN;
+ switch(x) {
+ case RG_ERELO:
+ /* Don't relocate from here; it was a remote start */
+ /* Return fail so the other node can go ahead and
+ try the other nodes in the cluster */
+ case NO:
+ return RG_EFAIL;
+
+ case RG_EAGAIN:
+ /* If services are locked, return the error */
+ case RG_ENOSERVICE:
+ /* service doesn't exist? */
+ case RG_ERUN:
+ /* If service is already running, return that value */
+ return x;
+
+ case SUCCESS:
+ /* If we succeeded, then we're done. */
+ return SUCCESS;
+ }
if (svc_stop(svcName, RG_STOP_RECOVER) == 0)
return RG_EFAIL;