[Cluster-devel] cluster/rgmanager ChangeLog src/clulib/msg_clu ...
lhh at sourceware.org
lhh at sourceware.org
Mon Jul 23 20:49:15 UTC 2007
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: lhh at sourceware.org 2007-07-23 20:49:13
Modified files:
rgmanager : ChangeLog
rgmanager/src/clulib: msg_cluster.c msgtest.c vft.c
rgmanager/src/daemons: groups.c main.c nodeevent.c rg_event.c
rg_forward.c rg_state.c rg_thread.c
rgmanager/src/resources: service.sh
rgmanager/src/utils: clusvcadm.c
Added files:
rgmanager/src/clulib: tmgr.c
Log message:
Misc. bugfixes; see ChangeLog
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&r1=1.52&r2=1.53
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/tmgr.c.diff?cvsroot=cluster&r1=NONE&r2=1.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/msg_cluster.c.diff?cvsroot=cluster&r1=1.4&r2=1.5
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/msgtest.c.diff?cvsroot=cluster&r1=1.2&r2=1.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/vft.c.diff?cvsroot=cluster&r1=1.20&r2=1.21
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&r1=1.36&r2=1.37
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/main.c.diff?cvsroot=cluster&r1=1.40&r2=1.41
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/nodeevent.c.diff?cvsroot=cluster&r1=1.7&r2=1.8
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_event.c.diff?cvsroot=cluster&r1=1.1&r2=1.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_forward.c.diff?cvsroot=cluster&r1=1.9&r2=1.10
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&r1=1.37&r2=1.38
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_thread.c.diff?cvsroot=cluster&r1=1.23&r2=1.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/service.sh.diff?cvsroot=cluster&r1=1.9&r2=1.10
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/utils/clusvcadm.c.diff?cvsroot=cluster&r1=1.20&r2=1.21
--- cluster/rgmanager/ChangeLog 2007/07/12 11:25:09 1.52
+++ cluster/rgmanager/ChangeLog 2007/07/23 20:49:13 1.53
@@ -1,3 +1,23 @@
+2007-07-23 Lon Hohberger <lhh at redhat.com>
+ * general: make threads exit with pthread_exit() so we can wrap/track them.
+ Add internal statedump (SIGUSR1) support.
+ * src/clulib/msg_cluster.c: Fix rare deadlock condition. Add dump support.
+ * src/clulib/tmgr.c: Add thread wrappers so we can report threads in
+ internal state dumps.
+ * src/clulib/vft.c: Fix rare crash if vf_resolve_views gets called with
+ NULL. Add dump support.
+ * src/daemons/main.c: Fix minor memory leak in membership_update(). Fix
+ crash-on-exit race. Don't exit if someone requests foreground mode.
+ * src/daemons/rg_forward.c: Clean up forwarding logic and handle missed
+ case (forward-to-self -> ERROR!)
+ * src/daemons/rg_state.c: Move closing / free of contexts out of
+ send_ret/send_response to the caller (where they belong). Don't let
+ people relocate disabled services.
+ * src/daemons/rg_thread.c: Don't loop forever if the thread exits before
+ we notice that it's started.
+ * src/utils/clusvcadm.c: Fix error codes if you try to relocate when
+ rgmanager isn't running
+
2007-07-12 Marek Grac <mgrac at redhat.com>
* src/resources/Makefile: Fix #245178 - install RA for named
/cvs/cluster/cluster/rgmanager/src/clulib/tmgr.c,v --> standard output
revision 1.1
--- cluster/rgmanager/src/clulib/tmgr.c
+++ - 2007-07-23 20:49:14.098409000 +0000
@@ -0,0 +1,128 @@
+/*
+ Copyright Red Hat, Inc. 2007
+ Copyright Crosswalk 2006-2007
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 2, or (at your option) any
+ later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 675 Mass Ave, Cambridge,
+ MA 02139, USA.
+*/
+#ifdef WRAP_THREADS
+#include <stdio.h>
+#include <sys/types.h>
+#include <gettid.h>
+#include <pthread.h>
+#include <string.h>
+#include <errno.h>
+#include <malloc.h>
+#include <string.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <pthread.h>
+#include <list.h>
+#include <execinfo.h>
+
+typedef struct _thr {
+ list_head();
+ void *(*fn)(void *arg);
+ char **name;
+ pthread_t th;
+} mthread_t;
+
+static mthread_t *_tlist = NULL;
+static int _tcount = 0;
+static pthread_rwlock_t _tlock = PTHREAD_RWLOCK_INITIALIZER;
+
+void
+dump_thread_states(FILE *fp)
+{
+ int x;
+ mthread_t *curr;
+ fprintf(fp, "Thread Information\n");
+ pthread_rwlock_rdlock(&_tlock);
+ list_for(&_tlist, curr, x) {
+ fprintf(fp, " Thread #%d id: %d function: %s\n",
+ x, (unsigned)curr->th, curr->name[0]);
+ }
+ pthread_rwlock_unlock(&_tlock);
+ fprintf(fp, "\n\n");
+}
+
+
+int __real_pthread_create(pthread_t *, const pthread_attr_t *,
+ void *(*)(void*), void *);
+int
+__wrap_pthread_create(pthread_t *th, const pthread_attr_t *attr,
+ void *(*start_routine)(void*),
+ void *arg)
+{
+ void *fn = start_routine;
+ mthread_t *new;
+ int ret;
+
+ new = malloc(sizeof (*new));
+
+ ret = __real_pthread_create(th, attr, start_routine, arg);
+ if (ret) {
+ if (new)
+ free(new);
+ return ret;
+ }
+
+ if (new) {
+ new->th = *th;
+ new->fn = start_routine;
+ new->name = backtrace_symbols(&new->fn, 1);
+ pthread_rwlock_wrlock(&_tlock);
+ list_insert(&_tlist, new);
+ ++_tcount;
+ pthread_rwlock_unlock(&_tlock);
+ }
+
+ return ret;
+}
+
+
+void __real_pthread_exit(void *);
+void
+__wrap_pthread_exit(void *exitval)
+{
+ mthread_t *old;
+ int ret = 0, found = 0;
+ pthread_t me = pthread_self();
+
+ pthread_rwlock_rdlock(&_tlock);
+ list_for(&_tlist, old, ret) {
+ if (old->th == me) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found)
+ old = NULL;
+ pthread_rwlock_unlock(&_tlock);
+
+ if (!old)
+ __real_pthread_exit(exitval);
+
+ pthread_rwlock_wrlock(&_tlock);
+ list_remove(&_tlist, old);
+ --_tcount;
+ pthread_rwlock_unlock(&_tlock);
+
+ if (old->name)
+ free(old->name);
+ free(old);
+ __real_pthread_exit(exitval);
+}
+#endif
--- cluster/rgmanager/src/clulib/msg_cluster.c 2006/10/23 22:47:00 1.4
+++ cluster/rgmanager/src/clulib/msg_cluster.c 2007/07/23 20:49:13 1.5
@@ -46,7 +46,7 @@
static msgctx_t *contexts[MAX_CONTEXTS];
static int _me = 0;
pthread_t comms_thread;
-int thread_running;
+int thread_running = 0;
#define is_established(ctx) \
(((ctx->type == MSG_CLUSTER) && \
@@ -856,7 +856,6 @@
errno = EINVAL;
cluster_msg_hdr_t *m;
msg_q_t *n;
- char done = 0;
char foo;
if (!listenctx || !acceptctx)
@@ -884,24 +883,38 @@
m = n->message;
switch(m->msg_control) {
case M_OPEN:
+ /* XXX make this case statement its own function or at
+ least make it not a big case block . */
list_remove(&listenctx->u.cluster_info.queue, n);
/*printf("Accepting connection from %d %d\n",
m->src_nodeid, m->src_ctx);*/
- /* New connection */
+ /* Release lock on listen context queue; we're done
+ with it at this point */
+ pthread_mutex_unlock(&listenctx->u.cluster_info.mutex);
+
+ /* New connection: first, create + lock the mutex */
pthread_mutex_init(&acceptctx->u.cluster_info.mutex,
NULL);
+ /* Lock this while we finish initializing */
+ pthread_mutex_lock(&acceptctx->u.cluster_info.mutex);
+
pthread_cond_init(&acceptctx->u.cluster_info.cond,
NULL);
+
acceptctx->u.cluster_info.queue = NULL;
acceptctx->u.cluster_info.remote_ctx = m->src_ctx;
acceptctx->u.cluster_info.nodeid = m->src_nodeid;
acceptctx->u.cluster_info.port = m->msg_port;
acceptctx->flags = (SKF_READ | SKF_WRITE);
- if (assign_ctx(acceptctx) < 0) {
+ /* assign_ctx requires the context lock. We need to
+ ensure we don't try to take the context lock w/ a local
+ queue lock held on a context that's in progress (i.e.
+ the global cluster context...) */
+ if (assign_ctx(acceptctx) < 0)
printf("FAILED TO ASSIGN CONTEXT\n");
- }
+
cluster_send_control_msg(acceptctx, M_OPEN_ACK);
if (listenctx->u.cluster_info.select_pipe[0] >= 0) {
@@ -910,11 +923,14 @@
&foo, 1);
}
- done = 1;
free(m);
free(n);
- break;
+ /* Let the new context go. */
+ pthread_mutex_unlock(&acceptctx->u.cluster_info.mutex);
+ return 0;
+ /* notreached */
+
case M_DATA:
/* Data messages (i.e. from broadcast msgs) are
okay too!... but we don't handle them here */
@@ -925,9 +941,6 @@
break;
}
- if (done)
- break;
-
} while (!list_done(&listenctx->u.cluster_info.queue, n));
pthread_mutex_unlock(&listenctx->u.cluster_info.mutex);
@@ -950,7 +963,7 @@
poll_cluster_messages(2);
}
- return NULL;
+ pthread_exit(NULL);
}
@@ -1105,7 +1118,7 @@
pthread_attr_init(&attrs);
pthread_attr_setinheritsched(&attrs, PTHREAD_INHERIT_SCHED);
- pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
+ /*pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);*/
thread_running = 1;
pthread_create(&comms_thread, &attrs, cluster_comms_thread, NULL);
@@ -1130,16 +1143,81 @@
}
+void
+dump_cluster_ctx(FILE *fp)
+{
+ int x;
+ msgctx_t *ctx;
+
+ fprintf(fp, "CMAN/mux subsystem status\n");
+ if (thread_running) {
+ fprintf(fp, " Thread: %d\n", (unsigned)comms_thread);
+ } else {
+ fprintf(fp, " Thread Offline\n");
+ }
+
+ pthread_mutex_lock(&context_lock);
+ for (x = 0; x < MAX_CONTEXTS; x++) {
+ if (!contexts[x])
+ continue;
+ ctx = contexts[x];
+
+ fprintf(fp, " Cluster Message Context %p\n", ctx);
+ fprintf(fp, " Flags %08x ", ctx->flags);
+ if (ctx->flags & SKF_READ)
+ fprintf(fp, "SKF_READ ");
+ if (ctx->flags & SKF_WRITE)
+ fprintf(fp, "SKF_WRITE ");
+ if (ctx->flags & SKF_LISTEN)
+ fprintf(fp, "SKF_LISTEN ");
+ if (ctx->flags & SKF_MCAST)
+ fprintf(fp, "SKF_MCAST ");
+ fprintf(fp, "\n");
+ fprintf(fp, " Target node ID %d\n", ctx->u.cluster_info.nodeid);
+ fprintf(fp, " Local Index %d\n", ctx->u.cluster_info.local_ctx);
+ fprintf(fp, " Remote Index %d\n", ctx->u.cluster_info.remote_ctx);
+ }
+ pthread_mutex_unlock(&context_lock);
+ fprintf(fp, "\n");
+}
+
+
int
cluster_msg_shutdown(void)
{
cman_handle_t ch;
+ cluster_msg_hdr_t m;
+ msgctx_t *ctx;
+ int x;
+
+ thread_running = 0;
+ pthread_join(comms_thread, NULL);
ch = cman_lock(1, SIGUSR2);
cman_end_recv_data(ch);
- pthread_kill(comms_thread, SIGTERM);
cman_unlock(ch);
+ /* Send close message to all open contexts */
+ memset(&m, 0, sizeof(m));
+ m.msg_control = M_CLOSE;
+
+ pthread_mutex_lock(&context_lock);
+ for (x = 0; x < MAX_CONTEXTS; x++) {
+ if (!contexts[x])
+ continue;
+
+ ctx = contexts[x];
+
+ /* Kill remote side if it exists */
+ if (is_established(ctx))
+ cluster_send_control_msg(ctx, M_CLOSE);
+
+ /* Queue close for local side */
+ queue_for_context(ctx, (void *)&m, sizeof(m));
+ }
+ pthread_mutex_unlock(&context_lock);
+
+
return 0;
}
--- cluster/rgmanager/src/clulib/msgtest.c 2006/08/07 22:05:01 1.2
+++ cluster/rgmanager/src/clulib/msgtest.c 2007/07/23 20:49:13 1.3
@@ -49,7 +49,7 @@
if (msg_open(MSG_CLUSTER, 0, MYPORT, &ctx, 0) != 0) {
printf("Could not set up mcast socket!\n");
- return NULL;
+ pthread_exit(NULL);
}
printf("PIGGYBACK CONTEXT\n");
@@ -66,7 +66,7 @@
printf("PIGGY flies...\n");
- return NULL;
+ pthread_exit(NULL);
}
@@ -102,7 +102,7 @@
if (msg_open(MSG_CLUSTER, 0, MYPORT, &ctx, 1) != 0) {
printf("Could not set up mcast socket!\n");
- return NULL;
+ pthread_exit(NULL);
}
snprintf(buf, sizeof(buf), "Babble, babble\n");
@@ -116,7 +116,7 @@
printf("Private thread is outta here...\n");
- return NULL;
+ pthread_exit(NULL);
}
--- cluster/rgmanager/src/clulib/vft.c 2007/04/27 04:23:05 1.20
+++ cluster/rgmanager/src/clulib/vft.c 2007/07/23 20:49:13 1.21
@@ -121,9 +121,9 @@
struct vf_args {
- uint16_t port;
- int local_node_id;
msgctx_t *ctx;
+ int local_node_id;
+ uint16_t port;
};
@@ -277,6 +277,9 @@
uint32_t datalen;
uint32_t trans;
+ if (!key_node)
+ return 0;
+
while ((trans = vf_try_commit(key_node)) != 0) {
commits++;
}
@@ -895,7 +898,7 @@
msg_close(ctx);
msg_free_ctx(ctx);
- return NULL;
+ pthread_exit(NULL);
}
@@ -1776,3 +1779,40 @@
return VFR_OK;
}
+
+void
+dump_vf_states(FILE *fp)
+{
+ key_node_t *cur;
+
+ fprintf(fp, "View-Formation States:\n");
+ fprintf(fp, " Thread: %d\n", (unsigned)vf_thread);
+ fprintf(fp, " Default callbacks:\n Vote: %p\n Commit: %p\n",
+ default_vote_cb, default_commit_cb);
+ fprintf(fp, " Distributed key metadata:\n");
+
+ pthread_mutex_lock(&key_list_mutex);
+
+ for (cur = key_list; cur; cur = cur->kn_next) {
+ fprintf(fp, " %s, View: %d, Size: %d, Address: %p\n",
+ cur->kn_keyid,
+ (int)cur->kn_viewno,
+ cur->kn_datalen,
+ cur->kn_data);
+ if (cur->kn_vote_cb != default_vote_cb)
+ fprintf(fp, " Vote callback: %p\n", cur->kn_vote_cb);
+ if (cur->kn_commit_cb != default_commit_cb)
+ fprintf(fp, " Commit callback: %p\n", cur->kn_commit_cb);
+
+ if (cur->kn_jvlist)
+ fprintf(fp, " This key has unresolved "
+ "new views pending\n");
+ if (cur->kn_clist)
+ fprintf(fp, " This key has unresolved "
+ "commits pending\n");
+
+ }
+
+ pthread_mutex_unlock(&key_list_mutex);
+ fprintf(fp, "\n");
+}
--- cluster/rgmanager/src/daemons/groups.c 2007/07/10 18:25:26 1.36
+++ cluster/rgmanager/src/daemons/groups.c 2007/07/23 20:49:13 1.37
@@ -1033,7 +1033,7 @@
msg_send_simple(ctx, RG_FAIL, RG_EAGAIN, 0);
msg_close(ctx);
msg_free_ctx(ctx);
- return NULL;
+ pthread_exit(NULL);
}
pthread_rwlock_rdlock(&resource_lock);
@@ -1056,7 +1056,7 @@
rg_dec_status();
- return NULL;
+ pthread_exit(NULL);
}
@@ -1172,7 +1172,7 @@
/* Only one status thread at a time, please! */
if (pthread_mutex_trylock(&status_mutex) != 0)
- return NULL;
+ pthread_exit(NULL);
pthread_rwlock_rdlock(&resource_lock);
list_do(&_tree, curr) {
@@ -1198,7 +1198,7 @@
pthread_rwlock_unlock(&resource_lock);
pthread_mutex_unlock(&status_mutex);
- return NULL;
+ pthread_exit(NULL);
}
@@ -1400,6 +1400,13 @@
}
+void
+dump_config_version(FILE *fp)
+{
+ fprintf(fp, "Cluster configuration version %d\n\n", config_version);
+}
+
+
/**
Initialize resource groups. This reads all the resource groups from
CCS, builds the tree, etc. Ideally, we'll have a similar function
--- cluster/rgmanager/src/daemons/main.c 2007/06/27 14:03:51 1.40
+++ cluster/rgmanager/src/daemons/main.c 2007/07/23 20:49:13 1.41
@@ -40,6 +40,9 @@
#define L_SYS (1<<1)
#define L_USER (1<<0)
+#ifdef WRAP_THREADS
+void dump_thread_states(FILE *);
+#endif
int configure_logging(int ccsfd, int debug);
void node_event(int, int, int, int);
@@ -63,7 +66,7 @@
int next_node_id(cluster_member_list_t *membership, int me);
int rg_event_q(char *svcName, uint32_t state, int owner);
-
+void malloc_dump_table(FILE *, size_t, size_t);
void
segfault(int sig)
@@ -259,6 +262,7 @@
free_member_list(node_delta);
free_member_list(new_ml);
+ free_member_list(old_membership);
rg_unlockall(L_SYS);
@@ -405,7 +409,8 @@
sz = msg_receive(ctx, msg_hdr, sizeof(msgbuf), 1);
if (sz < sizeof (generic_msg_hdr)) {
clulog(LOG_ERR,
- "#37: Error receiving message header (%d)\n", sz);
+ "#37: Error receiving header from %d sz=%d CTX %p\n",
+ nodeid, sz, ctx);
goto out;
}
@@ -593,6 +598,7 @@
break;
case M_DATA:
+ nodeid = msg_get_nodeid(ctx);
dispatch_msg(ctx, nodeid, 0);
break;
@@ -629,7 +635,26 @@
}
-void dump_threads(void);
+void dump_threads(FILE *fp);
+void dump_config_version(FILE *fp);
+void dump_vf_states(FILE *fp);
+void dump_cluster_ctx(FILE *fp);
+
+void
+dump_internal_state(char *loc)
+{
+ FILE *fp;
+ fp=fopen(loc, "w+");
+ dump_config_version(fp);
+ dump_threads(fp);
+ dump_vf_states(fp);
+#ifdef WRAP_THREADS
+ dump_thread_states(fp);
+#endif
+ dump_cluster_ctx(fp);
+ //malloc_dump_table(fp, 1, 16384); /* Only works if alloc.c is used */
+ fclose(fp);
+}
int
event_loop(msgctx_t *localctx, msgctx_t *clusterctx)
@@ -645,10 +670,8 @@
if (signalled) {
signalled = 0;
- /*
- malloc_stats();
- dump_threads();
- */
+
+ dump_internal_state("/tmp/rgmanager-dump");
}
while (running && (tv.tv_sec || tv.tv_usec)) {
@@ -747,7 +770,6 @@
cleanup(msgctx_t *clusterctx)
{
kill_resource_groups();
- member_list_update(NULL);
send_exit_msg(clusterctx);
}
@@ -760,7 +782,7 @@
}
-void malloc_dump_table(size_t, size_t);
+void malloc_dump_table(FILE *, size_t, size_t);
/*
@@ -846,10 +868,13 @@
rg_doall(RG_STOP_EXITING, 1, NULL);
running = 0;
- return 0;
+ pthread_exit(NULL);
}
+#ifdef WRAP_THREADS
+void dump_thread_states(FILE *);
+#endif
int
main(int argc, char **argv)
{
@@ -871,7 +896,9 @@
break;
case 'f':
foreground = 1;
+ break;
default:
+ return 1;
break;
}
}
@@ -984,6 +1011,9 @@
event_loop(local_ctx, cluster_ctx);
if (shutdown_pending == 1) {
+ /* Kill local socket; local requests need to
+ be ignored here */
+ msg_close(local_ctx);
++shutdown_pending;
clulog(LOG_NOTICE, "Shutting down\n");
pthread_create(&th, NULL, shutdown_thread, NULL);
--- cluster/rgmanager/src/daemons/nodeevent.c 2007/06/27 14:03:51 1.7
+++ cluster/rgmanager/src/daemons/nodeevent.c 2007/07/23 20:49:13 1.8
@@ -196,7 +196,7 @@
/* Mutex held */
ne_thread = 0;
pthread_mutex_unlock(&ne_queue_mutex);
- return NULL;
+ pthread_exit(NULL);
}
--- cluster/rgmanager/src/daemons/rg_event.c 2006/07/11 23:52:41 1.1
+++ cluster/rgmanager/src/daemons/rg_event.c 2007/07/23 20:49:13 1.2
@@ -64,7 +64,7 @@
/* Mutex held */
rg_ev_thread = 0;
pthread_mutex_unlock(&rg_queue_mutex);
- return NULL;
+ pthread_exit(NULL);
}
--- cluster/rgmanager/src/daemons/rg_forward.c 2006/12/14 22:03:17 1.9
+++ cluster/rgmanager/src/daemons/rg_forward.c 2007/07/23 20:49:13 1.10
@@ -24,6 +24,7 @@
#include <msgsimple.h>
#include <clulog.h>
#include <message.h>
+#include <members.h>
void
@@ -49,59 +50,100 @@
request_t *req = (request_t *)arg;
struct dlm_lksb lockp;
msgctx_t *ctx = NULL;
+ cluster_member_list_t *m = NULL;
SmMessageSt msg;
+ int response_code = RG_EAGAIN, ret;
+ int new_owner = 0, retries = 0;
- if (rg_lock(req->rr_group, &lockp) != 0)
+ if (rg_lock(req->rr_group, &lockp) != 0) {
+ clulog(LOG_WARNING, "FW: Forwarding failed; lock unavailable for %s\n",
+ req->rr_group);
goto out_fail;
-
+ }
if (get_rg_state(req->rr_group, &rgs) != 0) {
rg_unlock(&lockp);
+ clulog(LOG_WARNING, "FW: Forwarding failed; state unavailable for %s\n",
+ req->rr_group);
goto out_fail;
}
-
rg_unlock(&lockp);
- /* Construct message */
- build_message(&msg, req->rr_request, req->rr_group, req->rr_target);
-
if (rgs.rs_owner == 0)
rgs.rs_owner = req->rr_target;
if (rgs.rs_owner == 0) {
- msg_close(req->rr_resp_ctx);
- msg_free_ctx(req->rr_resp_ctx);
- rq_free(req);
- clulog(LOG_ERR, "Attempt to forward to invalid node ID\n");
- pthread_exit(NULL);
+ clulog(LOG_ERR, "FW: Attempt to forward to invalid node ID\n");
+ goto out_fail;
+ }
+ if (rgs.rs_owner == my_id()) {
+ clulog(LOG_WARNING, "BUG! Attempt to forward to myself!\n");
+ goto out_fail;
}
- clulog(LOG_DEBUG, "Forwarding %s request to %d\n",
+ clulog(LOG_DEBUG, "FW: Forwarding %s request to %d\n",
rg_req_str(req->rr_request), rgs.rs_owner);
- while ((ctx = msg_new_ctx()) == NULL)
- sleep(1);
-
- if (msg_open(MSG_CLUSTER, rgs.rs_owner, RG_PORT, ctx, 10) < 0)
+ ctx = msg_new_ctx();
+ if (ctx == NULL) {
+ clulog(LOG_DEBUG, "FW: Failed to allocate socket context: %s\n",
+ strerror(errno));
goto out_fail;
- if (msg_send(ctx, &msg, sizeof(msg)) < sizeof(msg))
+ }
+
+ /* Construct message */
+ build_message(&msg, req->rr_request, req->rr_group, req->rr_target);
+
+ if (msg_open(MSG_CLUSTER, rgs.rs_owner, RG_PORT, ctx, 10) < 0) {
+ clulog(LOG_DEBUG, "FW: Failed to open channel to %d CTX: %p\n",
+ rgs.rs_owner, ctx);
goto out_fail;
- if (msg_receive(ctx, &msg, sizeof(msg), 600) < sizeof(msg))
+ }
+ if (msg_send(ctx, &msg, sizeof(msg)) < sizeof(msg)) {
+ clulog(LOG_DEBUG, "FW: Failed to send message to %d CTX: %p\n",
+ rgs.rs_owner, ctx);
goto out_fail;
+ }
- msg_close(ctx);
- msg_free_ctx(ctx);
+ /*
+ * Ok, we're forwarding a message to another node. Keep tabs on
+ * the node to make sure it doesn't die. Basically, wake up every
+ * now and again to make sure it's still online. If it isn't, send
+ * a response back to the caller.
+ */
+ do {
+ ret = msg_receive(ctx, &msg, sizeof(msg), 10);
+ if (ret < (int)sizeof(msg)) {
+ if (ret < 0 && errno == ETIMEDOUT) {
+ m = member_list();
+ if (!memb_online(m, rgs.rs_owner)) {
+ response_code = RG_ENODE;
+ goto out_fail;
+ }
+ free_member_list(m);
+ m = NULL;
+ continue;
+ }
+ goto out_fail;
+ }
+ break;
+ } while(++retries < 60); /* old 60 second rule */
swab_SmMessageSt(&msg);
- send_response(msg.sm_data.d_ret, msg.sm_data.d_svcOwner, req);
- rq_free(req);
- pthread_exit(NULL);
-
-out_fail: /* Failure path */
+
+ response_code = msg.sm_data.d_ret;
+ new_owner = msg.sm_data.d_svcOwner;
+
+out_fail:
+ send_response(response_code, new_owner, req);
+ msg_close(req->rr_resp_ctx);
+ msg_free_ctx(req->rr_resp_ctx);
+
if (ctx) {
msg_close(ctx);
msg_free_ctx(ctx);
}
- msg_close(req->rr_resp_ctx);
- msg_free_ctx(req->rr_resp_ctx);
+ if (m)
+ free_member_list(m);
+
rq_free(req);
pthread_exit(NULL);
}
--- cluster/rgmanager/src/daemons/rg_state.c 2007/07/02 15:15:00 1.37
+++ cluster/rgmanager/src/daemons/rg_state.c 2007/07/23 20:49:13 1.38
@@ -217,9 +217,6 @@
swab_SmMessageSt(msgp);
msg_send(ctx, msgp, sizeof(*msgp));
-
- /* :) */
- msg_close(ctx);
}
@@ -245,11 +242,6 @@
swab_SmMessageSt(msgp);
msg_send(req->rr_resp_ctx, msgp, sizeof(*msgp));
-
- /* :( */
- msg_close(req->rr_resp_ctx);
- msg_free_ctx(req->rr_resp_ctx);
- req->rr_resp_ctx = NULL;
}
@@ -556,6 +548,7 @@
break;
}
+ ret = 2;
clulog(LOG_DEBUG, "Not stopping disabled service %s\n",
svcName);
break;
@@ -1615,6 +1608,11 @@
int ret, x;
rg_state_t svcStatus;
+ get_rg_state_local(svcName, &svcStatus);
+ if (svcStatus.rs_state == RG_STATE_DISABLED ||
+ svcStatus.rs_state == RG_STATE_UNINITIALIZED)
+ return RG_EINVAL;
+
if (preferred_target > 0) {
/* TODO: simplify this and don't keep alloc/freeing
member lists */
@@ -1684,8 +1682,10 @@
* I am the ONLY one capable of running this service,
* PERIOD...
*/
- if (target == me && me != preferred_target)
+ if (target == me && me != preferred_target) {
+ free_member_list(backup);
goto exhausted;
+ }
if (target == me) {
/*
@@ -1948,8 +1948,16 @@
int tolerance = FOD_BEST;
int x;
uint32_t me = my_id();
- cluster_member_list_t *membership = member_list();
- int need_check = have_exclusive_resources();
+ cluster_member_list_t *membership;
+ int need_check;
+
+ if (rg_locked()) {
+ /* don't even calc if rg's locked */
+ return RG_EFAIL;
+ }
+
+ need_check = have_exclusive_resources();
+ membership = member_list();
/* XXX ok, so we need to say "should I start this if I was the
only cluster member online */
@@ -2042,25 +2050,28 @@
svcName, 1);
if (target == me) {
ret = handle_start_remote_req(svcName, request);
+ if (ret == RG_EAGAIN)
+ goto out;
} else if (target < 0) {
- free_member_list(allowed_nodes);
- return RG_EFAIL;
+ goto out;
} else {
ret = relocate_service(svcName, request, target);
}
switch(ret) {
case RG_ESUCCESS:
- return RG_ESUCCESS;
+ ret = RG_ESUCCESS;
+ goto out;
case RG_ERUN:
- return RG_ERUN;
+ ret = RG_ERUN;
+ goto out;
case RG_EFAIL:
memb_mark_down(allowed_nodes, target);
continue;
case RG_EABORT:
svc_report_failure(svcName);
- free_member_list(allowed_nodes);
- return RG_EFAIL;
+ ret = RG_EFAIL;
+ goto out;
default:
clulog(LOG_ERR,
"#6X: Invalid reply [%d] from member %d during"
@@ -2068,6 +2079,7 @@
}
}
+out:
free_member_list(allowed_nodes);
- return RG_EFAIL;
+ return ret;
}
--- cluster/rgmanager/src/daemons/rg_thread.c 2007/07/10 18:25:26 1.23
+++ cluster/rgmanager/src/daemons/rg_thread.c 2007/07/23 20:49:13 1.24
@@ -60,19 +60,39 @@
SIGUSR1 output
*/
void
-dump_threads(void)
+dump_threads(FILE *fp)
{
resthread_t *rt;
+ request_t *req;
+ int x = 0, y = 0;
- printf("+++ BEGIN Thread dump\n");
+ fprintf(fp, "Resource Group Threads \n");
pthread_mutex_lock(&reslist_mutex);
- list_do(&resthread_list, rt) {
- printf("TID %d group %s (@ %p) request %d\n",
- (int)rt->rt_thread,
- rt->rt_name, rt, rt->rt_request);
- } while (!list_done(&resthread_list, rt));
+ list_for(&resthread_list, rt, x) {
+ fprintf(fp, " %s id:%d (@ %p) processing %s request (%d)\n",
+ rt->rt_name,
+ (unsigned)rt->rt_thread,
+ rt,
+ rg_req_str(rt->rt_request),
+ rt->rt_request);
+ if (!*rt->rt_queue) {
+ fprintf(fp, " Pending requests: \n");
+ list_for(rt->rt_queue, req, y) {
+ fprintf(fp, " %s tgt:%d ctx:%p a0:%d a1:%d\n",
+ rg_req_str(req->rr_request),
+ req->rr_target,
+ req->rr_resp_ctx,
+ req->rr_arg0,
+ req->rr_arg1);
+ }
+ }
+ }
+
+ x = !!resthread_list;
pthread_mutex_unlock(&reslist_mutex);
- printf("--- END Thread dump\n");
+ if (!x)
+ fprintf(fp, " (none)\n");
+ fprintf(fp, "\n");
}
@@ -151,6 +171,8 @@
dprintf("Removed request %d\n", curr->rr_request);
if (curr->rr_resp_ctx) {
send_response(RG_EABORT, 0, curr);
+ msg_close(curr->rr_resp_ctx);
+ msg_free_ctx(curr->rr_resp_ctx);
}
rq_free(curr);
}
@@ -241,12 +263,14 @@
break;
case RG_ENABLE:
+ #if 0
if (req->rr_target != 0 &&
req->rr_target != my_id()) {
error = RG_EFORWARD;
ret = RG_NONE;
break;
}
+ #endif
case RG_START:
if (req->rr_arg0) {
error = handle_fd_start_req(myname,
@@ -476,6 +500,8 @@
if (ret != RG_NONE && rg_initialized() &&
(req->rr_resp_ctx)) {
send_response(error, newowner, req);
+ msg_close(req->rr_resp_ctx);
+ msg_free_ctx(req->rr_resp_ctx);
}
rq_free(req);
@@ -565,7 +591,6 @@
int ret;
resthread_t *resgroup = NULL;
-retry:
pthread_mutex_lock(&reslist_mutex);
while (resgroup == NULL) {
resgroup = find_resthread_byname(resgroupname);
@@ -584,7 +609,7 @@
pthread_mutex_unlock(&reslist_mutex);
if (wait_initialize(resgroupname) < 0) {
- goto retry;
+ return -1;
}
return ret;
@@ -689,6 +714,8 @@
case RG_ENABLE:
send_ret(response_ctx, resgroup->rt_name, RG_EDEADLCK,
request);
+ msg_close(response_ctx);
+ msg_free_ctx(response_ctx);
break;
}
fprintf(stderr, "Failed to queue request: Would block\n");
--- cluster/rgmanager/src/resources/service.sh 2007/04/05 15:08:20 1.9
+++ cluster/rgmanager/src/resources/service.sh 2007/07/23 20:49:13 1.10
@@ -67,7 +67,7 @@
<content type="string"/>
</parameter>
- <parameter name="autostart">
+ <parameter name="autostart" reconfig="1">
<longdesc lang="en">
If set to yes, this resource group will automatically be started
after the cluster forms a quorum. If set to no, this resource
@@ -80,7 +80,7 @@
<content type="boolean"/>
</parameter>
- <parameter name="hardrecovery">
+ <parameter name="hardrecovery" reconfig="1">
<longdesc lang="en">
If set to yes, the last owner will reboot if this resource
group fails to stop cleanly, thus allowing the resource
@@ -128,7 +128,7 @@
<content type="boolean"/>
</parameter>
- <parameter name="recovery">
+ <parameter name="recovery" reconfig="1">
<longdesc lang="en">
This currently has three possible options: "restart" tries
to restart failed parts of this resource group locally before
--- cluster/rgmanager/src/utils/clusvcadm.c 2007/06/14 15:06:52 1.20
+++ cluster/rgmanager/src/utils/clusvcadm.c 2007/07/23 20:49:13 1.21
@@ -390,7 +390,10 @@
printf("Member %s %s %s", nodename, actionstr, svcname);
printf("...");
fflush(stdout);
- msg_open(MSG_SOCKET, 0, RG_PORT, &ctx, 5);
+ if (msg_open(MSG_SOCKET, 0, RG_PORT, &ctx, 5) < 0) {
+ printf("Could not connect to resource group manager\n");
+ return 1;
+ }
} else {
if (!svctarget)
printf("Trying to %s %s", actionstr, svcname);
@@ -399,7 +402,10 @@
nodename);
printf("...");
fflush(stdout);
- msg_open(MSG_SOCKET, 0, RG_PORT, &ctx, 5);
+ if (msg_open(MSG_SOCKET, 0, RG_PORT, &ctx, 5) < 0) {
+ printf("Could not connect to resource group manager\n");
+ return 1;
+ }
}
if (ctx.type < 0) {
More information about the Cluster-devel
mailing list