[Cluster-devel] cluster/rgmanager ChangeLog src/clulib/Makefil ...
lhh at sourceware.org
lhh at sourceware.org
Tue Jul 24 18:49:20 UTC 2007
CVSROOT: /cvs/cluster
Module name: cluster
Branch: RHEL51
Changes by: lhh at sourceware.org 2007-07-24 18:49:18
Modified files:
rgmanager : ChangeLog
rgmanager/src/clulib: Makefile msg_cluster.c msgtest.c vft.c
rgmanager/src/daemons: Makefile groups.c main.c nodeevent.c
rg_event.c rg_forward.c rg_state.c
rg_thread.c
rgmanager/src/resources: Makefile vm.sh
rgmanager/src/utils: clusvcadm.c
Added files:
rgmanager/src/clulib: tmgr.c
Log message:
Resolves: #247291 #249314 #249408
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/ChangeLog.diff?cvsroot=cluster&only_with_tag=RHEL51&r1=1.31.2.19&r2=1.31.2.19.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/tmgr.c.diff?cvsroot=cluster&only_with_tag=RHEL51&r1=NONE&r2=1.1.4.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/Makefile.diff?cvsroot=cluster&only_with_tag=RHEL51&r1=1.10.2.2&r2=1.10.2.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/msg_cluster.c.diff?cvsroot=cluster&only_with_tag=RHEL51&r1=1.4&r2=1.4.6.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/msgtest.c.diff?cvsroot=cluster&only_with_tag=RHEL51&r1=1.2&r2=1.2.6.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/clulib/vft.c.diff?cvsroot=cluster&only_with_tag=RHEL51&r1=1.17.2.2&r2=1.17.2.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/Makefile.diff?cvsroot=cluster&only_with_tag=RHEL51&r1=1.14.2.2&r2=1.14.2.2.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/groups.c.diff?cvsroot=cluster&only_with_tag=RHEL51&r1=1.25.2.9&r2=1.25.2.9.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/main.c.diff?cvsroot=cluster&only_with_tag=RHEL51&r1=1.34.2.6&r2=1.34.2.6.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/nodeevent.c.diff?cvsroot=cluster&only_with_tag=RHEL51&r1=1.4.2.3&r2=1.4.2.3.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_event.c.diff?cvsroot=cluster&only_with_tag=RHEL51&r1=1.1&r2=1.1.6.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_forward.c.diff?cvsroot=cluster&only_with_tag=RHEL51&r1=1.8.2.1&r2=1.8.2.1.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_state.c.diff?cvsroot=cluster&only_with_tag=RHEL51&r1=1.24.2.10&r2=1.24.2.10.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/daemons/rg_thread.c.diff?cvsroot=cluster&only_with_tag=RHEL51&r1=1.15.2.7&r2=1.15.2.7.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/Makefile.diff?cvsroot=cluster&only_with_tag=RHEL51&r1=1.13.2.5&r2=1.13.2.5.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/resources/vm.sh.diff?cvsroot=cluster&only_with_tag=RHEL51&r1=1.1.2.4&r2=1.1.2.4.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/rgmanager/src/utils/clusvcadm.c.diff?cvsroot=cluster&only_with_tag=RHEL51&r1=1.12.2.4&r2=1.12.2.4.2.1
--- cluster/rgmanager/ChangeLog 2007/07/10 18:24:00 1.31.2.19
+++ cluster/rgmanager/ChangeLog 2007/07/24 18:49:17 1.31.2.19.2.1
@@ -1,3 +1,28 @@
+2007-07-24 Lon Hohberger <lhh at redhat.com>
+ * general: make threads exit with pthread_exit() so we can wrap/track them.
+ Add internal statedump (SIGUSR1) support.
+ * src/clulib/msg_cluster.c: Fix rare deadlock condition. bz #249314.
+ * src/clulib/vft.c: Fix rare crash if vf_resolve_views gets called with
+ NULL. bz #247291
+ * src/daemons/main.c: Fix minor memory leak in membership_update()
+ when lots of transitions occur. bz #249311. Fix crash-on-exit race
+ bz #247291. Don't exit if someone requests foreground mode.
+ * src/daemons/rg_forward.c: Clean up forwarding logic and handle missed
+ error case; fixes deadlock. bz #249314.
+ * src/daemons/rg_state.c: Move closing / free of contexts out of
+ send_ret/send_response to the caller (where they belong). Don't let
+ people relocate disabled services. bz #249311.
+ * src/daemons/rg_thread.c: Don't loop forever if the thread exits before
+ we notice that it's started. bz #249314.
+ * src/utils/clusvcadm.c: Fix error codes if you try to relocate when
+ rgmanager isn't running.
+
+2007-07-12 Marek Grac <mgrac at redhat.com>
+ * src/resources/Makefile: Fix #245178 - install RA for named
+
+2007-07-11 Lon Hohberger <lhh at redhat.com>
+ * src/resources/vm.sh: Fix #247866 / #247154
+
2007-07-10 Lon Hohberger <lhh at redhat.com>
* src/daemons/rg_thread.c, groups.c: Make status queue max 1 instead
of unbounded (resolves: 247488)
/cvs/cluster/cluster/rgmanager/src/clulib/tmgr.c,v --> standard output
revision 1.1.4.1
--- cluster/rgmanager/src/clulib/tmgr.c
+++ - 2007-07-24 18:49:18.845188000 +0000
@@ -0,0 +1,128 @@
+/*
+ Copyright Red Hat, Inc. 2007
+ Copyright Crosswalk 2006-2007
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 2, or (at your option) any
+ later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 675 Mass Ave, Cambridge,
+ MA 02139, USA.
+*/
+#ifdef WRAP_THREADS
+#include <stdio.h>
+#include <sys/types.h>
+#include <gettid.h>
+#include <pthread.h>
+#include <string.h>
+#include <errno.h>
+#include <malloc.h>
+#include <string.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <pthread.h>
+#include <list.h>
+#include <execinfo.h>
+
+typedef struct _thr {
+ list_head();
+ void *(*fn)(void *arg);
+ char **name;
+ pthread_t th;
+} mthread_t;
+
+static mthread_t *_tlist = NULL;
+static int _tcount = 0;
+static pthread_rwlock_t _tlock = PTHREAD_RWLOCK_INITIALIZER;
+
+void
+dump_thread_states(FILE *fp)
+{
+ int x;
+ mthread_t *curr;
+ fprintf(fp, "Thread Information\n");
+ pthread_rwlock_rdlock(&_tlock);
+ list_for(&_tlist, curr, x) {
+ fprintf(fp, " Thread #%d id: %d function: %s\n",
+ x, (unsigned)curr->th, curr->name[0]);
+ }
+ pthread_rwlock_unlock(&_tlock);
+ fprintf(fp, "\n\n");
+}
+
+
+int __real_pthread_create(pthread_t *, const pthread_attr_t *,
+ void *(*)(void*), void *);
+int
+__wrap_pthread_create(pthread_t *th, const pthread_attr_t *attr,
+ void *(*start_routine)(void*),
+ void *arg)
+{
+ void *fn = start_routine;
+ mthread_t *new;
+ int ret;
+
+ new = malloc(sizeof (*new));
+
+ ret = __real_pthread_create(th, attr, start_routine, arg);
+ if (ret) {
+ if (new)
+ free(new);
+ return ret;
+ }
+
+ if (new) {
+ new->th = *th;
+ new->fn = start_routine;
+ new->name = backtrace_symbols(&fn, 1);
+ pthread_rwlock_wrlock(&_tlock);
+ list_insert(&_tlist, new);
+ ++_tcount;
+ pthread_rwlock_unlock(&_tlock);
+ }
+
+ return ret;
+}
+
+
+void __real_pthread_exit(void *);
+void
+__wrap_pthread_exit(void *exitval)
+{
+ mthread_t *old;
+ int ret = 0, found = 0;
+ pthread_t me = pthread_self();
+
+ pthread_rwlock_rdlock(&_tlock);
+ list_for(&_tlist, old, ret) {
+ if (old->th == me) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found)
+ old = NULL;
+ pthread_rwlock_unlock(&_tlock);
+
+ if (!old)
+ __real_pthread_exit(exitval);
+
+ pthread_rwlock_wrlock(&_tlock);
+ list_remove(&_tlist, old);
+ --_tcount;
+ pthread_rwlock_unlock(&_tlock);
+
+ if (old->name)
+ free(old->name);
+ free(old);
+ __real_pthread_exit(exitval);
+}
+#endif
--- cluster/rgmanager/src/clulib/Makefile 2007/05/10 16:23:43 1.10.2.2
+++ cluster/rgmanager/src/clulib/Makefile 2007/07/24 18:49:18 1.10.2.2.2.1
@@ -17,7 +17,7 @@
INCLUDE += -I $(top_srcdir)/include -I $(top_srcdir)/../cman/lib -I $(top_srcdir)/../ccs/lib -I $(top_srcdir)/../dlm/lib
INCLUDE += -I${incdir}
-CFLAGS+= -g -Wstrict-prototypes -Wshadow -fPIC -D_GNU_SOURCE
+CFLAGS+= -g -Wstrict-prototypes -Wshadow -fPIC -D_GNU_SOURCE -DWRAP_THREADS
CFLAGS+= -DCMAN_RELEASE_NAME=\"${RELEASE}\"
TARGETS=libclulib.a liblalloc.a msgtest
@@ -34,7 +34,7 @@
libclulib.a: clulog.o daemon_init.o signals.o msgsimple.o \
gettid.o rg_strings.o message.o members.o fdops.o \
lock.o cman.o vft.o msg_cluster.o msg_socket.o \
- wrap_lock.o
+ wrap_lock.o tmgr.o
${AR} cru $@ $^
ranlib $@
--- cluster/rgmanager/src/clulib/msg_cluster.c 2006/10/23 22:47:00 1.4
+++ cluster/rgmanager/src/clulib/msg_cluster.c 2007/07/24 18:49:18 1.4.6.1
@@ -46,7 +46,7 @@
static msgctx_t *contexts[MAX_CONTEXTS];
static int _me = 0;
pthread_t comms_thread;
-int thread_running;
+int thread_running = 0;
#define is_established(ctx) \
(((ctx->type == MSG_CLUSTER) && \
@@ -856,7 +856,6 @@
errno = EINVAL;
cluster_msg_hdr_t *m;
msg_q_t *n;
- char done = 0;
char foo;
if (!listenctx || !acceptctx)
@@ -884,24 +883,38 @@
m = n->message;
switch(m->msg_control) {
case M_OPEN:
+ /* XXX make this case statement its own function or at
+ least make it not a big case block . */
list_remove(&listenctx->u.cluster_info.queue, n);
/*printf("Accepting connection from %d %d\n",
m->src_nodeid, m->src_ctx);*/
- /* New connection */
+ /* Release lock on listen context queue; we're done
+ with it at this point */
+ pthread_mutex_unlock(&listenctx->u.cluster_info.mutex);
+
+ /* New connection: first, create + lock the mutex */
pthread_mutex_init(&acceptctx->u.cluster_info.mutex,
NULL);
+ /* Lock this while we finish initializing */
+ pthread_mutex_lock(&acceptctx->u.cluster_info.mutex);
+
pthread_cond_init(&acceptctx->u.cluster_info.cond,
NULL);
+
acceptctx->u.cluster_info.queue = NULL;
acceptctx->u.cluster_info.remote_ctx = m->src_ctx;
acceptctx->u.cluster_info.nodeid = m->src_nodeid;
acceptctx->u.cluster_info.port = m->msg_port;
acceptctx->flags = (SKF_READ | SKF_WRITE);
- if (assign_ctx(acceptctx) < 0) {
+ /* assign_ctx requires the context lock. We need to
+ ensure we don't try to take the context lock w/ a local
+ queue lock held on a context that's in progress (i.e.
+ the global cluster context...) */
+ if (assign_ctx(acceptctx) < 0)
printf("FAILED TO ASSIGN CONTEXT\n");
- }
+
cluster_send_control_msg(acceptctx, M_OPEN_ACK);
if (listenctx->u.cluster_info.select_pipe[0] >= 0) {
@@ -910,11 +923,14 @@
&foo, 1);
}
- done = 1;
free(m);
free(n);
- break;
+ /* Let the new context go. */
+ pthread_mutex_unlock(&acceptctx->u.cluster_info.mutex);
+ return 0;
+ /* notreached */
+
case M_DATA:
/* Data messages (i.e. from broadcast msgs) are
okay too!... but we don't handle them here */
@@ -925,9 +941,6 @@
break;
}
- if (done)
- break;
-
} while (!list_done(&listenctx->u.cluster_info.queue, n));
pthread_mutex_unlock(&listenctx->u.cluster_info.mutex);
@@ -950,7 +963,7 @@
poll_cluster_messages(2);
}
- return NULL;
+ pthread_exit(NULL);
}
@@ -1105,7 +1118,7 @@
pthread_attr_init(&attrs);
pthread_attr_setinheritsched(&attrs, PTHREAD_INHERIT_SCHED);
- pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
+ /*pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);*/
thread_running = 1;
pthread_create(&comms_thread, &attrs, cluster_comms_thread, NULL);
@@ -1130,16 +1143,81 @@
}
+void
+dump_cluster_ctx(FILE *fp)
+{
+ int x;
+ msgctx_t *ctx;
+
+ fprintf(fp, "CMAN/mux subsystem status\n");
+ if (thread_running) {
+ fprintf(fp, " Thread: %d\n", (unsigned)comms_thread);
+ } else {
+ fprintf(fp, " Thread Offline\n");
+ }
+
+ pthread_mutex_lock(&context_lock);
+ for (x = 0; x < MAX_CONTEXTS; x++) {
+ if (!contexts[x])
+ continue;
+ ctx = contexts[x];
+
+ fprintf(fp, " Cluster Message Context %p\n", ctx);
+ fprintf(fp, " Flags %08x ", ctx->flags);
+ if (ctx->flags & SKF_READ)
+ fprintf(fp, "SKF_READ ");
+ if (ctx->flags & SKF_WRITE)
+ fprintf(fp, "SKF_WRITE ");
+ if (ctx->flags & SKF_LISTEN)
+ fprintf(fp, "SKF_LISTEN ");
+ if (ctx->flags & SKF_MCAST)
+ fprintf(fp, "SKF_MCAST ");
+ fprintf(fp, "\n");
+ fprintf(fp, " Target node ID %d\n", ctx->u.cluster_info.nodeid);
+ fprintf(fp, " Local Index %d\n", ctx->u.cluster_info.local_ctx);
+ fprintf(fp, " Remote Index %d\n", ctx->u.cluster_info.remote_ctx);
+ }
+ pthread_mutex_unlock(&context_lock);
+ fprintf(fp, "\n");
+}
+
+
int
cluster_msg_shutdown(void)
{
cman_handle_t ch;
+ cluster_msg_hdr_t m;
+ msgctx_t *ctx;
+ int x;
+
+ thread_running = 0;
+ pthread_join(comms_thread, NULL);
ch = cman_lock(1, SIGUSR2);
cman_end_recv_data(ch);
- pthread_kill(comms_thread, SIGTERM);
cman_unlock(ch);
+ /* Send close message to all open contexts */
+ memset(&m, 0, sizeof(m));
+ m.msg_control = M_CLOSE;
+
+ pthread_mutex_lock(&context_lock);
+ for (x = 0; x < MAX_CONTEXTS; x++) {
+ if (!contexts[x])
+ continue;
+
+ ctx = contexts[x];
+
+ /* Kill remote side if it exists */
+ if (is_established(ctx))
+ cluster_send_control_msg(ctx, M_CLOSE);
+
+ /* Queue close for local side */
+ queue_for_context(ctx, (void *)&m, sizeof(m));
+ }
+ pthread_mutex_unlock(&context_lock);
+
+
return 0;
}
--- cluster/rgmanager/src/clulib/msgtest.c 2006/08/07 22:05:01 1.2
+++ cluster/rgmanager/src/clulib/msgtest.c 2007/07/24 18:49:18 1.2.6.1
@@ -49,7 +49,7 @@
if (msg_open(MSG_CLUSTER, 0, MYPORT, &ctx, 0) != 0) {
printf("Could not set up mcast socket!\n");
- return NULL;
+ pthread_exit(NULL);
}
printf("PIGGYBACK CONTEXT\n");
@@ -66,7 +66,7 @@
printf("PIGGY flies...\n");
- return NULL;
+ pthread_exit(NULL);
}
@@ -102,7 +102,7 @@
if (msg_open(MSG_CLUSTER, 0, MYPORT, &ctx, 1) != 0) {
printf("Could not set up mcast socket!\n");
- return NULL;
+ pthread_exit(NULL);
}
snprintf(buf, sizeof(buf), "Babble, babble\n");
@@ -116,7 +116,7 @@
printf("Private thread is outta here...\n");
- return NULL;
+ pthread_exit(NULL);
}
--- cluster/rgmanager/src/clulib/vft.c 2007/05/10 16:23:43 1.17.2.2
+++ cluster/rgmanager/src/clulib/vft.c 2007/07/24 18:49:18 1.17.2.2.2.1
@@ -121,9 +121,9 @@
struct vf_args {
- uint16_t port;
- int local_node_id;
msgctx_t *ctx;
+ int local_node_id;
+ uint16_t port;
};
@@ -277,6 +277,9 @@
uint32_t datalen;
uint32_t trans;
+ if (!key_node)
+ return 0;
+
while ((trans = vf_try_commit(key_node)) != 0) {
commits++;
}
@@ -895,7 +898,7 @@
msg_close(ctx);
msg_free_ctx(ctx);
- return NULL;
+ pthread_exit(NULL);
}
@@ -1776,3 +1779,40 @@
return VFR_OK;
}
+
+void
+dump_vf_states(FILE *fp)
+{
+ key_node_t *cur;
+
+ fprintf(fp, "View-Formation States:\n");
+ fprintf(fp, " Thread: %d\n", (unsigned)vf_thread);
+ fprintf(fp, " Default callbacks:\n Vote: %p\n Commit: %p\n",
+ default_vote_cb, default_commit_cb);
+ fprintf(fp, " Distributed key metadata:\n");
+
+ pthread_mutex_lock(&key_list_mutex);
+
+ for (cur = key_list; cur; cur = cur->kn_next) {
+ fprintf(fp, " %s, View: %d, Size: %d, Address: %p\n",
+ cur->kn_keyid,
+ (int)cur->kn_viewno,
+ cur->kn_datalen,
+ cur->kn_data);
+ if (cur->kn_vote_cb != default_vote_cb)
+ fprintf(fp, " Vote callback: %p\n", cur->kn_vote_cb);
+ if (cur->kn_commit_cb != default_commit_cb)
+ fprintf(fp, " Commit callback: %p\n", cur->kn_commit_cb);
+
+ if (cur->kn_jvlist)
+ fprintf(fp, " This key has unresolved "
+ "new views pending\n");
+ if (cur->kn_clist)
+ fprintf(fp, " This key has unresolved "
+ "commits pending\n");
+
+ }
+
+ pthread_mutex_unlock(&key_list_mutex);
+ fprintf(fp, "\n");
+}
--- cluster/rgmanager/src/daemons/Makefile 2007/03/20 17:09:11 1.14.2.2
+++ cluster/rgmanager/src/daemons/Makefile 2007/07/24 18:49:18 1.14.2.2.2.1
@@ -17,9 +17,9 @@
INCLUDE += -I $(top_srcdir)/include -I $(top_srcdir)/../cman/lib -I $(top_srcdir)/../ccs/lib -I $(top_srcdir)/../dlm/lib
INCLUDE += -I${incdir} -I/usr/include/libxml2
-CFLAGS+= -g -Wstrict-prototypes -Wshadow -fPIC -D_GNU_SOURCE
+CFLAGS+= -g -Wstrict-prototypes -Wshadow -fPIC -D_GNU_SOURCE -DWRAP_THREADS
-LDFLAGS+= -L ../clulib -L../../../cman/lib -L../../../ccs/lib -L../../../dlm/lib -L${libdir} -lclulib -lxml2 -lpthread -ldl
+LDFLAGS+= -L ../clulib -L../../../cman/lib -L../../../ccs/lib -L../../../dlm/lib -L${libdir} -lclulib -lxml2 -lpthread -ldl -Wl,-wrap,pthread_create,-wrap,pthread_exit -rdynamic
TARGETS=clurgmgrd clurmtabd rg_test
all: ${TARGETS}
--- cluster/rgmanager/src/daemons/groups.c 2007/07/10 18:24:00 1.25.2.9
+++ cluster/rgmanager/src/daemons/groups.c 2007/07/24 18:49:18 1.25.2.9.2.1
@@ -1030,7 +1030,7 @@
msg_send_simple(ctx, RG_FAIL, RG_EAGAIN, 0);
msg_close(ctx);
msg_free_ctx(ctx);
- return NULL;
+ pthread_exit(NULL);
}
pthread_rwlock_rdlock(&resource_lock);
@@ -1053,7 +1053,7 @@
rg_dec_status();
- return NULL;
+ pthread_exit(NULL);
}
@@ -1169,7 +1169,7 @@
/* Only one status thread at a time, please! */
if (pthread_mutex_trylock(&status_mutex) != 0)
- return NULL;
+ pthread_exit(NULL);
pthread_rwlock_rdlock(&resource_lock);
list_do(&_tree, curr) {
@@ -1195,7 +1195,7 @@
pthread_rwlock_unlock(&resource_lock);
pthread_mutex_unlock(&status_mutex);
- return NULL;
+ pthread_exit(NULL);
}
@@ -1397,6 +1397,13 @@
}
+void
+dump_config_version(FILE *fp)
+{
+ fprintf(fp, "Cluster configuration version %d\n\n", config_version);
+}
+
+
/**
Initialize resource groups. This reads all the resource groups from
CCS, builds the tree, etc. Ideally, we'll have a similar function
--- cluster/rgmanager/src/daemons/main.c 2007/06/26 21:55:46 1.34.2.6
+++ cluster/rgmanager/src/daemons/main.c 2007/07/24 18:49:18 1.34.2.6.2.1
@@ -40,6 +40,9 @@
#define L_SYS (1<<1)
#define L_USER (1<<0)
+#ifdef WRAP_THREADS
+void dump_thread_states(FILE *);
+#endif
int configure_logging(int ccsfd, int debug);
void node_event(int, int, int, int);
@@ -63,7 +66,7 @@
int next_node_id(cluster_member_list_t *membership, int me);
int rg_event_q(char *svcName, uint32_t state, int owner);
-
+void malloc_dump_table(FILE *, size_t, size_t);
void
segfault(int sig)
@@ -259,6 +262,7 @@
free_member_list(node_delta);
free_member_list(new_ml);
+ free_member_list(old_membership);
rg_unlockall(L_SYS);
@@ -405,7 +409,8 @@
sz = msg_receive(ctx, msg_hdr, sizeof(msgbuf), 1);
if (sz < sizeof (generic_msg_hdr)) {
clulog(LOG_ERR,
- "#37: Error receiving message header (%d)\n", sz);
+ "#37: Error receiving header from %d sz=%d CTX %p\n",
+ nodeid, sz, ctx);
goto out;
}
@@ -593,6 +598,7 @@
break;
case M_DATA:
+ nodeid = msg_get_nodeid(ctx);
dispatch_msg(ctx, nodeid, 0);
break;
@@ -629,7 +635,26 @@
}
-void dump_threads(void);
+void dump_threads(FILE *fp);
+void dump_config_version(FILE *fp);
+void dump_vf_states(FILE *fp);
+void dump_cluster_ctx(FILE *fp);
+
+void
+dump_internal_state(char *loc)
+{
+ FILE *fp;
+ fp=fopen(loc, "w+");
+ dump_config_version(fp);
+ dump_threads(fp);
+ dump_vf_states(fp);
+#ifdef WRAP_THREADS
+ dump_thread_states(fp);
+#endif
+ dump_cluster_ctx(fp);
+ //malloc_dump_table(fp, 1, 16384); /* Only works if alloc.c us used */
+ fclose(fp);
+}
int
event_loop(msgctx_t *localctx, msgctx_t *clusterctx)
@@ -645,10 +670,8 @@
if (signalled) {
signalled = 0;
- /*
- malloc_stats();
- dump_threads();
- */
+
+ dump_internal_state("/tmp/rgmanager-dump");
}
while (running && (tv.tv_sec || tv.tv_usec)) {
@@ -747,7 +770,6 @@
cleanup(msgctx_t *clusterctx)
{
kill_resource_groups();
- member_list_update(NULL);
send_exit_msg(clusterctx);
}
@@ -760,7 +782,7 @@
}
-void malloc_dump_table(size_t, size_t);
+void malloc_dump_table(FILE *, size_t, size_t);
/*
@@ -846,10 +868,13 @@
rg_doall(RG_STOP_EXITING, 1, NULL);
running = 0;
- return 0;
+ pthread_exit(NULL);
}
+#ifdef WRAP_THREADS
+void dump_thread_states(FILE *);
+#endif
int
main(int argc, char **argv)
{
@@ -871,7 +896,9 @@
break;
case 'f':
foreground = 1;
+ break;
default:
+ return 1;
break;
}
}
@@ -984,6 +1011,9 @@
event_loop(local_ctx, cluster_ctx);
if (shutdown_pending == 1) {
+ /* Kill local socket; local requests need to
+ be ignored here */
+ msg_close(local_ctx);
++shutdown_pending;
clulog(LOG_NOTICE, "Shutting down\n");
pthread_create(&th, NULL, shutdown_thread, NULL);
--- cluster/rgmanager/src/daemons/nodeevent.c 2007/06/26 21:55:46 1.4.2.3
+++ cluster/rgmanager/src/daemons/nodeevent.c 2007/07/24 18:49:18 1.4.2.3.2.1
@@ -196,7 +196,7 @@
/* Mutex held */
ne_thread = 0;
pthread_mutex_unlock(&ne_queue_mutex);
- return NULL;
+ pthread_exit(NULL);
}
--- cluster/rgmanager/src/daemons/rg_event.c 2006/07/11 23:52:41 1.1
+++ cluster/rgmanager/src/daemons/rg_event.c 2007/07/24 18:49:18 1.1.6.1
@@ -64,7 +64,7 @@
/* Mutex held */
rg_ev_thread = 0;
pthread_mutex_unlock(&rg_queue_mutex);
- return NULL;
+ pthread_exit(NULL);
}
--- cluster/rgmanager/src/daemons/rg_forward.c 2006/12/13 18:38:41 1.8.2.1
+++ cluster/rgmanager/src/daemons/rg_forward.c 2007/07/24 18:49:18 1.8.2.1.2.1
@@ -24,6 +24,7 @@
#include <msgsimple.h>
#include <clulog.h>
#include <message.h>
+#include <members.h>
void
@@ -49,59 +50,100 @@
request_t *req = (request_t *)arg;
struct dlm_lksb lockp;
msgctx_t *ctx = NULL;
+ cluster_member_list_t *m = NULL;
SmMessageSt msg;
+ int response_code = RG_EAGAIN, ret;
+ int new_owner = 0, retries = 0;
- if (rg_lock(req->rr_group, &lockp) != 0)
+ if (rg_lock(req->rr_group, &lockp) != 0) {
+ clulog(LOG_WARNING, "FW: Forwarding failed; lock unavailable for %s\n",
+ req->rr_group);
goto out_fail;
-
+ }
if (get_rg_state(req->rr_group, &rgs) != 0) {
rg_unlock(&lockp);
+ clulog(LOG_WARNING, "FW: Forwarding failed; state unavailable for %s\n",
+ req->rr_group);
goto out_fail;
}
-
rg_unlock(&lockp);
- /* Construct message */
- build_message(&msg, req->rr_request, req->rr_group, req->rr_target);
-
if (rgs.rs_owner == 0)
rgs.rs_owner = req->rr_target;
if (rgs.rs_owner == 0) {
- msg_close(req->rr_resp_ctx);
- msg_free_ctx(req->rr_resp_ctx);
- rq_free(req);
- clulog(LOG_ERR, "Attempt to forward to invalid node ID\n");
- pthread_exit(NULL);
+ clulog(LOG_ERR, "FW: Attempt to forward to invalid node ID\n");
+ goto out_fail;
+ }
+ if (rgs.rs_owner == my_id()) {
+ clulog(LOG_WARNING, "BUG! Attempt to forward to myself!\n");
+ goto out_fail;
}
- clulog(LOG_DEBUG, "Forwarding %s request to %d\n",
+ clulog(LOG_DEBUG, "FW: Forwarding %s request to %d\n",
rg_req_str(req->rr_request), rgs.rs_owner);
- while ((ctx = msg_new_ctx()) == NULL)
- sleep(1);
-
- if (msg_open(MSG_CLUSTER, rgs.rs_owner, RG_PORT, ctx, 10) < 0)
+ ctx = msg_new_ctx();
+ if (ctx == NULL) {
+ clulog(LOG_DEBUG, "FW: Failed to allocate socket context: %s\n",
+ strerror(errno));
goto out_fail;
- if (msg_send(ctx, &msg, sizeof(msg)) < sizeof(msg))
+ }
+
+ /* Construct message */
+ build_message(&msg, req->rr_request, req->rr_group, req->rr_target);
+
+ if (msg_open(MSG_CLUSTER, rgs.rs_owner, RG_PORT, ctx, 10) < 0) {
+ clulog(LOG_DEBUG, "FW: Failed to open channel to %d CTX: %p\n",
+ rgs.rs_owner, ctx);
goto out_fail;
- if (msg_receive(ctx, &msg, sizeof(msg), 600) < sizeof(msg))
+ }
+ if (msg_send(ctx, &msg, sizeof(msg)) < sizeof(msg)) {
+ clulog(LOG_DEBUG, "FW: Failed to send message to %d CTX: %p\n",
+ rgs.rs_owner, ctx);
goto out_fail;
+ }
- msg_close(ctx);
- msg_free_ctx(ctx);
+ /*
+ * Ok, we're forwarding a message to another node. Keep tabs on
+ * the node to make sure it doesn't die. Basically, wake up every
+ * now and again to make sure it's still online. If it isn't, send
+ * a response back to the caller.
+ */
+ do {
+ ret = msg_receive(ctx, &msg, sizeof(msg), 10);
+ if (ret < (int)sizeof(msg)) {
+ if (ret < 0 && errno == ETIMEDOUT) {
+ m = member_list();
+ if (!memb_online(m, rgs.rs_owner)) {
+ response_code = RG_ENODE;
+ goto out_fail;
+ }
+ free_member_list(m);
+ m = NULL;
+ continue;
+ }
+ goto out_fail;
+ }
+ break;
+ } while(++retries < 60); /* old 60 second rule */
swab_SmMessageSt(&msg);
- send_response(msg.sm_data.d_ret, msg.sm_data.d_svcOwner, req);
- rq_free(req);
- pthread_exit(NULL);
-
-out_fail: /* Failure path */
+
+ response_code = msg.sm_data.d_ret;
+ new_owner = msg.sm_data.d_svcOwner;
+
+out_fail:
+ send_response(response_code, new_owner, req);
+ msg_close(req->rr_resp_ctx);
+ msg_free_ctx(req->rr_resp_ctx);
+
if (ctx) {
msg_close(ctx);
msg_free_ctx(ctx);
}
- msg_close(req->rr_resp_ctx);
- msg_free_ctx(req->rr_resp_ctx);
+ if (m)
+ free_member_list(m);
+
rq_free(req);
pthread_exit(NULL);
}
--- cluster/rgmanager/src/daemons/rg_state.c 2007/07/02 15:13:43 1.24.2.10
+++ cluster/rgmanager/src/daemons/rg_state.c 2007/07/24 18:49:18 1.24.2.10.2.1
@@ -217,9 +217,6 @@
swab_SmMessageSt(msgp);
msg_send(ctx, msgp, sizeof(*msgp));
-
- /* :) */
- msg_close(ctx);
}
@@ -245,11 +242,6 @@
swab_SmMessageSt(msgp);
msg_send(req->rr_resp_ctx, msgp, sizeof(*msgp));
-
- /* :( */
- msg_close(req->rr_resp_ctx);
- msg_free_ctx(req->rr_resp_ctx);
- req->rr_resp_ctx = NULL;
}
@@ -548,6 +540,7 @@
break;
}
+ ret = 2;
clulog(LOG_DEBUG, "Not stopping disabled service %s\n",
svcName);
break;
@@ -1510,6 +1503,11 @@
int ret, x;
rg_state_t svcStatus;
+ get_rg_state_local(svcName, &svcStatus);
+ if (svcStatus.rs_state == RG_STATE_DISABLED ||
+ svcStatus.rs_state == RG_STATE_UNINITIALIZED)
+ return RG_EINVAL;
+
if (preferred_target > 0) {
/* TODO: simplify this and don't keep alloc/freeing
member lists */
@@ -1576,8 +1574,10 @@
* I am the ONLY one capable of running this service,
* PERIOD...
*/
- if (target == me && me != preferred_target)
+ if (target == me && me != preferred_target) {
+ free_member_list(backup);
goto exhausted;
+ }
if (target == me) {
/*
@@ -1839,8 +1839,16 @@
int tolerance = FOD_BEST;
int x;
uint32_t me = my_id();
- cluster_member_list_t *membership = member_list();
- int need_check = have_exclusive_resources();
+ cluster_member_list_t *membership;
+ int need_check;
+
+ if (rg_locked()) {
+ /* don't even calc if rg's locked */
+ return RG_EFAIL;
+ }
+
+ need_check = have_exclusive_resources();
+ membership = member_list();
/* XXX ok, so we need to say "should I start this if I was the
only cluster member online */
@@ -1933,6 +1941,8 @@
svcName, 1);
if (target == me) {
ret = handle_start_remote_req(svcName, request);
+ if (ret == RG_EAGAIN)
+ goto out;
} else if (target < 0) {
ret = RG_EFAIL;
goto out;
--- cluster/rgmanager/src/daemons/rg_thread.c 2007/07/10 18:24:00 1.15.2.7
+++ cluster/rgmanager/src/daemons/rg_thread.c 2007/07/24 18:49:18 1.15.2.7.2.1
@@ -60,19 +60,39 @@
SIGUSR1 output
*/
void
-dump_threads(void)
+dump_threads(FILE *fp)
{
resthread_t *rt;
+ request_t *req;
+ int x = 0, y = 0;
- printf("+++ BEGIN Thread dump\n");
+ fprintf(fp, "Resource Group Threads \n");
pthread_mutex_lock(&reslist_mutex);
- list_do(&resthread_list, rt) {
- printf("TID %d group %s (@ %p) request %d\n",
- (int)rt->rt_thread,
- rt->rt_name, rt, rt->rt_request);
- } while (!list_done(&resthread_list, rt));
+ list_for(&resthread_list, rt, x) {
+ fprintf(fp, " %s id:%d (@ %p) processing %s request (%d)\n",
+ rt->rt_name,
+ (unsigned)rt->rt_thread,
+ rt,
+ rg_req_str(rt->rt_request),
+ rt->rt_request);
+ if (!*rt->rt_queue) {
+ fprintf(fp, " Pending requests: \n");
+ list_for(rt->rt_queue, req, y) {
+ fprintf(fp, " %s tgt:%d ctx:%p a0:%d a1:%d\n",
+ rg_req_str(req->rr_request),
+ req->rr_target,
+ req->rr_resp_ctx,
+ req->rr_arg0,
+ req->rr_arg1);
+ }
+ }
+ }
+
+ x = !!resthread_list;
pthread_mutex_unlock(&reslist_mutex);
- printf("--- END Thread dump\n");
+ if (!x)
+ fprintf(fp, " (none)\n");
+ fprintf(fp, "\n");
}
@@ -151,6 +171,8 @@
dprintf("Removed request %d\n", curr->rr_request);
if (curr->rr_resp_ctx) {
send_response(RG_EABORT, 0, curr);
+ msg_close(curr->rr_resp_ctx);
+ msg_free_ctx(curr->rr_resp_ctx);
}
rq_free(curr);
}
@@ -241,12 +263,14 @@
break;
case RG_ENABLE:
+ #if 0
if (req->rr_target != 0 &&
req->rr_target != my_id()) {
error = RG_EFORWARD;
ret = RG_NONE;
break;
}
+ #endif
case RG_START:
if (req->rr_arg0) {
error = handle_fd_start_req(myname,
@@ -464,6 +488,8 @@
if (ret != RG_NONE && rg_initialized() &&
(req->rr_resp_ctx)) {
send_response(error, newowner, req);
+ msg_close(req->rr_resp_ctx);
+ msg_free_ctx(req->rr_resp_ctx);
}
rq_free(req);
@@ -553,7 +579,6 @@
int ret;
resthread_t *resgroup = NULL;
-retry:
pthread_mutex_lock(&reslist_mutex);
while (resgroup == NULL) {
resgroup = find_resthread_byname(resgroupname);
@@ -572,7 +597,7 @@
pthread_mutex_unlock(&reslist_mutex);
if (wait_initialize(resgroupname) < 0) {
- goto retry;
+ return -1;
}
return ret;
@@ -677,6 +702,8 @@
case RG_ENABLE:
send_ret(response_ctx, resgroup->rt_name, RG_EDEADLCK,
request);
+ msg_close(response_ctx);
+ msg_free_ctx(response_ctx);
break;
}
fprintf(stderr, "Failed to queue request: Would block\n");
--- cluster/rgmanager/src/resources/Makefile 2007/05/09 19:07:02 1.13.2.5
+++ cluster/rgmanager/src/resources/Makefile 2007/07/24 18:49:18 1.13.2.5.2.1
@@ -21,10 +21,11 @@
script.sh netfs.sh clusterfs.sh smb.sh \
apache.sh openldap.sh samba.sh mysql.sh \
postgres-8.sh tomcat-5.sh lvm.sh vm.sh \
- SAPInstance SAPDatabase
+ SAPInstance SAPDatabase named.sh
METADATA=apache.metadata openldap.metadata samba.metadata \
- mysql.metadata postgres-8.metadata tomcat-5.metadata
+ mysql.metadata postgres-8.metadata tomcat-5.metadata \
+ named.metadata
TARGETS=${RESOURCES} ocf-shellfuncs svclib_nfslock
--- cluster/rgmanager/src/resources/vm.sh 2007/06/26 21:55:46 1.1.2.4
+++ cluster/rgmanager/src/resources/vm.sh 2007/07/24 18:49:18 1.1.2.4.2.1
@@ -212,7 +212,7 @@
# controlled externally; the external monitoring app
# should.
#
- declare cmdline="restart=\"never\""
+ declare cmdline="on_shutdown=\"destroy\" on_reboot=\"destroy\" on_crash=\"destroy\""
declare varp val temp
#
--- cluster/rgmanager/src/utils/clusvcadm.c 2007/06/14 13:35:59 1.12.2.4
+++ cluster/rgmanager/src/utils/clusvcadm.c 2007/07/24 18:49:18 1.12.2.4.2.1
@@ -380,7 +380,10 @@
printf("Member %s %s %s", nodename, actionstr, svcname);
printf("...");
fflush(stdout);
- msg_open(MSG_SOCKET, 0, RG_PORT, &ctx, 5);
+ if (msg_open(MSG_SOCKET, 0, RG_PORT, &ctx, 5) < 0) {
+ printf("Could not connect to resource group manager\n");
+ return 1;
+ }
} else {
if (!svctarget)
printf("Trying to %s %s", actionstr, svcname);
@@ -389,7 +392,10 @@
nodename);
printf("...");
fflush(stdout);
- msg_open(MSG_SOCKET, 0, RG_PORT, &ctx, 5);
+ if (msg_open(MSG_SOCKET, 0, RG_PORT, &ctx, 5) < 0) {
+ printf("Could not connect to resource group manager\n");
+ return 1;
+ }
}
if (ctx.type < 0) {
More information about the Cluster-devel
mailing list