[Cluster-devel] cluster fence/fenced/group.c fence/fenced/main ...
rpeterso at sourceware.org
rpeterso at sourceware.org
Fri Oct 13 16:03:49 UTC 2006
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: rpeterso at sourceware.org 2006-10-13 16:03:48
Modified files:
fence/fenced : group.c main.c
gnbd/utils : group.c
group/dlm_controld: group.c
group/gfs_controld: group.c
group/lib : libgroup.c libgroup.h
Log message:
This fix is for bugzilla 210641: a race condition between the cman
daemons and groupd that caused a hang/failure. group_init now retries
with a timeout, and all of its callers pass a timeout value.
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/fence/fenced/group.c.diff?cvsroot=cluster&r1=1.9&r2=1.10
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/fence/fenced/main.c.diff?cvsroot=cluster&r1=1.37&r2=1.38
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/gnbd/utils/group.c.diff?cvsroot=cluster&r1=1.1&r2=1.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/dlm_controld/group.c.diff?cvsroot=cluster&r1=1.2&r2=1.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/group.c.diff?cvsroot=cluster&r1=1.2&r2=1.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/lib/libgroup.c.diff?cvsroot=cluster&r1=1.20&r2=1.21
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/lib/libgroup.h.diff?cvsroot=cluster&r1=1.16&r2=1.17
--- cluster/fence/fenced/group.c 2006/06/20 18:11:58 1.9
+++ cluster/fence/fenced/group.c 2006/10/13 16:03:47 1.10
@@ -18,6 +18,8 @@
#define DO_TERMINATE 4
#define DO_SETID 5
+#define GROUPD_TIMEOUT 10 /* seconds */
+
/* save all the params from callback functions here because we can't
do the processing within the callback function itself */
@@ -157,17 +159,14 @@
{
int rv;
- gh = group_init(NULL, "fence", 0, &callbacks);
+ gh = group_init(NULL, "fence", 0, &callbacks, GROUPD_TIMEOUT);
if (!gh) {
log_error("group_init error %d %d", (int) gh, errno);
return -ENOTCONN;
}
-
rv = group_get_fd(gh);
- if (rv < 0) {
+ if (rv < 0)
log_error("group_get_fd error %d %d", rv, errno);
- }
-
return rv;
}
--- cluster/fence/fenced/main.c 2006/08/15 17:17:45 1.37
+++ cluster/fence/fenced/main.c 2006/10/13 16:03:47 1.38
@@ -182,7 +182,7 @@
fd = find_domain(name);
if (fd) {
- log_debug("join error: domain exists");
+ log_debug("join error: domain %s exists", name);
rv = -EEXIST;
goto out;
}
@@ -204,6 +204,7 @@
rv = group_join(gh, name);
if (rv) {
log_error("group_join error %d", rv);
+ list_del(&fd->list);
free(fd);
}
out:
--- cluster/gnbd/utils/group.c 2006/05/16 19:08:18 1.1
+++ cluster/gnbd/utils/group.c 2006/10/13 16:03:47 1.2
@@ -21,6 +21,7 @@
#include "group.h"
#define MAXLINE 256
+#define GROUPD_TIMEOUT 10
/* save all the params from callback functions here because we can't
do the processing within the callback function itself */
@@ -139,7 +140,7 @@
{
int rv;
- gh = group_init(NULL, name, 0, &callbacks);
+ gh = group_init(NULL, name, 0, &callbacks, GROUPD_TIMEOUT);
if (!gh) {
log_err("group_init error %d %d", (int) gh, errno);
return -ENOTCONN;
--- cluster/group/dlm_controld/group.c 2006/08/31 18:17:00 1.2
+++ cluster/group/dlm_controld/group.c 2006/10/13 16:03:47 1.3
@@ -18,6 +18,8 @@
#define DO_TERMINATE 4
#define DO_SETID 5
+#define GROUPD_TIMEOUT 10 /* seconds */
+
/* save all the params from callback functions here because we can't
do the processing within the callback function itself */
@@ -199,7 +201,7 @@
{
int rv;
- gh = group_init(NULL, "dlm", 1, &callbacks);
+ gh = group_init(NULL, "dlm", 1, &callbacks, GROUPD_TIMEOUT);
if (!gh) {
log_error("group_init error %d %d", (int) gh, errno);
return -ENOTCONN;
--- cluster/group/gfs_controld/group.c 2006/06/15 20:41:46 1.2
+++ cluster/group/gfs_controld/group.c 2006/10/13 16:03:47 1.3
@@ -15,6 +15,8 @@
/* save all the params from callback functions here because we can't
do the processing within the callback function itself */
+#define GROUPD_TIMEOUT 10 /* seconds */
+
group_handle_t gh;
static int cb_action;
static char cb_name[MAX_GROUP_NAME_LEN+1];
@@ -168,7 +170,7 @@
int rv;
gh = group_init(NULL, LOCK_DLM_GROUP_NAME, LOCK_DLM_GROUP_LEVEL,
- &callbacks);
+ &callbacks, GROUPD_TIMEOUT);
if (!gh) {
log_error("group_init error %d %d", (int) gh, errno);
return -ENOTCONN;
--- cluster/group/lib/libgroup.c 2006/09/07 19:24:08 1.20
+++ cluster/group/lib/libgroup.c 2006/10/13 16:03:48 1.21
@@ -279,11 +279,11 @@
}
group_handle_t group_init(void *private, char *prog_name, int level,
- group_callbacks_t *cbs)
+ group_callbacks_t *cbs, int timeout)
{
struct group_handle *h;
char buf[GROUPD_MSGLEN];
- int rv, saved_errno;
+ int rv, saved_errno, i;
h = malloc(sizeof(struct group_handle));
if (!h)
@@ -295,20 +295,25 @@
h->level = level;
strncpy(h->prog_name, prog_name, 32);
- h->fd = connect_groupd();
- if (h->fd < 0)
- goto fail;
-
- memset(buf, 0, sizeof(buf));
- snprintf(buf, sizeof(buf), "setup %s %d", prog_name, level);
-
- rv = do_write(h->fd, &buf, GROUPD_MSGLEN);
- if (rv < 0)
- goto fail;
-
- return (group_handle_t) h;
-
- fail:
+ for (i = 0; !timeout || i < timeout * 2; i++) {
+ h->fd = connect_groupd();
+ if (h->fd > 0 || !timeout) /* if successful or only once allowed */
+ break;
+ usleep(500000);
+ }
+ if (h->fd > 0) {
+ memset(buf, 0, sizeof(buf));
+ snprintf(buf, sizeof(buf), "setup %s %d", prog_name, level);
+
+ for (; !timeout || i < timeout * 2; i++) {
+ rv = do_write(h->fd, &buf, GROUPD_MSGLEN);
+ if (rv >= 0)
+ return (group_handle_t) h;
+ if (!timeout)
+ break;
+ usleep(500000);
+ }
+ }
saved_errno = errno;
close(h->fd);
free(h);
@@ -475,31 +480,30 @@
return rv;
}
-int group_get_group(int level, char *name, group_data_t *data)
+int group_get_group(int level, const char *name, group_data_t *data)
{
- char buf[GROUPD_MSGLEN];
- char data_buf[sizeof(group_data_t)];
- int fd, rv, len;
+ char buf[GROUPD_MSGLEN];
+ char data_buf[sizeof(group_data_t)];
+ int fd, rv, len;
- fd = connect_groupd();
- if (fd < 0)
- return fd;
+ fd = connect_groupd();
+ if (fd < 0)
+ return fd;
- memset(buf, 0, sizeof(buf));
- snprintf(buf, sizeof(buf), "get_group %d %s", level, name);
+ memset(buf, 0, sizeof(buf));
+ snprintf(buf, sizeof(buf), "get_group %d %s", level, name);
- rv = do_write(fd, &buf, GROUPD_MSGLEN);
- if (rv < 0)
- goto out;
+ rv = do_write(fd, &buf, GROUPD_MSGLEN);
+ if (rv < 0)
+ goto out;
- rv = do_read(fd, &data_buf, sizeof(data_buf));
- if (rv < 0)
- goto out;
+ rv = do_read(fd, &data_buf, sizeof(data_buf));
+ if (rv < 0)
+ goto out;
- memcpy(data, data_buf, sizeof(group_data_t));
- rv = 0;
+ memcpy(data, data_buf, sizeof(group_data_t));
+ rv = 0;
out:
- close(fd);
- return rv;
+ close(fd);
+ return rv;
}
-
--- cluster/group/lib/libgroup.h 2006/03/02 20:24:17 1.16
+++ cluster/group/lib/libgroup.h 2006/10/13 16:03:48 1.17
@@ -54,7 +54,7 @@
group_deliver_t deliver;
} group_callbacks_t;
-group_handle_t group_init(void *private, char *prog_name, int level, group_callbacks_t *cbs);
+group_handle_t group_init(void *private, char *prog_name, int level, group_callbacks_t *cbs, int timeout);
int group_exit(group_handle_t handle);
int group_join(group_handle_t handle, char *name);
@@ -88,7 +88,7 @@
don't interfere with dispatchable callback messages. */
int group_get_groups(int max, int *count, group_data_t *data);
-int group_get_group(int level, char *name, group_data_t *data);
+int group_get_group(int level, const char *name, group_data_t *data);
#endif
More information about the Cluster-devel mailing list