[Cluster-devel] cluster fence/fenced/group.c fence/fenced/main ...

rpeterso at sourceware.org rpeterso at sourceware.org
Fri Oct 13 16:03:49 UTC 2006


CVSROOT:	/cvs/cluster
Module name:	cluster
Changes by:	rpeterso at sourceware.org	2006-10-13 16:03:48

Modified files:
	fence/fenced   : group.c main.c 
	gnbd/utils     : group.c 
	group/dlm_controld: group.c 
	group/gfs_controld: group.c 
	group/lib      : libgroup.c libgroup.h 

Log message:
	Fix for bugzilla 210641: race condition causing a hang/failure
	between the cman daemons and groupd.  group_init now retries
	until a timeout expires, and all of its callers have been
	updated to pass the new timeout argument.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/fence/fenced/group.c.diff?cvsroot=cluster&r1=1.9&r2=1.10
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/fence/fenced/main.c.diff?cvsroot=cluster&r1=1.37&r2=1.38
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/gnbd/utils/group.c.diff?cvsroot=cluster&r1=1.1&r2=1.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/dlm_controld/group.c.diff?cvsroot=cluster&r1=1.2&r2=1.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/group.c.diff?cvsroot=cluster&r1=1.2&r2=1.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/lib/libgroup.c.diff?cvsroot=cluster&r1=1.20&r2=1.21
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/lib/libgroup.h.diff?cvsroot=cluster&r1=1.16&r2=1.17

--- cluster/fence/fenced/group.c	2006/06/20 18:11:58	1.9
+++ cluster/fence/fenced/group.c	2006/10/13 16:03:47	1.10
@@ -18,6 +18,8 @@
 #define DO_TERMINATE 4
 #define DO_SETID 5
 
+#define GROUPD_TIMEOUT 10 /* seconds */
+
 /* save all the params from callback functions here because we can't
    do the processing within the callback function itself */
 
@@ -157,17 +159,14 @@
 {
 	int rv;
 
-	gh = group_init(NULL, "fence", 0, &callbacks);
+	gh = group_init(NULL, "fence", 0, &callbacks, GROUPD_TIMEOUT);
 	if (!gh) {
 		log_error("group_init error %d %d", (int) gh, errno);
 		return -ENOTCONN;
 	}
-
 	rv = group_get_fd(gh);
-	if (rv < 0) {
+	if (rv < 0)
 		log_error("group_get_fd error %d %d", rv, errno);
-	}
-
 	return rv;
 }
 
--- cluster/fence/fenced/main.c	2006/08/15 17:17:45	1.37
+++ cluster/fence/fenced/main.c	2006/10/13 16:03:47	1.38
@@ -182,7 +182,7 @@
 
 	fd = find_domain(name);
 	if (fd) {
-		log_debug("join error: domain exists");
+		log_debug("join error: domain %s exists", name);
 		rv = -EEXIST;
 		goto out;
 	}
@@ -204,6 +204,7 @@
 	rv = group_join(gh, name);
 	if (rv) {
 		log_error("group_join error %d", rv);
+		list_del(&fd->list);
 		free(fd);
 	}
  out:
--- cluster/gnbd/utils/group.c	2006/05/16 19:08:18	1.1
+++ cluster/gnbd/utils/group.c	2006/10/13 16:03:47	1.2
@@ -21,6 +21,7 @@
 #include "group.h"
 
 #define MAXLINE 256
+#define GROUPD_TIMEOUT 10
 
 /* save all the params from callback functions here because we can't
    do the processing within the callback function itself */
@@ -139,7 +140,7 @@
 {
 	int rv;
 
-	gh = group_init(NULL, name, 0, &callbacks);
+	gh = group_init(NULL, name, 0, &callbacks, GROUPD_TIMEOUT);
 	if (!gh) {
 		log_err("group_init error %d %d", (int) gh, errno);
 		return -ENOTCONN;
--- cluster/group/dlm_controld/group.c	2006/08/31 18:17:00	1.2
+++ cluster/group/dlm_controld/group.c	2006/10/13 16:03:47	1.3
@@ -18,6 +18,8 @@
 #define DO_TERMINATE 4
 #define DO_SETID 5
 
+#define GROUPD_TIMEOUT 10 /* seconds */
+
 /* save all the params from callback functions here because we can't
    do the processing within the callback function itself */
 
@@ -199,7 +201,7 @@
 {
 	int rv;
 
-	gh = group_init(NULL, "dlm", 1, &callbacks);
+	gh = group_init(NULL, "dlm", 1, &callbacks, GROUPD_TIMEOUT);
 	if (!gh) {
 		log_error("group_init error %d %d", (int) gh, errno);
 		return -ENOTCONN;
--- cluster/group/gfs_controld/group.c	2006/06/15 20:41:46	1.2
+++ cluster/group/gfs_controld/group.c	2006/10/13 16:03:47	1.3
@@ -15,6 +15,8 @@
 /* save all the params from callback functions here because we can't
    do the processing within the callback function itself */
 
+#define GROUPD_TIMEOUT 10 /* seconds */
+
 group_handle_t gh;
 static int cb_action;
 static char cb_name[MAX_GROUP_NAME_LEN+1];
@@ -168,7 +170,7 @@
 	int rv;
 
 	gh = group_init(NULL, LOCK_DLM_GROUP_NAME, LOCK_DLM_GROUP_LEVEL,
-			&callbacks);
+					&callbacks, GROUPD_TIMEOUT);
 	if (!gh) {
 		log_error("group_init error %d %d", (int) gh, errno);
 		return -ENOTCONN;
--- cluster/group/lib/libgroup.c	2006/09/07 19:24:08	1.20
+++ cluster/group/lib/libgroup.c	2006/10/13 16:03:48	1.21
@@ -279,11 +279,11 @@
 }
 
 group_handle_t group_init(void *private, char *prog_name, int level,
-			  group_callbacks_t *cbs)
+			  group_callbacks_t *cbs, int timeout)
 {
 	struct group_handle *h;
 	char buf[GROUPD_MSGLEN];
-	int rv, saved_errno;
+	int rv, saved_errno, i;
 
 	h = malloc(sizeof(struct group_handle));
 	if (!h)
@@ -295,20 +295,25 @@
 	h->level = level;
 	strncpy(h->prog_name, prog_name, 32);
 
-	h->fd = connect_groupd();
-	if (h->fd < 0)
-		goto fail;
-
-	memset(buf, 0, sizeof(buf));
-	snprintf(buf, sizeof(buf), "setup %s %d", prog_name, level);
-
-	rv = do_write(h->fd, &buf, GROUPD_MSGLEN);
-	if (rv < 0)
-		goto fail;
-
-	return (group_handle_t) h;
-
- fail:
+	for (i = 0; !timeout || i < timeout * 2; i++) {
+		h->fd = connect_groupd();
+		if (h->fd > 0 || !timeout) /* if successful or only once allowed */
+			break;
+		usleep(500000);
+	}
+	if (h->fd > 0) {
+		memset(buf, 0, sizeof(buf));
+		snprintf(buf, sizeof(buf), "setup %s %d", prog_name, level);
+
+		for (; !timeout || i < timeout * 2; i++) {
+			rv = do_write(h->fd, &buf, GROUPD_MSGLEN);
+			if (rv >= 0)
+				return (group_handle_t) h;
+			if (!timeout)
+				break;
+			usleep(500000);
+		}
+	}
 	saved_errno = errno;
 	close(h->fd);
 	free(h);
@@ -475,31 +480,30 @@
 	return rv;
 }
 
-int group_get_group(int level, char *name, group_data_t *data)
+int group_get_group(int level, const char *name, group_data_t *data)
 {
-	char buf[GROUPD_MSGLEN];
-	char data_buf[sizeof(group_data_t)];
-	int fd, rv, len;
+       char buf[GROUPD_MSGLEN];
+       char data_buf[sizeof(group_data_t)];
+       int fd, rv, len;
 
-	fd = connect_groupd();
-	if (fd < 0)
-		return fd;
+       fd = connect_groupd();
+       if (fd < 0)
+               return fd;
 
-	memset(buf, 0, sizeof(buf));
-	snprintf(buf, sizeof(buf), "get_group %d %s", level, name);
+       memset(buf, 0, sizeof(buf));
+       snprintf(buf, sizeof(buf), "get_group %d %s", level, name);
 
-	rv = do_write(fd, &buf, GROUPD_MSGLEN);
-	if (rv < 0)
-		goto out;
+       rv = do_write(fd, &buf, GROUPD_MSGLEN);
+       if (rv < 0)
+               goto out;
 
-	rv = do_read(fd, &data_buf, sizeof(data_buf));
-	if (rv < 0)
-		goto out;
+       rv = do_read(fd, &data_buf, sizeof(data_buf));
+       if (rv < 0)
+               goto out;
 
-	memcpy(data, data_buf, sizeof(group_data_t));
-	rv = 0;
+       memcpy(data, data_buf, sizeof(group_data_t));
+       rv = 0;
  out:
-	close(fd);
-	return rv;
+       close(fd);
+       return rv;
 }
-
--- cluster/group/lib/libgroup.h	2006/03/02 20:24:17	1.16
+++ cluster/group/lib/libgroup.h	2006/10/13 16:03:48	1.17
@@ -54,7 +54,7 @@
 	group_deliver_t deliver;
 } group_callbacks_t;
 
-group_handle_t group_init(void *private, char *prog_name, int level, group_callbacks_t *cbs);
+group_handle_t group_init(void *private, char *prog_name, int level, group_callbacks_t *cbs, int timeout);
 int group_exit(group_handle_t handle);
 
 int group_join(group_handle_t handle, char *name);
@@ -88,7 +88,7 @@
    don't interfere with dispatchable callback messages. */
 
 int group_get_groups(int max, int *count, group_data_t *data);
-int group_get_group(int level, char *name, group_data_t *data);
+int group_get_group(int level, const char *name, group_data_t *data);
 
 #endif
 




More information about the Cluster-devel mailing list