[Cluster-devel] cluster/group/gfs_controld cpg.c lock_dlm.h ma ...
teigland at sourceware.org
teigland at sourceware.org
Thu Jun 15 15:27:44 UTC 2006
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: teigland at sourceware.org 2006-06-15 15:27:43
Modified files:
group/gfs_controld: cpg.c lock_dlm.h main.c recover.c
Log message:
Significant reworking of how mounts are processed. The previous
approach couldn't deal with certain node failures that occured while
processing a new mounter. In this new approach, processing a mounter
is largely independent of processing node failures. Nodes failing
while processing a mounter hasn't actually been tested yet, so there
are sure to be details to fix.
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/cpg.c.diff?cvsroot=cluster&r1=1.1&r2=1.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/lock_dlm.h.diff?cvsroot=cluster&r1=1.1&r2=1.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/main.c.diff?cvsroot=cluster&r1=1.1&r2=1.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/gfs_controld/recover.c.diff?cvsroot=cluster&r1=1.1&r2=1.2
--- cluster/group/gfs_controld/cpg.c 2006/06/09 20:59:57 1.1
+++ cluster/group/gfs_controld/cpg.c 2006/06/15 15:27:43 1.2
@@ -27,7 +27,7 @@
void receive_recovery_status(struct mountgroup *mg, char *buf, int len,
int from);
void receive_recovery_done(struct mountgroup *mg, char *buf, int len, int from);
-
+char *msg_name(int type);
static void do_deliver(int nodeid, char *data, int len)
{
@@ -58,8 +58,8 @@
discard them since they're only relevant to the app group. */
if (!mg->last_callback) {
- log_group(mg, "discard message type %d len %d from %d",
- hd->type, len, nodeid);
+ log_group(mg, "discard %s len %d from %d",
+ msg_name(hd->type), len, nodeid);
return;
}
--- cluster/group/gfs_controld/lock_dlm.h 2006/06/09 20:59:57 1.1
+++ cluster/group/gfs_controld/lock_dlm.h 2006/06/15 15:27:43 1.2
@@ -111,37 +111,37 @@
char dir[PATH_MAX+1];
char options[MAX_OPTIONS_LEN+1];
- char error_msg[128];
- int mount_client;
- int remount_client;
int last_stop;
int last_start;
int last_finish;
int last_callback;
int start_event_nr;
int start_type;
- int needs_recovery;
- int our_jid;
+
+ char error_msg[128];
+ int mount_client;
+ int remount_client;
+ int init;
+ int got_our_options;
+ int got_our_journals;
+ int mount_client_notified;
+ int mount_client_delay;
+ int delay_send_journals;
+ int got_kernel_mount;
int first_mounter;
int first_mounter_done;
int emulate_first_mounter;
int wait_first_done;
- int init;
- int init2;
int low_finished_nodeid;
+
+ int needs_recovery;
+ int our_jid;
int spectator;
int readonly;
int rw;
int withdraw;
- void *journals_msg;
- int journals_msg_len;
- int journals_msg_from;
- void *options_msg;
- int options_msg_len;
- int options_msg_from;
- struct list_head saved_recovery_status;
-
+ struct list_head saved_messages;
void *start2_fn;
};
@@ -174,11 +174,12 @@
struct list_head list;
int nodeid;
int jid;
- int new;
+
int spectator;
int readonly;
int rw;
uint32_t opts;
+
int tell_gfs_to_recover;
int wait_gfs_recover_done;
int gone_event;
@@ -188,6 +189,7 @@
int recovery_status;
int withdraw;
struct dlm_lksb wd_lksb;
+ int needs_journals;
};
enum {
@@ -234,6 +236,7 @@
int do_remount(int ci, char *dir, char *mode);
int do_withdraw(char *name);
int kernel_recovery_done(char *name);
+void ping_kernel_mount(char *table);
int client_send(int ci, char *buf, int len);
--- cluster/group/gfs_controld/main.c 2006/06/09 20:59:57 1.1
+++ cluster/group/gfs_controld/main.c 2006/06/15 15:27:43 1.2
@@ -201,9 +201,10 @@
if (!strcmp(act, "change@"))
kernel_recovery_done(argv[3]);
-
else if (!strcmp(act, "offline@"))
do_withdraw(argv[3]);
+ else
+ ping_kernel_mount(argv[3]);
return 0;
}
--- cluster/group/gfs_controld/recover.c 2006/06/09 20:59:57 1.1
+++ cluster/group/gfs_controld/recover.c 2006/06/15 15:27:43 1.2
@@ -16,10 +16,12 @@
struct list_head list;
int nodeid;
int len;
- char buf[MAX_MSGLEN];
+ int type;
+ char buf[0];
};
#define SYSFS_DIR "/sys/fs"
+#define JID_INIT -9
extern char *clustername;
extern int our_nodeid;
@@ -27,12 +29,12 @@
struct list_head mounts;
+void send_journals(struct mountgroup *mg, int nodeid);
int hold_withdraw_locks(struct mountgroup *mg);
void release_withdraw_lock(struct mountgroup *mg, struct mg_member *memb);
void release_withdraw_locks(struct mountgroup *mg);
void start_participant_init_2(struct mountgroup *mg);
-void start_participant_2(struct mountgroup *mg);
void start_spectator_init_2(struct mountgroup *mg);
void start_spectator_2(struct mountgroup *mg);
@@ -53,6 +55,8 @@
return -1;
}
+ mg->got_kernel_mount = 1;
+
memset(out, 0, 16);
sprintf(out, "%d", val);
rv = write(fd, out, strlen(out));
@@ -81,6 +85,8 @@
return -1;
}
+ mg->got_kernel_mount = 1;
+
rv = read(fd, buf, len);
if (rv < 0)
log_error("read %s error %d %d", fname, rv, errno);
@@ -117,13 +123,6 @@
return NULL;
}
-void clear_new(struct mountgroup *mg)
-{
- struct mg_member *memb;
- list_for_each_entry(memb, &mg->members, list)
- memb->new = 0;
-}
-
static void start_done(struct mountgroup *mg)
{
log_group(mg, "start_done %d", mg->start_event_nr);
@@ -239,37 +238,59 @@
{
struct save_msg *sm, *sm2;
- if (list_empty(&mg->saved_recovery_status))
+ if (list_empty(&mg->saved_messages))
return;
log_group(mg, "process_saved_recovery_status");
- list_for_each_entry_safe(sm, sm2, &mg->saved_recovery_status, list) {
+ list_for_each_entry_safe(sm, sm2, &mg->saved_messages, list) {
+ if (sm->type != MSG_RECOVERY_STATUS)
+ continue;
_receive_recovery_status(mg, sm->buf, sm->len, sm->nodeid);
list_del(&sm->list);
free(sm);
}
}
+char *msg_name(int type)
+{
+ switch (type) {
+ case MSG_JOURNAL:
+ return "MSG_JOURNAL";
+ case MSG_OPTIONS:
+ return "MSG_OPTIONS";
+ case MSG_REMOUNT:
+ return "MSG_REMOUNT";
+ case MSG_PLOCK:
+ return "MSG_PLOCK";
+ case MSG_RECOVERY_STATUS:
+ return "MSG_RECOVERY_STATUS";
+ case MSG_RECOVERY_DONE:
+ return "MSG_RECOVERY_DONE";
+ }
+ return "unknown";
+}
+
/* we can receive recovery_status messages from other nodes doing start before
we actually process the corresponding start callback ourselves */
-void save_recovery_status(struct mountgroup *mg, char *buf, int len, int from)
+void save_message(struct mountgroup *mg, char *buf, int len, int from, int type)
{
struct save_msg *sm;
- sm = malloc(sizeof(struct save_msg));
+ sm = malloc(sizeof(struct save_msg) + len);
if (!sm)
return;
- memset(sm, 0, sizeof(struct save_msg));
+ memset(sm, 0, sizeof(struct save_msg) + len);
memcpy(&sm->buf, buf, len);
+ sm->type = type;
sm->len = len;
sm->nodeid = from;
- list_add_tail(&sm->list, &mg->saved_recovery_status);
+ log_group(mg, "save %s from %d len %d", msg_name(type), from, len);
- log_group(mg, "save_recovery_status from %d len %d", from, len);
+ list_add_tail(&sm->list, &mg->saved_messages);
}
void receive_recovery_status(struct mountgroup *mg, char *buf, int len,
@@ -277,7 +298,7 @@
{
switch (mg->last_callback) {
case DO_STOP:
- save_recovery_status(mg, buf, len, from);
+ save_message(mg, buf, len, from, MSG_RECOVERY_STATUS);
break;
case DO_START:
_receive_recovery_status(mg, buf, len, from);
@@ -460,6 +481,97 @@
free(buf);
}
+/* We set the new member's jid to the lowest unused jid.
+ If we're the lowest existing member (by nodeid), then
+ send jid info to the new node. */
+
+/* Look at rw/ro/spectator status of all existing mounters and whether
+ we need to do recovery. Based on that, decide if the current mount
+ mode (ro/spectator) is permitted; if not, set jid = -2. If spectator
+ mount and it's ok, set jid = -1. If ro or rw mount and it's ok, set
+ real jid. */
+
+int assign_journal(struct mountgroup *mg, struct mg_member *new)
+{
+ struct mg_member *memb;
+ int i, total, rw_count, ro_count, spect_count, invalid_count;
+
+ total = rw_count = ro_count = spect_count = invalid_count = 0;
+
+ list_for_each_entry(memb, &mg->members, list) {
+ if (memb->nodeid == new->nodeid)
+ continue;
+ total++;
+ if (memb->jid == -2)
+ invalid_count++;
+ else if (memb->spectator)
+ spect_count++;
+ else if (memb->rw)
+ rw_count++;
+ else if (memb->readonly)
+ ro_count++;
+ }
+
+ log_group(mg, "assign_journal: total %d iv %d rw %d ro %d spect %d",
+ total, invalid_count, rw_count, ro_count, spect_count);
+
+ /* do we let the new member mount? jid=-2 means no.
+ - we only allow an rw mount when the fs needs recovery
+ - we only allow a single rw mount when the fs needs recovery */
+
+ if (mg->needs_recovery) {
+ if (!new->rw || rw_count)
+ new->jid = -2;
+ }
+
+ if (new->jid == -2) {
+ log_group(mg, "assign_journal: fail - needs_recovery %d",
+ mg->needs_recovery);
+ goto out;
+ }
+
+ if (new->spectator) {
+ log_group(mg, "assign_journal: new spectator allowed");
+ new->jid = -1;
+ goto out;
+ }
+
+ for (i = 0; i < 1024; i++) {
+ memb = find_memb_jid(mg, i);
+ if (!memb) {
+ new->jid = i;
+ break;
+ }
+ }
+
+ /* Currently the fs needs recovery, i.e. none of the current
+ mounters (ro/spectators) can recover journals. So, this new rw
+ mounter is told to do first-mounter recovery of all the journals. */
+
+ if (mg->needs_recovery) {
+ log_group(mg, "assign_journal: new member OPT_RECOVER");
+ new->opts |= MEMB_OPT_RECOVER;
+ }
+
+ out:
+ log_group(mg, "assign_journal: new member %d got jid %d",
+ new->nodeid, new->jid);
+
+ /* if we're the first mounter and haven't gotten others_may_mount
+ yet, then don't send journals until kernel_recovery_done_first
+ so the second node won't mount the fs until omm. */
+
+ if (mg->low_finished_nodeid == our_nodeid) {
+ if (mg->first_mounter && !mg->first_mounter_done) {
+ log_group(mg, "delay sending journals to %d",
+ new->nodeid);
+ mg->delay_send_journals = new->nodeid;
+ } else
+ send_journals(mg, new->nodeid);
+ }
+ return 0;
+}
+
void _receive_options(struct mountgroup *mg, char *buf, int len, int from)
{
struct mg_member *memb;
@@ -475,9 +587,6 @@
return;
}
- if (from == our_nodeid)
- return;
-
if (strstr(options, "spectator")) {
memb->spectator = 1;
memb->opts |= MEMB_OPT_SPECT;
@@ -489,34 +598,58 @@
memb->opts |= MEMB_OPT_RO;
}
- log_group(mg, "receive_options from %d rw=%d ro=%d spect=%d opts=%x",
+ log_group(mg, "_receive_options from %d rw=%d ro=%d spect=%d opts=%x",
from, memb->rw, memb->readonly, memb->spectator, memb->opts);
+
+ assign_journal(mg, memb);
}
void receive_options(struct mountgroup *mg, char *buf, int len, int from)
{
struct gdlm_header *hd = (struct gdlm_header *)buf;
-
- if (hd->nodeid == our_nodeid)
- return;
+ struct mg_member *memb;
log_group(mg, "receive_options from %d len %d last_cb %d",
from, len, mg->last_callback);
- /* If last_callback isn't DO_START it means we've not gotten
- the start callback for the new node addition yet, and we need to
- save this message to be processed after we get our first start. */
-
- if (mg->last_callback != DO_START) {
- mg->options_msg = malloc(len);
- mg->options_msg_len = len;
- mg->options_msg_from = from;
- memcpy(mg->options_msg, buf, len);
- } else {
- void (*start2)(struct mountgroup *mg) = mg->start2_fn;
+ if (hd->nodeid == our_nodeid) {
+ mg->got_our_options = 1;
+ return;
+ }
+
+ if (!mg->got_our_options) {
+ log_group(mg, "ignore options from %d", from);
+ return;
+ }
+
+ /* we can receive an options message before getting the start
+ that adds the mounting node that sent the options, or
+ we can receive options messages before we get the journals
+ message for out own mount */
+
+ memb = find_memb_nodeid(mg, from);
+
+ if (!memb || !mg->got_our_journals)
+ save_message(mg, buf, len, from, MSG_OPTIONS);
+ else
_receive_options(mg, buf, len, from);
- start2(mg);
- mg->start2_fn = NULL;
+}
+
+void process_saved_options(struct mountgroup *mg)
+{
+ struct save_msg *sm, *sm2;
+
+ if (list_empty(&mg->saved_messages))
+ return;
+
+ log_group(mg, "process_saved_options");
+
+ list_for_each_entry_safe(sm, sm2, &mg->saved_messages, list) {
+ if (sm->type != MSG_OPTIONS)
+ continue;
+ _receive_options(mg, sm->buf, sm->len, sm->nodeid);
+ list_del(&sm->list);
+ free(sm);
}
}
@@ -564,6 +697,7 @@
void _receive_journals(struct mountgroup *mg, char *buf, int len, int from)
{
+ void (*start2)(struct mountgroup *mg) = mg->start2_fn;
struct mg_member *memb, *memb2;
struct gdlm_header *hd;
int *ids, count, i, nodeid, jid, opts;
@@ -571,13 +705,6 @@
hd = (struct gdlm_header *)buf;
count = (len - sizeof(struct gdlm_header)) / (NUM * sizeof(int));
-
- if (count != mg->memb_count) {
- log_error("invalid journals message len %d counts %d %d",
- len, count, mg->memb_count);
- return;
- }
-
ids = (int *) (buf + sizeof(struct gdlm_header));
for (i = 0; i < count; i++) {
@@ -615,140 +742,47 @@
memb->spectator = 1;
}
}
+
+ /* we delay processing any options messages from new mounters
+ until after we receive the journals message for our own mount */
+ process_saved_options(mg);
+
+ start2(mg);
}
void receive_journals(struct mountgroup *mg, char *buf, int len, int from)
{
struct gdlm_header *hd = (struct gdlm_header *)buf;
+ struct mg_member *memb;
int count;
- if (hd->to_nodeid && hd->to_nodeid != our_nodeid)
- return;
-
count = (len - sizeof(struct gdlm_header)) / (NUM * sizeof(int));
- log_group(mg, "receive_journals from %d len %d count %d last_cb %d",
- from, len, count, mg->last_callback);
-
- /* If init is still 1 it means we've not run do_start()
- for our join yet, and we need to save this message to be
- processed after we get our first start. */
-
-
- /* it should now be impossible to receive a journals message prior to
- our start because the node sending journals won't do so until
- receiving our options message
- if (mg->init) {
- mg->journals_msg = malloc(len);
- mg->journals_msg_len = len;
- mg->journals_msg_from = from;
- memcpy(mg->journals_msg, buf, len);
- } else {
- *******/
-
- ASSERT(mg->last_callback == DO_START);
-
- {
- void (*start2)(struct mountgroup *mg) = mg->start2_fn;
- _receive_journals(mg, buf, len, from);
- start2(mg);
- mg->start2_fn = NULL;
- }
-}
-
-/* We set the new member's jid to the lowest unused jid.
- If we're the lowest existing member (by nodeid), then
- send jid info to the new node. */
-
-/* Look at rw/ro/spectator status of all existing mounters and whether
- we need to do recovery. Based on that, decide if the current mount
- mode (ro/spectator) is permitted; if not, set jid = -2. If spectator
- mount and it's ok, set jid = -1. If ro or rw mount and it's ok, set
- real jid. */
-
-int discover_journals(struct mountgroup *mg)
-{
- struct mg_member *memb, *new = NULL;
- int i, total, rw_count, ro_count, spect_count, invalid_count;
-
- total = rw_count = ro_count = spect_count = invalid_count = 0;
+ log_group(mg, "receive_journals from %d to %d len %d count %d cb %d",
+ from, hd->to_nodeid, len, count, mg->last_callback);
- list_for_each_entry(memb, &mg->members, list) {
- if (memb->new && new) {
- log_error("more than one new member %d %d",
- new->nodeid, memb->nodeid);
- return -1;
- } else if (memb->new) {
- new = memb;
- } else {
- total++;
- if (memb->jid == -2)
- invalid_count++;
- else if (memb->spectator)
- spect_count++;
- else if (memb->rw)
- rw_count++;
- else if (memb->readonly)
- ro_count++;
- }
- }
-
- if (!new) {
- log_group(mg, "discover_journals: no new member");
- return 0;
- }
+ /* just like we can receive an options msg from a newly added node
+ before we get the start adding it, we can receive the journals
+ message sent to it before we get the start adding it */
- log_group(mg, "discover_journals: total %d iv %d rw %d ro %d spect %d",
- total, invalid_count, rw_count, ro_count, spect_count);
-
- log_group(mg, "discover_journals: new member %d rw=%d ro=%d spect=%d",
- new->nodeid, new->rw, new->readonly, new->spectator);
-
- /* do we let the new member mount? jid=-2 means no.
- - we only allow an rw mount when the fs needs recovery
- - we only allow a single rw mount when the fs needs recovery */
-
- if (mg->needs_recovery) {
- if (!new->rw || rw_count)
- new->jid = -2;
- }
-
- if (new->jid == -2) {
- log_group(mg, "discover_journals: fail - needs_recovery %d",
- mg->needs_recovery);
- goto out;
- }
-
- if (new->spectator) {
- log_group(mg, "discover_journals: new spectator allowed");
- new->jid = -1;
- goto out;
- }
-
- for (i = 0; i < 1024; i++) {
- memb = find_memb_jid(mg, i);
- if (!memb) {
- new->jid = i;
- break;
- }
+ memb = find_memb_nodeid(mg, hd->to_nodeid);
+ if (!memb) {
+ log_group(mg, "receive_journals from %d to unknown %d",
+ from, hd->to_nodeid);
+ return;
}
+ memb->needs_journals = 0;
- /* Currently the fs needs recovery, i.e. none of the current
- mounters (ro/spectators) can recover journals. So, this new rw
- mounter is told to do first-mounter recovery of all the journals. */
+ if (hd->to_nodeid && hd->to_nodeid != our_nodeid)
+ return;
- if (mg->needs_recovery) {
- log_group(mg, "discover_journals: new member OPT_RECOVER");
- new->opts |= MEMB_OPT_RECOVER;
+ if (mg->got_our_journals) {
+ log_group(mg, "receive_journals from %d duplicate", from);
+ return;
}
+ mg->got_our_journals = 1;
- out:
- log_group(mg, "discover_journals: new member %d got jid %d",
- new->nodeid, new->jid);
-
- if (mg->low_finished_nodeid == our_nodeid)
- send_journals(mg, new->nodeid);
- return 0;
+ _receive_journals(mg, buf, len, from);
}
static void add_ordered_member(struct mountgroup *mg, struct mg_member *new)
@@ -786,10 +820,13 @@
memset(memb, 0, sizeof(*memb));
memb->nodeid = nodeid;
- memb->jid = -9;
- memb->new = 1;
+ memb->jid = JID_INIT;
add_ordered_member(mg, memb);
mg->memb_count++;
+
+ if (!mg->init)
+ memb->needs_journals = 1;
+
return 0;
}
@@ -837,6 +874,8 @@
clear_memb_list(&mg->members_gone);
}
+/* This can happen before we receive a journals message for our mount. */
+
void recover_members(struct mountgroup *mg, int num_nodes,
int *nodeids, int *pos_out, int *neg_out)
{
@@ -867,13 +906,13 @@
memb->local_recovery_status = 0;
/* - journal cb for failed or withdrawing nodes
- - journal cb only if failed node finished joining
+ - failed node was assigned a journal
- no journal cb if failed node was spectator
- no journal cb if we've already done a journl cb */
if ((memb->gone_type == GROUP_NODE_FAILED ||
memb->withdraw) &&
- memb->mount_finished &&
+ memb->jid != JID_INIT &&
!memb->spectator &&
!memb->wait_gfs_recover_done) {
memb->tell_gfs_to_recover = 1;
@@ -887,7 +926,7 @@
mg->spectator,
mg->start_type,
memb->withdraw,
- memb->mount_finished,
+ memb->jid,
memb->spectator,
memb->wait_gfs_recover_done);
}
@@ -929,9 +968,8 @@
INIT_LIST_HEAD(&mg->members);
INIT_LIST_HEAD(&mg->members_gone);
INIT_LIST_HEAD(&mg->resources);
- INIT_LIST_HEAD(&mg->saved_recovery_status);
+ INIT_LIST_HEAD(&mg->saved_messages);
mg->init = 1;
- mg->init2 = 1;
strncpy(mg->name, name, MAXNAME);
@@ -1130,10 +1168,11 @@
we delayed calling start_done() (to complete adding
the second node) until here. */
- if (mg->wait_first_done) {
- clear_new(mg);
+ if (mg->wait_first_done)
start_done(mg);
- }
+
+ if (mg->delay_send_journals)
+ send_journals(mg, mg->delay_send_journals);
}
return 0;
}
@@ -1147,13 +1186,15 @@
struct mg_member *memb;
int rv;
- if (mg->spectator || mg->readonly) {
+ if (mg->spectator || mg->readonly || mg->our_jid == JID_INIT) {
list_for_each_entry(memb, &mg->members_gone, list) {
if (!memb->tell_gfs_to_recover)
continue;
- log_group(mg, "recover journal %d nodeid %d skip ro",
- memb->jid, memb->nodeid);
+ log_group(mg, "recover journal %d nodeid %d skip, "
+ "spect %d ro %d our_jid %d",
+ memb->jid, memb->nodeid,
+ mg->spectator, mg->readonly, mg->our_jid);
memb->tell_gfs_to_recover = 0;
memb->local_recovery_status = RS_READONLY;
}
@@ -1395,6 +1436,11 @@
strncpy(buf, mg->error_msg, MAXLINE);
error = 1;
} else {
+ if (mg->mount_client_delay) {
+ log_group(mg, "notify_mount_client delayed");
+ return;
+ }
+
if (mg->our_jid < 0)
snprintf(buf, MAXLINE, "hostdata=id=%u:first=%d",
mg->id, mg->first_mounter);
@@ -1414,7 +1460,24 @@
if (error) {
log_group(mg, "leaving due to mount error: %s", mg->error_msg);
group_leave(gh, mg->name);
- }
+ } else
+ mg->mount_client_notified = 1;
+}
+
+void ping_kernel_mount(char *table)
+{
+ struct mountgroup *mg;
+ char buf[MAXLINE];
+ char *name = strstr(table, ":") + 1;
+ int rv;
+
+ mg = find_mg(name);
+ if (!mg)
+ return;
+
+ rv = get_sysfs(mg, "id", buf, sizeof(buf));
+
+ log_group(mg, "ping_kernel_mount %d", rv);
}
/* When mounting a fs, we first join the mountgroup, then tell mount.gfs
@@ -1438,9 +1501,54 @@
}
}
+/* The processing of new mounters (send/recv options, send/recv journals,
+ notify mount.gfs) is not very integrated with the stop/start/finish
+ callbacks from libgroup. A start callback just notifies us of a new
+ mounter and the options/journals messages drive things from there.
+ Recovery for failed nodes _is_ controlled more directly by the
+ stop/start/finish callbacks. So, processing new mounters happens
+ independently of recovery and of the libgroup callbacks. One place
+ where they need to intersect, though, is in stopping/suspending
+ gfs-kernel:
+ - When we get a stop callback, we need to be certain that gfs-kernel
+ is blocked.
+ - When a mounter notifies mount.gfs to go ahead, gfs-kernel will
+ shortly begin running in an unblocked fashion as it goes through
+ the kernel mounting process.
+ Given this, we need to be sure that if gfs-kernel is supposed to be
+ blocked, we don't notify mount.gfs to go ahead and do the kernel mount
+ since that starts gfs-kernel in an unblocked state. */
+
+/* - if we're unmounting, the kernel is gone, so no problem.
+ - if we've just mounted and notified mount.gfs, then wait for kernel
+ mount and then block.
+ - if we're mounting and have not yet notified mount.gfs, then set
+ a flag that delays the notification until block is set to 0. */
+
int do_stop(struct mountgroup *mg)
{
- set_sysfs(mg, "block", 1);
+ int rv;
+
+ for (;;) {
+ rv = set_sysfs(mg, "block", 1);
+ if (!rv)
+ break;
+
+ /* if the kernel instance of gfs existed before but now
+ we can't see it, that must mean it's been unmounted,
+ so it's implicitly stopped */
+
+ if (mg->got_kernel_mount)
+ break;
+
+ if (mg->mount_client_notified)
+ wait_for_kernel_mount(mg);
+ else {
+ mg->mount_client_delay = 1;
+ break;
+ }
+ }
+
group_stop_done(gh, mg->name);
return 0;
}
@@ -1498,12 +1606,17 @@
leave_blocked = 1;
}
- if (mg->mount_client) {
- notify_mount_client(mg);
- wait_for_kernel_mount(mg);
- } else if (!leave_blocked)
+ if (!leave_blocked) {
set_sysfs(mg, "block", 0);
+ /* we may have been holding back our local mount due to
+ being stopped/blocked */
+ if (mg->mount_client_delay) {
+ mg->mount_client_delay = 0;
+ notify_mount_client(mg);
+ }
+ }
+
return 0;
}
@@ -1544,9 +1657,7 @@
struct mg_member *memb;
log_group(mg, "start_first_mounter");
-
set_our_memb_options(mg);
-
memb = find_memb_nodeid(mg, our_nodeid);
ASSERT(memb);
@@ -1561,11 +1672,12 @@
mg->our_jid = 0;
mg->first_mounter = 1;
mg->first_mounter_done = 0;
+ mg->got_our_options = 1;
+ mg->got_our_journals = 1;
hold_withdraw_locks(mg);
}
- clear_new(mg);
start_done(mg);
- mg->init = 0;
+ notify_mount_client(mg);
}
/* called for the initial start on a rw/ro mounter;
@@ -1574,25 +1686,11 @@
void start_participant_init(struct mountgroup *mg)
{
log_group(mg, "start_participant_init");
-
set_our_memb_options(mg);
send_options(mg);
hold_withdraw_locks(mg);
-
- if (mg->journals_msg) {
- _receive_journals(mg,
- mg->journals_msg,
- mg->journals_msg_len,
- mg->journals_msg_from);
- free(mg->journals_msg);
- mg->journals_msg = NULL;
-
- start_participant_init_2(mg);
- } else {
- /* will be called in receive_journals() */
- mg->start2_fn = start_participant_init_2;
- }
- mg->init = 0;
+ start_done(mg);
+ mg->start2_fn = start_participant_init_2;
}
/* called for the initial start on a rw/ro mounter after _receive_journals() */
@@ -1613,7 +1711,7 @@
/* fs needs recovery and existing mounters can't recover it,
i.e. they're spectator/readonly, so we're told to do
first-mounter recovery on the fs. */
-
+
if (first_mounter_recovery(mg)) {
log_group(mg, "first_mounter_recovery");
mg->emulate_first_mounter = 1;
@@ -1621,12 +1719,13 @@
mg->first_mounter_done = 0;
}
out:
- clear_new(mg);
- start_done(mg);
- mg->init2 = 0;
+ notify_mount_client(mg);
}
-/* called for a non-initial start on a normal mounter */
+/* called for a non-initial start on a normal mounter.
+ NB we can get here without having received a journals message for
+ our (recent) mount yet in which case we don't know the jid or ro/rw
+ status of any members, and don't know our own jid. */
void start_participant(struct mountgroup *mg, int pos, int neg)
{
@@ -1635,80 +1734,40 @@
if (pos) {
hold_withdraw_locks(mg);
- if (mg->options_msg) {
- _receive_options(mg,
- mg->options_msg,
- mg->options_msg_len,
- mg->options_msg_from);
- free(mg->options_msg);
- mg->options_msg = NULL;
+ /* If we're the first mounter, and we're adding a second
+ node here, but haven't gotten first_done (others_may_mount)
+ from gfs yet, then don't do the start_done() to complete
+ adding the second node. Set wait_first_done=1 to have
+ first_recovery_done() call start_done(). This also requires
+ that we unblock locking on the first mounter if gfs hasn't
+ done others_may_mount yet. */
+
+ if (mg->first_mounter && !mg->first_mounter_done) {
+ mg->wait_first_done = 1;
+ set_sysfs(mg, "block", 0);
+ log_group(mg, "delay start_done til others_may_mount");
+ } else
+ start_done(mg);
+
+ mg->start2_fn = NULL;
+ process_saved_options(mg);
- start_participant_2(mg);
- } else {
- /* will be called in receive_options() */
- mg->start2_fn = start_participant_2;
- }
} else if (neg) {
recover_journals(mg);
process_saved_recovery_status(mg);
}
}
-/* called for a non-initial start on a normal mounter when adding a node,
- after _receive_options(). we need to know if the new node is a spectator
- or not (from options) before deciding if it should be given a journal
- in discover_journals() */
-
-void start_participant_2(struct mountgroup *mg)
-{
- log_group(mg, "start_participant_2");
-
- discover_journals(mg);
-
- /* If we're the first mounter, and we're adding a second
- node here, but haven't gotten first_done (others_may_mount) from gfs
- yet, then don't do the start_done() to complete adding the
- second node. Set wait_first_done=1 to have first_recovery_done()
- call start_done().
- This also requires that we unblock locking on the first
- mounter if gfs hasn't done others_may_mount yet. */
-
- if (mg->init2 && mg->first_mounter && !mg->first_mounter_done) {
- mg->wait_first_done = 1;
- set_sysfs(mg, "block", 0);
- log_group(mg, "delay start_done until others_may_mount");
- } else {
- clear_new(mg);
- start_done(mg);
- }
-
- mg->init2 = 0;
-}
-
/* called for the initial start on a spectator mounter */
void start_spectator_init(struct mountgroup *mg)
{
log_group(mg, "start_spectator_init");
-
set_our_memb_options(mg);
send_options(mg);
hold_withdraw_locks(mg);
-
- if (mg->journals_msg) {
- _receive_journals(mg,
- mg->journals_msg,
- mg->journals_msg_len,
- mg->journals_msg_from);
- free(mg->journals_msg);
- mg->journals_msg = NULL;
-
- start_spectator_init_2(mg);
- } else {
- /* will be called in receive_journals() */
- mg->start2_fn = start_spectator_init_2;
- }
- mg->init = 0;
+ start_done(mg);
+ mg->start2_fn = start_spectator_init_2;
}
/* called for the initial start on a spectator mounter,
@@ -1726,9 +1785,7 @@
else
ASSERT(mg->our_jid == -1);
- clear_new(mg);
- start_done(mg);
- mg->init2 = 0;
+ notify_mount_client(mg);
}
/* called for a non-initial start on a spectator mounter */
@@ -1739,37 +1796,14 @@
if (pos) {
hold_withdraw_locks(mg);
-
- if (mg->options_msg) {
- _receive_options(mg,
- mg->options_msg,
- mg->options_msg_len,
- mg->options_msg_from);
- free(mg->options_msg);
- mg->options_msg = NULL;
-
- start_spectator_2(mg);
- } else {
- /* will be called in receive_options() */
- mg->start2_fn = start_spectator_2;
- }
+ start_done(mg);
+ process_saved_options(mg);
} else if (neg) {
recover_journals(mg);
process_saved_recovery_status(mg);
}
}
-/* called for a non-initial start on a spectator mounter when adding a
- node, after _receive_options() */
-
-void start_spectator_2(struct mountgroup *mg)
-{
- log_group(mg, "start_spectator_2");
- discover_journals(mg);
- clear_new(mg);
- start_done(mg);
-}
-
/* If nodeA fails, nodeB is recovering journalA and nodeB fails before
finishing, then nodeC needs to tell gfs to recover both journalA and
journalB. We do this by setting tell_gfs_to_recover back to 1 for
@@ -1792,7 +1826,25 @@
}
}
+/* New mounters may be waiting for a journals message that a failed node (as
+ low nodeid) would have sent. If the low nodeid failed and we're the new low
+ nodeid, then send a journals message to any nodes for whom we've not seen a
+ journals message. */
+
+void resend_journals(struct mountgroup *mg)
+{
+ struct mg_member *memb;
+
+ list_for_each_entry(memb, &mg->members, list) {
+ if (!memb->needs_journals)
+ continue;
+ log_group(mg, "resend_journals to %d", memb->nodeid);
+ send_journals(mg, memb->nodeid);
+ }
+}
+
/*
+ old method:
A is rw mount, B mounts rw
do_start do_start
@@ -1808,11 +1860,27 @@
start_participant_init_2
group_start_done
do_finish do_finish
+
+ new method: decouples stop/start/finish from mount processing
+ A is rw mount, B mounts rw
+
+ do_start do_start
+ start_participant start_participant_init
+ start_done send_options
+ start_done
+ do_finish do_finish
+
+ receive_options
+ assign_journal
+ send_journals
+ receive_journals
+ start_participant_init_2
+ notify_mount_client
*/
void do_start(struct mountgroup *mg, int type, int member_count, int *nodeids)
{
- int pos = 0, neg = 0;
+ int pos = 0, neg = 0, low;
mg->start_event_nr = mg->last_start;
mg->start_type = type;
@@ -1820,10 +1888,18 @@
log_group(mg, "start %d init %d type %d member_count %d",
mg->last_start, mg->init, type, member_count);
+ low = mg->low_finished_nodeid;
+
recover_members(mg, member_count, nodeids, &pos, &neg);
reset_unfinished_recoveries(mg);
+ if (neg && low != mg->low_finished_nodeid && low == our_nodeid) {
+ log_group(mg, "low nodeid failed old %d new %d",
+ low, mg->low_finished_nodeid);
+ resend_journals(mg);
+ }
+
if (mg->init) {
if (member_count == 1)
start_first_mounter(mg);
@@ -1831,6 +1907,7 @@
start_spectator_init(mg);
else
start_participant_init(mg);
+ mg->init = 0;
} else {
if (mg->spectator)
start_spectator(mg, pos, neg);
@@ -1839,8 +1916,7 @@
}
}
-/* FIXME:
-
+/*
What repurcussions are there from umount shutting down gfs in the
kernel before we leave the mountgroup? We can no longer participate
in recovery even though we're in the group -- what are the end cases
More information about the Cluster-devel
mailing list