[Cluster-devel] cluster/group/dlm_controld deadlock.c dlm_daemon.h main.c
teigland at sourceware.org
teigland at sourceware.org
Fri Aug 17 21:17:53 UTC 2007
CVSROOT: /cvs/cluster
Module name: cluster
Changes by: teigland at sourceware.org 2007-08-17 21:17:53
Modified files:
group/dlm_controld: deadlock.c dlm_daemon.h main.c
Log message:
handle addition/removal/failure of nodes during a deadlock cycle
serialize deadlock cycles and limit how often cycles are started
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/dlm_controld/deadlock.c.diff?cvsroot=cluster&r1=1.5&r2=1.6
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/dlm_controld/dlm_daemon.h.diff?cvsroot=cluster&r1=1.12&r2=1.13
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/group/dlm_controld/main.c.diff?cvsroot=cluster&r1=1.13&r2=1.14
--- cluster/group/dlm_controld/deadlock.c 2007/08/13 15:32:05 1.5
+++ cluster/group/dlm_controld/deadlock.c 2007/08/17 21:17:53 1.6
@@ -28,7 +28,8 @@
struct node {
struct list_head list;
int nodeid;
- int checkpoint_ready;
+ int checkpoint_ready; /* we've read its ckpt */
+ int in_cycle; /* participating in cycle */
};
enum {
@@ -96,8 +97,9 @@
#define DLM_HEADER_PATCH 0
#define DLM_MSG_CYCLE_START 1
-#define DLM_MSG_CHECKPOINT_READY 2
-#define DLM_MSG_CANCEL_LOCK 3
+#define DLM_MSG_CYCLE_END 2
+#define DLM_MSG_CHECKPOINT_READY 3
+#define DLM_MSG_CANCEL_LOCK 4
struct dlm_header {
uint16_t version[3];
@@ -816,6 +818,7 @@
/* unlink an old checkpoint before we create a new one */
if (ls->lock_ckpt_handle) {
+ log_error("write_checkpoint: old ckpt");
if (_unlink_checkpoint(ls, &name))
return;
}
@@ -984,6 +987,14 @@
send_message(ls, DLM_MSG_CYCLE_START);
}
+void send_cycle_end(struct lockspace *ls)
+{
+ if (!deadlock_enabled)
+ return;
+ log_group(ls, "send_cycle_end");
+ send_message(ls, DLM_MSG_CYCLE_END);
+}
+
static void send_cancel_lock(struct lockspace *ls, struct trans *tr,
struct dlm_lkb *lkb)
{
@@ -1053,22 +1064,18 @@
static void find_deadlock(struct lockspace *ls);
-static void receive_checkpoint_ready(struct lockspace *ls, int nodeid)
+static void run_deadlock(struct lockspace *ls)
{
struct node *node;
int not_ready = 0;
int low = -1;
- log_group(ls, "receive_checkpoint_ready from %d", nodeid);
-
- read_checkpoint(ls, nodeid);
-
- /* when locks are read from all nodes, then search_deadlock()
- to do detection */
+ if (ls->all_checkpoints_ready)
+ log_group(ls, "WARNING: run_deadlock all_checkpoints_ready");
list_for_each_entry(node, &ls->nodes, list) {
- if (node->nodeid == nodeid)
- node->checkpoint_ready = 1;
+ if (!node->in_cycle)
+ continue;
if (!node->checkpoint_ready)
not_ready++;
@@ -1078,23 +1085,56 @@
if (not_ready)
return;
+ ls->all_checkpoints_ready = 1;
+
list_for_each_entry(node, &ls->nodes, list) {
+ if (!node->in_cycle)
+ continue;
if (node->nodeid < low || low == -1)
low = node->nodeid;
}
ls->low_nodeid = low;
- log_group(ls, "low nodeid in charge of resolution is %d", low);
- find_deadlock(ls);
+ if (low == our_nodeid)
+ find_deadlock(ls);
+ else
+ log_group(ls, "defer resolution to low nodeid %d", low);
+}
+
+static void receive_checkpoint_ready(struct lockspace *ls, int nodeid)
+{
+ struct node *node;
+
+ log_group(ls, "receive_checkpoint_ready from %d", nodeid);
+
+ read_checkpoint(ls, nodeid);
+
+ list_for_each_entry(node, &ls->nodes, list) {
+ if (node->nodeid == nodeid) {
+ node->checkpoint_ready = 1;
+ break;
+ }
+ }
+
+ run_deadlock(ls);
}
static void receive_cycle_start(struct lockspace *ls, int nodeid)
{
+ struct node *node;
int rv;
- log_group(ls, "receive_cycle_start %d", nodeid);
+ log_group(ls, "receive_cycle_start from %d", nodeid);
+
+ if (ls->cycle_running) {
+ log_group(ls, "cycle already running");
+ return;
+ }
+ ls->cycle_running = 1;
+ gettimeofday(&ls->cycle_start_time, NULL);
- gettimeofday(&ls->last_deadlock_check, NULL);
+ list_for_each_entry(node, &ls->nodes, list)
+ node->in_cycle = 1;
rv = read_debugfs_locks(ls);
if (rv < 0) {
@@ -1111,6 +1151,46 @@
send_checkpoint_ready(ls);
}
+static uint64_t dt_usec(struct timeval *start, struct timeval *stop)
+{
+ uint64_t dt;
+
+ dt = stop->tv_sec - start->tv_sec;
+ dt *= 1000000;
+ dt += stop->tv_usec - start->tv_usec;
+ return dt;
+}
+
+/* TODO: nodes added during a cycle - what will they do with messages
+ they recv from other nodes running the cycle? */
+
+static void receive_cycle_end(struct lockspace *ls, int nodeid)
+{
+ struct node *node;
+ uint64_t usec;
+
+ if (!ls->cycle_running) {
+ log_error("receive_cycle_end %s from %d: no cycle running",
+ ls->name, nodeid);
+ return;
+ }
+
+ gettimeofday(&ls->cycle_end_time, NULL);
+ usec = dt_usec(&ls->cycle_start_time, &ls->cycle_end_time);
+ log_group(ls, "receive_cycle_end: from %d cycle time %.2f s",
+ nodeid, usec * 1.e-6);
+
+ ls->cycle_running = 0;
+ ls->all_checkpoints_ready = 0;
+
+ list_for_each_entry(node, &ls->nodes, list)
+ node->checkpoint_ready = 0;
+
+ free_resources(ls);
+ free_transactions(ls);
+ unlink_checkpoint(ls);
+}
+
static void receive_cancel_lock(struct lockspace *ls, int nodeid, uint32_t lkid)
{
dlm_lshandle_t h;
@@ -1167,6 +1247,9 @@
case DLM_MSG_CYCLE_START:
receive_cycle_start(ls, hd->nodeid);
break;
+ case DLM_MSG_CYCLE_END:
+ receive_cycle_end(ls, hd->nodeid);
+ break;
case DLM_MSG_CHECKPOINT_READY:
receive_checkpoint_ready(ls, hd->nodeid);
break;
@@ -1203,14 +1286,14 @@
if (node->nodeid != nodeid)
continue;
- /* TODO: purge locks from this node if we're in a cycle */
-
list_del(&node->list);
free(node);
log_group(ls, "node %d left deadlock cpg", nodeid);
}
}
+static void purge_locks(struct lockspace *ls, int nodeid);
+
static void confchg_cb(cpg_handle_t handle, struct cpg_name *group_name,
struct cpg_address *member_list, int member_list_entries,
struct cpg_address *left_list, int left_list_entries,
@@ -1230,11 +1313,36 @@
return;
}
+ /* nodes added during a cycle won't have node->in_cycle set so they
+ won't be included in any of the cycle processing */
+
for (i = 0; i < joined_list_entries; i++)
node_joined(ls, joined_list[i].nodeid);
for (i = 0; i < left_list_entries; i++)
node_left(ls, left_list[i].nodeid, left_list[i].reason);
+
+ if (!ls->cycle_running)
+ return;
+
+ if (!left_list_entries)
+ return;
+
+ if (!ls->all_checkpoints_ready) {
+ run_deadlock(ls);
+ return;
+ }
+
+ for (i = 0; i < left_list_entries; i++)
+ purge_locks(ls, left_list[i].nodeid);
+
+ for (i = 0; i < left_list_entries; i++) {
+ if (left_list[i].nodeid != ls->low_nodeid)
+ continue;
+ /* this will set a new low node which will call find_deadlock */
+ run_deadlock(ls);
+ break;
+ }
}
static void process_deadlock_cpg(int ci)
@@ -1343,6 +1451,29 @@
client_dead(ls->cpg_ci);
}
+/* would we ever call this after we've created the transaction lists?
+ I don't think so; I think it can only be called between reading
+ checkpoints */
+
+static void purge_locks(struct lockspace *ls, int nodeid)
+{
+ struct dlm_rsb *r;
+ struct dlm_lkb *lkb, *safe;
+
+ list_for_each_entry(r, &ls->resources, list) {
+ list_for_each_entry_safe(lkb, safe, &r->locks, list) {
+ if (lkb->home == nodeid) {
+ list_del(&lkb->list);
+ if (list_empty(&lkb->trans_list))
+ free(lkb);
+ else
+ log_group(ls, "purge %d %x on trans",
+ nodeid, lkb->lock.id);
+ }
+ }
+ }
+}
+
static void add_lkb_trans(struct trans *tr, struct dlm_lkb *lkb)
{
list_add(&lkb->trans_list, &tr->locks);
@@ -1543,7 +1674,7 @@
}
if (remove_tr->others_waiting_on_us)
- log_debug("trans %llx removed others waiting %d",
+ log_group(ls, "trans %llx removed others waiting %d",
(unsigned long long)remove_tr->xid,
remove_tr->others_waiting_on_us);
}
@@ -1675,16 +1806,14 @@
static void find_deadlock(struct lockspace *ls)
{
- struct node *node;
-
if (list_empty(&ls->resources)) {
log_group(ls, "no deadlock: no resources");
- return;
+ goto out;
}
if (!list_empty(&ls->transactions)) {
log_group(ls, "transactions list should be empty");
- return;
+ goto out;
}
dump_resources(ls);
@@ -1701,13 +1830,6 @@
log_group(ls, "found deadlock");
dump_all_trans(ls);
- /* TODO: should probably do this above instead */
- if (ls->low_nodeid != our_nodeid) {
- log_group(ls, "defer resolution to low nodeid %d",
- ls->low_nodeid);
- goto out;
- }
-
cancel_trans(ls);
reduce_waitfor_graph_loop(ls);
@@ -1718,12 +1840,7 @@
log_error("deadlock resolution failed");
dump_all_trans(ls);
-
out:
- free_resources(ls);
- free_transactions(ls);
-
- list_for_each_entry(node, &ls->nodes, list)
- node->checkpoint_ready = 0;
+ send_cycle_end(ls);
}
--- cluster/group/dlm_controld/dlm_daemon.h 2007/08/10 20:23:07 1.12
+++ cluster/group/dlm_controld/dlm_daemon.h 2007/08/17 21:17:53 1.13
@@ -94,9 +94,12 @@
struct list_head transactions;
struct list_head resources;
struct list_head nodes;
- struct timeval last_deadlock_check;
- unsigned int timewarn_count;
+ struct timeval cycle_start_time;
+ struct timeval cycle_end_time;
+ struct timeval last_send_cycle_start;
int got_first_confchg;
+ int cycle_running;
+ int all_checkpoints_ready;
};
/* action.c */
--- cluster/group/dlm_controld/main.c 2007/08/06 21:50:26 1.13
+++ cluster/group/dlm_controld/main.c 2007/08/17 21:17:53 1.14
@@ -446,28 +446,55 @@
{
struct lockspace *ls;
struct timeval now;
+ unsigned int sec;
ls = find_ls_id(data->lockspace_id);
if (!ls)
return;
- log_group(ls, "timewarn: lkid %x pid %d count %d",
- data->id, data->ownpid, ls->timewarn_count);
+ data->resource_name[data->resource_namelen] = '\0';
+
+ log_group(ls, "timewarn: lkid %x pid %d name %s",
+ data->id, data->ownpid, data->resource_name);
+
+ /* Problem: we don't want to get a timewarn, assume it's resolved
+ by the current cycle, but in fact it's from a deadlock that
+ formed after the checkpoints for the current cycle. Then we'd
+ have to hope for another warning (that may not come) to trigger
+ a new cycle to catch the deadlock. If our last cycle ckpt
+ was say N (~5?) sec before we receive the timewarn, then we
+ can be confident that the cycle included the lock in question.
+ Otherwise, we're not sure if the warning is for a new deadlock
+ that's formed since our last cycle ckpt (unless it's a long
+ enough time since the last cycle that we're confident it *is*
+ a new deadlock). When there is a deadlock, I suspect it will
+ be common to receive warnings before, during, and possibly
+ after the cycle that resolves it. Wonder if we should record
+ timewarns and match them with deadlock cycles so we can tell
+ which timewarns are addressed by a given cycle and which aren't. */
+
gettimeofday(&now, NULL);
- if (now.tv_sec - ls->last_deadlock_check.tv_sec > DEADLOCK_CHECK_SECS) {
- ls->timewarn_count = 0;
- send_cycle_start(ls);
- } else {
- /* TODO: set a poll timeout and start another cycle after
- DEADLOCK_CHECK_SECS. Want to save a record of all the
- warned locks to see if they're still blocked later before
- starting a cycle? This would only be helpful if we
- experienced regular false-warnings, indicating that the
- timewarn setting should be larger. */
- ls->timewarn_count++;
+ /* don't send a new start until at least SECS after the last
+ we sent, and at least SECS after the last completed cycle */
+
+ sec = now.tv_sec - ls->last_send_cycle_start.tv_sec;
+
+ if (sec < DEADLOCK_CHECK_SECS) {
+ log_group(ls, "skip send: recent send cycle %d sec", sec);
+ return;
+ }
+
+ sec = now.tv_sec - ls->cycle_end_time.tv_sec;
+
+ if (sec < DEADLOCK_CHECK_SECS) {
+ log_group(ls, "skip send: recent cycle end %d sec", sec);
+ return;
}
+
+ gettimeofday(&ls->last_send_cycle_start, NULL);
+ send_cycle_start(ls);
}
static void process_netlink(int ci)
More information about the Cluster-devel
mailing list