[Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
jbrassow at sourceware.org
jbrassow at sourceware.org
Thu Apr 26 16:55:53 UTC 2007
CVSROOT: /cvs/cluster
Module name: cluster
Branch: RHEL45
Changes by: jbrassow at sourceware.org 2007-04-26 17:55:51
Modified files:
cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c
Log message:
Bug 238031: cluster mirrors not handling all recovery/write conflicts
The problem is that the kernel (main mirror code) does not perform any mark/clear operations when
writing to a region before that region has been recovered. So, it is not possible for the server
to detect a conflict. Basically, we must re-enable the
'is_remote_recovering' function and disallow any writes to regions that are — or
will be — recovering.
It's really going to cause some pain during writes while mirrors are re-syncing.
The better fix for the future is to have the writes always mark/clear the
regions - then we can again remove the 'is_remote_recovering' function.
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.41.2.5&r2=1.1.2.41.2.6
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.26.2.7&r2=1.1.2.26.2.8
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2007/04/24 20:10:20 1.1.2.41.2.5
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2007/04/26 16:55:51 1.1.2.41.2.6
@@ -861,11 +861,10 @@
int rtn;
struct log_c *lc = (struct log_c *) log->context;
-/* take out optimization
if(atomic_read(&lc->in_sync) == 1){
return 0;
}
-*/
+
rtn = consult_server(lc, region, LRT_IS_REMOTE_RECOVERING, NULL);
return rtn;
}
@@ -876,11 +875,11 @@
struct log_c *lc = (struct log_c *) log->context;
/* check known_regions, return if found */
-/* take out optimization
+
if(atomic_read(&lc->in_sync) == 1){
return 1;
}
-*/
+
if(!block){
return -EWOULDBLOCK;
}
@@ -1414,7 +1413,7 @@
.resume = cluster_resume,
.get_region_size = cluster_get_region_size,
.is_clean = cluster_is_clean,
-/* .is_remote_recovering = cluster_is_remote_recovering,*/
+ .is_remote_recovering = cluster_is_remote_recovering,
.in_sync = cluster_in_sync,
.flush = cluster_flush,
.mark_region = cluster_mark_region,
@@ -1436,7 +1435,7 @@
.resume = cluster_resume,
.get_region_size = cluster_get_region_size,
.is_clean = cluster_is_clean,
-/* .is_remote_recovering = cluster_is_remote_recovering,*/
+ .is_remote_recovering = cluster_is_remote_recovering,
.in_sync = cluster_in_sync,
.flush = cluster_flush,
.mark_region = cluster_mark_region,
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2007/04/24 20:10:20 1.1.2.26.2.7
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2007/04/26 16:55:51 1.1.2.26.2.8
@@ -494,12 +494,32 @@
static int server_is_remote_recovering(struct log_c *lc, struct log_request *lr)
{
+ region_t region;
struct region_user *ru;
- if ((ru = find_ru_by_region(lc, lr->u.lr_region)) &&
- (ru->ru_rw == RU_RECOVER))
+ /*
+ * This gets a bit complicated. I wish we didn't have to use this
+ * function, but because the mirror code doesn't mark regions which
+ * it writes to that are out-of-sync, we need this function.
+ *
+ * Problem is, we don't know how long the user is going to take to
+ * write to the region after they have called this function. So,
+ * we are forced at this point to deny any writes to regions we
+ * are recovering or might recover in the future.
+ *
+ * We can count on the client side to not send us one of these
+ * requests if the mirror is known to be in-sync.
+ *
+ * And yes, it sucks to take this much time off the I/O.
+ */
+ region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
+ lc->region_count, 0);
+
+ if (lr->u.lr_region >= region) {
+ DMDEBUG("Remote recovery conflict: (%Lu >= %Lu)/%s",
+ lr->u.lr_region, region, lc->uuid + (strlen(lc->uuid) - 8));
lr->u.lr_int_rtn = 1;
- else
+ } else
lr->u.lr_int_rtn = 0;
return 0;
@@ -639,24 +659,65 @@
static int server_get_resync_work(struct log_c *lc, struct log_request *lr, uint32_t who)
{
struct region_user *new;
+ int sync_search, conflict = 0;
+ region_t *region = &(lr->u.lr_region_rtn);
- new = mempool_alloc(region_user_pool, GFP_NOFS);
- if(!new){
- lr->u.lr_int_rtn = 0;
- return -ENOMEM;
+ lr->u.lr_int_rtn = 0; /* Default to no work */
+
+ if (lc->recovering_region != (uint64_t)-1) {
+ DMDEBUG("Someone is already recovering region %Lu/%s",
+ lc->recovering_region, lc->uuid + (strlen(lc->uuid) - 8));
+ return 0;
}
-
- if ((lr->u.lr_int_rtn = _core_get_resync_work(lc, &(lr->u.lr_region_rtn)))){
- new->ru_nodeid = who;
- new->ru_region = lr->u.lr_region_rtn;
- new->ru_rw = RU_RECOVER;
- list_add(&new->ru_list, &lc->region_users);
- DMDEBUG("Assigning recovery work to %u/%s: %Lu",
- who, lc->uuid + (strlen(lc->uuid) - 8), new->ru_region);
- } else {
- mempool_free(new, region_user_pool);
+
+ if (lc->sync_search >= lc->region_count) {
+ /*
+ * FIXME: pvmove is not supported yet, but when it is,
+ * an audit of sync_count changes will need to be made
+ */
+ if ((lc->sync_count < lc->region_count) && !lc->sync_pass) {
+ lc->sync_search = 0;
+ lc->sync_pass++;
+ } else {
+ return 0;
+ }
+ }
+
+ for (sync_search = lc->sync_search;
+ sync_search < lc->region_count;
+ sync_search = (*region + 1)) {
+ *region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
+ lc->region_count,
+ sync_search);
+ if (find_ru_by_region(lc, *region)) {
+ conflict = 1;
+ DMDEBUG("Recovery blocked by outstanding write on region %Lu/%s",
+ *region, lc->uuid + (strlen(lc->uuid) - 8));
+ } else {
+ break;
+ }
}
+ if (*region >= lc->region_count)
+ return 0;
+
+ new = mempool_alloc(region_user_pool, GFP_NOFS);
+ if (!new)
+ return -ENOMEM;
+
+ if (!conflict)
+ lc->sync_search = *region + 1;
+
+ lc->recovering_region = *region;
+
+ lr->u.lr_int_rtn = 1; /* Assigning work */
+ new->ru_nodeid = who;
+ new->ru_region = *region;
+ new->ru_rw = RU_RECOVER;
+ list_add(&new->ru_list, &lc->region_users);
+ DMDEBUG("Assigning recovery work to %u/%s: %Lu",
+ who, lc->uuid + (strlen(lc->uuid) - 8), new->ru_region);
+
return 0;
}
More information about the Cluster-devel
mailing list