[Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
jbrassow at sourceware.org
jbrassow at sourceware.org
Tue Apr 24 20:10:22 UTC 2007
CVSROOT: /cvs/cluster
Module name: cluster
Branch: RHEL45
Changes by: jbrassow at sourceware.org 2007-04-24 21:10:20
Modified files:
cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c
dm-cmirror-server.h dm-cmirror-xfr.h
Log message:
Bug 199433: NULL pointer dereference in cman:process_messages for cmirro...
- While this isn't a complete fix for 199433, the memory leak it
removes is most likely the cause of the error: cluster mirrors were
steadily leaking memory every time they were deactivated.
Bug 237028: cmirror recovery deadlock due to machine failure + primary l...
- If there is outstanding resync work remaining when the server gets
notice to suspend, delay for a moment to wait for it.
Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.41.2.4&r2=1.1.2.41.2.5
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.26.2.6&r2=1.1.2.26.2.7
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.h.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.2.8.1&r2=1.1.2.2.8.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.2.2.2&r2=1.1.2.2.2.3
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2007/04/10 07:13:15 1.1.2.41.2.4
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c 2007/04/24 20:10:20 1.1.2.41.2.5
@@ -379,7 +379,8 @@
if(len <= 0){
/* ATTENTION -- what do we do with this ? */
- DMWARN("Error while listening for server response: %d", len);
+ DMWARN("Error listening for server(%u) response for %s: %d",
+ lc->server_id, lc->uuid + (strlen(lc->uuid) - 8), len);
error = len;
*retry = 1;
seq++;
@@ -767,6 +768,7 @@
static int cluster_postsuspend(struct dirty_log *log)
{
+ int i;
struct region_state *rs, *tmp_rs;
struct log_c *lc = (struct log_c *) log->context;
@@ -788,10 +790,20 @@
spin_unlock(&lc->state_lock);
+ if(lc->server_id == my_id) {
+ for (i = 0; server_busy(lc) && (i < 10); i++) {
+ DMDEBUG("Server for %s still busy, waiting for others",
+ lc->uuid + (strlen(lc->uuid) - 8));
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(HZ*2);
+ }
+ }
+
atomic_set(&lc->suspended, 1);
if(lc->server_id == my_id) {
while (1) {
- DMDEBUG("Telling everyone I'm suspending");
+ DMDEBUG("Telling everyone I'm suspending (%s)",
+ lc->uuid + (strlen(lc->uuid) - 8));
consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
down(&consult_server_lock);
@@ -799,13 +811,15 @@
up(&consult_server_lock);
if ((my_id && (lc->server_id == my_id))) {
- DMDEBUG("Delaying suspend, work to be done.");
+ DMDEBUG("Delaying suspend, work to be done (%s)",
+ lc->uuid + (strlen(lc->uuid) - 8));
atomic_set(&lc->suspended, 0);
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(HZ*2);
atomic_set(&lc->suspended, 1);
} else {
- DMDEBUG("Suspending now");
+ DMDEBUG("Suspending now (%s)",
+ lc->uuid + (strlen(lc->uuid) - 8));
break;
}
}
@@ -1196,6 +1210,16 @@
switch(status){
case STATUSTYPE_INFO:
+ DMDEBUG("LOG INFO:");
+ DMDEBUG(" uuid: %s", lc->uuid);
+ DMDEBUG(" uuid_ref : %d", lc->uuid_ref);
+ DMDEBUG(" ?region_count: %Lu", lc->region_count);
+ DMDEBUG(" ?sync_count : %Lu", lc->sync_count);
+ DMDEBUG(" ?sync_search : %d", lc->sync_search);
+ DMDEBUG(" in_sync : %s", (atomic_read(&lc->in_sync)) ? "YES" : "NO");
+ DMDEBUG(" server_id : %u", lc->server_id);
+ DMDEBUG(" server_valid: %s",
+ ((lc->server_id != 0xDEAD) && lc->server_valid) ? "YES" : "NO");
if(lc->sync != DEFAULTSYNC)
arg_count++;
@@ -1254,11 +1278,6 @@
}
up(&log_list_lock);
- /*
- if (likely(!shutting_down))
- suspend_server();
- */
-
return 0;
}
@@ -1311,9 +1330,7 @@
BUG();
break;
}
- /*
- resume_server();
- */
+
return 0;
}
@@ -1452,6 +1469,7 @@
r = dm_register_dirty_log_type(&_clustered_core_type);
if (r) {
DMWARN("couldn't register clustered_core dirty log type");
+ mempool_destroy(region_state_pool);
return r;
}
@@ -1459,6 +1477,7 @@
if (r) {
DMWARN("couldn't register clustered_disk dirty log type");
dm_unregister_dirty_log_type(&_clustered_core_type);
+ mempool_destroy(region_state_pool);
return r;
}
@@ -1475,6 +1494,7 @@
}
dm_unregister_dirty_log_type(&_clustered_core_type);
dm_unregister_dirty_log_type(&_clustered_disk_type);
+ mempool_destroy(region_state_pool);
DMINFO("dm-cmirror %s (built %s %s) removed",
CMIRROR_RELEASE_NAME, __DATE__, __TIME__);
}
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2007/04/10 18:10:42 1.1.2.26.2.6
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c 2007/04/24 20:10:20 1.1.2.26.2.7
@@ -42,8 +42,6 @@
static atomic_t server_run;
static struct completion server_completion;
-static wait_queue_head_t _suspend_queue;
-static atomic_t _suspend;
static atomic_t _do_requests;
static int debug_disk_write = 0;
@@ -706,8 +704,8 @@
}
if (!ru) {
- DMERR("Unable to find region to be marked out-of-sync: %Lu/%s/%u",
- lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8), who);
+ DMDEBUG("Unable to find region to be marked out-of-sync: %Lu/%s/%u",
+ lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8), who);
/*
* This is a valid case, when the following happens:
* 1) a region is recovering and has waiting writes
@@ -798,6 +796,12 @@
uint32_t lowest, next;
uint32_t node_count=global_count, *nodeids=global_nodeids;
+ DMDEBUG("%s(%d): (%s)", RQ_STRING(lr->lr_type), lr->lr_type,
+ (lc) ? lc->uuid + (strlen(lc->uuid) - 8) : "none");
+ DMDEBUG(" starter : %u", lr->u.lr_starter);
+ DMDEBUG(" co-ordinator: %u", lr->u.lr_coordinator);
+ DMDEBUG(" node_count : %d", lr->u.lr_node_count);
+
/* Record the starter's port number so we can get back to him */
if((lr->u.lr_starter == my_id) && (!lr->u.lr_node_count)){
lr->u.lr_starter_port = saddr->sin_port;
@@ -1175,12 +1179,12 @@
complete(&server_completion);
+ DMDEBUG("cluster_log_serverd ready for work");
for(;;){
if(!atomic_read(&server_run)){
break;
}
- suspend_on(&_suspend_queue, atomic_read(&_suspend));
switch(atomic_read(&restart_event_type)){
case SERVICE_NODE_LEAVE:
/* ATTENTION -- may wish to check if regions **
@@ -1206,6 +1210,9 @@
up(&log_list_lock);
break;
+ case SERVICE_NODE_JOIN:
+ DMDEBUG("Node joining");
+ break;
default:
/* Someone has joined, or there is no event */
break;
@@ -1227,6 +1234,7 @@
schedule();
}
+ DMDEBUG("Closing socket on server side");
sock_release(sock);
complete(&server_completion);
return 0;
@@ -1244,8 +1252,6 @@
void print_server_status(struct log_c *lc){
int i;
- atomic_set(&_suspend, 1);
-
DMINFO("SERVER OUTPUT::");
DMINFO(" Live nodes :: %d", global_count);
@@ -1267,11 +1273,18 @@
i = print_zero_bits((unsigned char *)lc->sync_bits, 0, lc->bitset_uint32_count);
DMINFO(" Total = %d", i);
- atomic_set(&_suspend, 0);
- wake_up_all(&_suspend_queue);
}
*/
+int server_busy(struct log_c *lc)
+{
+ if (!list_empty(&lc->region_users) ||
+ (lc->recovering_region != (uint64_t)-1))
+ return 1;
+ else
+ return 0;
+}
+
int server_free_region_users(struct log_c *lc)
{
int i = 0;
@@ -1287,18 +1300,6 @@
return 0;
}
-
-int suspend_server(void){
- atomic_set(&_suspend, 1);
- return 0;
-}
-
-int resume_server(void){
- atomic_set(&_suspend, 0);
- wake_up_all(&_suspend_queue);
- return 0;
-}
-
int resume_server_requests(void) {
atomic_set(&_do_requests, 1);
return 0;
@@ -1307,6 +1308,7 @@
int start_server(void /* log_devices ? */){
int error;
+ DMDEBUG("start_server called");
region_user_pool = mempool_create(1000, region_user_alloc,
region_user_free, NULL);
if(!region_user_pool){
@@ -1314,20 +1316,20 @@
return -ENOMEM;
}
- init_waitqueue_head(&_suspend_queue);
-
atomic_set(&_do_requests, 0);
atomic_set(&server_run, 1);
init_completion(&server_completion);
error = kernel_thread(cluster_log_serverd, NULL, 0);
if(error < 0){
+ mempool_destroy(region_user_pool);
DMWARN("failed to start kernel thread.");
return error;
}
wait_for_completion(&server_completion);
if(!atomic_read(&server_run)){
+ mempool_destroy(region_user_pool);
DMWARN("Cluster mirror log server thread failed to start");
return -1;
}
@@ -1337,9 +1339,17 @@
void stop_server(void){
+ DMDEBUG("stop_server called");
atomic_set(&server_run, 0);
wait_for_completion(&server_completion);
+ down(&log_list_lock);
+ if (!list_empty(&log_list_head)) {
+ DMERR("Log elements remain at cluster log server shutdown");
+ }
+ up(&log_list_lock);
+ mempool_destroy(region_user_pool);
+
dm_io_put(32);
}
/*
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h 2007/04/10 07:13:15 1.1.2.2.8.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h 2007/04/24 20:10:20 1.1.2.2.8.2
@@ -7,7 +7,7 @@
#ifndef __DM_CMIRROR_SERVER_H__
#define __DM_CMIRROR_SERVER_H__
-int suspend_server(void);
+int server_busy(struct log_c *lc);
int resume_server(void);
int resume_server_requests(void);
int start_server(void);
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h 2007/04/03 18:23:01 1.1.2.2.2.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h 2007/04/24 20:10:20 1.1.2.2.2.3
@@ -30,14 +30,15 @@
((x) == LRT_IS_CLEAN) ? "LRT_IS_CLEAN": \
((x) == LRT_IN_SYNC) ? "LRT_IN_SYNC": \
((x) == LRT_MARK_REGION) ? "LRT_MARK_REGION": \
+ ((x) == LRT_CLEAR_REGION) ? "LRT_CLEAR_REGION": \
((x) == LRT_FLUSH) ? "LRT_FLUSH": \
((x) == LRT_GET_RESYNC_WORK) ? "LRT_GET_RESYNC_WORK": \
- ((x) == LRT_GET_SYNC_COUNT) ? "LRT_GET_SYNC_COUNT": \
- ((x) == LRT_CLEAR_REGION) ? "LRT_CLEAR_REGION": \
((x) == LRT_COMPLETE_RESYNC_WORK) ? "LRT_COMPLETE_RESYNC_WORK": \
- ((x) == LRT_MASTER_LEAVING) ? "LRT_MASTER_LEAVING": \
+ ((x) == LRT_GET_SYNC_COUNT) ? "LRT_GET_SYNC_COUNT": \
((x) == LRT_ELECTION) ? "LRT_ELECTION": \
- ((x) == LRT_SELECTION) ? "LRT_SELECTION": "UNKNOWN"
+ ((x) == LRT_SELECTION) ? "LRT_SELECTION": \
+ ((x) == LRT_MASTER_ASSIGN) ? "LRT_MASTER_ASSIGN": \
+ ((x) == LRT_MASTER_LEAVING) ? "LRT_MASTER_LEAVING" : "UNKNOWN"
struct log_request {
int lr_type;
More information about the Cluster-devel
mailing list