[Fedora-directory-commits] ldapserver/ldap/servers/plugins/replication repl5_inc_protocol.c, 1.14, 1.15

Richard Allen Megginson rmeggins at fedoraproject.org
Fri Mar 6 20:02:16 UTC 2009


Author: rmeggins

Update of /cvs/dirsec/ldapserver/ldap/servers/plugins/replication
In directory cvs1.fedora.phx.redhat.com:/tmp/cvs-serv21859/ldapserver/ldap/servers/plugins/replication

Modified Files:
	repl5_inc_protocol.c 
Log Message:
Resolves: bug 488866
Bug Description: crash in reliab15 test
Reviewed by: nhosoi (Thanks!)
Fix Description: I could not reproduce the crash, but I think the problem is that the server is not handling the disconnection case correctly.  It seems that in the event of disconnection (LDAP_SERVER_DOWN 81 - Can't contact server) the code would continue to read results.
repl5_inc_result_threadmain() will call conn_read_result_ex() in a loop.  If conn_read_result_ex() detects a disconnection or an unrecoverable error, it will call conn_disconnect to close the connection, and return CONN_NOT_CONNECTED.  Once this happens, the code must not use conn->ld any more.  However, the code did not differentiate between the not connected case and other errors, so it would keep trying to read results (in case some errors are recoverable, the thread still has to read all of the pending results).  The code has been fixed to handle disconnect cases specially.  I also added some additional locking to make sure the result and the abort flags were set/read correctly.  Finally, I changed the code that waits for results to come in, so that if the connection has been closed, it will just return immediately.
Platforms tested: RHEL5 x86_64
Flag Day: no
Doc impact: no



Index: repl5_inc_protocol.c
===================================================================
RCS file: /cvs/dirsec/ldapserver/ldap/servers/plugins/replication/repl5_inc_protocol.c,v
retrieving revision 1.14
retrieving revision 1.15
diff -u -r1.14 -r1.15
--- repl5_inc_protocol.c	5 Dec 2008 22:41:51 -0000	1.14
+++ repl5_inc_protocol.c	6 Mar 2009 20:02:13 -0000	1.15
@@ -329,6 +329,7 @@
 		}
 		if (conres != CONN_TIMEOUT)
 		{
+			int return_value;
 			int should_finish = 0;
 			if (message_id) 
 			{
@@ -347,17 +348,26 @@
 
 			conn_get_error_ex(conn, &operation_code, &connection_error, &ldap_error_string);
 			slapi_log_error(SLAPI_LOG_REPL, NULL, "repl5_inc_result_threadmain: result %d, %d, %d, %d, %s\n", operation_code,connection_error,conres,message_id,ldap_error_string);
-			rd->result = repl5_inc_update_from_op_result(rd->prp, conres, connection_error, csn_str, uniqueid, replica_id, &should_finish, &(rd->num_changes_sent));
-			if (rd->result || should_finish)
+			return_value = repl5_inc_update_from_op_result(rd->prp, conres, connection_error, csn_str, uniqueid, replica_id, &should_finish, &(rd->num_changes_sent));
+			if (return_value || should_finish)
 			{
-				slapi_log_error(SLAPI_LOG_REPL, NULL, "repl5_inc_result_threadmain: got op result %d should finish %d\n", rd->result, should_finish);
+				slapi_log_error(SLAPI_LOG_REPL, NULL, "repl5_inc_result_threadmain: got op result %d should finish %d\n", return_value, should_finish);
 				/* If so then we need to take steps to abort the update process */
 				PR_Lock(rd->lock);
+				rd->result = return_value;
 				rd->abort = 1;
 				PR_Unlock(rd->lock);
 				/* We also need to log the error, including details stored from when the operation was sent */
 				/* we cannot finish yet - we still need to waitfor the pending results, then
 				   the main repl code will shut down this thread */
+				/* we can finish if we have disconnected - in that case, there will be nothing
+				   to read */
+				if (return_value == UPDATE_CONNECTION_LOST) {
+					finished = 1;
+				}
+			} else {
+				/* old semantics had result set outside of lock */
+				rd->result = return_value;
 			}
 		}
 		/* Should we stop ? */
@@ -470,13 +480,17 @@
 			/* If so then we're done */
 			done = 1;
 		}
+		if (rd->abort && (rd->result == UPDATE_CONNECTION_LOST))
+		{
+			done = 1; /* no connection == no more results */
+		}
 		PR_Unlock(rd->lock);
 		/* If not then sleep a bit */
 		DS_Sleep(PR_SecondsToInterval(1));
 		loops++;
 		/* If we sleep forever then we can conclude that something bad happened, and bail... */
 		/* Arbitrary 30 second delay : basically we should only expect to wait as long as it takes to process a few operations, which should be on the order of a second at most */
-		if (loops > 300) 
+		if (!done && (loops > 300))
 		{
 			/* Log a warning */
 			slapi_log_error(SLAPI_LOG_FATAL, NULL,
@@ -1551,7 +1565,7 @@
 					{
 						/* We lost the connection - enter backoff state */
 
-						return_value = UPDATE_TRANSIENT_ERROR;
+						return_value = UPDATE_CONNECTION_LOST;
 						*finished = 1;
 						slapi_log_error(SLAPI_LOG_FATAL, repl_plugin_name,
 							"%s: Consumer failed to replay change (uniqueid %s, CSN %s): "
@@ -1794,7 +1808,7 @@
 					{
 						/* We lost the connection - enter backoff state */
 
-						return_value = UPDATE_TRANSIENT_ERROR;
+						return_value = UPDATE_CONNECTION_LOST;
 						finished = 1;
 						slapi_log_error(SLAPI_LOG_FATAL, repl_plugin_name,
 							"%s: Consumer failed to replay change (uniqueid %s, CSN %s): "
@@ -1914,19 +1928,24 @@
 				return_value = UPDATE_YIELD;
 				finished = 1;
 			}
+			PR_Lock(rd->lock);
 			/* See if the result thread has hit a problem */
 			if (!finished && rd->abort)
 			{
 				return_value = rd->result;
 				finished = 1;
 			}
+			PR_Unlock(rd->lock);
 		} while (!finished);
 
 		/* Terminate the results reading thread */
 		if (!prp->repl50consumer) 
 		{
 			/* We need to ensure that we wait until all the responses have been recived from our operations */
-			repl5_inc_waitfor_async_results(rd);
+			if (return_value != UPDATE_CONNECTION_LOST) {
+				/* if connection was lost/closed, there will be nothing to read */
+				repl5_inc_waitfor_async_results(rd);
+			}
 
 			rc = repl5_inc_destroy_async_result_thread(rd);
 			if (rc) {




More information about the Fedora-directory-commits mailing list