[Fedora-directory-commits] ldapserver/ldap/servers/plugins/replication repl5_connection.c, 1.16, 1.17 repl5_tot_protocol.c, 1.10, 1.11

Richard Allen Megginson rmeggins at fedoraproject.org
Thu Mar 12 02:16:45 UTC 2009


Author: rmeggins

Update of /cvs/dirsec/ldapserver/ldap/servers/plugins/replication
In directory cvs1.fedora.phx.redhat.com:/tmp/cvs-serv27916/ldapserver/ldap/servers/plugins/replication

Modified Files:
	repl5_connection.c repl5_tot_protocol.c 
Log Message:
Resolves: bug 488866
Bug Description: crash in reliab15 test
Reviewed by: nkinder (Thanks!)
Fix Description: There was still a small window of time during which the connection could be closed out from under the other thread which was sending/reading result.  The solution is to use explicit locking using the conn->lock to protect access to the conn->ld.  Since this also affected the total update code, I tested it under similar conditions, and found that it exhibited the same behavior.  I added checking to the total update code to check for disconnection and coordinate access in the entry sending/result reading threads.
I also fixed a spurious error message about the sasl path.
Platforms tested: RHEL5
Flag Day: no
Doc impact: no



Index: repl5_connection.c
===================================================================
RCS file: /cvs/dirsec/ldapserver/ldap/servers/plugins/replication/repl5_connection.c,v
retrieving revision 1.16
retrieving revision 1.17
diff -u -r1.16 -r1.17
--- repl5_connection.c	11 Mar 2009 13:41:13 -0000	1.16
+++ repl5_connection.c	12 Mar 2009 02:16:42 -0000	1.17
@@ -306,12 +306,20 @@
 
 			while (1) 
 			{
-				if (!conn_connected(conn)) {
+				/* we have to make sure the update sending thread does not
+				   attempt to call conn_disconnect while we are reading
+				   results - so lock the conn while we get the results */
+				PR_Lock(conn->lock);
+				if ((STATE_CONNECTED != conn->state) || !conn->ld) {
 					rc = -1;
 					return_value = CONN_NOT_CONNECTED;
+					PR_Unlock(conn->lock);
 					break;
 				}
+
 				rc = ldap_result(conn->ld, LDAP_RES_ANY , 1, &local_timeout, &res);
+				PR_Unlock(conn->lock);
+
 				if (0 != rc)
 				{
 					/* Something other than a timeout happened */
@@ -342,6 +350,18 @@
 
 			repl5_stop_debug_timeout(eqctx, &setlevel);
 
+			PR_Lock(conn->lock);
+			/* we have to check again since the connection may have
+			   been closed in the meantime
+			   acquire the lock for the rest of the function
+			   to protect against another attempt to close
+			   the conn while we are using it
+			*/
+			if ((STATE_CONNECTED != conn->state) || !conn->ld) {
+				rc = -1;
+				return_value = CONN_NOT_CONNECTED;
+			}
+
 			if (0 == rc)
 			{
 				/* Timeout */
@@ -370,7 +390,7 @@
 				   later */
 				if (IS_DISCONNECT_ERROR(rc))
 				{
-					conn_disconnect(conn);
+					close_connection_internal(conn); /* we already have the lock */
 					return_value = CONN_NOT_CONNECTED;
 				}
 				else
@@ -397,13 +417,13 @@
 				if (IS_DISCONNECT_ERROR(rc))
 				{
 					conn->last_ldap_error = rc;
-					conn_disconnect(conn);
+					close_connection_internal(conn); /* we already have the lock */
 					return_value = CONN_NOT_CONNECTED;
 				}
 				else if (IS_DISCONNECT_ERROR(err))
 				{
 					conn->last_ldap_error = err;
-					conn_disconnect(conn);
+					close_connection_internal(conn); /* we already have the lock */
 					return_value = CONN_NOT_CONNECTED;
 				}
 				/* Got a result */
@@ -450,6 +470,7 @@
 				conn->status = STATUS_CONNECTED;
 			}
 			if (res) ldap_msgfree(res);
+			PR_Unlock(conn->lock); /* release the conn lock */
 			return return_value;
 }
 
@@ -565,7 +586,10 @@
 	server_controls[1] = update_control;
 	server_controls[2] = NULL;
 
-	if (conn_connected(conn))
+	/* lock the conn to prevent the result reader thread
+	   from closing the connection out from under us */
+	PR_Lock(conn->lock);
+	if (STATE_CONNECTED == conn->state)
 	{
 		int setlevel = 0;
 
@@ -574,6 +598,7 @@
 		return_value = see_if_write_available(
 			conn, PR_SecondsToInterval(conn->timeout.tv_sec));
 		if (return_value != CONN_OPERATION_SUCCESS) {
+			PR_Unlock(conn->lock);
 			return return_value;
 		}
 		conn->last_operation = optype;
@@ -627,7 +652,7 @@
 			conn->last_ldap_error = rc;
 			if (IS_DISCONNECT_ERROR(rc))
 			{
-				conn_disconnect(conn);
+				close_connection_internal(conn); /* already have the lock */
 				return_value = CONN_NOT_CONNECTED;
 			}
 			else
@@ -640,11 +665,12 @@
 	else
 	{
 		/* conn->last_ldap_error has been set to a more specific value
-		 * in conn_connected()
+		 * in the thread that did the disconnection
 		 * conn->last_ldap_error = LDAP_SERVER_DOWN;
 		 */
 		return_value = CONN_NOT_CONNECTED;
 	}
+	PR_Unlock(conn->lock); /* release the lock */
 	if (message_id)
 	{
 		*message_id = msgid;
@@ -1073,6 +1099,13 @@
 static void
 close_connection_internal(Repl_Connection *conn)
 {
+	conn->state = STATE_DISCONNECTED;
+	conn->status = STATUS_DISCONNECTED;
+	conn->supports_ds50_repl = -1;
+	conn->supports_ds71_repl = -1;
+	/* do this last, to minimize the chance that another thread
+	   might read conn->state as not disconnected and attempt
+	   to use conn->ld */
 	if (NULL != conn->ld)
 	{
 		/* Since we call slapi_ldap_init, 
@@ -1080,10 +1113,6 @@
 		slapi_ldap_unbind(conn->ld);
 	}
 	conn->ld = NULL;
-	conn->state = STATE_DISCONNECTED;
-	conn->status = STATUS_DISCONNECTED;
-	conn->supports_ds50_repl = -1;
-	conn->supports_ds71_repl = -1;
 	slapi_log_error(SLAPI_LOG_REPL, repl_plugin_name,
 		"%s: Disconnected from the consumer\n", agmt_get_long_name(conn->agmt));
 }


Index: repl5_tot_protocol.c
===================================================================
RCS file: /cvs/dirsec/ldapserver/ldap/servers/plugins/replication/repl5_tot_protocol.c,v
retrieving revision 1.10
retrieving revision 1.11
diff -u -r1.10 -r1.11
--- repl5_tot_protocol.c	5 Dec 2008 22:41:52 -0000	1.10
+++ repl5_tot_protocol.c	12 Mar 2009 02:16:42 -0000	1.11
@@ -95,6 +95,7 @@
 static int send_entry (Slapi_Entry *e, void *callback_data);
 static void repl5_tot_delete(Private_Repl_Protocol **prp);
 
+#define LOST_CONN_ERR(xx) ((xx == -2) || (xx == LDAP_SERVER_DOWN) || (xx == LDAP_CONNECT_ERROR))
 /*
  * Notes on the async version of this code: 
  * First, we need to have the supplier and consumer both be async-capable.
@@ -120,9 +121,9 @@
 repl5_tot_log_operation_failure(int ldap_error, char* ldap_error_string, const char *agreement_name)
 {
                 slapi_log_error(SLAPI_LOG_FATAL, repl_plugin_name,
-						"%s: Received error %d: %s for total update operation\n",
+						"%s: Received error %d (%s): %s for total update operation\n",
 						agreement_name,
-						ldap_error, ldap_error_string ? ldap_error_string : "NULL");
+						ldap_error, ldap_err2string(ldap_error), ldap_error_string ? ldap_error_string : "");
 }
 
 /* Thread that collects results from async operations sent to the consumer */
@@ -206,7 +207,9 @@
 		}
 		/* Should we stop ? */
 		PR_Lock(cb->lock);
-		if (cb->stop_result_thread) 
+		/* if the connection is not connected, then we cannot read any more
+		   results - we are finished */
+		if (cb->stop_result_thread || (conres == CONN_NOT_CONNECTED)) 
 		{
 			finished = 1;
 		}
@@ -290,13 +293,17 @@
 			/* If so then we're done */
 			done = 1;
 		}
+		if (cb_data->abort && LOST_CONN_ERR(cb_data->rc))
+		{
+			done = 1; /* no connection == no more results */
+		}
 		PR_Unlock(cb_data->lock);
 		/* If not then sleep a bit */
 		DS_Sleep(PR_SecondsToInterval(1));
 		loops++;
 		/* If we sleep forever then we can conclude that something bad happened, and bail... */
 		/* Arbitrary 30 second delay : basically we should only expect to wait as long as it takes to process a few operations, which should be on the order of a second at most */
-		if (loops > 300) 
+		if (!done && (loops > 300))
 		{
 			/* Log a warning */
 			slapi_log_error(SLAPI_LOG_FATAL, NULL,
@@ -618,6 +625,18 @@
 		return -1;    
     }
 
+    /* see if the result reader thread encountered
+       a fatal error */
+    PR_Lock(((callback_data*)cb_data)->lock);
+    rc = ((callback_data*)cb_data)->abort;
+    PR_Unlock(((callback_data*)cb_data)->lock);
+    if (rc)
+    {
+        conn_disconnect(prp->conn);
+        prp->stopped = 1;
+        ((callback_data*)cb_data)->rc = -1;
+        return -1;
+    }
     /* skip ruv tombstone - need to  do this because it might be
        more up to date then the data we are sending to the client.
        RUV is sent separately via the protocol */
@@ -702,9 +721,14 @@
     ber_bvfree(bv);
 	(*num_entriesp)++;
 
-	/* For async operation we need to inspect the abort status from the result thread here */
-
-	if (CONN_OPERATION_SUCCESS == rc) {
+	/* if the connection has been closed, we need to stop
+	   sending entries and set a special rc value to let
+	   the result reading thread know the connection has been
+	   closed - do not attempt to read any more results */
+	if (CONN_NOT_CONNECTED == rc) {
+		((callback_data*)cb_data)->rc = -2;
+		retval = -1;
+	} else if (CONN_OPERATION_SUCCESS == rc) {
 		retval = 0;
 	} else {
 		((callback_data*)cb_data)->rc = rc;




More information about the Fedora-directory-commits mailing list