[Cluster-devel] conga/luci/site/luci/Extensions cluster_adapte ...

rmccabe at sourceware.org
Wed Oct 18 23:12:32 UTC 2006


CVSROOT:	/cvs/cluster
Module name:	conga
Changes by:	rmccabe at sourceware.org	2006-10-18 23:12:31

Modified files:
	luci/site/luci/Extensions: cluster_adapters.py 
	                           homebase_adapters.py 

Log message:
	better error handling
	log important errors (and those useful for debugging) to syslog
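
	The recurring pattern here splits each ricci operation into a
	guarded connect step and a guarded submit step, logging failures
	through LuciSyslog. A minimal sketch of that pattern in the code
	base's Python 2 style (the ricci_communicator import path and the
	exact LuciSyslog/RicciCommunicator signatures are assumptions
	inferred from the diff below):

	    from LuciSyslog import LuciSyslog, LuciSyslogError
	    from ricci_communicator import RicciCommunicator, RicciError

	    try:
	        luci_log = LuciSyslog()
	    except LuciSyslogError:
	        # Logging is best-effort: a broken syslog must never break luci.
	        luci_log = None

	    def submit_batch(host, batch_node):
	        # Step 1: connect to the ricci agent; log RicciErrors for debugging.
	        try:
	            rc = RicciCommunicator(host)
	        except RicciError, e:
	            if luci_log:
	                luci_log.debug('Unable to connect to the ricci agent on %s: %s' \
	                    % (host, str(e)))
	            return None
	        except:
	            return None

	        # Step 2: submit the batch asynchronously and return its batch id.
	        try:
	            result = rc.process_batch(batch_node, async=True)
	            return result.getAttribute('batch_id')
	        except:
	            if luci_log:
	                luci_log.debug('process_batch failed on %s' % host)
	            return None

	Unlike the patch, the sketch falls back to luci_log = None and
	guards each call, so a failed LuciSyslog() constructor cannot leave
	luci_log unbound.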

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/conga/luci/site/luci/Extensions/cluster_adapters.py.diff?cvsroot=cluster&r1=1.121&r2=1.122
http://sourceware.org/cgi-bin/cvsweb.cgi/conga/luci/site/luci/Extensions/homebase_adapters.py.diff?cvsroot=cluster&r1=1.34&r2=1.35

--- conga/luci/site/luci/Extensions/cluster_adapters.py	2006/10/18 19:16:17	1.121
+++ conga/luci/site/luci/Extensions/cluster_adapters.py	2006/10/18 23:12:31	1.122
@@ -22,7 +22,8 @@
 from clusterOS import resolveOSType
 from GeneralError import GeneralError
 from UnknownClusterError import UnknownClusterError
-from homebase_adapters import nodeUnauth, nodeAuth, manageCluster, createClusterSystems, havePermCreateCluster, setNodeFlag, delNodeFlag, userAuthenticated
+from homebase_adapters import nodeUnauth, nodeAuth, manageCluster, createClusterSystems, havePermCreateCluster, setNodeFlag, delNodeFlag, userAuthenticated, getStorageNode, getClusterNode
+from LuciSyslog import LuciSyslogError, LuciSyslog
 
 #Policy for showing the cluster chooser menu:
 #1) If there are no clusters in the ManagedClusterSystems
@@ -34,6 +35,11 @@
 
 CLUSTER_FOLDER_PATH = '/luci/systems/cluster/'
 
+try:
+	luci_log = LuciSyslog()
+except LuciSyslogError, e:
+	pass
+
 def validateClusterNodes(request, sessionData, clusterName, numStorage):
 	nodeList = list()
 	nodeHash = {}
@@ -205,11 +211,24 @@
 		batch_id_map = {}
 		rc = None
 		for i in nodeList:
+			success = True
 			try:
 				rc = RicciCommunicator(i['ricci_host'])
-				resultNode = rc.process_batch(batchNode, async=True)
-				batch_id_map[i['ricci_host']] = resultNode.getAttribute('batch_id')
+			except RicciError, e:
+				luci_log.debug('Unable to connect to the ricci agent on %s: %s'\
+					% (i['ricci_host'], str(e)))
+				success = False
 			except:
+				success = False
+
+			if success:
+				try:
+					resultNode = rc.process_batch(batchNode, async=True)
+					batch_id_map[i['ricci_host']] = resultNode.getAttribute('batch_id')
+				except:
+					success = False
+
+			if not success:
 				nodeUnauth(nodeList)
 				cluster_properties['isComplete'] = False
 				errors.append('An error occurred while attempting to add cluster node \"' + i['ricci_host'] + '\"')
@@ -294,6 +313,7 @@
 		clusterObj = self.restrictedTraverse(PLONE_ROOT + '/systems/cluster/' + clusterName)
 		cluster_os = clusterObj.manage_getProperty('cluster_os')
 		if not cluster_os:
+			luci_log.debug('The cluster OS property is missing for cluster ' + clusterName)
 			raise Exception, 'no cluster OS was found.'
 		try:
 			if len(filter(lambda x: x['os'] != cluster_os, nodeList)) > 0:
@@ -342,17 +362,28 @@
 	batch_id_map = {}
 	for i in nodeList:
 		clunode = nodeList[i]
+		success = True
 		try:
 			rc = RicciCommunicator(clunode['ricci_host'])
-			resultNode = rc.process_batch(batchNode, async=True)
-			batch_id_map[clunode['ricci_host']] = resultNode.getAttribute('batch_id')
-			messages.append('Cluster join initiated for host \"' + clunode['ricci_host'] + '\"')
 		except:
+			luci_log.info('Unable to connect to the ricci daemon on host ' + clunode['ricci_host'])
+			success = False
+
+		if success:
+			try:
+				resultNode = rc.process_batch(batchNode, async=True)
+				batch_id_map[clunode['ricci_host']] = resultNode.getAttribute('batch_id')
+			except:
+				success = False
+
+		if not success:
 			nodeUnauth(nodeList)
 			cluster_properties['isComplete'] = False
 			errors.append('An error occurred while attempting to add cluster node \"' + clunode['ricci_host'] + '\"')
 			return (False, {'errors': errors, 'requestResults': cluster_properties})
 
+		messages.append('Cluster join initiated for host \"' + clunode['ricci_host'] + '\"')
+
 	buildClusterCreateFlags(self, batch_id_map, clusterName)
 	return (True, {'errors': errors, 'messages': messages})
 
@@ -412,6 +443,7 @@
 		try:
 			resObj = resourceAddHandler[res_type](self, dummy_form)
 		except:
+			luci_log.debug('An error occurred while creating a %s resource' % res_type)
 			resObj = None
 
 		if resObj is None:
@@ -1304,9 +1336,12 @@
 	try:
 		clusterfolder = self.restrictedTraverse(path)
 		if not clusterfolder:
+			luci_log.debug('cluster folder %s for %s is missing.' \
+				% (path, clustername))
 			raise
 		nodes = clusterfolder.objectItems('Folder')
 		if len(nodes) < 1:
+			luci_log.debug('no cluster nodes for %s found.' % clustername)
 			return None
 	except:
 		return None
@@ -1324,15 +1359,15 @@
 
 		try:
 			rc = RicciCommunicator(hostname)
-			if not rc:
-				raise
-		except:
-			#raise Exception, ('unable to communicate with the ricci agent on %s', hostname)
+		except RicciError, e:
+			luci_log.debug('ricci error: %s' % str(e))
 			continue
 
 		try:
 			clu_info = rc.cluster_info()
 			if cluname != lower(clu_info[0]) and cluname != lower(clu_info[1]):
+				luci_log.debug('%s reports it\'s in cluster %s:%s; we expect %s' \
+					 % (hostname, clu_info[0], clu_info[1], cluname))
 				# node reports it's in a different cluster
 				raise
 		except:
@@ -1340,7 +1375,9 @@
 
 		if rc.authed():
 			return rc
-		setNodeFlag(self, node[1], CLUSTER_NODE_NEED_AUTH)
+		setNodeFlag(node[1], CLUSTER_NODE_NEED_AUTH)
+
+	luci_log.debug('no ricci agent could be found for cluster %s' % cluname)
 	return None
 
 def getRicciAgentForCluster(self, req):
@@ -1352,11 +1389,13 @@
 			if not clustername:
 				raise
 		except:
+			luci_log.debug('no cluster name was specified in getRicciAgentForCluster')
 			return None
 	return getRicciAgent(self, clustername)
 
 def getClusterStatus(self, rc):
 	clustatus_batch ='<?xml version="1.0" ?><batch><module name="cluster"><request API_version="1.0"><function_call name="status"/></request></module></batch>'
+
 	try:
 		clustatuscmd_xml = minidom.parseString(clustatus_batch).firstChild
 	except:
@@ -1364,6 +1403,8 @@
 
 	try:
 		ricci_xml = rc.process_batch(clustatuscmd_xml, async=False)
+	except RicciError, e:
+		luci_log.debug('ricci error: %s' % str(e))
+		return {}
 	except:
 		return {}
 
@@ -1998,16 +2039,44 @@
 		# to be performed.
 		try:
 			rc = RicciCommunicator(nodename_resolved)
-			# XXX - check the cluster
-			if not rc.authed():
-				# set the flag
-				rc = None
-
-			if not rc:
-				raise
+		except RicciError, e:
+			luci_log.debug('ricci error from %s: %s' \
+				% (nodename_resolved, str(e)))
+			return None
 		except:
 			return None
 
+		cluinfo = rc.cluster_info()
+		if not cluinfo[0] and not cluinfo[1]:
+			luci_log.debug('host %s not in a cluster (expected %s)' \
+				% (nodename_resolved, clustername))
+			return None
+
+		cname = lower(clustername)
+		if cname != lower(cluinfo[0]) and cname != lower(cluinfo[1]):
+			luci_log.debug('host %s in unknown cluster %s:%s (expected %s)' \
+				% (nodename_resolved, cluinfo[0], cluinfo[1], clustername))
+			return None
+
+		if not rc.authed():
+			rc = None
+			try:
+				snode = getStorageNode(self, nodename)
+				setNodeFlag(snode, CLUSTER_NODE_NEED_AUTH)
+			except:
+				# we'll hit it again, and try again then
+				pass
+
+			try:
+				cnode = getClusterNode(self, nodename, clustername)
+				setNodeFlag(cnode, CLUSTER_NODE_NEED_AUTH)
+			except:
+				# we'll hit it again, and try again then
+				pass
+
+		if rc is None:
+			return None
+
 	if task == NODE_LEAVE_CLUSTER:
 		batch_number, result = nodeLeaveCluster(rc)
 
@@ -2056,40 +2125,64 @@
 		#Now we need to annotate the new DB object
 		objpath = path + "/" + objname
 		flag = self.restrictedTraverse(objpath)
-		flag.manage_addProperty(BATCH_ID,batch_id, "string")
-		flag.manage_addProperty(TASKTYPE,NODE_REBOOT, "string")
-		flag.manage_addProperty(FLAG_DESC,"Node \'" + nodename + "\' is being rebooted", "string")
+		flag.manage_addProperty(BATCH_ID, batch_id, "string")
+		flag.manage_addProperty(TASKTYPE, NODE_REBOOT, "string")
+		flag.manage_addProperty(FLAG_DESC, "Node \'" + nodename + "\' is being rebooted", "string")
 
 		response = request.RESPONSE
 		#Once again, is this correct? Should we re-direct to the cluster page?
 		response.redirect(request['URL'] + "?pagetype=" + CLUSTER_CONFIG + "&clustername=" + clustername)
 	elif task == NODE_FENCE:
 		#here, we DON'T want to open connection to node to be fenced.
-		path = CLUSTER_FOLDER_PATH + clustername
+		path = str(CLUSTER_FOLDER_PATH + clustername)
 		try:
 			clusterfolder = self.restrictedTraverse(path)
 			if not clusterfolder:
 				raise
 		except:
+			luci_log.debug('The cluster folder for %s could not be found.' \
+				 % clustername)
+			return None
+
+		try:
+			nodes = clusterfolder.objectItems('Folder')
+		except:
+			luci_log.debug('No cluster nodes for %s were found' % clustername)
 			return None
 
-		nodes = clusterfolder.objectItems('Folder')
 		found_one = False
 		for node in nodes:
-			if node[1].getID().find(nodename) != (-1):
+			if node[1].getId().find(nodename) != (-1):
 				continue
 
 			try:
 				rc = RicciCommunicator(node[1].getId())
-				if not rc.authed():
-					# set the node flag
-					rc = None
 				if not rc:
-					raise
-				found_one = True
-				break
+					continue
+			except RicciError, e:
+				luci_log.debug('ricci error for host %s: %s' \
+					% (node[0], str(e)))
+				continue
 			except:
 				continue
+
+			if not rc.authed():
+				rc = None
+				try:
+					snode = getStorageNode(self, node[1].getId())
+					setNodeFlag(snode, CLUSTER_NODE_NEED_AUTH)
+				except:
+					pass
+
+				try:
+					setNodeFlag(node[1], CLUSTER_NODE_NEED_AUTH)
+				except:
+					pass
+
+				continue
+			found_one = True
+			break
+
 		if not found_one:
 			return None
 
@@ -3430,14 +3523,23 @@
 	raise
 
 def noNodeFlagsPresent(self, nodefolder, flagname, hostname):
-	items = nodefolder.objectItems('ManagedSystem')
+	try:
+		items = nodefolder.objectItems('ManagedSystem')
+	except:
+		luci_log.debug('An error occurred while trying to list flags for cluster ' + nodefolder.getId())
+		return False
 
 	for item in items:
 		if item[0] != flagname:
 			continue
 
 		#a flag already exists... try to delete it
-		rc = RicciCommunicator(hostname)
+		try:
+			rc = RicciCommunicator(hostname)
+		except:
+			luci_log.info('Unable to connect to the ricci daemon on host ' + hostname)
+			return False
+
 		finished = checkBatch(rc, item[1].getProperty(BATCH_ID))
 		if finished == True:
 			try:
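
For the NODE_FENCE path above, the reworked loop selects a ricci agent on
any authed cluster member other than the node being fenced, flagging
unauthenticated members so the UI can re-prompt for credentials. A
condensed sketch using the helpers as they appear in the diff (nodes is
the clusterfolder.objectItems('Folder') list):

    found_one = False
    for node in nodes:
        # Never open a connection to the node that is about to be fenced.
        if node[1].getId().find(nodename) != (-1):
            continue

        try:
            rc = RicciCommunicator(node[1].getId())
        except RicciError, e:
            luci_log.debug('ricci error for host %s: %s' % (node[0], str(e)))
            continue
        except:
            continue

        if not rc.authed():
            # Flag the node for re-authentication and keep looking.
            try:
                setNodeFlag(node[1], CLUSTER_NODE_NEED_AUTH)
            except:
                pass
            continue

        # The first reachable, authed member wins.
        found_one = True
        break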
--- conga/luci/site/luci/Extensions/homebase_adapters.py	2006/10/16 20:46:46	1.34
+++ conga/luci/site/luci/Extensions/homebase_adapters.py	2006/10/18 23:12:31	1.35
@@ -1367,7 +1367,7 @@
 		pass
 	return False
 
-def setNodeFlag(self, node, flag_mask):
+def setNodeFlag(node, flag_mask):
 	try:
 		flags = node.getProperty('flags')
 		node.manage_changeProperties({ 'flags': flags | flag_mask })
@@ -1377,7 +1377,7 @@
 		except:
 			pass
 
-def delNodeFlag(self, node, flag_mask):
+def delNodeFlag(node, flag_mask):
 	try:
 		flags = node.getProperty('flags')
 		if flags & flag_mask != 0:
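
With self dropped from both signatures, the flag helpers now operate
directly on a Zope node object whose integer 'flags' property is a bit
mask. A sketch of the pair as they read after this change (the
manage_addProperty fallback is reconstructed from the truncated hunk
above and is an assumption):

    def setNodeFlag(node, flag_mask):
        try:
            flags = node.getProperty('flags')
            node.manage_changeProperties({ 'flags': flags | flag_mask })
        except:
            try:
                # No flags property yet: create it holding just this mask.
                node.manage_addProperty('flags', flag_mask, 'int')
            except:
                pass

    def delNodeFlag(node, flag_mask):
        try:
            flags = node.getProperty('flags')
            if flags & flag_mask != 0:
                node.manage_changeProperties({ 'flags': flags & ~flag_mask })
        except:
            pass

Call sites change accordingly, e.g. setNodeFlag(node[1],
CLUSTER_NODE_NEED_AUTH) in cluster_adapters.py instead of the old
setNodeFlag(self, node[1], ...).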



