extras-buildsys/server Builder.py,1.20.2.1,1.20.2.2

Daniel Williams (dcbw) fedora-extras-commits at redhat.com
Tue Nov 15 18:56:24 UTC 2005


Author: dcbw

Update of /cvs/fedora/extras-buildsys/server
In directory cvs-int.fedora.redhat.com:/tmp/cvs-serv18193/server

Modified Files:
      Tag: STABLE_0_4
	Builder.py 
Log Message:
2005-11-15  Dan Williams  <dcbw at redhat.com>

    * server/Builder.py
        - Suspend builders on hard errors reported by the builder
            (OSError faults such as running out of disk space or
            file descriptors)




Index: Builder.py
===================================================================
RCS file: /cvs/fedora/extras-buildsys/server/Builder.py,v
retrieving revision 1.20.2.1
retrieving revision 1.20.2.2
diff -u -r1.20.2.1 -r1.20.2.2
--- Builder.py	19 Oct 2005 13:39:19 -0000	1.20.2.1
+++ Builder.py	15 Nov 2005 18:56:22 -0000	1.20.2.2
@@ -29,6 +29,9 @@
 import EmailUtils
 import Config
 
+SUSPEND_NONE = 'none'
+SUSPEND_TIMEOUT = 'timeout'
+SUSPEND_HARD_ERROR = 'hard-error'
 
 class Builder(threading.Thread):
     """ Tracks all jobs on a builder instance """
@@ -43,12 +46,14 @@
         self._num_slots = 0
         self._address = address
         self._alive = True
+        self._suspend_reason = SUSPEND_NONE
         self._stop = False
         self._prepping_jobs = False
         self._unavail_count = 0
         self._target_list = []
         self._ping_timeout = 0
         self._cur_ping_interval = self._BUILDER_PING_INTERVAL
+        self._ping_now = False
         self._when_died = 0
         self._server_cfg = cfg
 
@@ -166,6 +171,10 @@
             jobid = 0
         except xmlrpclib.Fault, e:
             print "Builder Error (%s) in start_job(): builder replied '%s'" % (self.address, e)
+            # Check for hard errors, for which we suspend the builder
+            error = str(e)
+            if string.find(error, "OSError") >= 0 and string.find(error, "Errno") >= 0:
+                self._handle_builder_suspend(SUSPEND_HARD_ERROR, error)
             time.sleep(0.5)
             jobid = 0
 
@@ -228,31 +237,35 @@
     def ping_asap(self):
         # Reduce the ping interval to ping the builder right away
         self._cur_ping_interval = 0
+        self._ping_now = True
 
-    def _handle_builder_suspend(self):
+    def _handle_builder_suspend(self, reason, msg):
         for jobid in self._jobs.keys():
             job = self._jobs[jobid]
             job.builder_gone()
             del self._jobs[jobid]
         self._jobs = {}
         self._alive = False
+        self._suspend_reason = reason
         self._unavail_count = 0
         self._prepping_jobs = False
-        self._ping_timeout = time.time()
         self._when_died = time.time()
+
         # Reset current ping interval to default
         self._cur_ping_interval = self._BUILDER_PING_INTERVAL
+        self._ping_timeout = time.time()
 
         # Notify admins
-        print "Suspending builder '%s' because it timed out." % self._address
-        subject = "Builder Timeout: %s" % self._address
-        msg = "The builder '%s' timed out and was suspended." % self._address
+        print "Suspending builder '%s'.  Reason: %s - %s." % (self._address, reason, msg)
+        subject = "Builder Suspended: %s" % self._address
+        msg = "The builder '%s' was suspended.  Reason: %s - %s." % (self._address, reason, msg)
         sender = self._server_cfg.get_str("Email", "email_from")
         for addr in self._server_cfg.get_list("Email", "admin_emails"):
             EmailUtils.email_result(sender, addr, msg, subject)
 
     def _handle_builder_reactivate(self, target_list):
         self._alive = True
+        self._suspend_reason = SUSPEND_NONE
         self._ping_timeout = 0
 
         self._init_builder(target_list)
@@ -278,12 +291,12 @@
 
                 if self._unavail_count > 2:
                     # Kill all jobs on the client if it went away
-                    self._handle_builder_suspend()
+                    self._handle_builder_suspend(SUSPEND_TIMEOUT, "the builder timed out")
                 else:
                     # Update status of all archjobs on this builder
                     for j in self._jobs.values():
                         j.process()
-            else:
+            elif not self._alive and (self._suspend_reason == SUSPEND_TIMEOUT or self._ping_now):
                 # Ping the builder every so often to see if it responds again
                 if time.time() > (self._ping_timeout + self._cur_ping_interval):
                     (alive, target_list) = self._ping_builder()
@@ -295,6 +308,7 @@
 
                     # Reset current ping interval to default
                     self._cur_ping_interval = self._BUILDER_PING_INTERVAL
+                    self._ping_now = False
 
             self._server_lock.release()
 




More information about the fedora-extras-commits mailing list