extras-buildsys/server Builder.py,1.20.2.1,1.20.2.2
Daniel Williams (dcbw)
fedora-extras-commits at redhat.com
Tue Nov 15 18:56:24 UTC 2005
Author: dcbw
Update of /cvs/fedora/extras-buildsys/server
In directory cvs-int.fedora.redhat.com:/tmp/cvs-serv18193/server
Modified Files:
Tag: STABLE_0_4
Builder.py
Log Message:
2005-11-15 Dan Williams <dcbw at redhat.com>
* server/Builder.py
- Suspend builders on hard errors, such as running out of disk
space or file descriptors (any OSError/Errno fault reported by the builder)
Index: Builder.py
===================================================================
RCS file: /cvs/fedora/extras-buildsys/server/Builder.py,v
retrieving revision 1.20.2.1
retrieving revision 1.20.2.2
diff -u -r1.20.2.1 -r1.20.2.2
--- Builder.py 19 Oct 2005 13:39:19 -0000 1.20.2.1
+++ Builder.py 15 Nov 2005 18:56:22 -0000 1.20.2.2
@@ -29,6 +29,9 @@
import EmailUtils
import Config
+SUSPEND_NONE = 'none'
+SUSPEND_TIMEOUT = 'timeout'
+SUSPEND_HARD_ERROR = 'hard-error'
class Builder(threading.Thread):
""" Tracks all jobs on a builder instance """
@@ -43,12 +46,14 @@
self._num_slots = 0
self._address = address
self._alive = True
+ self._suspend_reason = SUSPEND_NONE
self._stop = False
self._prepping_jobs = False
self._unavail_count = 0
self._target_list = []
self._ping_timeout = 0
self._cur_ping_interval = self._BUILDER_PING_INTERVAL
+ self._ping_now = False
self._when_died = 0
self._server_cfg = cfg
@@ -166,6 +171,10 @@
jobid = 0
except xmlrpclib.Fault, e:
print "Builder Error (%s) in start_job(): builder replied '%s'" % (self.address, e)
+ # Check for hard errors, for which we suspend the builder
+ error = str(e)
+ if string.find(error, "OSError") >= 0 and string.find(error, "Errno") >= 0:
+ self._handle_builder_suspend(SUSPEND_HARD_ERROR, error)
time.sleep(0.5)
jobid = 0
@@ -228,31 +237,35 @@
def ping_asap(self):
# Reduce the ping interval to ping the builder right away
self._cur_ping_interval = 0
+ self._ping_now = True
- def _handle_builder_suspend(self):
+ def _handle_builder_suspend(self, reason, msg):
for jobid in self._jobs.keys():
job = self._jobs[jobid]
job.builder_gone()
del self._jobs[jobid]
self._jobs = {}
self._alive = False
+ self._suspend_reason = reason
self._unavail_count = 0
self._prepping_jobs = False
- self._ping_timeout = time.time()
self._when_died = time.time()
+
# Reset current ping interval to default
self._cur_ping_interval = self._BUILDER_PING_INTERVAL
+ self._ping_timeout = time.time()
# Notify admins
- print "Suspending builder '%s' because it timed out." % self._address
- subject = "Builder Timeout: %s" % self._address
- msg = "The builder '%s' timed out and was suspended." % self._address
+ print "Suspending builder '%s'. Reason: %s - %s." % (self._address, reason, msg)
+ subject = "Builder Suspended: %s" % self._address
+ msg = "The builder '%s' was suspended. Reason: %s - %s." % (self._address, reason, msg)
sender = self._server_cfg.get_str("Email", "email_from")
for addr in self._server_cfg.get_list("Email", "admin_emails"):
EmailUtils.email_result(sender, addr, msg, subject)
def _handle_builder_reactivate(self, target_list):
self._alive = True
+ self._suspend_reason = SUSPEND_NONE
self._ping_timeout = 0
self._init_builder(target_list)
@@ -278,12 +291,12 @@
if self._unavail_count > 2:
# Kill all jobs on the client if it went away
- self._handle_builder_suspend()
+ self._handle_builder_suspend(SUSPEND_TIMEOUT, "the builder timed out")
else:
# Update status of all archjobs on this builder
for j in self._jobs.values():
j.process()
- else:
+ elif not self._alive and (self._suspend_reason == SUSPEND_TIMEOUT or self._ping_now):
# Ping the builder every so often to see if it responds again
if time.time() > (self._ping_timeout + self._cur_ping_interval):
(alive, target_list) = self._ping_builder()
@@ -295,6 +308,7 @@
# Reset current ping interval to default
self._cur_ping_interval = self._BUILDER_PING_INTERVAL
+ self._ping_now = False
self._server_lock.release()
More information about the fedora-extras-commits
mailing list