Koji bandaid
Mike McGrath
mmcgrath at redhat.com
Sun Feb 17 03:50:20 UTC 2008
The koji builders don't check back in automatically[1] if they've lost a
connection to the host. I put this script together in an attempt to fix
it, and thought I'd post it here before sticking it on the builders. The basic
premise is to check whether the builder has checked in within the last 5
minutes (which should be way more than enough), unless the box is under high
load, in which case it allows 15 minutes, which might be overkill.
I'd like to run this check via cron every 5 minutes on each builder.
Does anyone have any suggested fixes, or objections to me running this?
-Mike
[1] https://fedorahosted.org/koji/ticket/66
-------------- next part --------------
#!/usr/bin/python
import urllib
import koji
import socket
import datetime
import time
import os
import sys
# Staleness thresholds in seconds. check_host() compares the time since the
# host's last recorded update against these values.
FIVE_MIN = 300
FIFTEEN_MIN = 900
# Hub client session, created at import time with an empty options dict.
# NOTE(review): the URL points at a "publictest" host -- confirm it is the
# hub the builders should be checked against before deploying.
k = koji.ClientSession('https://publictest8.fedora.phx.redhat.com/kojihub', {})
# Host records fetched once at import time; check_host() iterates over them
# and indexes each record by string keys such as 'name' and 'enabled'.
hosts = k.listHosts()
# Short (first-label) name of the machine running this script.
me = socket.gethostname().split('.')[0]
def restart():
    """Reload the local kojid service and print the init script's output.

    Runs ``/etc/init.d/kojid reload`` directly (no shell), captures stdout
    and stderr together, and prints the captured text.  A non-zero exit
    status, or a failure to start the init script at all, is reported on
    stdout instead of being silently ignored.

    Returns None.
    """
    # Imported locally so the file's top-level import block is untouched.
    import subprocess

    try:
        proc = subprocess.Popen(['/etc/init.d/kojid', 'reload'],
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                universal_newlines=True)
    except OSError:
        # sys.exc_info() rather than "except ... as" keeps this parseable
        # on both old and new interpreters.
        print("Could not run /etc/init.d/kojid: %s" % sys.exc_info()[1])
        return

    out = proc.communicate()[0]
    # Drop a single trailing newline so print() does not emit a blank line.
    if out[-1:] == '\n':
        out = out[:-1]
    print(out)

    if proc.returncode != 0:
        print("kojid reload failed with exit status %s" % proc.returncode)
def lock():
    """Write this process's PID to the lock file.

    Creates (or truncates) /var/lock/subsys/koji_check and stores the
    current PID in it, with no trailing newline.  Errors from opening or
    writing the file propagate to the caller.
    """
    f = open('/var/lock/subsys/koji_check', 'w')
    try:
        f.write('%s' % os.getpid())
    finally:
        # Always release the handle, even if the write fails.
        f.close()
def check_pid(pid):
    """Report whether a process with the given PID appears to exist.

    pid may be an int or a string; surrounding whitespace is ignored.
    Anything that is not a plain decimal number is treated as "no such
    process", so a corrupt lock file (for example one containing "self",
    which would resolve to /proc/self) cannot look like a live process.

    Returns 1 if /proc/<pid>/status can be opened for reading, else 0.
    """
    pid = str(pid).strip()
    if not pid.isdigit():
        return 0
    try:
        f = open('/proc/%s/status' % pid, 'r')
    except (IOError, OSError):
        # No such entry (or unreadable): treat the process as gone.
        return 0
    f.close()
    return 1
def remove_lock():
    """Delete the lock file, tolerating the case where it is already gone.

    Removes /var/lock/subsys/koji_check.  A missing file is not an error,
    since the desired end state (no lock) already holds.  Any other
    OSError, such as a permission problem, is re-raised.
    """
    # Imported locally so the file's top-level import block is untouched.
    import errno

    try:
        os.remove('/var/lock/subsys/koji_check')
    except OSError:
        # sys.exc_info() rather than "except ... as" keeps this parseable
        # on both old and new interpreters.
        if sys.exc_info()[1].errno != errno.ENOENT:
            raise
def check_lock():
    """Make sure no other instance of this check is still running.

    Reads the PID stored in /var/lock/subsys/koji_check:

    * If the file cannot be opened or read, return (no lock is held).
    * If check_pid() reports the recorded process as alive, print a
      message and exit with status 1.
    * Otherwise print a notice, remove the stale lock, and return.
    """
    try:
        f = open('/var/lock/subsys/koji_check', 'r')
        try:
            # Strip whitespace so a trailing newline in the file does not
            # end up inside the /proc path that check_pid() builds.
            old_pid = f.read().strip()
        finally:
            # The original wrote "f.close" without parentheses, which never
            # actually closed the file.
            f.close()
    except IOError:
        return

    if check_pid(old_pid):
        print("Check still running!")
        sys.exit(1)

    print("Lockfile exists, pid dead. Removing lock")
    remove_lock()
    return
def check_host():
    """Restart kojid if this builder's hub record looks stale.

    Walks the module-level ``hosts`` list looking for enabled entries
    whose short name equals ``me``.  For each match it asks the hub for
    the host's last update time and computes how long ago that was:

    * up to FIVE_MIN seconds ago: nothing to do;
    * more than FIVE_MIN ago, and the host is not ready with its task load
      at or above capacity: restart only once FIFTEEN_MIN is exceeded;
    * more than FIVE_MIN ago otherwise: restart.

    NOTE(review): the timestamp is parsed with time.mktime(), i.e. as
    local time -- confirm the hub reports times in the builder's local
    timezone.
    """
    for host in hosts:
        # Compare short names exactly; a prefix test would let "builder1"
        # act on the record belonging to "builder10".
        if host['name'].split('.')[0] != me or not host['enabled']:
            continue

        t = k.getLastHostUpdate(host['id'])
        if not t:
            # Nothing to parse; skip rather than crash on t.split().
            print("No last update recorded for %s, skipping" % host['name'])
            continue

        # Drop any fractional-seconds suffix before parsing.
        dt = time.strptime(t.split('.')[0], "%Y-%m-%d %H:%M:%S")
        # Compute the age once so both thresholds see the same value.
        age = time.time() - time.mktime(dt)
        if age <= FIVE_MIN:
            continue

        # Check to see if the box just happens to be under load
        if host['ready'] == False and host['task_load'] >= host['capacity']:
            # If under high load be a bit more lenient
            if age > FIFTEEN_MIN:
                print("Restarting under high load")
                restart()
        else:
            # No load, and it has been more than 5 minutes. Restart
            print("Restarting")
            restart()
def main():
    """Run one guarded pass of the builder check.

    Exits early (inside check_lock) if another instance is still running.
    Otherwise takes the lock, runs check_host(), and always releases the
    lock afterwards -- even if check_host() raises -- so a failed run does
    not leave a lock file behind.
    """
    check_lock()
    lock()
    try:
        check_host()
    finally:
        remove_lock()


if __name__ == '__main__':
    main()
More information about the Fedora-infrastructure-list
mailing list