check-mirrors check-mirrors.py,1.16,1.17

Michael Patrick McGrath (mmcgrath) fedora-extras-commits at redhat.com
Tue Aug 1 19:08:20 UTC 2006


Author: mmcgrath

Update of /cvs/fedora/check-mirrors
In directory cvs-int.fedora.redhat.com:/tmp/cvs-serv22484

Modified Files:
	check-mirrors.py 
Log Message:
Initial creation of a db backend. Still lots of work to do, but this works.



Index: check-mirrors.py
===================================================================
RCS file: /cvs/fedora/check-mirrors/check-mirrors.py,v
retrieving revision 1.16
retrieving revision 1.17
diff -u -r1.16 -r1.17
--- check-mirrors.py	17 Jul 2006 18:28:44 -0000	1.16
+++ check-mirrors.py	1 Aug 2006 19:08:17 -0000	1.17
@@ -21,12 +21,12 @@
 
 # TODO:
 # - better error handling
-# - push into a db?
 # - have it accept an option which specifies a section in the config file
 #    to operate on - rather than doing all of them.
+# - remove deleted mirrors from the database
 
 
-debug = False
+debug = True
 
 __revision__ = '$Id$'
 CONFIG = '/etc/check-mirrors.conf'
@@ -43,82 +43,154 @@
 import socket
 import urlparse
 import glob
+import sqlite
 
 from urlgrabber.grabber import URLGrabber
 from urlgrabber.grabber import URLGrabError
 
-class YumBaseError(exceptions.Exception):
-    def __init__(self, args=None):
-        exceptions.Exception.__init__(self)    
+class YumBaseError( exceptions.Exception ):
+    def __init__( self, args=None ):
+        exceptions.Exception.__init__( self )    
         self.args = args
 
-class RepoMDError(YumBaseError):
-    def __init__(self, args=None):
-        YumBaseError.__init__(self)
+class RepoMDError( YumBaseError ):
+    def __init__( self, args=None ):
+        YumBaseError.__init__( self )
         self.args = args
 
 
-def ns_cleanup(qn):
-    if qn.find('}') == -1: return qn 
-    return qn.split('}')[1]
+def ns_cleanup( qn ):
+    if qn.find( '}' ) == -1: return qn 
+    return qn.split( '}' )[1]
 
-def errorprint(stuff):
-    print >> sys.stderr, stuff
+def errorprint( error ):
+    print >> sys.stderr, error
 
-def check_and_make_dir(dir):
+def check_and_make_db( db ):
+    """
+    verify that we can create the sqlite DB file
+    """
+    try:
+        con = sqlite.connect( db )
+        cursor = con.cursor()
+    except sqlite.Error, errmsg:
+        errorprint( 'Failed to connect to database: %s' % db )
+        errorprint( 'Err: ' +str( errmsg ) )
+        return None, None
+
+    try:
+        query = "insert into mirrors (repo, arch, country, url, failures, lastgood) VALUES ('testrepo', 'testarch', 'testcountry', 'http://nowhere/', 0, DATETIME('now'));"
+        if debug:
+            print "Executing %s" % query
+        cursor.execute( query )
+        if debug:
+            print "deleting test %i" % cursor.lastrowid
+        cursor.execute( "delete from mirrors where m_id =" + str( cursor.lastrowid ) + ";" )
+        con.commit()
+    except sqlite.Error, errmsg:
+        if debug:
+            print 'db IO test failed: ' +str( errmsg )
+        
+        try:
+            cursor.execute( 'CREATE TABLE mirrors (m_id INTEGER PRIMARY KEY, repo varchar(30), arch varchar(8), country varchar(2), url text, failures integer, lastgood date);' )
+            con.commit()
+        except sqlite.Error, errmsg:
+            errorprint( 'Err: ' +str( errmsg ) )
+            return None, None
+    return con, cursor
+
+
+def check_and_make_dir( dir ):
     """
      check out the dir and make it, if possible, return 1 if done, else return 0
     """
-    if os.path.exists(dir):
-        if not os.path.isdir(dir):
-            errorprint('%s is not a dir' % dir)
+    if os.path.exists( dir ):
+        if not os.path.isdir( dir ):
+            errorprint( '%s is not a dir' % dir )
             result = False
         else:
-            if not os.access(dir, os.W_OK):
-                errorprint('%s is not writable' % dir)
+            if not os.access( dir, os.W_OK ):
+                errorprint( '%s is not writable' % dir )
                 result = False
             else:
                 result = True
     else:
         try:
-            os.mkdir(dir)
+            os.mkdir( dir )
         except OSError, e:
-            errorprint('Error creating dir %s: %s' % (dir, e))
+            errorprint( 'Error creating dir %s: %s' % ( dir, e ) )
             result = False
         else:
             result = True
     return result
-    
+
+def update_db( repo, arch, country, url, failure, dbconn, dbcursor ):
+    updated = 0
+    if not dbcursor:
+        errorprint( 'sqlite database check failed' )
+
+    if failure:
+        query = "update mirrors set failures=(select failures from mirrors where url='%s')+1 where url='%s' and repo='%s' and arch='%s';" % ( url, url, repo, arch)
+    else:
+        query = "update mirrors set failures='0', lastgood=DATETIME('now') where url='%s' and repo='%s' and arch='%s';" % ( url, repo, arch )
+    try:
+        if debug:
+            print "Executing: %s" % query
+        dbcursor.execute( query )
+        updated = dbcursor.rowcount
+        dbconn.commit()
+    except sqlite.Error, errmsg:
+        errorprint( 'DBerr: ' +str( errmsg ) )
+        errorprint(query)
+    if not updated:
+        try:
+            if failure:
+                lastgoodsql='0'
+            else:
+                lastgoodsql="DATETIME('now')"
+            query = "insert into mirrors (repo, arch, country, url, failures, lastgood) VALUES ('%s', '%s', '%s', '%s', '%s', %s);" % ( repo, arch, country, url, failure, lastgoodsql )
+            if debug:
+                print "Executing: %s" % query
+            dbcursor.execute( query )
+            updated = dbcursor.rowcount
+            dbconn.commit()
+        except sqlite.Error, errmsg:
+            errorprint( 'DBErr: ' +str( errmsg ) )
+            errorprint(query)
+            return None
+    return updated
+
+
 class RepoData:
     """represents anything beneath a <data> tag"""
-    def __init__(self, elem):
-        self.type = elem.attrib.get('type')
-        self.location = (None, None)
-        self.checksum = (None,None) # type,value
-        self.openchecksum = (None,None) # type,value
+    def __init__( self, elem ):
+        self.type = elem.attrib.get( 'type' )
+        self.location = ( None, None )
+        self.checksum = ( None, None ) # type,value
+        self.openchecksum = ( None, None ) # type,value
         self.timestamp = None
     
-        self.parse(elem)
+        self.parse( elem )
 
-    def parse(self, elem):
+    def parse( self, elem ):
         
         for child in elem:
-            child_name = ns_cleanup(child.tag)
+            child_name = ns_cleanup( child.tag )
             if child_name == 'location':
-                relative = child.attrib.get('href')
-                base = child.attrib.get('base')
-                self.location = (base, relative)
-            
+                relative = child.attrib.get( 'href' )
+                base = child.attrib.get( 'base' )
+                self.location = ( base, relative )
+
             elif child_name == 'checksum':
                 csum_value = child.text
-                csum_type = child.attrib.get('type')
-                self.checksum = (csum_type,csum_value)
+                csum_type = child.attrib.get( 'type' )
+                self.checksum = ( csum_type, csum_value )
 
             elif child_name == 'open-checksum':
                 csum_value = child.text
-                csum_type = child.attrib.get('type')
-                self.openchecksum = (csum_type, csum_value)
-            
+                csum_type = child.attrib.get( 'type' )
+                self.openchecksum = ( csum_type, csum_value )
+
             elif child_name == 'timestamp':
                 self.timestamp = child.text
     
@@ -126,44 +198,44 @@
 class RepoMD:
     """represents the repomd xml file"""
     
-    def __init__(self, repoid, srcfile):
+    def __init__( self, repoid, srcfile ):
         """takes a repoid and a filename for the repomd.xml"""
         
         self.repoid = repoid
         self.repoData = {}
         
-        if type(srcfile) == type('str'):
+        if type( srcfile ) == type( 'str' ):
             # srcfile is a filename string
-            infile = open(srcfile, 'rt')
+            infile = open( srcfile, 'rt' )
         else:
             # srcfile is a file object
             infile = srcfile
         
-        parser = iterparse(infile)
+        parser = iterparse( infile )
         
         try:
             for event, elem in parser:
-                elem_name = ns_cleanup(elem.tag)
+                elem_name = ns_cleanup( elem.tag )
                 
                 if elem_name == "data":
-                    thisdata = RepoData(elem=elem)
+                    thisdata = RepoData( elem=elem )
                     self.repoData[thisdata.type] = thisdata
         except SyntaxError, e:
             raise RepoMDError, "Damaged repomd.xml file"
-            
-    def fileTypes(self):
+
+    def fileTypes( self ):
         """return list of metadata file types available"""
         return self.repoData.keys()
-    
-    def getData(self, type):
-        if self.repoData.has_key(type):
+ 
+    def getData( self, type ):
+        if self.repoData.has_key( type ):
             return self.repoData[type]
         else:
             raise RepoMDError, "Error: requested datatype %s not available" % type
-            
-    def dump(self):
+
+    def dump( self ):
         """dump fun output"""
-        
+
         for ft in self.fileTypes():
             thisdata = self.repoData[ft]
             print 'datatype: %s' % thisdata.type
@@ -172,41 +244,41 @@
             print 'checksum: %s -%s' % thisdata.checksum
             print 'open checksum: %s - %s' %  thisdata.openchecksum
 
-class MirrorContainer(object):
+class MirrorContainer( object ):
     """Holder for info about a specific mirror"""
-    
-    def __init__(self, url, grabber, archlist, gi):
+
+    def __init__( self, url, grabber, archlist, gi ):
         self.url = url
         self.grabber = grabber
         self.geoip = gi
         self.timestamps = {}
         self.archlist = archlist
         self.country = None
-        self.get_timestamp(url)
-        self.get_country(url)
-        
-    def get_timestamp(self, url):
+        self.get_timestamp( url )
+        self.get_country( url )
+
+    def get_timestamp( self, url ):
         url = '%s/repodata/repomd.xml' % url
-        (suburl, count) = re.subn('\$ARCH', '$BASEARCH', url)
-        (suburl, count) = re.subn('\$BASEARCH','$basearch', suburl)
-        
+        ( suburl, count ) = re.subn( '\$ARCH', '$BASEARCH', url )
+        ( suburl, count ) = re.subn( '\$BASEARCH', '$basearch', suburl )
+
         for arch in self.archlist:
-            (finurl, count) = re.subn('\$basearch', arch, suburl)
+            ( finurl, count ) = re.subn( '\$basearch', arch, suburl )
             try:
-                fo = self.grabber.urlopen(finurl)
+                fo = self.grabber.urlopen( finurl )
             except URLGrabError, e:
                 if debug:
                     print 'error on %s' % finurl
                 continue
 
             try:
-                p = RepoMD('fooid', fo)
+                p = RepoMD( 'fooid', fo )
             except RepoMDError, e:
                 if debug:
                     print e
                 continue
             except URLGrabError, e:
-                errorprint("Grabber error on %s arch %s was:\n%s" % (url, arch, e))
+                errorprint( "Grabber error on %s arch %s was:\n%s" % ( url, arch, e ) )
                 continue
             else:
                 thisdata = p.repoData['primary']
@@ -214,202 +286,213 @@
                 del p
                 fo.close()
                 del fo
-    
-    def get_country(self, url):
-        url_parts = urlparse.urlparse(url)
-        h = url_parts[1]
-        addr = socket.gethostbyname(h)
-        self.country = self.geoip.country_code_by_addr(addr)
-        
-
 
+    def get_country( self, url ):
+        url_parts = urlparse.urlparse( url )
+        h = url_parts[1]
+        addr = socket.gethostbyname( h )
+        self.country = self.geoip.country_code_by_addr( addr )
 
 
-class MirrorListInfo(object):
+class MirrorListInfo( object ):
     """Holder for config info from the configuration file about the 
        mirrorlist being checked"""
-       
-    def __init__(self):
+
+    def __init__( self ):
         self.archlist = ['i386', 'x86_64', 'ppc']
         self.mirrorid = None
         self.inputfile = None
         self.outputpath = None
-        self.timeout = 10
+        self.timeout = 4
         self.canonical = None
+        self.db = None
         self.mirrorlist = []
-    
-    def populate_mirrorlist(self, grabber_inst):
+
+    def populate_mirrorlist( self, grabber_inst ):
         try:
-            fo = grabber_inst.urlopen(self.inputfile)
+            fo = grabber_inst.urlopen( self.inputfile )
         except IOError, e:
             return
         else:
             content = fo.readlines()
             for line in content:
-                if re.match('^\s*\#.*', line) or re.match('^\s*$', line):
+                if re.match( '^\s*\#.*', line ) or re.match( '^\s*$', line ):
                     continue
-                mirror = re.sub('\n$', '', line) # no more trailing \n's
-                self.mirrorlist.append(mirror)
-        
+                mirror = re.sub( '\n$', '', line ) # no more trailing \n's
+                self.mirrorlist.append( mirror )
+
             fo.close()
 
-def config(cfg):
+def config( cfg ):
 
     sections = []
     conf = ConfigParser.ConfigParser()
-    conf.read(cfg)
-    
+    conf.read( cfg )
+
     for section in conf.sections():
         item = MirrorListInfo()
 
-        if conf.has_option(section, 'file_prefix'):
-            item.mirrorid = conf.get(section, 'file_prefix')
+        if conf.has_option( section, 'file_prefix' ):
+            item.mirrorid = conf.get( section, 'file_prefix' )
         else:
             item.mirrorid = '%s' % section
 
         broken = False
-        
-        if conf.has_option(section, 'inputfile'):
-            item.inputfile = conf.get(section, 'inputfile')
+
+        if conf.has_option( section, 'inputfile' ):
+            item.inputfile = conf.get( section, 'inputfile' )
         else:
-            errorprint('missing inputfile')
+            errorprint( 'missing inputfile' )
             broken = True
-            
-        if conf.has_option(section, 'outputpath'):
-            item.outputpath = conf.get(section, 'outputpath')
+
+        if conf.has_option( section, 'outputpath' ):
+            item.outputpath = conf.get( section, 'outputpath' )
         else:
-            errorprint('missing outputpath')
+            errorprint( 'missing outputpath' )
             broken = True
-            
-        if conf.has_option(section, 'canonical'):
-            item.canonical = conf.get(section, 'canonical')
+
+        if conf.has_option( section, 'canonical' ):
+            item.canonical = conf.get( section, 'canonical' )
         else:
-            errorprint('missing canonical url')
+            errorprint( 'missing canonical url' )
             broken = True
-        
+
+        if conf.has_option( section, 'db' ):
+            item.db = conf.get( section, 'db' )
+        else:
+            errorprint( 'missing db filename' )
+            broken = True
+
         if broken:
-            errorprint("Broooooooooooooken config, in section %s, bailing" % section)
-            sys.exit(1)
-            
-        if conf.has_option(section, 'timeout'):
-            item.timeout = conf.getint(section, 'timeout')
+            errorprint( "Broooooooooooooken config, in section %s, bailing" % section )
+            sys.exit( 1 )
 
+        if conf.has_option( section, 'timeout' ):
+            item.timeout = conf.getint( section, 'timeout' )
 
-        if conf.has_option(section, 'archlist'):
-            a_string = conf.get(section, 'archlist')
-            
-            a_holder = a_string.replace('\n', ' ')
-            a_holder = a_holder.replace(',', ' ')
+
+        if conf.has_option( section, 'archlist' ):
+            a_string = conf.get( section, 'archlist' )
+            a_holder = a_string.replace( '\n', ' ' )
+            a_holder = a_holder.replace( ',', ' ' )
             a_list = a_holder.split()
-            
+
             item.archlist = a_list
 
-        sections.append(item)
-    
+        sections.append( item )
+
     return sections
 
 
-def main(cfg_file):
-    if not os.path.exists(cfg_file):
-        errorprint("config file %s does not exist" % cfg_file)
-        sys.exit(1)
+def main( cfg_file ):
+    if not os.path.exists( cfg_file ):
+        errorprint( "config file %s does not exist" % cfg_file )
+        sys.exit( 1 )
+
+    sections = config( cfg_file )
+    gi = GeoIP.new( GeoIP.GEOIP_STANDARD )
 
-    sections = config(cfg_file)
-    gi = GeoIP.new(GeoIP.GEOIP_STANDARD)
-    
     # grab the canonical mirrors info
     for s in sections:
         mirrors = []
-        
-        ug = URLGrabber(timeout=s.timeout)
-        s.populate_mirrorlist(ug)
-        if len(s.mirrorlist) < 1:
-            errorprint("no mirrors to look at for %s, something is broken, skipping" % s.mirrorid)
+        badmirrors = []
+
+        ug = URLGrabber( timeout=s.timeout )
+        s.populate_mirrorlist( ug )
+        if len( s.mirrorlist ) < 1:
+            errorprint( "no mirrors to look at for %s, something is broken, skipping" % s.mirrorid )
             continue
 
-        if not check_and_make_dir(s.outputpath):
-            errorprint('Error creating output path %s for %s' % (s.outputpath, s.mirrorid))
+        dbconn, dbcursor = check_and_make_db( s.db )
+
+        if not check_and_make_dir( s.outputpath ):
+            errorprint( 'Error creating output path %s for %s' % ( s.outputpath, s.mirrorid ) )
             continue
 
         # get the list of the old files
         new_file_list = []
         old_file_list = []
-        filematch = '%s/%s*' % (s.outputpath, s.mirrorid)
+        filematch = '%s/%s*' % ( s.outputpath, s.mirrorid )
         if debug: print filematch
-        old_file_list.extend(glob.glob(filematch))
+        old_file_list.extend( glob.glob( filematch ) )
         if debug: print old_file_list
-        
-        canon = MirrorContainer(s.canonical, ug, s.archlist, gi)
-        if len(canon.timestamps.keys()) < len(s.archlist):
+
+        canon = MirrorContainer( s.canonical, ug, s.archlist, gi )
+        if len( canon.timestamps.keys() ) < len( s.archlist ):
             # if we can't get info for all arches for the canonical mirror, exit
-            errorprint("Cannot contact canonical host for all archs for mirrorlists of %s skipping" % s.mirrorid)
+            errorprint( "Cannot contact canonical host for all archs for mirrorlists of %s skipping" % s.mirrorid )
             continue
-    
+
         if debug:
             # debug only - just printing out info
             for arch in s.archlist:
-                if canon.timestamps.has_key(arch):
-                    print '%s -  %s: %s' % (s.mirrorid, arch, canon.timestamps[arch])
+                if canon.timestamps.has_key( arch ):
+                    print '%s -  %s: %s' % ( s.mirrorid, arch, canon.timestamps[arch] )
 
         # get the info for all the mirrors
-            
+
         for url in s.mirrorlist:
             try:
-                m = MirrorContainer(url, ug, s.archlist, gi)
+                m = MirrorContainer( url, ug, s.archlist, gi )
             except socket.gaierror, e:
-                errorprint("Cannot get address for mirror %s" % url)
+                errorprint( "Cannot get address for mirror %s" % url )
                 continue
             else:
                 if m:
-                    mirrors.append(m)
-        
+                    mirrors.append( m )
+
         # print them out per-arch and per-country
         for arch in s.archlist:
             glob_urls = []
             country_specific = {}
             
             for m in mirrors:
-                if m.timestamps.has_key(arch):
+                goodmirror = 0
+                if m.timestamps.has_key( arch ):
                     if m.timestamps[arch] == canon.timestamps[arch]:
                         if debug: print 'adding %s' % m.url
-                        glob_urls.append(m.url)
+                        glob_urls.append( m.url )
+                        goodmirror=1
                     if m.country:
-                        if not country_specific.has_key(m.country):
+                        if not country_specific.has_key( m.country ):
                             country_specific[m.country] = []
-                        if debug: print 'adding to %s: %s' % (m.country, m.url)
-                        country_specific[m.country].append(m.url)
-                    
-            global_file = '%s/%s-global-%s.txt' % (s.outputpath, s.mirrorid, arch)
-            glob_fo = open(global_file, 'w')
+                            goodmirror=1
+                        if debug: print 'adding to %s: %s' % ( m.country, m.url )
+                        country_specific[m.country].append( m.url )
+                if not goodmirror:
+                    print "Bad: %s, %s, %s, %s" % (s.mirrorid, arch, m.country, m.url)
+                    if not update_db(s.mirrorid, arch, m.country, m.url, '1', dbconn, dbcursor):
+                        errorprint( "Error updating: %s" % url)
+            global_file = '%s/%s-global-%s.txt' % ( s.outputpath, s.mirrorid, arch )
+            glob_fo = open( global_file, 'w' )
             for url in glob_urls:
-                glob_fo.write('%s\n' % url)
+                glob_fo.write( '%s\n' % url )
             glob_fo.close()
-            new_file_list.append(os.path.normpath(global_file))
-            
+            new_file_list.append( os.path.normpath( global_file ) )
+
             for code in country_specific.keys():
-                country_file = '%s/%s-%s-%s.txt' % (s.outputpath, s.mirrorid, code, arch)
-                country_fo = open(country_file, 'w')
+                country_file = '%s/%s-%s-%s.txt' % ( s.outputpath, s.mirrorid, code, arch )
+                country_fo = open( country_file, 'w' )
                 for url in country_specific[code]:
-                    country_fo.write('%s\n' % url)
-                country_fo.close()
-                new_file_list.append(os.path.normpath(country_file))
-        
+                    country_fo.write( '%s\n' % url )
+                    if not update_db( s.mirrorid, arch, code, url, 0, dbconn, dbcursor ):
+                          errorprint( "Error updating: %s" % url )
+                new_file_list.append( os.path.normpath( country_file ) )
+
         # clean up
         for fn in old_file_list:
-            fn = os.path.normpath(fn)
+            fn = os.path.normpath( fn )
             if fn not in new_file_list:
                 if debug: print "removing old file %s" % fn 
-                os.unlink(fn)
-        
-                
+                os.unlink( fn )
+#    dbconn.close()
+#    dbcursor.close()
 
 
 if __name__ == '__main__':
-    if len(sys.argv) < 2:
+    if len( sys.argv ) < 2:
         conf_fn = CONFIG
     else:
         conf_fn = sys.argv[1]
 
-    main(conf_fn)
-
-
+    main( conf_fn )




More information about the fedora-extras-commits mailing list