#!/usr/bin/python
# Fuzzy license searcher
# Author: Colin Walters
# This file is hereby placed into the public domain.
"""Fuzzy license searcher.

Walks a directory tree, fuzzily compares every line of every file against a
small table of characteristic license phrases, and prints the files that best
match each known license.
"""

import difflib
import io
import os
import re
import sys

TABLE_URL = 'http://fedoraproject.org/wiki/Licensing?action=raw'

# License name -> match data.  'seqs' holds the phrases that are fuzzily
# compared against each line of a file; 'fragments' is not consulted by the
# code in this file.
licenses = {
    'GPLv2+': {'fragments': ['GNU General Public License'],
               'seqs': ['under the terms of the GNU General Public License',
                        'version 2 of the License',
                        'or (at your option) any later version']},
    'ASL 2.0': {'seqs': ['under the Apache License, Version 2.0']},
}

# Replace every phrase with a SequenceMatcher that has the phrase fixed as its
# second sequence: difflib caches its analysis of seq2, so only seq1 (the line
# under test) has to change while scanning.  A real list is built (not a lazy
# map object) because the matchers are indexed and re-used for every file.
for matchdata in licenses.values():
    matchdata['seqs'] = [difflib.SequenceMatcher(None, '', seq)
                         for seq in matchdata['seqs']]


def _update_best_ratios(line, matchers, best):
    """Raise each entry of *best* to the ratio *line* reaches, if higher."""
    for idx, matcher in enumerate(matchers):
        matcher.set_seq1(line)
        # quick_ratio() is a cheap upper bound on ratio(); only pay for the
        # exact computation when the line could beat the current best.
        if matcher.quick_ratio() > best[idx]:
            ratio = matcher.ratio()
            if ratio > best[idx]:
                best[idx] = ratio


def file_licenses(fname, threshold=0.7):
    """Yield ``(license, avg_ratio)`` for each license that *fname* matches.

    For every phrase of a license, the best similarity ratio reached by any
    single line of the file is recorded.  A license is reported when the
    average of those per-phrase bests is at least *threshold*.

    The file is read once, decoded as UTF-8 with undecodable bytes replaced,
    so binary files do not abort the scan.  Errors from opening the file
    (for example a missing or unreadable path) propagate to the caller.

    NOTE: the module-level matchers in ``licenses`` are mutated while
    scanning, so two scans must not be interleaved.
    """
    best_by_license = dict((name, [0.0] * len(data['seqs']))
                           for name, data in licenses.items())

    with io.open(fname, encoding='utf-8', errors='replace') as source:
        for line in source:
            for name, data in licenses.items():
                _update_best_ratios(line, data['seqs'], best_by_license[name])

    for name in licenses:
        best = best_by_license[name]
        if not best:
            # A license without phrases can never match; skipping it also
            # avoids dividing by zero.
            continue
        avg_ratio = sum(best) / len(best)
        if avg_ratio >= threshold:
            yield (name, avg_ratio)


def dirtree_licenses(dirname):
    """Yield ``(path, license, ratio)`` for every file below *dirname*."""
    for parent, _subdirs, files in os.walk(dirname):
        for fname in files:
            fpath = os.path.join(parent, fname)
            for name, ratio in file_licenses(fpath):
                yield (fpath, name, ratio)


def main(argv=None):
    """Scan the directory named in *argv* and print improving matches.

    Returns the process exit status: 0 on success, 2 when the directory
    argument is missing.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        prog = argv[0] if argv else 'license-search'
        sys.stderr.write('usage: %s DIRECTORY\n' % prog)
        return 2

    matches = {}
    for fpath, name, ratio in dirtree_licenses(argv[1]):
        if name not in matches or matches[name][0] < ratio:
            matches[name] = (ratio, fpath)
            # NOTE(review): the original layout was lost; the report line is
            # assumed to belong inside this branch, so a line is printed each
            # time a better match for a license is found.
            sys.stdout.write('%s %s: %s\n' % (name, ratio, fpath))
    return 0


if __name__ == '__main__':
    sys.exit(main())