#!/usr/bin/env python
# SPDX-License-Identifier: GPL-2.0-only
# Copyright Thomas Gleixner

from argparse import ArgumentParser, REMAINDER
from textwrap import TextWrapper
import unicodedata
import datetime
import difflib
import pickle
import locale
import codecs
import time
import json
import git
import sys
import os
import re

git_source_url = 'https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree'

def encode_txt(txt):
    for d in ['ascii', 'UTF-8', 'latin-1', 'iso-8859-1' ]:
        try:
            return txt.decode(d)
        except:
            pass
    try:
        return txt.decode('UTF-8', errors='ignore')
    except:
        res = ''
        for t in txt:
            try:
                res += str(t).decode('UTF-8', errors='ignore')
            except:
                pass
        return res

def normalize(txt):
    txt = encode_txt(txt)
    lines = txt.split()
    txt = ''
    for l in lines:
        l = l.strip().lower()
        if l.startswith(';;'):
            l = l[2:].strip()
        if l.startswith(';'):
            l = l[1:].strip()
        if l.startswith('*'):
            l = l[1:].strip()
        if l.startswith('/*'):
            l = l[2:].strip()
        if l.startswith('#'):
            l = l[1:].strip()
        if l.startswith('//'):
            l = l[2:].strip()
        l = l.replace('*/', ' ').strip()
        l = l.replace('*', ' ').strip()
        l = l.replace('\t', ' ').strip()
        l = l.replace('.', ' ').strip()
        l = l.replace(',', ' ').strip()
        l = l.replace(';', ' ').strip()
        l = l.replace(':', ' ').strip()
        l = l.replace('/', ' ').strip()
        l = l.replace('(', ' ').strip()
        l = l.replace(')', ' ').strip()
        l = l.replace('-', ' ').strip()
        l = l.replace('"', ' ').strip()
        l = l.replace('|', ' ').strip()
        l = l.replace('\'', ' ').strip()
        l = l.replace('`', ' ').strip()
        while l.find('  ') >= 0:
            l = l.replace('  ', ' ')
        l = l.strip()
        if len(l) == 0:
            continue
        txt = txt + ' ' + l.strip()
    txt = txt.strip()
    return txt

class fileinfo(object):
    def __init__(self, fpath, orig_fpath=None, sha=None, ts=-1, author=None,
                 info=None):
        self.fpath = fpath
        self.orig_fpath = orig_fpath
        self.sha = sha
        self.ts = ts
        self.author = author
        self.info = info
        self.subjects = {}
        self.prefix = fpath

class shainfo(object):
    def __init__(self, sha, tag, date, author):
        self.sha = sha
        self.tag = tag
        self.date = int(date)
        self.author = author

class license(object):
    def __init__(self, fpath, lic, args):
        try:
            self.score = int(lic['score'])
        except:
            print(fpath)
        self.match = encode_txt(lic['matched_text'])
        self.pattern = normalize(self.match)
        if self.match.strip().startswith('MODULE_LICENSE("GPL")') or \
           self.match.strip().startswith('MODULE_LICENSE ("GPL")') or \
           self.match.strip().startswith('MODULE_LICENSE("GPL v2")'):
            self.spdx = 'GPL-2.0-only'
        else:
            self.spdx = lic['spdx_license_key']

        # Fixup references to the COPYING file
        if args.fixupcopying and self.spdx == 'GPL-1.0-or-later':
            if self.match.find('COPYING') > 0:
                self.spdx = 'GPL-2.0-only'

        self.shortname = lic['short_name']
        if len(self.spdx) == 0:
            self.spdx = self.shortname
        self.start_line = lic['start_line']
        self.end_line = lic['end_line']
        self.is_ambiguous = lic.get('is_ambiguous', False)
        self.is_spdx = False
        self.is_text = False
        self.is_notice = False
        self.is_ref = False
        self.is_tag = False
        self.rule = None
        if not lic.has_key('matched_rule'):
            return
        mr = lic['matched_rule']
        self.is_notice = mr['is_license_notice']
        self.is_tag = mr['is_license_tag']
        self.is_text = mr['is_license_text']
        self.is_ref = mr['is_license_reference']
        self.is_spdx = self.is_tag and mr['matcher'] == '4-spdx-id'
        # FIXES
        idf = mr['identifier']
        self.rule = idf

class author(object):
    def __init__(self, data):
        self.txt = data['value']
        self.start_line = data['start_line']
        self.end_line = data['end_line']

class copyright(object):
    def __init__(self, data):
        self.txt = data['value']
        self.start_line = data['start_line']
        self.end_line = data['end_line']
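
# Illustrative example (added for clarity, not part of the original script):
# normalize() reduces a matched license text to a canonical lower case word
# stream so that differently formatted boilerplate compares equal, e.g.
#
#   normalize(' * This program is free software;\n * you can redistribute it\n')
#
# is expected to yield 'this program is free software you can redistribute it'.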

class scanentry(object):
    crap_matches = [
        re.compile('GPL\(.*driver'),
    ]

    def __init__(self, info, args):
        self.path = info['path'].split('/', 1)[1]
        self.entries = []
        self.licenses = []
        self.copyrights = []
        self.authors = []
        self.holders = []
        self.is_resolved = None
        self.scan_licenses(info, args)
        self.scan_authors(info)
        self.scan_holders(info)
        self.scan_copyrights(info)
        self.start_line = 1
        self.end_line = 1
        self.warned = 1
        if args.minlines:
            # FIXME. Scan this once. OTOH, it's fast on a fast machine :)
            fp = os.path.join(args.source, self.path)
            self.end_line = len(open(fp).readlines())

    def scan_holders(self, info):
        for h in info.get('holders', []):
            self.holders.append(author(h))

    def scan_authors(self, info):
        for h in info.get('authors', []):
            self.authors.append(author(h))

    def scan_copyrights(self, info):
        for h in info.get('copyrights', []):
            c = copyright(h)
            #if c.start_line != c.end_line:
            #    print('%s: %d - %d: %s' %(self.path, c.start_line, c.end_line, c.txt))
            self.copyrights.append(c)

    def is_export(self, l):
        if l.match.strip().startswith('EXPORT_'):
            return True
        if l.match.strip().startswith('SYMBOL_GPL'):
            return True
        return False

    def drop_crap(self, l, args):
        for c in self.crap_matches:
            if c.search(l.match):
                return True
        if args.dropmodule:
            if l.match.strip().startswith('MODULE_LICENSE'):
                return True
        if args.dropexport:
            if self.is_export(l):
                return True
        return False

    def scan_licenses(self, info, args):
        for lic in info['licenses']:
            l = license(self.path, lic, args)
            if l.score >= args.dropscore and not self.drop_crap(l, args):
                self.licenses.append(l)

    def has_license(self):
        return len(self.licenses) > 0

    def has_matching_license(self, spdx):
        for l in self.licenses:
            if l.spdx == spdx:
                return True
        return False

    def has_matching_rule(self, rule):
        for l in self.licenses:
            if l.rule == rule:
                return True
        return False

    def has_score(self, args):
        for l in self.licenses:
            if l.score < args.minscore or l.score > args.maxscore:
                return False
        return True

    def has_ambiguous(self):
        for l in self.licenses:
            if l.is_ambiguous:
                return True
        return False

    def has_spdx(self):
        for l in self.licenses:
            if l.is_spdx:
                return True
        return False

    def has_text(self):
        for l in self.licenses:
            if l.is_text:
                return True
        return False

    def has_tag(self):
        for l in self.licenses:
            if l.is_tag:
                return True
        return False

    def has_notice(self):
        for l in self.licenses:
            if l.is_notice:
                return True
        return False

    def has_reference(self):
        for l in self.licenses:
            if l.is_ref:
                return True
        return False

    def get_rules(self):
        res = []
        for l in self.licenses:
            if l.rule:
                res.append(l.rule)
        return res

    def has_gpl_conflicts(self, args):
        spdx = None
        for l in self.licenses:
            if not l.spdx.startswith('GPL'):
                continue
            if l.match.startswith('EXPORT_SYMBOL_GPL'):
                continue
            if l.match.startswith('EXPORT_[TRACEPOINT]_SYMBOL_GPL'):
                continue
            if l.match.startswith('MODULE_LICENSE'):
                if args.nomodule:
                    continue
            if not spdx:
                spdx = l.spdx
            elif spdx != l.spdx:
                return True
        return False

    def export_only(self):
        if len(self.licenses) == 0:
            return False
        for l in self.licenses:
            if not self.is_export(l):
                return False
        return True

    def module_only(self):
        if len(self.licenses) == 0:
            return False
        for l in self.licenses:
            if not l.match.startswith('MODULE_LICENSE'):
                return False
        return True

    def has_module(self):
        for l in self.licenses:
            if l.match.startswith('MODULE_LICENSE'):
                return True
        return False

    def has_conflict(self, args):
        # GPL conflicts only for now
        return self.has_gpl_conflicts(args)
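
    # Illustrative note (added for clarity, not scanner output): a file whose
    # matches map to the SPDX keys ['GPL-2.0-or-later', 'GPL-2.0-only'] makes
    # unique_license() below return False, while get_spdx_tags() returns the
    # sorted, comma separated string 'GPL-2.0-only,GPL-2.0-or-later'.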

    def unique_license(self):
        ml = len(self.licenses)
        if ml == 0:
            return False
        spdx = self.licenses[0].spdx
        i = 1
        while i < ml:
            if self.licenses[i].spdx != spdx:
                return False
            i += 1
        return True

    def get_spdx_tags(self, split_tags=False):
        if len(self.licenses) == 0:
            return 'NOLICENSE'
        spdx = []
        realspdx = []
        for lic in self.licenses:
            tag = lic.spdx
            if split_tags and lic.is_spdx:
                if tag not in realspdx:
                    realspdx.append(tag)
            else:
                if tag not in spdx:
                    spdx.append(tag)
        resreal = ''
        if len(realspdx) > 0:
            spds = sorted(realspdx)
            resreal += spds.pop(0)
            for s in spds:
                resreal += ',%s' %s
        res = ''
        if len(spdx) > 0:
            spds = sorted(spdx)
            res += spds.pop(0)
            for s in spds:
                res += ',%s' %s
        if split_tags:
            return resreal, res
        else:
            return res

    def spdx_pure(self):
        r, l = self.get_spdx_tags(split_tags=True)
        return len(r) > 0 and len(l) == 0

    def get_history(self):
        res = '\n'
        fi = self.fileinfo
        while fi != None:
            if fi.info:
                res += ' File: %s\n' %fi.fpath
                if fi.info.sha and len(fi.info.sha) > 0:
                    res += ' Commit: %s\n' %fi.info.sha
                if fi.info.tag and len(fi.info.tag) > 0:
                    res += ' Tag: %s\n' %fi.info.tag
                if fi.info.author and len(fi.info.author) > 0:
                    res += ' Author: %s\n' %encode_txt(fi.info.author)
                if fi.info.date >= 0:
                    res += ' Date: %s\n' %time.asctime(time.gmtime(fi.info.date))
            if fi.orig_fpath:
                res += '\n Renamed from:\n'
            fi = fi.orig_fpath
        return res

    def get_extra_info(self):
        res = ''
        if len(self.copyrights):
            res += ' Copyrights:\n'
            for c in self.copyrights:
                res += ' %s\n' %encode_txt(c.txt)
            res += '\n'
        if len(self.authors):
            res += ' Authors:\n'
            for a in self.authors:
                res += ' %s\n' %encode_txt(a.txt)
            res += '\n'
        res += ' Further information (might be inaccurate):\n'
        res += self.get_history()
        return res

    def get_match_info(self):
        res = ' Scanner info:\n'
        for l in self.licenses:
            res += ' Rule: %s\n' %l.rule
            res += ' Score: %3d\n' %l.score
            res += ' SPDX: %s\n' %l.spdx
            res += '\n'
        return res

    def print_all_matches(self):
        # Non SPDX deduced license information
        for l in self.licenses:
            print(' Scanned: %s' %l.spdx)
            print(' Score: %d' %l.score)
            print(' Name: %s' %l.shortname)
            print(' Rule: %s' %l.rule)
            print(' Line: %d - %d' %(l.start_line, l.end_line))
            b = bytearray('    ')
            if l.is_text:
                b[0] = 'X'
            if l.is_ref:
                b[1] = 'R'
            if l.is_tag:
                b[2] = 'T'
            if l.is_notice:
                b[3] = 'N'
            print(' What: %s' %b)
            print(' Link: %s/%s#n%d' %(git_source_url, self.path, l.start_line))
            print(' Match:')
            for line in l.match.split('\n'):
                print(' %s' %line)
            print('')

    def print_info(self, args):
        if args.format == 'stats':
            return
        elif args.format == 'fname':
            print(self.path)
        elif args.format == 'csv':
            print('%s,%s' %(self.path, self.get_spdx_tags()))
        elif args.format == 'full':
            print('%s\n' %self.path)
            # No license
            if not self.has_license():
                print(' No license information found')
            # SPDX tag(s) found
            elif self.has_spdx():
                if not self.has_conflict(args):
                    print(' SPDX: %s' %self.licenses[0].match)
                    if args.morethanone:
                        self.print_all_matches()
                elif self.is_resolved:
                    spdx, txt = self.is_resolved.split(',', 1)
                    print(' SPDX: %s' %spdx)
                    print(' Conflict resolved: %s' %txt)
                else:
                    print(' Conflicts detected')
                    self.print_all_matches()
            else:
                if self.has_conflict(args):
                    print(' Conflicts detected')
                self.print_all_matches()
                try:
                    print(self.get_extra_info())
                except:
                    for l in self.get_extra_info().split('\n'):
                        try:
                            print(l)
                        except:
                            pass

    def do_get_prefix(self):
        if self.fileinfo.prefix:
            return self.fileinfo.prefix
        maxcnt = 0
        match = None
        for s in self.fileinfo.subjects:
            cnt = self.fileinfo.subjects[s]
            if cnt > maxcnt:
                maxcnt = cnt
                match = s
        if match:
            return match
        return self.path

    def get_prefix(self):
        prefix = self.do_get_prefix()
        if self.path == prefix:
            return prefix
        if len(prefix) < 40:
            return prefix
        return self.path

    def exclude_file(self):
        return self.path in [
            'drivers/dma/qcom/hidma.c',
        ]

    def stop_boilerplate(self, l, i, args):
        st = [
            'Note: the choice of the license',
            'The code is based on publicly available information:',
            'This file is part of Donald Becker\'s 8390 drivers',
            'For further information regarding this notice, see:',
            'For the record: _GPL here is only because somebody decided to slap it',
            'Note: This code is heavily based on the GNU MP Library.',
        ]
        for s in st:
            if l.find(s) >= 0:
                if not self.line_is_boilerplate(i, args):
                    print('%s: REMOVE ?: %s' %(self.path, s))
                if s != '*/':
                    print('%s: STOPBP: %s' %(self.path, l.strip()))
                return True
        return False

    def exclude_from_boilerplate(self, l, n):
        st = [
            'Based from clk-highbank.c',
            'Based on twl6030_usb.c',
            'Derived from GPLv2+ licensed source:',
            'based on GPL\'ed 2.6 kernel sources',
            'See ip_conntrack_helper_h323_asn1.h for details.',
            'For a historical changelog see',
            'See LICENSE.ql',
            'See linux/lib/crc32.c for license and changes',
            'Derived from code originally in linux/arch/arm/kernel/fiq.c',
            'Adapted from OProfile GPLv2 support jidump.h:',
            'crc32hash.c - derived from linux/lib/crc32.c, GNU GPL v2',
            'licensing of what follows is governed by reiserfs/README',
        ]
        for s in st:
            if l.lower().find(s.lower()) >= 0:
                print('%s: EXCSBP: %s' %(self.path, l.strip()))
                return True

        fl = {
            # Note: range(0,2) produces [ 0, 1 ] - Oh well I always
            # trip over that
            'drivers/media/dvb-frontends/dib3000.h' : range( 6, 11),
            'drivers/media/dvb-frontends/dib3000mb.c' : range( 6, 11),
            'drivers/media/usb/dvb-usb/dibusb-mb.c' : range( 6, 8),
            'drivers/media/usb/dvb-usb/dibusb-mc.c' : range( 6, 8),
            'drivers/media/v4l2-core/v4l2-common.c' : range( 18, 26),
            'drivers/media/usb/dvb-usb-v2/az6007.c' : range( 5, 13),
            'arch/arm/mach-ixp4xx/ixp4xx_npe.c' : range(10, 13),
        }
        ls = fl.get(self.path, [])
        if n in ls:
            print('%s: EXCLBP: %s' %(self.path, l.strip()))
        return n in ls

    def line_is_missed_boilerplate(self, l, n):
        if l.strip().startswith('* to the Free Software Foundation'):
            print('%s: MISSBP: %s' %(self.path, l.strip()))
            return True
        ml = {
            'drivers/scsi/smartpqi/Kconfig' : range(10, 37),
        }
        ls = ml.get(self.path, [])
        if n in ls:
            print('%s: MISSBP: %s' %(self.path, l.strip()))
        return n in ls

    def line_is_boilerplate(self, n, args):
        for l in self.licenses:
            if l.is_spdx and args.no_spdx:
                continue
            if l.is_ref and args.no_reference:
                continue
            if l.is_tag and args.no_tag:
                continue
            if l.is_text and args.no_text:
                continue
            if l.is_notice and args.no_notice:
                continue
            if l.match.find('DO NOT ALTER') >= 0:
                if not self.warned:
                    self.warned = 1
                    print('%s: DO NOT ALTER' %self.path)
                return False
            if n >= l.start_line and n <= l.end_line:
                return True
        return False

    def line_has_copyright(self, n, l):
        for c in self.copyrights:
            if n >= c.start_line and n < c.end_line:
                if l.find(c.txt) >= 0:
                    return True
                for m in [ 'Copyright', '(C)', '(c)' ]:
                    if l.find(m) >= 0:
                        return True
        return False

    def line_has_author(self, n, l):
        for a in self.authors:
            if n >= a.start_line and n <= a.end_line:
                if l.find(a.txt) >= 0:
                    return True
        return False

    def sanitize_copyright(self, n, txt):
        if txt.find('OProfile') >= 0:
            return txt
        if txt.endswith('>, distribute under GPLv2\n'):
            return txt.replace('>, distribute under GPLv2', '')
        if txt.endswith('. Subject to GPLv2.\n'):
            return txt.replace('Subject to GPLv2.', '')
        if txt.find('. This file is licensed') >= 0:
            return txt.split(' This file is licensed')[0] + '\n'
        if self.path.endswith('.cocci'):
            if txt.find(' GPLv2.') > 0:
                txt = txt.replace('GPLv2.', '')
            if txt.find(' GPLv2') > 0:
                txt = txt.replace('GPLv2', '')
            if txt.find(' GPL v2.') > 0:
                txt = txt.replace('GPL v2.', '')
            if txt.find(' GPL v2') > 0:
                txt = txt.replace('GPL v2', '')
            return txt.strip() + '\n'
        if txt.find('/* GPLv2 C') == 0:
            return txt.replace('GPLv2 ', '')
        if txt.find('/* GPLv2, C') == 0:
            return txt.replace('GPLv2, ', '')
        return txt

    def sanitize_comment(self, end):
        i = 0
        empty = []
        last = 0
        cnt = 0
        if len(self.patch) > end + 1:
            end += 1
        i = 0
        cs = 0
        ce = 0
        for l in self.patch:
            i += 1
            if l.find('/*') >= 0:
                cs = i
            elif l.find('*/') >= 0:
                ce = i
                break
        if cs and cs < end and end < ce:
            end = ce
        i = 0
        while i < end:
            l = self.patch[i]
            i += 1
            t = l.strip()
            if t in self.patch_comments:
                if not last:
                    last = i - 1
                    cnt = 1
                else:
                    cnt += 1
                continue
            cs = t.split(' ', 1)[0].strip()
            if cs == '*/':
                cnt += 1
            if last and cnt > 1:
                empty.append((last, cnt - 1))
            last = 0
            cnt = 0
        if last and cnt > 1:
            empty.append((last, cnt - 1))
        dropped = 0
        for (l, c) in empty:
            l -= dropped
            dropped += c
            while c > 0:
                c -= 1
                self.patch.pop(l)

    def add_spdx_id(self, comment, lic):
        self.lic_comment = comment
        if comment != '//' and comment != '/*':
            self.patch_comments = [ comment ]
        else:
            self.patch_comments = [ '//', '*' ]
        txt = '%s SPDX-License-Identifier: %s' %(comment, lic)
        if comment == '/*':
            txt += ' */'
        txt += '\n'
        self.patch.append(txt)

    def is_endof_comment(self, l):
        if l.find('*/') < 0:
            return None
        if l.find('/*') >= 0:
            return None
        return ' */\n'

    def can_drop(self, l, i):
        if not self.exclude_from_boilerplate(l, i):
            # Protect copyright notices
            if not self.line_has_copyright(i, l):
                if not self.line_has_author(i, l):
                    return True
        return False
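
    # Illustrative sketch of the patch flow (assumption, not derived from a real
    # run): for a .c file with a unique GPL-2.0-only match, make_patch() below
    # prepends
    #
    #   // SPDX-License-Identifier: GPL-2.0-only
    #
    # via add_spdx_id(), strips the matched boilerplate comment when invoked
    # with replace=True, and writes the result as a unified diff into the
    # patch directory.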

    def make_patch(self, args, replace, striponly):
        if self.exclude_file():
            return 0
        if self.has_ambiguous():
            print('Not patching %s: ambiguous' %self.path)
            return 0

        if not args.dual_license:
            try:
                lic = self.licenses[0].spdx
            except:
                lic = 'GPL-2.0-only'
        else:
            lic = self.get_spdx_tags().replace(',', ' or ')

        fp = os.path.join(args.source, self.path)
        orig = codecs.open(fp, encoding='utf-8').readlines()
        if len(orig) == 0:
            print('Not patching %s: empty' %self.path)
            return 0

        self.patch = []
        i = 0
        j = 0
        strip = replace or striponly
        if not striponly:
            j = 1
            if self.path.endswith('.c'):
                self.add_spdx_id('//', lic)
            elif self.path.endswith('.dts'):
                self.add_spdx_id('//', lic)
            elif self.path.endswith('.dtsi'):
                self.add_spdx_id('//', lic)
            elif self.path.endswith('.cocci'):
                self.add_spdx_id('//', lic)
            elif self.path.endswith('.h'):
                self.add_spdx_id('/*', lic)
            elif self.path.endswith('.S'):
                if orig[0].startswith(';;'):
                    self.add_spdx_id(';;', lic)
                elif orig[0].startswith(';'):
                    self.add_spdx_id(';', lic)
                else:
                    self.add_spdx_id('/*', lic)
            elif self.path.find('Makefile') >= 0:
                self.add_spdx_id('#', lic)
            elif self.path.find('Kconfig') >= 0:
                self.add_spdx_id('#', lic)
            elif orig[0].startswith('#!'):
                self.patch.append(orig[0])
                self.add_spdx_id('#', lic)
                i = 1
                j = 2
            else:
                print('Not patching %s: not supported' %self.path)
                return 0

        stop_bp = False
        start_bp = False
        start_bp_comment = -1
        endbp = 0
        while i < len(orig):
            l = orig[i]
            i += 1
            if strip:
                # HACK
                if self.stop_boilerplate(l, i, args):
                    stop_bp = True
                if not stop_bp and self.line_is_boilerplate(i, args):
                    if not start_bp:
                        start_bp = True
                        ls = l.strip()
                        if ls.startswith('/*') and self.can_drop(l, i):
                            if ls.endswith('*/'):
                                continue
                            j += 1
                            self.patch.append('/*\n')
                            start_bp_comment = j
                            continue
                    # HACK
                    if not self.exclude_from_boilerplate(l, i):
                        # Protect copyright notices
                        if not self.line_has_copyright(i, l):
                            if not self.line_has_author(i, l):
                                endbp = j + 1
                                l = self.is_endof_comment(l)
                                if not l:
                                    continue
                        else:
                            l = self.sanitize_copyright(i, l)
                    else:
                        pass
                # HACK
                if l and self.line_is_missed_boilerplate(l, i):
                    endbp = j + 1
                    l = self.is_endof_comment(l)
                    if not l:
                        continue
            if not l:
                print('OOPS: %s: %d %d' %(self.path, i, j))
                continue
            if start_bp_comment >= 0 and start_bp_comment == j:
                if l.strip() == '*/':
                    self.patch.pop(-1)
                    j -= 1
                    continue
            j += 1
            self.patch.append(l)

        if strip:
            self.sanitize_comment(endbp)

        afile = os.path.join('a', self.path)
        bfile = os.path.join('b', self.path)
        diff = difflib.unified_diff(orig, self.patch, afile, bfile)

        pd = args.patchdir
        if not args.flat:
            parts = self.path.split('/')
            pn = parts.pop(-1)
            pn += '.patch'
            for p in parts:
                pd = os.path.join(pd, p)
        elif not args.patchname:
            pn = self.path.replace('/','-')
            pn += '.patch'
            i = 1
            while os.path.isfile(os.path.join(pd, pn)):
                pn = self.path.replace('/','-')
                pn += '-%d.patch' %i
                i += 1
        else:
            pn = args.patchname

        if not os.path.isdir(pd):
            os.makedirs(pd)
        pp = os.path.join(pd, pn)
        if not os.path.isfile(pp):
            pf = codecs.open(pp, encoding='utf-8', mode='w')
            try:
                for l in args.template:
                    txt = l
                    if txt.startswith('Subject:'):
                        if not args.patchname:
                            txt = 'Subject: %s: %s\n' %(self.get_prefix(),
                                                        l.split('Subject:')[1].strip())
                    elif txt.startswith('From:'):
                        txt = 'From: %s\n' %args.author
                    elif txt.startswith('Date:'):
                        dt = datetime.datetime.now()
                        toff = time.timezone
                        th = -toff / 3600
                        tm = (abs(toff) % 3600) / 60
                        tz = '%+03d%02d' %(th, tm)
                        txt = 'Date: %s %s\n' %(dt.strftime('%a, %d %b %Y %T'), tz)
                    elif txt.startswith('$SCANMATCH'):
                        lm = self.licenses[0].match.split('\n')
                        txt = ' %s %s\n' %(self.lic_comment, lm[0])
                        for m in lm[1:]:
                            txt += ' %s\n' %m
                    elif txt.startswith('$SPDXID'):
                        txt = ' %s\n' %lic
                    elif txt.startswith('Signed-off-by:'):
                        txt = 'Signed-off-by: %s\n' %args.author
                    pf.write(txt)
                pf.write('\n')
                if not args.patchname:
                    pf.write(self.get_extra_info())
                    pf.write('\n\n')
                    pf.write(self.get_match_info())
                pf.write('\n---\n\n')
                pf.writelines(diff)
                pf.write('\n\n')
                pf.close()
                fd = open(os.path.join(pd, 'series'), 'a')
                fd.write('%s\n' %pn)
                fd.close()
                return 1
            except Exception, ex:
                print('Failed to write diff for %s' %pn)
                print(ex)
        else:
            pf = codecs.open(pp, encoding='utf-8', mode='a')
            try:
                pf.writelines(diff)
                pf.write('\n\n')
                pf.close()
                return 1
            except Exception, ex:
                print('Failed to write diff for %s' %pn)
                print(ex)
        return 0

class matchrule(object):
    def __init__(self, txt):
        self.rule = txt
        self.files = []
        self.matches = []
        self.spdx = []

class scaninfo(object):
    def __init__(self, args):
        self.entries = []
        self.resolved = {}
        self.excludes = []
        self.licenses = []
        self.directories = []
        self.fileinfos = {}
        self.patchrules = {}
        self.numpatches = 0
        self.matches = []
        self.spdx = []
        self.rules = {}
        self.rulefilters = []
        self.patched = []
        # Scan resolved conflicts if available
        if args.resolved:
            lines = open(args.resolved).readlines()
            for l in lines:
                self.add_resolved(l.strip())
        if args.excludes:
            self.excludes = args.excludes.split(',')
        if args.license_filter:
            self.licenses = args.license_filter.split(',')
        if args.rules_filter:
            self.rulefilters = args.rules_filter.split(',')

    def is_excluded(self, entry, args):
        # Filter based on path
        for e in self.excludes:
            if entry.path.startswith(e):
                return True
        if args.paths and len(args.paths):
            res = False
            for p in args.paths:
                if entry.path.startswith(p):
                    res = True
                    break
            if not res:
                return True

        if args.filters:
            drop = True
            for f in args.filters:
                if entry.path.find(f) >= 0:
                    drop = False
                    break
            if args.negate_filters:
                drop = not drop
            if drop:
                return True

        # Filter based on licenses
        if len(self.licenses):
            res = False
            for l in self.licenses:
                if entry.has_matching_license(l):
                    res = True
            if not res:
                return True
            # Filter dual license matches
            if args.dual_license:
                lics = entry.get_spdx_tags().split(',')
                if len(self.licenses) != len(lics):
                    return True
                for l in self.licenses:
                    if not l in lics:
                        return True
                    lics.remove(l)

        if len(self.rulefilters):
            res = False
            for r in self.rulefilters:
                if entry.has_matching_rule(r):
                    res = True
            if not res:
                return True

        # Filter based on score
        if not entry.has_score(args):
            return True

        # Filter SPDX
        if args.has_spdx:
            if not entry.has_spdx():
                return True
        # Filter SPDX + other
        if args.spdx_plus and entry.spdx_pure():
            return True
        # Filter SPDX pure
        if args.spdx_pure and not entry.spdx_pure():
            return True
        if args.no_spdx and entry.has_spdx():
            return True

        # Filter text
        if args.has_text and not entry.has_text():
            return True
        if args.no_text and entry.has_text():
            return True

        # Filter tag
        if args.has_tag and not entry.has_tag():
            return True
        if args.no_tag and entry.has_tag():
            return True

        # Filter notice
        if args.has_notice and not entry.has_notice():
            return True
        if args.no_notice and entry.has_notice():
            return True

        # Filter reference
        if args.has_reference and not entry.has_reference():
            return True
        if args.no_reference and entry.has_reference():
            return True

        if args.has_module and not entry.has_module():
            return True

        # Filter ambiguous
        if args.has_ambiguous and not entry.has_ambiguous():
            return True
        if args.no_ambiguous and entry.has_ambiguous():
            return True

        # Filter on conflicts
        if args.conflicts and not entry.has_conflict(args):
            return True
        if args.noconflicts and entry.has_conflict(args):
            return True

        # Filter on multiple/unique licenses
        if args.multiple and entry.unique_license():
            return True
        if args.unique and not entry.unique_license():
            return True
        if args.morethanone and len(entry.licenses) < 2:
            return True

        # Export only
        if args.export_only and not entry.export_only():
            return True
        # Module only
        if args.module_only and not entry.module_only():
            return True

        # Line count
        if args.minlines > entry.end_line:
            return True

        return False

    def add_entry(self, entry, args):
        entry.is_resolved = self.resolved.get(entry.path, None)
        excl = self.is_excluded(entry, args)
        if excl:
            #print('Exclude: %s' %entry.path)
            return
        #print('Include: %s' %entry.path)
        self.entries.append(entry)
        for l in entry.licenses:
            if l.rule == '':
                continue
            rule = self.rules.get(l.rule, matchrule(l.rule))
            if not entry.path in rule.files:
                rule.files.append(entry.path)
            addmatch = True
            for mt in rule.matches:
                if mt == l.pattern:
                    addmatch = False
                    break
            if addmatch:
                rule.matches.append(l.pattern)
            if not l.spdx in rule.spdx:
                rule.spdx.append(l.spdx)
            self.rules[l.rule] = rule

    def stats(self, args):
        tot_files = 0
        has_spdx = 0
        has_license = 0
        unique = 0
        conflicts = 0
        resolved = 0
        spdx_unique = 0
        spdx_conflicts = 0
        spdx_resolved = 0
        spdx_plus_text = 0
        spdx_plus_ref = 0
        spdx_plus_notice = 0
        spdx_pure = 0
        ambiguous = 0
        matches = {}
        raw_matches = {}
        licenses_sp = {}
        licenses_st = {}
        licenses_u = {}
        licenses_m = {}
        variants = []

        for entry in self.entries:
            tot_files += 1
            if entry.has_spdx():
                has_spdx += 1
                if entry.unique_license():
                    spdx_unique += 1
                if entry.has_conflict(args):
                    spdx_conflicts += 1
                    if entry.is_resolved:
                        spdx_resolved += 1
                if entry.has_text():
                    spdx_plus_text += 1
                if entry.has_reference():
                    spdx_plus_ref += 1
                if entry.has_notice():
                    spdx_plus_notice += 1
                r, l = entry.get_spdx_tags(split_tags=True)
                if len(l) == 0:
                    cnt = licenses_sp.get(r, 0)
                    licenses_sp[r] = cnt + 1
                    spdx_pure += 1
                else:
                    lt = '%s + %s' %(r, l)
                    cnt = licenses_st.get(lt, 0)
                    licenses_st[lt] = cnt + 1
            elif entry.has_license():
                has_license += 1
                if entry.unique_license():
                    unique += 1
                    l = entry.get_spdx_tags()
                    cnt = licenses_u.get(l, 0)
                    licenses_u[l] = cnt + 1
                else:
                    if entry.has_conflict(args):
                        conflicts += 1
                        if entry.is_resolved:
                            resolved += 1
                    l = entry.get_spdx_tags()
                    cnt = licenses_m.get(l, 0)
                    licenses_m[l] = cnt + 1

            if entry.has_ambiguous():
                ambiguous += 1

            for l in entry.licenses:
                if l.spdx not in variants:
                    variants.append(l.spdx)
                if l.is_spdx:
                    continue
                cnt = raw_matches.get(l.match, 0)
                cnt += 1
                raw_matches[l.match] = cnt
                m = l.pattern
                cnt, om = matches.get(m, (0, l.match))
                cnt += 1
                matches[m] = (cnt, om)

        print('Files: %8d' %tot_files)
        print(' no License: %8d' %(tot_files - (has_spdx + has_license)))
        print(' ambiguous: %8d' %(ambiguous))
        print(' with SPDX: %8d' %has_spdx)
        print(' unique: %8d' %spdx_unique)
        print(' GPL conflicts: %8d' %spdx_conflicts)
        print(' resolved conflicts: %8d' %spdx_resolved)
        print(' With text: %8d' %spdx_plus_text)
        print(' With reference: %8d' %spdx_plus_ref)
        print(' With notice: %8d' %spdx_plus_notice)
        if args.verbose:
            print(' Pure SPDX: %8d' %spdx_pure)
            for l in sorted(licenses_sp.keys()):
                print(' %-85s: %8d' %(l, licenses_sp[l]))
            print(' SPDX + text: %8d' %(has_spdx - spdx_pure))
            for l in sorted(licenses_st.keys()):
                print(' %-85s: %8d' %(l, licenses_st[l]))
        print(' with License: %8d' %has_license)
        print(' unique: %8d' %unique)
        for l, c in licenses_u.iteritems():
            print(' %-70s: %8d' %(l, c))
        print(' multiple: %8d' %(has_license - unique))
        for l, c in licenses_m.iteritems():
            print(' %-70s: %8d' %(l, c))
        print(' GPL conflicts: %8d' %conflicts)
        print(' resolved conflicts: %8d' %resolved)
        print('')
        print('Raw license expressions: %8d' %len(raw_matches.keys()))
        print('License expressions: %8d' %len(matches.keys()))
        totcnt = 0
        for m in matches:
            cnt, om = matches[m]
            totcnt += cnt
        print('Total expressions: %8d' %totcnt)
        if args.verbose:
            print('License variants: %8d' %len(variants))
            for l in sorted(variants):
                print(' %s' %l)

    def make_patch(self, e, args, replace, striponly):
        r = e.make_patch(args, replace, striponly)
        if not r:
            return
        #print(e.path)
        self.patched.append(e.path)
        self.numpatches += 1
        for rule in e.get_rules():
            cnt = self.patchrules.get(rule, 0) + 1
            self.patchrules[rule] = cnt

    def patch_boiler(self, args, rule=None):
        for e in self.entries:
            if e.has_spdx():
                continue
            if e.has_conflict(args):
                continue
            if args.unique and not e.unique_license():
                continue
            if rule and not e.has_matching_rule(rule):
                continue
            if e.path in self.patched:
                continue
            self.make_patch(e, args, replace=True, striponly=False)

    def patch_export(self, args):
        for e in self.entries:
            if not e.export_only():
                continue
            self.make_patch(e, args, replace=False, striponly=False)

    def patch_module(self, args):
        for e in self.entries:
            if not e.module_only():
                continue
            self.make_patch(e, args, replace=False, striponly=False)

    def patch_make(self, args):
        for e in self.entries:
            self.make_patch(e, args, replace=True, striponly=False)

    def patch_none(self, args):
        for e in self.entries:
            self.make_patch(e, args, replace=False, striponly=False)

    def patch_strip(self, args, rule=None):
        for e in self.entries:
            if not e.has_spdx():
                continue
            if rule and not e.has_matching_rule(rule):
                continue
            self.make_patch(e, args, replace=False, striponly=True)
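
    # Note (added for clarity): the patch_* drivers above are selected via
    # --format, e.g. patch_boiler converts files without an SPDX id by
    # replacing the matched boilerplate, while patch_strip only removes
    # boilerplate from files which already carry an SPDX id.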

    def print_rules(self, args):
        nrules = 0
        nfiles = 0
        for r in sorted(self.rules):
            nrules += 1
            rule = self.rules[r]
            print('Rule: %s' %rule.rule)
            print('SPDX:')
            for s in rule.spdx:
                print(' %s' %s)
            print('Files: %d' %len(rule.files))
            print('Patterns: %d' %len(rule.matches))
            for mt in rule.matches:
                wrapper = TextWrapper(initial_indent=" ", subsequent_indent=' ')
                for l in wrapper.wrap(mt):
                    print(l)
            print('\n')
            print('Filenames:')
            for f in rule.files:
                print(' %s' %f)
                nfiles += 1
            print('\n')
        print('Total Rules: %d' %nrules)
        print('Total Files: %d' %nfiles)

    def parse(self, args, rule=None):
        if args.format == 'stats':
            self.stats(args)
        elif args.format == 'rules':
            self.print_rules(args)
        elif args.format == 'patch_boiler':
            self.patch_boiler(args, rule)
        elif args.format == 'patch_export':
            self.patch_export(args)
        elif args.format == 'patch_module':
            self.patch_module(args)
        elif args.format == 'patch_make':
            self.patch_make(args)
        elif args.format == 'patch_none':
            self.patch_none(args)
        elif args.format == 'patch_strip':
            self.patch_strip(args, rule)
        elif args.format in ['csv', 'full', 'fname']:
            for e in self.entries:
                e.print_info(args)
        else:
            pass
        if args.format.startswith('patch_'):
            print('%d patches generated' %self.numpatches)
            for rule in self.patchrules:
                print('%-40s: %8d matches' %(rule, self.patchrules[rule]))

def scan_entries(info, data):
    # Scan all entries
    for item in data['files']:
        if item['type'] == 'directory':
            continue
        # File info
        entry = scanentry(item, args)
        entry.fileinfo = info.fileinfos.get(entry.path, fileinfo(entry.path))
        info.add_entry(entry, args)

def load_info(args):
    data = json.load(open(args.datafile))
    info = scaninfo(args)
    # Read the pickled data
    try:
        info.fileinfos = pickle.load(open(args.infodb))
    except:
        pass
    return info, data

if __name__ == '__main__':
    formats = [ 'none', 'stats', 'full', 'csv', 'fname', 'rules',
                'patch_boiler', 'patch_export', 'patch_module',
                'patch_make', 'patch_none', 'patch_strip', ]

    parser = ArgumentParser(description='License information')

    parser.add_argument('datafile', metavar='datafile',
                        help='JSON data file with scan information')
    parser.add_argument('paths', nargs=REMAINDER,
                        help='Optional file/directory paths')
    parser.add_argument('--infodb', dest='infodb',
                        help='Pickled file info db')
    # Output format
    parser.add_argument('--format', '-f', dest='format', default='stats',
                        choices=formats, help='Output format')
    parser.add_argument('--verbose', '-v', dest='verbose', default=False,
                        action='store_true', help='Verbose output')
    # Input filters
    parser.add_argument('--exclude', '-e', dest='excludes',
                        help='Exclude directories/files, separate with commas')
    # FIXME make that regex
    parser.add_argument('--filter', '-F', dest='filter',
                        help='Match parts of the path, separate with commas')
    parser.add_argument('--negate_filter', '-n', dest='negate_filters',
                        default=False, action='store_true',
                        help='Negate --filter')
    # License filters
    parser.add_argument('--license', '-l', dest='license_filter',
                        help='License filter, separate with commas')
    # Rules filter
    parser.add_argument('--rules', '-R', dest='rules_filter',
                        help='Rules filter, separate with commas')
    # Filter based on score
    parser.add_argument('--minscore', '-s', dest='minscore', type=int, default=0,
                        help='Minimal scan score (0-100)')
    parser.add_argument('--maxscore', '-S', dest='maxscore', type=int, default=100,
                        help='Maximal scan score (0-100)')
    # Filter based on lines in the file
    parser.add_argument('--minlines', dest='minlines', type=int, default=0,
                        help='Minimal line count in file (0-...)')
    # Filters based on scan results
    parser.add_argument('--has_spdx', dest='has_spdx', default=False,
                        action='store_true',
                        help='Files with SPDX identifier')
    parser.add_argument('--no_spdx', dest='no_spdx', default=False,
                        action='store_true',
                        help='Files without SPDX identifier')
    parser.add_argument('--spdx_plus', dest='spdx_plus', default=False,
                        action='store_true',
                        help='Files with SPDX identifier plus other text/ref/notice')
    parser.add_argument('--spdx_pure', dest='spdx_pure', default=False,
                        action='store_true',
                        help='Files with pure SPDX identifier')
    parser.add_argument('--has_text', dest='has_text', default=False,
                        action='store_true',
                        help='Files with license text')
    parser.add_argument('--no_text', dest='no_text', default=False,
                        action='store_true',
                        help='Files without license text')
    parser.add_argument('--has_notice', dest='has_notice', default=False,
                        action='store_true',
                        help='Files with license notice')
    parser.add_argument('--no_notice', dest='no_notice', default=False,
                        action='store_true',
                        help='Files without license notice')
    parser.add_argument('--has_reference', dest='has_reference', default=False,
                        action='store_true',
                        help='Files with license reference')
    parser.add_argument('--no_reference', dest='no_reference', default=False,
                        action='store_true',
                        help='Files without license reference')
    parser.add_argument('--has_tag', dest='has_tag', default=False,
                        action='store_true',
                        help='Files with license tag')
    parser.add_argument('--no_tag', dest='no_tag', default=False,
                        action='store_true',
                        help='Files without license tag')
    parser.add_argument('--has_module', dest='has_module', default=False,
                        action='store_true',
                        help='Files with MODULE_LICENSE')
    parser.add_argument('--has_ambiguous', dest='has_ambiguous', default=False,
                        action='store_true',
                        help='Files marked as ambiguous')
    parser.add_argument('--no_ambiguous', dest='no_ambiguous', default=False,
                        action='store_true',
                        help='Files not marked as ambiguous')
    parser.add_argument('--no_license', dest='no_license', default=False,
                        action='store_true',
                        help='Files with no license entry')
    parser.add_argument('--export_only', dest='export_only', default=False,
                        action='store_true',
                        help='Files with only EXPORT based license entries')
    parser.add_argument('--module_only', dest='module_only', default=False,
                        action='store_true',
                        help='Files with only MODULE_LICENSE based license entries')
    parser.add_argument('--dual_license', dest='dual_license', default=False,
                        action='store_true',
                        help='Files with dual or more licenses')
    # Magic for patch generation
    parser.add_argument('--for-each-filter', dest='eachfilter', default=False,
                        action='store_true',
                        help='Cycle through the text,notice,reference,tag filters')
    # Remove license scan entries based on score
    parser.add_argument('--dropscore', '-d', dest='dropscore', type=int, default=0,
                        help='Drop license scan entries below minimal score (0-100)')
    parser.add_argument('--dropmodule', dest='dropmodule', default=False,
                        action='store_true',
                        help='Drop module based scan entries')
    parser.add_argument('--dropexport', dest='dropexport', default=False,
                        action='store_true',
                        help='Drop export based scan entries')
    # Only show files with conflicts
    parser.add_argument('--conflicts', dest='conflicts', default=False,
                        action='store_true',
                        help='Files with conflicts')
    parser.add_argument('--noconflicts', dest='noconflicts', default=False,
                        action='store_true',
                        help='Files without conflicts')
    parser.add_argument('--module', dest='module', default=False,
                        action='store_true',
                        help='Only module conflicts')
    parser.add_argument('--nomodule', dest='nomodule', default=False,
                        action='store_true',
                        help='Ignore module conflicts')
    # Only show files with multiple licenses
    parser.add_argument('--multiple', '-m', dest='multiple', default=False,
                        action='store_true',
                        help='Files with multiple licenses')
    parser.add_argument('--morethanone', '-M', dest='morethanone', default=False,
                        action='store_true',
                        help='Files with more than one scan match')
    # Resolved conflicts (false positives)
    parser.add_argument('--resolved', '-r', dest='resolved',
                        help='File with list of filenames with resolved (false) conflicts')
    # Fixup GPL-1.0-or-later with COPYING reference
    parser.add_argument('--fixupcopying', dest='fixupcopying', default=False,
                        action='store_true',
                        help='Fixup GPL-1.0-or-later when a reference to COPYING is found')
    # Only files which have unique licenses
    parser.add_argument('--unique', '-u', dest='unique', default=False,
                        action='store_true',
                        help='Only files with unique license matches')
    # Source path for patch creation
    parser.add_argument('--source', dest='source', default='.',
                        help='Source directory for patch creation')
    parser.add_argument('--patchdir', dest='patchdir', default='patches',
                        help='Patches directory for patch creation')
    parser.add_argument('--patchname', dest='patchname', default=None,
                        help='Patchname for combo patches')
    parser.add_argument('--ruleslist', dest='ruleslist', default=None,
                        help='Ruleslist for patch series based on single rules')
    parser.add_argument('--author', dest='author',
                        help='Author information')
    parser.add_argument('--flat', dest='flat', default=False,
                        action='store_true',
                        help='Flat patch series')
    parser.add_argument('--template', '-t', dest='template', default='header.txt',
                        help='Template for patch header')

    args = parser.parse_args()

    if args.filter:
        args.filters = args.filter.split(',')
    else:
        args.filters = None

    # Get author information for patches
    if args.format.startswith('patch_'):
        if not args.author:
            an = os.environ.get('GIT_AUTHOR_NAME', None)
            am = os.environ.get('GIT_AUTHOR_EMAIL', None)
            if not am:
                print('No author information found\n')
                sys.exit(1)
            if an:
                args.author = '%s <%s>' %(an, am)
            else:
                args.author = am
        try:
            args.template = open(args.template).readlines()
        except:
            args.template = []

    info, data = load_info(args)

    if args.ruleslist:
        scan_entries(info, data)
        rules = open(args.ruleslist).readlines()
        for rulefile in rules:
            rulefile = rulefile.strip()
            args.template = open(rulefile).readlines()
            rule = rulefile.split('/')[1].split('-', 1)[1]
            args.patchname = '%s.patch' %rulefile.split('/')[1]
            print('Patching rule: %s' %rule)
            info.parse(args, rule)

    if not args.eachfilter:
        if args.no_license:
            args.no_text = True
            args.no_notice = True
            args.no_reference = True
            args.no_tag = True
            args.no_spdx = True
        scan_entries(info, data)
        info.parse(args)
    else:
        # Hack to spare reloading the data over and over
        print('text')
        args.no_text = False
        args.no_notice = True
        args.no_reference = True
        args.no_tag = True
        scan_entries(info, data)
        info.parse(args)

        print('notice')
        args.no_text = True
        args.no_notice = False
        args.no_reference = True
        args.no_tag = True
        info2 = scaninfo(args)
        info2.resolved = info.resolved
        info2.excludes = info.excludes
        info2.licenses = info.licenses
        info2.fileinfos = info.fileinfos
        scan_entries(info2, data)
        info2.parse(args)

        print('reference')
        args.no_text = True
        args.no_notice = True
        args.no_reference = False
        args.no_tag = True
        info3 = scaninfo(args)
        info3.resolved = info.resolved
        info3.excludes = info.excludes
        info3.licenses = info.licenses
        info3.fileinfos = info.fileinfos
        scan_entries(info3, data)
        info3.parse(args)

        print('tag')
        args.no_text = True
        args.no_notice = True
        args.no_reference = True
        args.no_tag = False
        info4 = scaninfo(args)
        info4.resolved = info.resolved
        info4.excludes = info.excludes
        info4.licenses = info.licenses
        info4.fileinfos = info.fileinfos
        scan_entries(info4, data)
        info4.parse(args)
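
# Example invocations (illustrative; the script and data file names below are
# assumptions, not part of this source):
#
#   ./analyze.py scan.json                            # default --format stats summary
#   ./analyze.py -f full -l GPL-2.0-only scan.json    # per file details, one license
#   ./analyze.py -f patch_boiler --source ~/linux --patchdir patches scan.json
#
# where scan.json is the JSON output of the license scanner (load_info() and
# scanentry() expect scancode style keys such as 'files', 'licenses' and
# 'matched_rule').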