#!/usr/bin/env python
# SPDX-License-Identifier: GPL-2.0-only
# Copyright Thomas Gleixner

from argparse import ArgumentParser, REMAINDER
from textwrap import TextWrapper
import unicodedata
import datetime
import difflib
import pickle
import locale
import codecs
import time
import json
import git
import sys
import os
import re

git_source_url = 'https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree'

def encode_txt(txt):
    for d in ['ascii', 'UTF-8', 'latin-1', 'iso-8859-1' ]:
        try:
            return txt.decode(d)
        except:
            pass
    try:
        return txt.decode('UTF-8', errors='ignore')
    except:
        res = ''
        for t in txt:
            try:
                res += str(t).decode('UTF-8', errors='ignore')
            except:
                pass
        return res

def normalize(txt):
    txt = encode_txt(txt)
    lines = txt.split()
    txt = ''
    for l in lines:
        l = l.strip().lower()
        if l.startswith(';;'):
            l = l[2:].strip()
        if l.startswith(';'):
            l = l[1:].strip()
        if l.startswith('*'):
            l = l[1:].strip()
        if l.startswith('/*'):
            l = l[2:].strip()
        if l.startswith('#'):
            l = l[1:].strip()
        if l.startswith('//'):
            l = l[2:].strip()
        l = l.replace('*/', ' ').strip()
        l = l.replace('*', ' ').strip()
        l = l.replace('\t', ' ').strip()
        l = l.replace('.', ' ').strip()
        l = l.replace(',', ' ').strip()
        l = l.replace(';', ' ').strip()
        l = l.replace(':', ' ').strip()
        l = l.replace('/', ' ').strip()
        l = l.replace('(', ' ').strip()
        l = l.replace(')', ' ').strip()
        l = l.replace('-', ' ').strip()
        l = l.replace('"', ' ').strip()
        l = l.replace('|', ' ').strip()
        l = l.replace('\'', ' ').strip()
        l = l.replace('`', ' ').strip()
        while l.find('  ') >= 0:
            l = l.replace('  ', ' ')
        l = l.strip()
        if len(l) == 0:
            continue
        txt = txt + ' ' + l.strip()
    txt = txt.strip()
    return txt

class fileinfo(object):
    def __init__(self, fpath, orig_fpath=None, sha=None, ts=-1, author=None,
                 info=None):
        self.fpath = fpath
        self.orig_fpath = orig_fpath
        self.sha = sha
        self.ts = ts
        self.author = author
        self.info = info
        self.subjects = {}
        self.prefix = fpath

class shainfo(object):
    def __init__(self, sha, tag, date, author):
        self.sha = sha
        self.tag = tag
        self.date = int(date)
        self.author = author

class license(object):
    def __init__(self, fpath, lic, args):
        try:
            self.score = int(lic['score'])
        except:
            print(fpath)
        self.match = encode_txt(lic['matched_text'])
        self.pattern = normalize(self.match)
        if self.match.strip().startswith('MODULE_LICENSE("GPL")') or \
           self.match.strip().startswith('MODULE_LICENSE ("GPL")') or \
           self.match.strip().startswith('MODULE_LICENSE("GPL v2")'):
            self.spdx = 'GPL-2.0-only'
        else:
            self.spdx = lic['spdx_license_key']

        # Fixup references to the COPYING file
        if args.fixupcopying and self.spdx == 'GPL-1.0-or-later':
            if self.match.find('COPYING') > 0:
                self.spdx = 'GPL-2.0-only'

        self.shortname = lic['short_name']
        if len(self.spdx) == 0:
            self.spdx = self.shortname
        self.start_line = lic['start_line']
        self.end_line = lic['end_line']
        self.is_ambiguous = lic.get('is_ambiguous', False)
        self.is_spdx = False
        self.is_text = False
        self.is_notice = False
        self.is_ref = False
        self.is_tag = False
        self.rule = None
        if not lic.has_key('matched_rule'):
            return
        mr = lic['matched_rule']
        self.is_notice = mr['is_license_notice']
        self.is_tag = mr['is_license_tag']
        self.is_text = mr['is_license_text']
        self.is_ref = mr['is_license_reference']
        self.is_spdx = self.is_tag and mr['matcher'] == '4-spdx-id'
        # FIXES
        idf = mr['identifier']
        self.rule = idf

class author(object):
    def __init__(self, data):
        self.txt = data['value']
        self.start_line = data['start_line']
        self.end_line = data['end_line']

class copyright(object):
    def __init__(self, data):
        self.txt = data['value']
        self.start_line = data['start_line']
        self.end_line = data['end_line']
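
# Illustrative example (added for clarity, not part of the original script):
# normalize() reduces a matched license text to a canonical lower case word
# stream so that differently formatted boilerplate compares equal, e.g.
#
#   normalize(' * This program is free software;\n * you can redistribute it\n')
#
# is expected to yield 'this program is free software you can redistribute it'.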

class scanentry(object):
    crap_matches = [
        re.compile('GPL\(.*driver'),
    ]

    def __init__(self, info, args):
        self.path = info['path'].split('/', 1)[1]
        self.entries = []
        self.licenses = []
        self.copyrights = []
        self.authors = []
        self.holders = []
        self.is_resolved = None
        self.scan_licenses(info, args)
        self.scan_authors(info)
        self.scan_holders(info)
        self.scan_copyrights(info)
        self.start_line = 1
        self.end_line = 1
        self.warned = 1
        if args.minlines:
            # FIXME. Scan this once. OTOH, it's fast on a fast machine :)
            fp = os.path.join(args.source, self.path)
            self.end_line = len(open(fp).readlines())

    def scan_holders(self, info):
        for h in info.get('holders', []):
            self.holders.append(author(h))

    def scan_authors(self, info):
        for h in info.get('authors', []):
            self.authors.append(author(h))

    def scan_copyrights(self, info):
        for h in info.get('copyrights', []):
            c = copyright(h)
            #if c.start_line != c.end_line:
            #    print('%s: %d - %d: %s' %(self.path, c.start_line, c.end_line, c.txt))
            self.copyrights.append(c)

    def is_export(self, l):
        if l.match.strip().startswith('EXPORT_'):
            return True
        if l.match.strip().startswith('SYMBOL_GPL'):
            return True
        return False

    def drop_crap(self, l, args):
        for c in self.crap_matches:
            if c.search(l.match):
                return True
        if args.dropmodule:
            if l.match.strip().startswith('MODULE_LICENSE'):
                return True
        if args.dropexport:
            if self.is_export(l):
                return True
        return False

    def scan_licenses(self, info, args):
        for lic in info['licenses']:
            l = license(self.path, lic, args)
            if l.score >= args.dropscore and not self.drop_crap(l, args):
                self.licenses.append(l)

    def has_license(self):
        return len(self.licenses) > 0

    def has_matching_license(self, spdx):
        for l in self.licenses:
            if l.spdx == spdx:
                return True
        return False

    def has_matching_rule(self, rule):
        for l in self.licenses:
            if l.rule == rule:
                return True
        return False

    def has_score(self, args):
        for l in self.licenses:
            if l.score < args.minscore or l.score > args.maxscore:
                return False
        return True

    def has_ambiguous(self):
        for l in self.licenses:
            if l.is_ambiguous:
                return True
        return False

    def has_spdx(self):
        for l in self.licenses:
            if l.is_spdx:
                return True
        return False

    def has_text(self):
        for l in self.licenses:
            if l.is_text:
                return True
        return False

    def has_tag(self):
        for l in self.licenses:
            if l.is_tag:
                return True
        return False

    def has_notice(self):
        for l in self.licenses:
            if l.is_notice:
                return True
        return False

    def has_reference(self):
        for l in self.licenses:
            if l.is_ref:
                return True
        return False

    def get_rules(self):
        res = []
        for l in self.licenses:
            if l.rule:
                res.append(l.rule)
        return res

    def has_gpl_conflicts(self, args):
        spdx = None
        for l in self.licenses:
            if not l.spdx.startswith('GPL'):
                continue
            if l.match.startswith('EXPORT_SYMBOL_GPL'):
                continue
            if l.match.startswith('EXPORT_[TRACEPOINT]_SYMBOL_GPL'):
                continue
            if l.match.startswith('MODULE_LICENSE'):
                if args.nomodule:
                    continue
            if not spdx:
                spdx = l.spdx
            elif spdx != l.spdx:
                return True
        return False

    def export_only(self):
        if len(self.licenses) == 0:
            return False
        for l in self.licenses:
            if not self.is_export(l):
                return False
        return True

    def module_only(self):
        if len(self.licenses) == 0:
            return False
        for l in self.licenses:
            if not l.match.startswith('MODULE_LICENSE'):
                return False
        return True

    def has_module(self):
        for l in self.licenses:
            if l.match.startswith('MODULE_LICENSE'):
                return True
        return False

    def has_conflict(self, args):
        # GPL conflicts only for now
        return self.has_gpl_conflicts(args)
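
    # Illustrative note (added for clarity, not scanner output): a file whose
    # matches map to the SPDX keys ['GPL-2.0-or-later', 'GPL-2.0-only'] makes
    # unique_license() below return False, while get_spdx_tags() returns the
    # sorted, comma separated string 'GPL-2.0-only,GPL-2.0-or-later'.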

    def unique_license(self):
        ml = len(self.licenses)
        if ml == 0:
            return False
        spdx = self.licenses[0].spdx
        i = 1
        while i < ml:
            if self.licenses[i].spdx != spdx:
                return False
            i += 1
        return True

    def get_spdx_tags(self, split_tags=False):
        if len(self.licenses) == 0:
            return 'NOLICENSE'
        spdx = []
        realspdx = []
        for lic in self.licenses:
            tag = lic.spdx
            if split_tags and lic.is_spdx:
                if tag not in realspdx:
                    realspdx.append(tag)
            else:
                if tag not in spdx:
                    spdx.append(tag)
        resreal = ''
        if len(realspdx) > 0:
            spds = sorted(realspdx)
            resreal += spds.pop(0)
            for s in spds:
                resreal += ',%s' %s
        res = ''
        if len(spdx) > 0:
            spds = sorted(spdx)
            res += spds.pop(0)
            for s in spds:
                res += ',%s' %s
        if split_tags:
            return resreal, res
        else:
            return res

    def spdx_pure(self):
        r, l = self.get_spdx_tags(split_tags=True)
        return len(r) > 0 and len(l) == 0

    def get_history(self):
        res = '\n'
        fi = self.fileinfo
        while fi != None:
            if fi.info:
                res += ' File: %s\n' %fi.fpath
                if fi.info.sha and len(fi.info.sha) > 0:
                    res += ' Commit: %s\n' %fi.info.sha
                if fi.info.tag and len(fi.info.tag) > 0:
                    res += ' Tag: %s\n' %fi.info.tag
                if fi.info.author and len(fi.info.author) > 0:
                    res += ' Author: %s\n' %encode_txt(fi.info.author)
                if fi.info.date >= 0:
                    res += ' Date: %s\n' %time.asctime(time.gmtime(fi.info.date))
            if fi.orig_fpath:
                res += '\n Renamed from:\n'
            fi = fi.orig_fpath
        return res

    def get_extra_info(self):
        res = ''
        if len(self.copyrights):
            res += ' Copyrights:\n'
            for c in self.copyrights:
                res += ' %s\n' %encode_txt(c.txt)
            res += '\n'
        if len(self.authors):
            res += ' Authors:\n'
            for a in self.authors:
                res += ' %s\n' %encode_txt(a.txt)
            res += '\n'
        res += ' Further information (might be inaccurate):\n'
        res += self.get_history()
        return res

    def get_match_info(self):
        res = ' Scanner info:\n'
        for l in self.licenses:
            res += ' Rule: %s\n' %l.rule
            res += ' Score: %3d\n' %l.score
            res += ' SPDX: %s\n' %l.spdx
            res += '\n'
        return res

    def print_all_matches(self):
        # Non SPDX deduced license information
        for l in self.licenses:
            print(' Scanned: %s' %l.spdx)
            print(' Score: %d' %l.score)
            print(' Name: %s' %l.shortname)
            print(' Rule: %s' %l.rule)
            print(' Line: %d - %d' %(l.start_line, l.end_line))
            b = bytearray('    ')
            if l.is_text:
                b[0] = 'X'
            if l.is_ref:
                b[1] = 'R'
            if l.is_tag:
                b[2] = 'T'
            if l.is_notice:
                b[3] = 'N'
            print(' What: %s' %b)
            print(' Link: %s/%s#n%d' %(git_source_url, self.path, l.start_line))
            print(' Match:')
            for line in l.match.split('\n'):
                print(' %s' %line)
            print('')

    def print_info(self, args):
        if args.format == 'stats':
            return
        elif args.format == 'fname':
            print(self.path)
        elif args.format == 'csv':
            print('%s,%s' %(self.path, self.get_spdx_tags()))
        elif args.format == 'full':
            print('%s\n' %self.path)
            # No license
            if not self.has_license():
                print(' No license information found')
            # SPDX tag(s) found
            elif self.has_spdx():
                if not self.has_conflict(args):
                    print(' SPDX: %s' %self.licenses[0].match)
                    if args.morethanone:
                        self.print_all_matches()
                elif self.is_resolved:
                    spdx, txt = self.is_resolved.split(',', 1)
                    print(' SPDX: %s' %spdx)
                    print(' Conflict resolved: %s' %txt)
                else:
                    print(' Conflicts detected')
                    self.print_all_matches()
            else:
                if self.has_conflict(args):
                    print(' Conflicts detected')
                self.print_all_matches()
                try:
                    print(self.get_extra_info())
                except:
                    for l in self.get_extra_info().split('\n'):
                        try:
                            print(l)
                        except:
                            pass

    def do_get_prefix(self):
        if self.fileinfo.prefix:
            return self.fileinfo.prefix
        maxcnt = 0
        match = None
        for s in self.fileinfo.subjects:
            cnt = self.fileinfo.subjects[s]
            if cnt > maxcnt:
                maxcnt = cnt
                match = s
        if match:
            return match
        return self.path

    def get_prefix(self):
        prefix = self.do_get_prefix()
        if self.path == prefix:
            return prefix
        if len(prefix) < 40:
            return prefix
        return self.path

    def exclude_file(self):
        return self.path in [
            'drivers/dma/qcom/hidma.c',
        ]

    def stop_boilerplate(self, l, i, args):
        st = [
            'Note: the choice of the license',
            'The code is based on publicly available information:',
            'This file is part of Donald Becker\'s 8390 drivers',
            'For further information regarding this notice, see:',
            'For the record: _GPL here is only because somebody decided to slap it',
            'Note: This code is heavily based on the GNU MP Library.',
        ]
        for s in st:
            if l.find(s) >= 0:
                if not self.line_is_boilerplate(i, args):
                    print('%s: REMOVE ?: %s' %(self.path, s))
                if s != '*/':
                    print('%s: STOPBP: %s' %(self.path, l.strip()))
                return True
        return False

    def exclude_from_boilerplate(self, l, n):
        st = [
            'Based from clk-highbank.c',
            'Based on twl6030_usb.c',
            'Derived from GPLv2+ licensed source:',
            'based on GPL\'ed 2.6 kernel sources',
            'See ip_conntrack_helper_h323_asn1.h for details.',
            'For a historical changelog see',
            'See LICENSE.ql',
            'See linux/lib/crc32.c for license and changes',
            'Derived from code originally in linux/arch/arm/kernel/fiq.c',
            'Adapted from OProfile GPLv2 support jidump.h:',
            'crc32hash.c - derived from linux/lib/crc32.c, GNU GPL v2',
            'licensing of what follows is governed by reiserfs/README',
        ]
        for s in st:
            if l.lower().find(s.lower()) >= 0:
                print('%s: EXCSBP: %s' %(self.path, l.strip()))
                return True

        fl = {
            # Note: range(0,2) produces [ 0, 1 ] - Oh well I always
            # trip over that
            'drivers/media/dvb-frontends/dib3000.h' : range( 6, 11),
            'drivers/media/dvb-frontends/dib3000mb.c' : range( 6, 11),
            'drivers/media/usb/dvb-usb/dibusb-mb.c' : range( 6, 8),
            'drivers/media/usb/dvb-usb/dibusb-mc.c' : range( 6, 8),
            'drivers/media/v4l2-core/v4l2-common.c' : range( 18, 26),
            'drivers/media/usb/dvb-usb-v2/az6007.c' : range( 5, 13),
            'arch/arm/mach-ixp4xx/ixp4xx_npe.c' : range(10, 13),
        }
        ls = fl.get(self.path, [])
        if n in ls:
            print('%s: EXCLBP: %s' %(self.path, l.strip()))
        return n in ls

    def line_is_missed_boilerplate(self, l, n):
        if l.strip().startswith('* to the Free Software Foundation'):
            print('%s: MISSBP: %s' %(self.path, l.strip()))
            return True
        ml = {
            'drivers/scsi/smartpqi/Kconfig' : range(10, 37),
        }
        ls = ml.get(self.path, [])
        if n in ls:
            print('%s: MISSBP: %s' %(self.path, l.strip()))
        return n in ls

    def line_is_boilerplate(self, n, args):
        for l in self.licenses:
            if l.is_spdx and args.no_spdx:
                continue
            if l.is_ref and args.no_reference:
                continue
            if l.is_tag and args.no_tag:
                continue
            if l.is_text and args.no_text:
                continue
            if l.is_notice and args.no_notice:
                continue
            if l.match.find('DO NOT ALTER') >= 0:
                if not self.warned:
                    self.warned = 1
                    print('%s: DO NOT ALTER' %self.path)
                return False
            if n >= l.start_line and n <= l.end_line:
                return True
        return False

    def line_has_copyright(self, n, l):
        for c in self.copyrights:
            if n >= c.start_line and n < c.end_line:
                if l.find(c.txt) >= 0:
                    return True
                for m in [ 'Copyright', '(C)', '(c)' ]:
                    if l.find(m) >= 0:
                        return True
        return False

    def line_has_author(self, n, l):
        for a in self.authors:
            if n >= a.start_line and n <= a.end_line:
                if l.find(a.txt) >= 0:
                    return True
        return False

    def sanitize_copyright(self, n, txt):
        if txt.find('OProfile') >= 0:
            return txt
        if txt.endswith('>, distribute under GPLv2\n'):
            return txt.replace('>, distribute under GPLv2', '')
        if txt.endswith('. Subject to GPLv2.\n'):
            return txt.replace('Subject to GPLv2.', '')
        if txt.find('. This file is licensed') >= 0:
            return txt.split(' This file is licensed')[0] + '\n'
        if self.path.endswith('.cocci'):
            if txt.find(' GPLv2.') > 0:
                txt = txt.replace('GPLv2.', '')
            if txt.find(' GPLv2') > 0:
                txt = txt.replace('GPLv2', '')
            if txt.find(' GPL v2.') > 0:
                txt = txt.replace('GPL v2.', '')
            if txt.find(' GPL v2') > 0:
                txt = txt.replace('GPL v2', '')
            return txt.strip() + '\n'
        if txt.find('/* GPLv2 C') == 0:
            return txt.replace('GPLv2 ', '')
        if txt.find('/* GPLv2, C') == 0:
            return txt.replace('GPLv2, ', '')
        return txt

    def sanitize_comment(self, end):
        i = 0
        empty = []
        last = 0
        cnt = 0
        if len(self.patch) > end + 1:
            end += 1
        i = 0
        cs = 0
        ce = 0
        for l in self.patch:
            i += 1
            if l.find('/*') >= 0:
                cs = i
            elif l.find('*/') >= 0:
                ce = i
                break
        if cs and cs < end and end < ce:
            end = ce
        i = 0
        while i < end:
            l = self.patch[i]
            i += 1
            t = l.strip()
            if t in self.patch_comments:
                if not last:
                    last = i - 1
                    cnt = 1
                else:
                    cnt += 1
                continue
            cs = t.split(' ', 1)[0].strip()
            if cs == '*/':
                cnt += 1
            if last and cnt > 1:
                empty.append((last, cnt - 1))
            last = 0
            cnt = 0
        if last and cnt > 1:
            empty.append((last, cnt - 1))
        dropped = 0
        for (l, c) in empty:
            l -= dropped
            dropped += c
            while c > 0:
                c -= 1
                self.patch.pop(l)

    def add_spdx_id(self, comment, lic):
        self.lic_comment = comment
        if comment != '//' and comment != '/*':
            self.patch_comments = [ comment ]
        else:
            self.patch_comments = [ '//', '*' ]
        txt = '%s SPDX-License-Identifier: %s' %(comment, lic)
        if comment == '/*':
            txt += ' */'
        txt += '\n'
        self.patch.append(txt)

    def is_endof_comment(self, l):
        if l.find('*/') < 0:
            return None
        if l.find('/*') >= 0:
            return None
        return ' */\n'

    def can_drop(self, l, i):
        if not self.exclude_from_boilerplate(l, i):
            # Protect copyright notices
            if not self.line_has_copyright(i, l):
                if not self.line_has_author(i, l):
                    return True
        return False
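
    # Illustrative sketch of the patch flow (assumption, not derived from a real
    # run): for a .c file with a unique GPL-2.0-only match, make_patch() below
    # prepends
    #
    #   // SPDX-License-Identifier: GPL-2.0-only
    #
    # via add_spdx_id(), strips the matched boilerplate comment when invoked
    # with replace=True, and writes the result as a unified diff into the
    # patch directory.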

    def make_patch(self, args, replace, striponly):
        if self.exclude_file():
            return 0
        if self.has_ambiguous():
            print('Not patching %s: ambiguous' %self.path)
            return 0

        if not args.dual_license:
            try:
                lic = self.licenses[0].spdx
            except:
                lic = 'GPL-2.0-only'
        else:
            lic = self.get_spdx_tags().replace(',', ' or ')

        fp = os.path.join(args.source, self.path)
        orig = codecs.open(fp, encoding='utf-8').readlines()
        if len(orig) == 0:
            print('Not patching %s: empty' %self.path)
            return 0

        self.patch = []
        i = 0
        j = 0
        strip = replace or striponly
        if not striponly:
            j = 1
            if self.path.endswith('.c'):
                self.add_spdx_id('//', lic)
            elif self.path.endswith('.dts'):
                self.add_spdx_id('//', lic)
            elif self.path.endswith('.dtsi'):
                self.add_spdx_id('//', lic)
            elif self.path.endswith('.cocci'):
                self.add_spdx_id('//', lic)
            elif self.path.endswith('.h'):
                self.add_spdx_id('/*', lic)
            elif self.path.endswith('.S'):
                if orig[0].startswith(';;'):
                    self.add_spdx_id(';;', lic)
                elif orig[0].startswith(';'):
                    self.add_spdx_id(';', lic)
                else:
                    self.add_spdx_id('/*', lic)
            elif self.path.find('Makefile') >= 0:
                self.add_spdx_id('#', lic)
            elif self.path.find('Kconfig') >= 0:
                self.add_spdx_id('#', lic)
            elif orig[0].startswith('#!'):
                self.patch.append(orig[0])
                self.add_spdx_id('#', lic)
                i = 1
                j = 2
            else:
                print('Not patching %s: not supported' %self.path)
                return 0

        stop_bp = False
        start_bp = False
        start_bp_comment = -1
        endbp = 0
        while i < len(orig):
            l = orig[i]
            i += 1
            if strip:
                # HACK
                if self.stop_boilerplate(l, i, args):
                    stop_bp = True
                if not stop_bp and self.line_is_boilerplate(i, args):
                    if not start_bp:
                        start_bp = True
                        ls = l.strip()
                        if ls.startswith('/*') and self.can_drop(l, i):
                            if ls.endswith('*/'):
                                continue
                            j += 1
                            self.patch.append('/*\n')
                            start_bp_comment = j
                            continue
                    # HACK
                    if not self.exclude_from_boilerplate(l, i):
                        # Protect copyright notices
                        if not self.line_has_copyright(i, l):
                            if not self.line_has_author(i, l):
                                endbp = j + 1
                                l = self.is_endof_comment(l)
                                if not l:
                                    continue
                        else:
                            l = self.sanitize_copyright(i, l)
                    else:
                        pass
                # HACK
                if l and self.line_is_missed_boilerplate(l, i):
                    endbp = j + 1
                    l = self.is_endof_comment(l)
                    if not l:
                        continue
            if not l:
                print('OOPS: %s: %d %d' %(self.path, i, j))
                continue
            if start_bp_comment >= 0 and start_bp_comment == j:
                if l.strip() == '*/':
                    self.patch.pop(-1)
                    j -= 1
                    continue
            j += 1
            self.patch.append(l)

        if strip:
            self.sanitize_comment(endbp)

        afile = os.path.join('a', self.path)
        bfile = os.path.join('b', self.path)
        diff = difflib.unified_diff(orig, self.patch, afile, bfile)

        pd = args.patchdir
        if not args.flat:
            parts = self.path.split('/')
            pn = parts.pop(-1)
            pn += '.patch'
            for p in parts:
                pd = os.path.join(pd, p)
        elif not args.patchname:
            pn = self.path.replace('/','-')
            pn += '.patch'
            i = 1
            while os.path.isfile(os.path.join(pd, pn)):
                pn = self.path.replace('/','-')
                pn += '-%d.patch' %i
                i += 1
        else:
            pn = args.patchname

        if not os.path.isdir(pd):
            os.makedirs(pd)
        pp = os.path.join(pd, pn)
        if not os.path.isfile(pp):
            pf = codecs.open(pp, encoding='utf-8', mode='w')
            try:
                for l in args.template:
                    txt = l
                    if txt.startswith('Subject:'):
                        if not args.patchname:
                            txt = 'Subject: %s: %s\n' %(self.get_prefix(),
                                                        l.split('Subject:')[1].strip())
                    elif txt.startswith('From:'):
                        txt = 'From: %s\n' %args.author
                    elif txt.startswith('Date:'):
                        dt = datetime.datetime.now()
                        toff = time.timezone
                        th = -toff / 3600
                        tm = (abs(toff) % 3600) / 60
                        tz = '%+03d%02d' %(th, tm)
                        txt = 'Date: %s %s\n' %(dt.strftime('%a, %d %b %Y %T'), tz)
                    elif txt.startswith('$SCANMATCH'):
                        lm = self.licenses[0].match.split('\n')
                        txt = ' %s %s\n' %(self.lic_comment, lm[0])
                        for m in lm[1:]:
                            txt += ' %s\n' %m
                    elif txt.startswith('$SPDXID'):
                        txt = ' %s\n' %lic
                    elif txt.startswith('Signed-off-by:'):
                        txt = 'Signed-off-by: %s\n' %args.author
                    pf.write(txt)
                pf.write('\n')
                if not args.patchname:
                    pf.write(self.get_extra_info())
                    pf.write('\n\n')
                    pf.write(self.get_match_info())
                pf.write('\n---\n\n')
                pf.writelines(diff)
                pf.write('\n\n')
                pf.close()
                fd = open(os.path.join(pd, 'series'), 'a')
                fd.write('%s\n' %pn)
                fd.close()
                return 1
            except Exception, ex:
                print('Failed to write diff for %s' %pn)
                print(ex)
        else:
            pf = codecs.open(pp, encoding='utf-8', mode='a')
            try:
                pf.writelines(diff)
                pf.write('\n\n')
                pf.close()
                return 1
            except Exception, ex:
                print('Failed to write diff for %s' %pn)
                print(ex)
        return 0

class matchrule(object):
    def __init__(self, txt):
        self.rule = txt
        self.files = []
        self.matches = []
        self.spdx = []

class scaninfo(object):
    def __init__(self, args):
        self.entries = []
        self.resolved = {}
        self.excludes = []
        self.licenses = []
        self.directories = []
        self.fileinfos = {}
        self.patchrules = {}
        self.numpatches = 0
        self.matches = []
        self.spdx = []
        self.rules = {}
        self.rulefilters = []
        self.patched = []
        # Scan resolved conflicts if available
        if args.resolved:
            lines = open(args.resolved).readlines()
            for l in lines:
                self.add_resolved(l.strip())
        if args.excludes:
            self.excludes = args.excludes.split(',')
        if args.license_filter:
            self.licenses = args.license_filter.split(',')
        if args.rules_filter:
            self.rulefilters = args.rules_filter.split(',')

    def is_excluded(self, entry, args):
        # Filter based on path
        for e in self.excludes:
            if entry.path.startswith(e):
                return True
        if args.paths and len(args.paths):
            res = False
            for p in args.paths:
                if entry.path.startswith(p):
                    res = True
                    break
            if not res:
                return True

        if args.filters:
            drop = True
            for f in args.filters:
                if entry.path.find(f) >= 0:
                    drop = False
                    break
            if args.negate_filters:
                drop = not drop
            if drop:
                return True

        # Filter based on licenses
        if len(self.licenses):
            res = False
            for l in self.licenses:
                if entry.has_matching_license(l):
                    res = True
            if not res:
                return True
            # Filter dual license matches
            if args.dual_license:
                lics = entry.get_spdx_tags().split(',')
                if len(self.licenses) != len(lics):
                    return True
                for l in self.licenses:
                    if not l in lics:
                        return True
                    lics.remove(l)

        if len(self.rulefilters):
            res = False
            for r in self.rulefilters:
                if entry.has_matching_rule(r):
                    res = True
            if not res:
                return True

        # Filter based on score
        if not entry.has_score(args):
            return True

        # Filter SPDX
        if args.has_spdx:
            if not entry.has_spdx():
                return True
        # Filter SPDX + other
        if args.spdx_plus and entry.spdx_pure():
            return True
        # Filter SPDX pure
        if args.spdx_pure and not entry.spdx_pure():
            return True
        if args.no_spdx and entry.has_spdx():
            return True

        # Filter text
        if args.has_text and not entry.has_text():
            return True
        if args.no_text and entry.has_text():
            return True

        # Filter tag
        if args.has_tag and not entry.has_tag():
            return True
        if args.no_tag and entry.has_tag():
            return True

        # Filter notice
        if args.has_notice and not entry.has_notice():
            return True
        if args.no_notice and entry.has_notice():
            return True

        # Filter reference
        if args.has_reference and not entry.has_reference():
            return True
        if args.no_reference and entry.has_reference():
            return True

        if args.has_module and not entry.has_module():
            return True

        # Filter ambiguous
        if args.has_ambiguous and not entry.has_ambiguous():
            return True
        if args.no_ambiguous and entry.has_ambiguous():
            return True

        # Filter on conflicts
        if args.conflicts and not entry.has_conflict(args):
            return True
        if args.noconflicts and entry.has_conflict(args):
            return True

        # Filter on multiple/unique licenses
        if args.multiple and entry.unique_license():
            return True
        if args.unique and not entry.unique_license():
            return True
        if args.morethanone and len(entry.licenses) < 2:
            return True

        # Export only
        if args.export_only and not entry.export_only():
            return True
        # Module only
        if args.module_only and not entry.module_only():
            return True

        # Line count
        if args.minlines > entry.end_line:
            return True

        return False

    def add_entry(self, entry, args):
        entry.is_resolved = self.resolved.get(entry.path, None)
        excl = self.is_excluded(entry, args)
        if excl:
            #print('Exclude: %s' %entry.path)
            return
        #print('Include: %s' %entry.path)
        self.entries.append(entry)
        for l in entry.licenses:
            if l.rule == '':
                continue
            rule = self.rules.get(l.rule, matchrule(l.rule))
            if not entry.path in rule.files:
                rule.files.append(entry.path)
            addmatch = True
            for mt in rule.matches:
                if mt == l.pattern:
                    addmatch = False
                    break
            if addmatch:
                rule.matches.append(l.pattern)
            if not l.spdx in rule.spdx:
                rule.spdx.append(l.spdx)
            self.rules[l.rule] = rule

    def stats(self, args):
        tot_files = 0
        has_spdx = 0
        has_license = 0
        unique = 0
        conflicts = 0
        resolved = 0
        spdx_unique = 0
        spdx_conflicts = 0
        spdx_resolved = 0
        spdx_plus_text = 0
        spdx_plus_ref = 0
        spdx_plus_notice = 0
        spdx_pure = 0
        ambiguous = 0
        matches = {}
        raw_matches = {}
        licenses_sp = {}
        licenses_st = {}
        licenses_u = {}
        licenses_m = {}
        variants = []

        for entry in self.entries:
            tot_files += 1
            if entry.has_spdx():
                has_spdx += 1
                if entry.unique_license():
                    spdx_unique += 1
                if entry.has_conflict(args):
                    spdx_conflicts += 1
                    if entry.is_resolved:
                        spdx_resolved += 1
                if entry.has_text():
                    spdx_plus_text += 1
                if entry.has_reference():
                    spdx_plus_ref += 1
                if entry.has_notice():
                    spdx_plus_notice += 1
                r, l = entry.get_spdx_tags(split_tags=True)
                if len(l) == 0:
                    cnt = licenses_sp.get(r, 0)
                    licenses_sp[r] = cnt + 1
                    spdx_pure += 1
                else:
                    lt = '%s + %s' %(r, l)
                    cnt = licenses_st.get(lt, 0)
                    licenses_st[lt] = cnt + 1
            elif entry.has_license():
                has_license += 1
                if entry.unique_license():
                    unique += 1
                    l = entry.get_spdx_tags()
                    cnt = licenses_u.get(l, 0)
                    licenses_u[l] = cnt + 1
                else:
                    if entry.has_conflict(args):
                        conflicts += 1
                        if entry.is_resolved:
                            resolved += 1
                    l = entry.get_spdx_tags()
                    cnt = licenses_m.get(l, 0)
                    licenses_m[l] = cnt + 1

            if entry.has_ambiguous():
                ambiguous += 1

            for l in entry.licenses:
                if l.spdx not in variants:
                    variants.append(l.spdx)
                if l.is_spdx:
                    continue
                cnt = raw_matches.get(l.match, 0)
                cnt += 1
                raw_matches[l.match] = cnt
                m = l.pattern
                cnt, om = matches.get(m, (0, l.match))
                cnt += 1
                matches[m] = (cnt, om)

        print('Files: %8d' %tot_files)
        print(' no License: %8d' %(tot_files - (has_spdx + has_license)))
        print(' ambiguous: %8d' %(ambiguous))
        print(' with SPDX: %8d' %has_spdx)
        print(' unique: %8d' %spdx_unique)
        print(' GPL conflicts: %8d' %spdx_conflicts)
        print(' resolved conflicts: %8d' %spdx_resolved)
        print(' With text: %8d' %spdx_plus_text)
        print(' With reference: %8d' %spdx_plus_ref)
        print(' With notice: %8d' %spdx_plus_notice)
        if args.verbose:
            print(' Pure SPDX: %8d' %spdx_pure)
            for l in sorted(licenses_sp.keys()):
                print(' %-85s: %8d' %(l, licenses_sp[l]))
            print(' SPDX + text: %8d' %(has_spdx - spdx_pure))
            for l in sorted(licenses_st.keys()):
                print(' %-85s: %8d' %(l, licenses_st[l]))
        print(' with License: %8d' %has_license)
        print(' unique: %8d' %unique)
        for l, c in licenses_u.iteritems():
            print(' %-70s: %8d' %(l, c))
        print(' multiple: %8d' %(has_license - unique))
        for l, c in licenses_m.iteritems():
            print(' %-70s: %8d' %(l, c))
        print(' GPL conflicts: %8d' %conflicts)
        print(' resolved conflicts: %8d' %resolved)
        print('')
        print('Raw license expressions: %8d' %len(raw_matches.keys()))
        print('License expressions: %8d' %len(matches.keys()))
        totcnt = 0
        for m in matches:
            cnt, om = matches[m]
            totcnt += cnt
        print('Total expressions: %8d' %totcnt)
        if args.verbose:
            print('License variants: %8d' %len(variants))
            for l in sorted(variants):
                print(' %s' %l)

    def make_patch(self, e, args, replace, striponly):
        r = e.make_patch(args, replace, striponly)
        if not r:
            return
        #print(e.path)
        self.patched.append(e.path)
        self.numpatches += 1
        for rule in e.get_rules():
            cnt = self.patchrules.get(rule, 0) + 1
            self.patchrules[rule] = cnt

    def patch_boiler(self, args, rule=None):
        for e in self.entries:
            if e.has_spdx():
                continue
            if e.has_conflict(args):
                continue
            if args.unique and not e.unique_license():
                continue
            if rule and not e.has_matching_rule(rule):
                continue
            if e.path in self.patched:
                continue
            self.make_patch(e, args, replace=True, striponly=False)

    def patch_export(self, args):
        for e in self.entries:
            if not e.export_only():
                continue
            self.make_patch(e, args, replace=False, striponly=False)

    def patch_module(self, args):
        for e in self.entries:
            if not e.module_only():
                continue
            self.make_patch(e, args, replace=False, striponly=False)

    def patch_make(self, args):
        for e in self.entries:
            self.make_patch(e, args, replace=True, striponly=False)

    def patch_none(self, args):
        for e in self.entries:
            self.make_patch(e, args, replace=False, striponly=False)

    def patch_strip(self, args, rule=None):
        for e in self.entries:
            if not e.has_spdx():
                continue
            if rule and not e.has_matching_rule(rule):
                continue
            self.make_patch(e, args, replace=False, striponly=True)
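
    # Note (added for clarity): the patch_* drivers above are selected via
    # --format, e.g. patch_boiler converts files without an SPDX id by
    # replacing the matched boilerplate, while patch_strip only removes
    # boilerplate from files which already carry an SPDX id.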

    def print_rules(self, args):
        nrules = 0
        nfiles = 0
        for r in sorted(self.rules):
            nrules += 1
            rule = self.rules[r]
            print('Rule: %s' %rule.rule)
            print('SPDX:')
            for s in rule.spdx:
                print(' %s' %s)
            print('Files: %d' %len(rule.files))
            print('Patterns: %d' %len(rule.matches))
            for mt in rule.matches:
                wrapper = TextWrapper(initial_indent=" ", subsequent_indent=' ')
                for l in wrapper.wrap(mt):
                    print(l)
            print('\n')
            print('Filenames:')
            for f in rule.files:
                print(' %s' %f)
                nfiles += 1
            print('\n')
        print('Total Rules: %d' %nrules)
        print('Total Files: %d' %nfiles)

    def parse(self, args, rule=None):
        if args.format == 'stats':
            self.stats(args)
        elif args.format == 'rules':
            self.print_rules(args)
        elif args.format == 'patch_boiler':
            self.patch_boiler(args, rule)
        elif args.format == 'patch_export':
            self.patch_export(args)
        elif args.format == 'patch_module':
            self.patch_module(args)
        elif args.format == 'patch_make':
            self.patch_make(args)
        elif args.format == 'patch_none':
            self.patch_none(args)
        elif args.format == 'patch_strip':
            self.patch_strip(args, rule)
        elif args.format in ['csv', 'full', 'fname']:
            for e in self.entries:
                e.print_info(args)
        else:
            pass
        if args.format.startswith('patch_'):
            print('%d patches generated' %self.numpatches)
            for rule in self.patchrules:
                print('%-40s: %8d matches' %(rule, self.patchrules[rule]))

def scan_entries(info, data):
    # Scan all entries
    for item in data['files']:
        if item['type'] == 'directory':
            continue
        # File info
        entry = scanentry(item, args)
        entry.fileinfo = info.fileinfos.get(entry.path, fileinfo(entry.path))
        info.add_entry(entry, args)

def load_info(args):
    data = json.load(open(args.datafile))
    info = scaninfo(args)
    # Read the pickled data
    try:
        info.fileinfos = pickle.load(open(args.infodb))
    except:
        pass
    return info, data

if __name__ == '__main__':
    formats = [ 'none', 'stats', 'full', 'csv', 'fname', 'rules',
                'patch_boiler', 'patch_export', 'patch_module',
                'patch_make', 'patch_none', 'patch_strip', ]

    parser = ArgumentParser(description='License information')

    parser.add_argument('datafile', metavar='datafile',
                        help='JSON data file with scan information')
    parser.add_argument('paths', nargs=REMAINDER,
                        help='Optional file/directory paths')
    parser.add_argument('--infodb', dest='infodb',
                        help='Pickled file info db')
    # Output format
    parser.add_argument('--format', '-f', dest='format', default='stats',
                        choices=formats, help='Output format')
    parser.add_argument('--verbose', '-v', dest='verbose', default=False,
                        action='store_true', help='Verbose output')
    # Input filters
    parser.add_argument('--exclude', '-e', dest='excludes',
                        help='Exclude directories/files, separate with commas')
    # FIXME make that regex
    parser.add_argument('--filter', '-F', dest='filter',
                        help='Match parts of the path, separate with commas')
    parser.add_argument('--negate_filter', '-n', dest='negate_filters',
                        default=False, action='store_true',
                        help='Negate --filter')
    # License filters
    parser.add_argument('--license', '-l', dest='license_filter',
                        help='License filter, separate with commas')
    # Rules filter
    parser.add_argument('--rules', '-R', dest='rules_filter',
                        help='Rules filter, separate with commas')
    # Filter based on score
    parser.add_argument('--minscore', '-s', dest='minscore', type=int, default=0,
                        help='Minimal scan score (0-100)')
    parser.add_argument('--maxscore', '-S', dest='maxscore', type=int, default=100,
                        help='Maximal scan score (0-100)')
    # Filter based on lines in the file
    parser.add_argument('--minlines', dest='minlines', type=int, default=0,
                        help='Minimal line count in file (0-...)')
    # Filters based on scan results
    parser.add_argument('--has_spdx', dest='has_spdx', default=False,
                        action='store_true',
                        help='Files with SPDX identifier')
    parser.add_argument('--no_spdx', dest='no_spdx', default=False,
                        action='store_true',
                        help='Files without SPDX identifier')
    parser.add_argument('--spdx_plus', dest='spdx_plus', default=False,
                        action='store_true',
                        help='Files with SPDX identifier plus other text/ref/notice')
    parser.add_argument('--spdx_pure', dest='spdx_pure', default=False,
                        action='store_true',
                        help='Files with pure SPDX identifier')
    parser.add_argument('--has_text', dest='has_text', default=False,
                        action='store_true',
                        help='Files with license text')
    parser.add_argument('--no_text', dest='no_text', default=False,
                        action='store_true',
                        help='Files without license text')
    parser.add_argument('--has_notice', dest='has_notice', default=False,
                        action='store_true',
                        help='Files with license notice')
    parser.add_argument('--no_notice', dest='no_notice', default=False,
                        action='store_true',
                        help='Files without license notice')
    parser.add_argument('--has_reference', dest='has_reference', default=False,
                        action='store_true',
                        help='Files with license reference')
    parser.add_argument('--no_reference', dest='no_reference', default=False,
                        action='store_true',
                        help='Files without license reference')
    parser.add_argument('--has_tag', dest='has_tag', default=False,
                        action='store_true',
                        help='Files with license tag')
    parser.add_argument('--no_tag', dest='no_tag', default=False,
                        action='store_true',
                        help='Files without license tag')
    parser.add_argument('--has_module', dest='has_module', default=False,
                        action='store_true',
                        help='Files with MODULE_LICENSE')
    parser.add_argument('--has_ambiguous', dest='has_ambiguous', default=False,
                        action='store_true',
                        help='Files marked as ambiguous')
    parser.add_argument('--no_ambiguous', dest='no_ambiguous', default=False,
                        action='store_true',
                        help='Files not marked as ambiguous')
    parser.add_argument('--no_license', dest='no_license', default=False,
                        action='store_true',
                        help='Files with no license entry')
    parser.add_argument('--export_only', dest='export_only', default=False,
                        action='store_true',
                        help='Files with only EXPORT based license entries')
    parser.add_argument('--module_only', dest='module_only', default=False,
                        action='store_true',
                        help='Files with only MODULE_LICENSE based license entries')
    parser.add_argument('--dual_license', dest='dual_license', default=False,
                        action='store_true',
                        help='Files with dual or more licenses')
    # Magic for patch generation
    parser.add_argument('--for-each-filter', dest='eachfilter', default=False,
                        action='store_true',
                        help='Cycle through the text,notice,reference,tag filters')
    # Remove license scan entries based on score
    parser.add_argument('--dropscore', '-d', dest='dropscore', type=int, default=0,
                        help='Drop license scan entries below minimal score (0-100)')
    parser.add_argument('--dropmodule', dest='dropmodule', default=False,
                        action='store_true',
                        help='Drop module based scan entries')
    parser.add_argument('--dropexport', dest='dropexport', default=False,
                        action='store_true',
                        help='Drop export based scan entries')
    # Only show files with conflicts
    parser.add_argument('--conflicts', dest='conflicts', default=False,
                        action='store_true',
                        help='Files with conflicts')
    parser.add_argument('--noconflicts', dest='noconflicts', default=False,
                        action='store_true',
                        help='Files without conflicts')
    parser.add_argument('--module', dest='module', default=False,
                        action='store_true',
                        help='Only module conflicts')
    parser.add_argument('--nomodule', dest='nomodule', default=False,
                        action='store_true',
                        help='Ignore module conflicts')
    # Only show files with multiple licenses
    parser.add_argument('--multiple', '-m', dest='multiple', default=False,
                        action='store_true',
                        help='Files with multiple licenses')
    parser.add_argument('--morethanone', '-M', dest='morethanone', default=False,
                        action='store_true',
                        help='Files with more than one scan match')
    # Resolved conflicts (false positives)
    parser.add_argument('--resolved', '-r', dest='resolved',
                        help='File with list of filenames with resolved (false) conflicts')
    # Fixup GPL-1.0-or-later with COPYING reference
    parser.add_argument('--fixupcopying', dest='fixupcopying', default=False,
                        action='store_true',
                        help='Fixup GPL-1.0-or-later when a reference to COPYING is found')
    # Only files which have unique licenses
    parser.add_argument('--unique', '-u', dest='unique', default=False,
                        action='store_true',
                        help='Only files with unique license matches')
    # Source path for patch creation
    parser.add_argument('--source', dest='source', default='.',
                        help='Source directory for patch creation')
    parser.add_argument('--patchdir', dest='patchdir', default='patches',
                        help='Patches directory for patch creation')
    parser.add_argument('--patchname', dest='patchname', default=None,
                        help='Patchname for combo patches')
    parser.add_argument('--ruleslist', dest='ruleslist', default=None,
                        help='Ruleslist for patch series based on single rules')
    parser.add_argument('--author', dest='author',
                        help='Author information')
    parser.add_argument('--flat', dest='flat', default=False,
                        action='store_true',
                        help='Flat patch series')
    parser.add_argument('--template', '-t', dest='template', default='header.txt',
                        help='Template for patch header')

    args = parser.parse_args()

    if args.filter:
        args.filters = args.filter.split(',')
    else:
        args.filters = None

    # Get author information for patches
    if args.format.startswith('patch_'):
        if not args.author:
            an = os.environ.get('GIT_AUTHOR_NAME', None)
            am = os.environ.get('GIT_AUTHOR_EMAIL', None)
            if not am:
                print('No author information found\n')
                sys.exit(1)
            if an:
                args.author = '%s <%s>' %(an, am)
            else:
                args.author = am
        try:
            args.template = open(args.template).readlines()
        except:
            args.template = []

    info, data = load_info(args)

    if args.ruleslist:
        scan_entries(info, data)
        rules = open(args.ruleslist).readlines()
        for rulefile in rules:
            rulefile = rulefile.strip()
            args.template = open(rulefile).readlines()
            rule = rulefile.split('/')[1].split('-', 1)[1]
            args.patchname = '%s.patch' %rulefile.split('/')[1]
            print('Patching rule: %s' %rule)
            info.parse(args, rule)

    if not args.eachfilter:
        if args.no_license:
            args.no_text = True
            args.no_notice = True
            args.no_reference = True
            args.no_tag = True
            args.no_spdx = True
        scan_entries(info, data)
        info.parse(args)
    else:
        # Hack to spare reloading the data over and over
        print('text')
        args.no_text = False
        args.no_notice = True
        args.no_reference = True
        args.no_tag = True
        scan_entries(info, data)
        info.parse(args)

        print('notice')
        args.no_text = True
        args.no_notice = False
        args.no_reference = True
        args.no_tag = True
        info2 = scaninfo(args)
        info2.resolved = info.resolved
        info2.excludes = info.excludes
        info2.licenses = info.licenses
        info2.fileinfos = info.fileinfos
        scan_entries(info2, data)
        info2.parse(args)

        print('reference')
        args.no_text = True
        args.no_notice = True
        args.no_reference = False
        args.no_tag = True
        info3 = scaninfo(args)
        info3.resolved = info.resolved
        info3.excludes = info.excludes
        info3.licenses = info.licenses
        info3.fileinfos = info.fileinfos
        scan_entries(info3, data)
        info3.parse(args)

        print('tag')
        args.no_text = True
        args.no_notice = True
        args.no_reference = True
        args.no_tag = False
        info4 = scaninfo(args)
        info4.resolved = info.resolved
        info4.excludes = info.excludes
        info4.licenses = info.licenses
        info4.fileinfos = info.fileinfos
        scan_entries(info4, data)
        info4.parse(args)
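
# Example invocations (illustrative; the script and data file names below are
# assumptions, not part of this source):
#
#   ./analyze.py scan.json                            # default --format stats summary
#   ./analyze.py -f full -l GPL-2.0-only scan.json    # per file details, one license
#   ./analyze.py -f patch_boiler --source ~/linux --patchdir patches scan.json
#
# where scan.json is the JSON output of the license scanner (load_info() and
# scanentry() expect scancode style keys such as 'files', 'licenses' and
# 'matched_rule').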