#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# List archive collator and sanitizer
#
# The purpose of this script is to make a complete mailing list archive by
# collecting individual archives from individual list subscribers. It uses a
# list of known IDs to locate messages we don't already have in the
# archive, and sanitizes the headers to remove as much private
# information as possible. It also makes sure to only consider messages
# that have the proper mailing list header, so you can aim it at any
# inbox to find relevant messages.
#
# Example usage:
#   list-archive-maker.py -s mail/lists/* -k known-msgids.list \
#                         -l linux-kernel.vger.kernel.org -e collected
#
# The results will be written out into a "collected" dir in the YYYY-MM.mbx format.
# You can review these files to make sure the script did the right thing.
#
# Author:  Konstantin Ryabitsev <konstantin@linuxfoundation.org>
#

import os
import sys
import mailbox
import email.utils
import email.policy
import fnmatch
import argparse

from typing import Tuple, List, Set

from email import charset
# Register utf-8 with no header encoding (None means "no encoding"), so
# utf-8 content is not base64/quoted-printable encoded on output.
charset.add_charset('utf-8', header_enc=None)

# Policy used when serializing sanitized messages: raw utf-8 is allowed in
# headers, bodies use 8bit transfer encoding, and header lines are not wrapped.
EMLPOLICY = email.policy.EmailPolicy(
    utf8=True,
    cte_type='8bit',
    max_line_length=None,
)

# Headers that survive sanitizing; everything else is dropped.
# Entries must be lowercase, because header names are lowercased before
# matching, and may use shell-style wildcards (matched with fnmatch).
# "Received" headers get extra privacy treatment during processing; remove
# the entry from this set if you would rather drop them entirely.
WANTHDRS = {
    # origin and routing
    'return-path', 'received', 'sender', 'from', 'reply-to', 'errors-to',
    # recipients
    'to', 'cc', 'resent-to',
    # identity and threading
    'message-id', 'resent-message-id', 'in-reply-to', 'references',
    # what a reader sees
    'subject', 'date',
    # structure, list metadata and signatures
    'mime-*', 'content-*', 'list-*', 'x-mailing-list', 'dkim-*', 'x-developer-*',
}

__VERSION__ = '2.0'


def formataddr(pair: Tuple[str, str]) -> str:
    """Format a (realname, address) pair for use in an address header.

    This is a forgiving wrapper around email.utils.formataddr(), which
    raises UnicodeEncodeError when the real name cannot be encoded or when
    the address is not pure ASCII.

    :param pair: (realname, address) tuple, as returned by
                 email.utils.getaddresses()
    :return: the formatted address; the real name is dropped if it cannot be
             encoded, and the bare address is returned unchanged if the
             address itself is what cannot be formatted
    """
    try:
        return email.utils.formataddr(pair)
    except UnicodeEncodeError:
        # This might happen if the realname is encoded in a broken way; just
        # drop the real name then.
        pass

    try:
        return email.utils.formataddr((None, pair[1]))
    except UnicodeEncodeError:
        # The address itself is not ASCII, so retrying without the name
        # fails the same way. Hand back the bare address instead of letting
        # one odd recipient abort the whole run.
        return pair[1]


def _reject(rejectsbox, msg, reason: str) -> None:
    """Save msg into the rejects mailbox, tagged with why it was rejected.

    Does nothing when rejectsbox is None (no rejects file was requested).
    """
    if rejectsbox is None:
        return
    msg._headers.append(('X-Import-Rejected-Reason', reason))  # noqa
    rejectsbox.add(msg)


def _is_wanted_header(lhdrname: str) -> bool:
    """Tell whether a lowercased header name matches any WANTHDRS pattern."""
    return any(fnmatch.fnmatch(lhdrname, hdrmatch) for hdrmatch in WANTHDRS)


def _sanitize_headers(msg, listids: List[str], eaddrs: List[str]):
    """Build the sanitized header list for msg without modifying msg.

    :param msg: message whose private header list is inspected
    :param listids: lowercased list-ids (or glob patterns) we are collecting
    :param eaddrs: list email addresses matching listids
    :return: (newhdrs, recvtime, is_our_list) where newhdrs is the list of
             (name, value) headers to keep, recvtime is the parsed date of
             the first parseable Received header (or None), and is_our_list
             tells whether the message could be tied to one of our lists
    """
    newhdrs = list()
    to = list()
    cc = list()
    # Bare addresses already placed into To/Cc. The merged lists above hold
    # formatted "Name <addr>" strings, so they cannot be used for comparing.
    to_addrs = set()
    cc_addrs = set()
    recvtime = None
    is_our_list = False

    for hdrname, hdrval in list(msg._headers):  # noqa
        lhdrname = hdrname.lower()
        lhdrval = hdrval.lower()

        if not _is_wanted_header(lhdrname):
            continue

        if lhdrname == 'received':
            # noinspection PyBroadException
            try:
                if recvtime is None:
                    # Use the first Received header we find for the message date
                    # (for the purposes of knowing which mbox file to put it in)
                    recvtime = email.utils.parsedate_tz(hdrval.split(';')[-1].strip())
                # Only keep Received lines that mention one of our list addresses
                if any(eaddr in lhdrval for eaddr in eaddrs):
                    newhdrs.append((hdrname, hdrval))
            except Exception:  # noqa
                # Unparseable Received line; drop it and carry on
                pass

        elif lhdrname == 'list-id':
            if any(listid in lhdrval or fnmatch.fnmatch(lhdrval, listid) for listid in listids):
                newhdrs.append((hdrname, hdrval))
                is_our_list = True

        elif lhdrname == 'x-mailing-list':
            if any(listid in lhdrval for listid in listids):
                # Stick the list-id that's first in our collection,
                # since we assume that it's the canonical one. Use the same
                # bracketed form as the List-ID synthesized further below.
                newhdrs.append(('List-Id', '<%s>' % listids[0]))
                is_our_list = True

        # Malformed emails can have multiple to: and cc: fields. Merge
        # so there's one field for each header type. The merged To and Cc
        # headers are appended after all other headers have been processed.
        elif lhdrname == 'to':
            for pair in email.utils.getaddresses([hdrval]):
                if pair[1] in cc_addrs:
                    # already in Cc, so no need to add it to "To"
                    continue
                to_addrs.add(pair[1])
                to.append(formataddr(pair))

        elif lhdrname == 'cc':
            for pair in email.utils.getaddresses([hdrval]):
                if pair[1] in to_addrs:
                    # already in To, so no need to add it to CCs
                    continue
                cc_addrs.add(pair[1])
                cc.append(formataddr(pair))

        else:
            newhdrs.append((hdrname, hdrval))

    if to:
        newhdrs.append(('To', ', '.join(to)))

    if cc:
        newhdrs.append(('Cc', ', '.join(cc)))

    if not is_our_list:
        # Sometimes a message is cc'd to multiple mailing lists and the
        # archives only contain a copy of the message that was delivered to a
        # different list. E.g. something can be To: linux-mm@vger.kernel.org
        # and also Cc: linux-kernel@vger.kernel.org and we're looking for the
        # LKML list-id, the archive may only contain the copy that arrived to
        # linux-mm. We try to hedge for this by looking in the "To" and "Cc"
        # fields for any indication that this was intended for our mailing list.
        rcpts = [str(msg.get(hdr, '')).lower() for hdr in ('to', 'cc', 'resent-to')]
        if any(eaddr in rcpt for eaddr in eaddrs for rcpt in rcpts):
            # insert the list-id header
            # (assuming the first one in the list to be the canonical one)
            newhdrs.append(('List-ID', '<%s>' % listids[0]))
            is_our_list = True

    return newhdrs, recvtime, is_our_list


def process_archives(sources: List[str], outdir: str, knownset: Set[str], listids: List[str],
                     rejectsfile: str, asmaildir: bool, extrahdrs: List[Tuple[str, str]]) -> Set[str]:
    """Collect messages for our lists from sources and write sanitized copies.

    Every message in every source is checked against knownset by Message-ID
    (falling back to Resent-Message-ID), matched against the wanted lists,
    stripped down to the headers allowed by WANTHDRS and appended either to
    a per-month "YYYY-MM.mbx" file in outdir or to a single maildir.

    :param sources: paths to read; a trailing "/" means maildir, otherwise mbox
    :param outdir: directory for the monthly mbox files, or the maildir path
                   when asmaildir is set
    :param knownset: message-ids we already have; updated in place with
                     every message-id written
    :param listids: lowercased list-ids (or glob patterns) to collect; the
                    first one is used when a List-Id has to be synthesized
    :param rejectsfile: mbox path where skipped messages are saved with an
                        X-Import-Rejected-Reason header, or a false value
    :param asmaildir: write into one maildir instead of monthly mbox files
    :param extrahdrs: (name, value) headers added to each message before the
                      headers are filtered
    :return: set of message-ids written during this run
    """
    outboxes = dict()
    writecount = dict()
    seenids = set()

    if asmaildir:
        outboxes[outdir] = mailbox.Maildir(outdir)
        writecount[outdir] = 0

    # convert listids into email addresses by replacing the first '.' to '@'.
    # if you're working with a mailing list that has a non-standard list-id, you
    # can specify the list email address as part of the listids to satisfy this check.
    eaddrs = [listid if '@' in listid else listid.replace('.', '@', 1) for listid in listids]

    rejectsbox = mailbox.mbox(rejectsfile) if rejectsfile else None

    try:
        for sourcefile in sources:
            sys.stdout.write('Opening %s...' % os.path.basename(sourcefile))
            sys.stdout.flush()
            # If the filename ends with /, we treat as maildir
            if sourcefile.endswith('/'):
                inbox = mailbox.Maildir(sourcefile)
            else:
                inbox = mailbox.mbox(sourcefile)

            total = len(inbox)

            sys.stdout.write('%s messages\n' % total)
            sys.stdout.flush()

            counter = 0
            skipped = 0
            dupmsgid = 0
            nomsgid = 0
            notourlist = 0

            for msg in inbox:
                counter += 1
                sys.stdout.write('  %s/%s (%s skipped: %s dupmsgid, %s nomsgid, %s notourlist)\r' %
                                 (counter, total, skipped, dupmsgid, nomsgid, notourlist))
                sys.stdout.flush()

                msgid = msg['message-id']
                if msgid is None and msg.get('resent-message-id', ''):
                    msgid = msg['resent-message-id']

                if msgid is None:
                    # Huh, no message-id? Most likely, FOLDER-INTERNAL DATA marker or some other
                    # system message.
                    _reject(rejectsbox, msg, 'No Message-ID')
                    skipped += 1
                    nomsgid += 1
                    continue

                # The header lookup may hand back a Header object instead of
                # a str when the raw value contains 8-bit data, so coerce
                # before stripping.
                msgid = str(msgid).strip()
                if msgid in knownset:
                    # Duplicate Message-ID, either because we already have it in the known-ids,
                    # or because the inbox has messages with same IDs. There is no fix for the
                    # latter condition, so we just assume they got delivered multiple times and
                    # use the first one found.
                    _reject(rejectsbox, msg, 'Duplicate Message-ID')
                    skipped += 1
                    dupmsgid += 1
                    continue

                if extrahdrs:
                    # Added before filtering, so extra headers are subject to
                    # the same WANTHDRS and list matching rules as real ones
                    msg._headers += extrahdrs  # noqa

                newhdrs, recvtime, is_our_list = _sanitize_headers(msg, listids, eaddrs)

                if not is_our_list:
                    # Well, we tried everything
                    _reject(rejectsbox, msg, 'No matching List-ID')
                    skipped += 1
                    notourlist += 1
                    continue

                if asmaildir:
                    mboxname = outdir
                else:
                    msgdate = recvtime
                    if msgdate is None:
                        # fine, use the date in the message, even if it's bogus
                        # noinspection PyBroadException
                        try:
                            msgdate = email.utils.parsedate_tz(str(msg['Date']))
                        except Exception:  # noqa
                            msgdate = None

                    if msgdate is None:
                        # Neither Received nor Date could be parsed, so we
                        # cannot tell which monthly file this belongs to.
                        # Counted in "skipped" only.
                        _reject(rejectsbox, msg, 'Unparseable date')
                        skipped += 1
                        continue

                    mboxname = '%04d-%02d.mbx' % (msgdate[0], msgdate[1])

                msg._headers = newhdrs

                # do we have this mbox open already?
                outbox = outboxes.get(mboxname)
                if outbox is None:
                    outbox = mailbox.mbox(os.path.join(outdir, mboxname))
                    outboxes[mboxname] = outbox
                    writecount[mboxname] = 0

                # noinspection PyBroadException
                try:
                    outbox.add(msg.as_string(policy=EMLPOLICY).encode())
                    seenids.add(msgid)
                    knownset.add(msgid)
                    writecount[mboxname] += 1
                except Exception as ex:  # noqa
                    # Best effort: a message we cannot serialize is tossed,
                    # but say so instead of losing it silently.
                    sys.stderr.write('\n  Could not write %s: %s\n' % (msgid, ex))

            inbox.close()
            sys.stdout.write('  %s/%s (%s skipped: %s dupmsgid, %s nomsgid, %s notourlist)\n' %
                             (counter, total, skipped, dupmsgid, nomsgid, notourlist))
    finally:
        # Closing flushes the rejected messages to disk
        if rejectsbox is not None:
            rejectsbox.close()

    allboxes = sorted(outboxes)

    if allboxes:
        print()
        print('Summary')
        for mboxname in allboxes:
            # In maildir mode the only key is outdir itself
            boxpath = mboxname if asmaildir else os.path.join(outdir, mboxname)
            print('  %s: %s new (%s total)' %
                  (boxpath, writecount[mboxname], len(outboxes[mboxname])))
            outboxes[mboxname].close()
    else:
        print('No new messages found.')

    return seenids


def main(args: argparse.Namespace):
    """Run the collator with parsed command-line arguments.

    Validates the arguments, loads the known message-ids (from a plain text
    file with one id per line, or from the "msgmap" table of a ".sqlite3"
    database), processes all sources and, for plain text known-ids files,
    writes the updated set of ids back.

    :param args: namespace with source, exportdir, as_maildir, known_ids,
                 list_ids, rejected and extrahdrs attributes
    :raises SystemExit: with status 1 on invalid arguments, or status 0 when
                        there is no plain text known-ids file to update
    """
    # Validate everything before touching the filesystem
    if not args.source:
        print('You have to specify at least one source')
        sys.exit(1)

    extrahdrs = list()
    if args.extrahdrs:
        for hdr in args.extrahdrs:
            name, sep, val = hdr.partition(':')
            name = name.strip()
            val = val.strip()
            if not sep or not name:
                print('Invalid extra header "%s", expected "Name: value"' % hdr)
                sys.exit(1)
            # Headers with an empty value are silently ignored
            if val:
                extrahdrs.append((name, val))

    if not args.as_maildir:
        os.makedirs(args.exportdir, exist_ok=True)

    if args.known_ids and os.path.exists(args.known_ids):
        if args.known_ids.endswith('.sqlite3'):
            import sqlite3
            # detect_types must be passed by keyword: the second positional
            # parameter of sqlite3.connect() is the timeout.
            dbconn = sqlite3.connect(args.known_ids,
                                     detect_types=sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES)
            try:
                rows = dbconn.execute('SELECT mid FROM msgmap').fetchall()
            finally:
                dbconn.close()
            # The ids are wrapped in angle brackets so they compare equal to
            # Message-ID header values.
            knownids = {f'<{row[0]}>' for row in rows}
        else:
            with open(args.known_ids, 'r') as fh:
                knownids = set(fh.read().splitlines())
        print('Loaded %s message-ids from "%s"' % (len(knownids), args.known_ids))
    else:
        # should we load message-ids from existing mailboxes found in the export dir?
        # right now we're just appending to them, which is probably not expected behaviour.
        knownids = set()

    # Make list ID matching case-insensitive to match more mail
    if args.list_ids:
        listids = [listid.lower() for listid in args.list_ids]
    else:
        listids = ['*']

    newids = process_archives(args.source, args.exportdir, knownids, listids, args.rejected, args.as_maildir,
                              extrahdrs)

    # The sqlite3 database is only ever read, never written back
    if newids is None or not args.known_ids or args.known_ids.endswith('.sqlite3'):
        sys.exit(0)

    knownids.update(newids)
    with open(args.known_ids, 'w') as fh:
        fh.write('\n'.join(knownids))
    print('Wrote %s msgids into %s (%s new)' % (len(knownids), args.known_ids, len(newids)))


if __name__ == '__main__':
    # Command-line entry point: declare the options and hand the parsed
    # namespace to main().
    # noinspection PyTypeChecker
    parser = argparse.ArgumentParser(
        description="Make a mbox of LKML messages we haven't yet archived",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument('-s', '--source', nargs='+',
                        help=('Mbox file with archives, can be multiple. '
                              'Paths with trailing "/" will be treated as maildirs.'))
    # Not marked as required, otherwise the default could never take effect
    parser.add_argument('-e', '--exportdir', default='list-archives',
                        help='Export dir where to put sanitized archives')
    parser.add_argument('-m', '--as-maildir', action='store_true', default=False,
                        help='Export as maildir instead of mailboxes')
    parser.add_argument('-k', '--known-ids',
                        help='File with known Message-IDs (one per line, or msgmap.sqlite3)')
    parser.add_argument('-l', '--list-ids', nargs='+',
                        help='Limit to just these list-ids (can be multiple)')
    parser.add_argument('-r', '--rejected',
                        help='Mailbox file where to save messages that were rejected '
                             '(adds X-Import-Rejected-Reason header)')
    parser.add_argument('-x', '--extrahdrs', nargs='+', metavar='FULLHDR',
                        help='Extra headers to inject into each message')

    main(parser.parse_args())