#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# List archive collator and sanitizer
#
# The purpose of this script is to make a complete mailing list archive by
# collecting individual archives from individual list subscribers. It uses a
# list of known IDs to locate messages we don't already have in the
# archive, and sanitizes the headers to remove as much private
# information as possible. It also makes sure to consider messages
# that have the proper mailing list header, so you can aim it at any
# inbox to find relevant messages.
#
# Example usage:
#   list-archive-maker.py -s mail/lists/* -k known-msgids.list \
#       -l linux-kernel.vger.kernel.org -e collected
#
# The results will be written out into a "collected" dir in the YYYY-MM.mbx format.
# You can review these files to make sure the script did the right thing.
#
# Author: Konstantin Ryabitsev
#
import os
import sys
import mailbox
import email.utils
import email.policy
import fnmatch
import argparse

from typing import Tuple, List, Set, Optional

from email import charset

charset.add_charset('utf-8', None)
# Set our own policy
EMLPOLICY = email.policy.EmailPolicy(utf8=True, cte_type='8bit', max_line_length=None)

# Only retain the headers that are important to us
# must be lowercase for matching purposes.
# We treat "Received" headers with extra care for privacy, but if you
# want to exclude them entirely, you can remove them from this list.
# We also consider shell-globbing style wildcards.
WANTHDRS = {'return-path',
            'received',
            'sender',
            'from',
            'to',
            'cc',
            'subject',
            'date',
            'message-id',
            'resent-message-id',
            'reply-to',
            'in-reply-to',
            'references',
            'mime-*',
            'list-*',
            'content-*',
            'errors-to',
            'x-mailing-list',
            'resent-to',
            'dkim-*',
            'x-developer-*',
            }

__VERSION__ = '2.0'


def formataddr(pair: Tuple[str, str]) -> str:
    """Format a (realname, address) pair for use in an address header.

    Works like email.utils.formataddr(), but never raises on undecodable
    input: if the real name cannot be encoded it is dropped, and if the
    address itself is not ASCII the bare address is returned unchanged.
    """
    try:
        return email.utils.formataddr(pair)
    except UnicodeEncodeError:
        # This might happen if the realname is encoded in a broken way; just
        # drop the real name then.
        try:
            return email.utils.formataddr((None, pair[1]))
        except UnicodeEncodeError:
            # email.utils.formataddr() insists on an ASCII address; keep the
            # address as-is instead of aborting the whole run.
            return pair[1]


def _is_wanted_header(lhdrname: str) -> bool:
    """Return True if the lowercased header name matches a WANTHDRS pattern."""
    return any(fnmatch.fnmatch(lhdrname, hdrmatch) for hdrmatch in WANTHDRS)


def _reject(rejectsbox: Optional[mailbox.mbox], msg: mailbox.Message, reason: str) -> None:
    """Save msg to the rejects mailbox (when there is one), tagged with the reason."""
    if rejectsbox is None:
        return
    msg._headers.append(('X-Import-Rejected-Reason', reason))  # noqa
    rejectsbox.add(msg)


def _sanitize_headers(msg: mailbox.Message, listids: List[str],
                      eaddrs: List[str]) -> Tuple[List[Tuple[str, str]], Optional[tuple], bool]:
    """Build the sanitized header list for a message.

    Drops headers not matched by WANTHDRS and any Received: lines that do not
    mention one of the list email addresses, and merges multiple To:/Cc:
    headers into one of each.

    Returns a tuple of (newhdrs, recvtime, is_our_list), where recvtime is the
    parsed date of the first parseable Received header (or None) and
    is_our_list tells whether the message could be attributed to one of the
    lists we are collecting.
    """
    newhdrs = list()
    to = list()
    cc = list()
    # Bare addresses already placed in To/Cc; used for de-duplication, since
    # the to/cc lists themselves hold fully formatted "Name <addr>" strings.
    toaddrs = set()
    ccaddrs = set()
    recvtime = None
    is_our_list = False

    for hdrname, hdrval in list(msg._headers):  # noqa
        lhdrname = hdrname.lower()
        lhdrval = hdrval.lower()
        if not _is_wanted_header(lhdrname):
            continue

        if lhdrname == 'received':
            if recvtime is None:
                # Use the first Received header we find for the message date
                # (for the purposes of knowing which mbox file to put it in)
                try:
                    recvtime = email.utils.parsedate_tz(hdrval.split(';')[-1].strip())
                except Exception:  # noqa
                    # Unparseable timestamp; a later header may still work
                    pass
            # does hdrval contain one of our email addresses?
            if any(eaddr in lhdrval for eaddr in eaddrs):
                newhdrs.append((hdrname, hdrval))

        elif lhdrname == 'list-id':
            for listid in listids:
                if listid in lhdrval or fnmatch.fnmatch(lhdrval, listid):
                    newhdrs.append((hdrname, hdrval))
                    is_our_list = True
                    break

        elif lhdrname == 'x-mailing-list':
            if any(listid in lhdrval for listid in listids):
                # Stick the list-id that's first in our collection,
                # since we assume that it's the canonical one.
                # Wrapped in <> like the To/Cc fallback below.
                newhdrs.append(('List-Id', '<%s>' % listids[0]))
                is_our_list = True

        # Malformed emails can have multiple to: and cc: fields. Merge
        # so there's one field for each header type; the merged headers
        # are appended after all other retained headers.
        elif lhdrname == 'to':
            for pair in email.utils.getaddresses([hdrval]):
                if pair[1] in ccaddrs:
                    # already in Cc, so no need to add it to "To"
                    continue
                to.append(formataddr(pair))
                toaddrs.add(pair[1])

        elif lhdrname == 'cc':
            for pair in email.utils.getaddresses([hdrval]):
                if pair[1] in toaddrs:
                    # already in To, so no need to add it to CCs
                    continue
                cc.append(formataddr(pair))
                ccaddrs.add(pair[1])

        else:
            newhdrs.append((hdrname, hdrval))

    if to:
        newhdrs.append(('To', ', '.join(to)))
    if cc:
        newhdrs.append(('Cc', ', '.join(cc)))

    if not is_our_list:
        # Sometimes a message is cc'd to multiple mailing lists and the
        # archives only contain a copy of the message that was delivered to a
        # different list. E.g. something can be To: linux-mm@vger.kernel.org
        # and also Cc: linux-kernel@vger.kernel.org and we're looking for the
        # LKML list-id, the archive may only contain the copy that arrived to
        # linux-mm. We try to hedge for this by looking in the "To" and "Cc"
        # fields for any indication that this was intended for our mailing list.
        rcpthdrs = [str(msg.get(hdr, '')).lower() for hdr in ('to', 'cc', 'resent-to')]
        if any(eaddr in rcpt for eaddr in eaddrs for rcpt in rcpthdrs):
            # insert the list-id header
            # (assuming the first one in the list to be the canonical one)
            newhdrs.append(('List-ID', '<%s>' % listids[0]))
            is_our_list = True

    return newhdrs, recvtime, is_our_list


def process_archives(sources: List[str], outdir: str, knownset: Set[str], listids: List[str],
                     rejectsfile: Optional[str], asmaildir: bool,
                     extrahdrs: List[Tuple[str, str]]) -> Set[str]:
    """Collect new list messages from the sources into sanitized archives.

    sources:     paths to mbox files; paths ending with "/" are read as maildirs
    outdir:      directory receiving YYYY-MM.mbx files, or the maildir to write
                 to when asmaildir is set
    knownset:    message-ids we already have; updated in place with every
                 message-id written
    listids:     lowercased list-ids (or list email addresses) to accept
    rejectsfile: optional mbox path where rejected messages are saved with an
                 X-Import-Rejected-Reason header
    asmaildir:   write a single maildir instead of monthly mbox files
    extrahdrs:   (name, value) headers to add to every message before
                 sanitizing

    Returns the set of message-ids written during this run.
    """
    outboxes = dict()
    writecount = dict()
    seenids = set()
    if asmaildir:
        outboxes[outdir] = mailbox.Maildir(outdir)
        # Nothing written yet; counts only go up on a successful add
        writecount[outdir] = 0

    # convert listids into email addresses by replacing the first '.' to '@'.
    # if you're working with a mailing list that has a non-standard list-id, you
    # can specify the list email address as part of the listids to satisfy this check.
    eaddrs = [listid if '@' in listid else listid.replace('.', '@', 1) for listid in listids]

    rejectsbox = mailbox.mbox(rejectsfile) if rejectsfile else None

    status = ' %s/%s (%s skipped: %s dupmsgid, %s nomsgid, %s notourlist)'

    try:
        for sourcefile in sources:
            # Strip the trailing "/" of maildir paths so basename isn't empty
            sys.stdout.write('Opening %s...' % os.path.basename(sourcefile.rstrip('/')))
            sys.stdout.flush()
            # If the filename ends with /, we treat as maildir
            if sourcefile.endswith('/'):
                inbox = mailbox.Maildir(sourcefile)
            else:
                inbox = mailbox.mbox(sourcefile)

            total = len(inbox)
            sys.stdout.write('%s messages\n' % total)
            sys.stdout.flush()

            counter = 0
            skipped = 0
            dupmsgid = 0
            nomsgid = 0
            notourlist = 0

            for msg in inbox:
                counter += 1
                sys.stdout.write((status + '\r') % (counter, total, skipped, dupmsgid, nomsgid, notourlist))
                sys.stdout.flush()

                msgid = msg['message-id']
                if msgid is None and msg.get('resent-message-id', ''):
                    msgid = msg['resent-message-id']
                if msgid is None:
                    # Huh, no message-id? Most likely, FOLDER-INTERNAL DATA marker or some other
                    # system message.
                    _reject(rejectsbox, msg, 'No Message-ID')
                    skipped += 1
                    nomsgid += 1
                    continue

                msgid = msgid.strip()
                if msgid in knownset:
                    # Duplicate Message-ID, either because we already have it in the known-ids,
                    # or because the inbox has messages with same IDs. There is no fix for the
                    # latter condition, so we just assume they got delivered multiple times and
                    # use the first one found.
                    _reject(rejectsbox, msg, 'Duplicate Message-ID')
                    skipped += 1
                    dupmsgid += 1
                    continue

                if extrahdrs:
                    msg._headers.extend(extrahdrs)  # noqa

                # Remove headers not in WANTHDRS list and any Received:
                # lines that do not mention the list email address
                newhdrs, recvtime, is_our_list = _sanitize_headers(msg, listids, eaddrs)

                if not is_our_list:
                    # Well, we tried everything
                    _reject(rejectsbox, msg, 'No matching List-ID')
                    skipped += 1
                    notourlist += 1
                    continue

                msgdate = recvtime
                if msgdate is None:
                    # fine, use the date in the message, even if it's bogus
                    msgdate = email.utils.parsedate_tz(str(msg['Date']))

                if asmaildir:
                    mboxname = outdir
                elif msgdate is None:
                    # Neither Received nor Date could be parsed, so we cannot
                    # tell which monthly mbox this belongs to.
                    _reject(rejectsbox, msg, 'No usable date')
                    skipped += 1
                    continue
                else:
                    mboxname = '%04d-%02d.mbx' % (msgdate[0], msgdate[1])

                msg._headers = newhdrs

                # do we have this mbox open already?
                if mboxname in outboxes:
                    outbox = outboxes[mboxname]
                else:
                    outbox = mailbox.mbox(os.path.join(outdir, mboxname))
                    outboxes[mboxname] = outbox
                    writecount[mboxname] = 0

                try:
                    outbox.add(msg.as_string(policy=EMLPOLICY).encode())
                except Exception:  # noqa
                    # Oh well, toss it (best effort: a message we cannot
                    # serialize must not abort the whole import)
                    skipped += 1
                    continue
                seenids.add(msgid)
                knownset.add(msgid)
                writecount[mboxname] += 1

            inbox.close()
            sys.stdout.write((status + '\n') % (counter, total, skipped, dupmsgid, nomsgid, notourlist))
    finally:
        if rejectsbox is not None:
            # close() also flushes pending writes to the rejects file
            rejectsbox.close()

    allboxes = sorted(outboxes)

    if allboxes:
        print()
        print('Summary')
        for mboxname in allboxes:
            # In maildir mode the key already is the output path
            label = mboxname if asmaildir else os.path.join(outdir, mboxname)
            print(' %s: %s new (%s total)' %
                  (label, writecount[mboxname], len(outboxes[mboxname])))
            outboxes[mboxname].close()
    else:
        print('No new messages found.')

    return seenids


def main(args: argparse.Namespace):
    """Run the collator using the parsed command-line arguments."""
    # Validate before touching the filesystem
    if not args.source:
        print('You have to specify at least one source')
        sys.exit(1)

    if not args.as_maildir and not os.path.isdir(args.exportdir):
        os.makedirs(args.exportdir, exist_ok=True)

    if args.known_ids and os.path.exists(args.known_ids):
        if args.known_ids.endswith('.sqlite3'):
            import sqlite3
            # detect_types must be passed by keyword: the second positional
            # parameter of sqlite3.connect() is the timeout.
            dbconn = sqlite3.connect(args.known_ids,
                                     detect_types=sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES)
            try:
                rows = dbconn.execute('SELECT mid FROM msgmap').fetchall()
            finally:
                dbconn.close()
            knownids = {f'<{x[0]}>' for x in rows}
        else:
            with open(args.known_ids, 'r') as fh:
                knownids = set(fh.read().splitlines())
        print('Loaded %s message-ids from "%s"' % (len(knownids), args.known_ids))
    else:
        # should we load message-ids from existing mailboxes found in the export dir?
        # right now we're just appending to them, which is probably not expected behaviour.
        knownids = set()

    # Make list ID matching case-insensitive to match more mail
    if args.list_ids:
        listids = [listid.lower() for listid in args.list_ids]
    else:
        listids = ['*']

    extrahdrs = list()
    for hdr in args.extrahdrs or []:
        name, sep, val = hdr.partition(':')
        if not sep:
            print('Invalid extra header (expected "Name: value"): %s' % hdr)
            sys.exit(1)
        # Headers with an empty value are silently ignored
        if val.strip():
            extrahdrs.append((name.strip(), val.strip()))

    newids = process_archives(args.source, args.exportdir, knownids, listids, args.rejected, args.as_maildir,
                              extrahdrs)

    # The sqlite3 database is read-only for us; only plain files are updated
    if newids is None or not args.known_ids or args.known_ids.endswith('.sqlite3'):
        sys.exit(0)

    knownids.update(newids)
    with open(args.known_ids, 'w') as fh:
        fh.write('\n'.join(knownids))
    print('Wrote %s msgids into %s (%s new)' % (len(knownids), args.known_ids, len(newids)))


if __name__ == '__main__':
    # noinspection PyTypeChecker
    parser = argparse.ArgumentParser(
        description="Make a mbox of LKML messages we haven't yet archived",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument('-s', '--source', nargs='+',
                        help=('Mbox file with archives, can be multiple. '
                              'Paths with trailing "/" will be treated as maildirs.'))
    parser.add_argument('-e', '--exportdir', required=True, default='list-archives',
                        help='Export dir where to put sanitized archives')
    parser.add_argument('-m', '--as-maildir', action='store_true', default=False,
                        help='Export as maildir instead of mailboxes')
    parser.add_argument('-k', '--known-ids',
                        help='File with known Message-IDs (one per line, or msgmap.sqlite3)')
    parser.add_argument('-l', '--list-ids', nargs='+',
                        help='Limit to just these list-ids (can be multiple)')
    parser.add_argument('-r', '--rejected',
                        help='Mailbox file where to save messages that were rejected '
                             '(adds X-Import-Rejected-Reason header)')
    parser.add_argument('-x', '--extrahdrs', nargs='+', metavar='FULLHDR',
                        help='Extra headers to inject into each message')

    main(parser.parse_args())