diff options
author | Konstantin Ryabitsev <konstantin@linuxfoundation.org> | 2020-10-20 12:59:44 -0400 |
---|---|---|
committer | Konstantin Ryabitsev <konstantin@linuxfoundation.org> | 2020-10-20 12:59:44 -0400 |
commit | 35fa657cf9ee91aea540317b823fb3485fad0bd3 (patch) | |
tree | 291c39e9fab7540d89972ce2875e37c624cc7085 | |
parent | 1d54a8c8a09eacdc2ed1213afaa8639b7393b6ef (diff) | |
download | korg-helpers-35fa657cf9ee91aea540317b823fb3485fad0bd3.tar.gz |
Split list-archive-maker into two scripts
Move all operations for collecting remote archives into
list-archive-collector.py, adding a way to import marc.info archives. We
also add several other useful options to list-archive-collector:
- With -k, we can check if we already have a copy of that message on
lore.kernel.org and backfill many of the important headers that are
missing from pipermail and (especially) marc.info sources.
- With -s and presence of /usr/bin/spamc, we'll check the message for
spam before accepting it
- The generated .mbx file can then be used with list-archive-maker.py to
create the final archive sources for importing into public-inbox.
Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
-rw-r--r-- | .gitignore | 2 | ||||
-rwxr-xr-x | list-archive-collector.py | 588 | ||||
-rwxr-xr-x | list-archive-maker.py | 281 |
3 files changed, 661 insertions, 210 deletions
@@ -1 +1,3 @@ .idea +*.swp +*~ diff --git a/list-archive-collector.py b/list-archive-collector.py new file mode 100755 index 0000000..4e3a81f --- /dev/null +++ b/list-archive-collector.py @@ -0,0 +1,588 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# List archive collector +# +# This is a tool to collect archive from networked non-mbox sources, such as: +# - mailman +# - marc.info +# - nntp +# +# After the archives are collected, you can feed them to list-archive-maker.py +# +# Author: Konstantin Ryabitsev <konstantin@linuxfoundation.org> +# + +import os +import sys +import mailbox +import email.utils +import email.policy +import time +import re +import quopri +import base64 +import gzip +import nntplib +import requests +import logging +import subprocess + +try: + import cchardet as chardet # noqa +except ImportError: + import chardet + +from tempfile import mkstemp +from bs4 import BeautifulSoup # noqa +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry + +from email import charset +charset.add_charset('utf-8', charset.SHORTEST, '8bit') # noqa + +# Used for our requests session +REQSESSION = None + +__VERSION__ = '1.0' +# Where des marc.info live? 
+MARCURL = 'https://marc.info' +# Wait this many seconds between requests to marc.info, to avoid triggering +# anti-abuse blocks (and to just be nice) +MARCNICE = 1 + +# Set our own policy +EMLPOLICY = email.policy.EmailPolicy(utf8=True, cte_type='8bit', max_line_length=None) + +logger = logging.getLogger(__name__) + + +def get_requests_session(): + global REQSESSION + if REQSESSION is None: + REQSESSION = requests.session() + retry = Retry(connect=3, backoff_factor=1) + adapter = HTTPAdapter(max_retries=retry) + REQSESSION.mount('http://', adapter) + REQSESSION.mount('https://', adapter) + headers = { + 'User-Agent': f'lore-archive-maker/{__VERSION__}', + } + REQSESSION.headers.update(headers) + + return REQSESSION + + +def lore_get_message(msgid): + # See where we're redirected + rurl = f'https://lore.kernel.org/r/{msgid}' + rses = get_requests_session() + resp = rses.head(rurl) + if resp.status_code < 300 or resp.status_code > 400: + # Not known on lore + return None + # Pop msgid from the end of the redirect + msgurl = resp.headers['Location'] + 'raw' + resp.close() + resp = rses.get(msgurl) + msg = email.message_from_bytes(resp.content) + logger.info(' found on lore') + return msg + + +# Turned off for now +def patchwork_get_headers(msgid): + url = f'https://patchwork.kernel.org/api/1.2/patches/' + params = [ + ('msgid', msgid), + ] + rses = get_requests_session() + resp = rses.get(url, params=params, stream=False) + if resp.status_code > 200: + return None + + jj = resp.json() + if not len(jj): + return None + + # we only care about one + p_id = jj[0].get('id') + resp = rses.get(f'{url}{p_id}', stream=False) + if resp.status_code > 200: + return None + + logger.info(' found on patchwork') + jj = resp.json() + return jj.get('headers') + + +def lookaside_fillin(msg): + wanthdrs = [ + 'To', + 'Cc', + 'References', + 'In-Reply-To', + 'User-Agent', + 'X-Mailer', + ] + msgid = msg.get('Message-Id').strip('<>') + lmsg = lore_get_message(msgid) + if not lmsg: + 
return False + # lmsg = patchwork_get_headers(msgid) + # if not lmsg: + # return False + + for wanthdr in wanthdrs: + if not msg.get(wanthdr) and lmsg.get(wanthdr): + msg[wanthdr] = lmsg.get(wanthdr) + + return True + + +def marc_get_message(marc_list_id, msgnum, listid, toaddr, lookaside): + rses = get_requests_session() + url = f'{MARCURL}/?l={marc_list_id}&m={msgnum}&q=mbox' + logger.info(' grabbing message %s', msgnum) + resp = rses.get(url) + rawmsg = resp.content + multipart = False + if rawmsg.find(b'\nContent-Type: multipart/mixed;') > 0: + multipart = True + # marc.info breaks MIME by incorrectly writing boundary headers + rawmsg = rawmsg.replace(b'\nContent-Type: multipart/mixed; boundary="--', + b'\nContent-Type: multipart/mixed; boundary="', 1) + # We don't need to fix charset for multipart/mixed messages + + msg = email.message_from_bytes(rawmsg) + if not msg.get('Message-Id'): + logger.info(' No message-id, ignored') + # Can't use it anyway + return None + + hdrs = list() + + for hdrname, hdrval in list(msg._headers): # noqa + if hdrname == 'To': + # Useless, we throw it out + continue + elif hdrval.find(' () ') and (hdrval.find(' ! ') or hdrval.find('<')): + # marc.info mangles @ and . in email addresses with + # the above values. Unmangle them back. + hdrval = hdrval.replace(' () ', '@').replace(' ! 
', '.') + hdrs.append((hdrname, hdrval)) + msg._headers = hdrs # noqa + + # Marc.info removes content-transfer-encoding headers, so try to figure out + # what format the raw message is in before trying to add it to the mailbox + if not multipart: + payload = msg.get_payload(decode=True) + # Try to base64 decode it first + dec = None + try: + dec = base64.b64decode(payload, validate=True) + if dec != payload: + msg.set_payload(dec) + except: # noqa + pass + + if not dec: + try: + dec = quopri.decodestring(payload) + if dec != payload: + msg.set_payload(dec) + except ValueError: + pass + + if listid: + msg['List-Id'] = f'<{listid}>' + + if lookaside: + lookaside_fillin(msg) + + if not msg.get('To'): + msg['To'] = toaddr + + return msg + + +def check_if_spam(bmsg): + if not os.path.exists('/usr/bin/spamc'): + return False + + logger.info(' checking for spam') + args = ['/usr/bin/spamc', '-c'] + logger.debug('Running %s' % ' '.join(args)) + + pp = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + pp.communicate(input=bmsg) + if pp.returncode == 0: + return False + + return True + + +def add_msg_to_mbx(msg, mbx, checkspam): + if msg.get_default_type() == 'text/plain': + try: + payload = msg.get_payload(decode=True) + if payload: + msg.set_charset(chardet.detect(payload)['encoding']) + except: # noqa + # This may fail for various reasons having to do with the wonderful world + # of 8bit content and legacy encodings. + # Ignore and hope we can still as_string below. + pass + + try: + bmsg = msg.as_string(policy=EMLPOLICY).encode() + if checkspam and check_if_spam(bmsg): + logger.info(' spam: %s', msg['Subject']) + return + + mbx.add(bmsg) + except: # noqa + # Throw it out, because life is too short to figure out all possible ways + # that decades-old email messages make python break. 
+ logger.info(' corrupted: %s', msg['Subject']) + return + + +def marc_get_full_thread(marc_list_id, thread_id): + cp = 1 + rses = get_requests_session() + msgnums = list() + logger.info('Grabbing thread %s', thread_id) + while True: + lastpage = True + np = cp + 1 + nl = f'r={np}&' + # Be nice + time.sleep(MARCNICE) + url = f'{MARCURL}/?t={thread_id}&r={cp}&w=1' + rsp = rses.get(url) + soup = BeautifulSoup(rsp.content, features='lxml') + for tag in soup.find_all('a'): + href = tag.attrs.get('href') + if not href: + continue + # See if it's a link to the next page + if href.find(nl) >= 0: + lastpage = False + continue + # Is it from the wrong list? + if href.find(marc_list_id) < 0: + continue + + match = re.search(r'm=(\d+)\D', href) + if match: + msgnums.append(match.groups()[0]) + continue + + if lastpage: + break + cp += 1 + logger.info('\t... page %s', cp) + + return msgnums + + +def parse_pipermail_index(pipermail_url): + logger.info('Grabbing pipermail index from %s', pipermail_url) + rses = get_requests_session() + resp = rses.get(pipermail_url) + index = resp.content + + soup = BeautifulSoup(index, features='lxml') + + mboxes = [] + for tag in soup.find_all('a'): + # we are looking for a href that ends with .txt.gz + if 'href' in tag.attrs and tag.attrs['href'][-7:] == '.txt.gz': + mboxes.append(os.path.join(pipermail_url, tag.attrs['href'])) + + return mboxes + + +def grab_pipermail_archive(pipermail_url, mbx, listid, toaddr, lookaside, checkspam): + tmpfile = mkstemp('pipermail')[1] + chunks = pipermail_url.split('/') + + logger.info(' grabbing %s', chunks[-1]) + rses = get_requests_session() + resp = rses.get(pipermail_url, stream=True) + + with gzip.GzipFile(fileobj=resp.raw) as uncompressed: + # XXX: this can be horribly large + mboxdata = uncompressed.read().decode('utf-8', errors='replace') + + resp.close() + + # Pipermail does a nasty thing where it doesn't properly handle + # lines in the body that start with "From ". 
First, we add ">" to + # all lines starting with "From " and then fix some of them in the + # next step. + logger.info(' demangling %s', chunks[-1]) + regex = r'^From ' + subst = '>From ' + mboxdata = re.sub(regex, subst, mboxdata, 0, re.MULTILINE) + # Fix pipermail mangling where it changes some email addresses + # to be ' at ' instead of '@'. This is easiest to do with a + # handful of regexes than via actual message body manipulation + # as parf of the python's email.message object + regex = r'(<[^>]+) at ([^>]+>)' + subst = '\\1@\\2' + mboxdata = re.sub(regex, subst, mboxdata, 0, re.MULTILINE) + regex = r'^>?(From:? \S+) at (\S+\..*)' + mboxdata = re.sub(regex, subst, mboxdata, 0, re.MULTILINE) + + with open(tmpfile, 'wb') as out_fh: + out_fh.write(mboxdata.encode()) + + # Open it now as a mailbox + tmpmbx = mailbox.mbox(tmpfile) + for msg in tmpmbx: + logger.info(' processing: %s', msg.get('Message-Id')) + # Fix bogus From: foo@bar.baz (Foo Barski) -> Foo Barski <foo@bar.baz> + fromline = msg.get('From') + matches = re.search(r'(\S+@\S+\.\S+) \((.*)\)$', fromline) + if matches: + gr = matches.groups() + msg.replace_header('From', f'{gr[1]} <{gr[0]}>') + + if listid: + msg['List-Id'] = f'<{listid}>' + + if lookaside: + lookaside_fillin(msg) + + if not msg.get('To'): + msg['To'] = toaddr + + add_msg_to_mbx(msg, mbx, checkspam) + + tmpmbx.close() + os.unlink(tmpfile) + + +def get_marcinfo(args): + global MARCNICE + + if args.nice < 0.5: + logger.critical('Hitting marc.info every %s s will get you auto-banned. 
Try above 0.5.', args.nice) + sys.exit(1) + MARCNICE = args.nice + + if not args.to: + args.to = args.listid.replace('.', '@', 1) + + marc_list_id = args.listname + + rses = get_requests_session() + url = f'{MARCURL}/?l={marc_list_id}&w=1' + logger.info('Grabbing main index for %s', marc_list_id) + + rsp = rses.get(url, stream=False) + soup = BeautifulSoup(rsp.content, features='lxml') + months = list() + for tag in soup.find_all('a'): + # we are looking for a href that contains + href = tag.attrs.get('href') + if not href: + continue + match = re.search(r'b=(\d+)\D', href) + if match: + months.append(match.groups()[0]) + + thdnums = set() + msgnums = set() + for month in months: + logger.info('Grabbing month %s', month) + # We may be paginated + cp = 1 + while True: + lastpage = True + # Be nice + np = cp + 1 + time.sleep(MARCNICE) + url = f'{MARCURL}/?l={marc_list_id}&b={month}&r={cp}&w=1' + if cp > 1: + logger.info(' ... page %s', cp) + rsp = rses.get(url) + soup = BeautifulSoup(rsp.content, features='lxml') + for tag in soup.find_all('a'): + href = tag.attrs.get('href') + if not href: + continue + # See if it's a link to the next page + telltale = f'r={np}&' + if href.find(telltale) >= 0: + lastpage = False + continue + + # Is it a message link? + match = re.search(r'm=(\d+)\D', href) + if match: + msgnums.add(match.groups()[0]) + continue + + # Is it a thread link? 
+ match = re.search(r't=(\d+)\D', href) + if match: + thdnums.add(match.groups()[0]) + continue + + if lastpage: + break + cp += 1 + + mbx = mailbox.mbox(args.out) + for thdnum in thdnums: + tnums = marc_get_full_thread(marc_list_id, thdnum) + # last message starts the thread + tnums.reverse() + irt = None + for tnum in tnums: + if tnum in msgnums: + msgnums.remove(tnum) + time.sleep(MARCNICE) + msg = marc_get_message(marc_list_id, tnum, args.listid, args.to, args.lookaside) + if not msg: + continue + + if not irt: + irt = msg.get('Message-Id') + elif not msg.get('References'): + msg['References'] = irt + msg['In-Reply-To'] = irt + + add_msg_to_mbx(msg, mbx, args.checkspam) + + logger.info('Grabbing remaining unthreaded messages') + for msgnum in msgnums: + time.sleep(MARCNICE) + msg = marc_get_message(marc_list_id, msgnum, args.listid, args.to, args.lookaside) + if not msg: + continue + + add_msg_to_mbx(msg, mbx, args.checkspam) + + mbx.close() + + +def get_mailman(args): + if not args.to: + args.to = args.listid.replace('.', '@', 1) + + months = parse_pipermail_index(args.url) + if not months: + print('Could not find any .txt.gz files listed at %s' % args.url) + sys.exit(1) + mbx = mailbox.mbox(args.out) + for month in months: + grab_pipermail_archive(month, mbx, args.listid, args.to, args.lookaside, args.checkspam) + + +def get_nntp(args): + # Expect in format nntp://news.gmane.org/gmane.linux.network + logger.info('Connecting to %s', args.url) + chunks = args.url.split('/') + server, group = chunks[-2:] + nntplib._MAXLINE = 1 << 20 + server = nntplib.NNTP(server) + resp, count, first, last, name = server.group(group) + total = int(last) + + mbx = mailbox.mbox(args.out) + aid = 1 + while aid <= total: + try: + nresp, nainfo = server.article(aid) + msg = email.message_from_bytes(b'\n'.join(nainfo[2])) + logger.info(' processing: %s, %s/%s', msg.get('Message-Id'), aid, total) + newhdrs = list() + for hdrname, hdrval in list(msg._headers): # noqa + if 
hdrname.find('Original-') == 0: + hdrname = hdrname.replace('Original-', '') + newhdrs.append((hdrname, hdrval)) + msg._headers = newhdrs # noqa + if args.listid: + try: + msg.replace_header('List-Id', f'<{args.listid}>') + except KeyError: + msg.add_header('List-Id', f'<{args.listid}') + + add_msg_to_mbx(msg, mbx, args.checkspam) + + except nntplib.NNTPTemporaryError: + # Ignore one-off article failures -- probably deletes + pass + finally: + aid += 1 + + mbx.close() + + +if __name__ == '__main__': + import argparse + + # noinspection PyTypeChecker + parser = argparse.ArgumentParser( + description="Collect external mail archives into a local mbox", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument('-d', '--debug', action='store_true', default=False, + help='Add more debugging info to the output') + parser.add_argument('-i', '--listid', + help='List-Id header to inject into the messages') + parser.add_argument('-t', '--to', + help='Value to put into the To: header, if missing ' + '(defaults to list-id with first . 
replaced with @') + parser.add_argument('-k', '--lookaside', action='store_true', default=False, + help='Attempt to look up matching lore messages for missing to/cc headers') + parser.add_argument('-s', '--checkspam', action='store_true', default=False, + help='Run spamc to check messages for spam before adding') + parser.add_argument('-o', '--out', required=True, + help='Filename of the mailbox file to write out') + + subparsers = parser.add_subparsers(help='sub-command help', dest='subcmd') + + sp_mm = subparsers.add_parser('mailman', help='Collect mailman archives', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + sp_mm.add_argument('-u', '--url', required=True, + help='Mailman archive index URL') + sp_mm.set_defaults(func=get_mailman) + + sp_marc = subparsers.add_parser('marcinfo', help='Collect marc.info archives', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + sp_marc.add_argument('-l', '--listname', required=True, + help='Marc.info list name (?l= parameter)') + sp_marc.add_argument('-n', '--nice', default=MARCNICE, type=float, + help='Seconds to sleep between requests') + sp_marc.set_defaults(func=get_marcinfo) + + sp_nntp = subparsers.add_parser('nntp', help='Collect NNTP archives', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + sp_nntp.add_argument('-u', '--url', required=True, + help='NNTP url (e.g. 
nntp://news.gmane.com/gmane.linux.kernel') + sp_nntp.set_defaults(func=get_nntp) + + cmdargs = parser.parse_args() + logger.setLevel(logging.DEBUG) + + ch = logging.StreamHandler() + formatter = logging.Formatter('%(message)s') + ch.setFormatter(formatter) + + if cmdargs.debug: + ch.setLevel(logging.DEBUG) + else: + ch.setLevel(logging.INFO) + + logger.addHandler(ch) + + if 'func' not in cmdargs: + parser.print_help() + sys.exit(1) + + cmdargs.func(cmdargs) diff --git a/list-archive-maker.py b/list-archive-maker.py index 65bf2ca..191a504 100755 --- a/list-archive-maker.py +++ b/list-archive-maker.py @@ -25,10 +25,15 @@ import os import sys import mailbox import email.utils -import time -import re +import email.policy import fnmatch +from email import charset +charset.add_charset('utf-8', charset.SHORTEST, '8bit') # noqa + +# Set our own policy +EMLPOLICY = email.policy.EmailPolicy(utf8=True, cte_type='8bit', max_line_length=None) + # Only retain the headers that are important to us # must be lowercase for matching purposes. # We treat "Received" headers with extra care for privacy, but if you @@ -55,8 +60,10 @@ WANTHDRS = {'return-path', 'resent-to', } +__VERSION__ = '2.0' + -def main(sources, outdir, msgids, listids, rejectsfile): +def process_archives(sources, outdir, msgids, listids, rejectsfile): outboxes = {} writecount = {} seenids = [] @@ -77,58 +84,15 @@ def main(sources, outdir, msgids, listids, rejectsfile): rejectsbox = mailbox.mbox(rejectsfile) for sourcefile in sources: - is_pipermail = False - is_nntp = False - - # do you have a '://' in you? - if sourcefile.find('://') > 0: - if sourcefile.find('nntp://') == 0: - is_nntp = True - else: - is_pipermail = True - - if is_nntp: - # Expect in format nntp://news.gmane.org/gmane.linux.network - sys.stdout.write('Connecting to %s...' 
% sourcefile) - chunks = sourcefile.split('/') - server, group = chunks[-2:] - import nntplib - nntplib._MAXLINE = 1 << 20 - server = nntplib.NNTP(server) - resp, count, first, last, name = server.group(group) - total = int(last) - - def nntp_msg_gen(last): - aid = 1 - while aid <= last: - try: - resp, ainfo = server.article(aid) - message = email.message_from_bytes(b'\n'.join(ainfo[2])) - yield message - except nntplib.NNTPTemporaryError: - # Ignore one-off article failures -- probably deletes - pass - finally: - aid += 1 - - inbox = nntp_msg_gen(total) - + sys.stdout.write('Opening %s...' % os.path.basename(sourcefile)) + sys.stdout.flush() + # If the filename ends with /, we treat as maildir + if sourcefile[-1] == '/': + inbox = mailbox.Maildir(sourcefile) else: - if is_pipermail: - sourcefile = grab_pipermail_archive(sourcefile, outdir) - sys.stdout.write('parsing...') - sys.stdout.flush() - inbox = mailbox.mbox(sourcefile) - else: - sys.stdout.write('Opening %s...' % os.path.basename(sourcefile)) - sys.stdout.flush() - # If the filename ends with /, we treat as maildir - if sourcefile[-1] == '/': - inbox = mailbox.Maildir(sourcefile) - else: - inbox = mailbox.mbox(sourcefile) + inbox = mailbox.mbox(sourcefile) - total = len(inbox) + total = len(inbox) sys.stdout.write('%s messages\n' % total) sys.stdout.flush() @@ -153,7 +117,7 @@ def main(sources, outdir, msgids, listids, rejectsfile): # Huh, no message-id? Most likely, FOLDER-INTERNAL DATA marker or some other # system message. if rejectsfile: - msg._headers.append(('X-Import-Rejected-Reason', 'No Message-ID')) + msg._headers.append(('X-Import-Rejected-Reason', 'No Message-ID')) # noqa rejectsbox.add(msg) skipped += 1 nomsgid += 1 @@ -166,7 +130,7 @@ def main(sources, outdir, msgids, listids, rejectsfile): # latter condition, so we just assume they got delivered multiple times and # use the first one found. 
if rejectsfile: - msg._headers.append(('X-Import-Rejected-Reason', 'Duplicate Message-ID')) + msg._headers.append(('X-Import-Rejected-Reason', 'Duplicate Message-ID')) # noqa rejectsbox.add(msg) skipped += 1 dupmsgid += 1 @@ -179,12 +143,8 @@ def main(sources, outdir, msgids, listids, rejectsfile): cc = '' recvtime = None is_our_list = False - for hdrname, hdrval in list(msg._headers): + for hdrname, hdrval in list(msg._headers): # noqa lhdrname = hdrname.lower() - if is_nntp and lhdrname.find('original-') == 0: - lhdrname = lhdrname.replace('original-', '') - hdrname = hdrname.replace('Original-', '') - lhdrval = hdrval.lower() wanthdr = False for hdrmatch in WANTHDRS: @@ -212,20 +172,20 @@ def main(sources, outdir, msgids, listids, rejectsfile): pass elif lhdrname == 'list-id': - for listid in listids: - if lhdrval.find(listid) >= 0: - newhdrs.append((hdrname, hdrval)) - is_our_list = True - break + for listid in listids: + if lhdrval.find(listid) >= 0: + newhdrs.append((hdrname, hdrval)) + is_our_list = True + break elif lhdrname == 'x-mailing-list': - for listid in listids: - if lhdrval.find(listid) >= 0: - # Stick the list-id that's first in our collection, - # since we assume that it's the canonical one - newhdrs.append(('List-Id', listids[0])) - is_our_list = True - break + for listid in listids: + if lhdrval.find(listid) >= 0: + # Stick the list-id that's first in our collection, + # since we assume that it's the canonical one + newhdrs.append(('List-Id', listids[0])) + is_our_list = True + break # Malformed emails can have multiple to: and cc: fields. Merge # so there's one field for each header type. @@ -271,31 +231,20 @@ def main(sources, outdir, msgids, listids, rejectsfile): # LKML list-id, the archive may only contain the copy that arrived to # linux-mm. We try to hedge for this by looking in the "To" and "Cc" # fields for any indication that this was intended for our mailing list. 
- if is_pipermail: - # Pipermail doesn't preserve the List-Id nor "To" headers, - # so put them back in place - newhdrs.append(('To', eaddrs[0])) - newhdrs.append(('List-Id', listids[0])) - is_our_list = True - elif is_nntp: - # We assume everything in the newsgroup matches our first list-id - newhdrs.append(('List-Id', listids[0])) - is_our_list = True - else: - for eaddr in eaddrs: - if (str(msg.get('to', '')).lower().find(eaddr) >= 0 or - str(msg.get('cc', '')).lower().find(eaddr) >= 0 or - str(msg.get('resent-to', '')).lower().find(eaddr) >= 0): - # insert the list-id header - # (assuming the first one in the list to be the canonical one) - newhdrs.append(('List-ID', '<%s>' % listids[0])) - is_our_list = True - break + for eaddr in eaddrs: + if (str(msg.get('to', '')).lower().find(eaddr) >= 0 or + str(msg.get('cc', '')).lower().find(eaddr) >= 0 or + str(msg.get('resent-to', '')).lower().find(eaddr) >= 0): + # insert the list-id header + # (assuming the first one in the list to be the canonical one) + newhdrs.append(('List-ID', '<%s>' % listids[0])) + is_our_list = True + break if not is_our_list: # Well, we tried everything if rejectsfile: - msg._headers.append(('X-Import-Rejected-Reason', 'No matching List-ID')) + msg._headers.append(('X-Import-Rejected-Reason', 'No matching List-ID')) # noqa rejectsbox.add(msg) skipped += 1 notourlist += 1 @@ -309,8 +258,6 @@ def main(sources, outdir, msgids, listids, rejectsfile): msgdate = email.utils.parsedate_tz(str(msg['Date'])) mboxname = '%04d-%02d.mbx' % (msgdate[0], msgdate[1]) - if is_nntp: - msg.set_unixfrom('From nntp@import %s' % time.strftime('%c', msgdate[:9])) # do we have this mbox open already? 
if mboxname in outboxes: @@ -321,14 +268,11 @@ def main(sources, outdir, msgids, listids, rejectsfile): outboxes[mboxname] = outbox writecount[mboxname] = 1 - outbox.add(msg) + outbox.add(msg.as_string(policy=EMLPOLICY).encode()) seenids.append(msgid) knownset.add(msgid) inbox.close() - if is_pipermail: - os.unlink(sourcefile) - sys.stdout.write(' %s/%s (%s skipped: %s dupmsgid, %s nomsgid, %s notourlist)\n' % (counter, total, skipped, dupmsgid, nomsgid, notourlist)) @@ -347,99 +291,7 @@ def main(sources, outdir, msgids, listids, rejectsfile): return None -def parse_pipermail_index(pipermail_url): - try: - from bs4 import BeautifulSoup - except ImportError as ex: - print('You need to install python-beautifulsoup4 to parse pipermail URLs') - print(ex) - sys.exit(1) - - print('Grabbing the pipermail index from %s' % pipermail_url) - with urllib_request.urlopen(pipermail_url) as response: - index = response.read() - response.close() - - soup = BeautifulSoup(index, features='lxml') - - mboxes = [] - for tag in soup.find_all('a'): - # we are looking for a href that ends with .txt.gz - if 'href' in tag.attrs and tag.attrs['href'][-7:] == '.txt.gz': - mboxes.append(os.path.join(pipermail_url, tag.attrs['href'])) - - return mboxes - - -def grab_pipermail_archive(pipermail_url, outdir): - import gzip - - chunks = pipermail_url.split('/') - - sys.stdout.write('Grabbing %s...' % chunks[-1]) - sys.stdout.flush() - # stick it into outdir/_tmp_pipermail_%last-chunk - local_file = os.path.join(outdir, '_tmp_pipermail_%s' % chunks[-1]) - - with urllib_request.urlopen(pipermail_url) as response: - with gzip.GzipFile(fileobj=response) as uncompressed: - # XXX: this can be horribly large - mboxdata = uncompressed.read().decode('utf-8', errors='replace') - uncompressed.close() - response.close() - - # Pipermail does a nasty thing where it doesn't properly handle - # lines in the body that start with "From ". 
First, we add ">" to - # all lines starting with "From " and then fix some of them in the - # next step. - sys.stdout.write('demangling...') - sys.stdout.flush() - regex = r'^From ' - subst = '>From ' - mboxdata = re.sub(regex, subst, mboxdata, 0, re.MULTILINE) - # Fix pipermail mangling where it changes some email addresses - # to be ' at ' instead of '@'. This is easiest to do with a - # handful of regexes than via actual message body manipulation - # as parf of the python's email.message object - regex = r'(<[^>]+) at ([^>]+>)' - subst = '\\1@\\2' - mboxdata = re.sub(regex, subst, mboxdata, 0, re.MULTILINE) - regex = r'^>?(From:? \S+) at (\S+\..*)' - mboxdata = re.sub(regex, subst, mboxdata, 0, re.MULTILINE) - - with open(local_file, 'wb') as out_fh: - out_fh.write(mboxdata.encode('utf-8')) - - out_fh.close() - return local_file - - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser( - description="Make a mbox of LKML messages we haven't yet archived", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - parser.add_argument('-source', nargs='+', - help=('Mbox file with archives, can be multiple. ' - 'Paths with trailing "/" will be treated as maildirs.')) - parser.add_argument('-pipermail', - help='Download mailman pipermail archives from this URL') - parser.add_argument('-nntp', - help=('Download full archives from a NNTP server, ' - 'e.g. 
-n nntp://news.gmane.com/gmane.linux.kernel')) - parser.add_argument('-exportdir', required=True, default='list-archives', - help='Export dir where to put sanitized archives') - parser.add_argument('-knownids', - help='File with known Message-IDs (one per line)') - parser.add_argument('-listids', required=True, nargs='+', - help='List ID to match, can be multiple') - parser.add_argument('-rejected', - help='Mailbox file where to save messages that were rejected ' - '(adds X-Import-Rejected-Reason header)') - - args = parser.parse_args() - +def main(args): if not os.path.isdir(args.exportdir): os.mkdir(args.exportdir) @@ -453,29 +305,14 @@ if __name__ == '__main__': # right now we're just appending to them, which is probably not expected behaviour. knownids = [] - # are you asking for a pipermail grab? - mboxes = [] - if args.pipermail is not None: - import urllib.request as urllib_request - mboxes = parse_pipermail_index(args.pipermail) - if not mboxes: - print('Could not find any .txt.gz files listed at %s' % args.pipermail) - sys.exit(1) - - if args.nntp: - mboxes.append(args.nntp) - - if args.source: - mboxes += args.source - - if not mboxes: - print('You have to specify at least one source (-s, -p, or -n)') + if not args.source: + print('You have to specify at least one source') sys.exit(1) # Make list ID matching case insensitive to match more mail listids = [listid.lower() for listid in args.listids] - newids = main(mboxes, args.exportdir, knownids, listids, args.rejected) + newids = process_archives(args.source, args.exportdir, knownids, listids, args.rejected) if newids is None or not args.knownids: sys.exit(0) @@ -485,3 +322,27 @@ if __name__ == '__main__': with open(args.knownids, 'w') as fh: fh.write('\n'.join(new_idlist)) fh.close() + + +if __name__ == '__main__': + import argparse + + # noinspection PyTypeChecker + parser = argparse.ArgumentParser( + description="Make a mbox of LKML messages we haven't yet archived", + 
formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument('-source', nargs='+', + help=('Mbox file with archives, can be multiple. ' + 'Paths with trailing "/" will be treated as maildirs.')) + parser.add_argument('-exportdir', required=True, default='list-archives', + help='Export dir where to put sanitized archives') + parser.add_argument('-knownids', + help='File with known Message-IDs (one per line)') + parser.add_argument('-listids', required=True, nargs='+', + help='List ID to match, can be multiple') + parser.add_argument('-rejected', + help='Mailbox file where to save messages that were rejected ' + '(adds X-Import-Rejected-Reason header)') + + main(parser.parse_args()) |