diff options
author | Konstantin Ryabitsev <konstantin@linuxfoundation.org> | 2020-10-20 12:59:44 -0400 |
---|---|---|
committer | Konstantin Ryabitsev <konstantin@linuxfoundation.org> | 2020-10-20 12:59:44 -0400 |
commit | 35fa657cf9ee91aea540317b823fb3485fad0bd3 (patch) | |
tree | 291c39e9fab7540d89972ce2875e37c624cc7085 | |
parent | 1d54a8c8a09eacdc2ed1213afaa8639b7393b6ef (diff) | |
download | korg-helpers-35fa657cf9ee91aea540317b823fb3485fad0bd3.tar.gz |
Split list-archive-maker into two scripts
Move all operations for collecting remote archives into
list-archive-collector.py, adding a way to import marc.info archives. We
also add several other useful options to list-archive-collector:
- With -k, we can check if we already have a copy of that message on
lore.kernel.org and backfill many of the important headers that are
missing from pipermail and (especially) marc.info sources.
- With -s and presence of /usr/bin/spamc, we'll check the message for
spam before accepting it
- The generated .mbx file can then be used with list-archive-maker.py to
create the final archive sources for importing into public-inbox.
Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
-rw-r--r-- | .gitignore | 2 | ||||
-rwxr-xr-x | list-archive-collector.py | 588 | ||||
-rwxr-xr-x | list-archive-maker.py | 281 |
3 files changed, 661 insertions, 210 deletions
@@ -1 +1,3 @@ .idea +*.swp +*~ diff --git a/list-archive-collector.py b/list-archive-collector.py new file mode 100755 index 0000000..4e3a81f --- /dev/null +++ b/list-archive-collector.py @@ -0,0 +1,588 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# List archive collector +# +# This is a tool to collect archive from networked non-mbox sources, such as: +# - mailman +# - marc.info +# - nntp +# +# After the archives are collected, you can feed them to list-archive-maker.py +# +# Author: Konstantin Ryabitsev <konstantin@linuxfoundation.org> +# + +import os +import sys +import mailbox +import email.utils +import email.policy +import time +import re +import quopri +import base64 +import gzip +import nntplib +import requests +import logging +import subprocess + +try: + import cchardet as chardet # noqa +except ImportError: + import chardet + +from tempfile import mkstemp +from bs4 import BeautifulSoup # noqa +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry + +from email import charset +charset.add_charset('utf-8', charset.SHORTEST, '8bit') # noqa + +# Used for our requests session +REQSESSION = None + +__VERSION__ = '1.0' +# Where des marc.info live? 
+MARCURL = 'https://marc.info' +# Wait this many seconds between requests to marc.info, to avoid triggering +# anti-abuse blocks (and to just be nice) +MARCNICE = 1 + +# Set our own policy +EMLPOLICY = email.policy.EmailPolicy(utf8=True, cte_type='8bit', max_line_length=None) + +logger = logging.getLogger(__name__) + + +def get_requests_session(): + global REQSESSION + if REQSESSION is None: + REQSESSION = requests.session() + retry = Retry(connect=3, backoff_factor=1) + adapter = HTTPAdapter(max_retries=retry) + REQSESSION.mount('http://', adapter) + REQSESSION.mount('https://', adapter) + headers = { + 'User-Agent': f'lore-archive-maker/{__VERSION__}', + } + REQSESSION.headers.update(headers) + + return REQSESSION + + +def lore_get_message(msgid): + # See where we're redirected + rurl = f'https://lore.kernel.org/r/{msgid}' + rses = get_requests_session() + resp = rses.head(rurl) + if resp.status_code < 300 or resp.status_code > 400: + # Not known on lore + return None + # Pop msgid from the end of the redirect + msgurl = resp.headers['Location'] + 'raw' + resp.close() + resp = rses.get(msgurl) + msg = email.message_from_bytes(resp.content) + logger.info(' found on lore') + return msg + + +# Turned off for now +def patchwork_get_headers(msgid): + url = f'https://patchwork.kernel.org/api/1.2/patches/' + params = [ + ('msgid', msgid), + ] + rses = get_requests_session() + resp = rses.get(url, params=params, stream=False) + if resp.status_code > 200: + return None + + jj = resp.json() + if not len(jj): + return None + + # we only care about one + p_id = jj[0].get('id') + resp = rses.get(f'{url}{p_id}', stream=False) + if resp.status_code > 200: + return None + + logger.info(' found on patchwork') + jj = resp.json() + return jj.get('headers') + + +def lookaside_fillin(msg): + wanthdrs = [ + 'To', + 'Cc', + 'References', + 'In-Reply-To', + 'User-Agent', + 'X-Mailer', + ] + msgid = msg.get('Message-Id').strip('<>') + lmsg = lore_get_message(msgid) + if not lmsg: + 
return False + # lmsg = patchwork_get_headers(msgid) + # if not lmsg: + # return False + + for wanthdr in wanthdrs: + if not msg.get(wanthdr) and lmsg.get(wanthdr): + msg[wanthdr] = lmsg.get(wanthdr) + + return True + + +def marc_get_message(marc_list_id, msgnum, listid, toaddr, lookaside): + rses = get_requests_session() + url = f'{MARCURL}/?l={marc_list_id}&m={msgnum}&q=mbox' + logger.info(' grabbing message %s', msgnum) + resp = rses.get(url) + rawmsg = resp.content + multipart = False + if rawmsg.find(b'\nContent-Type: multipart/mixed;') > 0: + multipart = True + # marc.info breaks MIME by incorrectly writing boundary headers + rawmsg = rawmsg.replace(b'\nContent-Type: multipart/mixed; boundary="--', + b'\nContent-Type: multipart/mixed; boundary="', 1) + # We don't need to fix charset for multipart/mixed messages + + msg = email.message_from_bytes(rawmsg) + if not msg.get('Message-Id'): + logger.info(' No message-id, ignored') + # Can't use it anyway + return None + + hdrs = list() + + for hdrname, hdrval in list(msg._headers): # noqa + if hdrname == 'To': + # Useless, we throw it out + continue + elif hdrval.find(' () ') and (hdrval.find(' ! ') or hdrval.find('<')): + # marc.info mangles @ and . in email addresses with + # the above values. Unmangle them back. + hdrval = hdrval.replace(' () ', '@').replace(' ! 
', '.') + hdrs.append((hdrname, hdrval)) + msg._headers = hdrs # noqa + + # Marc.info removes content-transfer-encoding headers, so try to figure out + # what format the raw message is in before trying to add it to the mailbox + if not multipart: + payload = msg.get_payload(decode=True) + # Try to base64 decode it first + dec = None + try: + dec = base64.b64decode(payload, validate=True) + if dec != payload: + msg.set_payload(dec) + except: # noqa + pass + + if not dec: + try: + dec = quopri.decodestring(payload) + if dec != payload: + msg.set_payload(dec) + except ValueError: + pass + + if listid: + msg['List-Id'] = f'<{listid}>' + + if lookaside: + lookaside_fillin(msg) + + if not msg.get('To'): + msg['To'] = toaddr + + return msg + + +def check_if_spam(bmsg): + if not os.path.exists('/usr/bin/spamc'): + return False + + logger.info(' checking for spam') + args = ['/usr/bin/spamc', '-c'] + logger.debug('Running %s' % ' '.join(args)) + + pp = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + pp.communicate(input=bmsg) + if pp.returncode == 0: + return False + + return True + + +def add_msg_to_mbx(msg, mbx, checkspam): + if msg.get_default_type() == 'text/plain': + try: + payload = msg.get_payload(decode=True) + if payload: + msg.set_charset(chardet.detect(payload)['encoding']) + except: # noqa + # This may fail for various reasons having to do with the wonderful world + # of 8bit content and legacy encodings. + # Ignore and hope we can still as_string below. + pass + + try: + bmsg = msg.as_string(policy=EMLPOLICY).encode() + if checkspam and check_if_spam(bmsg): + logger.info(' spam: %s', msg['Subject']) + return + + mbx.add(bmsg) + except: # noqa + # Throw it out, because life is too short to figure out all possible ways + # that decades-old email messages make python break. 
+ logger.info(' corrupted: %s', msg['Subject']) + return + + +def marc_get_full_thread(marc_list_id, thread_id): + cp = 1 + rses = get_requests_session() + msgnums = list() + logger.info('Grabbing thread %s', thread_id) + while True: + lastpage = True + np = cp + 1 + nl = f'r={np}&' + # Be nice + time.sleep(MARCNICE) + url = f'{MARCURL}/?t={thread_id}&r={cp}&w=1' + rsp = rses.get(url) + soup = BeautifulSoup(rsp.content, features='lxml') + for tag in soup.find_all('a'): + href = tag.attrs.get('href') + if not href: + continue + # See if it's a link to the next page + if href.find(nl) >= 0: + lastpage = False + continue + # Is it from the wrong list? + if href.find(marc_list_id) < 0: + continue + + match = re.search(r'm=(\d+)\D', href) + if match: + msgnums.append(match.groups()[0]) + continue + + if lastpage: + break + cp += 1 + logger.info('\t... page %s', cp) + + return msgnums + + +def parse_pipermail_index(pipermail_url): + logger.info('Grabbing pipermail index from %s', pipermail_url) + rses = get_requests_session() + resp = rses.get(pipermail_url) + index = resp.content + + soup = BeautifulSoup(index, features='lxml') + + mboxes = [] + for tag in soup.find_all('a'): + # we are looking for a href that ends with .txt.gz + if 'href' in tag.attrs and tag.attrs['href'][-7:] == '.txt.gz': + mboxes.append(os.path.join(pipermail_url, tag.attrs['href'])) + + return mboxes + + +def grab_pipermail_archive(pipermail_url, mbx, listid, toaddr, lookaside, checkspam): + tmpfile = mkstemp('pipermail')[1] + chunks = pipermail_url.split('/') + + logger.info(' grabbing %s', chunks[-1]) + rses = get_requests_session() + resp = rses.get(pipermail_url, stream=True) + + with gzip.GzipFile(fileobj=resp.raw) as uncompressed: + # XXX: this can be horribly large + mboxdata = uncompressed.read().decode('utf-8', errors='replace') + + resp.close() + + # Pipermail does a nasty thing where it doesn't properly handle + # lines in the body that start with "From ". 
First, we add ">" to + # all lines starting with "From " and then fix some of them in the + # next step. + logger.info(' demangling %s', chunks[-1]) + regex = r'^From ' + subst = '>From ' + mboxdata = re.sub(regex, subst, mboxdata, 0, re.MULTILINE) + # Fix pipermail mangling where it changes some email addresses + # to be ' at ' instead of '@'. This is easiest to do with a + # handful of regexes than via actual message body manipulation + # as parf of the python's email.message object + regex = r'(<[^>]+) at ([^>]+>)' + subst = '\\1@\\2' + mboxdata = re.sub(regex, subst, mboxdata, 0, re.MULTILINE) + regex = r'^>?(From:? \S+) at (\S+\..*)' + mboxdata = re.sub(regex, subst, mboxdata, 0, re.MULTILINE) + + with open(tmpfile, 'wb') as out_fh: + out_fh.write(mboxdata.encode()) + + # Open it now as a mailbox + tmpmbx = mailbox.mbox(tmpfile) + for msg in tmpmbx: + logger.info(' processing: %s', msg.get('Message-Id')) + # Fix bogus From: foo@bar.baz (Foo Barski) -> Foo Barski <foo@bar.baz> + fromline = msg.get('From') + matches = re.search(r'(\S+@\S+\.\S+) \((.*)\)$', fromline) + if matches: + gr = matches.groups() + msg.replace_header('From', f'{gr[1]} <{gr[0]}>') + + if listid: + msg['List-Id'] = f'<{listid}>' + + if lookaside: + lookaside_fillin(msg) + + if not msg.get('To'): + msg['To'] = toaddr + + add_msg_to_mbx(msg, mbx, checkspam) + + tmpmbx.close() + os.unlink(tmpfile) + + +def get_marcinfo(args): + global MARCNICE + + if args.nice < 0.5: + logger.critical('Hitting marc.info every %s s will get you auto-banned. 
Try above 0.5.', args.nice) + sys.exit(1) + MARCNICE = args.nice + + if not args.to: + args.to = args.listid.replace('.', '@', 1) + + marc_list_id = args.listname + + rses = get_requests_session() + url = f'{MARCURL}/?l={marc_list_id}&w=1' + logger.info('Grabbing main index for %s', marc_list_id) + + rsp = rses.get(url, stream=False) + soup = BeautifulSoup(rsp.content, features='lxml') + months = list() + for tag in soup.find_all('a'): + # we are looking for a href that contains + href = tag.attrs.get('href') + if not href: + continue + match = re.search(r'b=(\d+)\D', href) + if match: + months.append(match.groups()[0]) + + thdnums = set() + msgnums = set() + for month in months: + logger.info('Grabbing month %s', month) + # We may be paginated + cp = 1 + while True: + lastpage = True + # Be nice + np = cp + 1 + time.sleep(MARCNICE) + url = f'{MARCURL}/?l={marc_list_id}&b={month}&r={cp}&w=1' + if cp > 1: + logger.info(' ... page %s', cp) + rsp = rses.get(url) + soup = BeautifulSoup(rsp.content, features='lxml') + for tag in soup.find_all('a'): + href = tag.attrs.get('href') + if not href: + continue + # See if it's a link to the next page + telltale = f'r={np}&' + if href.find(telltale) >= 0: + lastpage = False + continue + + # Is it a message link? + match = re.search(r'm=(\d+)\D', href) + if match: + msgnums.add(match.groups()[0]) + continue + + # Is it a thread link? 
+ match = re.search(r't=(\d+)\D', href) + if match: + thdnums.add(match.groups()[0]) + continue + + if lastpage: + break + cp += 1 + + mbx = mailbox.mbox(args.out) + for thdnum in thdnums: + tnums = marc_get_full_thread(marc_list_id, thdnum) + # last message starts the thread + tnums.reverse() + irt = None + for tnum in tnums: + if tnum in msgnums: + msgnums.remove(tnum) + time.sleep(MARCNICE) + msg = marc_get_message(marc_list_id, tnum, args.listid, args.to, args.lookaside) + if not msg: + continue + + if not irt: + irt = msg.get('Message-Id') + elif not msg.get('References'): + msg['References'] = irt + msg['In-Reply-To'] = irt + + add_msg_to_mbx(msg, mbx, args.checkspam) + + logger.info('Grabbing remaining unthreaded messages') + for msgnum in msgnums: + time.sleep(MARCNICE) + msg = marc_get_message(marc_list_id, msgnum, args.listid, args.to, args.lookaside) + if not msg: + continue + + add_msg_to_mbx(msg, mbx, args.checkspam) + + mbx.close() + + +def get_mailman(args): + if not args.to: + args.to = args.listid.replace('.', '@', 1) + + months = parse_pipermail_index(args.url) + if not months: + print('Could not find any .txt.gz files listed at %s' % args.url) + sys.exit(1) + mbx = mailbox.mbox(args.out) + for month in months: + grab_pipermail_archive(month, mbx, args.listid, args.to, args.lookaside, args.checkspam) + + +def get_nntp(args): + # Expect in format nntp://news.gmane.org/gmane.linux.network + logger.info('Connecting to %s', args.url) + chunks = args.url.split('/') + server, group = chunks[-2:] + nntplib._MAXLINE = 1 << 20 + server = nntplib.NNTP(server) + resp, count, first, last, name = server.group(group) + total = int(last) + + mbx = mailbox.mbox(args.out) + aid = 1 + while aid <= total: + try: + nresp, nainfo = server.article(aid) + msg = email.message_from_bytes(b'\n'.join(nainfo[2])) + logger.info(' processing: %s, %s/%s', msg.get('Message-Id'), aid, total) + newhdrs = list() + for hdrname, hdrval in list(msg._headers): # noqa + if 
hdrname.find('Original-') == 0: + hdrname = hdrname.replace('Original-', '') + newhdrs.append((hdrname, hdrval)) + msg._headers = newhdrs # noqa + if args.listid: + try: + msg.replace_header('List-Id', f'<{args.listid}>') + except KeyError: + msg.add_header('List-Id', f'<{args.listid}') + + add_msg_to_mbx(msg, mbx, args.checkspam) + + except nntplib.NNTPTemporaryError: + # Ignore one-off article failures -- probably deletes + pass + finally: + aid += 1 + + mbx.close() + + +if __name__ == '__main__': + import argparse + + # noinspection PyTypeChecker + parser = argparse.ArgumentParser( + description="Collect external mail archives into a local mbox", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument('-d', '--debug', action='store_true', default=False, + help='Add more debugging info to the output') + parser.add_argument('-i', '--listid', + help='List-Id header to inject into the messages') + parser.add_argument('-t', '--to', + help='Value to put into the To: header, if missing ' + '(defaults to list-id with first . 
replaced with @') + parser.add_argument('-k', '--lookaside', action='store_true', default=False, + help='Attempt to look up matching lore messages for missing to/cc headers') + parser.add_argument('-s', '--checkspam', action='store_true', default=False, + help='Run spamc to check messages for spam before adding') + parser.add_argument('-o', '--out', required=True, + help='Filename of the mailbox file to write out') + + subparsers = parser.add_subparsers(help='sub-command help', dest='subcmd') + + sp_mm = subparsers.add_parser('mailman', help='Collect mailman archives', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + sp_mm.add_argument('-u', '--url', required=True, + help='Mailman archive index URL') + sp_mm.set_defaults(func=get_mailman) + + sp_marc = subparsers.add_parser('marcinfo', help='Collect marc.info archives', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + sp_marc.add_argument('-l', '--listname', required=True, + help='Marc.info list name (?l= parameter)') + sp_marc.add_argument('-n', '--nice', default=MARCNICE, type=float, + help='Seconds to sleep between requests') + sp_marc.set_defaults(func=get_marcinfo) + + sp_nntp = subparsers.add_parser('nntp', help='Collect NNTP archives', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + sp_nntp.add_argument('-u', '--url', required=True, + help='NNTP url (e.g. 
nntp://news.gmane.com/gmane.linux.kernel') + sp_nntp.set_defaults(func=get_nntp) + + cmdargs = parser.parse_args() + logger.setLevel(logging.DEBUG) + + ch = logging.StreamHandler() + formatter = logging.Formatter('%(message)s') + ch.setFormatter(formatter) + + if cmdargs.debug: + ch.setLevel(logging.DEBUG) + else: + ch.setLevel(logging.INFO) + + logger.addHandler(ch) + + if 'func' not in cmdargs: + parser.print_help() + sys.exit(1) + + cmdargs.func(cmdargs) diff --git a/list-archive-maker.py b/list-archive-maker.py index 65bf2ca..191a504 100755 --- a/list-archive-maker.py +++ b/list-archive-maker.py @@ -25,10 +25,15 @@ import os import sys import mailbox import email.utils -import time -import re +import email.policy import fnmatch +from email import charset +charset.add_charset('utf-8', charset.SHORTEST, '8bit') # noqa + +# Set our own policy +EMLPOLICY = email.policy.EmailPolicy(utf8=True, cte_type='8bit', max_line_length=None) + # Only retain the headers that are important to us # must be lowercase for matching purposes. # We treat "Received" headers with extra care for privacy, but if you @@ -55,8 +60,10 @@ WANTHDRS = {'return-path', 'resent-to', } +__VERSION__ = '2.0' + -def main(sources, outdir, msgids, listids, rejectsfile): +def process_archives(sources, outdir, msgids, listids, rejectsfile): outboxes = {} writecount = {} seenids = [] @@ -77,58 +84,15 @@ def main(sources, outdir, msgids, listids, rejectsfile): rejectsbox = mailbox.mbox(rejectsfile) for sourcefile in sources: - is_pipermail = False - is_nntp = False - - # do you have a '://' in you? - if sourcefile.find('://') > 0: - if sourcefile.find('nntp://') == 0: - is_nntp = True - else: - is_pipermail = True - - if is_nntp: - # Expect in format nntp://news.gmane.org/gmane.linux.network - sys.stdout.write('Connecting to %s...' 
% sourcefile) - chunks = sourcefile.split('/') - server, group = chunks[-2:] - import nntplib - nntplib._MAXLINE = 1 << 20 - server = nntplib.NNTP(server) - resp, count, first, last, name = server.group(group) - total = int(last) - - def nntp_msg_gen(last): - aid = 1 - while aid <= last: - try: - resp, ainfo = server.article(aid) - message = email.message_from_bytes(b'\n'.join(ainfo[2])) - yield message - except nntplib.NNTPTemporaryError: - # Ignore one-off article failures -- probably deletes - pass - finally: - aid += 1 - - inbox = nntp_msg_gen(total) - + sys.stdout.write('Opening %s...' % os.path.basename(sourcefile)) + sys.stdout.flush() + # If the filename ends with /, we treat as maildir + if sourcefile[-1] == '/': + inbox = mailbox.Maildir(sourcefile) else: - if is_pipermail: - sourcefile = grab_pipermail_archive(sourcefile, outdir) - sys.stdout.write('parsing...') - sys.stdout.flush() - inbox = mailbox.mbox(sourcefile) - else: - sys.stdout.write('Opening %s...' % os.path.basename(sourcefile)) - sys.stdout.flush() - # If the filename ends with /, we treat as maildir - if sourcefile[-1] == '/': - inbox = mailbox.Maildir(sourcefile) - else: - inbox = mailbox.mbox(sourcefile) + inbox = mailbox.mbox(sourcefile) - total = len(inbox) + total = len(inbox) sys.stdout.write('%s messages\n' % total) sys.stdout.flush() @@ -153,7 +117,7 @@ def main(sources, outdir, msgids, listids, rejectsfile): # Huh, no message-id? Most likely, FOLDER-INTERNAL DATA marker or some other # system message. if rejectsfile: - msg._headers.append(('X-Import-Rejected-Reason', 'No Message-ID')) + msg._headers.append(('X-Import-Rejected-Reason', 'No Message-ID')) # noqa rejectsbox.add(msg) skipped += 1 nomsgid += 1 @@ -166,7 +130,7 @@ def main(sources, outdir, msgids, listids, rejectsfile): # latter condition, so we just assume they got delivered multiple times and # use the first one found. 
if rejectsfile: - msg._headers.append(('X-Import-Rejected-Reason', 'Duplicate Message-ID')) + msg._headers.append(('X-Import-Rejected-Reason', 'Duplicate Message-ID')) # noqa rejectsbox.add(msg) skipped += 1 dupmsgid += 1 @@ -179,12 +143,8 @@ def main(sources, outdir, msgids, listids, rejectsfile): cc = '' recvtime = None is_our_list = False - for hdrname, hdrval in list(msg._headers): + for hdrname, hdrval in list(msg._headers): # noqa lhdrname = hdrname.lower() - if is_nntp and lhdrname.find('original-') == 0: - lhdrname = lhdrname.replace('original-', '') - hdrname = hdrname.replace('Original-', '') - lhdrval = hdrval.lower() wanthdr = False for hdrmatch in WANTHDRS: @@ -212,20 +172,20 @@ def main(sources, outdir, msgids, listids, rejectsfile): pass elif lhdrname == 'list-id': - for listid in listids: - if lhdrval.find(listid) >= 0: - newhdrs.append((hdrname, hdrval)) - is_our_list = True - break + for listid in listids: + if lhdrval.find(listid) >= 0: + newhdrs.append((hdrname, hdrval)) + is_our_list = True + break elif lhdrname == 'x-mailing-list': - for listid in listids: - if lhdrval.find(listid) >= 0: - # Stick the list-id that's first in our collection, - # since we assume that it's the canonical one - newhdrs.append(('List-Id', listids[0])) - is_our_list = True - break + for listid in listids: + if lhdrval.find(listid) >= 0: + # Stick the list-id that's first in our collection, + # since we assume that it's the canonical one + newhdrs.append(('List-Id', listids[0])) + is_our_list = True + break # Malformed emails can have multiple to: and cc: fields. Merge # so there's one field for each header type. @@ -271,31 +231,20 @@ def main(sources, outdir, msgids, listids, rejectsfile): # LKML list-id, the archive may only contain the copy that arrived to # linux-mm. We try to hedge for this by looking in the "To" and "Cc" # fields for any indication that this was intended for our mailing list. 
- if is_pipermail: - # Pipermail doesn't preserve the List-Id nor "To" headers, - # so put them back in place - newhdrs.append(('To', eaddrs[0])) - newhdrs.append(('List-Id', listids[0])) - is_our_list = True - elif is_nntp: - # We assume everything in the newsgroup matches our first list-id - newhdrs.append(('List-Id', listids[0])) - is_our_list = True - else: - for eaddr in eaddrs: - if (str(msg.get('to', '')).lower().find(eaddr) >= 0 or - str(msg.get('cc', '')).lower().find(eaddr) >= 0 or - str(msg.get('resent-to', '')).lower().find(eaddr) >= 0): - # insert the list-id header - # (assuming the first one in the list to be the canonical one) - newhdrs.append(('List-ID', '<%s>' % listids[0])) - is_our_list = True - break + for eaddr in eaddrs: + if (str(msg.get('to', '')).lower().find(eaddr) >= 0 or + str(msg.get('cc', '')).lower().find(eaddr) >= 0 or + str(msg.get('resent-to', '')).lower().find(eaddr) >= 0): + # insert the list-id header + # (assuming the first one in the list to be the canonical one) + newhdrs.append(('List-ID', '<%s>' % listids[0])) + is_our_list = True + break if not is_our_list: # Well, we tried everything if rejectsfile: - msg._headers.append(('X-Import-Rejected-Reason', 'No matching List-ID')) + msg._headers.append(('X-Import-Rejected-Reason', 'No matching List-ID')) # noqa rejectsbox.add(msg) skipped += 1 notourlist += 1 @@ -309,8 +258,6 @@ def main(sources, outdir, msgids, listids, rejectsfile): msgdate = email.utils.parsedate_tz(str(msg['Date'])) mboxname = '%04d-%02d.mbx' % (msgdate[0], msgdate[1]) - if is_nntp: - msg.set_unixfrom('From nntp@import %s' % time.strftime('%c', msgdate[:9])) # do we have this mbox open already? 
if mboxname in outboxes: @@ -321,14 +268,11 @@ def main(sources, outdir, msgids, listids, rejectsfile): outboxes[mboxname] = outbox writecount[mboxname] = 1 - outbox.add(msg) + outbox.add(msg.as_string(policy=EMLPOLICY).encode()) seenids.append(msgid) knownset.add(msgid) inbox.close() - if is_pipermail: - os.unlink(sourcefile) - sys.stdout.write(' %s/%s (%s skipped: %s dupmsgid, %s nomsgid, %s notourlist)\n' % (counter, total, skipped, dupmsgid, nomsgid, notourlist)) @@ -347,99 +291,7 @@ def main(sources, outdir, msgids, listids, rejectsfile): return None -def parse_pipermail_index(pipermail_url): - try: - from bs4 import BeautifulSoup - except ImportError as ex: - print('You need to install python-beautifulsoup4 to parse pipermail URLs') - print(ex) - sys.exit(1) - - print('Grabbing the pipermail index from %s' % pipermail_url) - with urllib_request.urlopen(pipermail_url) as response: - index = response.read() - response.close() - - soup = BeautifulSoup(index, features='lxml') - - mboxes = [] - for tag in soup.find_all('a'): - # we are looking for a href that ends with .txt.gz - if 'href' in tag.attrs and tag.attrs['href'][-7:] == '.txt.gz': - mboxes.append(os.path.join(pipermail_url, tag.attrs['href'])) - - return mboxes - - -def grab_pipermail_archive(pipermail_url, outdir): - import gzip - - chunks = pipermail_url.split('/') - - sys.stdout.write('Grabbing %s...' % chunks[-1]) - sys.stdout.flush() - # stick it into outdir/_tmp_pipermail_%last-chunk - local_file = os.path.join(outdir, '_tmp_pipermail_%s' % chunks[-1]) - - with urllib_request.urlopen(pipermail_url) as response: - with gzip.GzipFile(fileobj=response) as uncompressed: - # XXX: this can be horribly large - mboxdata = uncompressed.read().decode('utf-8', errors='replace') - uncompressed.close() - response.close() - - # Pipermail does a nasty thing where it doesn't properly handle - # lines in the body that start with "From ". 
First, we add ">" to - # all lines starting with "From " and then fix some of them in the - # next step. - sys.stdout.write('demangling...') - sys.stdout.flush() - regex = r'^From ' - subst = '>From ' - mboxdata = re.sub(regex, subst, mboxdata, 0, re.MULTILINE) - # Fix pipermail mangling where it changes some email addresses - # to be ' at ' instead of '@'. This is easiest to do with a - # handful of regexes than via actual message body manipulation - # as parf of the python's email.message object - regex = r'(<[^>]+) at ([^>]+>)' - subst = '\\1@\\2' - mboxdata = re.sub(regex, subst, mboxdata, 0, re.MULTILINE) - regex = r'^>?(From:? \S+) at (\S+\..*)' - mboxdata = re.sub(regex, subst, mboxdata, 0, re.MULTILINE) - - with open(local_file, 'wb') as out_fh: - out_fh.write(mboxdata.encode('utf-8')) - - out_fh.close() - return local_file - - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser( - description="Make a mbox of LKML messages we haven't yet archived", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - parser.add_argument('-source', nargs='+', - help=('Mbox file with archives, can be multiple. ' - 'Paths with trailing "/" will be treated as maildirs.')) - parser.add_argument('-pipermail', - help='Download mailman pipermail archives from this URL') - parser.add_argument('-nntp', - help=('Download full archives from a NNTP server, ' - 'e.g. 
-n nntp://news.gmane.com/gmane.linux.kernel')) - parser.add_argument('-exportdir', required=True, default='list-archives', - help='Export dir where to put sanitized archives') - parser.add_argument('-knownids', - help='File with known Message-IDs (one per line)') - parser.add_argument('-listids', required=True, nargs='+', - help='List ID to match, can be multiple') - parser.add_argument('-rejected', - help='Mailbox file where to save messages that were rejected ' - '(adds X-Import-Rejected-Reason header)') - - args = parser.parse_args() - +def main(args): if not os.path.isdir(args.exportdir): os.mkdir(args.exportdir) @@ -453,29 +305,14 @@ if __name__ == '__main__': # right now we're just appending to them, which is probably not expected behaviour. knownids = [] - # are you asking for a pipermail grab? - mboxes = [] - if args.pipermail is not None: - import urllib.request as urllib_request - mboxes = parse_pipermail_index(args.pipermail) - if not mboxes: - print('Could not find any .txt.gz files listed at %s' % args.pipermail) - sys.exit(1) - - if args.nntp: - mboxes.append(args.nntp) - - if args.source: - mboxes += args.source - - if not mboxes: - print('You have to specify at least one source (-s, -p, or -n)') + if not args.source: + print('You have to specify at least one source') sys.exit(1) # Make list ID matching case insensitive to match more mail listids = [listid.lower() for listid in args.listids] - newids = main(mboxes, args.exportdir, knownids, listids, args.rejected) + newids = process_archives(args.source, args.exportdir, knownids, listids, args.rejected) if newids is None or not args.knownids: sys.exit(0) @@ -485,3 +322,27 @@ if __name__ == '__main__': with open(args.knownids, 'w') as fh: fh.write('\n'.join(new_idlist)) fh.close() + + +if __name__ == '__main__': + import argparse + + # noinspection PyTypeChecker + parser = argparse.ArgumentParser( + description="Make a mbox of LKML messages we haven't yet archived", + 
formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument('-source', nargs='+', + help=('Mbox file with archives, can be multiple. ' + 'Paths with trailing "/" will be treated as maildirs.')) + parser.add_argument('-exportdir', required=True, default='list-archives', + help='Export dir where to put sanitized archives') + parser.add_argument('-knownids', + help='File with known Message-IDs (one per line)') + parser.add_argument('-listids', required=True, nargs='+', + help='List ID to match, can be multiple') + parser.add_argument('-rejected', + help='Mailbox file where to save messages that were rejected ' + '(adds X-Import-Rejected-Reason header)') + + main(parser.parse_args()) |