aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKonstantin Ryabitsev <konstantin@linuxfoundation.org>2022-10-20 14:11:29 -0400
committerKonstantin Ryabitsev <konstantin@linuxfoundation.org>2022-10-20 14:11:29 -0400
commit354fc16e397312c8972c6b48e7645d2c98d71db3 (patch)
tree63cf3ce0686bd08aa79e42c2aa68afbe226cb308
parenta72a55b06660b249c82cdba32a4dc54e8a7ea2f3 (diff)
downloadkorg-helpers-354fc16e397312c8972c6b48e7645d2c98d71db3.tar.gz
list-collectors: small improvements
Add a few small improvements to the list-archive-collector and list-archive-maker scripts, making it easier to deal with partial archives. Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
-rwxr-xr-xlist-archive-collector.py137
-rwxr-xr-xlist-archive-maker.py112
2 files changed, 136 insertions, 113 deletions
diff --git a/list-archive-collector.py b/list-archive-collector.py
index b3186ad..99e22be 100755
--- a/list-archive-collector.py
+++ b/list-archive-collector.py
@@ -25,23 +25,25 @@ import quopri
import base64
import gzip
import io
-import nntplib
import requests
import logging
import subprocess
+import argparse
try:
- import cchardet as chardet # noqa
+ import cchardet as chardet # noqa
except ImportError:
- import chardet
+ import chardet # noqa
from tempfile import mkstemp
from bs4 import BeautifulSoup # noqa
from requests.adapters import HTTPAdapter
-from requests.packages.urllib3.util.retry import Retry
+from urllib3 import Retry
+
+from typing import Optional, Union, List, Set
from email import charset
-charset.add_charset('utf-8', charset.SHORTEST, '8bit') # noqa
+charset.add_charset('utf-8', None)
# Used for our requests session
REQSESSION = None
@@ -59,7 +61,7 @@ EMLPOLICY = email.policy.EmailPolicy(utf8=True, cte_type='8bit', max_line_length
logger = logging.getLogger(__name__)
-def clean_header(hdrval):
+def clean_header(hdrval: str) -> Optional[str]:
if hdrval is None:
return ''
@@ -78,7 +80,7 @@ def clean_header(hdrval):
return new_hdrval.strip()
-def get_requests_session():
+def get_requests_session() -> requests.Session:
global REQSESSION
if REQSESSION is None:
REQSESSION = requests.session()
@@ -94,14 +96,14 @@ def get_requests_session():
return REQSESSION
-def lore_get_message(msgid):
+def lore_get_message(msgid: str) -> email.message.Message:
# See where we're redirected
rurl = f'https://lore.kernel.org/r/{msgid}'
rses = get_requests_session()
resp = rses.head(rurl)
if resp.status_code < 300 or resp.status_code > 400:
# Not known on lore
- return None
+ raise LookupError
# Pop msgid from the end of the redirect
msgurl = resp.headers['Location'] + 'raw'
resp.close()
@@ -111,33 +113,7 @@ def lore_get_message(msgid):
return msg
-# Turned off for now
-def patchwork_get_headers(msgid):
- url = f'https://patchwork.kernel.org/api/1.2/patches/'
- params = [
- ('msgid', msgid),
- ]
- rses = get_requests_session()
- resp = rses.get(url, params=params, stream=False)
- if resp.status_code > 200:
- return None
-
- jj = resp.json()
- if not len(jj):
- return None
-
- # we only care about one
- p_id = jj[0].get('id')
- resp = rses.get(f'{url}{p_id}', stream=False)
- if resp.status_code > 200:
- return None
-
- logger.info(' found on patchwork')
- jj = resp.json()
- return jj.get('headers')
-
-
-def lookaside_fillin(msg):
+def lookaside_fillin(msg: email.message.Message) -> bool:
wanthdrs = [
'To',
'Cc',
@@ -147,12 +123,10 @@ def lookaside_fillin(msg):
'X-Mailer',
]
msgid = str(msg.get('Message-Id')).strip('<>')
- lmsg = lore_get_message(msgid)
- if not lmsg:
+ try:
+ lmsg = lore_get_message(msgid)
+ except LookupError:
return False
- # lmsg = patchwork_get_headers(msgid)
- # if not lmsg:
- # return False
for wanthdr in wanthdrs:
if not msg.get(wanthdr) and lmsg.get(wanthdr):
@@ -161,7 +135,8 @@ def lookaside_fillin(msg):
return True
-def marc_get_message(marc_list_id, msgnum, listid, toaddr, lookaside):
+def marc_get_message(marc_list_id: str, msgnum: str, listid: str, toaddr: str,
+ lookaside: bool) -> email.message.Message:
rses = get_requests_session()
url = f'{MARCURL}/?l={marc_list_id}&m={msgnum}&q=mbox'
logger.info(' grabbing message %s', msgnum)
@@ -179,7 +154,7 @@ def marc_get_message(marc_list_id, msgnum, listid, toaddr, lookaside):
if not msg.get('Message-Id'):
logger.info(' No message-id, ignored')
# Can't use it anyway
- return None
+ raise LookupError
hdrs = list()
@@ -227,7 +202,7 @@ def marc_get_message(marc_list_id, msgnum, listid, toaddr, lookaside):
return msg
-def check_if_spam(bmsg):
+def check_if_spam(bmsg: bytes) -> bool:
if not os.path.exists('/usr/bin/spamc'):
return False
@@ -243,7 +218,16 @@ def check_if_spam(bmsg):
return True
-def add_msg_to_mbx(msg, mbx, checkspam):
+def add_msg_to_mbx(msg: email.message.Message, mbx: Union[mailbox.Mailbox, mailbox.Maildir],
+ checkspam: bool, cleansubj: Optional[str]) -> None:
+ oldsubj = clean_header(msg.get('Subject', ''))
+ if cleansubj and cleansubj in oldsubj:
+ # We only remove it if it's ^thatstring, or ^Re: thatstring
+ if oldsubj.startswith(cleansubj):
+ msg.replace_header('Subject', oldsubj.replace(cleansubj, '', 1).strip())
+ if oldsubj.startswith(f'Re: {cleansubj}'):
+ msg.replace_header('Subject', oldsubj.replace(f'Re: {cleansubj}', 'Re:', 1).strip())
+
if msg.get_default_type() == 'text/plain':
try:
payload = msg.get_payload(decode=True)
@@ -261,6 +245,7 @@ def add_msg_to_mbx(msg, mbx, checkspam):
logger.info(' spam: %s', msg['Subject'])
return
+ logger.info(' added: %s', msg['Subject'])
mbx.add(bmsg)
except: # noqa
# Throw it out, because life is too short to figure out all possible ways
@@ -269,7 +254,7 @@ def add_msg_to_mbx(msg, mbx, checkspam):
return
-def marc_get_full_thread(marc_list_id, thread_id):
+def marc_get_full_thread(marc_list_id: str, thread_id: str) -> List[str]:
cp = 1
rses = get_requests_session()
msgnums = list()
@@ -308,7 +293,7 @@ def marc_get_full_thread(marc_list_id, thread_id):
return msgnums
-def parse_pipermail_index(pipermail_url):
+def parse_pipermail_index(pipermail_url: str) -> Set[str]:
logger.info('Grabbing pipermail index from %s', pipermail_url)
rses = get_requests_session()
resp = rses.get(pipermail_url)
@@ -325,7 +310,7 @@ def parse_pipermail_index(pipermail_url):
return mboxes
-def parse_hyperkitty_index(hyperkitty_url):
+def parse_hyperkitty_index(hyperkitty_url: str) -> Set[str]:
logger.info('Grabbing hyperkitty index from %s', hyperkitty_url)
rses = get_requests_session()
resp = rses.get(hyperkitty_url)
@@ -354,7 +339,8 @@ def parse_hyperkitty_index(hyperkitty_url):
return mboxes
-def grab_pipermail_archive(pipermail_url, mbx, listid, toaddr, lookaside, checkspam):
+def grab_pipermail_archive(pipermail_url: str, mbx: Union[mailbox.Mailbox, mailbox.Maildir],
+ args: argparse.Namespace) -> None:
tmpfile = mkstemp('pipermail')[1]
chunks = pipermail_url.split('/')
if pipermail_url[0] == '/':
@@ -432,25 +418,25 @@ def grab_pipermail_archive(pipermail_url, mbx, listid, toaddr, lookaside, checks
if newfrom != oldfrom:
msg.replace_header('From', newfrom)
- if listid:
- msg['List-Id'] = f'<{listid}>'
+ if args.listid:
+ msg['List-Id'] = f'<{args.listid}>'
- if lookaside:
+ if args.lookaside:
lookaside_fillin(msg)
if not msg.get('To'):
- msg['To'] = toaddr
+ msg['To'] = args.to
# Fix in-reply-to
irt = msg.get('in-reply-to')
if irt and irt[0] != '<':
msg.replace_header('In-Reply-To', f'<{irt}>')
- add_msg_to_mbx(msg, mbx, checkspam)
+ add_msg_to_mbx(msg, mbx, args.checkspam, args.clean_subject)
tmpmbx.close()
os.unlink(tmpfile)
-def get_marcinfo(args):
+def get_marcinfo(args: argparse.Namespace) -> None:
global MARCNICE
if args.nice < 0.5:
@@ -521,7 +507,7 @@ def get_marcinfo(args):
break
cp += 1
- mbx = mailbox.mbox(args.out)
+ mbx = get_outbox(args)
for thdnum in thdnums:
tnums = marc_get_full_thread(marc_list_id, thdnum)
# last message starts the thread
@@ -531,8 +517,9 @@ def get_marcinfo(args):
if tnum in msgnums:
msgnums.remove(tnum)
time.sleep(MARCNICE)
- msg = marc_get_message(marc_list_id, tnum, args.listid, args.to, args.lookaside)
- if not msg:
+ try:
+ msg = marc_get_message(marc_list_id, tnum, args.listid, args.to, args.lookaside)
+ except LookupError:
continue
if not irt:
@@ -541,7 +528,7 @@ def get_marcinfo(args):
msg['References'] = irt
msg['In-Reply-To'] = irt
- add_msg_to_mbx(msg, mbx, args.checkspam)
+ add_msg_to_mbx(msg, mbx, args.checkspam, args.clean_subject)
logger.info('Grabbing remaining unthreaded messages')
for msgnum in msgnums:
@@ -550,18 +537,18 @@ def get_marcinfo(args):
if not msg:
continue
- add_msg_to_mbx(msg, mbx, args.checkspam)
+ add_msg_to_mbx(msg, mbx, args.checkspam, args.clean_subject)
mbx.close()
-def get_mailman(args):
+def get_mailman(args: argparse.Namespace) -> None:
if not args.to:
args.to = args.listid.replace('.', '@', 1)
- mbx = mailbox.mbox(args.out)
+ mbx = get_outbox(args)
if args.url[0] == '/':
- grab_pipermail_archive(args.url, mbx, args.listid, args.to, args.lookaside, args.checkspam)
+ grab_pipermail_archive(args.url, mbx, args)
else:
if args.mailman3:
months = parse_hyperkitty_index(args.url)
@@ -571,10 +558,11 @@ def get_mailman(args):
print('Could not find any .txt.gz files listed at %s' % args.url)
sys.exit(1)
for month in months:
- grab_pipermail_archive(month, mbx, args.listid, args.to, args.lookaside, args.checkspam)
+ grab_pipermail_archive(month, mbx, args)
-def get_nntp(args):
+def get_nntp(args: argparse.Namespace) -> None:
+ import nntplib
# Expect in format nntp://news.gmane.org/gmane.linux.network
logger.info('Connecting to %s', args.url)
chunks = args.url.split('/')
@@ -584,7 +572,7 @@ def get_nntp(args):
resp, count, first, last, name = server.group(group)
total = int(last)
- mbx = mailbox.mbox(args.out)
+ mbx = get_outbox(args)
aid = 1
while aid <= total:
try:
@@ -601,9 +589,9 @@ def get_nntp(args):
try:
msg.replace_header('List-Id', f'<{args.listid}>')
except KeyError:
- msg.add_header('List-Id', f'<{args.listid}')
+ msg.add_header('List-Id', f'<{args.listid}>')
- add_msg_to_mbx(msg, mbx, args.checkspam)
+ add_msg_to_mbx(msg, mbx, args.checkspam, args.clean_subject)
except nntplib.NNTPTemporaryError:
# Ignore one-off article failures -- probably deletes
@@ -614,9 +602,14 @@ def get_nntp(args):
mbx.close()
-if __name__ == '__main__':
- import argparse
+def get_outbox(args: argparse.Namespace) -> Union[mailbox.Mailbox, mailbox.Maildir]:
+ if args.as_maildir:
+ logger.info('Will output into maildir %s', args.out)
+ return mailbox.Maildir(args.out)
+ return mailbox.mbox(args.out)
+
+if __name__ == '__main__':
# noinspection PyTypeChecker
parser = argparse.ArgumentParser(
description="Collect external mail archives into a local mbox",
@@ -635,6 +628,10 @@ if __name__ == '__main__':
help='Run spamc to check messages for spam before adding')
parser.add_argument('-o', '--out', required=True,
help='Filename of the mailbox file to write out')
+ parser.add_argument('-m', '--as-maildir', action='store_true', default=False,
+ help='Output as maildir instead of mbox')
+ parser.add_argument('-c', '--clean-subject',
+ help='Remove this string from subjects (e.g. [listname])')
subparsers = parser.add_subparsers(help='sub-command help', dest='subcmd')
diff --git a/list-archive-maker.py b/list-archive-maker.py
index 393e294..801d840 100755
--- a/list-archive-maker.py
+++ b/list-archive-maker.py
@@ -27,9 +27,12 @@ import mailbox
import email.utils
import email.policy
import fnmatch
+import argparse
+
+from typing import Tuple, List, Set
from email import charset
-charset.add_charset('utf-8', charset.SHORTEST, '8bit') # noqa
+charset.add_charset('utf-8', None)
# Set our own policy
EMLPOLICY = email.policy.EmailPolicy(utf8=True, cte_type='8bit', max_line_length=None)
@@ -62,7 +65,8 @@ WANTHDRS = {'return-path',
__VERSION__ = '2.0'
-def formataddr(pair):
+
+def formataddr(pair: Tuple[str, str]) -> str:
try:
return email.utils.formataddr(pair)
except UnicodeEncodeError:
@@ -70,11 +74,12 @@ def formataddr(pair):
# drop the real name then.
return email.utils.formataddr((None, pair[1]))
-def process_archives(sources, outdir, msgids, listids, rejectsfile, asmaildir):
- outboxes = {}
- writecount = {}
- seenids = []
- knownset = set(msgids)
+
+def process_archives(sources: List[str], outdir: str, knownset: Set[str], listids: List[str],
+ rejectsfile: str, asmaildir: bool, extrahdrs: List[Tuple[str, str]]) -> Set[str]:
+ outboxes = dict()
+ writecount = dict()
+ seenids = set()
if asmaildir:
outbox = mailbox.Maildir(outdir)
@@ -84,7 +89,7 @@ def process_archives(sources, outdir, msgids, listids, rejectsfile, asmaildir):
# convert listids into email addresses by replacing the first '.' to '@'.
# if you're working with a mailing list that has a non-standard list-id, you
# can specify the list email address as part of the listids to satisfy this check.
- eaddrs = []
+ eaddrs = list()
for listid in listids:
if listid.find('@') < 0:
eaddrs.append(listid.replace('.', '@', 1))
@@ -150,9 +155,9 @@ def process_archives(sources, outdir, msgids, listids, rejectsfile, asmaildir):
# Remove headers not in WANTHDRS list and any Received:
# lines that do not mention the list email address
- newhdrs = []
- to = []
- cc = []
+ newhdrs = list()
+ to = list()
+ cc = list()
recvtime = None
is_our_list = False
for hdrname, hdrval in list(msg._headers): # noqa
@@ -203,13 +208,13 @@ def process_archives(sources, outdir, msgids, listids, rejectsfile, asmaildir):
# so there's one field for each header type.
#
# Save the place in newhdrs where the first to or cc list would
- # have appeared so we can insert the merged list there rather
+ # have appeared, so we can insert the merged list there rather
# than strangely at the end.
elif lhdrname == 'to':
for pair in email.utils.getaddresses([hdrval]):
if pair[1] in cc:
- # already in Cc, so no need to add it to To
+ # already in Cc, so no need to add it to "To"
continue
to.append(formataddr(pair))
@@ -256,6 +261,9 @@ def process_archives(sources, outdir, msgids, listids, rejectsfile, asmaildir):
notourlist += 1
continue
+ if extrahdrs:
+ newhdrs += extrahdrs
+
msg._headers = newhdrs
msgdate = recvtime
@@ -278,10 +286,10 @@ def process_archives(sources, outdir, msgids, listids, rejectsfile, asmaildir):
try:
outbox.add(msg.as_string(policy=EMLPOLICY).encode())
- seenids.append(msgid)
+ seenids.add(msgid)
knownset.add(msgid)
writecount[mboxname] += 1
- except:
+ except: # noqa
# Oh well, toss it
pass
@@ -298,66 +306,84 @@ def process_archives(sources, outdir, msgids, listids, rejectsfile, asmaildir):
print(' %s: %s new (%s total)' %
(os.path.join(outdir, mboxname), writecount[mboxname], len(outboxes[mboxname])))
outboxes[mboxname].close()
- return seenids
else:
print('No new messages found.')
- return None
+
+ return seenids
-def main(args):
- if not args.asmaildir and not os.path.isdir(args.exportdir):
+def main(args: argparse.Namespace):
+ if not args.as_maildir and not os.path.isdir(args.exportdir):
os.mkdir(args.exportdir)
- if args.knownids and os.path.exists(args.knownids):
- with open(args.knownids, 'r') as fh:
- knownids = fh.read().splitlines()
- fh.close()
- print('Loaded %s message-ids from "%s"' % (len(knownids), args.knownids))
+ if args.known_ids and os.path.exists(args.known_ids):
+ if args.known_ids.endswith('.sqlite3'):
+ import sqlite3
+ dbconn = sqlite3.connect(args.known_ids, sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES)
+ cur = dbconn.cursor()
+ rows = cur.execute('SELECT mid FROM msgmap').fetchall()
+ knownids = {x[0] for x in rows}
+ else:
+ with open(args.known_ids, 'r') as fh:
+ knownids = set(fh.read().splitlines())
+ fh.close()
+ print('Loaded %s message-ids from "%s"' % (len(knownids), args.known_ids))
else:
# should we load message-ids from existing mailboxes found in the export dir?
# right now we're just appending to them, which is probably not expected behaviour.
- knownids = []
+ knownids = set()
if not args.source:
print('You have to specify at least one source')
sys.exit(1)
- # Make list ID matching case insensitive to match more mail
- listids = [listid.lower() for listid in args.listids]
+ # Make list ID matching case-insensitive to match more mail
+ if args.list_ids:
+ listids = [listid.lower() for listid in args.list_ids]
+ else:
+ listids = ['*']
+
+ extrahdrs = list()
+ if args.extrahdrs:
+ for hdr in args.extrahdrs:
+ name, val = hdr.split(':', maxsplit=1)
+ if val.strip():
+ extrahdrs.append((name.strip(), val.strip()))
- newids = process_archives(args.source, args.exportdir, knownids, listids, args.rejected, args.asmaildir)
+ newids = process_archives(args.source, args.exportdir, knownids, listids, args.rejected, args.as_maildir,
+ extrahdrs)
- if newids is None or not args.knownids:
+ if newids is None or not args.known_ids or args.known_ids.endswith('.sqlite3'):
sys.exit(0)
- new_idlist = knownids + newids
- print('Wrote %s msgids into %s (%s new)' % (len(new_idlist), args.knownids, len(newids)))
- with open(args.knownids, 'w') as fh:
- fh.write('\n'.join(new_idlist))
+ knownids.update(newids)
+ print('Wrote %s msgids into %s (%s new)' % (len(knownids), args.known_ids, len(newids)))
+ with open(args.known_ids, 'w') as fh:
+ fh.write('\n'.join(knownids))
fh.close()
if __name__ == '__main__':
- import argparse
-
# noinspection PyTypeChecker
parser = argparse.ArgumentParser(
description="Make a mbox of LKML messages we haven't yet archived",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
- parser.add_argument('-source', nargs='+',
+ parser.add_argument('-s', '--source', nargs='+',
help=('Mbox file with archives, can be multiple. '
'Paths with trailing "/" will be treated as maildirs.'))
- parser.add_argument('-exportdir', required=True, default='list-archives',
+ parser.add_argument('-e', '--exportdir', required=True, default='list-archives',
help='Export dir where to put sanitized archives')
- parser.add_argument('-asmaildir', action='store_true', default=False,
+ parser.add_argument('-m', '--as-maildir', action='store_true', default=False,
help='Export as maildir instead of mailboxes')
- parser.add_argument('-knownids',
- help='File with known Message-IDs (one per line)')
- parser.add_argument('-listids', required=True, nargs='+',
- help='List ID to match, can be multiple')
- parser.add_argument('-rejected',
+ parser.add_argument('-k', '--known-ids',
+ help='File with known Message-IDs (one per line, or msgmap.sqlite3)')
+ parser.add_argument('-l', '--list-ids', nargs='+',
+ help='Limit to just these list-ids (can be multiple)')
+ parser.add_argument('-r', '--rejected',
help='Mailbox file where to save messages that were rejected '
'(adds X-Import-Rejected-Reason header)')
+ parser.add_argument('-x', '--extrahdrs', nargs='+', metavar='FULLHDR',
+ help='Extra headers to inject into each message')
main(parser.parse_args())