about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Konstantin Ryabitsev <konstantin@linuxfoundation.org> 2020-11-16 15:36:18 -0500
committer: Konstantin Ryabitsev <konstantin@linuxfoundation.org> 2020-11-16 15:36:18 -0500
commit: aabb998988810989806dbc2e0dace84d0fa909ed (patch)
tree: b01b5dcdf8947a0faa16055266a55833ec2137d8
parent: fb6f2278be47365c14c17241c81d22ca9276a7c4 (diff)
download: korg-helpers-aabb998988810989806dbc2e0dace84d0fa909ed.tar.gz
Fix mailman archive downloads
Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
-rwxr-xr-x list-archive-collector.py 15
1 files changed, 9 insertions, 6 deletions
diff --git a/list-archive-collector.py b/list-archive-collector.py
index 4e3a81f..d97d078 100755
--- a/list-archive-collector.py
+++ b/list-archive-collector.py
@@ -23,6 +23,7 @@ import re
import quopri
import base64
import gzip
+import io
import nntplib
import requests
import logging
@@ -298,7 +299,7 @@ def parse_pipermail_index(pipermail_url):
mboxes = []
for tag in soup.find_all('a'):
# we are looking for a href that ends with .txt.gz
- if 'href' in tag.attrs and tag.attrs['href'][-7:] == '.txt.gz':
+ if 'href' in tag.attrs and tag.attrs['href'].find('.txt') > 1:
mboxes.append(os.path.join(pipermail_url, tag.attrs['href']))
return mboxes
@@ -310,11 +311,13 @@ def grab_pipermail_archive(pipermail_url, mbx, listid, toaddr, lookaside, checks
logger.info(' grabbing %s', chunks[-1])
rses = get_requests_session()
+ # XXX: this can be horribly large
resp = rses.get(pipermail_url, stream=True)
-
- with gzip.GzipFile(fileobj=resp.raw) as uncompressed:
- # XXX: this can be horribly large
- mboxdata = uncompressed.read().decode('utf-8', errors='replace')
+ if resp.content.startswith(b'\x1f\x8b'):
+ with gzip.GzipFile(fileobj=io.BytesIO(resp.content)) as uncompressed:
+ mboxdata = uncompressed.read().decode('utf-8', errors='replace')
+ else:
+ mboxdata = resp.content.decode('utf-8', errors='replace')
resp.close()
@@ -342,7 +345,7 @@ def grab_pipermail_archive(pipermail_url, mbx, listid, toaddr, lookaside, checks
# Open it now as a mailbox
tmpmbx = mailbox.mbox(tmpfile)
for msg in tmpmbx:
- logger.info(' processing: %s', msg.get('Message-Id'))
+ logger.info(' processing: %s', msg.get('Message-Id'))
# Fix bogus From: foo@bar.baz (Foo Barski) -> Foo Barski <foo@bar.baz>
fromline = msg.get('From')
matches = re.search(r'(\S+@\S+\.\S+) \((.*)\)$', fromline)