about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Daniel Jordan via RT <kernel-helpdesk@rt.linuxfoundation.org> 2019-01-24 22:45:26 -0500
committer: Konstantin Ryabitsev <konstantin@linuxfoundation.org> 2019-01-25 15:08:05 -0500
commit: 45b6f51bb85688ab5db4386a61c766d7481bc85a (patch)
tree: 81f76caab6652b080873ae0b5e85e39437ac8500
parent: dc2306ba201b6f4a7c0d9675dcdf71f2549dae28 (diff)
download: korg-helpers-45b6f51bb85688ab5db4386a61c766d7481bc85a.tar.gz
handle multiple To: and Cc: fields in malformed emails
Emails should have at most one To: and one Cc: header, but malformed ones sometimes have more than that, causing 'notourlist' false positives and leaving legitimate messages out of the archive. Collapse multiple headers of the same type (To: or Cc:) into a single header.

Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
-rwxr-xr-x  list-archive-maker.py  35
1 file changed, 34 insertions, 1 deletion
diff --git a/list-archive-maker.py b/list-archive-maker.py
index bfd6478..b618a18 100755
--- a/list-archive-maker.py
+++ b/list-archive-maker.py
@@ -175,9 +175,13 @@ def main(sources, outdir, msgids, listids, rejectsfile):
# Remove headers not in WANTHDRS list and any Received:
# lines that do not mention the list email address
newhdrs = []
+ to_list = []
+ to_header_idx = None
+ cc_list = []
+ cc_header_idx = None
recvtime = None
is_our_list = False
- for hdrname, hdrval in msg._headers:
+ for hdrname, hdrval in list(msg._headers):
lhdrname = hdrname.lower()
if is_nntp and lhdrname.find('original-') == 0:
lhdrname = lhdrname.replace('original-', '')
@@ -224,9 +228,38 @@ def main(sources, outdir, msgids, listids, rejectsfile):
is_our_list = True
break
+ # Malformed emails can have multiple to: and cc: fields. Merge
+ # so there's one field for each header type.
+ #
+ # Save the place in newhdrs where the first to or cc list would
+ # have appeared so we can insert the merged list there rather
+ # than strangely at the end.
+
+ elif lhdrname == 'to':
+ to_list.extend(hdrval.split(','))
+ msg._headers.remove((hdrname, hdrval))
+ if to_header_idx is None:
+ to_header_idx = len(newhdrs)
+
+ elif lhdrname == 'cc':
+ cc_list.extend(hdrval.split(','))
+ msg._headers.remove((hdrname, hdrval))
+ if cc_header_idx is None:
+ cc_header_idx = len(newhdrs)
+
else:
newhdrs.append((hdrname, hdrval))
+ if len(to_list) > 0:
+ to_header = ('To', ', '.join(to_list))
+ msg._headers.append(to_header)
+ newhdrs.insert(to_header_idx, to_header)
+
+ if len(cc_list) > 0:
+ cc_header = ('Cc', ', '.join(cc_list))
+ msg._headers.append(cc_header)
+ newhdrs.insert(cc_header_idx, cc_header)
+
if not is_our_list:
# Sometimes a message is cc'd to multiple mailing lists and the
# archives only contain a copy of the message that was delivered to a