aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorThomas Gleixner <tglx@linutronix.de>2020-10-01 11:37:05 +0200
committerThomas Gleixner <tglx@linutronix.de>2020-10-01 15:27:29 +0200
commitd86ba1a2b7356d80e885cbcbdec81e38c56aae5f (patch)
tree90015943ccd856433fc9e4ab2243212ec7e570d3
parent165baa721985ce9a1057f95886a6185edcda4efb (diff)
downloadquilttools-master.tar.gz
mb2q: Workaround broken mbox formats [HEAD] [master]
The mbox mailbox format requires that lines starting with 'From ' in the mail body are escaped, which is usually done by writing them as '>From '. Some mailbox generators, including patchwork, fail to do that. This confuses the Python mailbox parser, because it treats a 'From ' line in the mail body as the start of the next message. Work around it by parsing the binary file content of the mailbox manually and by applying heuristics to distinguish between a 'From ' line in the body and a valid unixfrom line which starts a new message. Reported-by: Alexei Starovoitov <alexei.starovoitov@gmail.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
-rwxr-xr-xmb2q78
1 files changed, 76 insertions, 2 deletions
diff --git a/mb2q b/mb2q
index c3ea998..ef16638 100755
--- a/mb2q
+++ b/mb2q
@@ -3,7 +3,7 @@
# Copyright Thomas Gleixner <tglx@linutronix.de>
from email.utils import make_msgid, formatdate
-from email import message_from_binary_file
+from email import message_from_binary_file, message_from_bytes
from email.policy import EmailPolicy
from argparse import ArgumentParser
from importlib import import_module
@@ -693,6 +693,80 @@ class nm_mbox(object):
def items(self):
return copy.copy(self.msgs)
def valid_unixfrom(bline):
    """Return True if *bline* looks like an mbox unixfrom separator line.

    A unixfrom line has the format ``From $Address $Datetime`` where
    ``$Datetime`` is in ctime() format, for example
    ``From user@example.org Thu Oct  1 11:37:05 2020``.

    Args:
        bline: One raw line of the mailbox file, as bytes.

    Returns:
        True when the line decodes, starts with 'From ', splits into
        three space-separated fields and the third field parses as a
        ctime() timestamp. False in every other case.
    """
    try:
        line = bline.decode()
    except UnicodeDecodeError:
        # Not decodable text, so it cannot be a separator line
        return False

    if not line.startswith('From '):
        return False

    # 'From', the address, and everything else (the timestamp)
    fields = line.split(' ', 2)
    if len(fields) != 3:
        return False

    try:
        # Without an explicit format strptime() expects ctime() layout
        time.strptime(fields[2].strip())
    except ValueError:
        return False
    return True
+
def empty_line(bline):
    """Return True if *bline* contains nothing but whitespace.

    Args:
        bline: One raw line of the mailbox file, as bytes.

    Returns:
        True when the decoded line is empty after stripping whitespace.
        False when it has content or cannot be decoded at all (an
        undecodable line is by definition not empty).
    """
    try:
        return not bline.decode().strip()
    except UnicodeDecodeError:
        return False
+
#
# mailbox.mbox trips over lines in the mail body which start with 'From '.
# Work around that by reading the mailbox file in binary format and
# searching for unixfrom headers.
#
# This is sloppy and incomplete but should cover the most obvious cases
# for mails on LKML etc. It's unlikely that the mail body contains a valid
# unixfrom line preceded by an empty line.
#
class solid_mbox(object):
    """Minimal mbox reader which tolerates unescaped 'From ' body lines.

    The file is split into messages at every line which is a valid
    unixfrom line (see valid_unixfrom()) and which is either the first
    line of the file or directly preceded by an empty line. The unixfrom
    line itself is not made part of the message.

    Attributes:
        msgs: List of (message-id, message) tuples in file order. The
            message-id is None when the mail has no Message-ID header.
    """

    def __init__(self, fpath):
        """Parse the mbox file at *fpath* and populate self.msgs."""
        self.msgs = []

        # Collect the lines of the current message and join them once.
        # Appending to a bytes object per line would copy the growing
        # message over and over.
        chunks = []
        prev_empty = True
        with open(fpath, 'rb') as mbox_file:
            for bline in mbox_file:
                if empty_line(bline):
                    prev_empty = True
                    chunks.append(bline)
                    continue

                # Unixfrom lines must be either at the start of the
                # file or preceded by an empty line. Checking that
                # first avoids the timestamp parsing for the vast
                # majority of lines.
                if prev_empty and valid_unixfrom(bline):
                    self._add_msg(b''.join(chunks))
                    chunks = []
                    continue

                # Regular content, or a 'From ' line which is not
                # preceded by an empty line and therefore body text
                prev_empty = False
                chunks.append(bline)

        # Handle the last msg
        self._add_msg(b''.join(chunks))

    def _add_msg(self, bmsg):
        """Parse the raw message *bmsg* (bytes) and append it to self.msgs.

        Nothing is added when *bmsg* is empty or whitespace only. That
        happens for the very first unixfrom line and for blank lines
        ahead of it.
        """
        if not bmsg.strip():
            return

        policy = EmailPolicy(utf8=True)
        msg = message_from_bytes(bmsg, policy=policy)
        msgid = msg.get('Message-ID', None)
        self.msgs.append((msgid, msg))

    def items(self):
        """Return a shallow copy of the (message-id, message) list."""
        return copy.copy(self.msgs)
+
if __name__ == '__main__':
parser = ArgumentParser(description='Mailbox 2 quilt converter')
parser.add_argument('inbox', metavar='inbox',
@@ -774,7 +848,7 @@ if __name__ == '__main__':
mbox = nm_mbox(args.inbox)
patchsuffix = 'notmuch_%s' %args.inbox.replace(':', '_')
elif os.path.isfile(args.inbox):
- mbox = mailbox.mbox(args.inbox, create=False)
+ mbox = solid_mbox(args.inbox)
elif os.path.isdir(args.inbox):
mbox = mailbox.Maildir(args.inbox, create=False)
else: