diff options
author | Thomas Gleixner <tglx@linutronix.de> | 2020-10-01 11:37:05 +0200 |
---|---|---|
committer | Thomas Gleixner <tglx@linutronix.de> | 2020-10-01 15:27:29 +0200 |
commit | d86ba1a2b7356d80e885cbcbdec81e38c56aae5f (patch) | |
tree | 90015943ccd856433fc9e4ab2243212ec7e570d3 | |
parent | 165baa721985ce9a1057f95886a6185edcda4efb (diff) | |
download | quilttools-master.tar.gz |
mailbox format requires that lines starting with 'From ' in the mail body
are escaped. That's usually done as '>From '.
Some mailbox generators, including patchwork, fail to do that, which confuses
the Python mailbox parser, as it considers a 'From ' line in the mail body
to be the start of the next message.
Work around it by parsing the binary file content of the mailbox manually
and by applying heuristics to distinguish between a 'From ' line in the
body and a valid unixfrom line which starts a new message.
Reported-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
-rwxr-xr-x | mb2q | 78 |
1 files changed, 76 insertions, 2 deletions
# Names used by the mailbox splitting code below.
import copy
import time
from email import message_from_bytes
from email.policy import EmailPolicy


def valid_unixfrom(bline):
    """Return True if the raw line looks like an mbox 'From ' separator.

    A unixfrom line has the format

        From $Address $Datetime

    where $Datetime is in ctime() format.

    bline: one raw line (bytes) of the mailbox file.

    Returns False for lines which cannot be decoded, do not start with
    'From ', have fewer than three fields or carry an unparseable date.
    """
    try:
        line = bline.decode()
    except UnicodeDecodeError:
        return False

    if not line.startswith('From '):
        return False

    try:
        # Unpacking raises ValueError when there are fewer than three
        # fields, strptime() when the remainder is not a ctime() date.
        _frm, _addr, datestr = line.split(' ', 2)
        time.strptime(datestr.strip())
    except ValueError:
        return False
    return True


def empty_line(bline):
    """Return True if the raw line consists of whitespace only.

    bline: one raw line (bytes) of the mailbox file.

    A line which cannot be decoded contains something else than
    whitespace and is therefore reported as not empty.
    """
    try:
        return not bline.decode().strip()
    except UnicodeDecodeError:
        return False


#
# mailbox.mbox trips over lines in the mail body which start with 'From '
# Work around that by reading the mailbox file in binary format and
# searching for unixfrom headers.
#
# This is sloppy and incomplete but should cover the most obvious cases
# for mails on LKML etc. It's unlikely that the mail body contains a valid
# unixfrom preceded by an empty line.
#
class solid_mbox(object):
    """Minimal mbox reader which tolerates unescaped 'From ' body lines.

    The file is split into messages at every valid unixfrom line which is
    either the first line of the file or preceded by an empty line. The
    unixfrom line itself is not part of the parsed message.

    fpath: path of the mbox file to read.

    The parsed messages are available via items() as a list of
    (Message-ID, message) tuples in file order.
    """

    def __init__(self, fpath):
        self.msgs = []

        # Collect the lines of the current message and join them once
        # instead of growing a bytes object line by line.
        chunk = []
        prev_empty = True
        with open(fpath, 'rb') as mboxfile:
            for bline in mboxfile:
                # Unixfrom lines must be either at the start of
                # the file or preceded by an empty line
                if empty_line(bline):
                    prev_empty = True
                    chunk.append(bline)
                    continue

                if not valid_unixfrom(bline):
                    prev_empty = False
                    chunk.append(bline)
                    continue

                # A unixfrom lookalike which does not follow an empty
                # line belongs to the mail body
                if not prev_empty:
                    chunk.append(bline)
                    continue

                self._add_msg(b''.join(chunk))
                chunk = []

        # Handle the last msg
        self._add_msg(b''.join(chunk))

    def _add_msg(self, bmsg):
        """Parse the raw message bytes and store (Message-ID, message)."""
        # Nothing has been collected when the first unixfrom line is
        # reached. Blank lines in front of a separator are not a
        # message either.
        if not bmsg.strip():
            return

        policy = EmailPolicy(utf8=True)
        msg = message_from_bytes(bmsg, policy=policy)
        msgid = msg.get('Message-ID', None)
        self.msgs.append((msgid, msg))

    def items(self):
        """Return a shallow copy of the list of (Message-ID, message)."""
        return copy.copy(self.msgs)


# Call site in the script's __main__ section: when the inbox argument is a
# regular file it is read with
#     mbox = solid_mbox(args.inbox)
# instead of
#     mbox = mailbox.mbox(args.inbox, create=False)