Refactor get-lore-mbox

As the feature set grew, it became obvious that the structure needed to be less hacky (initial code was just barely beyond a proof of concept). This moves most of the am-mangling code into classes where it makes much more sense, plus makes debugging easier.

Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
author: Konstantin Ryabitsev <konstantin@linuxfoundation.org> 2020-02-12 11:32:50 -0500
committer: Konstantin Ryabitsev <konstantin@linuxfoundation.org> 2020-02-12 11:32:50 -0500
commit: 560dcf24ba0a716565a9f8552c271b687238b59a (patch)
tree: 78a899ca4c320032b71b760b07ee3a8473fd0555
parent: edec683e42126d2aa918010f38ba84e6d83edbea (diff)
download: korg-helpers-560dcf24ba0a716565a9f8552c271b687238b59a.tar.gz
1 files changed, 496 insertions, 305 deletions
diff --git a/get-lore-mbox.py b/get-lore-mbox.py
index 75f813f..adc542b 100755
--- a/get-lore-mbox.py
+++ b/get-lore-mbox.py
@@ -11,6 +11,7 @@ import mailbox
 import email
 import email.message
 import email.utils
+import email.header
 import subprocess
 import logging
 import re
@@ -19,7 +20,7 @@ import time
 
 import requests
 import urllib.parse
-import xml.etree.ElementTree as ET
+import xml.etree.ElementTree
 import gzip
 
 from tempfile import mkstemp
@@ -50,6 +51,436 @@ WANTHDRS = {'sender',
             }
 
 
+class LoreMailbox:
+    def __init__(self):
+        self.msgid_map = dict()
+        self.series = dict()
+        self.followups = list()
+        self.unknowns = list()
+
+    def __repr__(self):
+        out = list()
+        for key, lser in self.series.items():
+            out.append(str(lser))
+        out.append('--- Followups ---')
+        for lmsg in self.followups:
+            out.append('  %s' % lmsg.full_subject)
+        out.append('--- Unknowns ---')
+        for lmsg in self.unknowns:
+            out.append('  %s' % lmsg.full_subject)
+
+        return '\n'.join(out)
+
+    def get_by_msgid(self, msgid):
+        if msgid in self.msgid_map:
+            return self.msgid_map[msgid]
+        return None
+
+    def get_series(self, revision=None):
+        if revision is None:
+            if not len(self.series):
+                return None
+            # Use the highest revision
+            revision = max(self.series.keys())
+        elif revision not in self.series.keys():
+            return None
+
+        lser = self.series[revision]
+
+        # Do we have a cover letter for it?
+        if not lser.has_cover:
+            # Let's find the first patch with an in-reply-to and see if that
+            # is our cover letter
+            for member in lser.patches:
+                if member is not None and member.in_reply_to is not None:
+                    potential = self.get_by_msgid(member.in_reply_to)
+                    if potential.has_diffstat and not potential.has_diff:
+                        # This is *probably* the cover letter
+                        lser.patches[0] = potential
+                        lser.has_cover = True
+                        break
+
+        # Do we have any follow-ups?
+        for fmsg in self.followups:
+            logger.debug('Analyzing follow-up: %s', fmsg.full_subject)
+            # If there are no trailers in this one, ignore it
+            if not len(fmsg.trailers):
+                continue
+            # if it's for the wrong revision, ignore it
+            if lser.revision != fmsg.revision:
+                continue
+            # Go up through the follow-ups and tally up trailers until
+            # we either run out of in-reply-tos, or we find a patch in
+            # our series
+            pmsg = self.msgid_map[fmsg.in_reply_to]
+            trailers = fmsg.trailers
+            lvl = 1
+            while True:
+                logger.debug('%sParent: %s', ' ' * lvl, pmsg.full_subject)
+                logger.debug('%sTrailers: %s', ' ' * lvl, trailers)
+                found = False
+                for lmsg in lser.patches:
+                    if lmsg is not None and lmsg.msgid == pmsg.msgid:
+                        # Confirmed, this is our parent patch
+                        lmsg.followup_trailers += trailers
+                        found = True
+                        break
+                if found:
+                    break
+                elif pmsg.in_reply_to:
+                    lvl += 1
+                    trailers += pmsg.trailers
+                    pmsg = self.msgid_map[pmsg.in_reply_to]
+                else:
+                    break
+
+        return lser
+
+    def add_message(self, msg):
+        lmsg = LoreMessage(msg)
+        logger.debug('Looking at: %s', lmsg.full_subject)
+        self.msgid_map[lmsg.msgid] = lmsg
+
+        if lmsg.lsubject.patch:
+            if lmsg.revision not in self.series:
+                self.series[lmsg.revision] = LoreSeries(lmsg.revision, lmsg.expected)
+                if len(self.series) > 1:
+                    logger.info('Found new series v%s', lmsg.revision)
+            if lmsg.has_diff and not lmsg.reply:
+                self.series[lmsg.revision].add_patch(lmsg)
+            elif lmsg.counter == 0 and not lmsg.reply:
+                # Bona-fide cover letter
+                self.series[lmsg.revision].add_cover(lmsg)
+            elif lmsg.reply:
+                # We'll figure out where this belongs later
+                self.followups.append(lmsg)
+        elif lmsg.reply:
+            self.followups.append(lmsg)
+        else:
+            self.unknowns.append(lmsg)
+
+
+class LoreSeries:
+    def __init__(self, revision, expected):
+        self.revision = revision
+        self.expected = expected
+        self.patches = [None] * (expected+1)
+        self.followups = list()
+        self.complete = False
+        self.has_cover = False
+
+    def __repr__(self):
+        out = list()
+        if self.has_cover:
+            out.append('- Series: [v%s] %s' % (self.revision, self.patches[0].subject))
+        elif self.patches[1] is not None:
+            out.append('- Series: [v%s] %s' % (self.revision, self.patches[1].subject))
+        else:
+            out.append('- Series: [v%s] (untitled)' % self.revision)
+
+        out.append('  revision: %s' % self.revision)
+        out.append('  expected: %s' % self.expected)
+        out.append('  complete: %s' % self.complete)
+        out.append('  has_cover: %s' % self.has_cover)
+        out.append('  patches:')
+        at = 0
+        for member in self.patches:
+            if member is not None:
+                out.append('    [%s/%s] %s' % (at, self.expected, member.subject))
+                if member.followup_trailers:
+                    out.append('       Add: %s' % ', '.join(member.followup_trailers))
+            else:
+                out.append('    [%s/%s] MISSING' % (at, self.expected))
+            at += 1
+
+        return '\n'.join(out)
+
+    def add_patch(self, lmsg):
+        while len(self.patches) < lmsg.expected + 1:
+            self.patches.append(None)
+        self.expected = lmsg.expected
+        self.patches[lmsg.counter] = lmsg
+        self.complete = not (None in self.patches[1:])
+
+    def add_cover(self, lmsg):
+        self.add_patch(lmsg)
+        self.has_cover = True
+
+    def get_slug(self):
+        # Find the first non-None entry
+        lmsg = None
+        for lmsg in self.patches:
+            if lmsg is not None:
+                break
+
+        if lmsg is None:
+            return 'undefined'
+
+        msgdate = email.utils.parsedate_tz(str(lmsg.msg['Date']))
+        prefix = time.strftime('%Y%m%d', msgdate[:9])
+        authorline = email.utils.getaddresses(lmsg.msg.get_all('from', []))[0]
+        author = re.sub(r'\W+', '_', authorline[1]).strip('_').lower()
+        slug = '%s_%s' % (prefix, author)
+        if self.revision != 1:
+            slug = 'v%s_%s' % (self.revision, slug)
+
+        return slug
+
+    def save_am_mbox(self, outfile, covertrailers):
+        if os.path.exists(outfile):
+            os.unlink(outfile)
+        mbx = mailbox.mbox(outfile)
+        logger.info('---')
+        logger.critical('Writing %s', outfile)
+        at = 1
+        for lmsg in self.patches[1:]:
+            if lmsg is not None:
+                if self.has_cover and covertrailers and self.patches[0].followup_trailers:
+                    lmsg.followup_trailers += self.patches[0].followup_trailers
+                logger.info('  %s', lmsg.full_subject)
+                msg = lmsg.get_am_message()
+                mbx.add(msg)
+            else:
+                logger.error('  ERROR: missing [%s/%s]!', at, self.expected)
+            at += 1
+        return mbx
+
+    def save_cover(self, outfile):
+        cover_msg = self.patches[0].get_am_message(add_trailers=False)
+        with open(outfile, 'w') as fh:
+            fh.write(cover_msg.as_string())
+        logger.critical('Cover: %s', outfile)
+
+
+class LoreMessage:
+    def __init__(self, msg):
+        self.msg = msg
+        self.msgid = None
+
+        # Subject-based info
+        self.lsubject = None
+        self.full_subject = None
+        self.subject = None
+        self.reply = False
+        self.revision = 1
+        self.counter = 1
+        self.expected = 1
+        self.revision_inferred = True
+        self.counters_inferred = True
+
+        # Header-based info
+        self.in_reply_to = None
+
+        # Body and body-based info
+        self.body = None
+        self.has_diff = False
+        self.has_diffstat = False
+        self.trailers = list()
+        self.followup_trailers = list()
+
+        self.msgid = LoreMessage.get_clean_msgid(self.msg)
+        self.lsubject = LoreSubject(msg['Subject'])
+        # Copy them into this object for convenience
+        self.full_subject = self.lsubject.full_subject
+        self.subject = self.lsubject.subject
+        self.reply = self.lsubject.reply
+        self.revision = self.lsubject.revision
+        self.counter = self.lsubject.counter
+        self.expected = self.lsubject.expected
+        self.revision_inferred = self.lsubject.revision_inferred
+        self.counters_inferred = self.lsubject.counters_inferred
+
+        self.in_reply_to = LoreMessage.get_clean_msgid(self.msg, header='In-Reply-To')
+
+        # walk until we find the first text/plain part
+        mcharset = self.msg.get_content_charset()
+        if not mcharset:
+            mcharset = 'utf-8'
+        body = None
+        for part in msg.walk():
+            if part.get_content_type().find('text/plain') < 0:
+                continue
+            body = part.get_payload(decode=True)
+            if body is None:
+                continue
+            pcharset = part.get_content_charset()
+            if not pcharset:
+                pcharset = mcharset
+            body = body.decode(pcharset, errors='replace')
+            break
+        self.body = body
+
+        if re.search(r'^\s*\d+\sfile.*\d+ insertion.*\d+ deletion', self.body, re.MULTILINE | re.IGNORECASE):
+            self.has_diffstat = True
+        if re.search(r'^---.*\n\+\+\+', self.body, re.MULTILINE):
+            self.has_diff = True
+
+        # Do we have something that looks like a trailer?
+        matches = re.findall(r'^\s*([\w-]+: .*<\S+>)\s*$', self.body, re.MULTILINE)
+        if matches:
+            self.trailers = matches
+
+    def __repr__(self):
+        out = list()
+        out.append('msgid: %s' % self.msgid)
+        out.append(str(self.lsubject))
+
+        out.append('  in_reply_to: %s' % self.in_reply_to)
+
+        # Header-based info
+        out.append('  --- begin body ---')
+        for line in self.body.split('\n'):
+            out.append('  |%s' % line)
+        out.append('  --- end body ---')
+
+        # Body and body-based info
+        out.append('  has_diff: %s' % self.has_diff)
+        out.append('  has_diffstat: %s' % self.has_diffstat)
+        out.append('  --- begin my trailers ---')
+        for trailer in self.trailers:
+            out.append('  |%s' % trailer)
+        out.append('  --- begin followup trailers ---')
+        for trailer in self.followup_trailers:
+            out.append('  |%s' % trailer)
+        out.append('  --- end trailers ---')
+
+        return '\n'.join(out)
+
+    @staticmethod
+    def clean_header(hdrval):
+        new_hdrval = ''
+        dhdrs = email.header.decode_header(hdrval)
+        for dhdr in dhdrs:
+            if dhdr[1] is not None:
+                try:
+                    uval = dhdr[0].decode(dhdr[1], errors='replace')
+                except LookupError:
+                    # Not known charset/encoding. Try utf-8 and hope for the best.
+                    uval = dhdr[0].decode('utf-8', errors='replace')
+            elif isinstance(dhdr[0], (bytes, bytearray)):
+                uval = dhdr[0].decode('utf-8', errors='replace')
+            else:
+                uval = dhdr[0]
+            uval = uval.replace('\n', ' ')
+            new_hdrval += re.sub(r'\s+', ' ', uval).strip()
+        return new_hdrval
+
+    @staticmethod
+    def get_clean_msgid(msg, header='Message-Id'):
+        msgid = None
+        raw = msg.get(header)
+        if raw:
+            matches = re.search(r'<([^>]+)>', LoreMessage.clean_header(raw))
+            if matches:
+                msgid = matches.groups()[0]
+        return msgid
+
+    def get_am_message(self, add_trailers=True):
+        am_body = self.body
+        if add_trailers and self.followup_trailers:
+            cmdargs = None
+            for trailer in set(self.followup_trailers):
+                # Check if this trailer is already in the body
+                if trailer not in self.trailers:
+                    logger.info('    Adding trailer: %s', trailer)
+                    if cmdargs is None:
+                        cmdargs = ['interpret-trailers']
+                    cmdargs += ['--trailer', trailer]
+            if cmdargs:
+                am_body = git_run_command(None, args=cmdargs, stdin=am_body.encode('utf-8'))
+        am_msg = email.message.EmailMessage()
+        am_msg.set_payload(am_body.encode('utf-8'))
+        # Clean up headers
+        for hdrname, hdrval in self.msg.items():
+            lhdrname = hdrname.lower()
+            wanthdr = False
+            for hdrmatch in WANTHDRS:
+                if fnmatch.fnmatch(lhdrname, hdrmatch):
+                    wanthdr = True
+                    break
+            if wanthdr:
+                new_hdrval = LoreMessage.clean_header(hdrval)
+                am_msg.add_header(hdrname, new_hdrval)
+        return am_msg
+
+
+class LoreSubject:
+    def __init__(self, subject):
+        # Subject-based info
+        self.full_subject = None
+        self.subject = None
+        self.reply = False
+        self.resend = False
+        self.patch = False
+        self.rfc = False
+        self.revision = 1
+        self.counter = 1
+        self.expected = 1
+        self.revision_inferred = True
+        self.counters_inferred = True
+        self.prefixes = list()
+
+        subject = re.sub(r'\s+', ' ', LoreMessage.clean_header(subject)).strip()
+        # Remove any leading [] that don't have the word "patch" in them
+        while True:
+            oldsubj = subject
+            subject = re.sub(r'^\s*\[[^\]]*\]\s*(\[patch.*)', '\\1', subject, flags=re.IGNORECASE)
+            if oldsubj == subject:
+                break
+
+        # Remove any brackets inside brackets
+        while True:
+            oldsubj = subject
+            subject = re.sub(r'^\s*\[([^\]]*)\[([^\]]*)\]', '[\\1\\2]', subject)
+            subject = re.sub(r'^\s*\[([^\]]*)\]([^\]]*)\]', '[\\1\\2]', subject)
+            if oldsubj == subject:
+                break
+
+        self.full_subject = subject
+        # Is it a reply?
+        if re.search(r'^\w+:\s*\[', subject):
+            self.reply = True
+            subject = re.sub(r'^\w+:\s*\[', '[', subject)
+
+        # Find all [foo] in the title
+        while subject.find('[') == 0:
+            matches = re.search(r'^\[([^\]]*)\]', subject)
+            for chunk in matches.groups()[0].split():
+                if re.search(r'^\d+/\d+$', chunk):
+                    counters = chunk.split('/')
+                    self.counter = int(counters[0])
+                    self.expected = int(counters[1])
+                    self.counters_inferred = False
+                elif re.search(r'^v\d+$', chunk, re.IGNORECASE):
+                    self.revision = int(chunk[1:])
+                    self.revision_inferred = False
+                elif chunk.lower() == 'rfc':
+                    self.rfc = True
+                elif chunk.lower() == 'resend':
+                    self.resend = True
+                elif chunk.lower() == 'patch':
+                    self.patch = True
+                self.prefixes.append(chunk.lower())
+            subject = re.sub(r'^\s*\[[^\]]*\]\s*', '', subject)
+        self.subject = subject
+
+    def __repr__(self):
+        out = list()
+        out.append('  full_subject: %s' % self.full_subject)
+        out.append('  subject: %s' % self.subject)
+        out.append('  reply: %s' % self.reply)
+        out.append('  resend: %s' % self.resend)
+        out.append('  rfc: %s' % self.rfc)
+        out.append('  revision: %s' % self.revision)
+        out.append('  revision_inferred: %s' % self.revision_inferred)
+        out.append('  counter: %s' % self.counter)
+        out.append('  expected: %s' % self.expected)
+        out.append('  counters_inferred: %s' % self.counters_inferred)
+        out.append('  prefixes: %s' % ', '.join(self.prefixes))
+
+        return '\n'.join(out)
+
+
 def git_get_command_lines(gitdir, args):
     out = git_run_command(gitdir, args)
     lines = list()
@@ -86,26 +517,6 @@ def git_run_command(gitdir, args, stdin=None, logstderr=False):
     return output
 
 
-def amify_msg(msg, trailers, ensurediff=False):
-    body = get_plain_part(msg, ensurediff=ensurediff)
-    if trailers:
-        body = git_add_trailers(body, trailers)
-    msg.set_payload(body.encode('utf-8'))
-    # Clean up headers
-    newhdrs = []
-    for hdrname, hdrval in list(msg._headers):
-        lhdrname = hdrname.lower()
-        wanthdr = False
-        for hdrmatch in WANTHDRS:
-            if fnmatch.fnmatch(lhdrname, hdrmatch):
-                wanthdr = True
-                break
-        if wanthdr:
-            newhdrs.append((hdrname, hdrval))
-    msg._headers = newhdrs
-    return msg
-
-
 def get_config_from_git():
     gitconfig = _DEFAULT_CONFIG
     args = ['config', '-z', '--get-regexp', r'get-lore-mbox\..*']
@@ -137,8 +548,14 @@ def get_msgid_from_stdin():
 
 def get_pi_thread_by_url(t_mbx_url, savefile):
     resp = requests.get(t_mbx_url)
+    if resp.status_code != 200:
+        logger.critical('Server returned an error: %s', resp.status_code)
+        return None
     t_mbox = gzip.decompress(resp.content)
     resp.close()
+    if not len(t_mbox):
+        logger.critical('No messages found for that query')
+        return None
     with open(savefile, 'wb') as fh:
         logger.debug('Saving %s', savefile)
         fh.write(t_mbox)
@@ -167,57 +584,14 @@ def get_pi_thread_by_msgid(msgid, config, cmdargs):
 
     loc = urllib.parse.urlparse(t_mbx_url)
     if cmdargs.useproject:
+        logger.info('Modifying query to use %s', cmdargs.useproject)
         t_mbx_url = '%s://%s/%s/%s/t.mbox.gz' % (
             loc.scheme, loc.netloc, cmdargs.useproject, msgid)
+        logger.debug('Will query: %s', t_mbx_url)
     logger.critical('Grabbing thread from %s', loc.netloc)
     return get_pi_thread_by_url(t_mbx_url, savefile)
 
 
-def get_plain_part(msg, ensurediff=False):
-    # walk until we find the first text/plain part
-    mcharset = msg.get_content_charset()
-    if not mcharset:
-        mcharset = 'utf-8'
-    body = None
-    for part in msg.walk():
-        if part.get_content_type().find('text/plain') < 0:
-            continue
-        body = part.get_payload(decode=True)
-        if body is None:
-            continue
-        pcharset = part.get_content_charset()
-        if not pcharset:
-            pcharset = mcharset
-        body = body.decode(pcharset, errors='replace')
-        if ensurediff and not body_contains_diff(body):
-            continue
-        break
-    return body
-
-
-def git_add_trailers(payload, trailers):
-    cmdargs = ['interpret-trailers']
-    output = payload
-    if trailers:
-        for trailer in set(trailers):
-            # Check if this trailer is already in the body
-            if payload.find(trailer) < 0:
-                logger.info('    Adding trailer: %s', trailer)
-                cmdargs += ['--trailer', trailer]
-        output = git_run_command(None, args=cmdargs, stdin=payload.encode('utf-8'))
-    return output
-
-
-def get_clean_msgid(msg, header='Message-ID'):
-    msgid = None
-    raw = msg.get(header)
-    if raw:
-        matches = re.search(r'<([^>]+)>', raw)
-        if matches:
-            msgid = matches.groups()[0]
-    return msgid
-
-
 def mbox_to_am(mboxfile, config, cmdargs):
     outdir = cmdargs.outdir
     wantver = cmdargs.wantver
@@ -226,189 +600,59 @@ def mbox_to_am(mboxfile, config, cmdargs):
     mbx = mailbox.mbox(mboxfile)
     count = len(mbx)
     logger.info('Analyzing %s messages in the thread', count)
-    am_kept = list()
-    slug = None
-    cover_keys = dict()
-    sorted_keys = [None, None]
-    trailer_map = dict()
-    cur_vn = None
-    vn = None
-    multiple_revisions = False
-    msgid_map = dict()
-    irt_map = dict()
-    # Go through the mbox once to build a message map:
+    lmbx = LoreMailbox()
+    # Go through the mbox once to populate base series
     for key, msg in mbx.items():
-        msgid = get_clean_msgid(msg)
-        irtid = get_clean_msgid(msg, header='In-Reply-To')
-        msgid_map[msgid] = key
-        if irtid is not None:
-            if irtid not in irt_map.keys():
-                irt_map[irtid] = list()
-            irt_map[irtid].append(key)
-    # Go through it slowly now
-    for key, msg in mbx.items():
-        subj_info = get_subject_info(msg['Subject'])
-        logger.debug('Looking at msg %s: %s', key, subj_info['full_subject'])
-        body = get_plain_part(msg)
-        msgid = get_clean_msgid(msg)
-        irtid = get_clean_msgid(msg, header='In-Reply-To')
-        has_diffstat = body_contains_diffstat(body)
-        has_diff = body_contains_diff(body)
-
-        # if it has no in-reply-to, but other messages I-R-T to it, then
-        # it's probably a cover letter that doesn't follow the standard 00/NN notation
-        if irtid is None and not has_diff and msgid in irt_map.keys():
-            logger.debug('  Probaby a cover letter')
-            cover_keys[subj_info['revision']] = key
-            continue
-
-        if subj_info['revision_inferred'] and irtid in msgid_map:
-            # Grab revision info from the cover letter
-            cover_subj_info = get_subject_info(mbx[msgid_map[irtid]]['Subject'])
-            subj_info['revision'] = cover_subj_info['revision']
-            # Make sure sorted_keys has enough members
-            if len(sorted_keys) < subj_info['expected'] + 1:
-                sorted_keys = [None] * (subj_info['expected'] + 1)
-
-        new_vn = subj_info['revision']
-        if cur_vn is None or new_vn > cur_vn:
-            if wantver and wantver != new_vn:
-                logger.info('Found series revision: v%s (ignored)', new_vn)
-            else:
-                logger.info('Found series revision: v%s', new_vn)
-            if cur_vn is not None and new_vn > cur_vn:
-                multiple_revisions = True
-            if wantver is None or wantver == new_vn:
-                # Blow away anything we currently have in sorted_keys
-                sorted_keys = [None] * (subj_info['expected'] + 1)
-                slug = None
-            cur_vn = new_vn
-        elif vn is None:
-            cur_vn = new_vn
-
-        if wantver is not None and wantver != cur_vn:
-            logger.debug('  Ignoring v%s: %s', cur_vn, subj_info['full_subject'])
-            continue
-
-        vn = cur_vn
-
-        # We use a "slug" for mbox name, based on the date and author
-        if not slug:
-            msgdate = email.utils.parsedate_tz(str(msg['Date']))
-            prefix = time.strftime('%Y%m%d', msgdate[:9])
-            authorline = email.utils.getaddresses(msg.get_all('from', []))[0]
-            if authorline[0]:
-                author = re.sub(r'\W+', '_', authorline[0]).strip('_').lower()
-            else:
-                author = re.sub(r'\W+', '_', authorline[1]).strip('_').lower()
-            slug = '%s_%s' % (prefix, author)
-            if cur_vn != 1:
-                slug = 'v%s_%s' % (cur_vn, slug)
-
-        # If the counter is 0, it's definitely the cover letter
-        if subj_info['counter'] == 0 and cur_vn not in cover_keys.keys():
-            # Found the cover letter
-            logger.debug('  Found a cover letter for v%s', cur_vn)
-            am_kept.append(key)
-            sorted_keys[subj_info['counter']] = key
-            cover_keys[cur_vn] = key
-            continue
+        lmbx.add_message(msg)
 
-        if has_diff:
-            # Do we already have a match for this, though?
-            if sorted_keys[subj_info['counter']] is None:
-                am_kept.append(key)
-                sorted_keys[subj_info['counter']] = key
-                continue
-        # Do we have something that looks like a new trailer?
-        matches = re.search(r'^\s*([\w-]+: .*<\S+>)\s*$', body, re.MULTILINE)
-        if not matches:
-            continue
-        # Where do we need to stick them?
-        irt_key = 0
-        irt_id = get_clean_msgid(msg, header='In-Reply-To')
-        if irt_id and irt_id in msgid_map:
-            irt_key = msgid_map[irt_id]
-        if irt_key not in trailer_map:
-            trailer_map[irt_key] = list()
-        trailer_map[irt_key] += matches.groups()
-
-    if not len(am_kept):
-        logger.info('Did not find any patches to save')
-        return None
+    lser = lmbx.get_series(revision=wantver)
+    if lser is None and wantver is None:
+        logger.critical('No patches found.')
+        return
+    if lser is None:
+        logger.critical('Unable to find revision %s', wantver)
+        return
+    if len(lmbx.series) > 1 and not wantver:
+        logger.info('Will use the latest revision: v%s', lser.revision)
+        logger.info('You can pick other revisions using the -vN flag')
 
-    if not wantname:
-        am_filename = os.path.join(outdir, '%s.mbx' % slug)
-        am_cover = os.path.join(outdir, '%s.cover' % slug)
-    else:
-        am_filename = os.path.join(outdir, wantname)
-        am_cover = os.path.join(outdir, '%s.cover' % wantname)
-        if wantname.find('.') < 0:
-            slug = wantname
-        else:
+    if wantname:
+        slug = wantname
+        if wantname.find('.') > -1:
             slug = '.'.join(wantname.split('.')[:-1])
+    else:
+        slug = lser.get_slug()
 
-    if multiple_revisions and not wantver:
-        logger.info('Will use the latest revision: v%s', vn)
-        logger.info('You can pick other revisions using the -vN flag')
-    if os.path.exists(am_filename):
-        os.unlink(am_filename)
-    am_mbx = mailbox.mbox(am_filename)
-    logger.info('---')
+    am_filename = os.path.join(outdir, '%s.mbx' % lser.get_slug())
+    am_cover = os.path.join(outdir, '%s.cover' % lser.get_slug())
 
-    # Check if any trailers were sent to the cover letter
-    global_trailers = []
-    if vn in cover_keys and cover_keys[vn] in trailer_map:
-        global_trailers = trailer_map[cover_keys[vn]]
-
-    logger.critical('Writing %s', am_filename)
-    have_missing = False
-    at = 1
-    for key in sorted_keys[1:]:
-        if key is None:
-            logger.error('  ERROR: missing [%s/%s]!', at, len(sorted_keys)-1)
-            have_missing = True
-        else:
-            msg = mbx[key]
-            subject = re.sub(r'\s+', ' ', msg['Subject'])
-            logger.info('  %s', subject)
-            trailers = []
-            if key in trailer_map:
-                trailers += trailer_map[key]
-            if global_trailers and covertrailers:
-                trailers += global_trailers
-            msg = amify_msg(msg, trailers, ensurediff=True)
-            am_mbx.add(msg)
-        at += 1
-
-    if not len(am_mbx):
-        logger.info('Did not find any patches to save')
-        return None
+    am_mbx = lser.save_am_mbox(am_filename, covertrailers)
+    logger.info('---')
 
     logger.critical('Total patches: %s', len(am_mbx))
-    if global_trailers and not covertrailers:
+    if lser.has_cover and lser.patches[0].followup_trailers and not covertrailers:
         # Warn that some trailers were sent to the cover letter
         logger.critical('---')
         logger.critical('NOTE: Some trailers were sent to the cover letter:')
-        for trailer in global_trailers:
+        for trailer in lser.patches[0].followup_trailers:
             logger.critical('      %s', trailer)
         logger.critical('NOTE: Rerun with -t to apply them to all patches')
 
     logger.critical('---')
-    if have_missing:
+    if not lser.complete:
         logger.critical('WARNING: Thread incomplete!')
 
-    if vn in cover_keys:
-        # Save the cover letter
-        cover_msg = amify_msg(mbx[cover_keys[vn]], None, ensurediff=False)
-        with open(am_cover, 'w') as fh:
-            fh.write(cover_msg.as_string())
-        logger.critical('Cover: %s', am_cover)
-        first_body = get_plain_part(cover_msg)
-    else:
-        first_body = get_plain_part(am_mbx[0])
+    if lser.has_cover:
+        lser.save_cover(am_cover)
+
+    top_msgid = None
+    first_body = None
+    for lmsg in lser.patches:
+        if lmsg is not None:
+            first_body = lmsg.body
+            top_msgid = lmsg.msgid
+            break
 
-    top_msgid = get_clean_msgid(am_mbx[0])
     linkurl = config['linkmask'] % top_msgid
     logger.critical('Link: %s', linkurl)
 
@@ -435,60 +679,7 @@ def mbox_to_am(mboxfile, config, cmdargs):
     return am_filename
 
 
-def get_subject_info(subject):
-    subject = re.sub(r'\s+', ' ', subject).strip()
-    subject_info = {
-        'full_subject': subject,
-        'reply': False,
-        'resend': False,
-        'rfc': False,
-        'revision': 1,
-        'revision_inferred': True,
-        'counter': 1,
-        'expected': 1,
-        'prefixes': list(),
-        'subject': None,
-    }
-    # Is it a reply?
-    if re.search(r'^\w+:\s*\[', subject):
-        subject_info['reply'] = True
-        subject = re.sub(r'^\w+:\s*\[', '[', subject)
-
-    # Find all [foo] in the title
-    while subject.find('[') == 0:
-        matches = re.search(r'^\[([^\]]*)\]', subject)
-        for chunk in matches.groups()[0].split():
-            if re.search(r'^\d+/\d+$', chunk):
-                counters = chunk.split('/')
-                subject_info['counter'] = int(counters[0])
-                subject_info['expected'] = int(counters[1])
-            elif re.search(r'^v\d+$', chunk, re.IGNORECASE):
-                subject_info['revision'] = int(chunk[1:])
-                subject_info['revision_inferred'] = False
-            elif chunk.lower() == 'rfc':
-                subject_info['rfc'] = True
-            elif chunk.lower() == 'resend':
-                subject_info['resend'] = True
-            subject_info['prefixes'].append(chunk.lower())
-        subject = re.sub(r'^\s*\[[^\]]*\]\s*', '', subject)
-    subject_info['subject'] = subject
-
-    return subject_info
-
-
-def body_contains_diffstat(body):
-    if re.search(r'^\s*\d+\sfile.*\d+ insertion.*\d+ deletion', body, re.MULTILINE | re.IGNORECASE):
-        return True
-    return False
-
-
-def body_contains_diff(body):
-    if re.search(r'^---.*\n\+\+\+', body, re.MULTILINE):
-        return True
-    return False
-
-
-def get_newest_series(mboxfile, cmdargs):
+def get_newest_series(mboxfile):
     # Open the mbox and find the latest series mentioned in it
     mbx = mailbox.mbox(mboxfile)
     base_msg = None
@@ -496,66 +687,66 @@ def get_newest_series(mboxfile, cmdargs):
     seen_msgids = list()
     seen_covers = list()
     for key, msg in mbx.items():
-        msgid = get_clean_msgid(msg)
+        msgid = LoreMessage.get_clean_msgid(msg)
         seen_msgids.append(msgid)
-        subj_info = get_subject_info(msg['Subject'])
+        lsub = LoreSubject(msg['Subject'])
         # Ignore replies or counters above 1
-        if subj_info['reply'] or subj_info['counter'] > 1:
+        if lsub.reply or lsub.counter > 1:
             continue
-        if latest_revision is None or subj_info['revision'] > latest_revision:
+        if latest_revision is None or lsub.revision > latest_revision:
             # New revision
-            latest_revision = subj_info['revision']
-            if subj_info['counter'] == 0:
+            latest_revision = lsub.revision
+            if lsub.counter == 0:
                 # And a cover letter, nice. This is the easy case
                 base_msg = msg
                 seen_covers.append(latest_revision)
                 continue
-            if subj_info['counter'] == 1:
+            if lsub.counter == 1:
                 if latest_revision not in seen_covers:
                     # A patch/series without a cover letter
                     base_msg = msg
 
     # Get subject info from base_msg again
-    subj_info = get_subject_info(base_msg['Subject'])
-    if not len(subj_info['prefixes']):
+    lsub = LoreSubject(base_msg['Subject'])
+    if not len(lsub.prefixes):
         logger.debug('Not checking for new revisions: no prefixes on the cover letter.')
         mbx.close()
         return
-    base_msgid = get_clean_msgid(base_msg)
+    base_msgid = LoreMessage.get_clean_msgid(base_msg)
     fromeml = email.utils.getaddresses(base_msg.get_all('from', []))[0][1]
     msgdate = email.utils.parsedate_tz(str(base_msg['Date']))
     startdate = time.strftime('%Y%m%d', msgdate[:9])
     listarc = base_msg.get_all('List-Archive')[-1].strip('<>')
-    q = 's:"%s" AND f:"%s" AND d:%s..' % (subj_info['subject'], fromeml, startdate)
+    q = 's:"%s" AND f:"%s" AND d:%s..' % (lsub.subject, fromeml, startdate)
     queryurl = '%s?%s' % (listarc, urllib.parse.urlencode({'q': q, 'x': 'A', 'o': '-1'}))
     logger.critical('Checking for newer revisions on %s', listarc)
     logger.debug('Query URL: %s', queryurl)
     resp = requests.get(queryurl)
     # try to parse it
-    tree = ET.fromstring(resp.content)
+    tree = xml.etree.ElementTree.fromstring(resp.content)
     resp.close()
     ns = {'atom': 'http://www.w3.org/2005/Atom'}
     entries = tree.findall('atom:entry', ns)
 
     for entry in entries:
         title = entry.find('atom:title', ns).text
-        subj_info = get_subject_info(title)
-        if subj_info['reply'] or subj_info['counter'] > 1:
+        lsub = LoreSubject(title)
+        if lsub.reply or lsub.counter > 1:
             logger.debug('Ignoring result (not interesting): %s', title)
             continue
         link = entry.find('atom:link', ns).get('href')
-        if subj_info['revision'] < latest_revision:
+        if lsub.revision < latest_revision:
             logger.debug('Ignoring result (not new revision): %s', title)
             continue
         if link.find('/%s/' % base_msgid) > 0:
             logger.debug('Ignoring result (same thread as ours):%s', title)
             continue
-        if subj_info['revision'] == 1 and subj_info['revision'] == latest_revision:
+        if lsub.revision == 1 and lsub.revision == latest_revision:
             # Someone sent a separate message with an identical title but no new vX in the subject line
             # It's *probably* a new revision.
             logger.debug('Likely a new revision: %s', title)
-        elif subj_info['revision'] > latest_revision:
-            logger.debug('Definitely a new revision [v%s]: %s', subj_info['revision'], title)
+        elif lsub.revision > latest_revision:
+            logger.debug('Definitely a new revision [v%s]: %s', lsub.revision, title)
         else:
             logger.debug('No idea what this is: %s', title)
             continue
@@ -566,7 +757,7 @@ def get_newest_series(mboxfile, cmdargs):
         # Append all of these to the existing mailbox
         new_adds = 0
         for nt_msg in nt_mbx:
-            nt_msgid = get_clean_msgid(nt_msg)
+            nt_msgid = LoreMessage.get_clean_msgid(nt_msg)
             if nt_msgid in seen_msgids:
                 logger.debug('Duplicate message, skipping')
                 continue
@@ -615,7 +806,7 @@ def main(cmdargs):
     mboxfile = get_pi_thread_by_msgid(msgid, config, cmdargs)
 
     if mboxfile and cmdargs.checknewer:
-        get_newest_series(mboxfile, cmdargs)
+        get_newest_series(mboxfile)
 
     if mboxfile is None:
         return
author	Konstantin Ryabitsev <konstantin@linuxfoundation.org>	2020-02-12 11:32:50 -0500
committer	Konstantin Ryabitsev <konstantin@linuxfoundation.org>	2020-02-12 11:32:50 -0500
commit	560dcf24ba0a716565a9f8552c271b687238b59a (patch)
tree	78a899ca4c320032b71b760b07ee3a8473fd0555
parent	edec683e42126d2aa918010f38ba84e6d83edbea (diff)
download	korg-helpers-560dcf24ba0a716565a9f8552c271b687238b59a.tar.gz