diff options
author | Konstantin Ryabitsev <konstantin@linuxfoundation.org> | 2024-02-27 17:40:47 -0500 |
---|---|---|
committer | Konstantin Ryabitsev <konstantin@linuxfoundation.org> | 2024-02-27 17:40:47 -0500 |
commit | be3b0af6dfd18d42f91b39bd0070ae51206b2660 (patch) | |
tree | d7d498d9a013094d2cc3af531c8b546f11d076db | |
parent | 7f3284906e67f138eae82271a6c3bde1ebb30791 (diff) | |
download | b4-be3b0af6dfd18d42f91b39bd0070ae51206b2660.tar.gz |
mbox: initial work to add thread minimization
While playing with shell_gpt, I needed a way to slim down lengthy
threads so we don't ask ChatGPT to analyze things like DKIM headers or
excessive quotes. The routines aren't very good yet, but commit them
for now until we can do a better job and improve them further.
Link: https://lore.kernel.org/20240227-flawless-capybara-of-drama-e09653@lemur/
Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
-rw-r--r-- | misc/retrieve_lore_thread.py | 36 | ||||
-rw-r--r-- | src/b4/__init__.py | 5 | ||||
-rw-r--r-- | src/b4/command.py | 7 | ||||
-rw-r--r-- | src/b4/mbox.py | 69 |
4 files changed, 113 insertions, 4 deletions
diff --git a/misc/retrieve_lore_thread.py b/misc/retrieve_lore_thread.py new file mode 100644 index 0000000..4de39fb --- /dev/null +++ b/misc/retrieve_lore_thread.py @@ -0,0 +1,36 @@ +import sys + +from instructor import OpenAISchema +from pydantic import Field + +# This is needed for now while the minimization bits aren't released +sys.path.insert(0, '/home/user/work/git/korg/b4/src') +import b4 +import b4.mbox + + +class Function(OpenAISchema): + """ + Accepts a message-id, retrieves a mailing list discussion thread from lore.kernel.org, and returns a mailbox with all messages in the tread. + """ + + message_id: str = Field( + ..., + example='20240228-foo-bar-baz@localhost', + descriptions='Message-ID of the thread to retrieve from lore.kernel.org', + ) + + class Config: + title = "retrieve_lore_thread" + + @classmethod + def execute(cls, message_id: str) -> str: + b4._setup_main_config() + msgs = b4.get_pi_thread_by_msgid(message_id, with_thread=True) + if not msgs: + return f'No messages matching this message-id: {message_id}' + minmsgs = b4.mbox.minimize_thread(msgs) + out = '' + for minmsg in minmsgs: + out += minmsg.as_string(policy=b4.emlpolicy) + '\n' + return out diff --git a/src/b4/__init__.py b/src/b4/__init__.py index d26662c..34f9e4a 100644 --- a/src/b4/__init__.py +++ b/src/b4/__init__.py @@ -2847,7 +2847,7 @@ def get_config_from_git(regexp: str, defaults: Optional[dict] = None, return gitconfig -def _setup_main_config(cmdargs: argparse.Namespace) -> None: +def _setup_main_config(cmdargs: Optional[argparse.Namespace] = None) -> None: global MAIN_CONFIG defcfg = copy.deepcopy(DEFAULT_CONFIG) @@ -2878,7 +2878,8 @@ def _setup_main_config(cmdargs: argparse.Namespace) -> None: gpgcfg = get_config_from_git(r'gpg\..*', {'program': 'gpg'}) config['gpgbin'] = gpgcfg['program'] - _cmdline_config_override(cmdargs, config, 'b4') + if cmdargs: + _cmdline_config_override(cmdargs, config, 'b4') MAIN_CONFIG = config diff --git a/src/b4/command.py b/src/b4/command.py 
index be0f033..69f0593 100644 --- a/src/b4/command.py +++ b/src/b4/command.py @@ -166,8 +166,11 @@ def setup_parser() -> argparse.ArgumentParser: cmd_mbox_common_opts(sp_mbox) sp_mbox.add_argument('-f', '--filter-dupes', dest='filterdupes', action='store_true', default=False, help='When adding messages to existing maildir, filter out duplicates') - sp_mbox.add_argument('-r', '--refetch', dest='refetch', metavar='MBOX', default=False, - help='Refetch all messages in specified mbox with their original headers') + sm_g = sp_mbox.add_mutually_exclusive_group() + sm_g.add_argument('-r', '--refetch', dest='refetch', metavar='MBOX', default=False, + help='Refetch all messages in specified mbox with their original headers') + sm_g.add_argument('--minimize', dest='minimize', action='store_true', default=False, + help='Attempt to generate a minimal thread to simplify review.') sp_mbox.set_defaults(func=cmd_mbox) # b4 am diff --git a/src/b4/mbox.py b/src/b4/mbox.py index 9bcee97..a7d4376 100644 --- a/src/b4/mbox.py +++ b/src/b4/mbox.py @@ -697,6 +697,72 @@ def refetch(dest: str) -> None: mbox.close() +def minimize_thread(msgs: List[email.message.EmailMessage]) -> List[email.message.EmailMessage]: + # We go through each message and minimize headers and body content + wanthdrs = { + 'From', + 'Subject', + 'Date', + 'Message-ID', + 'Reply-To', + 'In-Reply-To', + } + mmsgs = list() + for msg in msgs: + mmsg = email.message.EmailMessage() + for wanthdr in wanthdrs: + cleanhdr = b4.LoreMessage.clean_header(msg[wanthdr]) + if cleanhdr: + mmsg[wanthdr] = cleanhdr + + body, charset = b4.LoreMessage.get_payload(msg) + if not (b4.DIFF_RE.search(body) or b4.DIFFSTAT_RE.search(body)): + htrs, cmsg, mtrs, basement, sig = b4.LoreMessage.get_body_parts(body) + # split the message into quoted and unquoted chunks + chunks = list() + chunk = list() + current = None + for line in (cmsg.rstrip().splitlines()): + quoted = line.startswith('>') and True or False + if current is None: + current = 
quoted + if current == quoted: + if quoted and re.search(r'^>\s*>', line): + # trim multiple levels of quoting + continue + if quoted and not chunk and line.strip() == '>': + # Trim empty lines with just > in them + continue + chunk.append(line) + continue + + if current: + while len(chunk) and chunk[-1].strip() == '>': + chunk.pop(-1) + if chunk: + chunks.append((quoted, chunk)) + chunk = list() + chunk.append(line) + current = quoted + + # Don't append bottom quotes + if chunk and not current: + chunks.append((current, chunk)) + + body = '' + for quoted, chunk in chunks: + # Should we offer a way to trim the quote in some fashion? + body += '\n'.join(chunk).strip() + '\n\n' + if not body.strip(): + continue + + mmsg.set_payload(body, charset='utf-8') + # mmsg.set_charset('utf-8') + mmsgs.append(mmsg) + + return mmsgs + + def main(cmdargs: argparse.Namespace) -> None: # We force some settings if cmdargs.subcmd == 'shazam': @@ -738,6 +804,9 @@ def main(cmdargs: argparse.Namespace) -> None: return logger.info('%s messages in the thread', len(msgs)) + if cmdargs.subcmd == 'mbox' and cmdargs.minimize: + msgs = minimize_thread(msgs) + if cmdargs.outdir == '-': logger.info('---') b4.save_mboxrd_mbox(msgs, sys.stdout.buffer, mangle_from=False) |