aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKonstantin Ryabitsev <konstantin@linuxfoundation.org>2024-02-27 17:40:47 -0500
committerKonstantin Ryabitsev <konstantin@linuxfoundation.org>2024-02-27 17:40:47 -0500
commitbe3b0af6dfd18d42f91b39bd0070ae51206b2660 (patch)
treed7d498d9a013094d2cc3af531c8b546f11d076db
parent7f3284906e67f138eae82271a6c3bde1ebb30791 (diff)
downloadb4-be3b0af6dfd18d42f91b39bd0070ae51206b2660.tar.gz
mbox: initial work to add thread minimization
While playing with shell_gpt, I needed a way to slim down lengthy threads so we don't ask ChatGPT to analyze things like DKIM headers or excessive quotes. The routines aren't very good yet, but commit them for now until we can do a better job and improve them further. Link: https://lore.kernel.org/20240227-flawless-capybara-of-drama-e09653@lemur/ Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
-rw-r--r--misc/retrieve_lore_thread.py36
-rw-r--r--src/b4/__init__.py5
-rw-r--r--src/b4/command.py7
-rw-r--r--src/b4/mbox.py69
4 files changed, 113 insertions, 4 deletions
diff --git a/misc/retrieve_lore_thread.py b/misc/retrieve_lore_thread.py
new file mode 100644
index 0000000..4de39fb
--- /dev/null
+++ b/misc/retrieve_lore_thread.py
@@ -0,0 +1,36 @@
+import sys
+
+from instructor import OpenAISchema
+from pydantic import Field
+
+# This is needed for now while the minimization bits aren't released
+sys.path.insert(0, '/home/user/work/git/korg/b4/src')
+import b4
+import b4.mbox
+
+
+class Function(OpenAISchema):
+ """
+ Accepts a message-id, retrieves a mailing list discussion thread from lore.kernel.org, and returns a mailbox with all messages in the tread.
+ """
+
+ message_id: str = Field(
+ ...,
+ example='20240228-foo-bar-baz@localhost',
+ descriptions='Message-ID of the thread to retrieve from lore.kernel.org',
+ )
+
+ class Config:
+ title = "retrieve_lore_thread"
+
+ @classmethod
+ def execute(cls, message_id: str) -> str:
+ b4._setup_main_config()
+ msgs = b4.get_pi_thread_by_msgid(message_id, with_thread=True)
+ if not msgs:
+ return f'No messages matching this message-id: {message_id}'
+ minmsgs = b4.mbox.minimize_thread(msgs)
+ out = ''
+ for minmsg in minmsgs:
+ out += minmsg.as_string(policy=b4.emlpolicy) + '\n'
+ return out
diff --git a/src/b4/__init__.py b/src/b4/__init__.py
index d26662c..34f9e4a 100644
--- a/src/b4/__init__.py
+++ b/src/b4/__init__.py
@@ -2847,7 +2847,7 @@ def get_config_from_git(regexp: str, defaults: Optional[dict] = None,
return gitconfig
-def _setup_main_config(cmdargs: argparse.Namespace) -> None:
+def _setup_main_config(cmdargs: Optional[argparse.Namespace] = None) -> None:
global MAIN_CONFIG
defcfg = copy.deepcopy(DEFAULT_CONFIG)
@@ -2878,7 +2878,8 @@ def _setup_main_config(cmdargs: argparse.Namespace) -> None:
gpgcfg = get_config_from_git(r'gpg\..*', {'program': 'gpg'})
config['gpgbin'] = gpgcfg['program']
- _cmdline_config_override(cmdargs, config, 'b4')
+ if cmdargs:
+ _cmdline_config_override(cmdargs, config, 'b4')
MAIN_CONFIG = config
diff --git a/src/b4/command.py b/src/b4/command.py
index be0f033..69f0593 100644
--- a/src/b4/command.py
+++ b/src/b4/command.py
@@ -166,8 +166,11 @@ def setup_parser() -> argparse.ArgumentParser:
cmd_mbox_common_opts(sp_mbox)
sp_mbox.add_argument('-f', '--filter-dupes', dest='filterdupes', action='store_true', default=False,
help='When adding messages to existing maildir, filter out duplicates')
- sp_mbox.add_argument('-r', '--refetch', dest='refetch', metavar='MBOX', default=False,
- help='Refetch all messages in specified mbox with their original headers')
+ sm_g = sp_mbox.add_mutually_exclusive_group()
+ sm_g.add_argument('-r', '--refetch', dest='refetch', metavar='MBOX', default=False,
+ help='Refetch all messages in specified mbox with their original headers')
+ sm_g.add_argument('--minimize', dest='minimize', action='store_true', default=False,
+ help='Attempt to generate a minimal thread to simplify review.')
sp_mbox.set_defaults(func=cmd_mbox)
# b4 am
diff --git a/src/b4/mbox.py b/src/b4/mbox.py
index 9bcee97..a7d4376 100644
--- a/src/b4/mbox.py
+++ b/src/b4/mbox.py
@@ -697,6 +697,72 @@ def refetch(dest: str) -> None:
mbox.close()
+def minimize_thread(msgs: List[email.message.EmailMessage]) -> List[email.message.EmailMessage]:
+ # We go through each message and minimize headers and body content
+ wanthdrs = {
+ 'From',
+ 'Subject',
+ 'Date',
+ 'Message-ID',
+ 'Reply-To',
+ 'In-Reply-To',
+ }
+ mmsgs = list()
+ for msg in msgs:
+ mmsg = email.message.EmailMessage()
+ for wanthdr in wanthdrs:
+ cleanhdr = b4.LoreMessage.clean_header(msg[wanthdr])
+ if cleanhdr:
+ mmsg[wanthdr] = cleanhdr
+
+ body, charset = b4.LoreMessage.get_payload(msg)
+ if not (b4.DIFF_RE.search(body) or b4.DIFFSTAT_RE.search(body)):
+ htrs, cmsg, mtrs, basement, sig = b4.LoreMessage.get_body_parts(body)
+ # split the message into quoted and unquoted chunks
+ chunks = list()
+ chunk = list()
+ current = None
+ for line in (cmsg.rstrip().splitlines()):
+ quoted = line.startswith('>') and True or False
+ if current is None:
+ current = quoted
+ if current == quoted:
+ if quoted and re.search(r'^>\s*>', line):
+ # trim multiple levels of quoting
+ continue
+ if quoted and not chunk and line.strip() == '>':
+ # Trim empty lines with just > in them
+ continue
+ chunk.append(line)
+ continue
+
+ if current:
+ while len(chunk) and chunk[-1].strip() == '>':
+ chunk.pop(-1)
+ if chunk:
+ chunks.append((quoted, chunk))
+ chunk = list()
+ chunk.append(line)
+ current = quoted
+
+ # Don't append bottom quotes
+ if chunk and not current:
+ chunks.append((current, chunk))
+
+ body = ''
+ for quoted, chunk in chunks:
+ # Should we offer a way to trim the quote in some fashion?
+ body += '\n'.join(chunk).strip() + '\n\n'
+ if not body.strip():
+ continue
+
+ mmsg.set_payload(body, charset='utf-8')
+ # mmsg.set_charset('utf-8')
+ mmsgs.append(mmsg)
+
+ return mmsgs
+
+
def main(cmdargs: argparse.Namespace) -> None:
# We force some settings
if cmdargs.subcmd == 'shazam':
@@ -738,6 +804,9 @@ def main(cmdargs: argparse.Namespace) -> None:
return
logger.info('%s messages in the thread', len(msgs))
+ if cmdargs.subcmd == 'mbox' and cmdargs.minimize:
+ msgs = minimize_thread(msgs)
+
if cmdargs.outdir == '-':
logger.info('---')
b4.save_mboxrd_mbox(msgs, sys.stdout.buffer, mangle_from=False)