about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Konstantin Ryabitsev <konstantin@linuxfoundation.org> 2020-02-11 12:38:33 -0500
committer: Konstantin Ryabitsev <konstantin@linuxfoundation.org> 2020-02-11 12:38:33 -0500
commit: bb81a9edd5cb32cd900cacc51a7e76acfdd1b5b5 (patch)
tree: d6868a7eb4a8954017e24c0fa0b5f2345188e4bd
parent: ea55cc54be17c478e7307f8d4eb640988d7822f9 (diff)
download: korg-helpers-bb81a9edd5cb32cd900cacc51a7e76acfdd1b5b5.tar.gz
Properly decode non-utf8 message content
Actually pay attention to what the charset says in the message headers for the few holdouts who are still sending things as iso-8859-1.

Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
-rwxr-xr-x  get-lore-mbox.py  9
1 files changed, 8 insertions, 1 deletions
diff --git a/get-lore-mbox.py b/get-lore-mbox.py
index 63f8838..a548341 100755
--- a/get-lore-mbox.py
+++ b/get-lore-mbox.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0-or-later
# -*- coding: utf-8 -*-
#
__author__ = 'Konstantin Ryabitsev <konstantin@linuxfoundation.org>'
@@ -174,6 +175,9 @@ def get_pi_thread_by_msgid(msgid, config, cmdargs):
def get_plain_part(msg, ensurediff=False):
# walk until we find the first text/plain part
+ mcharset = msg.get_content_charset()
+ if not mcharset:
+ mcharset = 'utf-8'
body = None
for part in msg.walk():
if part.get_content_type().find('text/plain') < 0:
@@ -181,7 +185,10 @@ def get_plain_part(msg, ensurediff=False):
body = part.get_payload(decode=True)
if body is None:
continue
- body = body.decode('utf-8', errors='replace')
+ pcharset = part.get_content_charset()
+ if not pcharset:
+ pcharset = mcharset
+ body = body.decode(pcharset, errors='replace')
if ensurediff and not body_contains_diff(body):
continue
break