Merge branch 'jk/parse-commit-with-malformed-ident'

The commit object parser has been taught to be a bit more lenient when
parsing timestamps on the author/committer line with a malformed
author/committer ident.

* jk/parse-commit-with-malformed-ident:
  parse_commit(): describe more date-parsing failure modes
  parse_commit(): handle broken whitespace-only timestamp
  parse_commit(): parse timestamp from end of line
  t4212: avoid putting git on left-hand side of pipe
author: Junio C Hamano <gitster@pobox.com> 2023-05-09 16:45:44 -0700
committer: Junio C Hamano <gitster@pobox.com> 2023-05-09 16:45:45 -0700
commit: 620e92b8454d2569b9ad9a2070fd2edea99895cc (patch)
tree: 921324185df2fd40f2678b32172b54fd29040c45 /commit.c
parent: 69c786637d7a7fe3b2b8f7d989af095f5f49c3a8 (diff)
parent: 90ef0f14eb1410747885806d8e55725053572654 (diff)
download: git-620e92b8454d2569b9ad9a2070fd2edea99895cc.tar.gz
1 files changed, 49 insertions, 8 deletions
diff --git a/commit.c b/commit.c
index 878b4473e4..e2e4fd2db9 100644
--- a/commit.c
+++ b/commit.c
@@ -96,6 +96,7 @@ struct commit *lookup_commit_reference_by_name(const char *name)
 static timestamp_t parse_commit_date(const char *buf, const char *tail)
 {
 	const char *dateptr;
+	const char *eol;
 
 	if (buf + 6 >= tail)
 		return 0;
@@ -107,16 +108,56 @@ static timestamp_t parse_commit_date(const char *buf, const char *tail)
 		return 0;
 	if (memcmp(buf, "committer", 9))
 		return 0;
-	while (buf < tail && *buf++ != '>')
-		/* nada */;
-	if (buf >= tail)
+
+	/*
+	 * Jump to end-of-line so that we can walk backwards to find the
+	 * end-of-email ">". This is more forgiving of malformed cases
+	 * because unexpected characters tend to be in the name and email
+	 * fields.
+	 */
+	eol = memchr(buf, '\n', tail - buf);
+	if (!eol)
 		return 0;
-	dateptr = buf;
-	while (buf < tail && *buf++ != '\n')
-		/* nada */;
-	if (buf >= tail)
+	dateptr = eol;
+	while (dateptr > buf && dateptr[-1] != '>')
+		dateptr--;
+	if (dateptr == buf)
 		return 0;
-	/* dateptr < buf && buf[-1] == '\n', so parsing will stop at buf-1 */
+
+	/*
+	 * Trim leading whitespace, but make sure we have at least one
+	 * non-whitespace character, as parse_timestamp() will otherwise walk
+	 * right past the newline we found in "eol" when skipping whitespace
+	 * itself.
+	 *
+	 * In theory it would be sufficient to allow any character not matched
+	 * by isspace(), but there's a catch: our isspace() does not
+	 * necessarily match the behavior of parse_timestamp(), as the latter
+	 * is implemented by system routines which match more exotic control
+	 * codes, or even locale-dependent sequences.
+	 *
+	 * Since we expect the timestamp to be a number, we can check for that.
+	 * Anything else (e.g., a non-numeric token like "foo") would just
+	 * cause parse_timestamp() to return 0 anyway.
+	 */
+	while (dateptr < eol && isspace(*dateptr))
+		dateptr++;
+	if (!isdigit(*dateptr) && *dateptr != '-')
+		return 0;
+
+	/*
+	 * We know there is at least one digit (or dash), so we'll begin
+	 * parsing there and stop at worst case at eol.
+	 *
+	 * Note that we may feed parse_timestamp() extra characters here if the
+	 * commit is malformed, and it will parse as far as it can. For
+	 * example, "123foo456" would return "123". That might be questionable
+	 * (versus returning "0"), but it would help in a hypothetical case
+	 * like "123456+0100", where the whitespace from the timezone is
+	 * missing. Since such syntactic errors may be baked into history and
+	 * hard to correct now, let's err on trying to make our best guess
+	 * here, rather than insist on perfect syntax.
+	 */
 	return parse_timestamp(dateptr, NULL, 10);
 }
author	Junio C Hamano <gitster@pobox.com>	2023-05-09 16:45:44 -0700
committer	Junio C Hamano <gitster@pobox.com>	2023-05-09 16:45:45 -0700
commit	620e92b8454d2569b9ad9a2070fd2edea99895cc (patch)
tree	921324185df2fd40f2678b32172b54fd29040c45 /commit.c
parent	69c786637d7a7fe3b2b8f7d989af095f5f49c3a8 (diff)
parent	90ef0f14eb1410747885806d8e55725053572654 (diff)
download	git-620e92b8454d2569b9ad9a2070fd2edea99895cc.tar.gz