From 60452a30f5971c54ecdcd1bd389c1cabbbc844f5 Mon Sep 17 00:00:00 2001 From: Nguyễn Thái Ngọc Duy Date: Sat, 25 Jun 2016 07:22:27 +0200 Subject: grep: break down an "if" stmt in preparation for next changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- grep.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'grep.c') diff --git a/grep.c b/grep.c index 7b2b96a437..f430d7e756 100644 --- a/grep.c +++ b/grep.c @@ -403,7 +403,9 @@ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) p->word_regexp = opt->word_regexp; p->ignore_case = opt->ignore_case; - if (opt->fixed || is_fixed(p->pattern, p->patternlen)) + if (opt->fixed) + p->fixed = 1; + else if (is_fixed(p->pattern, p->patternlen)) p->fixed = 1; else p->fixed = 0; -- cgit 1.2.3-korg From 5c1ebcca4d1df3b2a84d01b3f32ec13bf8f760f8 Mon Sep 17 00:00:00 2001 From: Nguyễn Thái Ngọc Duy Date: Sat, 25 Jun 2016 07:22:30 +0200 Subject: grep/icase: avoid kwsset on literal non-ascii strings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we detect the pattern is just a literal string, we avoid heavy regex engine and use fast substring search implemented in kwsset.c. But kws uses git-ctype which is locale-independent so it does not know how to fold case properly outside ascii range. Let regcomp or pcre take care of this case instead. Slower, but accurate. Noticed-by: Plamen Totev Helped-by: René Scharfe Helped-by: Eric Sunshine Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- grep.c | 7 ++++++- t/t7812-grep-icase-non-ascii.sh | 23 +++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) create mode 100755 t/t7812-grep-icase-non-ascii.sh (limited to 'grep.c') diff --git a/grep.c b/grep.c index f430d7e756..451275d298 100644 --- a/grep.c +++ b/grep.c @@ -4,6 +4,7 @@ #include "xdiff-interface.h" #include "diff.h" #include "diffcore.h" +#include "commit.h" static int grep_source_load(struct grep_source *gs); static int grep_source_is_binary(struct grep_source *gs); @@ -398,14 +399,18 @@ static int is_fixed(const char *s, size_t len) static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) { + int icase, ascii_only; int err; p->word_regexp = opt->word_regexp; p->ignore_case = opt->ignore_case; + icase = opt->regflags & REG_ICASE || p->ignore_case; + ascii_only = !has_non_ascii(p->pattern); if (opt->fixed) p->fixed = 1; - else if (is_fixed(p->pattern, p->patternlen)) + else if ((!icase || ascii_only) && + is_fixed(p->pattern, p->patternlen)) p->fixed = 1; else p->fixed = 0; diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh new file mode 100755 index 0000000000..b78a774dab --- /dev/null +++ b/t/t7812-grep-icase-non-ascii.sh @@ -0,0 +1,23 @@ +#!/bin/sh + +test_description='grep icase on non-English locales' + +. ./lib-gettext.sh + +test_expect_success GETTEXT_LOCALE 'setup' ' + test_write_lines "TILRAUN: Halló Heimur!" >file && + git add file && + LC_ALL="$is_IS_locale" && + export LC_ALL +' + +test_have_prereq GETTEXT_LOCALE && +test-regex "HALLÓ" "Halló" ICASE && +test_set_prereq REGEX_LOCALE + +test_expect_success REGEX_LOCALE 'grep literal string, no -F' ' + git grep -i "TILRAUN: Halló Heimur!" && + git grep -i "TILRAUN: HALLÓ HEIMUR!" +' + +test_done -- cgit 1.2.3-korg From 793dc676e08394ed15bffe0ed66606ff9ced1c6a Mon Sep 17 00:00:00 2001 From: Nguyễn Thái Ngọc Duy Date: Sat, 25 Jun 2016 07:22:31 +0200 Subject: grep/icase: avoid kwsset when -F is specified MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Similar to the previous commit, we can't use kws on icase search outside ascii range. But we can't simply pass the pattern to regcomp/pcre like the previous commit because it may contain regex special characters, so we need to quote the regex first. To avoid misquote traps that could lead to undefined behavior, we always stick to basic regex engine in this case. We don't need fancy features for grepping a literal string anyway. basic_regex_quote_buf() assumes that if the pattern is in a multibyte encoding, ascii chars must be unambiguously encoded as single bytes. This is true at least for UTF-8. For others, let's wait until people yell up. Chances are nobody uses multibyte, non utf-8 charsets anymore. Noticed-by: Plamen Totev Helped-by: René Scharfe Helped-by: Eric Sunshine Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- grep.c | 45 ++++++++++++++++++++++++++++++++++++++++- quote.c | 37 +++++++++++++++++++++++++++++++++ quote.h | 1 + t/t7812-grep-icase-non-ascii.sh | 26 ++++++++++++++++++++++++ 4 files changed, 108 insertions(+), 1 deletion(-) (limited to 'grep.c') diff --git a/grep.c b/grep.c index 451275d298..627ae3e3e8 100644 --- a/grep.c +++ b/grep.c @@ -5,6 +5,7 @@ #include "diff.h" #include "diffcore.h" #include "commit.h" +#include "quote.h" static int grep_source_load(struct grep_source *gs); static int grep_source_is_binary(struct grep_source *gs); @@ -397,6 +398,28 @@ static int is_fixed(const char *s, size_t len) return 1; } +static void compile_fixed_regexp(struct grep_pat *p, struct grep_opt *opt) +{ + struct strbuf sb = STRBUF_INIT; + int err; + int regflags; + + basic_regex_quote_buf(&sb, p->pattern); + regflags = opt->regflags & ~REG_EXTENDED; + if (opt->ignore_case) + regflags |= REG_ICASE; + err = regcomp(&p->regexp, sb.buf, regflags); + if (opt->debug) + fprintf(stderr, "fixed %s\n", sb.buf); + strbuf_release(&sb); + if (err) { + char errbuf[1024]; + regerror(err, &p->regexp, errbuf, sizeof(errbuf)); + regfree(&p->regexp); + compile_regexp_failed(p, errbuf); + } +} + static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) { int icase, ascii_only; @@ -407,8 +430,20 @@ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) icase = opt->regflags & REG_ICASE || p->ignore_case; ascii_only = !has_non_ascii(p->pattern); + /* + * Even when -F (fixed) asks us to do a non-regexp search, we + * may not be able to correctly case-fold when -i + * (ignore-case) is asked (in which case, we'll synthesize a + * regexp to match the pattern that matches regexp special + * characters literally, while ignoring case differences). On + * the other hand, even without -F, if the pattern does not + * have any regexp special characters and there is no need for + * case-folding search, we can internally turn it into a + * simple string match using kws. p->fixed tells us if we + * want to use kws. + */ if (opt->fixed) - p->fixed = 1; + p->fixed = !icase || ascii_only; else if ((!icase || ascii_only) && is_fixed(p->pattern, p->patternlen)) p->fixed = 1; @@ -423,6 +458,14 @@ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) kwsincr(p->kws, p->pattern, p->patternlen); kwsprep(p->kws); return; + } else if (opt->fixed) { + /* + * We come here when the pattern has the non-ascii + * characters we cannot case-fold, and asked to + * ignore-case. + */ + compile_fixed_regexp(p, opt); + return; } if (opt->pcre) { diff --git a/quote.c b/quote.c index fe884d2452..c67adb718c 100644 --- a/quote.c +++ b/quote.c @@ -440,3 +440,40 @@ void tcl_quote_buf(struct strbuf *sb, const char *src) } strbuf_addch(sb, '"'); } + +void basic_regex_quote_buf(struct strbuf *sb, const char *src) +{ + char c; + + if (*src == '^') { + /* only beginning '^' is special and needs quoting */ + strbuf_addch(sb, '\\'); + strbuf_addch(sb, *src++); + } + if (*src == '*') + /* beginning '*' is not special, no quoting */ + strbuf_addch(sb, *src++); + + while ((c = *src++)) { + switch (c) { + case '[': + case '.': + case '\\': + case '*': + strbuf_addch(sb, '\\'); + strbuf_addch(sb, c); + break; + + case '$': + /* only the end '$' is special and needs quoting */ + if (*src == '\0') + strbuf_addch(sb, '\\'); + strbuf_addch(sb, c); + break; + + default: + strbuf_addch(sb, c); + break; + } + } +} diff --git a/quote.h b/quote.h index 99e04d34bf..362d315bec 100644 --- a/quote.h +++ b/quote.h @@ -67,5 +67,6 @@ extern char *quote_path_relative(const char *in, const char *prefix, extern void perl_quote_buf(struct strbuf *sb, const char *src); extern void python_quote_buf(struct strbuf *sb, const char *src); extern void tcl_quote_buf(struct strbuf *sb, const char *src); +extern void basic_regex_quote_buf(struct strbuf *sb, const char *src); #endif diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh index b78a774dab..1929809d4a 100755 --- a/t/t7812-grep-icase-non-ascii.sh +++ b/t/t7812-grep-icase-non-ascii.sh @@ -20,4 +20,30 @@ test_expect_success REGEX_LOCALE 'grep literal string, no -F' ' git grep -i "TILRAUN: HALLÓ HEIMUR!" ' +test_expect_success REGEX_LOCALE 'grep literal string, with -F' ' + git grep --debug -i -F "TILRAUN: Halló Heimur!" 2>&1 >/dev/null | + grep fixed >debug1 && + test_write_lines "fixed TILRAUN: Halló Heimur!" >expect1 && + test_cmp expect1 debug1 && + + git grep --debug -i -F "TILRAUN: HALLÓ HEIMUR!" 2>&1 >/dev/null | + grep fixed >debug2 && + test_write_lines "fixed TILRAUN: HALLÓ HEIMUR!" >expect2 && + test_cmp expect2 debug2 +' + +test_expect_success REGEX_LOCALE 'grep string with regex, with -F' ' + test_write_lines "^*TILR^AUN:.* \\Halló \$He[]imur!\$" >file && + + git grep --debug -i -F "^*TILR^AUN:.* \\Halló \$He[]imur!\$" 2>&1 >/dev/null | + grep fixed >debug1 && + test_write_lines "fixed \\^*TILR^AUN:\\.\\* \\\\Halló \$He\\[]imur!\\\$" >expect1 && + test_cmp expect1 debug1 && + + git grep --debug -i -F "^*TILR^AUN:.* \\HALLÓ \$HE[]IMUR!\$" 2>&1 >/dev/null | + grep fixed >debug2 && + test_write_lines "fixed \\^*TILR^AUN:\\.\\* \\\\HALLÓ \$HE\\[]IMUR!\\\$" >expect2 && + test_cmp expect2 debug2 +' + test_done -- cgit 1.2.3-korg From e944d9d932b3653debcfdd16eec2721eed0670e5 Mon Sep 17 00:00:00 2001 From: Nguyễn Thái Ngọc Duy Date: Sat, 25 Jun 2016 07:22:32 +0200 Subject: grep: rewrite an if/else condition to avoid duplicate expression MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "!icase || ascii_only" is repeated twice in this if/else chain as this series evolves. Rewrite it (and basically revert the first if condition back to before the "grep: break down an "if" stmt..." commit). Helped-by: Junio C Hamano Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- grep.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'grep.c') diff --git a/grep.c b/grep.c index 627ae3e3e8..6325cafe73 100644 --- a/grep.c +++ b/grep.c @@ -442,11 +442,8 @@ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) * simple string match using kws. p->fixed tells us if we * want to use kws. */ - if (opt->fixed) + if (opt->fixed || is_fixed(p->pattern, p->patternlen)) p->fixed = !icase || ascii_only; - else if ((!icase || ascii_only) && - is_fixed(p->pattern, p->patternlen)) - p->fixed = 1; else p->fixed = 0; -- cgit 1.2.3-korg From 9d9babb84d45234132f3cb1f4527ce1106e3d2ec Mon Sep 17 00:00:00 2001 From: Nguyễn Thái Ngọc Duy Date: Sat, 25 Jun 2016 07:22:33 +0200 Subject: grep/pcre: prepare locale-dependent tables for icase matching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The default tables are usually built with C locale and only suitable for LANG=C or similar. This should make case insensitive search work correctly for all single-byte charsets. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- grep.c | 8 ++++++-- grep.h | 1 + t/t7813-grep-icase-iso.sh | 19 +++++++++++++++++++ 3 files changed, 26 insertions(+), 2 deletions(-) create mode 100755 t/t7813-grep-icase-iso.sh (limited to 'grep.c') diff --git a/grep.c b/grep.c index 6325cafe73..af920c4542 100644 --- a/grep.c +++ b/grep.c @@ -324,11 +324,14 @@ static void compile_pcre_regexp(struct grep_pat *p, const struct grep_opt *opt) int erroffset; int options = PCRE_MULTILINE; - if (opt->ignore_case) + if (opt->ignore_case) { + if (has_non_ascii(p->pattern)) + p->pcre_tables = pcre_maketables(); options |= PCRE_CASELESS; + } p->pcre_regexp = pcre_compile(p->pattern, options, &error, &erroffset, - NULL); + p->pcre_tables); if (!p->pcre_regexp) compile_regexp_failed(p, error); @@ -362,6 +365,7 @@ static void free_pcre_regexp(struct grep_pat *p) { pcre_free(p->pcre_regexp); pcre_free(p->pcre_extra_info); + pcre_free((void *)p->pcre_tables); } #else /* !USE_LIBPCRE */ static void compile_pcre_regexp(struct grep_pat *p, const struct grep_opt *opt) diff --git a/grep.h b/grep.h index 95f197a8d9..cee4357b17 100644 --- a/grep.h +++ b/grep.h @@ -48,6 +48,7 @@ struct grep_pat { regex_t regexp; pcre *pcre_regexp; pcre_extra *pcre_extra_info; + const unsigned char *pcre_tables; kwset_t kws; unsigned fixed:1; unsigned ignore_case:1; diff --git a/t/t7813-grep-icase-iso.sh b/t/t7813-grep-icase-iso.sh new file mode 100755 index 0000000000..efef7fb81f --- /dev/null +++ b/t/t7813-grep-icase-iso.sh @@ -0,0 +1,19 @@ +#!/bin/sh + +test_description='grep icase on non-English locales' + +. ./lib-gettext.sh + +test_expect_success GETTEXT_ISO_LOCALE 'setup' ' + printf "TILRAUN: Hall Heimur!" >file && + git add file && + LC_ALL="$is_IS_iso_locale" && + export LC_ALL +' + +test_expect_success GETTEXT_ISO_LOCALE,LIBPCRE 'grep pcre string' ' + git grep --perl-regexp -i "TILRAUN: H.ll Heimur!" && + git grep --perl-regexp -i "TILRAUN: H.LL HEIMUR!" +' + +test_done -- cgit 1.2.3-korg From 18547aacf5ced88bf83d85b03b2b7b3fc6aa3f6c Mon Sep 17 00:00:00 2001 From: Nguyễn Thái Ngọc Duy Date: Sat, 25 Jun 2016 07:22:35 +0200 Subject: grep/pcre: support utf-8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the previous change in this function, we add locale support for single-byte encodings only. It looks like pcre only supports utf-* as multibyte encodings, the others are left in the cold (which is fine). We need to enable PCRE_UTF8 so pcre can find character boundary correctly. It's needed for case folding (when --ignore-case is used) or '*', '+' or similar syntax is used. The "has_non_ascii()" check is to be on the conservative side. If there's non-ascii in the pattern, the searched content could still be in utf-8, but we can treat it just like a byte stream and everything should work. If we force utf-8 based on locale only and pcre validates utf-8 and the file content is in non-utf8 encoding, things break. Noticed-by: Plamen Totev Helped-by: Plamen Totev Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- grep.c | 2 ++ t/t7812-grep-icase-non-ascii.sh | 15 +++++++++++++++ 2 files changed, 17 insertions(+) (limited to 'grep.c') diff --git a/grep.c b/grep.c index af920c4542..0e6511fe23 100644 --- a/grep.c +++ b/grep.c @@ -329,6 +329,8 @@ static void compile_pcre_regexp(struct grep_pat *p, const struct grep_opt *opt) p->pcre_tables = pcre_maketables(); options |= PCRE_CASELESS; } + if (is_utf8_locale() && has_non_ascii(p->pattern)) + options |= PCRE_UTF8; p->pcre_regexp = pcre_compile(p->pattern, options, &error, &erroffset, p->pcre_tables); diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh index 1929809d4a..08ae4c945d 100755 --- a/t/t7812-grep-icase-non-ascii.sh +++ b/t/t7812-grep-icase-non-ascii.sh @@ -20,6 +20,21 @@ test_expect_success REGEX_LOCALE 'grep literal string, no -F' ' git grep -i "TILRAUN: HALLÓ HEIMUR!" ' +test_expect_success GETTEXT_LOCALE,LIBPCRE 'grep pcre utf-8 icase' ' + git grep --perl-regexp "TILRAUN: H.lló Heimur!" && + git grep --perl-regexp -i "TILRAUN: H.lló Heimur!" && + git grep --perl-regexp -i "TILRAUN: H.LLÓ HEIMUR!" +' + +test_expect_success GETTEXT_LOCALE,LIBPCRE 'grep pcre utf-8 string with "+"' ' + test_write_lines "TILRAUN: Hallóó Heimur!" >file2 && + git add file2 && + git grep -l --perl-regexp "TILRAUN: H.lló+ Heimur!" >actual && + echo file >expected && + echo file2 >>expected && + test_cmp expected actual +' + test_expect_success REGEX_LOCALE 'grep literal string, with -F' ' git grep --debug -i -F "TILRAUN: Halló Heimur!" 2>&1 >/dev/null | grep fixed >debug1 && -- cgit 1.2.3-korg From 695f95ba5dc4ab44f327574f85c3ebe7ebf449b1 Mon Sep 17 00:00:00 2001 From: Nguyễn Thái Ngọc Duy Date: Sat, 25 Jun 2016 07:22:38 +0200 Subject: grep.c: reuse "icase" variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- grep.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'grep.c') diff --git a/grep.c b/grep.c index 0e6511fe23..906406a182 100644 --- a/grep.c +++ b/grep.c @@ -454,10 +454,7 @@ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) p->fixed = 0; if (p->fixed) { - if (opt->regflags & REG_ICASE || p->ignore_case) - p->kws = kwsalloc(tolower_trans_tbl); - else - p->kws = kwsalloc(NULL); + p->kws = kwsalloc(icase ? tolower_trans_tbl : NULL); kwsincr(p->kws, p->pattern, p->patternlen); kwsprep(p->kws); return; -- cgit 1.2.3-korg