From aa681494a7eac0c8738da9f45af4c69f96a5aea1 Mon Sep 17 00:00:00 2001 From: Ludi Rehak Date: Mon, 28 Feb 2022 10:56:44 -0800 Subject: [PATCH 1/2] Fix bug in sift where no matches were found for patterns starting with \b\W or ending with \W\b MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Old code: $ sift "\b\W\W\W\b" $m/mist-ap/cloud | wc -l        0 New code: $ sift "\b\W\W\W\b" $m/mist-ap/cloud | wc -l     3509 Ripgrep agrees with bug fix: $ rg "\b\W\W\W\b" $m/mist-ap/cloud | wc -l     3509 --- matching.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/matching.go b/matching.go index 11cf26e4..c486507b 100644 --- a/matching.go +++ b/matching.go @@ -256,13 +256,16 @@ func getMatches(regex *regexp.Regexp, data []byte, testBuffer []byte, offset int // analyze match and reject false matches if !options.Multiline { // remove newlines at the beginning of the match + skip := false for ; start < length && end > start && data[start] == 0x0a; start++ { + skip = true } // remove newlines at the end of the match for ; end > 0 && end > start && data[end-1] == 0x0a; end-- { + skip = true } // check if the corrected match is still valid - if !regex.Match(testBuffer[start:end]) { + if skip && !regex.Match(testBuffer[start:end]) { continue } // check if the match contains newlines From 79b5d5daf65635cc7f611cd1b8ad1889099bab65 Mon Sep 17 00:00:00 2001 From: Ludi Rehak Date: Mon, 28 Feb 2022 11:18:47 -0800 Subject: [PATCH 2/2] Faster searches for "sift -w WORD" word boundary searches when all matches for WORD start with a string literal. 1. Make a first pass over the haystack by starting the search with the string literal part of the needle: WORD\b It's a more selective filter (in most cases) and delays entering the slower regex engine. [1] 2. Do a second pass with the now much smaller haystack on the complete needle: \bWORD\b [1] https://github.com/golang/go/blob/a064a4f29a97a4fc7398d1ac9d7c53c5ba0bc646/src/regexp/backtrack.go#L341 Observed 16x speedup for the following pattern and directory: sift -q "\bWaitForConnect\b" $m/mist-ap 2.60s user 0.09s system 549% cpu 0.488 total sift -w -q "WaitForConnect" $m/mist-ap 0.08s user 0.10s system 570% cpu 0.031 total --- matching.go | 8 ++++++++ options.go | 27 ++++++++++++++++++++++++++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/matching.go b/matching.go index c486507b..8f221d17 100644 --- a/matching.go +++ b/matching.go @@ -171,6 +171,14 @@ func processReader(reader io.Reader, matchRegexes []*regexp.Regexp, data []byte, validMatch = true } } + // When -w is set and all matches for WORD start with a string literal, omit the leading word boundary \b in the first pass. + // First Pass: WORD\b + // String literals are often more quickly searched for before entering the slower regex engine. + // Enforce the leading word boundary requirement in a second pass here. + // Second Pass: \bWORD\b + if len(options.CompletePattern) > 0 && validMatch { + validMatch, _ = regexp.MatchString(options.CompletePattern, newMatches[i].line) + } if validMatch { prevMatch = &newMatches[i] i++ diff --git a/options.go b/options.go index 60f3bdbd..8c066719 100644 --- a/options.go +++ b/options.go @@ -25,6 +25,7 @@ import ( "os/user" "path/filepath" "regexp" + "regexp/syntax" "runtime" "sort" "strconv" @@ -106,6 +107,7 @@ type Options struct { ListTypes bool `long:"list-types" description:"list available file types" json:"-" default-mask:"-"` Version func() `short:"V" long:"version" description:"show version and license information" json:"-"` WordRegexp bool `short:"w" long:"word-regexp" description:"only match on ASCII word boundaries"` + CompletePattern string WriteConfig bool `long:"write-config" description:"save config for loaded configs + given command line arguments" json:"-"` Zip bool `short:"z" long:"zip" description:"search content of compressed .gz files (default: off)"` NoZip func() `short:"Z" long:"no-zip" description:"do not search content of compressed .gz files" json:"-"` @@ -498,6 +500,21 @@ func (o *Options) checkFormats() error { return nil } +// isPrefixStringLiteral determines whether all matches for the regexp must start with a string literal. +func isPrefixStringLiteral(exp string) bool { + re, err := syntax.Parse(exp, syntax.Perl) + if err != nil { + return false + } + re = re.Simplify() + prog, err := syntax.Compile(re) + if err != nil { + return false + } + prefix, _ := prog.Prefix() + return len(prefix) > 0 +} + // preparePattern adjusts a pattern to respect the ignore-case, literal and multiline options func (o *Options) preparePattern(pattern string) string { if o.Literal { @@ -507,11 +524,19 @@ func (o *Options) preparePattern(pattern string) string { pattern = strings.ToLower(pattern) } if o.WordRegexp { - pattern = `\b` + pattern + `\b` + // detect string literal to see if pattern without leading \b can use the fast path + if isPrefixStringLiteral(pattern) { + o.CompletePattern = `\b` + pattern + `\b` + pattern = pattern + `\b` + } else { + pattern = `\b` + pattern + `\b` + } } pattern = "(?m)" + pattern + o.CompletePattern = "(?m)" + o.CompletePattern if o.Multiline { pattern = "(?s)" + pattern + o.CompletePattern = "(?s)" + o.CompletePattern } return pattern }