From 766427de0c04c64085c5ed907e3fdcc6124fa2dd Mon Sep 17 00:00:00 2001
From: Junegunn Choi
Date: Mon, 10 Aug 2015 18:34:20 +0900
Subject: [PATCH] Fix --with-nth performance; avoid regex if possible

Close #317
---
 CHANGELOG.md          |  9 ++++++++
 src/options.go        | 24 +++++++++++++++------
 src/options_test.go   | 50 ++++++++++++++++++++++++++++++++++++++++---
 src/pattern.go        |  4 ++--
 src/pattern_test.go   | 20 ++++++++---------
 src/tokenizer.go      | 41 +++++++++++++++++++++++++----------
 src/tokenizer_test.go |  4 ++--
 7 files changed, 117 insertions(+), 35 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0bce59b..f7ec383 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,15 @@
 CHANGELOG
 =========
 
+0.10.3
+------
+
+- Fixed slow performance of `--with-nth` when used with `--delimiter`
+    - Go's regular expression engine is currently very slow, so the fixed
+      version treats the given delimiter pattern as a plain string instead
+      of a regular expression, unless it contains special characters and is
+      a valid regular expression.
+
 0.10.2
 ------
 
diff --git a/src/options.go b/src/options.go
index 6a0d197..35fa8ec 100644
--- a/src/options.go
+++ b/src/options.go
@@ -104,7 +104,7 @@ type Options struct {
 	Case       Case
 	Nth        []Range
 	WithNth    []Range
-	Delimiter  *regexp.Regexp
+	Delimiter  Delimiter
 	Sort       int
 	Tac        bool
 	Tiebreak   tiebreak
@@ -149,7 +149,7 @@ func defaultOptions() *Options {
 		Case:       CaseSmart,
 		Nth:        make([]Range, 0),
 		WithNth:    make([]Range, 0),
-		Delimiter:  nil,
+		Delimiter:  Delimiter{},
 		Sort:       1000,
 		Tac:        false,
 		Tiebreak:   byLength,
@@ -268,17 +268,27 @@ func splitNth(str string) []Range {
 	return ranges
 }
 
-func delimiterRegexp(str string) *regexp.Regexp {
-	rx, e := regexp.Compile(str)
-	if e != nil {
-		str = regexp.QuoteMeta(str)
+func delimiterRegexp(str string) Delimiter {
+	// Special handling of \t
+	str = strings.Replace(str, "\\t", "\t", -1)
+
+	// 1. Pattern does not contain any special character
+	if regexp.QuoteMeta(str) == str {
+		return Delimiter{str: &str}
 	}
 
+	rx, e := regexp.Compile(str)
+	// 2. Pattern is not a valid regular expression
+	if e != nil {
+		return Delimiter{str: &str}
+	}
+
+	// 3. Pattern as regular expression. Slow.
 	rx, e = regexp.Compile(fmt.Sprintf("(?:.*?%s)|(?:.+?$)", str))
 	if e != nil {
 		errorExit("invalid regular expression: " + e.Error())
 	}
-	return rx
+	return Delimiter{regex: rx}
 }
 
 func isAlphabet(char uint8) bool {
diff --git a/src/options_test.go b/src/options_test.go
index 28a8d9d..e9884f2 100644
--- a/src/options_test.go
+++ b/src/options_test.go
@@ -2,16 +2,60 @@ package fzf
 
 import (
 	"fmt"
+	"strings"
 	"testing"
 
 	"github.com/junegunn/fzf/src/curses"
 )
 
 func TestDelimiterRegex(t *testing.T) {
-	rx := delimiterRegexp("*")
+	// Valid regex
+	delim := delimiterRegexp(".")
+	if delim.regex == nil || delim.str != nil {
+		t.Error(delim)
+	}
+	// Broken regex -> string
+	delim = delimiterRegexp("[0-9")
+	if delim.regex != nil || *delim.str != "[0-9" {
+		t.Error(delim)
+	}
+	// Valid regex
+	delim = delimiterRegexp("[0-9]")
+	if strings.Index(delim.regex.String(), "[0-9]") < 0 || delim.str != nil {
+		t.Error(delim)
+	}
+	// Tab character
+	delim = delimiterRegexp("\t")
+	if delim.regex != nil || *delim.str != "\t" {
+		t.Error(delim)
+	}
+	// Tab expression
+	delim = delimiterRegexp("\\t")
+	if delim.regex != nil || *delim.str != "\t" {
+		t.Error(delim)
+	}
+	// Tabs -> regex
+	delim = delimiterRegexp("\t+")
+	if delim.regex == nil || delim.str != nil {
+		t.Error(delim)
+	}
+}
+
+func TestDelimiterRegexString(t *testing.T) {
+	delim := delimiterRegexp("*")
+	tokens := strings.Split("-*--*---**---", *delim.str)
+	if delim.regex != nil || tokens[0] != "-" || tokens[1] != "--" ||
+		tokens[2] != "---" || tokens[3] != "" || tokens[4] != "---" {
+		t.Errorf("%s %s %d", delim, tokens, len(tokens))
+	}
+}
+
+func TestDelimiterRegexRegex(t *testing.T) {
+	delim := delimiterRegexp("--\\*")
+	rx := delim.regex
 	tokens := rx.FindAllString("-*--*---**---", -1)
-	if tokens[0] != "-*" || tokens[1] != "--*" || tokens[2] != "---*" ||
-		tokens[3] != "*" || tokens[4] != "---" {
+	if delim.str != nil ||
+		tokens[0] != "-*--*" || tokens[1] != "---*" || tokens[2] != "*---" {
 		t.Errorf("%s %s %d", rx, tokens, len(tokens))
 	}
 }
diff --git a/src/pattern.go b/src/pattern.go
index 990450a..f83861e 100644
--- a/src/pattern.go
+++ b/src/pattern.go
@@ -42,7 +42,7 @@ type Pattern struct {
 	text       []rune
 	terms      []term
 	hasInvTerm bool
-	delimiter  *regexp.Regexp
+	delimiter  Delimiter
 	nth        []Range
 	procFun    map[termType]func(bool, []rune, []rune) (int, int)
 }
@@ -71,7 +71,7 @@ func clearChunkCache() {
 
 // BuildPattern builds Pattern object from the given arguments
 func BuildPattern(mode Mode, caseMode Case,
-	nth []Range, delimiter *regexp.Regexp, runes []rune) *Pattern {
+	nth []Range, delimiter Delimiter, runes []rune) *Pattern {
 
 	var asString string
 	switch mode {
diff --git a/src/pattern_test.go b/src/pattern_test.go
index 8134cdc..c7f5414 100644
--- a/src/pattern_test.go
+++ b/src/pattern_test.go
@@ -59,7 +59,7 @@ func TestExact(t *testing.T) {
 	defer clearPatternCache()
 	clearPatternCache()
 	pattern := BuildPattern(ModeExtended, CaseSmart,
-		[]Range{}, nil, []rune("'abc"))
+		[]Range{}, Delimiter{}, []rune("'abc"))
 	sidx, eidx := algo.ExactMatchNaive(
 		pattern.caseSensitive, []rune("aabbcc abc"), pattern.terms[0].text)
 	if sidx != 7 || eidx != 10 {
@@ -70,7 +70,7 @@ func TestExact(t *testing.T) {
 func TestEqual(t *testing.T) {
 	defer clearPatternCache()
 	clearPatternCache()
-	pattern := BuildPattern(ModeExtended, CaseSmart, []Range{}, nil, []rune("^AbC$"))
+	pattern := BuildPattern(ModeExtended, CaseSmart, []Range{}, Delimiter{}, []rune("^AbC$"))
 
 	match := func(str string, sidxExpected int, eidxExpected int) {
 		sidx, eidx := algo.EqualMatch(
@@ -86,17 +86,17 @@ func TestEqual(t *testing.T) {
 func TestCaseSensitivity(t *testing.T) {
 	defer clearPatternCache()
 	clearPatternCache()
-	pat1 := BuildPattern(ModeFuzzy, CaseSmart, []Range{}, nil, []rune("abc"))
+	pat1 := BuildPattern(ModeFuzzy, CaseSmart, []Range{}, Delimiter{}, []rune("abc"))
 	clearPatternCache()
-	pat2 := BuildPattern(ModeFuzzy, CaseSmart, []Range{}, nil, []rune("Abc"))
+	pat2 := BuildPattern(ModeFuzzy, CaseSmart, []Range{}, Delimiter{}, []rune("Abc"))
 	clearPatternCache()
-	pat3 := BuildPattern(ModeFuzzy, CaseIgnore, []Range{}, nil, []rune("abc"))
+	pat3 := BuildPattern(ModeFuzzy, CaseIgnore, []Range{}, Delimiter{}, []rune("abc"))
 	clearPatternCache()
-	pat4 := BuildPattern(ModeFuzzy, CaseIgnore, []Range{}, nil, []rune("Abc"))
+	pat4 := BuildPattern(ModeFuzzy, CaseIgnore, []Range{}, Delimiter{}, []rune("Abc"))
 	clearPatternCache()
-	pat5 := BuildPattern(ModeFuzzy, CaseRespect, []Range{}, nil, []rune("abc"))
+	pat5 := BuildPattern(ModeFuzzy, CaseRespect, []Range{}, Delimiter{}, []rune("abc"))
 	clearPatternCache()
-	pat6 := BuildPattern(ModeFuzzy, CaseRespect, []Range{}, nil, []rune("Abc"))
+	pat6 := BuildPattern(ModeFuzzy, CaseRespect, []Range{}, Delimiter{}, []rune("Abc"))
 
 	if string(pat1.text) != "abc" || pat1.caseSensitive != false ||
 		string(pat2.text) != "Abc" || pat2.caseSensitive != true ||
@@ -109,8 +109,8 @@ func TestCaseSensitivity(t *testing.T) {
 }
 
 func TestOrigTextAndTransformed(t *testing.T) {
-	pattern := BuildPattern(ModeExtended, CaseSmart, []Range{}, nil, []rune("jg"))
-	tokens := Tokenize([]rune("junegunn"), nil)
+	pattern := BuildPattern(ModeExtended, CaseSmart, []Range{}, Delimiter{}, []rune("jg"))
+	tokens := Tokenize([]rune("junegunn"), Delimiter{})
 	trans := Transform(tokens, []Range{Range{1, 1}})
 
 	origRunes := []rune("junegunn.choi")
diff --git a/src/tokenizer.go b/src/tokenizer.go
index a616c6b..72deb2b 100644
--- a/src/tokenizer.go
+++ b/src/tokenizer.go
@@ -22,6 +22,12 @@ type Token struct {
 	prefixLength int
 }
 
+// Delimiter for tokenizing the input
+type Delimiter struct {
+	regex *regexp.Regexp
+	str   *string
+}
+
 func newRange(begin int, end int) Range {
 	if begin == 1 {
 		begin = rangeEllipsis
@@ -68,15 +74,15 @@ func ParseRange(str *string) (Range, bool) {
 	return newRange(n, n), true
 }
 
-func withPrefixLengths(tokens []string, begin int) []Token {
+func withPrefixLengths(tokens [][]rune, begin int) []Token {
 	ret := make([]Token, len(tokens))
 
 	prefixLength := begin
 	for idx, token := range tokens {
 		// Need to define a new local variable instead of the reused token to take
 		// the pointer to it
-		ret[idx] = Token{text: []rune(token), prefixLength: prefixLength}
-		prefixLength += len([]rune(token))
+		ret[idx] = Token{text: token, prefixLength: prefixLength}
+		prefixLength += len(token)
 	}
 	return ret
 }
@@ -87,9 +93,9 @@ const (
 	awkWhite
 )
 
-func awkTokenizer(input []rune) ([]string, int) {
+func awkTokenizer(input []rune) ([][]rune, int) {
 	// 9, 32
-	ret := []string{}
+	ret := [][]rune{}
 	str := []rune{}
 	prefixLength := 0
 	state := awkNil
@@ -112,27 +118,40 @@ func awkTokenizer(input []rune) ([]string, int) {
 			if white {
 				str = append(str, r)
 			} else {
-				ret = append(ret, string(str))
+				ret = append(ret, str)
 				state = awkBlack
 				str = []rune{r}
 			}
 		}
 	}
 	if len(str) > 0 {
-		ret = append(ret, string(str))
+		ret = append(ret, str)
 	}
 	return ret, prefixLength
 }
 
 // Tokenize tokenizes the given string with the delimiter
-func Tokenize(runes []rune, delimiter *regexp.Regexp) []Token {
-	if delimiter == nil {
+func Tokenize(runes []rune, delimiter Delimiter) []Token {
+	if delimiter.str == nil && delimiter.regex == nil {
 		// AWK-style (\S+\s*)
 		tokens, prefixLength := awkTokenizer(runes)
 		return withPrefixLengths(tokens, prefixLength)
 	}
-	tokens := delimiter.FindAllString(string(runes), -1)
-	return withPrefixLengths(tokens, 0)
+
+	var tokens []string
+	if delimiter.str != nil {
+		tokens = strings.Split(string(runes), *delimiter.str)
+		for i := 0; i < len(tokens)-1; i++ {
+			tokens[i] = tokens[i] + *delimiter.str
+		}
+	} else if delimiter.regex != nil {
+		tokens = delimiter.regex.FindAllString(string(runes), -1)
+	}
+	asRunes := make([][]rune, len(tokens))
+	for i, token := range tokens {
+		asRunes[i] = []rune(token)
+	}
+	return withPrefixLengths(asRunes, 0)
 }
 
 func joinTokens(tokens []Token) []rune {
diff --git a/src/tokenizer_test.go b/src/tokenizer_test.go
index 06603ae..61017b8 100644
--- a/src/tokenizer_test.go
+++ b/src/tokenizer_test.go
@@ -43,7 +43,7 @@ func TestParseRange(t *testing.T) {
 func TestTokenize(t *testing.T) {
 	// AWK-style
 	input := "  abc:  def:  ghi  "
-	tokens := Tokenize([]rune(input), nil)
+	tokens := Tokenize([]rune(input), Delimiter{})
 	if string(tokens[0].text) != "abc:  " || tokens[0].prefixLength != 2 {
 		t.Errorf("%s", tokens)
 	}
@@ -58,7 +58,7 @@ func TestTokenize(t *testing.T) {
 func TestTransform(t *testing.T) {
 	input := "  abc:  def:  ghi:  jkl"
 	{
-		tokens := Tokenize([]rune(input), nil)
+		tokens := Tokenize([]rune(input), Delimiter{})
 		{
 			ranges := splitNth("1,2,3")
 			tx := Transform(tokens, ranges)
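
For readers following along, the three-way classification in delimiterRegexp is the
heart of the fix: skip the regex engine whenever regexp.QuoteMeta leaves the pattern
unchanged, fall back to a literal string when the pattern does not compile, and only
otherwise pay for a real regular expression. The sketch below replays that decision
outside of fzf; the classify helper, the local delimiter struct, and the sample
inputs are illustrative stand-ins, not part of the patch.

package main

import (
	"fmt"
	"regexp"
	"strings"
)

// delimiter mirrors the patch's Delimiter struct: after classification,
// exactly one of the two fields is non-nil.
type delimiter struct {
	regex *regexp.Regexp
	str   *string
}

// classify is a simplified, illustrative stand-in for delimiterRegexp.
func classify(pattern string) delimiter {
	// 1. No special characters: QuoteMeta is the identity, so the pattern
	// can be used as a plain string and the regex engine skipped entirely.
	if regexp.QuoteMeta(pattern) == pattern {
		return delimiter{str: &pattern}
	}
	// 2. Not a valid regular expression: fall back to a literal string.
	if _, err := regexp.Compile(pattern); err != nil {
		return delimiter{str: &pattern}
	}
	// 3. A genuine regular expression: wrap it so that each match keeps
	// the text up to and including the delimiter, as the patch does.
	wrapped := fmt.Sprintf("(?:.*?%s)|(?:.+?$)", pattern)
	return delimiter{regex: regexp.MustCompile(wrapped)}
}

func main() {
	for _, pattern := range []string{":", "[0-9", "--\\*"} {
		d := classify(pattern)
		if d.str != nil {
			fmt.Printf("%-8q -> plain string: %q\n",
				pattern, strings.Split("-*--*---**---", *d.str))
		} else {
			fmt.Printf("%-8q -> regex: %q\n",
				pattern, d.regex.FindAllString("-*--*---**---", -1))
		}
	}
}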
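
One subtlety in the string fast path of Tokenize: strings.Split drops the separator,
while the old regex (?:.*?%s) kept it attached to each token, and the token prefix
lengths must account for every character of the original line. That is why the patch
re-appends *delimiter.str to every token but the last. A minimal sketch of that
behavior, with a made-up input line:

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Split, then restore the delimiter on all tokens except the last,
	// mimicking the string branch of Tokenize.
	line, delim := "usr:bin:fzf", ":"
	tokens := strings.Split(line, delim)
	for i := 0; i < len(tokens)-1; i++ {
		tokens[i] += delim
	}
	fmt.Printf("%q\n", tokens) // ["usr:" "bin:" "fzf"]
}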
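
The performance claim in the changelog can be sanity-checked with a micro-benchmark
along these lines. The file name, sample line, and benchmark names are hypothetical,
and absolute numbers depend on the input and the Go version, but the gap between the
two paths is the entire point of the patch (and of #317):

// tokenizer_bench_test.go (hypothetical, not part of the patch)
// Run with: go test -bench Tokenize
package fzf

import (
	"regexp"
	"strings"
	"testing"
)

var (
	benchLine = strings.Repeat("field:", 32) // made-up sample input
	benchRx   = regexp.MustCompile("(?:.*?:)|(?:.+?$)")
)

// Fast path: plain string delimiter.
func BenchmarkTokenizeString(b *testing.B) {
	for i := 0; i < b.N; i++ {
		_ = strings.Split(benchLine, ":")
	}
}

// Slow path: regex delimiter, as before the patch.
func BenchmarkTokenizeRegex(b *testing.B) {
	for i := 0; i < b.N; i++ {
		_ = benchRx.FindAllString(benchLine, -1)
	}
}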