diff --git a/src/tokenizer.go b/src/tokenizer.go index eec1989..e48f48b 100644 --- a/src/tokenizer.go +++ b/src/tokenizer.go @@ -140,13 +140,13 @@ func Tokenize(text util.Chars, delimiter Delimiter) []Token { return withPrefixLengths(tokens, prefixLength) } - var tokens []string if delimiter.str != nil { - tokens = strings.Split(text.ToString(), *delimiter.str) - for i := 0; i < len(tokens)-1; i++ { - tokens[i] = tokens[i] + *delimiter.str - } - } else if delimiter.regex != nil { + return withPrefixLengths(text.Split(*delimiter.str), 0) + } + + // FIXME performance + var tokens []string + if delimiter.regex != nil { str := text.ToString() for len(str) > 0 { loc := delimiter.regex.FindStringIndex(str) diff --git a/src/util/chars.go b/src/util/chars.go index 6034ee5..12417c6 100644 --- a/src/util/chars.go +++ b/src/util/chars.go @@ -118,3 +118,39 @@ func (chars *Chars) Slice(b int, e int) Chars { } return Chars{bytes: chars.bytes[b:e]} } + +func (chars *Chars) Split(delimiter string) []Chars { + delim := []rune(delimiter) + numChars := chars.Length() + numDelim := len(delim) + begin := 0 + ret := make([]Chars, 0, 1) + + for index := 0; index < numChars; { + if index+numDelim <= numChars { + match := true + for off, d := range delim { + if chars.Get(index+off) != d { + match = false + break + } + } + // Found the delimiter + if match { + incr := Max(numDelim, 1) + ret = append(ret, chars.Slice(begin, index+incr)) + index += incr + begin = index + continue + } + } else { + // Impossible to find the delimiter in the remaining substring + break + } + index++ + } + if begin < numChars || len(ret) == 0 { + ret = append(ret, chars.Slice(begin, numChars)) + } + return ret +} diff --git a/src/util/chars_test.go b/src/util/chars_test.go index 2cb6fc7..12c629d 100644 --- a/src/util/chars_test.go +++ b/src/util/chars_test.go @@ -55,3 +55,28 @@ func TestTrimLength(t *testing.T) { check(" h o ", 5) check(" ", 0) } + +func TestSplit(t *testing.T) { + check := func(str string, delim string, tokens ...string) { + input := ToChars([]byte(str)) + result := input.Split(delim) + if len(result) != len(tokens) { + t.Errorf("Invalid Split result for '%s': %d tokens found (expected %d): %s", + str, len(result), len(tokens), result) + } + for idx, token := range tokens { + if result[idx].ToString() != token { + t.Errorf("Invalid Split result for '%s': %s (expected %s)", + str, result[idx].ToString(), token) + } + } + } + check("abc:def::", ":", "abc:", "def:", ":") + check("abc:def::", "-", "abc:def::") + check("abc", "", "a", "b", "c") + check("abc", "a", "a", "bc") + check("abc", "ab", "ab", "c") + check("abc", "abc", "abc") + check("abc", "abcd", "abc") + check("", "abcd", "") +}