Use trimmed length when --nth is used with --tiebreak=length

This change improves sort ordering for aligned tabular input. Given the following input:

    apple   juice   100
    apple   pie     200

fzf --nth=2 will now prefer the one with pie. Before this change fzf compared "juice   " and "pie     " (the second field including its column padding), both of which have the same length.
2015-10-02 18:40:20 +09:00 · 2015-10-02 18:40:20 +09:00 · 92a75c9563
commit 92a75c9563
parent 7c7a30c472
7 changed files with 124 additions and 28 deletions
--- a/src/item.go
+++ b/src/item.go
@ -6,8 +6,8 @@ import (
 	"github.com/junegunn/fzf/src/curses"
 )

-// Offset holds two 32-bit integers denoting the offsets of a matched substring
-type Offset [2]int32
+// Offset holds three 32-bit integers denoting the offsets of a matched substring
+type Offset [3]int32

 type colorOffset struct {
 	offset [2]int32
@ -43,10 +43,13 @@ func (item *Item) Rank(cache bool) Rank {
 	}
 	matchlen := 0
 	prevEnd := 0
+	lenSum := 0
 	minBegin := math.MaxUint16
 	for _, offset := range item.offsets {
 		begin := int(offset[0])
 		end := int(offset[1])
+		trimLen := int(offset[2])
+		lenSum += trimLen
 		if prevEnd > begin {
 			begin = prevEnd
 		}
@ -65,10 +68,7 @@ func (item *Item) Rank(cache bool) Rank {
 	case byLength:
 		// It is guaranteed that .transformed in not null in normal execution
 		if item.transformed != nil {
-			lenSum := 0
-			for _, token := range item.transformed {
-				lenSum += len(token.text)
-			}
+			// If offsets is empty, lenSum will be 0, but we don't care
 			tiebreak = uint16(lenSum)
 		} else {
 			tiebreak = uint16(len(item.text))
@ -116,7 +116,8 @@ func (item *Item) colorOffsets(color int, bold bool, current bool) []colorOffset
 	if len(item.colors) == 0 {
 		var offsets []colorOffset
 		for _, off := range item.offsets {
-			offsets = append(offsets, colorOffset{offset: off, color: color, bold: bold})
+
+			offsets = append(offsets, colorOffset{offset: [2]int32{off[0], off[1]}, color: color, bold: bold})
 		}
 		return offsets
 	}
@ -160,7 +161,7 @@ func (item *Item) colorOffsets(color int, bold bool, current bool) []colorOffset
 		if curr != 0 && idx > start {
 			if curr == -1 {
 				offsets = append(offsets, colorOffset{
-					offset: Offset{int32(start), int32(idx)}, color: color, bold: bold})
+					offset: [2]int32{int32(start), int32(idx)}, color: color, bold: bold})
 			} else {
 				ansi := item.colors[curr-1]
 				fg := ansi.color.fg
@ -180,7 +181,7 @@ func (item *Item) colorOffsets(color int, bold bool, current bool) []colorOffset
 					}
 				}
 				offsets = append(offsets, colorOffset{
-					offset: Offset{int32(start), int32(idx)},
+					offset: [2]int32{int32(start), int32(idx)},
 					color:  curses.PairFor(fg, bg),
 					bold:   ansi.color.bold || bold})
 			}
--- a/src/pattern.go
+++ b/src/pattern.go
@ -6,6 +6,7 @@ import (
 	"strings"

 	"github.com/junegunn/fzf/src/algo"
+	"github.com/junegunn/fzf/src/util"
 )

 // fuzzy
@ -251,9 +252,9 @@ func (p *Pattern) matchChunk(chunk *Chunk) []*Item {
 	matches := []*Item{}
 	if p.mode == ModeFuzzy {
 		for _, item := range *chunk {
-			if sidx, eidx := p.fuzzyMatch(item); sidx >= 0 {
+			if sidx, eidx, tlen := p.fuzzyMatch(item); sidx >= 0 {
 				matches = append(matches,
-					dupItem(item, []Offset{Offset{int32(sidx), int32(eidx)}}))
+					dupItem(item, []Offset{Offset{int32(sidx), int32(eidx), int32(tlen)}}))
 			}
 		}
 	} else {
@ -269,7 +270,7 @@ func (p *Pattern) matchChunk(chunk *Chunk) []*Item {
 // MatchItem returns true if the Item is a match
 func (p *Pattern) MatchItem(item *Item) bool {
 	if p.mode == ModeFuzzy {
-		sidx, _ := p.fuzzyMatch(item)
+		sidx, _, _ := p.fuzzyMatch(item)
 		return sidx >= 0
 	}
 	offsets := p.extendedMatch(item)
@ -288,7 +289,7 @@ func dupItem(item *Item, offsets []Offset) *Item {
 		rank:        Rank{0, 0, item.index}}
 }

-func (p *Pattern) fuzzyMatch(item *Item) (int, int) {
+func (p *Pattern) fuzzyMatch(item *Item) (int, int, int) {
 	input := p.prepareInput(item)
 	return p.iter(algo.FuzzyMatch, input, p.caseSensitive, p.forward, p.text)
 }
@ -298,13 +299,13 @@ func (p *Pattern) extendedMatch(item *Item) []Offset {
 	offsets := []Offset{}
 	for _, term := range p.terms {
 		pfun := p.procFun[term.typ]
-		if sidx, eidx := p.iter(pfun, input, term.caseSensitive, p.forward, term.text); sidx >= 0 {
+		if sidx, eidx, tlen := p.iter(pfun, input, term.caseSensitive, p.forward, term.text); sidx >= 0 {
 			if term.inv {
 				break
 			}
-			offsets = append(offsets, Offset{int32(sidx), int32(eidx)})
+			offsets = append(offsets, Offset{int32(sidx), int32(eidx), int32(tlen)})
 		} else if term.inv {
-			offsets = append(offsets, Offset{0, 0})
+			offsets = append(offsets, Offset{0, 0, 0})
 		}
 	}
 	return offsets
@ -320,19 +321,19 @@ func (p *Pattern) prepareInput(item *Item) []Token {
 		tokens := Tokenize(item.text, p.delimiter)
 		ret = Transform(tokens, p.nth)
 	} else {
-		ret = []Token{Token{text: item.text, prefixLength: 0}}
+		ret = []Token{Token{text: item.text, prefixLength: 0, trimLength: util.TrimLen(item.text)}}
 	}
 	item.transformed = ret
 	return ret
 }

 func (p *Pattern) iter(pfun func(bool, bool, []rune, []rune) (int, int),
-	tokens []Token, caseSensitive bool, forward bool, pattern []rune) (int, int) {
+	tokens []Token, caseSensitive bool, forward bool, pattern []rune) (int, int, int) {
 	for _, part := range tokens {
 		prefixLength := part.prefixLength
 		if sidx, eidx := pfun(caseSensitive, forward, part.text, pattern); sidx >= 0 {
-			return sidx + prefixLength, eidx + prefixLength
+			return sidx + prefixLength, eidx + prefixLength, part.trimLength
 		}
 	}
-	return -1, -1
+	return -1, -1, -1 // math.MaxUint16
 }
--- a/src/tokenizer.go
+++ b/src/tokenizer.go
@ -20,6 +20,7 @@ type Range struct {
 type Token struct {
 	text         []rune
 	prefixLength int
+	trimLength   int
 }

 // Delimiter for tokenizing the input
@ -81,7 +82,7 @@ func withPrefixLengths(tokens [][]rune, begin int) []Token {
 	for idx, token := range tokens {
 		// Need to define a new local variable instead of the reused token to take
 		// the pointer to it
-		ret[idx] = Token{text: token, prefixLength: prefixLength}
+		ret[idx] = Token{token, prefixLength, util.TrimLen(token)}
 		prefixLength += len(token)
 	}
 	return ret
@ -233,7 +234,7 @@ func Transform(tokens []Token, withNth []Range) []Token {
 		} else {
 			prefixLength = 0
 		}
-		transTokens[idx] = Token{part, prefixLength}
+		transTokens[idx] = Token{part, prefixLength, util.TrimLen(part)}
 	}
 	return transTokens
 }
--- a/src/tokenizer_test.go
+++ b/src/tokenizer_test.go
@ -44,22 +44,22 @@ func TestTokenize(t *testing.T) {
 	// AWK-style
 	input := "  abc:  def:  ghi  "
 	tokens := Tokenize([]rune(input), Delimiter{})
-	if string(tokens[0].text) != "abc:  " || tokens[0].prefixLength != 2 {
+	if string(tokens[0].text) != "abc:  " || tokens[0].prefixLength != 2 || tokens[0].trimLength != 4 {
 		t.Errorf("%s", tokens)
 	}

 	// With delimiter
 	tokens = Tokenize([]rune(input), delimiterRegexp(":"))
-	if string(tokens[0].text) != "  abc:" || tokens[0].prefixLength != 0 {
+	if string(tokens[0].text) != "  abc:" || tokens[0].prefixLength != 0 || tokens[0].trimLength != 4 {
 		t.Errorf("%s", tokens)
 	}

 	// With delimiter regex
 	tokens = Tokenize([]rune(input), delimiterRegexp("\\s+"))
-	if string(tokens[0].text) != "  " || tokens[0].prefixLength != 0 ||
-		string(tokens[1].text) != "abc:  " || tokens[1].prefixLength != 2 ||
-		string(tokens[2].text) != "def:  " || tokens[2].prefixLength != 8 ||
-		string(tokens[3].text) != "ghi  " || tokens[3].prefixLength != 14 {
+	if string(tokens[0].text) != "  " || tokens[0].prefixLength != 0 || tokens[0].trimLength != 0 ||
+		string(tokens[1].text) != "abc:  " || tokens[1].prefixLength != 2 || tokens[1].trimLength != 4 ||
+		string(tokens[2].text) != "def:  " || tokens[2].prefixLength != 8 || tokens[2].trimLength != 4 ||
+		string(tokens[3].text) != "ghi  " || tokens[3].prefixLength != 14 || tokens[3].trimLength != 3 {
 		t.Errorf("%s", tokens)
 	}
 }
--- a/src/util/util.go
+++ b/src/util/util.go
@ -75,6 +75,7 @@ func IsTty() bool {
 	return int(C.isatty(C.int(os.Stdin.Fd()))) != 0
 }

+// TrimRight returns rune array with trailing white spaces cut off
 func TrimRight(runes []rune) []rune {
 	var i int
 	for i = len(runes) - 1; i >= 0; i-- {
@ -86,6 +87,7 @@ func TrimRight(runes []rune) []rune {
 	return runes[0 : i+1]
 }

+// BytesToRunes converts byte array into rune array
 func BytesToRunes(bytea []byte) []rune {
 	runes := make([]rune, 0, len(bytea))
 	for i := 0; i < len(bytea); {
@ -100,3 +102,27 @@ func BytesToRunes(bytea []byte) []rune {
 	}
 	return runes
 }
+
+// TrimLen returns the number of runes left after stripping leading and trailing spaces and tabs (0 if none remain)
+func TrimLen(runes []rune) int {
+	var i int
+	for i = len(runes) - 1; i >= 0; i-- {
+		char := runes[i]
+		if char != ' ' && char != '\t' {
+			break
+		}
+	}
+	// No non-blank rune found (empty or all spaces/tabs): trimmed length is 0
+	if i < 0 {
+		return 0
+	}
+
+	var j int
+	for j = 0; j < len(runes); j++ {
+		char := runes[j]
+		if char != ' ' && char != '\t' {
+			break
+		}
+	}
+	return i - j + 1
+}
--- a/src/util/util_test.go
+++ b/src/util/util_test.go
@ -20,3 +20,23 @@ func TestContrain(t *testing.T) {
 		t.Error("Expected", 3)
 	}
 }
+
+func TestTrimLen(t *testing.T) {
+	check := func(str string, exp int) {
+		trimmed := TrimLen([]rune(str))
+		if trimmed != exp {
+			t.Errorf("Invalid TrimLen result for '%s': %d (expected %d)",
+				str, trimmed, exp)
+		}
+	}
+	check("hello", 5)
+	check("hello ", 5)
+	check("hello  ", 5)
+	check(" hello", 5)
+	check("  hello", 5)
+	check(" hello ", 5)
+	check("  hello  ", 5)
+	check("h   o", 5)
+	check("  h   o  ", 5)
+	check("         ", 0)
+}
--- a/test/test_go.rb
+++ b/test/test_go.rb
@ -527,6 +527,53 @@ class TestGoFZF < TestBase
    assert_equal output, `cat #{tempname} | #{FZF} -fh -n2 -d:`.split($/)
  end

+  def test_tiebreak_length_with_nth_trim_length
+    input = [
+      "apple juice   bottle 1",
+      "apple  ui     bottle 2",
+      "app     ice   bottle 3",
+      "app     ic    bottle 4",
+    ]
+    writelines tempname, input
+
+    # len(1)
+    output = [
+      "app     ice   bottle 3",
+      "app     ic    bottle 4",
+      "apple juice   bottle 1",
+      "apple  ui     bottle 2",
+    ]
+    assert_equal output, `cat #{tempname} | #{FZF} -fa -n1`.split($/)
+
+    # len(1 ~ 2)
+    output = [
+      "apple  ui     bottle 2",
+      "app     ic    bottle 4",
+      "apple juice   bottle 1",
+      "app     ice   bottle 3",
+    ]
+    assert_equal output, `cat #{tempname} | #{FZF} -fai -n1..2`.split($/)
+
+    # len(1) + len(2)
+    output = [
+      "app     ic    bottle 4",
+      "app     ice   bottle 3",
+      "apple  ui     bottle 2",
+      "apple juice   bottle 1",
+    ]
+    assert_equal output, `cat #{tempname} | #{FZF} -x -f"a i" -n1,2`.split($/)
+
+    # len(2)
+    output = [
+      "apple  ui     bottle 2",
+      "app     ic    bottle 4",
+      "app     ice   bottle 3",
+      "apple juice   bottle 1",
+    ]
+    assert_equal output, `cat #{tempname} | #{FZF} -fi -n2`.split($/)
+    assert_equal output, `cat #{tempname} | #{FZF} -fi -n2,1..2`.split($/)
+  end
+
  def test_tiebreak_end_backward_scan
    input = %w[
      foobar-fb