From 69aa2fea686b6e26418fa352abebd81e0a1ecc7b Mon Sep 17 00:00:00 2001 From: Junegunn Choi Date: Sun, 30 Jul 2017 17:31:50 +0900 Subject: [PATCH] Optimize fuzzy search performance for ASCII strings --- src/algo/algo.go | 68 ++++++++++++++++++++++++++++++++++++++++------- src/util/chars.go | 8 ++++++ 2 files changed, 67 insertions(+), 9 deletions(-) diff --git a/src/algo/algo.go b/src/algo/algo.go index c4930c1..a2f0f2b 100644 --- a/src/algo/algo.go +++ b/src/algo/algo.go @@ -78,9 +78,11 @@ Scoring criteria */ import ( + "bytes" "fmt" "strings" "unicode" + "unicode/utf8" "github.com/junegunn/fzf/src/util" ) @@ -251,19 +253,37 @@ func normalizeRune(r rune) rune { // 2. "pattern" is already normalized if "normalize" is true type Algo func(caseSensitive bool, normalize bool, forward bool, input util.Chars, pattern []rune, withPos bool, slab *util.Slab) (Result, *[]int) +func trySkip(input *util.Chars, caseSensitive bool, b byte, from int) int { + byteArray := input.Bytes()[from:] + idx := bytes.IndexByte(byteArray, b) + if idx == 0 { + // Can't skip any further + return from + } + // We may need to search for the uppercase letter again. We don't have to + // consider normalization as we can be sure that this is an ASCII string. + if !caseSensitive && b >= 'a' && b <= 'z' { + uidx := bytes.IndexByte(byteArray, b-32) + if idx < 0 || uidx >= 0 && uidx < idx { + idx = uidx + } + } + if idx < 0 { + return -1 + } + return from + idx +} + func FuzzyMatchV2(caseSensitive bool, normalize bool, forward bool, input util.Chars, pattern []rune, withPos bool, slab *util.Slab) (Result, *[]int) { // Assume that pattern is given in lowercase if case-insensitive. // First check if there's a match and calculate bonus for each position. // If the input string is too long, consider finding the matching chars in // this phase as well (non-optimal alignment). - N := input.Length() M := len(pattern) - switch M { - case 0: + if M == 0 { return Result{0, 0, 0}, posArray(withPos, M) - case 1: - return ExactMatchNaive(caseSensitive, normalize, forward, input, pattern[0:1], withPos, slab) } + N := input.Length() // Since O(nm) algorithm can be prohibitively expensive for large input, // we fall back to the greedy algorithm. @@ -281,10 +301,31 @@ func FuzzyMatchV2(caseSensitive bool, normalize bool, forward bool, input util.C // Rune array offset32, T := alloc32(offset32, slab, N, false) - // Phase 1. Check if there's a match and calculate bonus for each point + // Phase 1. Optimized search for ASCII string + firstIdx := 0 + if input.IsBytes() { + idx := 0 + for pidx := 0; pidx < M; pidx++ { + // Not possible + if pattern[pidx] >= utf8.RuneSelf { + return Result{-1, -1, 0}, nil + } + idx = trySkip(&input, caseSensitive, byte(pattern[pidx]), idx) + if idx < 0 { + return Result{-1, -1, 0}, nil + } + if pidx == 0 && idx > 0 { + // Step back to find the right bonus point + firstIdx = idx - 1 + } + idx++ + } + } + + // Phase 2. Calculate bonus for each point pidx, lastIdx, prevClass := 0, 0, charNonWord input.CopyRunes(T) - for idx := 0; idx < N; idx++ { + for idx := firstIdx; idx < N; idx++ { char := T[idx] var class charClass if char <= unicode.MaxASCII { @@ -324,8 +365,17 @@ func FuzzyMatchV2(caseSensitive bool, normalize bool, forward bool, input util.C if pidx != M { return Result{-1, -1, 0}, nil } + if M == 1 && B[F[0]] == bonusBoundary { + p := int(F[0]) + result := Result{p, p + 1, scoreMatch + bonusBoundary*bonusFirstCharMultiplier} + if !withPos { + return result, nil + } + pos := []int{p} + return result, &pos + } - // Phase 2. Fill in score matrix (H) + // Phase 3. Fill in score matrix (H) // Unlike the original algorithm, we do not allow omission. width := lastIdx - int(F[0]) + 1 offset16, H := alloc16(offset16, slab, width*M, false) @@ -414,7 +464,7 @@ func FuzzyMatchV2(caseSensitive bool, normalize bool, forward bool, input util.C } } - // Phase 3. (Optional) Backtrace to find character positions + // Phase 4. (Optional) Backtrace to find character positions pos := posArray(withPos, M) j := int(F[0]) if withPos { diff --git a/src/util/chars.go b/src/util/chars.go index 61e9341..0127bcb 100644 --- a/src/util/chars.go +++ b/src/util/chars.go @@ -65,6 +65,14 @@ func RunesToChars(runes []rune) Chars { return Chars{slice: *(*[]byte)(unsafe.Pointer(&runes)), inBytes: false} } +func (chars *Chars) IsBytes() bool { + return chars.inBytes +} + +func (chars *Chars) Bytes() []byte { + return chars.slice +} + func (chars *Chars) optionalRunes() []rune { if chars.inBytes { return nil