1// Copyright 2021 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package fuzzy
6
7import (
8	"unicode"
9)
10
11// SymbolMatcher implements a fuzzy matching algorithm optimized for Go symbols
12// of the form:
13//  example.com/path/to/package.object.field
14//
15// Knowing that we are matching symbols like this allows us to make the
16// following optimizations:
17//  - We can incorporate right-to-left relevance directly into the score
18//    calculation.
19//  - We can match from right to left, discarding leading bytes if the input is
20//    too long.
21//  - We just take the right-most match without losing too much precision. This
22//    allows us to use an O(n) algorithm.
23//  - We can operate directly on chunked strings; in many cases we will
24//    be storing the package path and/or package name separately from the
25//    symbol or identifiers, so doing this avoids allocating strings.
26//  - We can return the index of the right-most match, allowing us to trim
27//    irrelevant qualification.
28//
29// This implementation is experimental, serving as a reference fast algorithm
30// to compare to the fuzzy algorithm implemented by Matcher.
31type SymbolMatcher struct {
32	// Using buffers of length 256 is both a reasonable size for most qualified
33	// symbols, and makes it easy to avoid bounds checks by using uint8 indexes.
34	pattern     [256]rune
35	patternLen  uint8
36	inputBuffer [256]rune   // avoid allocating when considering chunks
37	roles       [256]uint32 // which roles does a rune play (word start, etc.)
38	segments    [256]uint8  // how many segments from the right is each rune
39}
40
41const (
42	segmentStart uint32 = 1 << iota
43	wordStart
44	separator
45)
46
47// NewSymbolMatcher creates a SymbolMatcher that may be used to match the given
48// search pattern.
49//
50// Currently this matcher only accepts case-insensitive fuzzy patterns.
51//
52// An empty pattern matches no input.
53func NewSymbolMatcher(pattern string) *SymbolMatcher {
54	m := &SymbolMatcher{}
55	for _, p := range pattern {
56		m.pattern[m.patternLen] = unicode.ToLower(p)
57		m.patternLen++
58		if m.patternLen == 255 || int(m.patternLen) == len(pattern) {
59			// break at 255 so that we can represent patternLen with a uint8.
60			break
61		}
62	}
63	return m
64}
65
66// Match looks for the right-most match of the search pattern within the symbol
67// represented by concatenating the given chunks, returning its offset and
68// score.
69//
70// If a match is found, the first return value will hold the absolute byte
71// offset within all chunks for the start of the symbol. In other words, the
72// index of the match within strings.Join(chunks, ""). If no match is found,
73// the first return value will be -1.
74//
75// The second return value will be the score of the match, which is always
76// between 0 and 1, inclusive. A score of 0 indicates no match.
77func (m *SymbolMatcher) Match(chunks []string) (int, float64) {
78	// Explicit behavior for an empty pattern.
79	//
80	// As a minor optimization, this also avoids nilness checks later on, since
81	// the compiler can prove that m != nil.
82	if m.patternLen == 0 {
83		return -1, 0
84	}
85
86	// First phase: populate the input buffer with lower-cased runes.
87	//
88	// We could also check for a forward match here, but since we'd have to write
89	// the entire input anyway this has negligible impact on performance.
90
91	var (
92		inputLen  = uint8(0)
93		modifiers = wordStart | segmentStart
94	)
95
96input:
97	for _, chunk := range chunks {
98		for _, r := range chunk {
99			if r == '.' || r == '/' {
100				modifiers |= separator
101			}
102			// optimization: avoid calls to unicode.ToLower, which can't be inlined.
103			l := r
104			if r <= unicode.MaxASCII {
105				if 'A' <= r && r <= 'Z' {
106					l = r + 'a' - 'A'
107				}
108			} else {
109				l = unicode.ToLower(r)
110			}
111			if l != r {
112				modifiers |= wordStart
113			}
114			m.inputBuffer[inputLen] = l
115			m.roles[inputLen] = modifiers
116			inputLen++
117			if m.roles[inputLen-1]&separator != 0 {
118				modifiers = wordStart | segmentStart
119			} else {
120				modifiers = 0
121			}
122			// TODO: we should prefer the right-most input if it overflows, rather
123			//       than the left-most as we're doing here.
124			if inputLen == 255 {
125				break input
126			}
127		}
128	}
129
130	// Second phase: find the right-most match, and count segments from the
131	// right.
132
133	var (
134		pi    = uint8(m.patternLen - 1) // pattern index
135		p     = m.pattern[pi]           // pattern rune
136		start = -1                      // start offset of match
137		rseg  = uint8(0)
138	)
139	const maxSeg = 3 // maximum number of segments from the right to count, for scoring purposes.
140
141	for ii := inputLen - 1; ; ii-- {
142		r := m.inputBuffer[ii]
143		if rseg < maxSeg && m.roles[ii]&separator != 0 {
144			rseg++
145		}
146		m.segments[ii] = rseg
147		if p == r {
148			if pi == 0 {
149				start = int(ii)
150				break
151			}
152			pi--
153			p = m.pattern[pi]
154		}
155		// Don't check ii >= 0 in the loop condition: ii is a uint8.
156		if ii == 0 {
157			break
158		}
159	}
160
161	if start < 0 {
162		// no match: skip scoring
163		return -1, 0
164	}
165
166	// Third phase: find the shortest match, and compute the score.
167
168	// Score is the average score for each character.
169	//
170	// A character score is the multiple of:
171	//   1. 1.0 if the character starts a segment, .8 if the character start a
172	//      mid-segment word, otherwise 0.6. This carries over to immediately
173	//      following characters.
174	//   2. For the final character match, the multiplier from (1) is reduced to
175	//     .8 if the next character in the input is a mid-segment word, or 0.6 if
176	//      the next character in the input is not a word or segment start. This
177	//      ensures that we favor whole-word or whole-segment matches over prefix
178	//      matches.
179	//   3. 1.0 if the character is part of the last segment, otherwise
180	//      1.0-.2*<segments from the right>, with a max segment count of 3.
181	//
182	// This is a very naive algorithm, but it is fast. There's lots of prior art
183	// here, and we should leverage it. For example, we could explicitly consider
184	// character distance, and exact matches of words or segments.
185	//
186	// Also note that this might not actually find the highest scoring match, as
187	// doing so could require a non-linear algorithm, depending on how the score
188	// is calculated.
189
190	pi = 0
191	p = m.pattern[pi]
192
193	const (
194		segStreak  = 1.0
195		wordStreak = 0.8
196		noStreak   = 0.6
197		perSegment = 0.2 // we count at most 3 segments above
198	)
199
200	streakBonus := noStreak
201	totScore := 0.0
202	for ii := uint8(start); ii < inputLen; ii++ {
203		r := m.inputBuffer[ii]
204		if r == p {
205			pi++
206			p = m.pattern[pi]
207			// Note: this could be optimized with some bit operations.
208			switch {
209			case m.roles[ii]&segmentStart != 0 && segStreak > streakBonus:
210				streakBonus = segStreak
211			case m.roles[ii]&wordStart != 0 && wordStreak > streakBonus:
212				streakBonus = wordStreak
213			}
214			finalChar := pi >= m.patternLen
215			// finalCost := 1.0
216			if finalChar && streakBonus > noStreak {
217				switch {
218				case ii == inputLen-1 || m.roles[ii+1]&segmentStart != 0:
219					// Full segment: no reduction
220				case m.roles[ii+1]&wordStart != 0:
221					streakBonus = wordStreak
222				default:
223					streakBonus = noStreak
224				}
225			}
226			totScore += streakBonus * (1.0 - float64(m.segments[ii])*perSegment)
227			if finalChar {
228				break
229			}
230		} else {
231			streakBonus = noStreak
232		}
233	}
234
235	return start, totScore / float64(m.patternLen)
236}
237