1// Copyright 2019 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package fuzzy
6
7import (
8	"unicode"
9)
10
11// RuneRole specifies the role of a rune in the context of an input.
12type RuneRole byte
13
14const (
15	// RNone specifies a rune without any role in the input (i.e., whitespace/non-ASCII).
16	RNone RuneRole = iota
17	// RSep specifies a rune with the role of segment separator.
18	RSep
19	// RTail specifies a rune which is a lower-case tail in a word in the input.
20	RTail
21	// RUCTail specifies a rune which is an upper-case tail in a word in the input.
22	RUCTail
23	// RHead specifies a rune which is the first character in a word in the input.
24	RHead
25)
26
27// RuneRoles detects the roles of each byte rune in an input string and stores it in the output
28// slice. The rune role depends on the input type. Stops when it parsed all the runes in the string
29// or when it filled the output. If output is nil, then it gets created.
30func RuneRoles(str string, reuse []RuneRole) []RuneRole {
31	var output []RuneRole
32	if cap(reuse) < len(str) {
33		output = make([]RuneRole, 0, len(str))
34	} else {
35		output = reuse[:0]
36	}
37
38	prev, prev2 := rtNone, rtNone
39	for i := 0; i < len(str); i++ {
40		r := rune(str[i])
41
42		role := RNone
43
44		curr := rtLower
45		if str[i] <= unicode.MaxASCII {
46			curr = runeType(rt[str[i]] - '0')
47		}
48
49		if curr == rtLower {
50			if prev == rtNone || prev == rtPunct {
51				role = RHead
52			} else {
53				role = RTail
54			}
55		} else if curr == rtUpper {
56			role = RHead
57
58			if prev == rtUpper {
59				// This and previous characters are both upper case.
60
61				if i+1 == len(str) {
62					// This is last character, previous was also uppercase -> this is UCTail
63					// i.e., (current char is C): aBC / BC / ABC
64					role = RUCTail
65				}
66			}
67		} else if curr == rtPunct {
68			switch r {
69			case '.', ':':
70				role = RSep
71			}
72		}
73		if curr != rtLower {
74			if i > 1 && output[i-1] == RHead && prev2 == rtUpper && (output[i-2] == RHead || output[i-2] == RUCTail) {
75				// The previous two characters were uppercase. The current one is not a lower case, so the
76				// previous one can't be a HEAD. Make it a UCTail.
77				// i.e., (last char is current char - B must be a UCTail): ABC / ZABC / AB.
78				output[i-1] = RUCTail
79			}
80		}
81
82		output = append(output, role)
83		prev2 = prev
84		prev = curr
85	}
86	return output
87}
88
89type runeType byte
90
91const (
92	rtNone runeType = iota
93	rtPunct
94	rtLower
95	rtUpper
96)
97
98const rt = "00000000000000000000000000000000000000000000001122222222221000000333333333333333333333333330000002222222222222222222222222200000"
99
100// LastSegment returns the substring representing the last segment from the input, where each
101// byte has an associated RuneRole in the roles slice. This makes sense only for inputs of Symbol
102// or Filename type.
103func LastSegment(input string, roles []RuneRole) string {
104	// Exclude ending separators.
105	end := len(input) - 1
106	for end >= 0 && roles[end] == RSep {
107		end--
108	}
109	if end < 0 {
110		return ""
111	}
112
113	start := end - 1
114	for start >= 0 && roles[start] != RSep {
115		start--
116	}
117
118	return input[start+1 : end+1]
119}
120
121// ToLower transforms the input string to lower case, which is stored in the output byte slice.
122// The lower casing considers only ASCII values - non ASCII values are left unmodified.
123// Stops when parsed all input or when it filled the output slice. If output is nil, then it gets
124// created.
125func ToLower(input string, reuse []byte) []byte {
126	output := reuse
127	if cap(reuse) < len(input) {
128		output = make([]byte, len(input))
129	}
130
131	for i := 0; i < len(input); i++ {
132		r := rune(input[i])
133		if r <= unicode.MaxASCII {
134			if 'A' <= r && r <= 'Z' {
135				r += 'a' - 'A'
136			}
137		}
138		output[i] = byte(r)
139	}
140	return output[:len(input)]
141}
142
143// WordConsumer defines a consumer for a word delimited by the [start,end) byte offsets in an input
144// (start is inclusive, end is exclusive).
145type WordConsumer func(start, end int)
146
147// Words find word delimiters in an input based on its bytes' mappings to rune roles. The offset
148// delimiters for each word are fed to the provided consumer function.
149func Words(roles []RuneRole, consume WordConsumer) {
150	var wordStart int
151	for i, r := range roles {
152		switch r {
153		case RUCTail, RTail:
154		case RHead, RNone, RSep:
155			if i != wordStart {
156				consume(wordStart, i)
157			}
158			wordStart = i
159			if r != RHead {
160				// Skip this character.
161				wordStart = i + 1
162			}
163		}
164	}
165	if wordStart != len(roles) {
166		consume(wordStart, len(roles))
167	}
168}
169