1// Copyright 2019 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package fuzzy 6 7import ( 8 "unicode" 9) 10 11// RuneRole specifies the role of a rune in the context of an input. 12type RuneRole byte 13 14const ( 15 // RNone specifies a rune without any role in the input (i.e., whitespace/non-ASCII). 16 RNone RuneRole = iota 17 // RSep specifies a rune with the role of segment separator. 18 RSep 19 // RTail specifies a rune which is a lower-case tail in a word in the input. 20 RTail 21 // RUCTail specifies a rune which is an upper-case tail in a word in the input. 22 RUCTail 23 // RHead specifies a rune which is the first character in a word in the input. 24 RHead 25) 26 27// RuneRoles detects the roles of each byte rune in an input string and stores it in the output 28// slice. The rune role depends on the input type. Stops when it parsed all the runes in the string 29// or when it filled the output. If output is nil, then it gets created. 30func RuneRoles(str string, reuse []RuneRole) []RuneRole { 31 var output []RuneRole 32 if cap(reuse) < len(str) { 33 output = make([]RuneRole, 0, len(str)) 34 } else { 35 output = reuse[:0] 36 } 37 38 prev, prev2 := rtNone, rtNone 39 for i := 0; i < len(str); i++ { 40 r := rune(str[i]) 41 42 role := RNone 43 44 curr := rtLower 45 if str[i] <= unicode.MaxASCII { 46 curr = runeType(rt[str[i]] - '0') 47 } 48 49 if curr == rtLower { 50 if prev == rtNone || prev == rtPunct { 51 role = RHead 52 } else { 53 role = RTail 54 } 55 } else if curr == rtUpper { 56 role = RHead 57 58 if prev == rtUpper { 59 // This and previous characters are both upper case. 60 61 if i+1 == len(str) { 62 // This is last character, previous was also uppercase -> this is UCTail 63 // i.e., (current char is C): aBC / BC / ABC 64 role = RUCTail 65 } 66 } 67 } else if curr == rtPunct { 68 switch r { 69 case '.', ':': 70 role = RSep 71 } 72 } 73 if curr != rtLower { 74 if i > 1 && output[i-1] == RHead && prev2 == rtUpper && (output[i-2] == RHead || output[i-2] == RUCTail) { 75 // The previous two characters were uppercase. The current one is not a lower case, so the 76 // previous one can't be a HEAD. Make it a UCTail. 77 // i.e., (last char is current char - B must be a UCTail): ABC / ZABC / AB. 78 output[i-1] = RUCTail 79 } 80 } 81 82 output = append(output, role) 83 prev2 = prev 84 prev = curr 85 } 86 return output 87} 88 89type runeType byte 90 91const ( 92 rtNone runeType = iota 93 rtPunct 94 rtLower 95 rtUpper 96) 97 98const rt = "00000000000000000000000000000000000000000000001122222222221000000333333333333333333333333330000002222222222222222222222222200000" 99 100// LastSegment returns the substring representing the last segment from the input, where each 101// byte has an associated RuneRole in the roles slice. This makes sense only for inputs of Symbol 102// or Filename type. 103func LastSegment(input string, roles []RuneRole) string { 104 // Exclude ending separators. 105 end := len(input) - 1 106 for end >= 0 && roles[end] == RSep { 107 end-- 108 } 109 if end < 0 { 110 return "" 111 } 112 113 start := end - 1 114 for start >= 0 && roles[start] != RSep { 115 start-- 116 } 117 118 return input[start+1 : end+1] 119} 120 121// ToLower transforms the input string to lower case, which is stored in the output byte slice. 122// The lower casing considers only ASCII values - non ASCII values are left unmodified. 123// Stops when parsed all input or when it filled the output slice. If output is nil, then it gets 124// created. 125func ToLower(input string, reuse []byte) []byte { 126 output := reuse 127 if cap(reuse) < len(input) { 128 output = make([]byte, len(input)) 129 } 130 131 for i := 0; i < len(input); i++ { 132 r := rune(input[i]) 133 if r <= unicode.MaxASCII { 134 if 'A' <= r && r <= 'Z' { 135 r += 'a' - 'A' 136 } 137 } 138 output[i] = byte(r) 139 } 140 return output[:len(input)] 141} 142 143// WordConsumer defines a consumer for a word delimited by the [start,end) byte offsets in an input 144// (start is inclusive, end is exclusive). 145type WordConsumer func(start, end int) 146 147// Words find word delimiters in an input based on its bytes' mappings to rune roles. The offset 148// delimiters for each word are fed to the provided consumer function. 149func Words(roles []RuneRole, consume WordConsumer) { 150 var wordStart int 151 for i, r := range roles { 152 switch r { 153 case RUCTail, RTail: 154 case RHead, RNone, RSep: 155 if i != wordStart { 156 consume(wordStart, i) 157 } 158 wordStart = i 159 if r != RHead { 160 // Skip this character. 161 wordStart = i + 1 162 } 163 } 164 } 165 if wordStart != len(roles) { 166 consume(wordStart, len(roles)) 167 } 168} 169