1// Copyright 2012 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package build 6 7import ( 8 "fmt" 9 "io" 10 "reflect" 11 "sort" 12 "strings" 13 14 "golang.org/x/text/internal/colltab" 15) 16 17// This file contains code for detecting contractions and generating 18// the necessary tables. 19// Any Unicode Collation Algorithm (UCA) table entry that has more than 20// one rune one the left-hand side is called a contraction. 21// See https://www.unicode.org/reports/tr10/#Contractions for more details. 22// 23// We define the following terms: 24// initial: a rune that appears as the first rune in a contraction. 25// suffix: a sequence of runes succeeding the initial rune 26// in a given contraction. 27// non-initial: a rune that appears in a suffix. 28// 29// A rune may be both an initial and a non-initial and may be so in 30// many contractions. An initial may typically also appear by itself. 31// In case of ambiguities, the UCA requires we match the longest 32// contraction. 33// 34// Many contraction rules share the same set of possible suffixes. 35// We store sets of suffixes in a trie that associates an index with 36// each suffix in the set. This index can be used to look up a 37// collation element associated with the (starter rune, suffix) pair. 38// 39// The trie is defined on a UTF-8 byte sequence. 40// The overall trie is represented as an array of ctEntries. Each node of the trie 41// is represented as a subsequence of ctEntries, where each entry corresponds to 42// a possible match of a next character in the search string. An entry 43// also includes the length and offset to the next sequence of entries 44// to check in case of a match. 45 46const ( 47 final = 0 48 noIndex = 0xFF 49) 50 51// ctEntry associates to a matching byte an offset and/or next sequence of 52// bytes to check. A ctEntry c is called final if a match means that the 53// longest suffix has been found. An entry c is final if c.N == 0. 54// A single final entry can match a range of characters to an offset. 55// A non-final entry always matches a single byte. Note that a non-final 56// entry might still resemble a completed suffix. 57// Examples: 58// The suffix strings "ab" and "ac" can be represented as: 59// []ctEntry{ 60// {'a', 1, 1, noIndex}, // 'a' by itself does not match, so i is 0xFF. 61// {'b', 'c', 0, 1}, // "ab" -> 1, "ac" -> 2 62// } 63// 64// The suffix strings "ab", "abc", "abd", and "abcd" can be represented as: 65// []ctEntry{ 66// {'a', 1, 1, noIndex}, // 'a' must be followed by 'b'. 67// {'b', 1, 2, 1}, // "ab" -> 1, may be followed by 'c' or 'd'. 68// {'d', 'd', final, 3}, // "abd" -> 3 69// {'c', 4, 1, 2}, // "abc" -> 2, may be followed by 'd'. 70// {'d', 'd', final, 4}, // "abcd" -> 4 71// } 72// See genStateTests in contract_test.go for more examples. 73type ctEntry struct { 74 L uint8 // non-final: byte value to match; final: lowest match in range. 75 H uint8 // non-final: relative index to next block; final: highest match in range. 76 N uint8 // non-final: length of next block; final: final 77 I uint8 // result offset. Will be noIndex if more bytes are needed to complete. 78} 79 80// contractTrieSet holds a set of contraction tries. The tries are stored 81// consecutively in the entry field. 82type contractTrieSet []struct{ l, h, n, i uint8 } 83 84// ctHandle is used to identify a trie in the trie set, consisting in an offset 85// in the array and the size of the first node. 86type ctHandle struct { 87 index, n int 88} 89 90// appendTrie adds a new trie for the given suffixes to the trie set and returns 91// a handle to it. The handle will be invalid on error. 92func appendTrie(ct *colltab.ContractTrieSet, suffixes []string) (ctHandle, error) { 93 es := make([]stridx, len(suffixes)) 94 for i, s := range suffixes { 95 es[i].str = s 96 } 97 sort.Sort(offsetSort(es)) 98 for i := range es { 99 es[i].index = i + 1 100 } 101 sort.Sort(genidxSort(es)) 102 i := len(*ct) 103 n, err := genStates(ct, es) 104 if err != nil { 105 *ct = (*ct)[:i] 106 return ctHandle{}, err 107 } 108 return ctHandle{i, n}, nil 109} 110 111// genStates generates ctEntries for a given suffix set and returns 112// the number of entries for the first node. 113func genStates(ct *colltab.ContractTrieSet, sis []stridx) (int, error) { 114 if len(sis) == 0 { 115 return 0, fmt.Errorf("genStates: list of suffices must be non-empty") 116 } 117 start := len(*ct) 118 // create entries for differing first bytes. 119 for _, si := range sis { 120 s := si.str 121 if len(s) == 0 { 122 continue 123 } 124 added := false 125 c := s[0] 126 if len(s) > 1 { 127 for j := len(*ct) - 1; j >= start; j-- { 128 if (*ct)[j].L == c { 129 added = true 130 break 131 } 132 } 133 if !added { 134 *ct = append(*ct, ctEntry{L: c, I: noIndex}) 135 } 136 } else { 137 for j := len(*ct) - 1; j >= start; j-- { 138 // Update the offset for longer suffixes with the same byte. 139 if (*ct)[j].L == c { 140 (*ct)[j].I = uint8(si.index) 141 added = true 142 } 143 // Extend range of final ctEntry, if possible. 144 if (*ct)[j].H+1 == c { 145 (*ct)[j].H = c 146 added = true 147 } 148 } 149 if !added { 150 *ct = append(*ct, ctEntry{L: c, H: c, N: final, I: uint8(si.index)}) 151 } 152 } 153 } 154 n := len(*ct) - start 155 // Append nodes for the remainder of the suffixes for each ctEntry. 156 sp := 0 157 for i, end := start, len(*ct); i < end; i++ { 158 fe := (*ct)[i] 159 if fe.H == 0 { // uninitialized non-final 160 ln := len(*ct) - start - n 161 if ln > 0xFF { 162 return 0, fmt.Errorf("genStates: relative block offset too large: %d > 255", ln) 163 } 164 fe.H = uint8(ln) 165 // Find first non-final strings with same byte as current entry. 166 for ; sis[sp].str[0] != fe.L; sp++ { 167 } 168 se := sp + 1 169 for ; se < len(sis) && len(sis[se].str) > 1 && sis[se].str[0] == fe.L; se++ { 170 } 171 sl := sis[sp:se] 172 sp = se 173 for i, si := range sl { 174 sl[i].str = si.str[1:] 175 } 176 nn, err := genStates(ct, sl) 177 if err != nil { 178 return 0, err 179 } 180 fe.N = uint8(nn) 181 (*ct)[i] = fe 182 } 183 } 184 sort.Sort(entrySort((*ct)[start : start+n])) 185 return n, nil 186} 187 188// There may be both a final and non-final entry for a byte if the byte 189// is implied in a range of matches in the final entry. 190// We need to ensure that the non-final entry comes first in that case. 191type entrySort colltab.ContractTrieSet 192 193func (fe entrySort) Len() int { return len(fe) } 194func (fe entrySort) Swap(i, j int) { fe[i], fe[j] = fe[j], fe[i] } 195func (fe entrySort) Less(i, j int) bool { 196 return fe[i].L > fe[j].L 197} 198 199// stridx is used for sorting suffixes and their associated offsets. 200type stridx struct { 201 str string 202 index int 203} 204 205// For computing the offsets, we first sort by size, and then by string. 206// This ensures that strings that only differ in the last byte by 1 207// are sorted consecutively in increasing order such that they can 208// be packed as a range in a final ctEntry. 209type offsetSort []stridx 210 211func (si offsetSort) Len() int { return len(si) } 212func (si offsetSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] } 213func (si offsetSort) Less(i, j int) bool { 214 if len(si[i].str) != len(si[j].str) { 215 return len(si[i].str) > len(si[j].str) 216 } 217 return si[i].str < si[j].str 218} 219 220// For indexing, we want to ensure that strings are sorted in string order, where 221// for strings with the same prefix, we put longer strings before shorter ones. 222type genidxSort []stridx 223 224func (si genidxSort) Len() int { return len(si) } 225func (si genidxSort) Swap(i, j int) { si[i], si[j] = si[j], si[i] } 226func (si genidxSort) Less(i, j int) bool { 227 if strings.HasPrefix(si[j].str, si[i].str) { 228 return false 229 } 230 if strings.HasPrefix(si[i].str, si[j].str) { 231 return true 232 } 233 return si[i].str < si[j].str 234} 235 236// lookup matches the longest suffix in str and returns the associated offset 237// and the number of bytes consumed. 238func lookup(ct *colltab.ContractTrieSet, h ctHandle, str []byte) (index, ns int) { 239 states := (*ct)[h.index:] 240 p := 0 241 n := h.n 242 for i := 0; i < n && p < len(str); { 243 e := states[i] 244 c := str[p] 245 if c >= e.L { 246 if e.L == c { 247 p++ 248 if e.I != noIndex { 249 index, ns = int(e.I), p 250 } 251 if e.N != final { 252 // set to new state 253 i, states, n = 0, states[int(e.H)+n:], int(e.N) 254 } else { 255 return 256 } 257 continue 258 } else if e.N == final && c <= e.H { 259 p++ 260 return int(c-e.L) + int(e.I), p 261 } 262 } 263 i++ 264 } 265 return 266} 267 268// print writes the contractTrieSet t as compilable Go code to w. It returns 269// the total number of bytes written and the size of the resulting data structure in bytes. 270func print(t *colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) { 271 update3 := func(nn, sz int, e error) { 272 n += nn 273 if err == nil { 274 err = e 275 } 276 size += sz 277 } 278 update2 := func(nn int, e error) { update3(nn, 0, e) } 279 280 update3(printArray(*t, w, name)) 281 update2(fmt.Fprintf(w, "var %sContractTrieSet = ", name)) 282 update3(printStruct(*t, w, name)) 283 update2(fmt.Fprintln(w)) 284 return 285} 286 287func printArray(ct colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) { 288 p := func(f string, a ...interface{}) { 289 nn, e := fmt.Fprintf(w, f, a...) 290 n += nn 291 if err == nil { 292 err = e 293 } 294 } 295 size = len(ct) * 4 296 p("// %sCTEntries: %d entries, %d bytes\n", name, len(ct), size) 297 p("var %sCTEntries = [%d]struct{L,H,N,I uint8}{\n", name, len(ct)) 298 for _, fe := range ct { 299 p("\t{0x%X, 0x%X, %d, %d},\n", fe.L, fe.H, fe.N, fe.I) 300 } 301 p("}\n") 302 return 303} 304 305func printStruct(ct colltab.ContractTrieSet, w io.Writer, name string) (n, size int, err error) { 306 n, err = fmt.Fprintf(w, "colltab.ContractTrieSet( %sCTEntries[:] )", name) 307 size = int(reflect.TypeOf(ct).Size()) 308 return 309} 310