1// Copyright 2012 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package build 6 7import ( 8 "fmt" 9 "log" 10 "sort" 11 "strings" 12 "unicode" 13 14 "golang.org/x/text/internal/colltab" 15 "golang.org/x/text/unicode/norm" 16) 17 18type logicalAnchor int 19 20const ( 21 firstAnchor logicalAnchor = -1 22 noAnchor = 0 23 lastAnchor = 1 24) 25 26// entry is used to keep track of a single entry in the collation element table 27// during building. Examples of entries can be found in the Default Unicode 28// Collation Element Table. 29// See https://www.unicode.org/Public/UCA/6.0.0/allkeys.txt. 30type entry struct { 31 str string // same as string(runes) 32 runes []rune 33 elems []rawCE // the collation elements 34 extend string // weights of extend to be appended to elems 35 before bool // weights relative to next instead of previous. 36 lock bool // entry is used in extension and can no longer be moved. 37 38 // prev, next, and level are used to keep track of tailorings. 39 prev, next *entry 40 level colltab.Level // next differs at this level 41 skipRemove bool // do not unlink when removed 42 43 decompose bool // can use NFKD decomposition to generate elems 44 exclude bool // do not include in table 45 implicit bool // derived, is not included in the list 46 modified bool // entry was modified in tailoring 47 logical logicalAnchor 48 49 expansionIndex int // used to store index into expansion table 50 contractionHandle ctHandle 51 contractionIndex int // index into contraction elements 52} 53 54func (e *entry) String() string { 55 return fmt.Sprintf("%X (%q) -> %X (ch:%x; ci:%d, ei:%d)", 56 e.runes, e.str, e.elems, e.contractionHandle, e.contractionIndex, e.expansionIndex) 57} 58 59func (e *entry) skip() bool { 60 return e.contraction() 61} 62 63func (e *entry) expansion() bool { 64 return !e.decompose && len(e.elems) > 1 65} 66 67func (e *entry) contraction() bool { 68 return len(e.runes) > 1 69} 70 71func (e *entry) contractionStarter() bool { 72 return e.contractionHandle.n != 0 73} 74 75// nextIndexed gets the next entry that needs to be stored in the table. 76// It returns the entry and the collation level at which the next entry differs 77// from the current entry. 78// Entries that can be explicitly derived and logical reset positions are 79// examples of entries that will not be indexed. 80func (e *entry) nextIndexed() (*entry, colltab.Level) { 81 level := e.level 82 for e = e.next; e != nil && (e.exclude || len(e.elems) == 0); e = e.next { 83 if e.level < level { 84 level = e.level 85 } 86 } 87 return e, level 88} 89 90// remove unlinks entry e from the sorted chain and clears the collation 91// elements. e may not be at the front or end of the list. This should always 92// be the case, as the front and end of the list are always logical anchors, 93// which may not be removed. 94func (e *entry) remove() { 95 if e.logical != noAnchor { 96 log.Fatalf("may not remove anchor %q", e.str) 97 } 98 // TODO: need to set e.prev.level to e.level if e.level is smaller? 99 e.elems = nil 100 if !e.skipRemove { 101 if e.prev != nil { 102 e.prev.next = e.next 103 } 104 if e.next != nil { 105 e.next.prev = e.prev 106 } 107 } 108 e.skipRemove = false 109} 110 111// insertAfter inserts n after e. 112func (e *entry) insertAfter(n *entry) { 113 if e == n { 114 panic("e == anchor") 115 } 116 if e == nil { 117 panic("unexpected nil anchor") 118 } 119 n.remove() 120 n.decompose = false // redo decomposition test 121 122 n.next = e.next 123 n.prev = e 124 if e.next != nil { 125 e.next.prev = n 126 } 127 e.next = n 128} 129 130// insertBefore inserts n before e. 131func (e *entry) insertBefore(n *entry) { 132 if e == n { 133 panic("e == anchor") 134 } 135 if e == nil { 136 panic("unexpected nil anchor") 137 } 138 n.remove() 139 n.decompose = false // redo decomposition test 140 141 n.prev = e.prev 142 n.next = e 143 if e.prev != nil { 144 e.prev.next = n 145 } 146 e.prev = n 147} 148 149func (e *entry) encodeBase() (ce uint32, err error) { 150 switch { 151 case e.expansion(): 152 ce, err = makeExpandIndex(e.expansionIndex) 153 default: 154 if e.decompose { 155 log.Fatal("decompose should be handled elsewhere") 156 } 157 ce, err = makeCE(e.elems[0]) 158 } 159 return 160} 161 162func (e *entry) encode() (ce uint32, err error) { 163 if e.skip() { 164 log.Fatal("cannot build colElem for entry that should be skipped") 165 } 166 switch { 167 case e.decompose: 168 t1 := e.elems[0].w[2] 169 t2 := 0 170 if len(e.elems) > 1 { 171 t2 = e.elems[1].w[2] 172 } 173 ce, err = makeDecompose(t1, t2) 174 case e.contractionStarter(): 175 ce, err = makeContractIndex(e.contractionHandle, e.contractionIndex) 176 default: 177 if len(e.runes) > 1 { 178 log.Fatal("colElem: contractions are handled in contraction trie") 179 } 180 ce, err = e.encodeBase() 181 } 182 return 183} 184 185// entryLess returns true if a sorts before b and false otherwise. 186func entryLess(a, b *entry) bool { 187 if res, _ := compareWeights(a.elems, b.elems); res != 0 { 188 return res == -1 189 } 190 if a.logical != noAnchor { 191 return a.logical == firstAnchor 192 } 193 if b.logical != noAnchor { 194 return b.logical == lastAnchor 195 } 196 return a.str < b.str 197} 198 199type sortedEntries []*entry 200 201func (s sortedEntries) Len() int { 202 return len(s) 203} 204 205func (s sortedEntries) Swap(i, j int) { 206 s[i], s[j] = s[j], s[i] 207} 208 209func (s sortedEntries) Less(i, j int) bool { 210 return entryLess(s[i], s[j]) 211} 212 213type ordering struct { 214 id string 215 entryMap map[string]*entry 216 ordered []*entry 217 handle *trieHandle 218} 219 220// insert inserts e into both entryMap and ordered. 221// Note that insert simply appends e to ordered. To reattain a sorted 222// order, o.sort() should be called. 223func (o *ordering) insert(e *entry) { 224 if e.logical == noAnchor { 225 o.entryMap[e.str] = e 226 } else { 227 // Use key format as used in UCA rules. 228 o.entryMap[fmt.Sprintf("[%s]", e.str)] = e 229 // Also add index entry for XML format. 230 o.entryMap[fmt.Sprintf("<%s/>", strings.Replace(e.str, " ", "_", -1))] = e 231 } 232 o.ordered = append(o.ordered, e) 233} 234 235// newEntry creates a new entry for the given info and inserts it into 236// the index. 237func (o *ordering) newEntry(s string, ces []rawCE) *entry { 238 e := &entry{ 239 runes: []rune(s), 240 elems: ces, 241 str: s, 242 } 243 o.insert(e) 244 return e 245} 246 247// find looks up and returns the entry for the given string. 248// It returns nil if str is not in the index and if an implicit value 249// cannot be derived, that is, if str represents more than one rune. 250func (o *ordering) find(str string) *entry { 251 e := o.entryMap[str] 252 if e == nil { 253 r := []rune(str) 254 if len(r) == 1 { 255 const ( 256 firstHangul = 0xAC00 257 lastHangul = 0xD7A3 258 ) 259 if r[0] >= firstHangul && r[0] <= lastHangul { 260 ce := []rawCE{} 261 nfd := norm.NFD.String(str) 262 for _, r := range nfd { 263 ce = append(ce, o.find(string(r)).elems...) 264 } 265 e = o.newEntry(nfd, ce) 266 } else { 267 e = o.newEntry(string(r[0]), []rawCE{ 268 {w: []int{ 269 implicitPrimary(r[0]), 270 defaultSecondary, 271 defaultTertiary, 272 int(r[0]), 273 }, 274 }, 275 }) 276 e.modified = true 277 } 278 e.exclude = true // do not index implicits 279 } 280 } 281 return e 282} 283 284// makeRootOrdering returns a newly initialized ordering value and populates 285// it with a set of logical reset points that can be used as anchors. 286// The anchors first_tertiary_ignorable and __END__ will always sort at 287// the beginning and end, respectively. This means that prev and next are non-nil 288// for any indexed entry. 289func makeRootOrdering() ordering { 290 const max = unicode.MaxRune 291 o := ordering{ 292 entryMap: make(map[string]*entry), 293 } 294 insert := func(typ logicalAnchor, s string, ce []int) { 295 e := &entry{ 296 elems: []rawCE{{w: ce}}, 297 str: s, 298 exclude: true, 299 logical: typ, 300 } 301 o.insert(e) 302 } 303 insert(firstAnchor, "first tertiary ignorable", []int{0, 0, 0, 0}) 304 insert(lastAnchor, "last tertiary ignorable", []int{0, 0, 0, max}) 305 insert(lastAnchor, "last primary ignorable", []int{0, defaultSecondary, defaultTertiary, max}) 306 insert(lastAnchor, "last non ignorable", []int{maxPrimary, defaultSecondary, defaultTertiary, max}) 307 insert(lastAnchor, "__END__", []int{1 << maxPrimaryBits, defaultSecondary, defaultTertiary, max}) 308 return o 309} 310 311// patchForInsert eleminates entries from the list with more than one collation element. 312// The next and prev fields of the eliminated entries still point to appropriate 313// values in the newly created list. 314// It requires that sort has been called. 315func (o *ordering) patchForInsert() { 316 for i := 0; i < len(o.ordered)-1; { 317 e := o.ordered[i] 318 lev := e.level 319 n := e.next 320 for ; n != nil && len(n.elems) > 1; n = n.next { 321 if n.level < lev { 322 lev = n.level 323 } 324 n.skipRemove = true 325 } 326 for ; o.ordered[i] != n; i++ { 327 o.ordered[i].level = lev 328 o.ordered[i].next = n 329 o.ordered[i+1].prev = e 330 } 331 } 332} 333 334// clone copies all ordering of es into a new ordering value. 335func (o *ordering) clone() *ordering { 336 o.sort() 337 oo := ordering{ 338 entryMap: make(map[string]*entry), 339 } 340 for _, e := range o.ordered { 341 ne := &entry{ 342 runes: e.runes, 343 elems: e.elems, 344 str: e.str, 345 decompose: e.decompose, 346 exclude: e.exclude, 347 logical: e.logical, 348 } 349 oo.insert(ne) 350 } 351 oo.sort() // link all ordering. 352 oo.patchForInsert() 353 return &oo 354} 355 356// front returns the first entry to be indexed. 357// It assumes that sort() has been called. 358func (o *ordering) front() *entry { 359 e := o.ordered[0] 360 if e.prev != nil { 361 log.Panicf("unexpected first entry: %v", e) 362 } 363 // The first entry is always a logical position, which should not be indexed. 364 e, _ = e.nextIndexed() 365 return e 366} 367 368// sort sorts all ordering based on their collation elements and initializes 369// the prev, next, and level fields accordingly. 370func (o *ordering) sort() { 371 sort.Sort(sortedEntries(o.ordered)) 372 l := o.ordered 373 for i := 1; i < len(l); i++ { 374 k := i - 1 375 l[k].next = l[i] 376 _, l[k].level = compareWeights(l[k].elems, l[i].elems) 377 l[i].prev = l[k] 378 } 379} 380 381// genColElems generates a collation element array from the runes in str. This 382// assumes that all collation elements have already been added to the Builder. 383func (o *ordering) genColElems(str string) []rawCE { 384 elems := []rawCE{} 385 for _, r := range []rune(str) { 386 for _, ce := range o.find(string(r)).elems { 387 if ce.w[0] != 0 || ce.w[1] != 0 || ce.w[2] != 0 { 388 elems = append(elems, ce) 389 } 390 } 391 } 392 return elems 393} 394