1// Copyright 2012 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// Package collate contains types for comparing and sorting Unicode strings 6// according to a given collation order. Package locale provides a high-level 7// interface to collation. Users should typically use that package instead. 8package collate 9 10import ( 11 "bytes" 12 "exp/norm" 13) 14 15// AlternateHandling identifies the various ways in which variables are handled. 16// A rune with a primary weight lower than the variable top is considered a 17// variable. 18// See http://www.unicode.org/reports/tr10/#Variable_Weighting for details. 19type AlternateHandling int 20 21const ( 22 // AltNonIgnorable turns off special handling of variables. 23 AltNonIgnorable AlternateHandling = iota 24 25 // AltBlanked sets variables and all subsequent primary ignorables to be 26 // ignorable at all levels. This is identical to removing all variables 27 // and subsequent primary ignorables from the input. 28 AltBlanked 29 30 // AltShifted sets variables to be ignorable for levels one through three and 31 // adds a fourth level based on the values of the ignored levels. 32 AltShifted 33 34 // AltShiftTrimmed is a slight variant of AltShifted that is used to 35 // emulate POSIX. 36 AltShiftTrimmed 37) 38 39// Collator provides functionality for comparing strings for a given 40// collation order. 41type Collator struct { 42 // TODO: hide most of these options. Low-level options are set through the locale 43 // identifier (as defined by LDML) while high-level options are set through SetOptions. 44 // Using high-level options allows us to be more flexible (such as not ignoring 45 // Thai vowels for IgnoreDiacriticals) and more user-friendly (such as allowing 46 // diacritical marks to be ignored but not case without having to fiddle with levels). 47 48 // Strength sets the maximum level to use in comparison. 49 Strength Level 50 51 // Alternate specifies an alternative handling of variables. 52 Alternate AlternateHandling 53 54 // Backwards specifies the order of sorting at the secondary level. 55 // This option exists predominantly to support reverse sorting of accents in French. 56 Backwards bool 57 58 // TODO: implement: 59 // With HiraganaQuaternary enabled, Hiragana codepoints will get lower values 60 // than all the other non-variable code points. Strength must be greater or 61 // equal to Quaternary for this to take effect. 62 HiraganaQuaternary bool 63 64 // If CaseLevel is true, a level consisting only of case characteristics will 65 // be inserted in front of the tertiary level. To ignore accents but take 66 // cases into account, set Strength to Primary and CaseLevel to true. 67 CaseLevel bool 68 69 // If Numeric is true, any sequence of decimal digits (category is Nd) is sorted 70 // at a primary level with its numeric value. For example, "A-21" < "A-123". 71 Numeric bool 72 73 // The largest primary value that is considered to be variable. 74 variableTop uint32 75 76 f norm.Form 77 78 t Weigher 79 80 sorter sorter 81 82 _iter [2]iter 83} 84 85// An Option is used to change the behavior of Collator. They override the 86// settings passed through the locale identifier. 87type Option int 88 89const ( 90 Numeric Option = 1 << iota // Sort numbers numerically ("2" < "12"). 91 IgnoreCase // Case-insensitive search. 92 IgnoreDiacritics // Ignore diacritical marks. ("o" == "ö"). 93 IgnoreWidth // Ignore full versus normal width. 94 UpperFirst // Sort upper case before lower case. 95 LowerFirst // Sort lower case before upper case. 96 Force // Force ordering if strings are equivalent but not equal. 97 98 Loose = IgnoreDiacritics | IgnoreWidth | IgnoreCase 99) 100 101// SetOptions accepts a Options or-ed together. All previous calls to SetOptions are ignored. 102func (c *Collator) SetOptions(o Option) { 103 // TODO: implement 104} 105 106func (c *Collator) iter(i int) *iter { 107 // TODO: evaluate performance for making the second iterator optional. 108 return &c._iter[i] 109} 110 111// Locales returns the list of locales for which collating differs from its parent locale. 112// The returned value should not be modified. 113func Locales() []string { 114 return availableLocales 115} 116 117// New returns a new Collator initialized for the given locale. 118func New(loc string) *Collator { 119 // TODO: handle locale selection according to spec. 120 var t tableIndex 121 if loc != "" { 122 if idx, ok := locales[loc]; ok { 123 t = idx 124 } else { 125 t = locales["root"] 126 } 127 } 128 return NewFromTable(Init(t)) 129} 130 131func NewFromTable(t Weigher) *Collator { 132 c := &Collator{ 133 Strength: Tertiary, 134 f: norm.NFD, 135 t: t, 136 } 137 c._iter[0].init(c) 138 c._iter[1].init(c) 139 return c 140} 141 142// Buffer holds keys generated by Key and KeyString. 143type Buffer struct { 144 buf [4096]byte 145 key []byte 146} 147 148func (b *Buffer) init() { 149 if b.key == nil { 150 b.key = b.buf[:0] 151 } 152} 153 154// Reset clears the buffer from previous results generated by Key and KeyString. 155func (b *Buffer) Reset() { 156 b.key = b.key[:0] 157} 158 159// Compare returns an integer comparing the two byte slices. 160// The result will be 0 if a==b, -1 if a < b, and +1 if a > b. 161func (c *Collator) Compare(a, b []byte) int { 162 // TODO: skip identical prefixes once we have a fast way to detect if a rune is 163 // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest. 164 c.iter(0).setInput(a) 165 c.iter(1).setInput(b) 166 if res := c.compare(); res != 0 { 167 return res 168 } 169 if Identity == c.Strength { 170 return bytes.Compare(a, b) 171 } 172 return 0 173} 174 175// CompareString returns an integer comparing the two strings. 176// The result will be 0 if a==b, -1 if a < b, and +1 if a > b. 177func (c *Collator) CompareString(a, b string) int { 178 // TODO: skip identical prefixes once we have a fast way to detect if a rune is 179 // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest. 180 c.iter(0).setInputString(a) 181 c.iter(1).setInputString(b) 182 if res := c.compare(); res != 0 { 183 return res 184 } 185 if Identity == c.Strength { 186 if a < b { 187 return -1 188 } else if a > b { 189 return 1 190 } 191 } 192 return 0 193} 194 195func compareLevel(f func(i *iter) int, a, b *iter) int { 196 a.pce = 0 197 b.pce = 0 198 for { 199 va := f(a) 200 vb := f(b) 201 if va != vb { 202 if va < vb { 203 return -1 204 } 205 return 1 206 } else if va == 0 { 207 break 208 } 209 } 210 return 0 211} 212 213func (c *Collator) compare() int { 214 ia, ib := c.iter(0), c.iter(1) 215 // Process primary level 216 if c.Alternate != AltShifted { 217 // TODO: implement script reordering 218 // TODO: special hiragana handling 219 if res := compareLevel((*iter).nextPrimary, ia, ib); res != 0 { 220 return res 221 } 222 } else { 223 // TODO: handle shifted 224 } 225 if Secondary <= c.Strength { 226 f := (*iter).nextSecondary 227 if c.Backwards { 228 f = (*iter).prevSecondary 229 } 230 if res := compareLevel(f, ia, ib); res != 0 { 231 return res 232 } 233 } 234 // TODO: special case handling (Danish?) 235 if Tertiary <= c.Strength || c.CaseLevel { 236 if res := compareLevel((*iter).nextTertiary, ia, ib); res != 0 { 237 return res 238 } 239 // TODO: Not needed for the default value of AltNonIgnorable? 240 if Quaternary <= c.Strength { 241 if res := compareLevel((*iter).nextQuaternary, ia, ib); res != 0 { 242 return res 243 } 244 } 245 } 246 return 0 247} 248 249// Key returns the collation key for str. 250// Passing the buffer buf may avoid memory allocations. 251// The returned slice will point to an allocation in Buffer and will remain 252// valid until the next call to buf.Reset(). 253func (c *Collator) Key(buf *Buffer, str []byte) []byte { 254 // See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details. 255 buf.init() 256 return c.key(buf, c.getColElems(str)) 257} 258 259// KeyFromString returns the collation key for str. 260// Passing the buffer buf may avoid memory allocations. 261// The returned slice will point to an allocation in Buffer and will retain 262// valid until the next call to buf.ResetKeys(). 263func (c *Collator) KeyFromString(buf *Buffer, str string) []byte { 264 // See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details. 265 buf.init() 266 return c.key(buf, c.getColElemsString(str)) 267} 268 269func (c *Collator) key(buf *Buffer, w []Elem) []byte { 270 processWeights(c.Alternate, c.variableTop, w) 271 kn := len(buf.key) 272 c.keyFromElems(buf, w) 273 return buf.key[kn:] 274} 275 276func (c *Collator) getColElems(str []byte) []Elem { 277 i := c.iter(0) 278 i.setInput(str) 279 for i.next() { 280 } 281 return i.ce 282} 283 284func (c *Collator) getColElemsString(str string) []Elem { 285 i := c.iter(0) 286 i.setInputString(str) 287 for i.next() { 288 } 289 return i.ce 290} 291 292type iter struct { 293 bytes []byte 294 str string 295 296 wa [512]Elem 297 ce []Elem 298 pce int 299 nce int // nce <= len(nce) 300 301 prevCCC uint8 302 pStarter int 303 304 t Weigher 305} 306 307func (i *iter) init(c *Collator) { 308 i.t = c.t 309 i.ce = i.wa[:0] 310} 311 312func (i *iter) reset() { 313 i.ce = i.ce[:0] 314 i.nce = 0 315 i.prevCCC = 0 316 i.pStarter = 0 317} 318 319func (i *iter) setInput(s []byte) *iter { 320 i.bytes = s 321 i.str = "" 322 i.reset() 323 return i 324} 325 326func (i *iter) setInputString(s string) *iter { 327 i.str = s 328 i.bytes = nil 329 i.reset() 330 return i 331} 332 333func (i *iter) done() bool { 334 return len(i.str) == 0 && len(i.bytes) == 0 335} 336 337func (i *iter) tail(n int) { 338 if i.bytes == nil { 339 i.str = i.str[n:] 340 } else { 341 i.bytes = i.bytes[n:] 342 } 343} 344 345func (i *iter) appendNext() int { 346 var sz int 347 if i.bytes == nil { 348 i.ce, sz = i.t.AppendNextString(i.ce, i.str) 349 } else { 350 i.ce, sz = i.t.AppendNext(i.ce, i.bytes) 351 } 352 return sz 353} 354 355// next appends Elems to the internal array until it adds an element with CCC=0. 356// In the majority of cases, a Elem with a primary value > 0 will have 357// a CCC of 0. The CCC values of colation elements are also used to detect if the 358// input string was not normalized and to adjust the result accordingly. 359func (i *iter) next() bool { 360 for !i.done() { 361 p0 := len(i.ce) 362 sz := i.appendNext() 363 i.tail(sz) 364 last := len(i.ce) - 1 365 if ccc := i.ce[last].CCC(); ccc == 0 { 366 i.nce = len(i.ce) 367 i.pStarter = last 368 i.prevCCC = 0 369 return true 370 } else if p0 < last && i.ce[p0].CCC() == 0 { 371 // set i.nce to only cover part of i.ce for which ccc == 0 and 372 // use rest the next call to next. 373 for p0++; p0 < last && i.ce[p0].CCC() == 0; p0++ { 374 } 375 i.nce = p0 376 i.pStarter = p0 - 1 377 i.prevCCC = ccc 378 return true 379 } else if ccc < i.prevCCC { 380 i.doNorm(p0, ccc) // should be rare for most common cases 381 } else { 382 i.prevCCC = ccc 383 } 384 } 385 if len(i.ce) != i.nce { 386 i.nce = len(i.ce) 387 return true 388 } 389 return false 390} 391 392// nextPlain is the same as next, but does not "normalize" the collation 393// elements. 394// TODO: remove this function. Using this instead of next does not seem 395// to improve performance in any significant way. We retain this until 396// later for evaluation purposes. 397func (i *iter) nextPlain() bool { 398 if i.done() { 399 return false 400 } 401 sz := i.appendNext() 402 i.tail(sz) 403 i.nce = len(i.ce) 404 return true 405} 406 407const maxCombiningCharacters = 30 408 409// doNorm reorders the collation elements in i.ce. 410// It assumes that blocks of collation elements added with appendNext 411// either start and end with the same CCC or start with CCC == 0. 412// This allows for a single insertion point for the entire block. 413// The correctness of this assumption is verified in builder.go. 414func (i *iter) doNorm(p int, ccc uint8) { 415 if p-i.pStarter > maxCombiningCharacters { 416 i.prevCCC = i.ce[len(i.ce)-1].CCC() 417 i.pStarter = len(i.ce) - 1 418 return 419 } 420 n := len(i.ce) 421 k := p 422 for p--; p > i.pStarter && ccc < i.ce[p-1].CCC(); p-- { 423 } 424 i.ce = append(i.ce, i.ce[p:k]...) 425 copy(i.ce[p:], i.ce[k:]) 426 i.ce = i.ce[:n] 427} 428 429func (i *iter) nextPrimary() int { 430 for { 431 for ; i.pce < i.nce; i.pce++ { 432 if v := i.ce[i.pce].Primary(); v != 0 { 433 i.pce++ 434 return v 435 } 436 } 437 if !i.next() { 438 return 0 439 } 440 } 441 panic("should not reach here") 442} 443 444func (i *iter) nextSecondary() int { 445 for ; i.pce < len(i.ce); i.pce++ { 446 if v := i.ce[i.pce].Secondary(); v != 0 { 447 i.pce++ 448 return v 449 } 450 } 451 return 0 452} 453 454func (i *iter) prevSecondary() int { 455 for ; i.pce < len(i.ce); i.pce++ { 456 if v := i.ce[len(i.ce)-i.pce-1].Secondary(); v != 0 { 457 i.pce++ 458 return v 459 } 460 } 461 return 0 462} 463 464func (i *iter) nextTertiary() int { 465 for ; i.pce < len(i.ce); i.pce++ { 466 if v := i.ce[i.pce].Tertiary(); v != 0 { 467 i.pce++ 468 return int(v) 469 } 470 } 471 return 0 472} 473 474func (i *iter) nextQuaternary() int { 475 for ; i.pce < len(i.ce); i.pce++ { 476 if v := i.ce[i.pce].Quaternary(); v != 0 { 477 i.pce++ 478 return v 479 } 480 } 481 return 0 482} 483 484func appendPrimary(key []byte, p int) []byte { 485 // Convert to variable length encoding; supports up to 23 bits. 486 if p <= 0x7FFF { 487 key = append(key, uint8(p>>8), uint8(p)) 488 } else { 489 key = append(key, uint8(p>>16)|0x80, uint8(p>>8), uint8(p)) 490 } 491 return key 492} 493 494// keyFromElems converts the weights ws to a compact sequence of bytes. 495// The result will be appended to the byte buffer in buf. 496func (c *Collator) keyFromElems(buf *Buffer, ws []Elem) { 497 for _, v := range ws { 498 if w := v.Primary(); w > 0 { 499 buf.key = appendPrimary(buf.key, w) 500 } 501 } 502 if Secondary <= c.Strength { 503 buf.key = append(buf.key, 0, 0) 504 // TODO: we can use one 0 if we can guarantee that all non-zero weights are > 0xFF. 505 if !c.Backwards { 506 for _, v := range ws { 507 if w := v.Secondary(); w > 0 { 508 buf.key = append(buf.key, uint8(w>>8), uint8(w)) 509 } 510 } 511 } else { 512 for i := len(ws) - 1; i >= 0; i-- { 513 if w := ws[i].Secondary(); w > 0 { 514 buf.key = append(buf.key, uint8(w>>8), uint8(w)) 515 } 516 } 517 } 518 } else if c.CaseLevel { 519 buf.key = append(buf.key, 0, 0) 520 } 521 if Tertiary <= c.Strength || c.CaseLevel { 522 buf.key = append(buf.key, 0, 0) 523 for _, v := range ws { 524 if w := v.Tertiary(); w > 0 { 525 buf.key = append(buf.key, uint8(w)) 526 } 527 } 528 // Derive the quaternary weights from the options and other levels. 529 // Note that we represent MaxQuaternary as 0xFF. The first byte of the 530 // representation of a primary weight is always smaller than 0xFF, 531 // so using this single byte value will compare correctly. 532 if Quaternary <= c.Strength && c.Alternate >= AltShifted { 533 if c.Alternate == AltShiftTrimmed { 534 lastNonFFFF := len(buf.key) 535 buf.key = append(buf.key, 0) 536 for _, v := range ws { 537 if w := v.Quaternary(); w == MaxQuaternary { 538 buf.key = append(buf.key, 0xFF) 539 } else if w > 0 { 540 buf.key = appendPrimary(buf.key, w) 541 lastNonFFFF = len(buf.key) 542 } 543 } 544 buf.key = buf.key[:lastNonFFFF] 545 } else { 546 buf.key = append(buf.key, 0) 547 for _, v := range ws { 548 if w := v.Quaternary(); w == MaxQuaternary { 549 buf.key = append(buf.key, 0xFF) 550 } else if w > 0 { 551 buf.key = appendPrimary(buf.key, w) 552 } 553 } 554 } 555 } 556 } 557} 558 559func processWeights(vw AlternateHandling, top uint32, wa []Elem) { 560 ignore := false 561 vtop := int(top) 562 switch vw { 563 case AltShifted, AltShiftTrimmed: 564 for i := range wa { 565 if p := wa[i].Primary(); p <= vtop && p != 0 { 566 wa[i] = MakeQuaternary(p) 567 ignore = true 568 } else if p == 0 { 569 if ignore { 570 wa[i] = ceIgnore 571 } 572 } else { 573 ignore = false 574 } 575 } 576 case AltBlanked: 577 for i := range wa { 578 if p := wa[i].Primary(); p <= vtop && (ignore || p != 0) { 579 wa[i] = ceIgnore 580 ignore = true 581 } else { 582 ignore = false 583 } 584 } 585 } 586} 587