// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build ignore

// This program generates the trie for casing operations. The Unicode casing
// algorithm requires the lookup of various properties and mappings for each
// rune. The table generated by this generator combines several of the most
// frequently used of these into a single trie so that they can be accessed
// with a single lookup.
package main

import (
	"bytes"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"reflect"
	"strconv"
	"strings"
	"unicode"

	"golang.org/x/text/internal/gen"
	"golang.org/x/text/internal/triegen"
	"golang.org/x/text/internal/ucd"
	"golang.org/x/text/unicode/norm"
)

// main initializes the gen package, generates the case tables (tables.go) and
// the test tables (tables_test.go), and finally passes gen_trieval.go to
// gen.Repackage together with the target file trieval.go and package "cases".
func main() {
	gen.Init()
	genTables()
	genTablesTest()
	gen.Repackage("gen_trieval.go", "trieval.go", "cases")
}

// runeInfo contains all information for a rune that we care about for casing
// operations.
type runeInfo struct {
	// Rune is the code point this record describes. It is filled in lazily
	// by parseUCD the first time a UCD file mentions the rune.
	Rune rune

	entry info // trie value for this rune.

	// CaseMode is the case type of the rune. parseUCD sets it to cTitle,
	// cLower or cUpper; it keeps its zero value otherwise.
	CaseMode info

	// Simple case mappings.
	Simple [1 + maxCaseMode][]rune

	// Special casing
	HasSpecial  bool                   // an unconditional SpecialCasing.txt entry exists
	Conditional bool                   // a conditional SpecialCasing.txt entry exists
	Special     [1 + maxCaseMode][]rune // mappings of the unconditional entry

	// Folding
	FoldSimple  rune   // CaseFolding.txt types C and S
	FoldSpecial rune   // CaseFolding.txt type T
	FoldFull    []rune // CaseFolding.txt types C and F

	// TODO: FC_NFKC, or equivalent data.

	// Properties
	SoftDotted     bool
	CaseIgnorable  bool
	Cased          bool
	DecomposeGreek bool // NOTE(review): not assigned anywhere in the visible code.
	BreakType      string        // raw word break property value
	BreakCat       breakCategory // BreakType collapsed onto the categories we need

	// We care mostly about 0, Above, and IotaSubscript.
	CCC byte
}

// breakCategory is the coarse word-break class derived from a rune's word
// break property.
type breakCategory int

const (
	// breakBreak is the zero value, so any rune whose word break property is
	// not explicitly mapped to one of the other categories ends up here.
	breakBreak breakCategory = iota
	breakLetter
	breakMid
)

// mapping returns the case mapping for the given case type.
83func (r *runeInfo) mapping(c info) string { 84 if r.HasSpecial { 85 return string(r.Special[c]) 86 } 87 if len(r.Simple[c]) != 0 { 88 return string(r.Simple[c]) 89 } 90 return string(r.Rune) 91} 92 93func parse(file string, f func(p *ucd.Parser)) { 94 ucd.Parse(gen.OpenUCDFile(file), f) 95} 96 97func parseUCD() []runeInfo { 98 chars := make([]runeInfo, unicode.MaxRune) 99 100 get := func(r rune) *runeInfo { 101 c := &chars[r] 102 c.Rune = r 103 return c 104 } 105 106 parse("UnicodeData.txt", func(p *ucd.Parser) { 107 ri := get(p.Rune(0)) 108 ri.CCC = byte(p.Int(ucd.CanonicalCombiningClass)) 109 ri.Simple[cLower] = p.Runes(ucd.SimpleLowercaseMapping) 110 ri.Simple[cUpper] = p.Runes(ucd.SimpleUppercaseMapping) 111 ri.Simple[cTitle] = p.Runes(ucd.SimpleTitlecaseMapping) 112 if p.String(ucd.GeneralCategory) == "Lt" { 113 ri.CaseMode = cTitle 114 } 115 }) 116 117 // <code>; <property> 118 parse("PropList.txt", func(p *ucd.Parser) { 119 if p.String(1) == "Soft_Dotted" { 120 chars[p.Rune(0)].SoftDotted = true 121 } 122 }) 123 124 // <code>; <word break type> 125 parse("DerivedCoreProperties.txt", func(p *ucd.Parser) { 126 ri := get(p.Rune(0)) 127 switch p.String(1) { 128 case "Case_Ignorable": 129 ri.CaseIgnorable = true 130 case "Cased": 131 ri.Cased = true 132 case "Lowercase": 133 ri.CaseMode = cLower 134 case "Uppercase": 135 ri.CaseMode = cUpper 136 } 137 }) 138 139 // <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)? 140 parse("SpecialCasing.txt", func(p *ucd.Parser) { 141 // We drop all conditional special casing and deal with them manually in 142 // the language-specific case mappers. Rune 0x03A3 is the only one with 143 // a conditional formatting that is not language-specific. However, 144 // dealing with this letter is tricky, especially in a streaming 145 // context, so we deal with it in the Caser for Greek specifically. 
146 ri := get(p.Rune(0)) 147 if p.String(4) == "" { 148 ri.HasSpecial = true 149 ri.Special[cLower] = p.Runes(1) 150 ri.Special[cTitle] = p.Runes(2) 151 ri.Special[cUpper] = p.Runes(3) 152 } else { 153 ri.Conditional = true 154 } 155 }) 156 157 // TODO: Use text breaking according to UAX #29. 158 // <code>; <word break type> 159 parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) { 160 ri := get(p.Rune(0)) 161 ri.BreakType = p.String(1) 162 163 // We collapse the word breaking properties onto the categories we need. 164 switch p.String(1) { // TODO: officially we need to canonicalize. 165 case "MidLetter", "MidNumLet", "Single_Quote": 166 ri.BreakCat = breakMid 167 if !ri.CaseIgnorable { 168 // finalSigma relies on the fact that all breakMid runes are 169 // also a Case_Ignorable. Revisit this code when this changes. 170 log.Fatalf("Rune %U, which has a break category mid, is not a case ignorable", ri) 171 } 172 case "ALetter", "Hebrew_Letter", "Numeric", "Extend", "ExtendNumLet", "Format", "ZWJ": 173 ri.BreakCat = breakLetter 174 } 175 }) 176 177 // <code>; <type>; <mapping> 178 parse("CaseFolding.txt", func(p *ucd.Parser) { 179 ri := get(p.Rune(0)) 180 switch p.String(1) { 181 case "C": 182 ri.FoldSimple = p.Rune(2) 183 ri.FoldFull = p.Runes(2) 184 case "S": 185 ri.FoldSimple = p.Rune(2) 186 case "T": 187 ri.FoldSpecial = p.Rune(2) 188 case "F": 189 ri.FoldFull = p.Runes(2) 190 default: 191 log.Fatalf("%U: unknown type: %s", p.Rune(0), p.String(1)) 192 } 193 }) 194 195 return chars 196} 197 198func genTables() { 199 chars := parseUCD() 200 verifyProperties(chars) 201 202 t := triegen.NewTrie("case") 203 for i := range chars { 204 c := &chars[i] 205 makeEntry(c) 206 t.Insert(rune(i), uint64(c.entry)) 207 } 208 209 w := gen.NewCodeWriter() 210 defer w.WriteVersionedGoFile("tables.go", "cases") 211 212 gen.WriteUnicodeVersion(w) 213 214 // TODO: write CLDR version after adding a mechanism to detect that the 215 // tables on which the manually created 
locale-sensitive casing code is 216 // based hasn't changed. 217 218 w.WriteVar("xorData", string(xorData)) 219 w.WriteVar("exceptions", string(exceptionData)) 220 221 sz, err := t.Gen(w, triegen.Compact(&sparseCompacter{})) 222 if err != nil { 223 log.Fatal(err) 224 } 225 w.Size += sz 226} 227 228func makeEntry(ri *runeInfo) { 229 if ri.CaseIgnorable { 230 if ri.Cased { 231 ri.entry = cIgnorableCased 232 } else { 233 ri.entry = cIgnorableUncased 234 } 235 } else { 236 ri.entry = ri.CaseMode 237 } 238 239 // TODO: handle soft-dotted. 240 241 ccc := cccOther 242 switch ri.CCC { 243 case 0: // Not_Reordered 244 ccc = cccZero 245 case above: // Above 246 ccc = cccAbove 247 } 248 switch ri.BreakCat { 249 case breakBreak: 250 ccc = cccBreak 251 case breakMid: 252 ri.entry |= isMidBit 253 } 254 255 ri.entry |= ccc 256 257 if ri.CaseMode == cUncased { 258 return 259 } 260 261 // Need to do something special. 262 if ri.CaseMode == cTitle || ri.HasSpecial || ri.mapping(cTitle) != ri.mapping(cUpper) { 263 makeException(ri) 264 return 265 } 266 if f := string(ri.FoldFull); len(f) > 0 && f != ri.mapping(cUpper) && f != ri.mapping(cLower) { 267 makeException(ri) 268 return 269 } 270 271 // Rune is either lowercase or uppercase. 272 273 orig := string(ri.Rune) 274 mapped := "" 275 if ri.CaseMode == cUpper { 276 mapped = ri.mapping(cLower) 277 } else { 278 mapped = ri.mapping(cUpper) 279 } 280 281 if len(orig) != len(mapped) { 282 makeException(ri) 283 return 284 } 285 286 if string(ri.FoldFull) == ri.mapping(cUpper) { 287 ri.entry |= inverseFoldBit 288 } 289 290 n := len(orig) 291 292 // Create per-byte XOR mask. 293 var b []byte 294 for i := 0; i < n; i++ { 295 b = append(b, orig[i]^mapped[i]) 296 } 297 298 // Remove leading 0 bytes, but keep at least one byte. 
299 for ; len(b) > 1 && b[0] == 0; b = b[1:] { 300 } 301 302 if len(b) == 1 && b[0]&0xc0 == 0 { 303 ri.entry |= info(b[0]) << xorShift 304 return 305 } 306 307 key := string(b) 308 x, ok := xorCache[key] 309 if !ok { 310 xorData = append(xorData, 0) // for detecting start of sequence 311 xorData = append(xorData, b...) 312 313 x = len(xorData) - 1 314 xorCache[key] = x 315 } 316 ri.entry |= info(x<<xorShift) | xorIndexBit 317} 318 319var xorCache = map[string]int{} 320 321// xorData contains byte-wise XOR data for the least significant bytes of a 322// UTF-8 encoded rune. An index points to the last byte. The sequence starts 323// with a zero terminator. 324var xorData = []byte{} 325 326// See the comments in gen_trieval.go re "the exceptions slice". 327var exceptionData = []byte{0} 328 329// makeException encodes case mappings that cannot be expressed in a simple 330// XOR diff. 331func makeException(ri *runeInfo) { 332 ccc := ri.entry & cccMask 333 // Set exception bit and retain case type. 334 ri.entry &= 0x0007 335 ri.entry |= exceptionBit 336 337 if len(exceptionData) >= 1<<numExceptionBits { 338 log.Fatalf("%U:exceptionData too large %#x > %d bits", ri.Rune, len(exceptionData), numExceptionBits) 339 } 340 341 // Set the offset in the exceptionData array. 342 ri.entry |= info(len(exceptionData) << exceptionShift) 343 344 orig := string(ri.Rune) 345 tc := ri.mapping(cTitle) 346 uc := ri.mapping(cUpper) 347 lc := ri.mapping(cLower) 348 ff := string(ri.FoldFull) 349 350 // addString sets the length of a string and adds it to the expansions array. 351 addString := func(s string, b *byte) { 352 if len(s) == 0 { 353 // Zero-length mappings exist, but only for conditional casing, 354 // which we are representing outside of this table. 
355 log.Fatalf("%U: has zero-length mapping.", ri.Rune) 356 } 357 *b <<= 3 358 if s != orig || ri.CaseMode == cLower { 359 n := len(s) 360 if n > 7 { 361 log.Fatalf("%U: mapping larger than 7 (%d)", ri.Rune, n) 362 } 363 *b |= byte(n) 364 exceptionData = append(exceptionData, s...) 365 } 366 } 367 368 // byte 0: 369 exceptionData = append(exceptionData, byte(ccc)|byte(len(ff))) 370 371 // byte 1: 372 p := len(exceptionData) 373 exceptionData = append(exceptionData, 0) 374 375 if len(ff) > 7 { // May be zero-length. 376 log.Fatalf("%U: fold string larger than 7 (%d)", ri.Rune, len(ff)) 377 } 378 exceptionData = append(exceptionData, ff...) 379 ct := ri.CaseMode 380 if ct != cLower { 381 addString(lc, &exceptionData[p]) 382 } 383 if ct != cUpper { 384 addString(uc, &exceptionData[p]) 385 } 386 if ct != cTitle { 387 addString(tc, &exceptionData[p]) 388 } 389} 390 391// sparseCompacter is a trie value block Compacter. There are many cases where 392// successive runes alternate between lower- and upper-case. This Compacter 393// exploits this by adding a special case type where the case value is obtained 394// from or-ing it with the least-significant bit of the rune, creating large 395// ranges of equal case values that compress well. 396type sparseCompacter struct { 397 sparseBlocks [][]uint16 398 sparseOffsets []uint16 399 sparseCount int 400} 401 402// makeSparse returns the number of elements that compact block would contain 403// as well as the modified values. 404func makeSparse(vals []uint64) ([]uint16, int) { 405 // Copy the values. 406 values := make([]uint16, len(vals)) 407 for i, v := range vals { 408 values[i] = uint16(v) 409 } 410 411 alt := func(i int, v uint16) uint16 { 412 if cm := info(v & fullCasedMask); cm == cUpper || cm == cLower { 413 // Convert cLower or cUpper to cXORCase value, which has the form 11x. 
414 xor := v 415 xor &^= 1 416 xor |= uint16(i&1) ^ (v & 1) 417 xor |= 0x4 418 return xor 419 } 420 return v 421 } 422 423 var count int 424 var previous uint16 425 for i, v := range values { 426 if v != 0 { 427 // Try if the unmodified value is equal to the previous. 428 if v == previous { 429 continue 430 } 431 432 // Try if the xor-ed value is equal to the previous value. 433 a := alt(i, v) 434 if a == previous { 435 values[i] = a 436 continue 437 } 438 439 // This is a new value. 440 count++ 441 442 // Use the xor-ed value if it will be identical to the next value. 443 if p := i + 1; p < len(values) && alt(p, values[p]) == a { 444 values[i] = a 445 v = a 446 } 447 } 448 previous = v 449 } 450 return values, count 451} 452 453func (s *sparseCompacter) Size(v []uint64) (int, bool) { 454 _, n := makeSparse(v) 455 456 // We limit using this method to having 16 entries. 457 if n > 16 { 458 return 0, false 459 } 460 461 return 2 + int(reflect.TypeOf(valueRange{}).Size())*n, true 462} 463 464func (s *sparseCompacter) Store(v []uint64) uint32 { 465 h := uint32(len(s.sparseOffsets)) 466 values, sz := makeSparse(v) 467 s.sparseBlocks = append(s.sparseBlocks, values) 468 s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount)) 469 s.sparseCount += sz 470 return h 471} 472 473func (s *sparseCompacter) Handler() string { 474 // The sparse global variable and its lookup method is defined in gen_trieval.go. 475 return "sparse.lookup" 476} 477 478func (s *sparseCompacter) Print(w io.Writer) (retErr error) { 479 p := func(format string, args ...interface{}) { 480 _, err := fmt.Fprintf(w, format, args...) 
481 if retErr == nil && err != nil { 482 retErr = err 483 } 484 } 485 486 ls := len(s.sparseBlocks) 487 if ls == len(s.sparseOffsets) { 488 s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount)) 489 } 490 p("// sparseOffsets: %d entries, %d bytes\n", ls+1, (ls+1)*2) 491 p("var sparseOffsets = %#v\n\n", s.sparseOffsets) 492 493 ns := s.sparseCount 494 p("// sparseValues: %d entries, %d bytes\n", ns, ns*4) 495 p("var sparseValues = [%d]valueRange {", ns) 496 for i, values := range s.sparseBlocks { 497 p("\n// Block %#x, offset %#x", i, s.sparseOffsets[i]) 498 var v uint16 499 for i, nv := range values { 500 if nv != v { 501 if v != 0 { 502 p(",hi:%#02x},", 0x80+i-1) 503 } 504 if nv != 0 { 505 p("\n{value:%#04x,lo:%#02x", nv, 0x80+i) 506 } 507 } 508 v = nv 509 } 510 if v != 0 { 511 p(",hi:%#02x},", 0x80+len(values)-1) 512 } 513 } 514 p("\n}\n\n") 515 return 516} 517 518// verifyProperties that properties of the runes that are relied upon in the 519// implementation. Each property is marked with an identifier that is referred 520// to in the places where it is used. 521func verifyProperties(chars []runeInfo) { 522 for i, c := range chars { 523 r := rune(i) 524 525 // Rune properties. 526 527 // A.1: modifier never changes on lowercase. [ltLower] 528 if c.CCC > 0 && unicode.ToLower(r) != r { 529 log.Fatalf("%U: non-starter changes when lowercased", r) 530 } 531 532 // A.2: properties of decompositions starting with I or J. [ltLower] 533 d := norm.NFD.PropertiesString(string(r)).Decomposition() 534 if len(d) > 0 { 535 if d[0] == 'I' || d[0] == 'J' { 536 // A.2.1: we expect at least an ASCII character and a modifier. 537 if len(d) < 3 { 538 log.Fatalf("%U: length of decomposition was %d; want >= 3", r, len(d)) 539 } 540 541 // All subsequent runes are modifiers and all have the same CCC. 
542 runes := []rune(string(d[1:])) 543 ccc := chars[runes[0]].CCC 544 545 for _, mr := range runes[1:] { 546 mc := chars[mr] 547 548 // A.2.2: all modifiers have a CCC of Above or less. 549 if ccc == 0 || ccc > above { 550 log.Fatalf("%U: CCC of successive rune (%U) was %d; want (0,230]", r, mr, ccc) 551 } 552 553 // A.2.3: a sequence of modifiers all have the same CCC. 554 if mc.CCC != ccc { 555 log.Fatalf("%U: CCC of follow-up modifier (%U) was %d; want %d", r, mr, mc.CCC, ccc) 556 } 557 558 // A.2.4: for each trailing r, r in [0x300, 0x311] <=> CCC == Above. 559 if (ccc == above) != (0x300 <= mr && mr <= 0x311) { 560 log.Fatalf("%U: modifier %U in [U+0300, U+0311] != ccc(%U) == 230", r, mr, mr) 561 } 562 563 if i += len(string(mr)); i >= len(d) { 564 break 565 } 566 } 567 } 568 } 569 570 // A.3: no U+0307 in decomposition of Soft-Dotted rune. [ltUpper] 571 if unicode.Is(unicode.Soft_Dotted, r) && strings.Contains(string(d), "\u0307") { 572 log.Fatalf("%U: decomposition of soft-dotted rune may not contain U+0307", r) 573 } 574 575 // A.4: only rune U+0345 may be of CCC Iota_Subscript. [elUpper] 576 if c.CCC == iotaSubscript && r != 0x0345 { 577 log.Fatalf("%U: only rune U+0345 may have CCC Iota_Subscript", r) 578 } 579 580 // A.5: soft-dotted runes do not have exceptions. 581 if c.SoftDotted && c.entry&exceptionBit != 0 { 582 log.Fatalf("%U: soft-dotted has exception", r) 583 } 584 585 // A.6: Greek decomposition. [elUpper] 586 if unicode.Is(unicode.Greek, r) { 587 if b := norm.NFD.PropertiesString(string(r)).Decomposition(); b != nil { 588 runes := []rune(string(b)) 589 // A.6.1: If a Greek rune decomposes and the first rune of the 590 // decomposition is greater than U+00FF, the rune is always 591 // great and not a modifier. 
592 if f := runes[0]; unicode.IsMark(f) || f > 0xFF && !unicode.Is(unicode.Greek, f) { 593 log.Fatalf("%U: expected first rune of Greek decomposition to be letter, found %U", r, f) 594 } 595 // A.6.2: Any follow-up rune in a Greek decomposition is a 596 // modifier of which the first should be gobbled in 597 // decomposition. 598 for _, m := range runes[1:] { 599 switch m { 600 case 0x0313, 0x0314, 0x0301, 0x0300, 0x0306, 0x0342, 0x0308, 0x0304, 0x345: 601 default: 602 log.Fatalf("%U: modifier %U is outside of expected Greek modifier set", r, m) 603 } 604 } 605 } 606 } 607 608 // Breaking properties. 609 610 // B.1: all runes with CCC > 0 are of break type Extend. 611 if c.CCC > 0 && c.BreakType != "Extend" { 612 log.Fatalf("%U: CCC == %d, but got break type %s; want Extend", r, c.CCC, c.BreakType) 613 } 614 615 // B.2: all cased runes with c.CCC == 0 are of break type ALetter. 616 if c.CCC == 0 && c.Cased && c.BreakType != "ALetter" { 617 log.Fatalf("%U: cased, but got break type %s; want ALetter", r, c.BreakType) 618 } 619 620 // B.3: letter category. 621 if c.CCC == 0 && c.BreakCat != breakBreak && !c.CaseIgnorable { 622 if c.BreakCat != breakLetter { 623 log.Fatalf("%U: check for letter break type gave %d; want %d", r, c.BreakCat, breakLetter) 624 } 625 } 626 } 627} 628 629func genTablesTest() { 630 w := &bytes.Buffer{} 631 632 fmt.Fprintln(w, "var (") 633 printProperties(w, "DerivedCoreProperties.txt", "Case_Ignorable", verifyIgnore) 634 635 // We discard the output as we know we have perfect functions. We run them 636 // just to verify the properties are correct. 
637 n := printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Cased", verifyCased) 638 n += printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Lowercase", verifyLower) 639 n += printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Uppercase", verifyUpper) 640 if n > 0 { 641 log.Fatalf("One of the discarded properties does not have a perfect filter.") 642 } 643 644 // <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)? 645 fmt.Fprintln(w, "\tspecial = map[rune]struct{ toLower, toTitle, toUpper string }{") 646 parse("SpecialCasing.txt", func(p *ucd.Parser) { 647 // Skip conditional entries. 648 if p.String(4) != "" { 649 return 650 } 651 r := p.Rune(0) 652 fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n", 653 r, string(p.Runes(1)), string(p.Runes(2)), string(p.Runes(3))) 654 }) 655 fmt.Fprint(w, "\t}\n\n") 656 657 // <code>; <type>; <runes> 658 table := map[rune]struct{ simple, full, special string }{} 659 parse("CaseFolding.txt", func(p *ucd.Parser) { 660 r := p.Rune(0) 661 t := p.String(1) 662 v := string(p.Runes(2)) 663 if t != "T" && v == string(unicode.ToLower(r)) { 664 return 665 } 666 x := table[r] 667 switch t { 668 case "C": 669 x.full = v 670 x.simple = v 671 case "S": 672 x.simple = v 673 case "F": 674 x.full = v 675 case "T": 676 x.special = v 677 } 678 table[r] = x 679 }) 680 fmt.Fprintln(w, "\tfoldMap = map[rune]struct{ simple, full, special string }{") 681 for r := rune(0); r < 0x10FFFF; r++ { 682 x, ok := table[r] 683 if !ok { 684 continue 685 } 686 fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n", r, x.simple, x.full, x.special) 687 } 688 fmt.Fprint(w, "\t}\n\n") 689 690 // Break property 691 notBreak := map[rune]bool{} 692 parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) { 693 switch p.String(1) { 694 case "Extend", "Format", "MidLetter", "MidNumLet", "Single_Quote", 695 "ALetter", "Hebrew_Letter", "Numeric", "ExtendNumLet", "ZWJ": 696 notBreak[p.Rune(0)] = true 697 } 698 }) 699 700 fmt.Fprintln(w, 
"\tbreakProp = []struct{ lo, hi rune }{") 701 inBreak := false 702 for r := rune(0); r <= lastRuneForTesting; r++ { 703 if isBreak := !notBreak[r]; isBreak != inBreak { 704 if isBreak { 705 fmt.Fprintf(w, "\t\t{0x%x, ", r) 706 } else { 707 fmt.Fprintf(w, "0x%x},\n", r-1) 708 } 709 inBreak = isBreak 710 } 711 } 712 if inBreak { 713 fmt.Fprintf(w, "0x%x},\n", lastRuneForTesting) 714 } 715 fmt.Fprint(w, "\t}\n\n") 716 717 // Word break test 718 // Filter out all samples that do not contain cased characters. 719 cased := map[rune]bool{} 720 parse("DerivedCoreProperties.txt", func(p *ucd.Parser) { 721 if p.String(1) == "Cased" { 722 cased[p.Rune(0)] = true 723 } 724 }) 725 726 fmt.Fprintln(w, "\tbreakTest = []string{") 727 parse("auxiliary/WordBreakTest.txt", func(p *ucd.Parser) { 728 c := strings.Split(p.String(0), " ") 729 730 const sep = '|' 731 numCased := 0 732 test := "" 733 for ; len(c) >= 2; c = c[2:] { 734 if c[0] == "÷" && test != "" { 735 test += string(sep) 736 } 737 i, err := strconv.ParseUint(c[1], 16, 32) 738 r := rune(i) 739 if err != nil { 740 log.Fatalf("Invalid rune %q.", c[1]) 741 } 742 if r == sep { 743 log.Fatalf("Separator %q not allowed in test data. Pick another one.", sep) 744 } 745 if cased[r] { 746 numCased++ 747 } 748 test += string(r) 749 } 750 if numCased > 1 { 751 fmt.Fprintf(w, "\t\t%q,\n", test) 752 } 753 }) 754 fmt.Fprintln(w, "\t}") 755 756 fmt.Fprintln(w, ")") 757 758 gen.WriteVersionedGoFile("tables_test.go", "cases", w.Bytes()) 759} 760 761// These functions are just used for verification that their definition have not 762// changed in the Unicode Standard. 
763 764func verifyCased(r rune) bool { 765 return verifyLower(r) || verifyUpper(r) || unicode.IsTitle(r) 766} 767 768func verifyLower(r rune) bool { 769 return unicode.IsLower(r) || unicode.Is(unicode.Other_Lowercase, r) 770} 771 772func verifyUpper(r rune) bool { 773 return unicode.IsUpper(r) || unicode.Is(unicode.Other_Uppercase, r) 774} 775 776// verifyIgnore is an approximation of the Case_Ignorable property using the 777// core unicode package. It is used to reduce the size of the test data. 778func verifyIgnore(r rune) bool { 779 props := []*unicode.RangeTable{ 780 unicode.Mn, 781 unicode.Me, 782 unicode.Cf, 783 unicode.Lm, 784 unicode.Sk, 785 } 786 for _, p := range props { 787 if unicode.Is(p, r) { 788 return true 789 } 790 } 791 return false 792} 793 794// printProperties prints tables of rune properties from the given UCD file. 795// A filter func f can be given to exclude certain values. A rune r will have 796// the indicated property if it is in the generated table or if f(r). 797func printProperties(w io.Writer, file, property string, f func(r rune) bool) int { 798 verify := map[rune]bool{} 799 n := 0 800 varNameParts := strings.Split(property, "_") 801 varNameParts[0] = strings.ToLower(varNameParts[0]) 802 fmt.Fprintf(w, "\t%s = map[rune]bool{\n", strings.Join(varNameParts, "")) 803 parse(file, func(p *ucd.Parser) { 804 if p.String(1) == property { 805 r := p.Rune(0) 806 verify[r] = true 807 if !f(r) { 808 n++ 809 fmt.Fprintf(w, "\t\t0x%.4x: true,\n", r) 810 } 811 } 812 }) 813 fmt.Fprint(w, "\t}\n\n") 814 815 // Verify that f is correct, that is, it represents a subset of the property. 816 for r := rune(0); r <= lastRuneForTesting; r++ { 817 if !verify[r] && f(r) { 818 log.Fatalf("Incorrect filter func for property %q.", property) 819 } 820 } 821 return n 822} 823 824// The newCaseTrie, sparseValues and sparseOffsets definitions below are 825// placeholders referred to by gen_trieval.go. 
// The real definitions are
// generated by this program and written to tables.go.

// newCaseTrie is a stub that ignores its argument and returns 0.
func newCaseTrie(int) int { return 0 }

// Zero-length stand-ins for the generated sparse tables.
var (
	sparseValues  [0]valueRange
	sparseOffsets [0]uint16
)