1// Copyright 2014 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5//go:build ignore 6// +build ignore 7 8// This program generates the trie for casing operations. The Unicode casing 9// algorithm requires the lookup of various properties and mappings for each 10// rune. The table generated by this generator combines several of the most 11// frequently used of these into a single trie so that they can be accessed 12// with a single lookup. 13package main 14 15import ( 16 "bytes" 17 "fmt" 18 "io" 19 "io/ioutil" 20 "log" 21 "reflect" 22 "strconv" 23 "strings" 24 "unicode" 25 26 "golang.org/x/text/internal/gen" 27 "golang.org/x/text/internal/triegen" 28 "golang.org/x/text/internal/ucd" 29 "golang.org/x/text/unicode/norm" 30) 31 32func main() { 33 gen.Init() 34 genTables() 35 genTablesTest() 36 gen.Repackage("gen_trieval.go", "trieval.go", "cases") 37} 38 39// runeInfo contains all information for a rune that we care about for casing 40// operations. 41type runeInfo struct { 42 Rune rune 43 44 entry info // trie value for this rune. 45 46 CaseMode info 47 48 // Simple case mappings. 49 Simple [1 + maxCaseMode][]rune 50 51 // Special casing 52 HasSpecial bool 53 Conditional bool 54 Special [1 + maxCaseMode][]rune 55 56 // Folding 57 FoldSimple rune 58 FoldSpecial rune 59 FoldFull []rune 60 61 // TODO: FC_NFKC, or equivalent data. 62 63 // Properties 64 SoftDotted bool 65 CaseIgnorable bool 66 Cased bool 67 DecomposeGreek bool 68 BreakType string 69 BreakCat breakCategory 70 71 // We care mostly about 0, Above, and IotaSubscript. 72 CCC byte 73} 74 75type breakCategory int 76 77const ( 78 breakBreak breakCategory = iota 79 breakLetter 80 breakMid 81) 82 83// mapping returns the case mapping for the given case type. 84func (r *runeInfo) mapping(c info) string { 85 if r.HasSpecial { 86 return string(r.Special[c]) 87 } 88 if len(r.Simple[c]) != 0 { 89 return string(r.Simple[c]) 90 } 91 return string(r.Rune) 92} 93 94func parse(file string, f func(p *ucd.Parser)) { 95 ucd.Parse(gen.OpenUCDFile(file), f) 96} 97 98func parseUCD() []runeInfo { 99 chars := make([]runeInfo, unicode.MaxRune) 100 101 get := func(r rune) *runeInfo { 102 c := &chars[r] 103 c.Rune = r 104 return c 105 } 106 107 parse("UnicodeData.txt", func(p *ucd.Parser) { 108 ri := get(p.Rune(0)) 109 ri.CCC = byte(p.Int(ucd.CanonicalCombiningClass)) 110 ri.Simple[cLower] = p.Runes(ucd.SimpleLowercaseMapping) 111 ri.Simple[cUpper] = p.Runes(ucd.SimpleUppercaseMapping) 112 ri.Simple[cTitle] = p.Runes(ucd.SimpleTitlecaseMapping) 113 if p.String(ucd.GeneralCategory) == "Lt" { 114 ri.CaseMode = cTitle 115 } 116 }) 117 118 // <code>; <property> 119 parse("PropList.txt", func(p *ucd.Parser) { 120 if p.String(1) == "Soft_Dotted" { 121 chars[p.Rune(0)].SoftDotted = true 122 } 123 }) 124 125 // <code>; <word break type> 126 parse("DerivedCoreProperties.txt", func(p *ucd.Parser) { 127 ri := get(p.Rune(0)) 128 switch p.String(1) { 129 case "Case_Ignorable": 130 ri.CaseIgnorable = true 131 case "Cased": 132 ri.Cased = true 133 case "Lowercase": 134 ri.CaseMode = cLower 135 case "Uppercase": 136 ri.CaseMode = cUpper 137 } 138 }) 139 140 // <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)? 141 parse("SpecialCasing.txt", func(p *ucd.Parser) { 142 // We drop all conditional special casing and deal with them manually in 143 // the language-specific case mappers. Rune 0x03A3 is the only one with 144 // a conditional formatting that is not language-specific. However, 145 // dealing with this letter is tricky, especially in a streaming 146 // context, so we deal with it in the Caser for Greek specifically. 147 ri := get(p.Rune(0)) 148 if p.String(4) == "" { 149 ri.HasSpecial = true 150 ri.Special[cLower] = p.Runes(1) 151 ri.Special[cTitle] = p.Runes(2) 152 ri.Special[cUpper] = p.Runes(3) 153 } else { 154 ri.Conditional = true 155 } 156 }) 157 158 // TODO: Use text breaking according to UAX #29. 159 // <code>; <word break type> 160 parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) { 161 ri := get(p.Rune(0)) 162 ri.BreakType = p.String(1) 163 164 // We collapse the word breaking properties onto the categories we need. 165 switch p.String(1) { // TODO: officially we need to canonicalize. 166 case "MidLetter", "MidNumLet", "Single_Quote": 167 ri.BreakCat = breakMid 168 if !ri.CaseIgnorable { 169 // finalSigma relies on the fact that all breakMid runes are 170 // also a Case_Ignorable. Revisit this code when this changes. 171 log.Fatalf("Rune %U, which has a break category mid, is not a case ignorable", ri) 172 } 173 case "ALetter", "Hebrew_Letter", "Numeric", "Extend", "ExtendNumLet", "Format", "ZWJ": 174 ri.BreakCat = breakLetter 175 } 176 }) 177 178 // <code>; <type>; <mapping> 179 parse("CaseFolding.txt", func(p *ucd.Parser) { 180 ri := get(p.Rune(0)) 181 switch p.String(1) { 182 case "C": 183 ri.FoldSimple = p.Rune(2) 184 ri.FoldFull = p.Runes(2) 185 case "S": 186 ri.FoldSimple = p.Rune(2) 187 case "T": 188 ri.FoldSpecial = p.Rune(2) 189 case "F": 190 ri.FoldFull = p.Runes(2) 191 default: 192 log.Fatalf("%U: unknown type: %s", p.Rune(0), p.String(1)) 193 } 194 }) 195 196 return chars 197} 198 199func genTables() { 200 chars := parseUCD() 201 verifyProperties(chars) 202 203 t := triegen.NewTrie("case") 204 for i := range chars { 205 c := &chars[i] 206 makeEntry(c) 207 t.Insert(rune(i), uint64(c.entry)) 208 } 209 210 w := gen.NewCodeWriter() 211 defer w.WriteVersionedGoFile("tables.go", "cases") 212 213 gen.WriteUnicodeVersion(w) 214 215 // TODO: write CLDR version after adding a mechanism to detect that the 216 // tables on which the manually created locale-sensitive casing code is 217 // based hasn't changed. 218 219 w.WriteVar("xorData", string(xorData)) 220 w.WriteVar("exceptions", string(exceptionData)) 221 222 sz, err := t.Gen(w, triegen.Compact(&sparseCompacter{})) 223 if err != nil { 224 log.Fatal(err) 225 } 226 w.Size += sz 227} 228 229func makeEntry(ri *runeInfo) { 230 if ri.CaseIgnorable { 231 if ri.Cased { 232 ri.entry = cIgnorableCased 233 } else { 234 ri.entry = cIgnorableUncased 235 } 236 } else { 237 ri.entry = ri.CaseMode 238 } 239 240 // TODO: handle soft-dotted. 241 242 ccc := cccOther 243 switch ri.CCC { 244 case 0: // Not_Reordered 245 ccc = cccZero 246 case above: // Above 247 ccc = cccAbove 248 } 249 switch ri.BreakCat { 250 case breakBreak: 251 ccc = cccBreak 252 case breakMid: 253 ri.entry |= isMidBit 254 } 255 256 ri.entry |= ccc 257 258 if ri.CaseMode == cUncased { 259 return 260 } 261 262 // Need to do something special. 263 if ri.CaseMode == cTitle || ri.HasSpecial || ri.mapping(cTitle) != ri.mapping(cUpper) { 264 makeException(ri) 265 return 266 } 267 if f := string(ri.FoldFull); len(f) > 0 && f != ri.mapping(cUpper) && f != ri.mapping(cLower) { 268 makeException(ri) 269 return 270 } 271 272 // Rune is either lowercase or uppercase. 273 274 orig := string(ri.Rune) 275 mapped := "" 276 if ri.CaseMode == cUpper { 277 mapped = ri.mapping(cLower) 278 } else { 279 mapped = ri.mapping(cUpper) 280 } 281 282 if len(orig) != len(mapped) { 283 makeException(ri) 284 return 285 } 286 287 if string(ri.FoldFull) == ri.mapping(cUpper) { 288 ri.entry |= inverseFoldBit 289 } 290 291 n := len(orig) 292 293 // Create per-byte XOR mask. 294 var b []byte 295 for i := 0; i < n; i++ { 296 b = append(b, orig[i]^mapped[i]) 297 } 298 299 // Remove leading 0 bytes, but keep at least one byte. 300 for ; len(b) > 1 && b[0] == 0; b = b[1:] { 301 } 302 303 if len(b) == 1 && b[0]&0xc0 == 0 { 304 ri.entry |= info(b[0]) << xorShift 305 return 306 } 307 308 key := string(b) 309 x, ok := xorCache[key] 310 if !ok { 311 xorData = append(xorData, 0) // for detecting start of sequence 312 xorData = append(xorData, b...) 313 314 x = len(xorData) - 1 315 xorCache[key] = x 316 } 317 ri.entry |= info(x<<xorShift) | xorIndexBit 318} 319 320var xorCache = map[string]int{} 321 322// xorData contains byte-wise XOR data for the least significant bytes of a 323// UTF-8 encoded rune. An index points to the last byte. The sequence starts 324// with a zero terminator. 325var xorData = []byte{} 326 327// See the comments in gen_trieval.go re "the exceptions slice". 328var exceptionData = []byte{0} 329 330// makeException encodes case mappings that cannot be expressed in a simple 331// XOR diff. 332func makeException(ri *runeInfo) { 333 ccc := ri.entry & cccMask 334 // Set exception bit and retain case type. 335 ri.entry &= 0x0007 336 ri.entry |= exceptionBit 337 338 if len(exceptionData) >= 1<<numExceptionBits { 339 log.Fatalf("%U:exceptionData too large %#x > %d bits", ri.Rune, len(exceptionData), numExceptionBits) 340 } 341 342 // Set the offset in the exceptionData array. 343 ri.entry |= info(len(exceptionData) << exceptionShift) 344 345 orig := string(ri.Rune) 346 tc := ri.mapping(cTitle) 347 uc := ri.mapping(cUpper) 348 lc := ri.mapping(cLower) 349 ff := string(ri.FoldFull) 350 351 // addString sets the length of a string and adds it to the expansions array. 352 addString := func(s string, b *byte) { 353 if len(s) == 0 { 354 // Zero-length mappings exist, but only for conditional casing, 355 // which we are representing outside of this table. 356 log.Fatalf("%U: has zero-length mapping.", ri.Rune) 357 } 358 *b <<= 3 359 if s != orig || ri.CaseMode == cLower { 360 n := len(s) 361 if n > 7 { 362 log.Fatalf("%U: mapping larger than 7 (%d)", ri.Rune, n) 363 } 364 *b |= byte(n) 365 exceptionData = append(exceptionData, s...) 366 } 367 } 368 369 // byte 0: 370 exceptionData = append(exceptionData, byte(ccc)|byte(len(ff))) 371 372 // byte 1: 373 p := len(exceptionData) 374 exceptionData = append(exceptionData, 0) 375 376 if len(ff) > 7 { // May be zero-length. 377 log.Fatalf("%U: fold string larger than 7 (%d)", ri.Rune, len(ff)) 378 } 379 exceptionData = append(exceptionData, ff...) 380 ct := ri.CaseMode 381 if ct != cLower { 382 addString(lc, &exceptionData[p]) 383 } 384 if ct != cUpper { 385 addString(uc, &exceptionData[p]) 386 } 387 if ct != cTitle { 388 addString(tc, &exceptionData[p]) 389 } 390} 391 392// sparseCompacter is a trie value block Compacter. There are many cases where 393// successive runes alternate between lower- and upper-case. This Compacter 394// exploits this by adding a special case type where the case value is obtained 395// from or-ing it with the least-significant bit of the rune, creating large 396// ranges of equal case values that compress well. 397type sparseCompacter struct { 398 sparseBlocks [][]uint16 399 sparseOffsets []uint16 400 sparseCount int 401} 402 403// makeSparse returns the number of elements that compact block would contain 404// as well as the modified values. 405func makeSparse(vals []uint64) ([]uint16, int) { 406 // Copy the values. 407 values := make([]uint16, len(vals)) 408 for i, v := range vals { 409 values[i] = uint16(v) 410 } 411 412 alt := func(i int, v uint16) uint16 { 413 if cm := info(v & fullCasedMask); cm == cUpper || cm == cLower { 414 // Convert cLower or cUpper to cXORCase value, which has the form 11x. 415 xor := v 416 xor &^= 1 417 xor |= uint16(i&1) ^ (v & 1) 418 xor |= 0x4 419 return xor 420 } 421 return v 422 } 423 424 var count int 425 var previous uint16 426 for i, v := range values { 427 if v != 0 { 428 // Try if the unmodified value is equal to the previous. 429 if v == previous { 430 continue 431 } 432 433 // Try if the xor-ed value is equal to the previous value. 434 a := alt(i, v) 435 if a == previous { 436 values[i] = a 437 continue 438 } 439 440 // This is a new value. 441 count++ 442 443 // Use the xor-ed value if it will be identical to the next value. 444 if p := i + 1; p < len(values) && alt(p, values[p]) == a { 445 values[i] = a 446 v = a 447 } 448 } 449 previous = v 450 } 451 return values, count 452} 453 454func (s *sparseCompacter) Size(v []uint64) (int, bool) { 455 _, n := makeSparse(v) 456 457 // We limit using this method to having 16 entries. 458 if n > 16 { 459 return 0, false 460 } 461 462 return 2 + int(reflect.TypeOf(valueRange{}).Size())*n, true 463} 464 465func (s *sparseCompacter) Store(v []uint64) uint32 { 466 h := uint32(len(s.sparseOffsets)) 467 values, sz := makeSparse(v) 468 s.sparseBlocks = append(s.sparseBlocks, values) 469 s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount)) 470 s.sparseCount += sz 471 return h 472} 473 474func (s *sparseCompacter) Handler() string { 475 // The sparse global variable and its lookup method is defined in gen_trieval.go. 476 return "sparse.lookup" 477} 478 479func (s *sparseCompacter) Print(w io.Writer) (retErr error) { 480 p := func(format string, args ...interface{}) { 481 _, err := fmt.Fprintf(w, format, args...) 482 if retErr == nil && err != nil { 483 retErr = err 484 } 485 } 486 487 ls := len(s.sparseBlocks) 488 if ls == len(s.sparseOffsets) { 489 s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount)) 490 } 491 p("// sparseOffsets: %d entries, %d bytes\n", ls+1, (ls+1)*2) 492 p("var sparseOffsets = %#v\n\n", s.sparseOffsets) 493 494 ns := s.sparseCount 495 p("// sparseValues: %d entries, %d bytes\n", ns, ns*4) 496 p("var sparseValues = [%d]valueRange {", ns) 497 for i, values := range s.sparseBlocks { 498 p("\n// Block %#x, offset %#x", i, s.sparseOffsets[i]) 499 var v uint16 500 for i, nv := range values { 501 if nv != v { 502 if v != 0 { 503 p(",hi:%#02x},", 0x80+i-1) 504 } 505 if nv != 0 { 506 p("\n{value:%#04x,lo:%#02x", nv, 0x80+i) 507 } 508 } 509 v = nv 510 } 511 if v != 0 { 512 p(",hi:%#02x},", 0x80+len(values)-1) 513 } 514 } 515 p("\n}\n\n") 516 return 517} 518 519// verifyProperties that properties of the runes that are relied upon in the 520// implementation. Each property is marked with an identifier that is referred 521// to in the places where it is used. 522func verifyProperties(chars []runeInfo) { 523 for i, c := range chars { 524 r := rune(i) 525 526 // Rune properties. 527 528 // A.1: modifier never changes on lowercase. [ltLower] 529 if c.CCC > 0 && unicode.ToLower(r) != r { 530 log.Fatalf("%U: non-starter changes when lowercased", r) 531 } 532 533 // A.2: properties of decompositions starting with I or J. [ltLower] 534 d := norm.NFD.PropertiesString(string(r)).Decomposition() 535 if len(d) > 0 { 536 if d[0] == 'I' || d[0] == 'J' { 537 // A.2.1: we expect at least an ASCII character and a modifier. 538 if len(d) < 3 { 539 log.Fatalf("%U: length of decomposition was %d; want >= 3", r, len(d)) 540 } 541 542 // All subsequent runes are modifiers and all have the same CCC. 543 runes := []rune(string(d[1:])) 544 ccc := chars[runes[0]].CCC 545 546 for _, mr := range runes[1:] { 547 mc := chars[mr] 548 549 // A.2.2: all modifiers have a CCC of Above or less. 550 if ccc == 0 || ccc > above { 551 log.Fatalf("%U: CCC of successive rune (%U) was %d; want (0,230]", r, mr, ccc) 552 } 553 554 // A.2.3: a sequence of modifiers all have the same CCC. 555 if mc.CCC != ccc { 556 log.Fatalf("%U: CCC of follow-up modifier (%U) was %d; want %d", r, mr, mc.CCC, ccc) 557 } 558 559 // A.2.4: for each trailing r, r in [0x300, 0x311] <=> CCC == Above. 560 if (ccc == above) != (0x300 <= mr && mr <= 0x311) { 561 log.Fatalf("%U: modifier %U in [U+0300, U+0311] != ccc(%U) == 230", r, mr, mr) 562 } 563 564 if i += len(string(mr)); i >= len(d) { 565 break 566 } 567 } 568 } 569 } 570 571 // A.3: no U+0307 in decomposition of Soft-Dotted rune. [ltUpper] 572 if unicode.Is(unicode.Soft_Dotted, r) && strings.Contains(string(d), "\u0307") { 573 log.Fatalf("%U: decomposition of soft-dotted rune may not contain U+0307", r) 574 } 575 576 // A.4: only rune U+0345 may be of CCC Iota_Subscript. [elUpper] 577 if c.CCC == iotaSubscript && r != 0x0345 { 578 log.Fatalf("%U: only rune U+0345 may have CCC Iota_Subscript", r) 579 } 580 581 // A.5: soft-dotted runes do not have exceptions. 582 if c.SoftDotted && c.entry&exceptionBit != 0 { 583 log.Fatalf("%U: soft-dotted has exception", r) 584 } 585 586 // A.6: Greek decomposition. [elUpper] 587 if unicode.Is(unicode.Greek, r) { 588 if b := norm.NFD.PropertiesString(string(r)).Decomposition(); b != nil { 589 runes := []rune(string(b)) 590 // A.6.1: If a Greek rune decomposes and the first rune of the 591 // decomposition is greater than U+00FF, the rune is always 592 // great and not a modifier. 593 if f := runes[0]; unicode.IsMark(f) || f > 0xFF && !unicode.Is(unicode.Greek, f) { 594 log.Fatalf("%U: expected first rune of Greek decomposition to be letter, found %U", r, f) 595 } 596 // A.6.2: Any follow-up rune in a Greek decomposition is a 597 // modifier of which the first should be gobbled in 598 // decomposition. 599 for _, m := range runes[1:] { 600 switch m { 601 case 0x0313, 0x0314, 0x0301, 0x0300, 0x0306, 0x0342, 0x0308, 0x0304, 0x345: 602 default: 603 log.Fatalf("%U: modifier %U is outside of expected Greek modifier set", r, m) 604 } 605 } 606 } 607 } 608 609 // Breaking properties. 610 611 // B.1: all runes with CCC > 0 are of break type Extend. 612 if c.CCC > 0 && c.BreakType != "Extend" { 613 log.Fatalf("%U: CCC == %d, but got break type %s; want Extend", r, c.CCC, c.BreakType) 614 } 615 616 // B.2: all cased runes with c.CCC == 0 are of break type ALetter. 617 if c.CCC == 0 && c.Cased && c.BreakType != "ALetter" { 618 log.Fatalf("%U: cased, but got break type %s; want ALetter", r, c.BreakType) 619 } 620 621 // B.3: letter category. 622 if c.CCC == 0 && c.BreakCat != breakBreak && !c.CaseIgnorable { 623 if c.BreakCat != breakLetter { 624 log.Fatalf("%U: check for letter break type gave %d; want %d", r, c.BreakCat, breakLetter) 625 } 626 } 627 } 628} 629 630func genTablesTest() { 631 w := &bytes.Buffer{} 632 633 fmt.Fprintln(w, "var (") 634 printProperties(w, "DerivedCoreProperties.txt", "Case_Ignorable", verifyIgnore) 635 636 // We discard the output as we know we have perfect functions. We run them 637 // just to verify the properties are correct. 638 n := printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Cased", verifyCased) 639 n += printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Lowercase", verifyLower) 640 n += printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Uppercase", verifyUpper) 641 if n > 0 { 642 log.Fatalf("One of the discarded properties does not have a perfect filter.") 643 } 644 645 // <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)? 646 fmt.Fprintln(w, "\tspecial = map[rune]struct{ toLower, toTitle, toUpper string }{") 647 parse("SpecialCasing.txt", func(p *ucd.Parser) { 648 // Skip conditional entries. 649 if p.String(4) != "" { 650 return 651 } 652 r := p.Rune(0) 653 fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n", 654 r, string(p.Runes(1)), string(p.Runes(2)), string(p.Runes(3))) 655 }) 656 fmt.Fprint(w, "\t}\n\n") 657 658 // <code>; <type>; <runes> 659 table := map[rune]struct{ simple, full, special string }{} 660 parse("CaseFolding.txt", func(p *ucd.Parser) { 661 r := p.Rune(0) 662 t := p.String(1) 663 v := string(p.Runes(2)) 664 if t != "T" && v == string(unicode.ToLower(r)) { 665 return 666 } 667 x := table[r] 668 switch t { 669 case "C": 670 x.full = v 671 x.simple = v 672 case "S": 673 x.simple = v 674 case "F": 675 x.full = v 676 case "T": 677 x.special = v 678 } 679 table[r] = x 680 }) 681 fmt.Fprintln(w, "\tfoldMap = map[rune]struct{ simple, full, special string }{") 682 for r := rune(0); r < 0x10FFFF; r++ { 683 x, ok := table[r] 684 if !ok { 685 continue 686 } 687 fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n", r, x.simple, x.full, x.special) 688 } 689 fmt.Fprint(w, "\t}\n\n") 690 691 // Break property 692 notBreak := map[rune]bool{} 693 parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) { 694 switch p.String(1) { 695 case "Extend", "Format", "MidLetter", "MidNumLet", "Single_Quote", 696 "ALetter", "Hebrew_Letter", "Numeric", "ExtendNumLet", "ZWJ": 697 notBreak[p.Rune(0)] = true 698 } 699 }) 700 701 fmt.Fprintln(w, "\tbreakProp = []struct{ lo, hi rune }{") 702 inBreak := false 703 for r := rune(0); r <= lastRuneForTesting; r++ { 704 if isBreak := !notBreak[r]; isBreak != inBreak { 705 if isBreak { 706 fmt.Fprintf(w, "\t\t{0x%x, ", r) 707 } else { 708 fmt.Fprintf(w, "0x%x},\n", r-1) 709 } 710 inBreak = isBreak 711 } 712 } 713 if inBreak { 714 fmt.Fprintf(w, "0x%x},\n", lastRuneForTesting) 715 } 716 fmt.Fprint(w, "\t}\n\n") 717 718 // Word break test 719 // Filter out all samples that do not contain cased characters. 720 cased := map[rune]bool{} 721 parse("DerivedCoreProperties.txt", func(p *ucd.Parser) { 722 if p.String(1) == "Cased" { 723 cased[p.Rune(0)] = true 724 } 725 }) 726 727 fmt.Fprintln(w, "\tbreakTest = []string{") 728 parse("auxiliary/WordBreakTest.txt", func(p *ucd.Parser) { 729 c := strings.Split(p.String(0), " ") 730 731 const sep = '|' 732 numCased := 0 733 test := "" 734 for ; len(c) >= 2; c = c[2:] { 735 if c[0] == "÷" && test != "" { 736 test += string(sep) 737 } 738 i, err := strconv.ParseUint(c[1], 16, 32) 739 r := rune(i) 740 if err != nil { 741 log.Fatalf("Invalid rune %q.", c[1]) 742 } 743 if r == sep { 744 log.Fatalf("Separator %q not allowed in test data. Pick another one.", sep) 745 } 746 if cased[r] { 747 numCased++ 748 } 749 test += string(r) 750 } 751 if numCased > 1 { 752 fmt.Fprintf(w, "\t\t%q,\n", test) 753 } 754 }) 755 fmt.Fprintln(w, "\t}") 756 757 fmt.Fprintln(w, ")") 758 759 gen.WriteVersionedGoFile("tables_test.go", "cases", w.Bytes()) 760} 761 762// These functions are just used for verification that their definition have not 763// changed in the Unicode Standard. 764 765func verifyCased(r rune) bool { 766 return verifyLower(r) || verifyUpper(r) || unicode.IsTitle(r) 767} 768 769func verifyLower(r rune) bool { 770 return unicode.IsLower(r) || unicode.Is(unicode.Other_Lowercase, r) 771} 772 773func verifyUpper(r rune) bool { 774 return unicode.IsUpper(r) || unicode.Is(unicode.Other_Uppercase, r) 775} 776 777// verifyIgnore is an approximation of the Case_Ignorable property using the 778// core unicode package. It is used to reduce the size of the test data. 779func verifyIgnore(r rune) bool { 780 props := []*unicode.RangeTable{ 781 unicode.Mn, 782 unicode.Me, 783 unicode.Cf, 784 unicode.Lm, 785 unicode.Sk, 786 } 787 for _, p := range props { 788 if unicode.Is(p, r) { 789 return true 790 } 791 } 792 return false 793} 794 795// printProperties prints tables of rune properties from the given UCD file. 796// A filter func f can be given to exclude certain values. A rune r will have 797// the indicated property if it is in the generated table or if f(r). 798func printProperties(w io.Writer, file, property string, f func(r rune) bool) int { 799 verify := map[rune]bool{} 800 n := 0 801 varNameParts := strings.Split(property, "_") 802 varNameParts[0] = strings.ToLower(varNameParts[0]) 803 fmt.Fprintf(w, "\t%s = map[rune]bool{\n", strings.Join(varNameParts, "")) 804 parse(file, func(p *ucd.Parser) { 805 if p.String(1) == property { 806 r := p.Rune(0) 807 verify[r] = true 808 if !f(r) { 809 n++ 810 fmt.Fprintf(w, "\t\t0x%.4x: true,\n", r) 811 } 812 } 813 }) 814 fmt.Fprint(w, "\t}\n\n") 815 816 // Verify that f is correct, that is, it represents a subset of the property. 817 for r := rune(0); r <= lastRuneForTesting; r++ { 818 if !verify[r] && f(r) { 819 log.Fatalf("Incorrect filter func for property %q.", property) 820 } 821 } 822 return n 823} 824 825// The newCaseTrie, sparseValues and sparseOffsets definitions below are 826// placeholders referred to by gen_trieval.go. The real definitions are 827// generated by this program and written to tables.go. 828 829func newCaseTrie(int) int { return 0 } 830 831var ( 832 sparseValues [0]valueRange 833 sparseOffsets [0]uint16 834) 835