1// Copyright 2009 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// +build ignore 6 7// Unicode table generator. 8// Data read from the web. 9 10package main 11 12import ( 13 "flag" 14 "fmt" 15 "log" 16 "os" 17 "regexp" 18 "sort" 19 "strings" 20 "unicode" 21 22 "golang.org/x/text/internal/gen" 23 "golang.org/x/text/internal/ucd" 24 "golang.org/x/text/unicode/rangetable" 25) 26 27func main() { 28 flag.Parse() 29 setupOutput() 30 loadChars() // always needed 31 loadCasefold() 32 printCategories() 33 printScriptOrProperty(false) 34 printScriptOrProperty(true) 35 printCases() 36 printLatinProperties() 37 printCasefold() 38 printSizes() 39 flushOutput() 40} 41 42func defaultVersion() string { 43 if v := os.Getenv("UNICODE_VERSION"); v != "" { 44 return v 45 } 46 return unicode.Version 47} 48 49var tablelist = flag.String("tables", 50 "all", 51 "comma-separated list of which tables to generate; can be letter") 52var scriptlist = flag.String("scripts", 53 "all", 54 "comma-separated list of which script tables to generate") 55var proplist = flag.String("props", 56 "all", 57 "comma-separated list of which property tables to generate") 58var cases = flag.Bool("cases", 59 true, 60 "generate case tables") 61var test = flag.Bool("test", 62 false, 63 "test existing tables; can be used to compare web data with package data") 64 65var scriptRe = regexp.MustCompile(`^([0-9A-F]+)(\.\.[0-9A-F]+)? *; ([A-Za-z_]+)$`) 66var logger = log.New(os.Stderr, "", log.Lshortfile) 67 68var output *gen.CodeWriter 69 70func setupOutput() { 71 output = gen.NewCodeWriter() 72} 73 74func flushOutput() { 75 output.WriteGoFile("tables.go", "unicode") 76} 77 78func printf(format string, args ...interface{}) { 79 fmt.Fprintf(output, format, args...) 80} 81 82func print(args ...interface{}) { 83 fmt.Fprint(output, args...) 84} 85 86func println(args ...interface{}) { 87 fmt.Fprintln(output, args...) 88} 89 90var category = map[string]bool{ 91 // Nd Lu etc. 92 // We use one-character names to identify merged categories 93 "L": true, // Lu Ll Lt Lm Lo 94 "P": true, // Pc Pd Ps Pe Pu Pf Po 95 "M": true, // Mn Mc Me 96 "N": true, // Nd Nl No 97 "S": true, // Sm Sc Sk So 98 "Z": true, // Zs Zl Zp 99 "C": true, // Cc Cf Cs Co Cn 100} 101 102// This contains only the properties we're interested in. 103type Char struct { 104 codePoint rune // if zero, this index is not a valid code point. 105 category string 106 upperCase rune 107 lowerCase rune 108 titleCase rune 109 foldCase rune // simple case folding 110 caseOrbit rune // next in simple case folding orbit 111} 112 113const MaxChar = 0x10FFFF 114 115var chars = make([]Char, MaxChar+1) 116var scripts = make(map[string][]rune) 117var props = make(map[string][]rune) // a property looks like a script; can share the format 118 119func allCategories() []string { 120 a := make([]string, 0, len(category)) 121 for k := range category { 122 a = append(a, k) 123 } 124 sort.Strings(a) 125 return a 126} 127 128func all(scripts map[string][]rune) []string { 129 a := make([]string, 0, len(scripts)) 130 for k := range scripts { 131 a = append(a, k) 132 } 133 sort.Strings(a) 134 return a 135} 136 137func allCatFold(m map[string]map[rune]bool) []string { 138 a := make([]string, 0, len(m)) 139 for k := range m { 140 a = append(a, k) 141 } 142 sort.Strings(a) 143 return a 144} 145 146func categoryOp(code rune, class uint8) bool { 147 category := chars[code].category 148 return len(category) > 0 && category[0] == class 149} 150 151func loadChars() { 152 ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) { 153 c := Char{codePoint: p.Rune(0)} 154 155 getRune := func(field int) rune { 156 if p.String(field) == "" { 157 return 0 158 } 159 return p.Rune(field) 160 } 161 162 c.category = p.String(ucd.GeneralCategory) 163 category[c.category] = true 164 switch c.category { 165 case "Nd": 166 // Decimal digit 167 p.Int(ucd.NumericValue) 168 case "Lu": 169 c.upperCase = getRune(ucd.CodePoint) 170 c.lowerCase = getRune(ucd.SimpleLowercaseMapping) 171 c.titleCase = getRune(ucd.SimpleTitlecaseMapping) 172 case "Ll": 173 c.upperCase = getRune(ucd.SimpleUppercaseMapping) 174 c.lowerCase = getRune(ucd.CodePoint) 175 c.titleCase = getRune(ucd.SimpleTitlecaseMapping) 176 case "Lt": 177 c.upperCase = getRune(ucd.SimpleUppercaseMapping) 178 c.lowerCase = getRune(ucd.SimpleLowercaseMapping) 179 c.titleCase = getRune(ucd.CodePoint) 180 default: 181 c.upperCase = getRune(ucd.SimpleUppercaseMapping) 182 c.lowerCase = getRune(ucd.SimpleLowercaseMapping) 183 c.titleCase = getRune(ucd.SimpleTitlecaseMapping) 184 } 185 186 chars[c.codePoint] = c 187 }) 188} 189 190func loadCasefold() { 191 ucd.Parse(gen.OpenUCDFile("CaseFolding.txt"), func(p *ucd.Parser) { 192 kind := p.String(1) 193 if kind != "C" && kind != "S" { 194 // Only care about 'common' and 'simple' foldings. 195 return 196 } 197 p1 := p.Rune(0) 198 p2 := p.Rune(2) 199 chars[p1].foldCase = rune(p2) 200 }) 201} 202 203var categoryMapping = map[string]string{ 204 "Lu": "Letter, uppercase", 205 "Ll": "Letter, lowercase", 206 "Lt": "Letter, titlecase", 207 "Lm": "Letter, modifier", 208 "Lo": "Letter, other", 209 "Mn": "Mark, nonspacing", 210 "Mc": "Mark, spacing combining", 211 "Me": "Mark, enclosing", 212 "Nd": "Number, decimal digit", 213 "Nl": "Number, letter", 214 "No": "Number, other", 215 "Pc": "Punctuation, connector", 216 "Pd": "Punctuation, dash", 217 "Ps": "Punctuation, open", 218 "Pe": "Punctuation, close", 219 "Pi": "Punctuation, initial quote", 220 "Pf": "Punctuation, final quote", 221 "Po": "Punctuation, other", 222 "Sm": "Symbol, math", 223 "Sc": "Symbol, currency", 224 "Sk": "Symbol, modifier", 225 "So": "Symbol, other", 226 "Zs": "Separator, space", 227 "Zl": "Separator, line", 228 "Zp": "Separator, paragraph", 229 "Cc": "Other, control", 230 "Cf": "Other, format", 231 "Cs": "Other, surrogate", 232 "Co": "Other, private use", 233 "Cn": "Other, not assigned", 234} 235 236func printCategories() { 237 if *tablelist == "" { 238 return 239 } 240 // Find out which categories to dump 241 list := strings.Split(*tablelist, ",") 242 if *tablelist == "all" { 243 list = allCategories() 244 } 245 if *test { 246 fullCategoryTest(list) 247 return 248 } 249 250 println("// Version is the Unicode edition from which the tables are derived.") 251 printf("const Version = %q\n\n", gen.UnicodeVersion()) 252 253 if *tablelist == "all" { 254 println("// Categories is the set of Unicode category tables.") 255 println("var Categories = map[string] *RangeTable {") 256 for _, k := range allCategories() { 257 printf("\t%q: %s,\n", k, k) 258 } 259 print("}\n\n") 260 } 261 262 decl := make(sort.StringSlice, len(list)) 263 ndecl := 0 264 for _, name := range list { 265 if _, ok := category[name]; !ok { 266 logger.Fatal("unknown category", name) 267 } 268 // We generate an UpperCase name to serve as concise documentation and an _UnderScored 269 // name to store the data. This stops godoc dumping all the tables but keeps them 270 // available to clients. 271 // Cases deserving special comments 272 varDecl := "" 273 switch name { 274 case "C": 275 varDecl = "\tOther = _C; // Other/C is the set of Unicode control and special characters, category C.\n" 276 varDecl += "\tC = _C\n" 277 case "L": 278 varDecl = "\tLetter = _L; // Letter/L is the set of Unicode letters, category L.\n" 279 varDecl += "\tL = _L\n" 280 case "M": 281 varDecl = "\tMark = _M; // Mark/M is the set of Unicode mark characters, category M.\n" 282 varDecl += "\tM = _M\n" 283 case "N": 284 varDecl = "\tNumber = _N; // Number/N is the set of Unicode number characters, category N.\n" 285 varDecl += "\tN = _N\n" 286 case "P": 287 varDecl = "\tPunct = _P; // Punct/P is the set of Unicode punctuation characters, category P.\n" 288 varDecl += "\tP = _P\n" 289 case "S": 290 varDecl = "\tSymbol = _S; // Symbol/S is the set of Unicode symbol characters, category S.\n" 291 varDecl += "\tS = _S\n" 292 case "Z": 293 varDecl = "\tSpace = _Z; // Space/Z is the set of Unicode space characters, category Z.\n" 294 varDecl += "\tZ = _Z\n" 295 case "Nd": 296 varDecl = "\tDigit = _Nd; // Digit is the set of Unicode characters with the \"decimal digit\" property.\n" 297 case "Lu": 298 varDecl = "\tUpper = _Lu; // Upper is the set of Unicode upper case letters.\n" 299 case "Ll": 300 varDecl = "\tLower = _Ll; // Lower is the set of Unicode lower case letters.\n" 301 case "Lt": 302 varDecl = "\tTitle = _Lt; // Title is the set of Unicode title case letters.\n" 303 } 304 if len(name) > 1 { 305 desc, ok := categoryMapping[name] 306 if ok { 307 varDecl += fmt.Sprintf( 308 "\t%s = _%s; // %s is the set of Unicode characters in category %s (%s).\n", 309 name, name, name, name, desc) 310 } else { 311 varDecl += fmt.Sprintf( 312 "\t%s = _%s; // %s is the set of Unicode characters in category %s.\n", 313 name, name, name, name) 314 } 315 } 316 decl[ndecl] = varDecl 317 ndecl++ 318 if len(name) == 1 { // unified categories 319 dumpRange( 320 "_"+name, 321 func(code rune) bool { return categoryOp(code, name[0]) }) 322 continue 323 } 324 dumpRange("_"+name, 325 func(code rune) bool { return chars[code].category == name }) 326 } 327 decl.Sort() 328 println("// These variables have type *RangeTable.") 329 println("var (") 330 for _, d := range decl { 331 print(d) 332 } 333 print(")\n\n") 334} 335 336type Op func(code rune) bool 337 338func dumpRange(name string, inCategory Op) { 339 runes := []rune{} 340 for i := range chars { 341 r := rune(i) 342 if inCategory(r) { 343 runes = append(runes, r) 344 } 345 } 346 printRangeTable(name, runes) 347} 348 349func printRangeTable(name string, runes []rune) { 350 rt := rangetable.New(runes...) 351 printf("var %s = &RangeTable{\n", name) 352 println("\tR16: []Range16{") 353 for _, r := range rt.R16 { 354 printf("\t\t{%#04x, %#04x, %d},\n", r.Lo, r.Hi, r.Stride) 355 range16Count++ 356 } 357 println("\t},") 358 if len(rt.R32) > 0 { 359 println("\tR32: []Range32{") 360 for _, r := range rt.R32 { 361 printf("\t\t{%#x, %#x, %d},\n", r.Lo, r.Hi, r.Stride) 362 range32Count++ 363 } 364 println("\t},") 365 } 366 if rt.LatinOffset > 0 { 367 printf("\tLatinOffset: %d,\n", rt.LatinOffset) 368 } 369 printf("}\n\n") 370} 371 372func fullCategoryTest(list []string) { 373 for _, name := range list { 374 if _, ok := category[name]; !ok { 375 logger.Fatal("unknown category", name) 376 } 377 r, ok := unicode.Categories[name] 378 if !ok && len(name) > 1 { 379 logger.Fatalf("unknown table %q", name) 380 } 381 if len(name) == 1 { 382 verifyRange(name, func(code rune) bool { return categoryOp(code, name[0]) }, r) 383 } else { 384 verifyRange( 385 name, 386 func(code rune) bool { return chars[code].category == name }, 387 r) 388 } 389 } 390} 391 392func verifyRange(name string, inCategory Op, table *unicode.RangeTable) { 393 count := 0 394 for j := range chars { 395 i := rune(j) 396 web := inCategory(i) 397 pkg := unicode.Is(table, i) 398 if web != pkg { 399 fmt.Fprintf(os.Stderr, "%s: %U: web=%t pkg=%t\n", name, i, web, pkg) 400 count++ 401 if count > 10 { 402 break 403 } 404 } 405 } 406} 407 408func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]rune) { 409 for _, name := range list { 410 if _, ok := scripts[name]; !ok { 411 logger.Fatal("unknown script", name) 412 } 413 _, ok := installed[name] 414 if !ok { 415 logger.Fatal("unknown table", name) 416 } 417 for _, r := range scripts[name] { 418 if !unicode.Is(installed[name], rune(r)) { 419 fmt.Fprintf(os.Stderr, "%U: not in script %s\n", r, name) 420 } 421 } 422 } 423} 424 425var deprecatedAliases = map[string]string{ 426 "Sentence_Terminal": "STerm", 427} 428 429// PropList.txt has the same format as Scripts.txt so we can share its parser. 430func printScriptOrProperty(doProps bool) { 431 flaglist := *scriptlist 432 file := "Scripts.txt" 433 table := scripts 434 installed := unicode.Scripts 435 if doProps { 436 flaglist = *proplist 437 file = "PropList.txt" 438 table = props 439 installed = unicode.Properties 440 } 441 if flaglist == "" { 442 return 443 } 444 ucd.Parse(gen.OpenUCDFile(file), func(p *ucd.Parser) { 445 name := p.String(1) 446 table[name] = append(table[name], p.Rune(0)) 447 }) 448 // Find out which scripts to dump 449 list := strings.Split(flaglist, ",") 450 if flaglist == "all" { 451 list = all(table) 452 } 453 if *test { 454 fullScriptTest(list, installed, table) 455 return 456 } 457 458 if flaglist == "all" { 459 if doProps { 460 println("// Properties is the set of Unicode property tables.") 461 println("var Properties = map[string] *RangeTable{") 462 } else { 463 println("// Scripts is the set of Unicode script tables.") 464 println("var Scripts = map[string] *RangeTable{") 465 } 466 for _, k := range all(table) { 467 printf("\t%q: %s,\n", k, k) 468 if alias, ok := deprecatedAliases[k]; ok { 469 printf("\t%q: %s,\n", alias, k) 470 } 471 } 472 print("}\n\n") 473 } 474 475 decl := make(sort.StringSlice, len(list)+len(deprecatedAliases)) 476 ndecl := 0 477 for _, name := range list { 478 if doProps { 479 decl[ndecl] = fmt.Sprintf( 480 "\t%s = _%s;\t// %s is the set of Unicode characters with property %s.\n", 481 name, name, name, name) 482 } else { 483 decl[ndecl] = fmt.Sprintf( 484 "\t%s = _%s;\t// %s is the set of Unicode characters in script %s.\n", 485 name, name, name, name) 486 } 487 ndecl++ 488 if alias, ok := deprecatedAliases[name]; ok { 489 decl[ndecl] = fmt.Sprintf( 490 "\t%[1]s = _%[2]s;\t// %[1]s is an alias for %[2]s.\n", 491 alias, name) 492 ndecl++ 493 } 494 printRangeTable("_"+name, table[name]) 495 } 496 decl.Sort() 497 println("// These variables have type *RangeTable.") 498 println("var (") 499 for _, d := range decl { 500 print(d) 501 } 502 print(")\n\n") 503} 504 505const ( 506 CaseUpper = 1 << iota 507 CaseLower 508 CaseTitle 509 CaseNone = 0 // must be zero 510 CaseMissing = -1 // character not present; not a valid case state 511) 512 513type caseState struct { 514 point rune 515 _case int 516 deltaToUpper rune 517 deltaToLower rune 518 deltaToTitle rune 519} 520 521// Is d a continuation of the state of c? 522func (c *caseState) adjacent(d *caseState) bool { 523 if d.point < c.point { 524 c, d = d, c 525 } 526 switch { 527 case d.point != c.point+1: // code points not adjacent (shouldn't happen) 528 return false 529 case d._case != c._case: // different cases 530 return c.upperLowerAdjacent(d) 531 case c._case == CaseNone: 532 return false 533 case c._case == CaseMissing: 534 return false 535 case d.deltaToUpper != c.deltaToUpper: 536 return false 537 case d.deltaToLower != c.deltaToLower: 538 return false 539 case d.deltaToTitle != c.deltaToTitle: 540 return false 541 } 542 return true 543} 544 545// Is d the same as c, but opposite in upper/lower case? this would make it 546// an element of an UpperLower sequence. 547func (c *caseState) upperLowerAdjacent(d *caseState) bool { 548 // check they're a matched case pair. we know they have adjacent values 549 switch { 550 case c._case == CaseUpper && d._case != CaseLower: 551 return false 552 case c._case == CaseLower && d._case != CaseUpper: 553 return false 554 } 555 // matched pair (at least in upper/lower). make the order Upper Lower 556 if c._case == CaseLower { 557 c, d = d, c 558 } 559 // for an Upper Lower sequence the deltas have to be in order 560 // c: 0 1 0 561 // d: -1 0 -1 562 switch { 563 case c.deltaToUpper != 0: 564 return false 565 case c.deltaToLower != 1: 566 return false 567 case c.deltaToTitle != 0: 568 return false 569 case d.deltaToUpper != -1: 570 return false 571 case d.deltaToLower != 0: 572 return false 573 case d.deltaToTitle != -1: 574 return false 575 } 576 return true 577} 578 579// Does this character start an UpperLower sequence? 580func (c *caseState) isUpperLower() bool { 581 // for an Upper Lower sequence the deltas have to be in order 582 // c: 0 1 0 583 switch { 584 case c.deltaToUpper != 0: 585 return false 586 case c.deltaToLower != 1: 587 return false 588 case c.deltaToTitle != 0: 589 return false 590 } 591 return true 592} 593 594// Does this character start a LowerUpper sequence? 595func (c *caseState) isLowerUpper() bool { 596 // for an Upper Lower sequence the deltas have to be in order 597 // c: -1 0 -1 598 switch { 599 case c.deltaToUpper != -1: 600 return false 601 case c.deltaToLower != 0: 602 return false 603 case c.deltaToTitle != -1: 604 return false 605 } 606 return true 607} 608 609func getCaseState(i rune) (c *caseState) { 610 c = &caseState{point: i, _case: CaseNone} 611 ch := &chars[i] 612 switch ch.codePoint { 613 case 0: 614 c._case = CaseMissing // Will get NUL wrong but that doesn't matter 615 return 616 case ch.upperCase: 617 c._case = CaseUpper 618 case ch.lowerCase: 619 c._case = CaseLower 620 case ch.titleCase: 621 c._case = CaseTitle 622 } 623 // Some things such as roman numeral U+2161 don't describe themselves 624 // as upper case, but have a lower case. Second-guess them. 625 if c._case == CaseNone && ch.lowerCase != 0 { 626 c._case = CaseUpper 627 } 628 // Same in the other direction. 629 if c._case == CaseNone && ch.upperCase != 0 { 630 c._case = CaseLower 631 } 632 633 if ch.upperCase != 0 { 634 c.deltaToUpper = ch.upperCase - i 635 } 636 if ch.lowerCase != 0 { 637 c.deltaToLower = ch.lowerCase - i 638 } 639 if ch.titleCase != 0 { 640 c.deltaToTitle = ch.titleCase - i 641 } 642 return 643} 644 645func printCases() { 646 if *test { 647 fullCaseTest() 648 return 649 } 650 printf( 651 "// CaseRanges is the table describing case mappings for all letters with\n" + 652 "// non-self mappings.\n" + 653 "var CaseRanges = _CaseRanges\n" + 654 "var _CaseRanges = []CaseRange {\n") 655 656 var startState *caseState // the start of a run; nil for not active 657 var prevState = &caseState{} // the state of the previous character 658 for i := range chars { 659 state := getCaseState(rune(i)) 660 if state.adjacent(prevState) { 661 prevState = state 662 continue 663 } 664 // end of run (possibly) 665 printCaseRange(startState, prevState) 666 startState = nil 667 if state._case != CaseMissing && state._case != CaseNone { 668 startState = state 669 } 670 prevState = state 671 } 672 print("}\n") 673} 674 675func printCaseRange(lo, hi *caseState) { 676 if lo == nil { 677 return 678 } 679 if lo.deltaToUpper == 0 && lo.deltaToLower == 0 && lo.deltaToTitle == 0 { 680 // character represents itself in all cases - no need to mention it 681 return 682 } 683 switch { 684 case hi.point > lo.point && lo.isUpperLower(): 685 printf("\t{0x%04X, 0x%04X, d{UpperLower, UpperLower, UpperLower}},\n", 686 lo.point, hi.point) 687 case hi.point > lo.point && lo.isLowerUpper(): 688 logger.Fatalf("LowerUpper sequence: should not happen: %U. If it's real, need to fix To()", lo.point) 689 printf("\t{0x%04X, 0x%04X, d{LowerUpper, LowerUpper, LowerUpper}},\n", 690 lo.point, hi.point) 691 default: 692 printf("\t{0x%04X, 0x%04X, d{%d, %d, %d}},\n", 693 lo.point, hi.point, 694 lo.deltaToUpper, lo.deltaToLower, lo.deltaToTitle) 695 } 696} 697 698// If the cased value in the Char is 0, it means use the rune itself. 699func caseIt(r, cased rune) rune { 700 if cased == 0 { 701 return r 702 } 703 return cased 704} 705 706func fullCaseTest() { 707 for j, c := range chars { 708 i := rune(j) 709 lower := unicode.ToLower(i) 710 want := caseIt(i, c.lowerCase) 711 if lower != want { 712 fmt.Fprintf(os.Stderr, "lower %U should be %U is %U\n", i, want, lower) 713 } 714 upper := unicode.ToUpper(i) 715 want = caseIt(i, c.upperCase) 716 if upper != want { 717 fmt.Fprintf(os.Stderr, "upper %U should be %U is %U\n", i, want, upper) 718 } 719 title := unicode.ToTitle(i) 720 want = caseIt(i, c.titleCase) 721 if title != want { 722 fmt.Fprintf(os.Stderr, "title %U should be %U is %U\n", i, want, title) 723 } 724 } 725} 726 727func printLatinProperties() { 728 if *test { 729 return 730 } 731 println("var properties = [MaxLatin1+1]uint8{") 732 for code := 0; code <= unicode.MaxLatin1; code++ { 733 var property string 734 switch chars[code].category { 735 case "Cc", "": // NUL has no category. 736 property = "pC" 737 case "Cf": // soft hyphen, unique category, not printable. 738 property = "0" 739 case "Ll": 740 property = "pLl | pp" 741 case "Lo": 742 property = "pLo | pp" 743 case "Lu": 744 property = "pLu | pp" 745 case "Nd", "No": 746 property = "pN | pp" 747 case "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps": 748 property = "pP | pp" 749 case "Sc", "Sk", "Sm", "So": 750 property = "pS | pp" 751 case "Zs": 752 property = "pZ" 753 default: 754 logger.Fatalf("%U has unknown category %q", code, chars[code].category) 755 } 756 // Special case 757 if code == ' ' { 758 property = "pZ | pp" 759 } 760 printf("\t0x%02X: %s, // %q\n", code, property, code) 761 } 762 printf("}\n\n") 763} 764 765func printCasefold() { 766 // Build list of case-folding groups attached to each canonical folded char (typically lower case). 767 var caseOrbit = make([][]rune, MaxChar+1) 768 for j := range chars { 769 i := rune(j) 770 c := &chars[i] 771 if c.foldCase == 0 { 772 continue 773 } 774 orb := caseOrbit[c.foldCase] 775 if orb == nil { 776 orb = append(orb, c.foldCase) 777 } 778 caseOrbit[c.foldCase] = append(orb, i) 779 } 780 781 // Insert explicit 1-element groups when assuming [lower, upper] would be wrong. 782 for j := range chars { 783 i := rune(j) 784 c := &chars[i] 785 f := c.foldCase 786 if f == 0 { 787 f = i 788 } 789 orb := caseOrbit[f] 790 if orb == nil && (c.upperCase != 0 && c.upperCase != i || c.lowerCase != 0 && c.lowerCase != i) { 791 // Default assumption of [upper, lower] is wrong. 792 caseOrbit[i] = []rune{i} 793 } 794 } 795 796 // Delete the groups for which assuming [lower, upper] or [upper, lower] is right. 797 for i, orb := range caseOrbit { 798 if len(orb) == 2 && chars[orb[0]].upperCase == orb[1] && chars[orb[1]].lowerCase == orb[0] { 799 caseOrbit[i] = nil 800 } 801 if len(orb) == 2 && chars[orb[1]].upperCase == orb[0] && chars[orb[0]].lowerCase == orb[1] { 802 caseOrbit[i] = nil 803 } 804 } 805 806 // Record orbit information in chars. 807 for _, orb := range caseOrbit { 808 if orb == nil { 809 continue 810 } 811 sort.Slice(orb, func(i, j int) bool { 812 return orb[i] < orb[j] 813 }) 814 c := orb[len(orb)-1] 815 for _, d := range orb { 816 chars[c].caseOrbit = d 817 c = d 818 } 819 } 820 821 printAsciiFold() 822 printCaseOrbit() 823 824 // Tables of category and script folding exceptions: code points 825 // that must be added when interpreting a particular category/script 826 // in a case-folding context. 827 cat := make(map[string]map[rune]bool) 828 for name := range category { 829 if x := foldExceptions(inCategory(name)); len(x) > 0 { 830 cat[name] = x 831 } 832 } 833 834 scr := make(map[string]map[rune]bool) 835 for name := range scripts { 836 if x := foldExceptions(scripts[name]); len(x) > 0 { 837 scr[name] = x 838 } 839 } 840 841 printCatFold("FoldCategory", cat) 842 printCatFold("FoldScript", scr) 843} 844 845// inCategory returns a list of all the runes in the category. 846func inCategory(name string) []rune { 847 var x []rune 848 for j := range chars { 849 i := rune(j) 850 c := &chars[i] 851 if c.category == name || len(name) == 1 && len(c.category) > 1 && c.category[0] == name[0] { 852 x = append(x, i) 853 } 854 } 855 return x 856} 857 858// foldExceptions returns a list of all the runes fold-equivalent 859// to runes in class but not in class themselves. 860func foldExceptions(class []rune) map[rune]bool { 861 // Create map containing class and all fold-equivalent chars. 862 m := make(map[rune]bool) 863 for _, r := range class { 864 c := &chars[r] 865 if c.caseOrbit == 0 { 866 // Just upper and lower. 867 if u := c.upperCase; u != 0 { 868 m[u] = true 869 } 870 if l := c.lowerCase; l != 0 { 871 m[l] = true 872 } 873 m[r] = true 874 continue 875 } 876 // Otherwise walk orbit. 877 r0 := r 878 for { 879 m[r] = true 880 r = chars[r].caseOrbit 881 if r == r0 { 882 break 883 } 884 } 885 } 886 887 // Remove class itself. 888 for _, r := range class { 889 delete(m, r) 890 } 891 892 // What's left is the exceptions. 893 return m 894} 895 896var comment = map[string]string{ 897 "FoldCategory": "// FoldCategory maps a category name to a table of\n" + 898 "// code points outside the category that are equivalent under\n" + 899 "// simple case folding to code points inside the category.\n" + 900 "// If there is no entry for a category name, there are no such points.\n", 901 902 "FoldScript": "// FoldScript maps a script name to a table of\n" + 903 "// code points outside the script that are equivalent under\n" + 904 "// simple case folding to code points inside the script.\n" + 905 "// If there is no entry for a script name, there are no such points.\n", 906} 907 908func printAsciiFold() { 909 printf("var asciiFold = [MaxASCII + 1]uint16{\n") 910 for i := rune(0); i <= unicode.MaxASCII; i++ { 911 c := chars[i] 912 f := c.caseOrbit 913 if f == 0 { 914 if c.lowerCase != i && c.lowerCase != 0 { 915 f = c.lowerCase 916 } else if c.upperCase != i && c.upperCase != 0 { 917 f = c.upperCase 918 } else { 919 f = i 920 } 921 } 922 printf("\t0x%04X,\n", f) 923 } 924 printf("}\n\n") 925} 926 927func printCaseOrbit() { 928 if *test { 929 for j := range chars { 930 i := rune(j) 931 c := &chars[i] 932 f := c.caseOrbit 933 if f == 0 { 934 if c.lowerCase != i && c.lowerCase != 0 { 935 f = c.lowerCase 936 } else if c.upperCase != i && c.upperCase != 0 { 937 f = c.upperCase 938 } else { 939 f = i 940 } 941 } 942 if g := unicode.SimpleFold(i); g != f { 943 fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", i, g, f) 944 } 945 } 946 return 947 } 948 949 printf("var caseOrbit = []foldPair{\n") 950 for i := range chars { 951 c := &chars[i] 952 if c.caseOrbit != 0 { 953 printf("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit) 954 foldPairCount++ 955 } 956 } 957 printf("}\n\n") 958} 959 960func printCatFold(name string, m map[string]map[rune]bool) { 961 if *test { 962 var pkgMap map[string]*unicode.RangeTable 963 if name == "FoldCategory" { 964 pkgMap = unicode.FoldCategory 965 } else { 966 pkgMap = unicode.FoldScript 967 } 968 if len(pkgMap) != len(m) { 969 fmt.Fprintf(os.Stderr, "unicode.%s has %d elements, want %d\n", name, len(pkgMap), len(m)) 970 return 971 } 972 for k, v := range m { 973 t, ok := pkgMap[k] 974 if !ok { 975 fmt.Fprintf(os.Stderr, "unicode.%s[%q] missing\n", name, k) 976 continue 977 } 978 n := 0 979 for _, r := range t.R16 { 980 for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) { 981 if !v[c] { 982 fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c) 983 } 984 n++ 985 } 986 } 987 for _, r := range t.R32 { 988 for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) { 989 if !v[c] { 990 fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c) 991 } 992 n++ 993 } 994 } 995 if n != len(v) { 996 fmt.Fprintf(os.Stderr, "unicode.%s[%q] has %d code points, want %d\n", name, k, n, len(v)) 997 } 998 } 999 return 1000 } 1001 1002 print(comment[name]) 1003 printf("var %s = map[string]*RangeTable{\n", name) 1004 for _, name := range allCatFold(m) { 1005 printf("\t%q: fold%s,\n", name, name) 1006 } 1007 printf("}\n\n") 1008 for _, name := range allCatFold(m) { 1009 class := m[name] 1010 dumpRange("fold"+name, func(code rune) bool { return class[code] }) 1011 } 1012} 1013 1014var range16Count = 0 // Number of entries in the 16-bit range tables. 1015var range32Count = 0 // Number of entries in the 32-bit range tables. 1016var foldPairCount = 0 // Number of fold pairs in the exception tables. 1017 1018func printSizes() { 1019 if *test { 1020 return 1021 } 1022 println() 1023 printf("// Range entries: %d 16-bit, %d 32-bit, %d total.\n", range16Count, range32Count, range16Count+range32Count) 1024 range16Bytes := range16Count * 3 * 2 1025 range32Bytes := range32Count * 3 * 4 1026 printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes) 1027 println() 1028 printf("// Fold orbit bytes: %d pairs, %d bytes\n", foldPairCount, foldPairCount*2*2) 1029} 1030