1// Copyright 2009 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5//go:build ignore 6// +build ignore 7 8// Unicode table generator. 9// Data read from the web. 10 11package main 12 13import ( 14 "flag" 15 "fmt" 16 "log" 17 "os" 18 "regexp" 19 "sort" 20 "strings" 21 "unicode" 22 23 "golang.org/x/text/internal/gen" 24 "golang.org/x/text/internal/ucd" 25 "golang.org/x/text/unicode/rangetable" 26) 27 28func main() { 29 flag.Parse() 30 setupOutput() 31 loadChars() // always needed 32 loadCasefold() 33 printCategories() 34 printScriptOrProperty(false) 35 printScriptOrProperty(true) 36 printCases() 37 printLatinProperties() 38 printCasefold() 39 printSizes() 40 flushOutput() 41} 42 43func defaultVersion() string { 44 if v := os.Getenv("UNICODE_VERSION"); v != "" { 45 return v 46 } 47 return unicode.Version 48} 49 50var tablelist = flag.String("tables", 51 "all", 52 "comma-separated list of which tables to generate; can be letter") 53var scriptlist = flag.String("scripts", 54 "all", 55 "comma-separated list of which script tables to generate") 56var proplist = flag.String("props", 57 "all", 58 "comma-separated list of which property tables to generate") 59var cases = flag.Bool("cases", 60 true, 61 "generate case tables") 62var test = flag.Bool("test", 63 false, 64 "test existing tables; can be used to compare web data with package data") 65 66var scriptRe = regexp.MustCompile(`^([0-9A-F]+)(\.\.[0-9A-F]+)? *; ([A-Za-z_]+)$`) 67var logger = log.New(os.Stderr, "", log.Lshortfile) 68 69var output *gen.CodeWriter 70 71func setupOutput() { 72 output = gen.NewCodeWriter() 73} 74 75func flushOutput() { 76 output.WriteGoFile("tables.go", "unicode") 77} 78 79func printf(format string, args ...interface{}) { 80 fmt.Fprintf(output, format, args...) 81} 82 83func print(args ...interface{}) { 84 fmt.Fprint(output, args...) 85} 86 87func println(args ...interface{}) { 88 fmt.Fprintln(output, args...) 89} 90 91var category = map[string]bool{ 92 // Nd Lu etc. 93 // We use one-character names to identify merged categories 94 "L": true, // Lu Ll Lt Lm Lo 95 "P": true, // Pc Pd Ps Pe Pu Pf Po 96 "M": true, // Mn Mc Me 97 "N": true, // Nd Nl No 98 "S": true, // Sm Sc Sk So 99 "Z": true, // Zs Zl Zp 100 "C": true, // Cc Cf Cs Co Cn 101} 102 103// This contains only the properties we're interested in. 104type Char struct { 105 codePoint rune // if zero, this index is not a valid code point. 106 category string 107 upperCase rune 108 lowerCase rune 109 titleCase rune 110 foldCase rune // simple case folding 111 caseOrbit rune // next in simple case folding orbit 112} 113 114const MaxChar = 0x10FFFF 115 116var chars = make([]Char, MaxChar+1) 117var scripts = make(map[string][]rune) 118var props = make(map[string][]rune) // a property looks like a script; can share the format 119 120func allCategories() []string { 121 a := make([]string, 0, len(category)) 122 for k := range category { 123 a = append(a, k) 124 } 125 sort.Strings(a) 126 return a 127} 128 129func all(scripts map[string][]rune) []string { 130 a := make([]string, 0, len(scripts)) 131 for k := range scripts { 132 a = append(a, k) 133 } 134 sort.Strings(a) 135 return a 136} 137 138func allCatFold(m map[string]map[rune]bool) []string { 139 a := make([]string, 0, len(m)) 140 for k := range m { 141 a = append(a, k) 142 } 143 sort.Strings(a) 144 return a 145} 146 147func categoryOp(code rune, class uint8) bool { 148 category := chars[code].category 149 return len(category) > 0 && category[0] == class 150} 151 152func loadChars() { 153 ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) { 154 c := Char{codePoint: p.Rune(0)} 155 156 getRune := func(field int) rune { 157 if p.String(field) == "" { 158 return 0 159 } 160 return p.Rune(field) 161 } 162 163 c.category = p.String(ucd.GeneralCategory) 164 category[c.category] = true 165 switch c.category { 166 case "Nd": 167 // Decimal digit 168 p.Int(ucd.NumericValue) 169 case "Lu": 170 c.upperCase = getRune(ucd.CodePoint) 171 c.lowerCase = getRune(ucd.SimpleLowercaseMapping) 172 c.titleCase = getRune(ucd.SimpleTitlecaseMapping) 173 case "Ll": 174 c.upperCase = getRune(ucd.SimpleUppercaseMapping) 175 c.lowerCase = getRune(ucd.CodePoint) 176 c.titleCase = getRune(ucd.SimpleTitlecaseMapping) 177 case "Lt": 178 c.upperCase = getRune(ucd.SimpleUppercaseMapping) 179 c.lowerCase = getRune(ucd.SimpleLowercaseMapping) 180 c.titleCase = getRune(ucd.CodePoint) 181 default: 182 c.upperCase = getRune(ucd.SimpleUppercaseMapping) 183 c.lowerCase = getRune(ucd.SimpleLowercaseMapping) 184 c.titleCase = getRune(ucd.SimpleTitlecaseMapping) 185 } 186 187 chars[c.codePoint] = c 188 }) 189} 190 191func loadCasefold() { 192 ucd.Parse(gen.OpenUCDFile("CaseFolding.txt"), func(p *ucd.Parser) { 193 kind := p.String(1) 194 if kind != "C" && kind != "S" { 195 // Only care about 'common' and 'simple' foldings. 196 return 197 } 198 p1 := p.Rune(0) 199 p2 := p.Rune(2) 200 chars[p1].foldCase = rune(p2) 201 }) 202} 203 204var categoryMapping = map[string]string{ 205 "Lu": "Letter, uppercase", 206 "Ll": "Letter, lowercase", 207 "Lt": "Letter, titlecase", 208 "Lm": "Letter, modifier", 209 "Lo": "Letter, other", 210 "Mn": "Mark, nonspacing", 211 "Mc": "Mark, spacing combining", 212 "Me": "Mark, enclosing", 213 "Nd": "Number, decimal digit", 214 "Nl": "Number, letter", 215 "No": "Number, other", 216 "Pc": "Punctuation, connector", 217 "Pd": "Punctuation, dash", 218 "Ps": "Punctuation, open", 219 "Pe": "Punctuation, close", 220 "Pi": "Punctuation, initial quote", 221 "Pf": "Punctuation, final quote", 222 "Po": "Punctuation, other", 223 "Sm": "Symbol, math", 224 "Sc": "Symbol, currency", 225 "Sk": "Symbol, modifier", 226 "So": "Symbol, other", 227 "Zs": "Separator, space", 228 "Zl": "Separator, line", 229 "Zp": "Separator, paragraph", 230 "Cc": "Other, control", 231 "Cf": "Other, format", 232 "Cs": "Other, surrogate", 233 "Co": "Other, private use", 234 "Cn": "Other, not assigned", 235} 236 237func printCategories() { 238 if *tablelist == "" { 239 return 240 } 241 // Find out which categories to dump 242 list := strings.Split(*tablelist, ",") 243 if *tablelist == "all" { 244 list = allCategories() 245 } 246 if *test { 247 fullCategoryTest(list) 248 return 249 } 250 251 println("// Version is the Unicode edition from which the tables are derived.") 252 printf("const Version = %q\n\n", gen.UnicodeVersion()) 253 254 if *tablelist == "all" { 255 println("// Categories is the set of Unicode category tables.") 256 println("var Categories = map[string] *RangeTable {") 257 for _, k := range allCategories() { 258 printf("\t%q: %s,\n", k, k) 259 } 260 print("}\n\n") 261 } 262 263 decl := make(sort.StringSlice, len(list)) 264 ndecl := 0 265 for _, name := range list { 266 if _, ok := category[name]; !ok { 267 logger.Fatal("unknown category", name) 268 } 269 // We generate an UpperCase name to serve as concise documentation and an _UnderScored 270 // name to store the data. This stops godoc dumping all the tables but keeps them 271 // available to clients. 272 // Cases deserving special comments 273 varDecl := "" 274 switch name { 275 case "C": 276 varDecl = "\tOther = _C; // Other/C is the set of Unicode control and special characters, category C.\n" 277 varDecl += "\tC = _C\n" 278 case "L": 279 varDecl = "\tLetter = _L; // Letter/L is the set of Unicode letters, category L.\n" 280 varDecl += "\tL = _L\n" 281 case "M": 282 varDecl = "\tMark = _M; // Mark/M is the set of Unicode mark characters, category M.\n" 283 varDecl += "\tM = _M\n" 284 case "N": 285 varDecl = "\tNumber = _N; // Number/N is the set of Unicode number characters, category N.\n" 286 varDecl += "\tN = _N\n" 287 case "P": 288 varDecl = "\tPunct = _P; // Punct/P is the set of Unicode punctuation characters, category P.\n" 289 varDecl += "\tP = _P\n" 290 case "S": 291 varDecl = "\tSymbol = _S; // Symbol/S is the set of Unicode symbol characters, category S.\n" 292 varDecl += "\tS = _S\n" 293 case "Z": 294 varDecl = "\tSpace = _Z; // Space/Z is the set of Unicode space characters, category Z.\n" 295 varDecl += "\tZ = _Z\n" 296 case "Nd": 297 varDecl = "\tDigit = _Nd; // Digit is the set of Unicode characters with the \"decimal digit\" property.\n" 298 case "Lu": 299 varDecl = "\tUpper = _Lu; // Upper is the set of Unicode upper case letters.\n" 300 case "Ll": 301 varDecl = "\tLower = _Ll; // Lower is the set of Unicode lower case letters.\n" 302 case "Lt": 303 varDecl = "\tTitle = _Lt; // Title is the set of Unicode title case letters.\n" 304 } 305 if len(name) > 1 { 306 desc, ok := categoryMapping[name] 307 if ok { 308 varDecl += fmt.Sprintf( 309 "\t%s = _%s; // %s is the set of Unicode characters in category %s (%s).\n", 310 name, name, name, name, desc) 311 } else { 312 varDecl += fmt.Sprintf( 313 "\t%s = _%s; // %s is the set of Unicode characters in category %s.\n", 314 name, name, name, name) 315 } 316 } 317 decl[ndecl] = varDecl 318 ndecl++ 319 if len(name) == 1 { // unified categories 320 dumpRange( 321 "_"+name, 322 func(code rune) bool { return categoryOp(code, name[0]) }) 323 continue 324 } 325 dumpRange("_"+name, 326 func(code rune) bool { return chars[code].category == name }) 327 } 328 decl.Sort() 329 println("// These variables have type *RangeTable.") 330 println("var (") 331 for _, d := range decl { 332 print(d) 333 } 334 print(")\n\n") 335} 336 337type Op func(code rune) bool 338 339func dumpRange(name string, inCategory Op) { 340 runes := []rune{} 341 for i := range chars { 342 r := rune(i) 343 if inCategory(r) { 344 runes = append(runes, r) 345 } 346 } 347 printRangeTable(name, runes) 348} 349 350func printRangeTable(name string, runes []rune) { 351 rt := rangetable.New(runes...) 352 printf("var %s = &RangeTable{\n", name) 353 println("\tR16: []Range16{") 354 for _, r := range rt.R16 { 355 printf("\t\t{%#04x, %#04x, %d},\n", r.Lo, r.Hi, r.Stride) 356 range16Count++ 357 } 358 println("\t},") 359 if len(rt.R32) > 0 { 360 println("\tR32: []Range32{") 361 for _, r := range rt.R32 { 362 printf("\t\t{%#x, %#x, %d},\n", r.Lo, r.Hi, r.Stride) 363 range32Count++ 364 } 365 println("\t},") 366 } 367 if rt.LatinOffset > 0 { 368 printf("\tLatinOffset: %d,\n", rt.LatinOffset) 369 } 370 printf("}\n\n") 371} 372 373func fullCategoryTest(list []string) { 374 for _, name := range list { 375 if _, ok := category[name]; !ok { 376 logger.Fatal("unknown category", name) 377 } 378 r, ok := unicode.Categories[name] 379 if !ok && len(name) > 1 { 380 logger.Fatalf("unknown table %q", name) 381 } 382 if len(name) == 1 { 383 verifyRange(name, func(code rune) bool { return categoryOp(code, name[0]) }, r) 384 } else { 385 verifyRange( 386 name, 387 func(code rune) bool { return chars[code].category == name }, 388 r) 389 } 390 } 391} 392 393func verifyRange(name string, inCategory Op, table *unicode.RangeTable) { 394 count := 0 395 for j := range chars { 396 i := rune(j) 397 web := inCategory(i) 398 pkg := unicode.Is(table, i) 399 if web != pkg { 400 fmt.Fprintf(os.Stderr, "%s: %U: web=%t pkg=%t\n", name, i, web, pkg) 401 count++ 402 if count > 10 { 403 break 404 } 405 } 406 } 407} 408 409func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]rune) { 410 for _, name := range list { 411 if _, ok := scripts[name]; !ok { 412 logger.Fatal("unknown script", name) 413 } 414 _, ok := installed[name] 415 if !ok { 416 logger.Fatal("unknown table", name) 417 } 418 for _, r := range scripts[name] { 419 if !unicode.Is(installed[name], rune(r)) { 420 fmt.Fprintf(os.Stderr, "%U: not in script %s\n", r, name) 421 } 422 } 423 } 424} 425 426var deprecatedAliases = map[string]string{ 427 "Sentence_Terminal": "STerm", 428} 429 430// PropList.txt has the same format as Scripts.txt so we can share its parser. 431func printScriptOrProperty(doProps bool) { 432 flaglist := *scriptlist 433 file := "Scripts.txt" 434 table := scripts 435 installed := unicode.Scripts 436 if doProps { 437 flaglist = *proplist 438 file = "PropList.txt" 439 table = props 440 installed = unicode.Properties 441 } 442 if flaglist == "" { 443 return 444 } 445 ucd.Parse(gen.OpenUCDFile(file), func(p *ucd.Parser) { 446 name := p.String(1) 447 table[name] = append(table[name], p.Rune(0)) 448 }) 449 // Find out which scripts to dump 450 list := strings.Split(flaglist, ",") 451 if flaglist == "all" { 452 list = all(table) 453 } 454 if *test { 455 fullScriptTest(list, installed, table) 456 return 457 } 458 459 if flaglist == "all" { 460 if doProps { 461 println("// Properties is the set of Unicode property tables.") 462 println("var Properties = map[string] *RangeTable{") 463 } else { 464 println("// Scripts is the set of Unicode script tables.") 465 println("var Scripts = map[string] *RangeTable{") 466 } 467 for _, k := range all(table) { 468 printf("\t%q: %s,\n", k, k) 469 if alias, ok := deprecatedAliases[k]; ok { 470 printf("\t%q: %s,\n", alias, k) 471 } 472 } 473 print("}\n\n") 474 } 475 476 decl := make(sort.StringSlice, len(list)+len(deprecatedAliases)) 477 ndecl := 0 478 for _, name := range list { 479 if doProps { 480 decl[ndecl] = fmt.Sprintf( 481 "\t%s = _%s;\t// %s is the set of Unicode characters with property %s.\n", 482 name, name, name, name) 483 } else { 484 decl[ndecl] = fmt.Sprintf( 485 "\t%s = _%s;\t// %s is the set of Unicode characters in script %s.\n", 486 name, name, name, name) 487 } 488 ndecl++ 489 if alias, ok := deprecatedAliases[name]; ok { 490 decl[ndecl] = fmt.Sprintf( 491 "\t%[1]s = _%[2]s;\t// %[1]s is an alias for %[2]s.\n", 492 alias, name) 493 ndecl++ 494 } 495 printRangeTable("_"+name, table[name]) 496 } 497 decl.Sort() 498 println("// These variables have type *RangeTable.") 499 println("var (") 500 for _, d := range decl { 501 print(d) 502 } 503 print(")\n\n") 504} 505 506const ( 507 CaseUpper = 1 << iota 508 CaseLower 509 CaseTitle 510 CaseNone = 0 // must be zero 511 CaseMissing = -1 // character not present; not a valid case state 512) 513 514type caseState struct { 515 point rune 516 _case int 517 deltaToUpper rune 518 deltaToLower rune 519 deltaToTitle rune 520} 521 522// Is d a continuation of the state of c? 523func (c *caseState) adjacent(d *caseState) bool { 524 if d.point < c.point { 525 c, d = d, c 526 } 527 switch { 528 case d.point != c.point+1: // code points not adjacent (shouldn't happen) 529 return false 530 case d._case != c._case: // different cases 531 return c.upperLowerAdjacent(d) 532 case c._case == CaseNone: 533 return false 534 case c._case == CaseMissing: 535 return false 536 case d.deltaToUpper != c.deltaToUpper: 537 return false 538 case d.deltaToLower != c.deltaToLower: 539 return false 540 case d.deltaToTitle != c.deltaToTitle: 541 return false 542 } 543 return true 544} 545 546// Is d the same as c, but opposite in upper/lower case? this would make it 547// an element of an UpperLower sequence. 548func (c *caseState) upperLowerAdjacent(d *caseState) bool { 549 // check they're a matched case pair. we know they have adjacent values 550 switch { 551 case c._case == CaseUpper && d._case != CaseLower: 552 return false 553 case c._case == CaseLower && d._case != CaseUpper: 554 return false 555 } 556 // matched pair (at least in upper/lower). make the order Upper Lower 557 if c._case == CaseLower { 558 c, d = d, c 559 } 560 // for an Upper Lower sequence the deltas have to be in order 561 // c: 0 1 0 562 // d: -1 0 -1 563 switch { 564 case c.deltaToUpper != 0: 565 return false 566 case c.deltaToLower != 1: 567 return false 568 case c.deltaToTitle != 0: 569 return false 570 case d.deltaToUpper != -1: 571 return false 572 case d.deltaToLower != 0: 573 return false 574 case d.deltaToTitle != -1: 575 return false 576 } 577 return true 578} 579 580// Does this character start an UpperLower sequence? 581func (c *caseState) isUpperLower() bool { 582 // for an Upper Lower sequence the deltas have to be in order 583 // c: 0 1 0 584 switch { 585 case c.deltaToUpper != 0: 586 return false 587 case c.deltaToLower != 1: 588 return false 589 case c.deltaToTitle != 0: 590 return false 591 } 592 return true 593} 594 595// Does this character start a LowerUpper sequence? 596func (c *caseState) isLowerUpper() bool { 597 // for an Upper Lower sequence the deltas have to be in order 598 // c: -1 0 -1 599 switch { 600 case c.deltaToUpper != -1: 601 return false 602 case c.deltaToLower != 0: 603 return false 604 case c.deltaToTitle != -1: 605 return false 606 } 607 return true 608} 609 610func getCaseState(i rune) (c *caseState) { 611 c = &caseState{point: i, _case: CaseNone} 612 ch := &chars[i] 613 switch ch.codePoint { 614 case 0: 615 c._case = CaseMissing // Will get NUL wrong but that doesn't matter 616 return 617 case ch.upperCase: 618 c._case = CaseUpper 619 case ch.lowerCase: 620 c._case = CaseLower 621 case ch.titleCase: 622 c._case = CaseTitle 623 } 624 // Some things such as roman numeral U+2161 don't describe themselves 625 // as upper case, but have a lower case. Second-guess them. 626 if c._case == CaseNone && ch.lowerCase != 0 { 627 c._case = CaseUpper 628 } 629 // Same in the other direction. 630 if c._case == CaseNone && ch.upperCase != 0 { 631 c._case = CaseLower 632 } 633 634 if ch.upperCase != 0 { 635 c.deltaToUpper = ch.upperCase - i 636 } 637 if ch.lowerCase != 0 { 638 c.deltaToLower = ch.lowerCase - i 639 } 640 if ch.titleCase != 0 { 641 c.deltaToTitle = ch.titleCase - i 642 } 643 return 644} 645 646func printCases() { 647 if *test { 648 fullCaseTest() 649 return 650 } 651 printf( 652 "// CaseRanges is the table describing case mappings for all letters with\n" + 653 "// non-self mappings.\n" + 654 "var CaseRanges = _CaseRanges\n" + 655 "var _CaseRanges = []CaseRange {\n") 656 657 var startState *caseState // the start of a run; nil for not active 658 var prevState = &caseState{} // the state of the previous character 659 for i := range chars { 660 state := getCaseState(rune(i)) 661 if state.adjacent(prevState) { 662 prevState = state 663 continue 664 } 665 // end of run (possibly) 666 printCaseRange(startState, prevState) 667 startState = nil 668 if state._case != CaseMissing && state._case != CaseNone { 669 startState = state 670 } 671 prevState = state 672 } 673 print("}\n") 674} 675 676func printCaseRange(lo, hi *caseState) { 677 if lo == nil { 678 return 679 } 680 if lo.deltaToUpper == 0 && lo.deltaToLower == 0 && lo.deltaToTitle == 0 { 681 // character represents itself in all cases - no need to mention it 682 return 683 } 684 switch { 685 case hi.point > lo.point && lo.isUpperLower(): 686 printf("\t{0x%04X, 0x%04X, d{UpperLower, UpperLower, UpperLower}},\n", 687 lo.point, hi.point) 688 case hi.point > lo.point && lo.isLowerUpper(): 689 logger.Fatalf("LowerUpper sequence: should not happen: %U. If it's real, need to fix To()", lo.point) 690 printf("\t{0x%04X, 0x%04X, d{LowerUpper, LowerUpper, LowerUpper}},\n", 691 lo.point, hi.point) 692 default: 693 printf("\t{0x%04X, 0x%04X, d{%d, %d, %d}},\n", 694 lo.point, hi.point, 695 lo.deltaToUpper, lo.deltaToLower, lo.deltaToTitle) 696 } 697} 698 699// If the cased value in the Char is 0, it means use the rune itself. 700func caseIt(r, cased rune) rune { 701 if cased == 0 { 702 return r 703 } 704 return cased 705} 706 707func fullCaseTest() { 708 for j, c := range chars { 709 i := rune(j) 710 lower := unicode.ToLower(i) 711 want := caseIt(i, c.lowerCase) 712 if lower != want { 713 fmt.Fprintf(os.Stderr, "lower %U should be %U is %U\n", i, want, lower) 714 } 715 upper := unicode.ToUpper(i) 716 want = caseIt(i, c.upperCase) 717 if upper != want { 718 fmt.Fprintf(os.Stderr, "upper %U should be %U is %U\n", i, want, upper) 719 } 720 title := unicode.ToTitle(i) 721 want = caseIt(i, c.titleCase) 722 if title != want { 723 fmt.Fprintf(os.Stderr, "title %U should be %U is %U\n", i, want, title) 724 } 725 } 726} 727 728func printLatinProperties() { 729 if *test { 730 return 731 } 732 println("var properties = [MaxLatin1+1]uint8{") 733 for code := 0; code <= unicode.MaxLatin1; code++ { 734 var property string 735 switch chars[code].category { 736 case "Cc", "": // NUL has no category. 737 property = "pC" 738 case "Cf": // soft hyphen, unique category, not printable. 739 property = "0" 740 case "Ll": 741 property = "pLl | pp" 742 case "Lo": 743 property = "pLo | pp" 744 case "Lu": 745 property = "pLu | pp" 746 case "Nd", "No": 747 property = "pN | pp" 748 case "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps": 749 property = "pP | pp" 750 case "Sc", "Sk", "Sm", "So": 751 property = "pS | pp" 752 case "Zs": 753 property = "pZ" 754 default: 755 logger.Fatalf("%U has unknown category %q", code, chars[code].category) 756 } 757 // Special case 758 if code == ' ' { 759 property = "pZ | pp" 760 } 761 printf("\t0x%02X: %s, // %q\n", code, property, code) 762 } 763 printf("}\n\n") 764} 765 766func printCasefold() { 767 // Build list of case-folding groups attached to each canonical folded char (typically lower case). 768 var caseOrbit = make([][]rune, MaxChar+1) 769 for j := range chars { 770 i := rune(j) 771 c := &chars[i] 772 if c.foldCase == 0 { 773 continue 774 } 775 orb := caseOrbit[c.foldCase] 776 if orb == nil { 777 orb = append(orb, c.foldCase) 778 } 779 caseOrbit[c.foldCase] = append(orb, i) 780 } 781 782 // Insert explicit 1-element groups when assuming [lower, upper] would be wrong. 783 for j := range chars { 784 i := rune(j) 785 c := &chars[i] 786 f := c.foldCase 787 if f == 0 { 788 f = i 789 } 790 orb := caseOrbit[f] 791 if orb == nil && (c.upperCase != 0 && c.upperCase != i || c.lowerCase != 0 && c.lowerCase != i) { 792 // Default assumption of [upper, lower] is wrong. 793 caseOrbit[i] = []rune{i} 794 } 795 } 796 797 // Delete the groups for which assuming [lower, upper] or [upper, lower] is right. 798 for i, orb := range caseOrbit { 799 if len(orb) == 2 && chars[orb[0]].upperCase == orb[1] && chars[orb[1]].lowerCase == orb[0] { 800 caseOrbit[i] = nil 801 } 802 if len(orb) == 2 && chars[orb[1]].upperCase == orb[0] && chars[orb[0]].lowerCase == orb[1] { 803 caseOrbit[i] = nil 804 } 805 } 806 807 // Record orbit information in chars. 808 for _, orb := range caseOrbit { 809 if orb == nil { 810 continue 811 } 812 sort.Slice(orb, func(i, j int) bool { 813 return orb[i] < orb[j] 814 }) 815 c := orb[len(orb)-1] 816 for _, d := range orb { 817 chars[c].caseOrbit = d 818 c = d 819 } 820 } 821 822 printAsciiFold() 823 printCaseOrbit() 824 825 // Tables of category and script folding exceptions: code points 826 // that must be added when interpreting a particular category/script 827 // in a case-folding context. 828 cat := make(map[string]map[rune]bool) 829 for name := range category { 830 if x := foldExceptions(inCategory(name)); len(x) > 0 { 831 cat[name] = x 832 } 833 } 834 835 scr := make(map[string]map[rune]bool) 836 for name := range scripts { 837 if x := foldExceptions(scripts[name]); len(x) > 0 { 838 scr[name] = x 839 } 840 } 841 842 printCatFold("FoldCategory", cat) 843 printCatFold("FoldScript", scr) 844} 845 846// inCategory returns a list of all the runes in the category. 847func inCategory(name string) []rune { 848 var x []rune 849 for j := range chars { 850 i := rune(j) 851 c := &chars[i] 852 if c.category == name || len(name) == 1 && len(c.category) > 1 && c.category[0] == name[0] { 853 x = append(x, i) 854 } 855 } 856 return x 857} 858 859// foldExceptions returns a list of all the runes fold-equivalent 860// to runes in class but not in class themselves. 861func foldExceptions(class []rune) map[rune]bool { 862 // Create map containing class and all fold-equivalent chars. 863 m := make(map[rune]bool) 864 for _, r := range class { 865 c := &chars[r] 866 if c.caseOrbit == 0 { 867 // Just upper and lower. 868 if u := c.upperCase; u != 0 { 869 m[u] = true 870 } 871 if l := c.lowerCase; l != 0 { 872 m[l] = true 873 } 874 m[r] = true 875 continue 876 } 877 // Otherwise walk orbit. 878 r0 := r 879 for { 880 m[r] = true 881 r = chars[r].caseOrbit 882 if r == r0 { 883 break 884 } 885 } 886 } 887 888 // Remove class itself. 889 for _, r := range class { 890 delete(m, r) 891 } 892 893 // What's left is the exceptions. 894 return m 895} 896 897var comment = map[string]string{ 898 "FoldCategory": "// FoldCategory maps a category name to a table of\n" + 899 "// code points outside the category that are equivalent under\n" + 900 "// simple case folding to code points inside the category.\n" + 901 "// If there is no entry for a category name, there are no such points.\n", 902 903 "FoldScript": "// FoldScript maps a script name to a table of\n" + 904 "// code points outside the script that are equivalent under\n" + 905 "// simple case folding to code points inside the script.\n" + 906 "// If there is no entry for a script name, there are no such points.\n", 907} 908 909func printAsciiFold() { 910 printf("var asciiFold = [MaxASCII + 1]uint16{\n") 911 for i := rune(0); i <= unicode.MaxASCII; i++ { 912 c := chars[i] 913 f := c.caseOrbit 914 if f == 0 { 915 if c.lowerCase != i && c.lowerCase != 0 { 916 f = c.lowerCase 917 } else if c.upperCase != i && c.upperCase != 0 { 918 f = c.upperCase 919 } else { 920 f = i 921 } 922 } 923 printf("\t0x%04X,\n", f) 924 } 925 printf("}\n\n") 926} 927 928func printCaseOrbit() { 929 if *test { 930 for j := range chars { 931 i := rune(j) 932 c := &chars[i] 933 f := c.caseOrbit 934 if f == 0 { 935 if c.lowerCase != i && c.lowerCase != 0 { 936 f = c.lowerCase 937 } else if c.upperCase != i && c.upperCase != 0 { 938 f = c.upperCase 939 } else { 940 f = i 941 } 942 } 943 if g := unicode.SimpleFold(i); g != f { 944 fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", i, g, f) 945 } 946 } 947 return 948 } 949 950 printf("var caseOrbit = []foldPair{\n") 951 for i := range chars { 952 c := &chars[i] 953 if c.caseOrbit != 0 { 954 printf("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit) 955 foldPairCount++ 956 } 957 } 958 printf("}\n\n") 959} 960 961func printCatFold(name string, m map[string]map[rune]bool) { 962 if *test { 963 var pkgMap map[string]*unicode.RangeTable 964 if name == "FoldCategory" { 965 pkgMap = unicode.FoldCategory 966 } else { 967 pkgMap = unicode.FoldScript 968 } 969 if len(pkgMap) != len(m) { 970 fmt.Fprintf(os.Stderr, "unicode.%s has %d elements, want %d\n", name, len(pkgMap), len(m)) 971 return 972 } 973 for k, v := range m { 974 t, ok := pkgMap[k] 975 if !ok { 976 fmt.Fprintf(os.Stderr, "unicode.%s[%q] missing\n", name, k) 977 continue 978 } 979 n := 0 980 for _, r := range t.R16 { 981 for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) { 982 if !v[c] { 983 fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c) 984 } 985 n++ 986 } 987 } 988 for _, r := range t.R32 { 989 for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) { 990 if !v[c] { 991 fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c) 992 } 993 n++ 994 } 995 } 996 if n != len(v) { 997 fmt.Fprintf(os.Stderr, "unicode.%s[%q] has %d code points, want %d\n", name, k, n, len(v)) 998 } 999 } 1000 return 1001 } 1002 1003 print(comment[name]) 1004 printf("var %s = map[string]*RangeTable{\n", name) 1005 for _, name := range allCatFold(m) { 1006 printf("\t%q: fold%s,\n", name, name) 1007 } 1008 printf("}\n\n") 1009 for _, name := range allCatFold(m) { 1010 class := m[name] 1011 dumpRange("fold"+name, func(code rune) bool { return class[code] }) 1012 } 1013} 1014 1015var range16Count = 0 // Number of entries in the 16-bit range tables. 1016var range32Count = 0 // Number of entries in the 32-bit range tables. 1017var foldPairCount = 0 // Number of fold pairs in the exception tables. 1018 1019func printSizes() { 1020 if *test { 1021 return 1022 } 1023 println() 1024 printf("// Range entries: %d 16-bit, %d 32-bit, %d total.\n", range16Count, range32Count, range16Count+range32Count) 1025 range16Bytes := range16Count * 3 * 2 1026 range32Bytes := range32Count * 3 * 4 1027 printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes) 1028 println() 1029 printf("// Fold orbit bytes: %d pairs, %d bytes\n", foldPairCount, foldPairCount*2*2) 1030} 1031