// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ignore
// +build ignore

// Normalization table generator.
// Data read from the web.
// See forminfo.go for a description of the trie values associated with each rune.

package main

import (
	"bytes"
	"encoding/binary"
	"flag"
	"fmt"
	"io"
	"log"
	"sort"
	"strconv"
	"strings"

	"golang.org/x/text/internal/gen"
	"golang.org/x/text/internal/triegen"
	"golang.org/x/text/internal/ucd"
)

// main runs the generator as a fixed pipeline. Every step reads and/or
// mutates the package-level chars table, so the order of the calls matters:
// data is loaded first, derived fields are computed next, and only then are
// the consistency checks and the output files produced.
func main() {
	gen.Init() // presumably parses the command-line flags declared below -- TODO confirm in package gen
	// Load raw per-rune data, then remap the combining classes to a dense range.
	loadUnicodeData()
	compactCCC()
	loadCompositionExclusions()
	// Derive the per-form fields; canonical first, then compatibility.
	completeCharFields(FCanonical)
	completeCharFields(FCompatibility)
	computeNonStarterCounts()
	// Check the computed data, optionally dump it, and write the output files.
	verifyComputed()
	printChars()
	testDerived()
	printTestdata()
	makeTables()
}

// Command-line flags.
// NOTE(review): in the code visible in this file, makeTables only looks for
// "info" and "recomp" in -tables ('decomp' is accepted but has no effect),
// and the -test flag is never read: testDerived and printTestdata are called
// unconditionally from main.
var (
	tablelist = flag.String("tables",
		"all",
		"comma-separated list of which tables to generate; "+
			"can be 'decomp', 'recomp', 'info' and 'all'")
	test = flag.Bool("test",
		false,
		"test existing tables against DerivedNormalizationProps and generate test data for regression testing")
	verbose = flag.Bool("verbose",
		false,
		"write data to stdout as it is parsed")
)

// MaxChar is the largest code point handled; the chars table has MaxChar+1 entries.
const MaxChar = 0x10FFFF // anything above this shouldn't exist

// Quick Check properties of runes allow us to quickly
// determine whether a rune may occur in a normal form.
// For a given normal form, a rune may be guaranteed to occur
// verbatim (QC=Yes), may or may not combine with another
// rune (QC=Maybe), or may not occur (QC=No).
65type QCResult int 66 67const ( 68 QCUnknown QCResult = iota 69 QCYes 70 QCNo 71 QCMaybe 72) 73 74func (r QCResult) String() string { 75 switch r { 76 case QCYes: 77 return "Yes" 78 case QCNo: 79 return "No" 80 case QCMaybe: 81 return "Maybe" 82 } 83 return "***UNKNOWN***" 84} 85 86const ( 87 FCanonical = iota // NFC or NFD 88 FCompatibility // NFKC or NFKD 89 FNumberOfFormTypes 90) 91 92const ( 93 MComposed = iota // NFC or NFKC 94 MDecomposed // NFD or NFKD 95 MNumberOfModes 96) 97 98// This contains only the properties we're interested in. 99type Char struct { 100 name string 101 codePoint rune // if zero, this index is not a valid code point. 102 ccc uint8 // canonical combining class 103 origCCC uint8 104 excludeInComp bool // from CompositionExclusions.txt 105 compatDecomp bool // it has a compatibility expansion 106 107 nTrailingNonStarters uint8 108 nLeadingNonStarters uint8 // must be equal to trailing if non-zero 109 110 forms [FNumberOfFormTypes]FormInfo // For FCanonical and FCompatibility 111 112 state State 113} 114 115var chars = make([]Char, MaxChar+1) 116var cccMap = make(map[uint8]uint8) 117 118func (c Char) String() string { 119 buf := new(bytes.Buffer) 120 121 fmt.Fprintf(buf, "%U [%s]:\n", c.codePoint, c.name) 122 fmt.Fprintf(buf, " ccc: %v\n", c.ccc) 123 fmt.Fprintf(buf, " excludeInComp: %v\n", c.excludeInComp) 124 fmt.Fprintf(buf, " compatDecomp: %v\n", c.compatDecomp) 125 fmt.Fprintf(buf, " state: %v\n", c.state) 126 fmt.Fprintf(buf, " NFC:\n") 127 fmt.Fprint(buf, c.forms[FCanonical]) 128 fmt.Fprintf(buf, " NFKC:\n") 129 fmt.Fprint(buf, c.forms[FCompatibility]) 130 131 return buf.String() 132} 133 134// In UnicodeData.txt, some ranges are marked like this: 135// 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;; 136// 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;; 137// parseCharacter keeps a state variable indicating the weirdness. 
138type State int 139 140const ( 141 SNormal State = iota // known to be zero for the type 142 SFirst 143 SLast 144 SMissing 145) 146 147var lastChar = rune('\u0000') 148 149func (c Char) isValid() bool { 150 return c.codePoint != 0 && c.state != SMissing 151} 152 153type FormInfo struct { 154 quickCheck [MNumberOfModes]QCResult // index: MComposed or MDecomposed 155 verified [MNumberOfModes]bool // index: MComposed or MDecomposed 156 157 combinesForward bool // May combine with rune on the right 158 combinesBackward bool // May combine with rune on the left 159 isOneWay bool // Never appears in result 160 inDecomp bool // Some decompositions result in this char. 161 decomp Decomposition 162 expandedDecomp Decomposition 163} 164 165func (f FormInfo) String() string { 166 buf := bytes.NewBuffer(make([]byte, 0)) 167 168 fmt.Fprintf(buf, " quickCheck[C]: %v\n", f.quickCheck[MComposed]) 169 fmt.Fprintf(buf, " quickCheck[D]: %v\n", f.quickCheck[MDecomposed]) 170 fmt.Fprintf(buf, " cmbForward: %v\n", f.combinesForward) 171 fmt.Fprintf(buf, " cmbBackward: %v\n", f.combinesBackward) 172 fmt.Fprintf(buf, " isOneWay: %v\n", f.isOneWay) 173 fmt.Fprintf(buf, " inDecomp: %v\n", f.inDecomp) 174 fmt.Fprintf(buf, " decomposition: %X\n", f.decomp) 175 fmt.Fprintf(buf, " expandedDecomp: %X\n", f.expandedDecomp) 176 177 return buf.String() 178} 179 180type Decomposition []rune 181 182func parseDecomposition(s string, skipfirst bool) (a []rune, err error) { 183 decomp := strings.Split(s, " ") 184 if len(decomp) > 0 && skipfirst { 185 decomp = decomp[1:] 186 } 187 for _, d := range decomp { 188 point, err := strconv.ParseUint(d, 16, 64) 189 if err != nil { 190 return a, err 191 } 192 a = append(a, rune(point)) 193 } 194 return a, nil 195} 196 197func loadUnicodeData() { 198 f := gen.OpenUCDFile("UnicodeData.txt") 199 defer f.Close() 200 p := ucd.New(f) 201 for p.Next() { 202 r := p.Rune(ucd.CodePoint) 203 char := &chars[r] 204 205 char.ccc = uint8(p.Uint(ucd.CanonicalCombiningClass)) 
206 decmap := p.String(ucd.DecompMapping) 207 208 exp, err := parseDecomposition(decmap, false) 209 isCompat := false 210 if err != nil { 211 if len(decmap) > 0 { 212 exp, err = parseDecomposition(decmap, true) 213 if err != nil { 214 log.Fatalf(`%U: bad decomp |%v|: "%s"`, r, decmap, err) 215 } 216 isCompat = true 217 } 218 } 219 220 char.name = p.String(ucd.Name) 221 char.codePoint = r 222 char.forms[FCompatibility].decomp = exp 223 if !isCompat { 224 char.forms[FCanonical].decomp = exp 225 } else { 226 char.compatDecomp = true 227 } 228 if len(decmap) > 0 { 229 char.forms[FCompatibility].decomp = exp 230 } 231 } 232 if err := p.Err(); err != nil { 233 log.Fatal(err) 234 } 235} 236 237// compactCCC converts the sparse set of CCC values to a continguous one, 238// reducing the number of bits needed from 8 to 6. 239func compactCCC() { 240 m := make(map[uint8]uint8) 241 for i := range chars { 242 c := &chars[i] 243 m[c.ccc] = 0 244 } 245 cccs := []int{} 246 for v, _ := range m { 247 cccs = append(cccs, int(v)) 248 } 249 sort.Ints(cccs) 250 for i, c := range cccs { 251 cccMap[uint8(i)] = uint8(c) 252 m[uint8(c)] = uint8(i) 253 } 254 for i := range chars { 255 c := &chars[i] 256 c.origCCC = c.ccc 257 c.ccc = m[c.ccc] 258 } 259 if len(m) >= 1<<6 { 260 log.Fatalf("too many difference CCC values: %d >= 64", len(m)) 261 } 262} 263 264// CompositionExclusions.txt has form: 265// 0958 # ... 266// See https://unicode.org/reports/tr44/ for full explanation 267func loadCompositionExclusions() { 268 f := gen.OpenUCDFile("CompositionExclusions.txt") 269 defer f.Close() 270 p := ucd.New(f) 271 for p.Next() { 272 c := &chars[p.Rune(0)] 273 if c.excludeInComp { 274 log.Fatalf("%U: Duplicate entry in exclusions.", c.codePoint) 275 } 276 c.excludeInComp = true 277 } 278 if e := p.Err(); e != nil { 279 log.Fatal(e) 280 } 281} 282 283// hasCompatDecomp returns true if any of the recursive 284// decompositions contains a compatibility expansion. 
285// In this case, the character may not occur in NFK*. 286func hasCompatDecomp(r rune) bool { 287 c := &chars[r] 288 if c.compatDecomp { 289 return true 290 } 291 for _, d := range c.forms[FCompatibility].decomp { 292 if hasCompatDecomp(d) { 293 return true 294 } 295 } 296 return false 297} 298 299// Hangul related constants. 300const ( 301 HangulBase = 0xAC00 302 HangulEnd = 0xD7A4 // hangulBase + Jamo combinations (19 * 21 * 28) 303 304 JamoLBase = 0x1100 305 JamoLEnd = 0x1113 306 JamoVBase = 0x1161 307 JamoVEnd = 0x1176 308 JamoTBase = 0x11A8 309 JamoTEnd = 0x11C3 310 311 JamoLVTCount = 19 * 21 * 28 312 JamoTCount = 28 313) 314 315func isHangul(r rune) bool { 316 return HangulBase <= r && r < HangulEnd 317} 318 319func isHangulWithoutJamoT(r rune) bool { 320 if !isHangul(r) { 321 return false 322 } 323 r -= HangulBase 324 return r < JamoLVTCount && r%JamoTCount == 0 325} 326 327func ccc(r rune) uint8 { 328 return chars[r].ccc 329} 330 331// Insert a rune in a buffer, ordered by Canonical Combining Class. 332func insertOrdered(b Decomposition, r rune) Decomposition { 333 n := len(b) 334 b = append(b, 0) 335 cc := ccc(r) 336 if cc > 0 { 337 // Use bubble sort. 338 for ; n > 0; n-- { 339 if ccc(b[n-1]) <= cc { 340 break 341 } 342 b[n] = b[n-1] 343 } 344 } 345 b[n] = r 346 return b 347} 348 349// Recursively decompose. 350func decomposeRecursive(form int, r rune, d Decomposition) Decomposition { 351 dcomp := chars[r].forms[form].decomp 352 if len(dcomp) == 0 { 353 return insertOrdered(d, r) 354 } 355 for _, c := range dcomp { 356 d = decomposeRecursive(form, c, d) 357 } 358 return d 359} 360 361func completeCharFields(form int) { 362 // Phase 0: pre-expand decomposition. 
363 for i := range chars { 364 f := &chars[i].forms[form] 365 if len(f.decomp) == 0 { 366 continue 367 } 368 exp := make(Decomposition, 0) 369 for _, c := range f.decomp { 370 exp = decomposeRecursive(form, c, exp) 371 } 372 f.expandedDecomp = exp 373 } 374 375 // Phase 1: composition exclusion, mark decomposition. 376 for i := range chars { 377 c := &chars[i] 378 f := &c.forms[form] 379 380 // Marks script-specific exclusions and version restricted. 381 f.isOneWay = c.excludeInComp 382 383 // Singletons 384 f.isOneWay = f.isOneWay || len(f.decomp) == 1 385 386 // Non-starter decompositions 387 if len(f.decomp) > 1 { 388 chk := c.ccc != 0 || chars[f.decomp[0]].ccc != 0 389 f.isOneWay = f.isOneWay || chk 390 } 391 392 // Runes that decompose into more than two runes. 393 f.isOneWay = f.isOneWay || len(f.decomp) > 2 394 395 if form == FCompatibility { 396 f.isOneWay = f.isOneWay || hasCompatDecomp(c.codePoint) 397 } 398 399 for _, r := range f.decomp { 400 chars[r].forms[form].inDecomp = true 401 } 402 } 403 404 // Phase 2: forward and backward combining. 405 for i := range chars { 406 c := &chars[i] 407 f := &c.forms[form] 408 409 if !f.isOneWay && len(f.decomp) == 2 { 410 f0 := &chars[f.decomp[0]].forms[form] 411 f1 := &chars[f.decomp[1]].forms[form] 412 if !f0.isOneWay { 413 f0.combinesForward = true 414 } 415 if !f1.isOneWay { 416 f1.combinesBackward = true 417 } 418 } 419 if isHangulWithoutJamoT(rune(i)) { 420 f.combinesForward = true 421 } 422 } 423 424 // Phase 3: quick check values. 
425 for i := range chars { 426 c := &chars[i] 427 f := &c.forms[form] 428 429 switch { 430 case len(f.decomp) > 0: 431 f.quickCheck[MDecomposed] = QCNo 432 case isHangul(rune(i)): 433 f.quickCheck[MDecomposed] = QCNo 434 default: 435 f.quickCheck[MDecomposed] = QCYes 436 } 437 switch { 438 case f.isOneWay: 439 f.quickCheck[MComposed] = QCNo 440 case (i & 0xffff00) == JamoLBase: 441 f.quickCheck[MComposed] = QCYes 442 if JamoLBase <= i && i < JamoLEnd { 443 f.combinesForward = true 444 } 445 if JamoVBase <= i && i < JamoVEnd { 446 f.quickCheck[MComposed] = QCMaybe 447 f.combinesBackward = true 448 f.combinesForward = true 449 } 450 if JamoTBase <= i && i < JamoTEnd { 451 f.quickCheck[MComposed] = QCMaybe 452 f.combinesBackward = true 453 } 454 case !f.combinesBackward: 455 f.quickCheck[MComposed] = QCYes 456 default: 457 f.quickCheck[MComposed] = QCMaybe 458 } 459 } 460} 461 462func computeNonStarterCounts() { 463 // Phase 4: leading and trailing non-starter count 464 for i := range chars { 465 c := &chars[i] 466 467 runes := []rune{rune(i)} 468 // We always use FCompatibility so that the CGJ insertion points do not 469 // change for repeated normalizations with different forms. 470 if exp := c.forms[FCompatibility].expandedDecomp; len(exp) > 0 { 471 runes = exp 472 } 473 // We consider runes that combine backwards to be non-starters for the 474 // purpose of Stream-Safe Text Processing. 
475 for _, r := range runes { 476 if cr := &chars[r]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward { 477 break 478 } 479 c.nLeadingNonStarters++ 480 } 481 for i := len(runes) - 1; i >= 0; i-- { 482 if cr := &chars[runes[i]]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward { 483 break 484 } 485 c.nTrailingNonStarters++ 486 } 487 if c.nTrailingNonStarters > 3 { 488 log.Fatalf("%U: Decomposition with more than 3 (%d) trailing modifiers (%U)", i, c.nTrailingNonStarters, runes) 489 } 490 491 if isHangul(rune(i)) { 492 c.nTrailingNonStarters = 2 493 if isHangulWithoutJamoT(rune(i)) { 494 c.nTrailingNonStarters = 1 495 } 496 } 497 498 if l, t := c.nLeadingNonStarters, c.nTrailingNonStarters; l > 0 && l != t { 499 log.Fatalf("%U: number of leading and trailing non-starters should be equal (%d vs %d)", i, l, t) 500 } 501 if t := c.nTrailingNonStarters; t > 3 { 502 log.Fatalf("%U: number of trailing non-starters is %d > 3", t) 503 } 504 } 505} 506 507func printBytes(w io.Writer, b []byte, name string) { 508 fmt.Fprintf(w, "// %s: %d bytes\n", name, len(b)) 509 fmt.Fprintf(w, "var %s = [...]byte {", name) 510 for i, c := range b { 511 switch { 512 case i%64 == 0: 513 fmt.Fprintf(w, "\n// Bytes %x - %x\n", i, i+63) 514 case i%8 == 0: 515 fmt.Fprintf(w, "\n") 516 } 517 fmt.Fprintf(w, "0x%.2X, ", c) 518 } 519 fmt.Fprint(w, "\n}\n\n") 520} 521 522// See forminfo.go for format. 
523func makeEntry(f *FormInfo, c *Char) uint16 { 524 e := uint16(0) 525 if r := c.codePoint; HangulBase <= r && r < HangulEnd { 526 e |= 0x40 527 } 528 if f.combinesForward { 529 e |= 0x20 530 } 531 if f.quickCheck[MDecomposed] == QCNo { 532 e |= 0x4 533 } 534 switch f.quickCheck[MComposed] { 535 case QCYes: 536 case QCNo: 537 e |= 0x10 538 case QCMaybe: 539 e |= 0x18 540 default: 541 log.Fatalf("Illegal quickcheck value %v.", f.quickCheck[MComposed]) 542 } 543 e |= uint16(c.nTrailingNonStarters) 544 return e 545} 546 547// decompSet keeps track of unique decompositions, grouped by whether 548// the decomposition is followed by a trailing and/or leading CCC. 549type decompSet [7]map[string]bool 550 551const ( 552 normalDecomp = iota 553 firstMulti 554 firstCCC 555 endMulti 556 firstLeadingCCC 557 firstCCCZeroExcept 558 firstStarterWithNLead 559 lastDecomp 560) 561 562var cname = []string{"firstMulti", "firstCCC", "endMulti", "firstLeadingCCC", "firstCCCZeroExcept", "firstStarterWithNLead", "lastDecomp"} 563 564func makeDecompSet() decompSet { 565 m := decompSet{} 566 for i := range m { 567 m[i] = make(map[string]bool) 568 } 569 return m 570} 571func (m *decompSet) insert(key int, s string) { 572 m[key][s] = true 573} 574 575func printCharInfoTables(w io.Writer) int { 576 mkstr := func(r rune, f *FormInfo) (int, string) { 577 d := f.expandedDecomp 578 s := string([]rune(d)) 579 if max := 1 << 6; len(s) >= max { 580 const msg = "%U: too many bytes in decomposition: %d >= %d" 581 log.Fatalf(msg, r, len(s), max) 582 } 583 head := uint8(len(s)) 584 if f.quickCheck[MComposed] != QCYes { 585 head |= 0x40 586 } 587 if f.combinesForward { 588 head |= 0x80 589 } 590 s = string([]byte{head}) + s 591 592 lccc := ccc(d[0]) 593 tccc := ccc(d[len(d)-1]) 594 cc := ccc(r) 595 if cc != 0 && lccc == 0 && tccc == 0 { 596 log.Fatalf("%U: trailing and leading ccc are 0 for non-zero ccc %d", r, cc) 597 } 598 if tccc < lccc && lccc != 0 { 599 const msg = "%U: lccc (%d) must be <= tcc 
(%d)" 600 log.Fatalf(msg, r, lccc, tccc) 601 } 602 index := normalDecomp 603 nTrail := chars[r].nTrailingNonStarters 604 nLead := chars[r].nLeadingNonStarters 605 if tccc > 0 || lccc > 0 || nTrail > 0 { 606 tccc <<= 2 607 tccc |= nTrail 608 s += string([]byte{tccc}) 609 index = endMulti 610 for _, r := range d[1:] { 611 if ccc(r) == 0 { 612 index = firstCCC 613 } 614 } 615 if lccc > 0 || nLead > 0 { 616 s += string([]byte{lccc}) 617 if index == firstCCC { 618 log.Fatalf("%U: multi-segment decomposition not supported for decompositions with leading CCC != 0", r) 619 } 620 index = firstLeadingCCC 621 } 622 if cc != lccc { 623 if cc != 0 { 624 log.Fatalf("%U: for lccc != ccc, expected ccc to be 0; was %d", r, cc) 625 } 626 index = firstCCCZeroExcept 627 } 628 } else if len(d) > 1 { 629 index = firstMulti 630 } 631 return index, s 632 } 633 634 decompSet := makeDecompSet() 635 const nLeadStr = "\x00\x01" // 0-byte length and tccc with nTrail. 636 decompSet.insert(firstStarterWithNLead, nLeadStr) 637 638 // Store the uniqued decompositions in a byte buffer, 639 // preceded by their byte length. 
640 for _, c := range chars { 641 for _, f := range c.forms { 642 if len(f.expandedDecomp) == 0 { 643 continue 644 } 645 if f.combinesBackward { 646 log.Fatalf("%U: combinesBackward and decompose", c.codePoint) 647 } 648 index, s := mkstr(c.codePoint, &f) 649 decompSet.insert(index, s) 650 } 651 } 652 653 decompositions := bytes.NewBuffer(make([]byte, 0, 10000)) 654 size := 0 655 positionMap := make(map[string]uint16) 656 decompositions.WriteString("\000") 657 fmt.Fprintln(w, "const (") 658 for i, m := range decompSet { 659 sa := []string{} 660 for s := range m { 661 sa = append(sa, s) 662 } 663 sort.Strings(sa) 664 for _, s := range sa { 665 p := decompositions.Len() 666 decompositions.WriteString(s) 667 positionMap[s] = uint16(p) 668 } 669 if cname[i] != "" { 670 fmt.Fprintf(w, "%s = 0x%X\n", cname[i], decompositions.Len()) 671 } 672 } 673 fmt.Fprintln(w, "maxDecomp = 0x8000") 674 fmt.Fprintln(w, ")") 675 b := decompositions.Bytes() 676 printBytes(w, b, "decomps") 677 size += len(b) 678 679 varnames := []string{"nfc", "nfkc"} 680 for i := 0; i < FNumberOfFormTypes; i++ { 681 trie := triegen.NewTrie(varnames[i]) 682 683 for r, c := range chars { 684 f := c.forms[i] 685 d := f.expandedDecomp 686 if len(d) != 0 { 687 _, key := mkstr(c.codePoint, &f) 688 trie.Insert(rune(r), uint64(positionMap[key])) 689 if c.ccc != ccc(d[0]) { 690 // We assume the lead ccc of a decomposition !=0 in this case. 691 if ccc(d[0]) == 0 { 692 log.Fatalf("Expected leading CCC to be non-zero; ccc is %d", c.ccc) 693 } 694 } 695 } else if c.nLeadingNonStarters > 0 && len(f.expandedDecomp) == 0 && c.ccc == 0 && !f.combinesBackward { 696 // Handle cases where it can't be detected that the nLead should be equal 697 // to nTrail. 
698 trie.Insert(c.codePoint, uint64(positionMap[nLeadStr])) 699 } else if v := makeEntry(&f, &c)<<8 | uint16(c.ccc); v != 0 { 700 trie.Insert(c.codePoint, uint64(0x8000|v)) 701 } 702 } 703 sz, err := trie.Gen(w, triegen.Compact(&normCompacter{name: varnames[i]})) 704 if err != nil { 705 log.Fatal(err) 706 } 707 size += sz 708 } 709 return size 710} 711 712func contains(sa []string, s string) bool { 713 for _, a := range sa { 714 if a == s { 715 return true 716 } 717 } 718 return false 719} 720 721func makeTables() { 722 w := &bytes.Buffer{} 723 724 size := 0 725 if *tablelist == "" { 726 return 727 } 728 list := strings.Split(*tablelist, ",") 729 if *tablelist == "all" { 730 list = []string{"recomp", "info"} 731 } 732 733 // Compute maximum decomposition size. 734 max := 0 735 for _, c := range chars { 736 if n := len(string(c.forms[FCompatibility].expandedDecomp)); n > max { 737 max = n 738 } 739 } 740 fmt.Fprintln(w, `import "sync"`) 741 fmt.Fprintln(w) 742 743 fmt.Fprintln(w, "const (") 744 fmt.Fprintln(w, "\t// Version is the Unicode edition from which the tables are derived.") 745 fmt.Fprintf(w, "\tVersion = %q\n", gen.UnicodeVersion()) 746 fmt.Fprintln(w) 747 fmt.Fprintln(w, "\t// MaxTransformChunkSize indicates the maximum number of bytes that Transform") 748 fmt.Fprintln(w, "\t// may need to write atomically for any Form. Making a destination buffer at") 749 fmt.Fprintln(w, "\t// least this size ensures that Transform can always make progress and that") 750 fmt.Fprintln(w, "\t// the user does not need to grow the buffer on an ErrShortDst.") 751 fmt.Fprintf(w, "\tMaxTransformChunkSize = %d+maxNonStarters*4\n", len(string(0x034F))+max) 752 fmt.Fprintln(w, ")\n") 753 754 // Print the CCC remap table. 
755 size += len(cccMap) 756 fmt.Fprintf(w, "var ccc = [%d]uint8{", len(cccMap)) 757 for i := 0; i < len(cccMap); i++ { 758 if i%8 == 0 { 759 fmt.Fprintln(w) 760 } 761 fmt.Fprintf(w, "%3d, ", cccMap[uint8(i)]) 762 } 763 fmt.Fprintln(w, "\n}\n") 764 765 if contains(list, "info") { 766 size += printCharInfoTables(w) 767 } 768 769 if contains(list, "recomp") { 770 // Note that we use 32 bit keys, instead of 64 bit. 771 // This clips the bits of three entries, but we know 772 // this won't cause a collision. The compiler will catch 773 // any changes made to UnicodeData.txt that introduces 774 // a collision. 775 // Note that the recomposition map for NFC and NFKC 776 // are identical. 777 778 // Recomposition map 779 nrentries := 0 780 for _, c := range chars { 781 f := c.forms[FCanonical] 782 if !f.isOneWay && len(f.decomp) > 0 { 783 nrentries++ 784 } 785 } 786 sz := nrentries * 8 787 size += sz 788 fmt.Fprintf(w, "// recompMap: %d bytes (entries only)\n", sz) 789 fmt.Fprintln(w, "var recompMap map[uint32]rune") 790 fmt.Fprintln(w, "var recompMapOnce sync.Once\n") 791 fmt.Fprintln(w, `const recompMapPacked = "" +`) 792 var buf [8]byte 793 for i, c := range chars { 794 f := c.forms[FCanonical] 795 d := f.decomp 796 if !f.isOneWay && len(d) > 0 { 797 key := uint32(uint16(d[0]))<<16 + uint32(uint16(d[1])) 798 binary.BigEndian.PutUint32(buf[:4], key) 799 binary.BigEndian.PutUint32(buf[4:], uint32(i)) 800 fmt.Fprintf(w, "\t\t%q + // 0x%.8X: 0x%.8X\n", string(buf[:]), key, uint32(i)) 801 } 802 } 803 // hack so we don't have to special case the trailing plus sign 804 fmt.Fprintf(w, ` ""`) 805 fmt.Fprintln(w) 806 } 807 808 fmt.Fprintf(w, "// Total size of tables: %dKB (%d bytes)\n", (size+512)/1024, size) 809 gen.WriteVersionedGoFile("tables.go", "norm", w.Bytes()) 810} 811 812func printChars() { 813 if *verbose { 814 for _, c := range chars { 815 if !c.isValid() || c.state == SMissing { 816 continue 817 } 818 fmt.Println(c) 819 } 820 } 821} 822 823// verifyComputed does 
various consistency tests. 824func verifyComputed() { 825 for i, c := range chars { 826 for _, f := range c.forms { 827 isNo := (f.quickCheck[MDecomposed] == QCNo) 828 if (len(f.decomp) > 0) != isNo && !isHangul(rune(i)) { 829 log.Fatalf("%U: NF*D QC must be No if rune decomposes", i) 830 } 831 832 isMaybe := f.quickCheck[MComposed] == QCMaybe 833 if f.combinesBackward != isMaybe { 834 log.Fatalf("%U: NF*C QC must be Maybe if combinesBackward", i) 835 } 836 if len(f.decomp) > 0 && f.combinesForward && isMaybe { 837 log.Fatalf("%U: NF*C QC must be Yes or No if combinesForward and decomposes", i) 838 } 839 840 if len(f.expandedDecomp) != 0 { 841 continue 842 } 843 if a, b := c.nLeadingNonStarters > 0, (c.ccc > 0 || f.combinesBackward); a != b { 844 // We accept these runes to be treated differently (it only affects 845 // segment breaking in iteration, most likely on improper use), but 846 // reconsider if more characters are added. 847 // U+FF9E HALFWIDTH KATAKANA VOICED SOUND MARK;Lm;0;L;<narrow> 3099;;;;N;;;;; 848 // U+FF9F HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK;Lm;0;L;<narrow> 309A;;;;N;;;;; 849 // U+3133 HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<compat> 11AA;;;;N;HANGUL LETTER GIYEOG SIOS;;;; 850 // U+318E HANGUL LETTER ARAEAE;Lo;0;L;<compat> 11A1;;;;N;HANGUL LETTER ALAE AE;;;; 851 // U+FFA3 HALFWIDTH HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<narrow> 3133;;;;N;HALFWIDTH HANGUL LETTER GIYEOG SIOS;;;; 852 // U+FFDC HALFWIDTH HANGUL LETTER I;Lo;0;L;<narrow> 3163;;;;N;;;;; 853 if i != 0xFF9E && i != 0xFF9F && !(0x3133 <= i && i <= 0x318E) && !(0xFFA3 <= i && i <= 0xFFDC) { 854 log.Fatalf("%U: nLead was %v; want %v", i, a, b) 855 } 856 } 857 } 858 nfc := c.forms[FCanonical] 859 nfkc := c.forms[FCompatibility] 860 if nfc.combinesBackward != nfkc.combinesBackward { 861 log.Fatalf("%U: Cannot combine combinesBackward\n", c.codePoint) 862 } 863 } 864} 865 866// Use values in DerivedNormalizationProps.txt to compare against the 867// values we computed. 
868// DerivedNormalizationProps.txt has form: 869// 00C0..00C5 ; NFD_QC; N # ... 870// 0374 ; NFD_QC; N # ... 871// See https://unicode.org/reports/tr44/ for full explanation 872func testDerived() { 873 f := gen.OpenUCDFile("DerivedNormalizationProps.txt") 874 defer f.Close() 875 p := ucd.New(f) 876 for p.Next() { 877 r := p.Rune(0) 878 c := &chars[r] 879 880 var ftype, mode int 881 qt := p.String(1) 882 switch qt { 883 case "NFC_QC": 884 ftype, mode = FCanonical, MComposed 885 case "NFD_QC": 886 ftype, mode = FCanonical, MDecomposed 887 case "NFKC_QC": 888 ftype, mode = FCompatibility, MComposed 889 case "NFKD_QC": 890 ftype, mode = FCompatibility, MDecomposed 891 default: 892 continue 893 } 894 var qr QCResult 895 switch p.String(2) { 896 case "Y": 897 qr = QCYes 898 case "N": 899 qr = QCNo 900 case "M": 901 qr = QCMaybe 902 default: 903 log.Fatalf(`Unexpected quick check value "%s"`, p.String(2)) 904 } 905 if got := c.forms[ftype].quickCheck[mode]; got != qr { 906 log.Printf("%U: FAILED %s (was %v need %v)\n", r, qt, got, qr) 907 } 908 c.forms[ftype].verified[mode] = true 909 } 910 if err := p.Err(); err != nil { 911 log.Fatal(err) 912 } 913 // Any unspecified value must be QCYes. Verify this. 
914 for i, c := range chars { 915 for j, fd := range c.forms { 916 for k, qr := range fd.quickCheck { 917 if !fd.verified[k] && qr != QCYes { 918 m := "%U: FAIL F:%d M:%d (was %v need Yes) %s\n" 919 log.Printf(m, i, j, k, qr, c.name) 920 } 921 } 922 } 923 } 924} 925 926var testHeader = `const ( 927 Yes = iota 928 No 929 Maybe 930) 931 932type formData struct { 933 qc uint8 934 combinesForward bool 935 decomposition string 936} 937 938type runeData struct { 939 r rune 940 ccc uint8 941 nLead uint8 942 nTrail uint8 943 f [2]formData // 0: canonical; 1: compatibility 944} 945 946func f(qc uint8, cf bool, dec string) [2]formData { 947 return [2]formData{{qc, cf, dec}, {qc, cf, dec}} 948} 949 950func g(qc, qck uint8, cf, cfk bool, d, dk string) [2]formData { 951 return [2]formData{{qc, cf, d}, {qck, cfk, dk}} 952} 953 954var testData = []runeData{ 955` 956 957func printTestdata() { 958 type lastInfo struct { 959 ccc uint8 960 nLead uint8 961 nTrail uint8 962 f string 963 } 964 965 last := lastInfo{} 966 w := &bytes.Buffer{} 967 fmt.Fprintf(w, testHeader) 968 for r, c := range chars { 969 f := c.forms[FCanonical] 970 qc, cf, d := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp) 971 f = c.forms[FCompatibility] 972 qck, cfk, dk := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp) 973 s := "" 974 if d == dk && qc == qck && cf == cfk { 975 s = fmt.Sprintf("f(%s, %v, %q)", qc, cf, d) 976 } else { 977 s = fmt.Sprintf("g(%s, %s, %v, %v, %q, %q)", qc, qck, cf, cfk, d, dk) 978 } 979 current := lastInfo{c.ccc, c.nLeadingNonStarters, c.nTrailingNonStarters, s} 980 if last != current { 981 fmt.Fprintf(w, "\t{0x%x, %d, %d, %d, %s},\n", r, c.origCCC, c.nLeadingNonStarters, c.nTrailingNonStarters, s) 982 last = current 983 } 984 } 985 fmt.Fprintln(w, "}") 986 gen.WriteVersionedGoFile("data_test.go", "norm", w.Bytes()) 987} 988