1// Copyright 2013 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// +build ignore 6 7// Language tag table generator. 8// Data read from the web. 9 10package main 11 12import ( 13 "bufio" 14 "flag" 15 "fmt" 16 "io" 17 "io/ioutil" 18 "log" 19 "math" 20 "reflect" 21 "regexp" 22 "sort" 23 "strconv" 24 "strings" 25 26 "golang.org/x/text/internal/gen" 27 "golang.org/x/text/internal/tag" 28 "golang.org/x/text/unicode/cldr" 29) 30 31var ( 32 test = flag.Bool("test", 33 false, 34 "test existing tables; can be used to compare web data with package data.") 35 outputFile = flag.String("output", 36 "tables.go", 37 "output file for generated tables") 38) 39 40var comment = []string{ 41 ` 42lang holds an alphabetically sorted list of ISO-639 language identifiers. 43All entries are 4 bytes. The index of the identifier (divided by 4) is the language tag. 44For 2-byte language identifiers, the two successive bytes have the following meaning: 45 - if the first letter of the 2- and 3-letter ISO codes are the same: 46 the second and third letter of the 3-letter ISO code. 47 - otherwise: a 0 and a by 2 bits right-shifted index into altLangISO3. 48For 3-byte language identifiers the 4th byte is 0.`, 49 ` 50langNoIndex is a bit vector of all 3-letter language codes that are not used as an index 51in lookup tables. The language ids for these language codes are derived directly 52from the letters and are not consecutive.`, 53 ` 54altLangISO3 holds an alphabetically sorted list of 3-letter language code alternatives 55to 2-letter language codes that cannot be derived using the method described above. 56Each 3-letter code is followed by its 1-byte langID.`, 57 ` 58altLangIndex is used to convert indexes in altLangISO3 to langIDs.`, 59 ` 60AliasMap maps langIDs to their suggested replacements.`, 61 ` 62script is an alphabetically sorted list of ISO 15924 codes. The index 63of the script in the string, divided by 4, is the internal scriptID.`, 64 ` 65isoRegionOffset needs to be added to the index of regionISO to obtain the regionID 66for 2-letter ISO codes. (The first isoRegionOffset regionIDs are reserved for 67the UN.M49 codes used for groups.)`, 68 ` 69regionISO holds a list of alphabetically sorted 2-letter ISO region codes. 70Each 2-letter codes is followed by two bytes with the following meaning: 71 - [A-Z}{2}: the first letter of the 2-letter code plus these two 72 letters form the 3-letter ISO code. 73 - 0, n: index into altRegionISO3.`, 74 ` 75regionTypes defines the status of a region for various standards.`, 76 ` 77m49 maps regionIDs to UN.M49 codes. The first isoRegionOffset entries are 78codes indicating collections of regions.`, 79 ` 80m49Index gives indexes into fromM49 based on the three most significant bits 81of a 10-bit UN.M49 code. To search an UN.M49 code in fromM49, search in 82 fromM49[m49Index[msb39(code)]:m49Index[msb3(code)+1]] 83for an entry where the first 7 bits match the 7 lsb of the UN.M49 code. 84The region code is stored in the 9 lsb of the indexed value.`, 85 ` 86fromM49 contains entries to map UN.M49 codes to regions. See m49Index for details.`, 87 ` 88altRegionISO3 holds a list of 3-letter region codes that cannot be 89mapped to 2-letter codes using the default algorithm. This is a short list.`, 90 ` 91altRegionIDs holds a list of regionIDs the positions of which match those 92of the 3-letter ISO codes in altRegionISO3.`, 93 ` 94variantNumSpecialized is the number of specialized variants in variants.`, 95 ` 96suppressScript is an index from langID to the dominant script for that language, 97if it exists. If a script is given, it should be suppressed from the language tag.`, 98 ` 99likelyLang is a lookup table, indexed by langID, for the most likely 100scripts and regions given incomplete information. If more entries exist for a 101given language, region and script are the index and size respectively 102of the list in likelyLangList.`, 103 ` 104likelyLangList holds lists info associated with likelyLang.`, 105 ` 106likelyRegion is a lookup table, indexed by regionID, for the most likely 107languages and scripts given incomplete information. If more entries exist 108for a given regionID, lang and script are the index and size respectively 109of the list in likelyRegionList. 110TODO: exclude containers and user-definable regions from the list.`, 111 ` 112likelyRegionList holds lists info associated with likelyRegion.`, 113 ` 114likelyScript is a lookup table, indexed by scriptID, for the most likely 115languages and regions given a script.`, 116 ` 117nRegionGroups is the number of region groups.`, 118 ` 119regionInclusion maps region identifiers to sets of regions in regionInclusionBits, 120where each set holds all groupings that are directly connected in a region 121containment graph.`, 122 ` 123regionInclusionBits is an array of bit vectors where every vector represents 124a set of region groupings. These sets are used to compute the distance 125between two regions for the purpose of language matching.`, 126 ` 127regionInclusionNext marks, for each entry in regionInclusionBits, the set of 128all groups that are reachable from the groups set in the respective entry.`, 129} 130 131// TODO: consider changing some of these structures to tries. This can reduce 132// memory, but may increase the need for memory allocations. This could be 133// mitigated if we can piggyback on language tags for common cases. 134 135func failOnError(e error) { 136 if e != nil { 137 log.Panic(e) 138 } 139} 140 141type setType int 142 143const ( 144 Indexed setType = 1 + iota // all elements must be of same size 145 Linear 146) 147 148type stringSet struct { 149 s []string 150 sorted, frozen bool 151 152 // We often need to update values after the creation of an index is completed. 153 // We include a convenience map for keeping track of this. 154 update map[string]string 155 typ setType // used for checking. 156} 157 158func (ss *stringSet) clone() stringSet { 159 c := *ss 160 c.s = append([]string(nil), c.s...) 161 return c 162} 163 164func (ss *stringSet) setType(t setType) { 165 if ss.typ != t && ss.typ != 0 { 166 log.Panicf("type %d cannot be assigned as it was already %d", t, ss.typ) 167 } 168} 169 170// parse parses a whitespace-separated string and initializes ss with its 171// components. 172func (ss *stringSet) parse(s string) { 173 scan := bufio.NewScanner(strings.NewReader(s)) 174 scan.Split(bufio.ScanWords) 175 for scan.Scan() { 176 ss.add(scan.Text()) 177 } 178} 179 180func (ss *stringSet) assertChangeable() { 181 if ss.frozen { 182 log.Panic("attempt to modify a frozen stringSet") 183 } 184} 185 186func (ss *stringSet) add(s string) { 187 ss.assertChangeable() 188 ss.s = append(ss.s, s) 189 ss.sorted = ss.frozen 190} 191 192func (ss *stringSet) freeze() { 193 ss.compact() 194 ss.frozen = true 195} 196 197func (ss *stringSet) compact() { 198 if ss.sorted { 199 return 200 } 201 a := ss.s 202 sort.Strings(a) 203 k := 0 204 for i := 1; i < len(a); i++ { 205 if a[k] != a[i] { 206 a[k+1] = a[i] 207 k++ 208 } 209 } 210 ss.s = a[:k+1] 211 ss.sorted = ss.frozen 212} 213 214type funcSorter struct { 215 fn func(a, b string) bool 216 sort.StringSlice 217} 218 219func (s funcSorter) Less(i, j int) bool { 220 return s.fn(s.StringSlice[i], s.StringSlice[j]) 221} 222 223func (ss *stringSet) sortFunc(f func(a, b string) bool) { 224 ss.compact() 225 sort.Sort(funcSorter{f, sort.StringSlice(ss.s)}) 226} 227 228func (ss *stringSet) remove(s string) { 229 ss.assertChangeable() 230 if i, ok := ss.find(s); ok { 231 copy(ss.s[i:], ss.s[i+1:]) 232 ss.s = ss.s[:len(ss.s)-1] 233 } 234} 235 236func (ss *stringSet) replace(ol, nu string) { 237 ss.s[ss.index(ol)] = nu 238 ss.sorted = ss.frozen 239} 240 241func (ss *stringSet) index(s string) int { 242 ss.setType(Indexed) 243 i, ok := ss.find(s) 244 if !ok { 245 if i < len(ss.s) { 246 log.Panicf("find: item %q is not in list. Closest match is %q.", s, ss.s[i]) 247 } 248 log.Panicf("find: item %q is not in list", s) 249 250 } 251 return i 252} 253 254func (ss *stringSet) find(s string) (int, bool) { 255 ss.compact() 256 i := sort.SearchStrings(ss.s, s) 257 return i, i != len(ss.s) && ss.s[i] == s 258} 259 260func (ss *stringSet) slice() []string { 261 ss.compact() 262 return ss.s 263} 264 265func (ss *stringSet) updateLater(v, key string) { 266 if ss.update == nil { 267 ss.update = map[string]string{} 268 } 269 ss.update[v] = key 270} 271 272// join joins the string and ensures that all entries are of the same length. 273func (ss *stringSet) join() string { 274 ss.setType(Indexed) 275 n := len(ss.s[0]) 276 for _, s := range ss.s { 277 if len(s) != n { 278 log.Panicf("join: not all entries are of the same length: %q", s) 279 } 280 } 281 ss.s = append(ss.s, strings.Repeat("\xff", n)) 282 return strings.Join(ss.s, "") 283} 284 285// ianaEntry holds information for an entry in the IANA Language Subtag Repository. 286// All types use the same entry. 287// See http://tools.ietf.org/html/bcp47#section-5.1 for a description of the various 288// fields. 289type ianaEntry struct { 290 typ string 291 description []string 292 scope string 293 added string 294 preferred string 295 deprecated string 296 suppressScript string 297 macro string 298 prefix []string 299} 300 301type builder struct { 302 w *gen.CodeWriter 303 hw io.Writer // MultiWriter for w and w.Hash 304 data *cldr.CLDR 305 supp *cldr.SupplementalData 306 307 // indices 308 locale stringSet // common locales 309 lang stringSet // canonical language ids (2 or 3 letter ISO codes) with data 310 langNoIndex stringSet // 3-letter ISO codes with no associated data 311 script stringSet // 4-letter ISO codes 312 region stringSet // 2-letter ISO or 3-digit UN M49 codes 313 variant stringSet // 4-8-alphanumeric variant code. 314 315 // Region codes that are groups with their corresponding group IDs. 316 groups map[int]index 317 318 // langInfo 319 registry map[string]*ianaEntry 320} 321 322type index uint 323 324func newBuilder(w *gen.CodeWriter) *builder { 325 r := gen.OpenCLDRCoreZip() 326 defer r.Close() 327 d := &cldr.Decoder{} 328 data, err := d.DecodeZip(r) 329 failOnError(err) 330 b := builder{ 331 w: w, 332 hw: io.MultiWriter(w, w.Hash), 333 data: data, 334 supp: data.Supplemental(), 335 } 336 b.parseRegistry() 337 return &b 338} 339 340func (b *builder) parseRegistry() { 341 r := gen.OpenIANAFile("assignments/language-subtag-registry") 342 defer r.Close() 343 b.registry = make(map[string]*ianaEntry) 344 345 scan := bufio.NewScanner(r) 346 scan.Split(bufio.ScanWords) 347 var record *ianaEntry 348 for more := scan.Scan(); more; { 349 key := scan.Text() 350 more = scan.Scan() 351 value := scan.Text() 352 switch key { 353 case "Type:": 354 record = &ianaEntry{typ: value} 355 case "Subtag:", "Tag:": 356 if s := strings.SplitN(value, "..", 2); len(s) > 1 { 357 for a := s[0]; a <= s[1]; a = inc(a) { 358 b.addToRegistry(a, record) 359 } 360 } else { 361 b.addToRegistry(value, record) 362 } 363 case "Suppress-Script:": 364 record.suppressScript = value 365 case "Added:": 366 record.added = value 367 case "Deprecated:": 368 record.deprecated = value 369 case "Macrolanguage:": 370 record.macro = value 371 case "Preferred-Value:": 372 record.preferred = value 373 case "Prefix:": 374 record.prefix = append(record.prefix, value) 375 case "Scope:": 376 record.scope = value 377 case "Description:": 378 buf := []byte(value) 379 for more = scan.Scan(); more; more = scan.Scan() { 380 b := scan.Bytes() 381 if b[0] == '%' || b[len(b)-1] == ':' { 382 break 383 } 384 buf = append(buf, ' ') 385 buf = append(buf, b...) 386 } 387 record.description = append(record.description, string(buf)) 388 continue 389 default: 390 continue 391 } 392 more = scan.Scan() 393 } 394 if scan.Err() != nil { 395 log.Panic(scan.Err()) 396 } 397} 398 399func (b *builder) addToRegistry(key string, entry *ianaEntry) { 400 if info, ok := b.registry[key]; ok { 401 if info.typ != "language" || entry.typ != "extlang" { 402 log.Fatalf("parseRegistry: tag %q already exists", key) 403 } 404 } else { 405 b.registry[key] = entry 406 } 407} 408 409var commentIndex = make(map[string]string) 410 411func init() { 412 for _, s := range comment { 413 key := strings.TrimSpace(strings.SplitN(s, " ", 2)[0]) 414 commentIndex[key] = s 415 } 416} 417 418func (b *builder) comment(name string) { 419 if s := commentIndex[name]; len(s) > 0 { 420 b.w.WriteComment(s) 421 } else { 422 fmt.Fprintln(b.w) 423 } 424} 425 426func (b *builder) pf(f string, x ...interface{}) { 427 fmt.Fprintf(b.hw, f, x...) 428 fmt.Fprint(b.hw, "\n") 429} 430 431func (b *builder) p(x ...interface{}) { 432 fmt.Fprintln(b.hw, x...) 433} 434 435func (b *builder) addSize(s int) { 436 b.w.Size += s 437 b.pf("// Size: %d bytes", s) 438} 439 440func (b *builder) writeConst(name string, x interface{}) { 441 b.comment(name) 442 b.w.WriteConst(name, x) 443} 444 445// writeConsts computes f(v) for all v in values and writes the results 446// as constants named _v to a single constant block. 447func (b *builder) writeConsts(f func(string) int, values ...string) { 448 b.pf("const (") 449 for _, v := range values { 450 b.pf("\t_%s = %v", v, f(v)) 451 } 452 b.pf(")") 453} 454 455// writeType writes the type of the given value, which must be a struct. 456func (b *builder) writeType(value interface{}) { 457 b.comment(reflect.TypeOf(value).Name()) 458 b.w.WriteType(value) 459} 460 461func (b *builder) writeSlice(name string, ss interface{}) { 462 b.writeSliceAddSize(name, 0, ss) 463} 464 465func (b *builder) writeSliceAddSize(name string, extraSize int, ss interface{}) { 466 b.comment(name) 467 b.w.Size += extraSize 468 v := reflect.ValueOf(ss) 469 t := v.Type().Elem() 470 b.pf("// Size: %d bytes, %d elements", v.Len()*int(t.Size())+extraSize, v.Len()) 471 472 fmt.Fprintf(b.w, "var %s = ", name) 473 b.w.WriteArray(ss) 474 b.p() 475} 476 477type FromTo struct { 478 From, To uint16 479} 480 481func (b *builder) writeSortedMap(name string, ss *stringSet, index func(s string) uint16) { 482 ss.sortFunc(func(a, b string) bool { 483 return index(a) < index(b) 484 }) 485 m := []FromTo{} 486 for _, s := range ss.s { 487 m = append(m, FromTo{index(s), index(ss.update[s])}) 488 } 489 b.writeSlice(name, m) 490} 491 492const base = 'z' - 'a' + 1 493 494func strToInt(s string) uint { 495 v := uint(0) 496 for i := 0; i < len(s); i++ { 497 v *= base 498 v += uint(s[i] - 'a') 499 } 500 return v 501} 502 503// converts the given integer to the original ASCII string passed to strToInt. 504// len(s) must match the number of characters obtained. 505func intToStr(v uint, s []byte) { 506 for i := len(s) - 1; i >= 0; i-- { 507 s[i] = byte(v%base) + 'a' 508 v /= base 509 } 510} 511 512func (b *builder) writeBitVector(name string, ss []string) { 513 vec := make([]uint8, int(math.Ceil(math.Pow(base, float64(len(ss[0])))/8))) 514 for _, s := range ss { 515 v := strToInt(s) 516 vec[v/8] |= 1 << (v % 8) 517 } 518 b.writeSlice(name, vec) 519} 520 521// TODO: convert this type into a list or two-stage trie. 522func (b *builder) writeMapFunc(name string, m map[string]string, f func(string) uint16) { 523 b.comment(name) 524 v := reflect.ValueOf(m) 525 sz := v.Len() * (2 + int(v.Type().Key().Size())) 526 for _, k := range m { 527 sz += len(k) 528 } 529 b.addSize(sz) 530 keys := []string{} 531 b.pf(`var %s = map[string]uint16{`, name) 532 for k := range m { 533 keys = append(keys, k) 534 } 535 sort.Strings(keys) 536 for _, k := range keys { 537 b.pf("\t%q: %v,", k, f(m[k])) 538 } 539 b.p("}") 540} 541 542func (b *builder) writeMap(name string, m interface{}) { 543 b.comment(name) 544 v := reflect.ValueOf(m) 545 sz := v.Len() * (2 + int(v.Type().Key().Size()) + int(v.Type().Elem().Size())) 546 b.addSize(sz) 547 f := strings.FieldsFunc(fmt.Sprintf("%#v", m), func(r rune) bool { 548 return strings.IndexRune("{}, ", r) != -1 549 }) 550 sort.Strings(f[1:]) 551 b.pf(`var %s = %s{`, name, f[0]) 552 for _, kv := range f[1:] { 553 b.pf("\t%s,", kv) 554 } 555 b.p("}") 556} 557 558func (b *builder) langIndex(s string) uint16 { 559 if s == "und" { 560 return 0 561 } 562 if i, ok := b.lang.find(s); ok { 563 return uint16(i) 564 } 565 return uint16(strToInt(s)) + uint16(len(b.lang.s)) 566} 567 568// inc advances the string to its lexicographical successor. 569func inc(s string) string { 570 const maxTagLength = 4 571 var buf [maxTagLength]byte 572 intToStr(strToInt(strings.ToLower(s))+1, buf[:len(s)]) 573 for i := 0; i < len(s); i++ { 574 if s[i] <= 'Z' { 575 buf[i] -= 'a' - 'A' 576 } 577 } 578 return string(buf[:len(s)]) 579} 580 581func (b *builder) parseIndices() { 582 meta := b.supp.Metadata 583 584 for k, v := range b.registry { 585 var ss *stringSet 586 switch v.typ { 587 case "language": 588 if len(k) == 2 || v.suppressScript != "" || v.scope == "special" { 589 b.lang.add(k) 590 continue 591 } else { 592 ss = &b.langNoIndex 593 } 594 case "region": 595 ss = &b.region 596 case "script": 597 ss = &b.script 598 case "variant": 599 ss = &b.variant 600 default: 601 continue 602 } 603 ss.add(k) 604 } 605 // Include any language for which there is data. 606 for _, lang := range b.data.Locales() { 607 if x := b.data.RawLDML(lang); false || 608 x.LocaleDisplayNames != nil || 609 x.Characters != nil || 610 x.Delimiters != nil || 611 x.Measurement != nil || 612 x.Dates != nil || 613 x.Numbers != nil || 614 x.Units != nil || 615 x.ListPatterns != nil || 616 x.Collations != nil || 617 x.Segmentations != nil || 618 x.Rbnf != nil || 619 x.Annotations != nil || 620 x.Metadata != nil { 621 622 from := strings.Split(lang, "_") 623 if lang := from[0]; lang != "root" { 624 b.lang.add(lang) 625 } 626 } 627 } 628 // Include locales for plural rules, which uses a different structure. 629 for _, plurals := range b.data.Supplemental().Plurals { 630 for _, rules := range plurals.PluralRules { 631 for _, lang := range strings.Split(rules.Locales, " ") { 632 if lang = strings.Split(lang, "_")[0]; lang != "root" { 633 b.lang.add(lang) 634 } 635 } 636 } 637 } 638 // Include languages in likely subtags. 639 for _, m := range b.supp.LikelySubtags.LikelySubtag { 640 from := strings.Split(m.From, "_") 641 b.lang.add(from[0]) 642 } 643 // Include ISO-639 alpha-3 bibliographic entries. 644 for _, a := range meta.Alias.LanguageAlias { 645 if a.Reason == "bibliographic" { 646 b.langNoIndex.add(a.Type) 647 } 648 } 649 // Include regions in territoryAlias (not all are in the IANA registry!) 650 for _, reg := range b.supp.Metadata.Alias.TerritoryAlias { 651 if len(reg.Type) == 2 { 652 b.region.add(reg.Type) 653 } 654 } 655 656 for _, s := range b.lang.s { 657 if len(s) == 3 { 658 b.langNoIndex.remove(s) 659 } 660 } 661 b.writeConst("NumLanguages", len(b.lang.slice())+len(b.langNoIndex.slice())) 662 b.writeConst("NumScripts", len(b.script.slice())) 663 b.writeConst("NumRegions", len(b.region.slice())) 664 665 // Add dummy codes at the start of each list to represent "unspecified". 666 b.lang.add("---") 667 b.script.add("----") 668 b.region.add("---") 669 670 // common locales 671 b.locale.parse(meta.DefaultContent.Locales) 672} 673 674// TODO: region inclusion data will probably not be use used in future matchers. 675 676func (b *builder) computeRegionGroups() { 677 b.groups = make(map[int]index) 678 679 // Create group indices. 680 for i := 1; b.region.s[i][0] < 'A'; i++ { // Base M49 indices on regionID. 681 b.groups[i] = index(len(b.groups)) 682 } 683 for _, g := range b.supp.TerritoryContainment.Group { 684 // Skip UN and EURO zone as they are flattening the containment 685 // relationship. 686 if g.Type == "EZ" || g.Type == "UN" { 687 continue 688 } 689 group := b.region.index(g.Type) 690 if _, ok := b.groups[group]; !ok { 691 b.groups[group] = index(len(b.groups)) 692 } 693 } 694 if len(b.groups) > 64 { 695 log.Fatalf("only 64 groups supported, found %d", len(b.groups)) 696 } 697 b.writeConst("nRegionGroups", len(b.groups)) 698} 699 700var langConsts = []string{ 701 "af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", 702 "et", "fa", "fi", "fil", "fr", "gu", "he", "hi", "hr", "hu", "hy", "id", "is", 703 "it", "ja", "ka", "kk", "km", "kn", "ko", "ky", "lo", "lt", "lv", "mk", "ml", 704 "mn", "mo", "mr", "ms", "mul", "my", "nb", "ne", "nl", "no", "pa", "pl", "pt", 705 "ro", "ru", "sh", "si", "sk", "sl", "sq", "sr", "sv", "sw", "ta", "te", "th", 706 "tl", "tn", "tr", "uk", "ur", "uz", "vi", "zh", "zu", 707 708 // constants for grandfathered tags (if not already defined) 709 "jbo", "ami", "bnn", "hak", "tlh", "lb", "nv", "pwn", "tao", "tay", "tsu", 710 "nn", "sfb", "vgt", "sgg", "cmn", "nan", "hsn", 711} 712 713// writeLanguage generates all tables needed for language canonicalization. 714func (b *builder) writeLanguage() { 715 meta := b.supp.Metadata 716 717 b.writeConst("nonCanonicalUnd", b.lang.index("und")) 718 b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...) 719 b.writeConst("langPrivateStart", b.langIndex("qaa")) 720 b.writeConst("langPrivateEnd", b.langIndex("qtz")) 721 722 // Get language codes that need to be mapped (overlong 3-letter codes, 723 // deprecated 2-letter codes, legacy and grandfathered tags.) 724 langAliasMap := stringSet{} 725 aliasTypeMap := map[string]AliasType{} 726 727 // altLangISO3 get the alternative ISO3 names that need to be mapped. 728 altLangISO3 := stringSet{} 729 // Add dummy start to avoid the use of index 0. 730 altLangISO3.add("---") 731 altLangISO3.updateLater("---", "aa") 732 733 lang := b.lang.clone() 734 for _, a := range meta.Alias.LanguageAlias { 735 if a.Replacement == "" { 736 a.Replacement = "und" 737 } 738 // TODO: support mapping to tags 739 repl := strings.SplitN(a.Replacement, "_", 2)[0] 740 if a.Reason == "overlong" { 741 if len(a.Replacement) == 2 && len(a.Type) == 3 { 742 lang.updateLater(a.Replacement, a.Type) 743 } 744 } else if len(a.Type) <= 3 { 745 switch a.Reason { 746 case "macrolanguage": 747 aliasTypeMap[a.Type] = Macro 748 case "deprecated": 749 // handled elsewhere 750 continue 751 case "bibliographic", "legacy": 752 if a.Type == "no" { 753 continue 754 } 755 aliasTypeMap[a.Type] = Legacy 756 default: 757 log.Fatalf("new %s alias: %s", a.Reason, a.Type) 758 } 759 langAliasMap.add(a.Type) 760 langAliasMap.updateLater(a.Type, repl) 761 } 762 } 763 // Manually add the mapping of "nb" (Norwegian) to its macro language. 764 // This can be removed if CLDR adopts this change. 765 langAliasMap.add("nb") 766 langAliasMap.updateLater("nb", "no") 767 aliasTypeMap["nb"] = Macro 768 769 for k, v := range b.registry { 770 // Also add deprecated values for 3-letter ISO codes, which CLDR omits. 771 if v.typ == "language" && v.deprecated != "" && v.preferred != "" { 772 langAliasMap.add(k) 773 langAliasMap.updateLater(k, v.preferred) 774 aliasTypeMap[k] = Deprecated 775 } 776 } 777 // Fix CLDR mappings. 778 lang.updateLater("tl", "tgl") 779 lang.updateLater("sh", "hbs") 780 lang.updateLater("mo", "mol") 781 lang.updateLater("no", "nor") 782 lang.updateLater("tw", "twi") 783 lang.updateLater("nb", "nob") 784 lang.updateLater("ak", "aka") 785 lang.updateLater("bh", "bih") 786 787 // Ensure that each 2-letter code is matched with a 3-letter code. 788 for _, v := range lang.s[1:] { 789 s, ok := lang.update[v] 790 if !ok { 791 if s, ok = lang.update[langAliasMap.update[v]]; !ok { 792 continue 793 } 794 lang.update[v] = s 795 } 796 if v[0] != s[0] { 797 altLangISO3.add(s) 798 altLangISO3.updateLater(s, v) 799 } 800 } 801 802 // Complete canonicalized language tags. 803 lang.freeze() 804 for i, v := range lang.s { 805 // We can avoid these manual entries by using the IANA registry directly. 806 // Seems easier to update the list manually, as changes are rare. 807 // The panic in this loop will trigger if we miss an entry. 808 add := "" 809 if s, ok := lang.update[v]; ok { 810 if s[0] == v[0] { 811 add = s[1:] 812 } else { 813 add = string([]byte{0, byte(altLangISO3.index(s))}) 814 } 815 } else if len(v) == 3 { 816 add = "\x00" 817 } else { 818 log.Panicf("no data for long form of %q", v) 819 } 820 lang.s[i] += add 821 } 822 b.writeConst("lang", tag.Index(lang.join())) 823 824 b.writeConst("langNoIndexOffset", len(b.lang.s)) 825 826 // space of all valid 3-letter language identifiers. 827 b.writeBitVector("langNoIndex", b.langNoIndex.slice()) 828 829 altLangIndex := []uint16{} 830 for i, s := range altLangISO3.slice() { 831 altLangISO3.s[i] += string([]byte{byte(len(altLangIndex))}) 832 if i > 0 { 833 idx := b.lang.index(altLangISO3.update[s]) 834 altLangIndex = append(altLangIndex, uint16(idx)) 835 } 836 } 837 b.writeConst("altLangISO3", tag.Index(altLangISO3.join())) 838 b.writeSlice("altLangIndex", altLangIndex) 839 840 b.writeSortedMap("AliasMap", &langAliasMap, b.langIndex) 841 types := make([]AliasType, len(langAliasMap.s)) 842 for i, s := range langAliasMap.s { 843 types[i] = aliasTypeMap[s] 844 } 845 b.writeSlice("AliasTypes", types) 846} 847 848var scriptConsts = []string{ 849 "Latn", "Hani", "Hans", "Hant", "Qaaa", "Qaai", "Qabx", "Zinh", "Zyyy", 850 "Zzzz", 851} 852 853func (b *builder) writeScript() { 854 b.writeConsts(b.script.index, scriptConsts...) 855 b.writeConst("script", tag.Index(b.script.join())) 856 857 supp := make([]uint8, len(b.lang.slice())) 858 for i, v := range b.lang.slice()[1:] { 859 if sc := b.registry[v].suppressScript; sc != "" { 860 supp[i+1] = uint8(b.script.index(sc)) 861 } 862 } 863 b.writeSlice("suppressScript", supp) 864 865 // There is only one deprecated script in CLDR. This value is hard-coded. 866 // We check here if the code must be updated. 867 for _, a := range b.supp.Metadata.Alias.ScriptAlias { 868 if a.Type != "Qaai" { 869 log.Panicf("unexpected deprecated stript %q", a.Type) 870 } 871 } 872} 873 874func parseM49(s string) int16 { 875 if len(s) == 0 { 876 return 0 877 } 878 v, err := strconv.ParseUint(s, 10, 10) 879 failOnError(err) 880 return int16(v) 881} 882 883var regionConsts = []string{ 884 "001", "419", "BR", "CA", "ES", "GB", "MD", "PT", "UK", "US", 885 "ZZ", "XA", "XC", "XK", // Unofficial tag for Kosovo. 886} 887 888func (b *builder) writeRegion() { 889 b.writeConsts(b.region.index, regionConsts...) 890 891 isoOffset := b.region.index("AA") 892 m49map := make([]int16, len(b.region.slice())) 893 fromM49map := make(map[int16]int) 894 altRegionISO3 := "" 895 altRegionIDs := []uint16{} 896 897 b.writeConst("isoRegionOffset", isoOffset) 898 899 // 2-letter region lookup and mapping to numeric codes. 900 regionISO := b.region.clone() 901 regionISO.s = regionISO.s[isoOffset:] 902 regionISO.sorted = false 903 904 regionTypes := make([]byte, len(b.region.s)) 905 906 // Is the region valid BCP 47? 907 for s, e := range b.registry { 908 if len(s) == 2 && s == strings.ToUpper(s) { 909 i := b.region.index(s) 910 for _, d := range e.description { 911 if strings.Contains(d, "Private use") { 912 regionTypes[i] = iso3166UserAssigned 913 } 914 } 915 regionTypes[i] |= bcp47Region 916 } 917 } 918 919 // Is the region a valid ccTLD? 920 r := gen.OpenIANAFile("domains/root/db") 921 defer r.Close() 922 923 buf, err := ioutil.ReadAll(r) 924 failOnError(err) 925 re := regexp.MustCompile(`"/domains/root/db/([a-z]{2}).html"`) 926 for _, m := range re.FindAllSubmatch(buf, -1) { 927 i := b.region.index(strings.ToUpper(string(m[1]))) 928 regionTypes[i] |= ccTLD 929 } 930 931 b.writeSlice("regionTypes", regionTypes) 932 933 iso3Set := make(map[string]int) 934 update := func(iso2, iso3 string) { 935 i := regionISO.index(iso2) 936 if j, ok := iso3Set[iso3]; !ok && iso3[0] == iso2[0] { 937 regionISO.s[i] += iso3[1:] 938 iso3Set[iso3] = -1 939 } else { 940 if ok && j >= 0 { 941 regionISO.s[i] += string([]byte{0, byte(j)}) 942 } else { 943 iso3Set[iso3] = len(altRegionISO3) 944 regionISO.s[i] += string([]byte{0, byte(len(altRegionISO3))}) 945 altRegionISO3 += iso3 946 altRegionIDs = append(altRegionIDs, uint16(isoOffset+i)) 947 } 948 } 949 } 950 for _, tc := range b.supp.CodeMappings.TerritoryCodes { 951 i := regionISO.index(tc.Type) + isoOffset 952 if d := m49map[i]; d != 0 { 953 log.Panicf("%s found as a duplicate UN.M49 code of %03d", tc.Numeric, d) 954 } 955 m49 := parseM49(tc.Numeric) 956 m49map[i] = m49 957 if r := fromM49map[m49]; r == 0 { 958 fromM49map[m49] = i 959 } else if r != i { 960 dep := b.registry[regionISO.s[r-isoOffset]].deprecated 961 if t := b.registry[tc.Type]; t != nil && dep != "" && (t.deprecated == "" || t.deprecated > dep) { 962 fromM49map[m49] = i 963 } 964 } 965 } 966 for _, ta := range b.supp.Metadata.Alias.TerritoryAlias { 967 if len(ta.Type) == 3 && ta.Type[0] <= '9' && len(ta.Replacement) == 2 { 968 from := parseM49(ta.Type) 969 if r := fromM49map[from]; r == 0 { 970 fromM49map[from] = regionISO.index(ta.Replacement) + isoOffset 971 } 972 } 973 } 974 for _, tc := range b.supp.CodeMappings.TerritoryCodes { 975 if len(tc.Alpha3) == 3 { 976 update(tc.Type, tc.Alpha3) 977 } 978 } 979 // This entries are not included in territoryCodes. Mostly 3-letter variants 980 // of deleted codes and an entry for QU. 981 for _, m := range []struct{ iso2, iso3 string }{ 982 {"CT", "CTE"}, 983 {"DY", "DHY"}, 984 {"HV", "HVO"}, 985 {"JT", "JTN"}, 986 {"MI", "MID"}, 987 {"NH", "NHB"}, 988 {"NQ", "ATN"}, 989 {"PC", "PCI"}, 990 {"PU", "PUS"}, 991 {"PZ", "PCZ"}, 992 {"RH", "RHO"}, 993 {"VD", "VDR"}, 994 {"WK", "WAK"}, 995 // These three-letter codes are used for others as well. 996 {"FQ", "ATF"}, 997 } { 998 update(m.iso2, m.iso3) 999 } 1000 for i, s := range regionISO.s { 1001 if len(s) != 4 { 1002 regionISO.s[i] = s + " " 1003 } 1004 } 1005 b.writeConst("regionISO", tag.Index(regionISO.join())) 1006 b.writeConst("altRegionISO3", altRegionISO3) 1007 b.writeSlice("altRegionIDs", altRegionIDs) 1008 1009 // Create list of deprecated regions. 1010 // TODO: consider inserting SF -> FI. Not included by CLDR, but is the only 1011 // Transitionally-reserved mapping not included. 1012 regionOldMap := stringSet{} 1013 // Include regions in territoryAlias (not all are in the IANA registry!) 1014 for _, reg := range b.supp.Metadata.Alias.TerritoryAlias { 1015 if len(reg.Type) == 2 && reg.Reason == "deprecated" && len(reg.Replacement) == 2 { 1016 regionOldMap.add(reg.Type) 1017 regionOldMap.updateLater(reg.Type, reg.Replacement) 1018 i, _ := regionISO.find(reg.Type) 1019 j, _ := regionISO.find(reg.Replacement) 1020 if k := m49map[i+isoOffset]; k == 0 { 1021 m49map[i+isoOffset] = m49map[j+isoOffset] 1022 } 1023 } 1024 } 1025 b.writeSortedMap("regionOldMap", ®ionOldMap, func(s string) uint16 { 1026 return uint16(b.region.index(s)) 1027 }) 1028 // 3-digit region lookup, groupings. 1029 for i := 1; i < isoOffset; i++ { 1030 m := parseM49(b.region.s[i]) 1031 m49map[i] = m 1032 fromM49map[m] = i 1033 } 1034 b.writeSlice("m49", m49map) 1035 1036 const ( 1037 searchBits = 7 1038 regionBits = 9 1039 ) 1040 if len(m49map) >= 1<<regionBits { 1041 log.Fatalf("Maximum number of regions exceeded: %d > %d", len(m49map), 1<<regionBits) 1042 } 1043 m49Index := [9]int16{} 1044 fromM49 := []uint16{} 1045 m49 := []int{} 1046 for k, _ := range fromM49map { 1047 m49 = append(m49, int(k)) 1048 } 1049 sort.Ints(m49) 1050 for _, k := range m49[1:] { 1051 val := (k & (1<<searchBits - 1)) << regionBits 1052 fromM49 = append(fromM49, uint16(val|fromM49map[int16(k)])) 1053 m49Index[1:][k>>searchBits] = int16(len(fromM49)) 1054 } 1055 b.writeSlice("m49Index", m49Index) 1056 b.writeSlice("fromM49", fromM49) 1057} 1058 1059const ( 1060 // TODO: put these lists in regionTypes as user data? Could be used for 1061 // various optimizations and refinements and could be exposed in the API. 1062 iso3166Except = "AC CP DG EA EU FX IC SU TA UK" 1063 iso3166Trans = "AN BU CS NT TP YU ZR" // SF is not in our set of Regions. 1064 // DY and RH are actually not deleted, but indeterminately reserved. 1065 iso3166DelCLDR = "CT DD DY FQ HV JT MI NH NQ PC PU PZ RH VD WK YD" 1066) 1067 1068const ( 1069 iso3166UserAssigned = 1 << iota 1070 ccTLD 1071 bcp47Region 1072) 1073 1074func find(list []string, s string) int { 1075 for i, t := range list { 1076 if t == s { 1077 return i 1078 } 1079 } 1080 return -1 1081} 1082 1083// writeVariants generates per-variant information and creates a map from variant 1084// name to index value. We assign index values such that sorting multiple 1085// variants by index value will result in the correct order. 1086// There are two types of variants: specialized and general. Specialized variants 1087// are only applicable to certain language or language-script pairs. Generalized 1088// variants apply to any language. Generalized variants always sort after 1089// specialized variants. We will therefore always assign a higher index value 1090// to a generalized variant than any other variant. Generalized variants are 1091// sorted alphabetically among themselves. 1092// Specialized variants may also sort after other specialized variants. Such 1093// variants will be ordered after any of the variants they may follow. 1094// We assume that if a variant x is followed by a variant y, then for any prefix 1095// p of x, p-x is a prefix of y. This allows us to order tags based on the 1096// maximum of the length of any of its prefixes. 1097// TODO: it is possible to define a set of Prefix values on variants such that 1098// a total order cannot be defined to the point that this algorithm breaks. 1099// In other words, we cannot guarantee the same order of variants for the 1100// future using the same algorithm or for non-compliant combinations of 1101// variants. For this reason, consider using simple alphabetic sorting 1102// of variants and ignore Prefix restrictions altogether. 1103func (b *builder) writeVariant() { 1104 generalized := stringSet{} 1105 specialized := stringSet{} 1106 specializedExtend := stringSet{} 1107 // Collate the variants by type and check assumptions. 1108 for _, v := range b.variant.slice() { 1109 e := b.registry[v] 1110 if len(e.prefix) == 0 { 1111 generalized.add(v) 1112 continue 1113 } 1114 c := strings.Split(e.prefix[0], "-") 1115 hasScriptOrRegion := false 1116 if len(c) > 1 { 1117 _, hasScriptOrRegion = b.script.find(c[1]) 1118 if !hasScriptOrRegion { 1119 _, hasScriptOrRegion = b.region.find(c[1]) 1120 1121 } 1122 } 1123 if len(c) == 1 || len(c) == 2 && hasScriptOrRegion { 1124 // Variant is preceded by a language. 1125 specialized.add(v) 1126 continue 1127 } 1128 // Variant is preceded by another variant. 1129 specializedExtend.add(v) 1130 prefix := c[0] + "-" 1131 if hasScriptOrRegion { 1132 prefix += c[1] 1133 } 1134 for _, p := range e.prefix { 1135 // Verify that the prefix minus the last element is a prefix of the 1136 // predecessor element. 1137 i := strings.LastIndex(p, "-") 1138 pred := b.registry[p[i+1:]] 1139 if find(pred.prefix, p[:i]) < 0 { 1140 log.Fatalf("prefix %q for variant %q not consistent with predecessor spec", p, v) 1141 } 1142 // The sorting used below does not work in the general case. It works 1143 // if we assume that variants that may be followed by others only have 1144 // prefixes of the same length. Verify this. 1145 count := strings.Count(p[:i], "-") 1146 for _, q := range pred.prefix { 1147 if c := strings.Count(q, "-"); c != count { 1148 log.Fatalf("variant %q preceding %q has a prefix %q of size %d; want %d", p[i+1:], v, q, c, count) 1149 } 1150 } 1151 if !strings.HasPrefix(p, prefix) { 1152 log.Fatalf("prefix %q of variant %q should start with %q", p, v, prefix) 1153 } 1154 } 1155 } 1156 1157 // Sort extended variants. 1158 a := specializedExtend.s 1159 less := func(v, w string) bool { 1160 // Sort by the maximum number of elements. 1161 maxCount := func(s string) (max int) { 1162 for _, p := range b.registry[s].prefix { 1163 if c := strings.Count(p, "-"); c > max { 1164 max = c 1165 } 1166 } 1167 return 1168 } 1169 if cv, cw := maxCount(v), maxCount(w); cv != cw { 1170 return cv < cw 1171 } 1172 // Sort by name as tie breaker. 1173 return v < w 1174 } 1175 sort.Sort(funcSorter{less, sort.StringSlice(a)}) 1176 specializedExtend.frozen = true 1177 1178 // Create index from variant name to index. 1179 variantIndex := make(map[string]uint8) 1180 add := func(s []string) { 1181 for _, v := range s { 1182 variantIndex[v] = uint8(len(variantIndex)) 1183 } 1184 } 1185 add(specialized.slice()) 1186 add(specializedExtend.s) 1187 numSpecialized := len(variantIndex) 1188 add(generalized.slice()) 1189 if n := len(variantIndex); n > 255 { 1190 log.Fatalf("maximum number of variants exceeded: was %d; want <= 255", n) 1191 } 1192 b.writeMap("variantIndex", variantIndex) 1193 b.writeConst("variantNumSpecialized", numSpecialized) 1194} 1195 1196func (b *builder) writeLanguageInfo() { 1197} 1198 1199// writeLikelyData writes tables that are used both for finding parent relations and for 1200// language matching. Each entry contains additional bits to indicate the status of the 1201// data to know when it cannot be used for parent relations. 1202func (b *builder) writeLikelyData() { 1203 const ( 1204 isList = 1 << iota 1205 scriptInFrom 1206 regionInFrom 1207 ) 1208 type ( // generated types 1209 likelyScriptRegion struct { 1210 region uint16 1211 script uint8 1212 flags uint8 1213 } 1214 likelyLangScript struct { 1215 lang uint16 1216 script uint8 1217 flags uint8 1218 } 1219 likelyLangRegion struct { 1220 lang uint16 1221 region uint16 1222 } 1223 // likelyTag is used for getting likely tags for group regions, where 1224 // the likely region might be a region contained in the group. 1225 likelyTag struct { 1226 lang uint16 1227 region uint16 1228 script uint8 1229 } 1230 ) 1231 var ( // generated variables 1232 likelyRegionGroup = make([]likelyTag, len(b.groups)) 1233 likelyLang = make([]likelyScriptRegion, len(b.lang.s)) 1234 likelyRegion = make([]likelyLangScript, len(b.region.s)) 1235 likelyScript = make([]likelyLangRegion, len(b.script.s)) 1236 likelyLangList = []likelyScriptRegion{} 1237 likelyRegionList = []likelyLangScript{} 1238 ) 1239 type fromTo struct { 1240 from, to []string 1241 } 1242 langToOther := map[int][]fromTo{} 1243 regionToOther := map[int][]fromTo{} 1244 for _, m := range b.supp.LikelySubtags.LikelySubtag { 1245 from := strings.Split(m.From, "_") 1246 to := strings.Split(m.To, "_") 1247 if len(to) != 3 { 1248 log.Fatalf("invalid number of subtags in %q: found %d, want 3", m.To, len(to)) 1249 } 1250 if len(from) > 3 { 1251 log.Fatalf("invalid number of subtags: found %d, want 1-3", len(from)) 1252 } 1253 if from[0] != to[0] && from[0] != "und" { 1254 log.Fatalf("unexpected language change in expansion: %s -> %s", from, to) 1255 } 1256 if len(from) == 3 { 1257 if from[2] != to[2] { 1258 log.Fatalf("unexpected region change in expansion: %s -> %s", from, to) 1259 } 1260 if from[0] != "und" { 1261 log.Fatalf("unexpected fully specified from tag: %s -> %s", from, to) 1262 } 1263 } 1264 if len(from) == 1 || from[0] != "und" { 1265 id := 0 1266 if from[0] != "und" { 1267 id = b.lang.index(from[0]) 1268 } 1269 langToOther[id] = append(langToOther[id], fromTo{from, to}) 1270 } else if len(from) == 2 && len(from[1]) == 4 { 1271 sid := b.script.index(from[1]) 1272 likelyScript[sid].lang = uint16(b.langIndex(to[0])) 1273 likelyScript[sid].region = uint16(b.region.index(to[2])) 1274 } else { 1275 r := b.region.index(from[len(from)-1]) 1276 if id, ok := b.groups[r]; ok { 1277 if from[0] != "und" { 1278 log.Fatalf("region changed unexpectedly: %s -> %s", from, to) 1279 } 1280 likelyRegionGroup[id].lang = uint16(b.langIndex(to[0])) 1281 likelyRegionGroup[id].script = uint8(b.script.index(to[1])) 1282 likelyRegionGroup[id].region = uint16(b.region.index(to[2])) 1283 } else { 1284 regionToOther[r] = append(regionToOther[r], fromTo{from, to}) 1285 } 1286 } 1287 } 1288 b.writeType(likelyLangRegion{}) 1289 b.writeSlice("likelyScript", likelyScript) 1290 1291 for id := range b.lang.s { 1292 list := langToOther[id] 1293 if len(list) == 1 { 1294 likelyLang[id].region = uint16(b.region.index(list[0].to[2])) 1295 likelyLang[id].script = uint8(b.script.index(list[0].to[1])) 1296 } else if len(list) > 1 { 1297 likelyLang[id].flags = isList 1298 likelyLang[id].region = uint16(len(likelyLangList)) 1299 likelyLang[id].script = uint8(len(list)) 1300 for _, x := range list { 1301 flags := uint8(0) 1302 if len(x.from) > 1 { 1303 if x.from[1] == x.to[2] { 1304 flags = regionInFrom 1305 } else { 1306 flags = scriptInFrom 1307 } 1308 } 1309 likelyLangList = append(likelyLangList, likelyScriptRegion{ 1310 region: uint16(b.region.index(x.to[2])), 1311 script: uint8(b.script.index(x.to[1])), 1312 flags: flags, 1313 }) 1314 } 1315 } 1316 } 1317 // TODO: merge suppressScript data with this table. 1318 b.writeType(likelyScriptRegion{}) 1319 b.writeSlice("likelyLang", likelyLang) 1320 b.writeSlice("likelyLangList", likelyLangList) 1321 1322 for id := range b.region.s { 1323 list := regionToOther[id] 1324 if len(list) == 1 { 1325 likelyRegion[id].lang = uint16(b.langIndex(list[0].to[0])) 1326 likelyRegion[id].script = uint8(b.script.index(list[0].to[1])) 1327 if len(list[0].from) > 2 { 1328 likelyRegion[id].flags = scriptInFrom 1329 } 1330 } else if len(list) > 1 { 1331 likelyRegion[id].flags = isList 1332 likelyRegion[id].lang = uint16(len(likelyRegionList)) 1333 likelyRegion[id].script = uint8(len(list)) 1334 for i, x := range list { 1335 if len(x.from) == 2 && i != 0 || i > 0 && len(x.from) != 3 { 1336 log.Fatalf("unspecified script must be first in list: %v at %d", x.from, i) 1337 } 1338 x := likelyLangScript{ 1339 lang: uint16(b.langIndex(x.to[0])), 1340 script: uint8(b.script.index(x.to[1])), 1341 } 1342 if len(list[0].from) > 2 { 1343 x.flags = scriptInFrom 1344 } 1345 likelyRegionList = append(likelyRegionList, x) 1346 } 1347 } 1348 } 1349 b.writeType(likelyLangScript{}) 1350 b.writeSlice("likelyRegion", likelyRegion) 1351 b.writeSlice("likelyRegionList", likelyRegionList) 1352 1353 b.writeType(likelyTag{}) 1354 b.writeSlice("likelyRegionGroup", likelyRegionGroup) 1355} 1356 1357func (b *builder) writeRegionInclusionData() { 1358 var ( 1359 // mm holds for each group the set of groups with a distance of 1. 1360 mm = make(map[int][]index) 1361 1362 // containment holds for each group the transitive closure of 1363 // containment of other groups. 1364 containment = make(map[index][]index) 1365 ) 1366 for _, g := range b.supp.TerritoryContainment.Group { 1367 // Skip UN and EURO zone as they are flattening the containment 1368 // relationship. 1369 if g.Type == "EZ" || g.Type == "UN" { 1370 continue 1371 } 1372 group := b.region.index(g.Type) 1373 groupIdx := b.groups[group] 1374 for _, mem := range strings.Split(g.Contains, " ") { 1375 r := b.region.index(mem) 1376 mm[r] = append(mm[r], groupIdx) 1377 if g, ok := b.groups[r]; ok { 1378 mm[group] = append(mm[group], g) 1379 containment[groupIdx] = append(containment[groupIdx], g) 1380 } 1381 } 1382 } 1383 1384 regionContainment := make([]uint64, len(b.groups)) 1385 for _, g := range b.groups { 1386 l := containment[g] 1387 1388 // Compute the transitive closure of containment. 1389 for i := 0; i < len(l); i++ { 1390 l = append(l, containment[l[i]]...) 1391 } 1392 1393 // Compute the bitmask. 1394 regionContainment[g] = 1 << g 1395 for _, v := range l { 1396 regionContainment[g] |= 1 << v 1397 } 1398 } 1399 b.writeSlice("regionContainment", regionContainment) 1400 1401 regionInclusion := make([]uint8, len(b.region.s)) 1402 bvs := make(map[uint64]index) 1403 // Make the first bitvector positions correspond with the groups. 1404 for r, i := range b.groups { 1405 bv := uint64(1 << i) 1406 for _, g := range mm[r] { 1407 bv |= 1 << g 1408 } 1409 bvs[bv] = i 1410 regionInclusion[r] = uint8(bvs[bv]) 1411 } 1412 for r := 1; r < len(b.region.s); r++ { 1413 if _, ok := b.groups[r]; !ok { 1414 bv := uint64(0) 1415 for _, g := range mm[r] { 1416 bv |= 1 << g 1417 } 1418 if bv == 0 { 1419 // Pick the world for unspecified regions. 1420 bv = 1 << b.groups[b.region.index("001")] 1421 } 1422 if _, ok := bvs[bv]; !ok { 1423 bvs[bv] = index(len(bvs)) 1424 } 1425 regionInclusion[r] = uint8(bvs[bv]) 1426 } 1427 } 1428 b.writeSlice("regionInclusion", regionInclusion) 1429 regionInclusionBits := make([]uint64, len(bvs)) 1430 for k, v := range bvs { 1431 regionInclusionBits[v] = uint64(k) 1432 } 1433 // Add bit vectors for increasingly large distances until a fixed point is reached. 1434 regionInclusionNext := []uint8{} 1435 for i := 0; i < len(regionInclusionBits); i++ { 1436 bits := regionInclusionBits[i] 1437 next := bits 1438 for i := uint(0); i < uint(len(b.groups)); i++ { 1439 if bits&(1<<i) != 0 { 1440 next |= regionInclusionBits[i] 1441 } 1442 } 1443 if _, ok := bvs[next]; !ok { 1444 bvs[next] = index(len(bvs)) 1445 regionInclusionBits = append(regionInclusionBits, next) 1446 } 1447 regionInclusionNext = append(regionInclusionNext, uint8(bvs[next])) 1448 } 1449 b.writeSlice("regionInclusionBits", regionInclusionBits) 1450 b.writeSlice("regionInclusionNext", regionInclusionNext) 1451} 1452 1453type parentRel struct { 1454 lang uint16 1455 script uint8 1456 maxScript uint8 1457 toRegion uint16 1458 fromRegion []uint16 1459} 1460 1461func (b *builder) writeParents() { 1462 b.writeType(parentRel{}) 1463 1464 parents := []parentRel{} 1465 1466 // Construct parent overrides. 1467 n := 0 1468 for _, p := range b.data.Supplemental().ParentLocales.ParentLocale { 1469 // Skipping non-standard scripts to root is implemented using addTags. 1470 if p.Parent == "root" { 1471 continue 1472 } 1473 1474 sub := strings.Split(p.Parent, "_") 1475 parent := parentRel{lang: b.langIndex(sub[0])} 1476 if len(sub) == 2 { 1477 // TODO: check that all undefined scripts are indeed Latn in these 1478 // cases. 1479 parent.maxScript = uint8(b.script.index("Latn")) 1480 parent.toRegion = uint16(b.region.index(sub[1])) 1481 } else { 1482 parent.script = uint8(b.script.index(sub[1])) 1483 parent.maxScript = parent.script 1484 parent.toRegion = uint16(b.region.index(sub[2])) 1485 } 1486 for _, c := range strings.Split(p.Locales, " ") { 1487 region := b.region.index(c[strings.LastIndex(c, "_")+1:]) 1488 parent.fromRegion = append(parent.fromRegion, uint16(region)) 1489 } 1490 parents = append(parents, parent) 1491 n += len(parent.fromRegion) 1492 } 1493 b.writeSliceAddSize("parents", n*2, parents) 1494} 1495 1496func main() { 1497 gen.Init() 1498 1499 gen.Repackage("gen_common.go", "common.go", "language") 1500 1501 w := gen.NewCodeWriter() 1502 defer w.WriteGoFile("tables.go", "language") 1503 1504 fmt.Fprintln(w, `import "golang.org/x/text/internal/tag"`) 1505 1506 b := newBuilder(w) 1507 gen.WriteCLDRVersion(w) 1508 1509 b.parseIndices() 1510 b.writeType(FromTo{}) 1511 b.writeLanguage() 1512 b.writeScript() 1513 b.writeRegion() 1514 b.writeVariant() 1515 // TODO: b.writeLocale() 1516 b.computeRegionGroups() 1517 b.writeLikelyData() 1518 b.writeRegionInclusionData() 1519 b.writeParents() 1520} 1521