1// Copyright 2013 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5//go:build ignore 6// +build ignore 7 8// Language tag table generator. 9// Data read from the web. 10 11package main 12 13import ( 14 "bufio" 15 "flag" 16 "fmt" 17 "io" 18 "io/ioutil" 19 "log" 20 "math" 21 "reflect" 22 "regexp" 23 "sort" 24 "strconv" 25 "strings" 26 27 "golang.org/x/text/internal/gen" 28 "golang.org/x/text/internal/tag" 29 "golang.org/x/text/unicode/cldr" 30) 31 32var ( 33 test = flag.Bool("test", 34 false, 35 "test existing tables; can be used to compare web data with package data.") 36 outputFile = flag.String("output", 37 "tables.go", 38 "output file for generated tables") 39) 40 41var comment = []string{ 42 ` 43lang holds an alphabetically sorted list of ISO-639 language identifiers. 44All entries are 4 bytes. The index of the identifier (divided by 4) is the language tag. 45For 2-byte language identifiers, the two successive bytes have the following meaning: 46 - if the first letter of the 2- and 3-letter ISO codes are the same: 47 the second and third letter of the 3-letter ISO code. 48 - otherwise: a 0 and a by 2 bits right-shifted index into altLangISO3. 49For 3-byte language identifiers the 4th byte is 0.`, 50 ` 51langNoIndex is a bit vector of all 3-letter language codes that are not used as an index 52in lookup tables. The language ids for these language codes are derived directly 53from the letters and are not consecutive.`, 54 ` 55altLangISO3 holds an alphabetically sorted list of 3-letter language code alternatives 56to 2-letter language codes that cannot be derived using the method described above. 57Each 3-letter code is followed by its 1-byte langID.`, 58 ` 59altLangIndex is used to convert indexes in altLangISO3 to langIDs.`, 60 ` 61AliasMap maps langIDs to their suggested replacements.`, 62 ` 63script is an alphabetically sorted list of ISO 15924 codes. The index 64of the script in the string, divided by 4, is the internal scriptID.`, 65 ` 66isoRegionOffset needs to be added to the index of regionISO to obtain the regionID 67for 2-letter ISO codes. (The first isoRegionOffset regionIDs are reserved for 68the UN.M49 codes used for groups.)`, 69 ` 70regionISO holds a list of alphabetically sorted 2-letter ISO region codes. 71Each 2-letter codes is followed by two bytes with the following meaning: 72 - [A-Z}{2}: the first letter of the 2-letter code plus these two 73 letters form the 3-letter ISO code. 74 - 0, n: index into altRegionISO3.`, 75 ` 76regionTypes defines the status of a region for various standards.`, 77 ` 78m49 maps regionIDs to UN.M49 codes. The first isoRegionOffset entries are 79codes indicating collections of regions.`, 80 ` 81m49Index gives indexes into fromM49 based on the three most significant bits 82of a 10-bit UN.M49 code. To search an UN.M49 code in fromM49, search in 83 fromM49[m49Index[msb39(code)]:m49Index[msb3(code)+1]] 84for an entry where the first 7 bits match the 7 lsb of the UN.M49 code. 85The region code is stored in the 9 lsb of the indexed value.`, 86 ` 87fromM49 contains entries to map UN.M49 codes to regions. See m49Index for details.`, 88 ` 89altRegionISO3 holds a list of 3-letter region codes that cannot be 90mapped to 2-letter codes using the default algorithm. This is a short list.`, 91 ` 92altRegionIDs holds a list of regionIDs the positions of which match those 93of the 3-letter ISO codes in altRegionISO3.`, 94 ` 95variantNumSpecialized is the number of specialized variants in variants.`, 96 ` 97suppressScript is an index from langID to the dominant script for that language, 98if it exists. If a script is given, it should be suppressed from the language tag.`, 99 ` 100likelyLang is a lookup table, indexed by langID, for the most likely 101scripts and regions given incomplete information. If more entries exist for a 102given language, region and script are the index and size respectively 103of the list in likelyLangList.`, 104 ` 105likelyLangList holds lists info associated with likelyLang.`, 106 ` 107likelyRegion is a lookup table, indexed by regionID, for the most likely 108languages and scripts given incomplete information. If more entries exist 109for a given regionID, lang and script are the index and size respectively 110of the list in likelyRegionList. 111TODO: exclude containers and user-definable regions from the list.`, 112 ` 113likelyRegionList holds lists info associated with likelyRegion.`, 114 ` 115likelyScript is a lookup table, indexed by scriptID, for the most likely 116languages and regions given a script.`, 117 ` 118nRegionGroups is the number of region groups.`, 119 ` 120regionInclusion maps region identifiers to sets of regions in regionInclusionBits, 121where each set holds all groupings that are directly connected in a region 122containment graph.`, 123 ` 124regionInclusionBits is an array of bit vectors where every vector represents 125a set of region groupings. These sets are used to compute the distance 126between two regions for the purpose of language matching.`, 127 ` 128regionInclusionNext marks, for each entry in regionInclusionBits, the set of 129all groups that are reachable from the groups set in the respective entry.`, 130} 131 132// TODO: consider changing some of these structures to tries. This can reduce 133// memory, but may increase the need for memory allocations. This could be 134// mitigated if we can piggyback on language tags for common cases. 135 136func failOnError(e error) { 137 if e != nil { 138 log.Panic(e) 139 } 140} 141 142type setType int 143 144const ( 145 Indexed setType = 1 + iota // all elements must be of same size 146 Linear 147) 148 149type stringSet struct { 150 s []string 151 sorted, frozen bool 152 153 // We often need to update values after the creation of an index is completed. 154 // We include a convenience map for keeping track of this. 155 update map[string]string 156 typ setType // used for checking. 157} 158 159func (ss *stringSet) clone() stringSet { 160 c := *ss 161 c.s = append([]string(nil), c.s...) 162 return c 163} 164 165func (ss *stringSet) setType(t setType) { 166 if ss.typ != t && ss.typ != 0 { 167 log.Panicf("type %d cannot be assigned as it was already %d", t, ss.typ) 168 } 169} 170 171// parse parses a whitespace-separated string and initializes ss with its 172// components. 173func (ss *stringSet) parse(s string) { 174 scan := bufio.NewScanner(strings.NewReader(s)) 175 scan.Split(bufio.ScanWords) 176 for scan.Scan() { 177 ss.add(scan.Text()) 178 } 179} 180 181func (ss *stringSet) assertChangeable() { 182 if ss.frozen { 183 log.Panic("attempt to modify a frozen stringSet") 184 } 185} 186 187func (ss *stringSet) add(s string) { 188 ss.assertChangeable() 189 ss.s = append(ss.s, s) 190 ss.sorted = ss.frozen 191} 192 193func (ss *stringSet) freeze() { 194 ss.compact() 195 ss.frozen = true 196} 197 198func (ss *stringSet) compact() { 199 if ss.sorted { 200 return 201 } 202 a := ss.s 203 sort.Strings(a) 204 k := 0 205 for i := 1; i < len(a); i++ { 206 if a[k] != a[i] { 207 a[k+1] = a[i] 208 k++ 209 } 210 } 211 ss.s = a[:k+1] 212 ss.sorted = ss.frozen 213} 214 215type funcSorter struct { 216 fn func(a, b string) bool 217 sort.StringSlice 218} 219 220func (s funcSorter) Less(i, j int) bool { 221 return s.fn(s.StringSlice[i], s.StringSlice[j]) 222} 223 224func (ss *stringSet) sortFunc(f func(a, b string) bool) { 225 ss.compact() 226 sort.Sort(funcSorter{f, sort.StringSlice(ss.s)}) 227} 228 229func (ss *stringSet) remove(s string) { 230 ss.assertChangeable() 231 if i, ok := ss.find(s); ok { 232 copy(ss.s[i:], ss.s[i+1:]) 233 ss.s = ss.s[:len(ss.s)-1] 234 } 235} 236 237func (ss *stringSet) replace(ol, nu string) { 238 ss.s[ss.index(ol)] = nu 239 ss.sorted = ss.frozen 240} 241 242func (ss *stringSet) index(s string) int { 243 ss.setType(Indexed) 244 i, ok := ss.find(s) 245 if !ok { 246 if i < len(ss.s) { 247 log.Panicf("find: item %q is not in list. Closest match is %q.", s, ss.s[i]) 248 } 249 log.Panicf("find: item %q is not in list", s) 250 251 } 252 return i 253} 254 255func (ss *stringSet) find(s string) (int, bool) { 256 ss.compact() 257 i := sort.SearchStrings(ss.s, s) 258 return i, i != len(ss.s) && ss.s[i] == s 259} 260 261func (ss *stringSet) slice() []string { 262 ss.compact() 263 return ss.s 264} 265 266func (ss *stringSet) updateLater(v, key string) { 267 if ss.update == nil { 268 ss.update = map[string]string{} 269 } 270 ss.update[v] = key 271} 272 273// join joins the string and ensures that all entries are of the same length. 274func (ss *stringSet) join() string { 275 ss.setType(Indexed) 276 n := len(ss.s[0]) 277 for _, s := range ss.s { 278 if len(s) != n { 279 log.Panicf("join: not all entries are of the same length: %q", s) 280 } 281 } 282 ss.s = append(ss.s, strings.Repeat("\xff", n)) 283 return strings.Join(ss.s, "") 284} 285 286// ianaEntry holds information for an entry in the IANA Language Subtag Repository. 287// All types use the same entry. 288// See http://tools.ietf.org/html/bcp47#section-5.1 for a description of the various 289// fields. 290type ianaEntry struct { 291 typ string 292 description []string 293 scope string 294 added string 295 preferred string 296 deprecated string 297 suppressScript string 298 macro string 299 prefix []string 300} 301 302type builder struct { 303 w *gen.CodeWriter 304 hw io.Writer // MultiWriter for w and w.Hash 305 data *cldr.CLDR 306 supp *cldr.SupplementalData 307 308 // indices 309 locale stringSet // common locales 310 lang stringSet // canonical language ids (2 or 3 letter ISO codes) with data 311 langNoIndex stringSet // 3-letter ISO codes with no associated data 312 script stringSet // 4-letter ISO codes 313 region stringSet // 2-letter ISO or 3-digit UN M49 codes 314 variant stringSet // 4-8-alphanumeric variant code. 315 316 // Region codes that are groups with their corresponding group IDs. 317 groups map[int]index 318 319 // langInfo 320 registry map[string]*ianaEntry 321} 322 323type index uint 324 325func newBuilder(w *gen.CodeWriter) *builder { 326 r := gen.OpenCLDRCoreZip() 327 defer r.Close() 328 d := &cldr.Decoder{} 329 data, err := d.DecodeZip(r) 330 failOnError(err) 331 b := builder{ 332 w: w, 333 hw: io.MultiWriter(w, w.Hash), 334 data: data, 335 supp: data.Supplemental(), 336 } 337 b.parseRegistry() 338 return &b 339} 340 341func (b *builder) parseRegistry() { 342 r := gen.OpenIANAFile("assignments/language-subtag-registry") 343 defer r.Close() 344 b.registry = make(map[string]*ianaEntry) 345 346 scan := bufio.NewScanner(r) 347 scan.Split(bufio.ScanWords) 348 var record *ianaEntry 349 for more := scan.Scan(); more; { 350 key := scan.Text() 351 more = scan.Scan() 352 value := scan.Text() 353 switch key { 354 case "Type:": 355 record = &ianaEntry{typ: value} 356 case "Subtag:", "Tag:": 357 if s := strings.SplitN(value, "..", 2); len(s) > 1 { 358 for a := s[0]; a <= s[1]; a = inc(a) { 359 b.addToRegistry(a, record) 360 } 361 } else { 362 b.addToRegistry(value, record) 363 } 364 case "Suppress-Script:": 365 record.suppressScript = value 366 case "Added:": 367 record.added = value 368 case "Deprecated:": 369 record.deprecated = value 370 case "Macrolanguage:": 371 record.macro = value 372 case "Preferred-Value:": 373 record.preferred = value 374 case "Prefix:": 375 record.prefix = append(record.prefix, value) 376 case "Scope:": 377 record.scope = value 378 case "Description:": 379 buf := []byte(value) 380 for more = scan.Scan(); more; more = scan.Scan() { 381 b := scan.Bytes() 382 if b[0] == '%' || b[len(b)-1] == ':' { 383 break 384 } 385 buf = append(buf, ' ') 386 buf = append(buf, b...) 387 } 388 record.description = append(record.description, string(buf)) 389 continue 390 default: 391 continue 392 } 393 more = scan.Scan() 394 } 395 if scan.Err() != nil { 396 log.Panic(scan.Err()) 397 } 398} 399 400func (b *builder) addToRegistry(key string, entry *ianaEntry) { 401 if info, ok := b.registry[key]; ok { 402 if info.typ != "language" || entry.typ != "extlang" { 403 log.Fatalf("parseRegistry: tag %q already exists", key) 404 } 405 } else { 406 b.registry[key] = entry 407 } 408} 409 410var commentIndex = make(map[string]string) 411 412func init() { 413 for _, s := range comment { 414 key := strings.TrimSpace(strings.SplitN(s, " ", 2)[0]) 415 commentIndex[key] = s 416 } 417} 418 419func (b *builder) comment(name string) { 420 if s := commentIndex[name]; len(s) > 0 { 421 b.w.WriteComment(s) 422 } else { 423 fmt.Fprintln(b.w) 424 } 425} 426 427func (b *builder) pf(f string, x ...interface{}) { 428 fmt.Fprintf(b.hw, f, x...) 429 fmt.Fprint(b.hw, "\n") 430} 431 432func (b *builder) p(x ...interface{}) { 433 fmt.Fprintln(b.hw, x...) 434} 435 436func (b *builder) addSize(s int) { 437 b.w.Size += s 438 b.pf("// Size: %d bytes", s) 439} 440 441func (b *builder) writeConst(name string, x interface{}) { 442 b.comment(name) 443 b.w.WriteConst(name, x) 444} 445 446// writeConsts computes f(v) for all v in values and writes the results 447// as constants named _v to a single constant block. 448func (b *builder) writeConsts(f func(string) int, values ...string) { 449 b.pf("const (") 450 for _, v := range values { 451 b.pf("\t_%s = %v", v, f(v)) 452 } 453 b.pf(")") 454} 455 456// writeType writes the type of the given value, which must be a struct. 457func (b *builder) writeType(value interface{}) { 458 b.comment(reflect.TypeOf(value).Name()) 459 b.w.WriteType(value) 460} 461 462func (b *builder) writeSlice(name string, ss interface{}) { 463 b.writeSliceAddSize(name, 0, ss) 464} 465 466func (b *builder) writeSliceAddSize(name string, extraSize int, ss interface{}) { 467 b.comment(name) 468 b.w.Size += extraSize 469 v := reflect.ValueOf(ss) 470 t := v.Type().Elem() 471 b.pf("// Size: %d bytes, %d elements", v.Len()*int(t.Size())+extraSize, v.Len()) 472 473 fmt.Fprintf(b.w, "var %s = ", name) 474 b.w.WriteArray(ss) 475 b.p() 476} 477 478type FromTo struct { 479 From, To uint16 480} 481 482func (b *builder) writeSortedMap(name string, ss *stringSet, index func(s string) uint16) { 483 ss.sortFunc(func(a, b string) bool { 484 return index(a) < index(b) 485 }) 486 m := []FromTo{} 487 for _, s := range ss.s { 488 m = append(m, FromTo{index(s), index(ss.update[s])}) 489 } 490 b.writeSlice(name, m) 491} 492 493const base = 'z' - 'a' + 1 494 495func strToInt(s string) uint { 496 v := uint(0) 497 for i := 0; i < len(s); i++ { 498 v *= base 499 v += uint(s[i] - 'a') 500 } 501 return v 502} 503 504// converts the given integer to the original ASCII string passed to strToInt. 505// len(s) must match the number of characters obtained. 506func intToStr(v uint, s []byte) { 507 for i := len(s) - 1; i >= 0; i-- { 508 s[i] = byte(v%base) + 'a' 509 v /= base 510 } 511} 512 513func (b *builder) writeBitVector(name string, ss []string) { 514 vec := make([]uint8, int(math.Ceil(math.Pow(base, float64(len(ss[0])))/8))) 515 for _, s := range ss { 516 v := strToInt(s) 517 vec[v/8] |= 1 << (v % 8) 518 } 519 b.writeSlice(name, vec) 520} 521 522// TODO: convert this type into a list or two-stage trie. 523func (b *builder) writeMapFunc(name string, m map[string]string, f func(string) uint16) { 524 b.comment(name) 525 v := reflect.ValueOf(m) 526 sz := v.Len() * (2 + int(v.Type().Key().Size())) 527 for _, k := range m { 528 sz += len(k) 529 } 530 b.addSize(sz) 531 keys := []string{} 532 b.pf(`var %s = map[string]uint16{`, name) 533 for k := range m { 534 keys = append(keys, k) 535 } 536 sort.Strings(keys) 537 for _, k := range keys { 538 b.pf("\t%q: %v,", k, f(m[k])) 539 } 540 b.p("}") 541} 542 543func (b *builder) writeMap(name string, m interface{}) { 544 b.comment(name) 545 v := reflect.ValueOf(m) 546 sz := v.Len() * (2 + int(v.Type().Key().Size()) + int(v.Type().Elem().Size())) 547 b.addSize(sz) 548 f := strings.FieldsFunc(fmt.Sprintf("%#v", m), func(r rune) bool { 549 return strings.IndexRune("{}, ", r) != -1 550 }) 551 sort.Strings(f[1:]) 552 b.pf(`var %s = %s{`, name, f[0]) 553 for _, kv := range f[1:] { 554 b.pf("\t%s,", kv) 555 } 556 b.p("}") 557} 558 559func (b *builder) langIndex(s string) uint16 { 560 if s == "und" { 561 return 0 562 } 563 if i, ok := b.lang.find(s); ok { 564 return uint16(i) 565 } 566 return uint16(strToInt(s)) + uint16(len(b.lang.s)) 567} 568 569// inc advances the string to its lexicographical successor. 570func inc(s string) string { 571 const maxTagLength = 4 572 var buf [maxTagLength]byte 573 intToStr(strToInt(strings.ToLower(s))+1, buf[:len(s)]) 574 for i := 0; i < len(s); i++ { 575 if s[i] <= 'Z' { 576 buf[i] -= 'a' - 'A' 577 } 578 } 579 return string(buf[:len(s)]) 580} 581 582func (b *builder) parseIndices() { 583 meta := b.supp.Metadata 584 585 for k, v := range b.registry { 586 var ss *stringSet 587 switch v.typ { 588 case "language": 589 if len(k) == 2 || v.suppressScript != "" || v.scope == "special" { 590 b.lang.add(k) 591 continue 592 } else { 593 ss = &b.langNoIndex 594 } 595 case "region": 596 ss = &b.region 597 case "script": 598 ss = &b.script 599 case "variant": 600 ss = &b.variant 601 default: 602 continue 603 } 604 ss.add(k) 605 } 606 // Include any language for which there is data. 607 for _, lang := range b.data.Locales() { 608 if x := b.data.RawLDML(lang); false || 609 x.LocaleDisplayNames != nil || 610 x.Characters != nil || 611 x.Delimiters != nil || 612 x.Measurement != nil || 613 x.Dates != nil || 614 x.Numbers != nil || 615 x.Units != nil || 616 x.ListPatterns != nil || 617 x.Collations != nil || 618 x.Segmentations != nil || 619 x.Rbnf != nil || 620 x.Annotations != nil || 621 x.Metadata != nil { 622 623 from := strings.Split(lang, "_") 624 if lang := from[0]; lang != "root" { 625 b.lang.add(lang) 626 } 627 } 628 } 629 // Include locales for plural rules, which uses a different structure. 630 for _, plurals := range b.data.Supplemental().Plurals { 631 for _, rules := range plurals.PluralRules { 632 for _, lang := range strings.Split(rules.Locales, " ") { 633 if lang = strings.Split(lang, "_")[0]; lang != "root" { 634 b.lang.add(lang) 635 } 636 } 637 } 638 } 639 // Include languages in likely subtags. 640 for _, m := range b.supp.LikelySubtags.LikelySubtag { 641 from := strings.Split(m.From, "_") 642 b.lang.add(from[0]) 643 } 644 // Include ISO-639 alpha-3 bibliographic entries. 645 for _, a := range meta.Alias.LanguageAlias { 646 if a.Reason == "bibliographic" { 647 b.langNoIndex.add(a.Type) 648 } 649 } 650 // Include regions in territoryAlias (not all are in the IANA registry!) 651 for _, reg := range b.supp.Metadata.Alias.TerritoryAlias { 652 if len(reg.Type) == 2 { 653 b.region.add(reg.Type) 654 } 655 } 656 657 for _, s := range b.lang.s { 658 if len(s) == 3 { 659 b.langNoIndex.remove(s) 660 } 661 } 662 b.writeConst("NumLanguages", len(b.lang.slice())+len(b.langNoIndex.slice())) 663 b.writeConst("NumScripts", len(b.script.slice())) 664 b.writeConst("NumRegions", len(b.region.slice())) 665 666 // Add dummy codes at the start of each list to represent "unspecified". 667 b.lang.add("---") 668 b.script.add("----") 669 b.region.add("---") 670 671 // common locales 672 b.locale.parse(meta.DefaultContent.Locales) 673} 674 675// TODO: region inclusion data will probably not be use used in future matchers. 676 677func (b *builder) computeRegionGroups() { 678 b.groups = make(map[int]index) 679 680 // Create group indices. 681 for i := 1; b.region.s[i][0] < 'A'; i++ { // Base M49 indices on regionID. 682 b.groups[i] = index(len(b.groups)) 683 } 684 for _, g := range b.supp.TerritoryContainment.Group { 685 // Skip UN and EURO zone as they are flattening the containment 686 // relationship. 687 if g.Type == "EZ" || g.Type == "UN" { 688 continue 689 } 690 group := b.region.index(g.Type) 691 if _, ok := b.groups[group]; !ok { 692 b.groups[group] = index(len(b.groups)) 693 } 694 } 695 if len(b.groups) > 64 { 696 log.Fatalf("only 64 groups supported, found %d", len(b.groups)) 697 } 698 b.writeConst("nRegionGroups", len(b.groups)) 699} 700 701var langConsts = []string{ 702 "af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", 703 "et", "fa", "fi", "fil", "fr", "gu", "he", "hi", "hr", "hu", "hy", "id", "is", 704 "it", "ja", "ka", "kk", "km", "kn", "ko", "ky", "lo", "lt", "lv", "mk", "ml", 705 "mn", "mo", "mr", "ms", "mul", "my", "nb", "ne", "nl", "no", "pa", "pl", "pt", 706 "ro", "ru", "sh", "si", "sk", "sl", "sq", "sr", "sv", "sw", "ta", "te", "th", 707 "tl", "tn", "tr", "uk", "ur", "uz", "vi", "zh", "zu", 708 709 // constants for grandfathered tags (if not already defined) 710 "jbo", "ami", "bnn", "hak", "tlh", "lb", "nv", "pwn", "tao", "tay", "tsu", 711 "nn", "sfb", "vgt", "sgg", "cmn", "nan", "hsn", 712} 713 714// writeLanguage generates all tables needed for language canonicalization. 715func (b *builder) writeLanguage() { 716 meta := b.supp.Metadata 717 718 b.writeConst("nonCanonicalUnd", b.lang.index("und")) 719 b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...) 720 b.writeConst("langPrivateStart", b.langIndex("qaa")) 721 b.writeConst("langPrivateEnd", b.langIndex("qtz")) 722 723 // Get language codes that need to be mapped (overlong 3-letter codes, 724 // deprecated 2-letter codes, legacy and grandfathered tags.) 725 langAliasMap := stringSet{} 726 aliasTypeMap := map[string]AliasType{} 727 728 // altLangISO3 get the alternative ISO3 names that need to be mapped. 729 altLangISO3 := stringSet{} 730 // Add dummy start to avoid the use of index 0. 731 altLangISO3.add("---") 732 altLangISO3.updateLater("---", "aa") 733 734 lang := b.lang.clone() 735 for _, a := range meta.Alias.LanguageAlias { 736 if a.Replacement == "" { 737 a.Replacement = "und" 738 } 739 // TODO: support mapping to tags 740 repl := strings.SplitN(a.Replacement, "_", 2)[0] 741 if a.Reason == "overlong" { 742 if len(a.Replacement) == 2 && len(a.Type) == 3 { 743 lang.updateLater(a.Replacement, a.Type) 744 } 745 } else if len(a.Type) <= 3 { 746 switch a.Reason { 747 case "macrolanguage": 748 aliasTypeMap[a.Type] = Macro 749 case "deprecated": 750 // handled elsewhere 751 continue 752 case "bibliographic", "legacy": 753 if a.Type == "no" { 754 continue 755 } 756 aliasTypeMap[a.Type] = Legacy 757 default: 758 log.Fatalf("new %s alias: %s", a.Reason, a.Type) 759 } 760 langAliasMap.add(a.Type) 761 langAliasMap.updateLater(a.Type, repl) 762 } 763 } 764 // Manually add the mapping of "nb" (Norwegian) to its macro language. 765 // This can be removed if CLDR adopts this change. 766 langAliasMap.add("nb") 767 langAliasMap.updateLater("nb", "no") 768 aliasTypeMap["nb"] = Macro 769 770 for k, v := range b.registry { 771 // Also add deprecated values for 3-letter ISO codes, which CLDR omits. 772 if v.typ == "language" && v.deprecated != "" && v.preferred != "" { 773 langAliasMap.add(k) 774 langAliasMap.updateLater(k, v.preferred) 775 aliasTypeMap[k] = Deprecated 776 } 777 } 778 // Fix CLDR mappings. 779 lang.updateLater("tl", "tgl") 780 lang.updateLater("sh", "hbs") 781 lang.updateLater("mo", "mol") 782 lang.updateLater("no", "nor") 783 lang.updateLater("tw", "twi") 784 lang.updateLater("nb", "nob") 785 lang.updateLater("ak", "aka") 786 lang.updateLater("bh", "bih") 787 788 // Ensure that each 2-letter code is matched with a 3-letter code. 789 for _, v := range lang.s[1:] { 790 s, ok := lang.update[v] 791 if !ok { 792 if s, ok = lang.update[langAliasMap.update[v]]; !ok { 793 continue 794 } 795 lang.update[v] = s 796 } 797 if v[0] != s[0] { 798 altLangISO3.add(s) 799 altLangISO3.updateLater(s, v) 800 } 801 } 802 803 // Complete canonicalized language tags. 804 lang.freeze() 805 for i, v := range lang.s { 806 // We can avoid these manual entries by using the IANA registry directly. 807 // Seems easier to update the list manually, as changes are rare. 808 // The panic in this loop will trigger if we miss an entry. 809 add := "" 810 if s, ok := lang.update[v]; ok { 811 if s[0] == v[0] { 812 add = s[1:] 813 } else { 814 add = string([]byte{0, byte(altLangISO3.index(s))}) 815 } 816 } else if len(v) == 3 { 817 add = "\x00" 818 } else { 819 log.Panicf("no data for long form of %q", v) 820 } 821 lang.s[i] += add 822 } 823 b.writeConst("lang", tag.Index(lang.join())) 824 825 b.writeConst("langNoIndexOffset", len(b.lang.s)) 826 827 // space of all valid 3-letter language identifiers. 828 b.writeBitVector("langNoIndex", b.langNoIndex.slice()) 829 830 altLangIndex := []uint16{} 831 for i, s := range altLangISO3.slice() { 832 altLangISO3.s[i] += string([]byte{byte(len(altLangIndex))}) 833 if i > 0 { 834 idx := b.lang.index(altLangISO3.update[s]) 835 altLangIndex = append(altLangIndex, uint16(idx)) 836 } 837 } 838 b.writeConst("altLangISO3", tag.Index(altLangISO3.join())) 839 b.writeSlice("altLangIndex", altLangIndex) 840 841 b.writeSortedMap("AliasMap", &langAliasMap, b.langIndex) 842 types := make([]AliasType, len(langAliasMap.s)) 843 for i, s := range langAliasMap.s { 844 types[i] = aliasTypeMap[s] 845 } 846 b.writeSlice("AliasTypes", types) 847} 848 849var scriptConsts = []string{ 850 "Latn", "Hani", "Hans", "Hant", "Qaaa", "Qaai", "Qabx", "Zinh", "Zyyy", 851 "Zzzz", 852} 853 854func (b *builder) writeScript() { 855 b.writeConsts(b.script.index, scriptConsts...) 856 b.writeConst("script", tag.Index(b.script.join())) 857 858 supp := make([]uint8, len(b.lang.slice())) 859 for i, v := range b.lang.slice()[1:] { 860 if sc := b.registry[v].suppressScript; sc != "" { 861 supp[i+1] = uint8(b.script.index(sc)) 862 } 863 } 864 b.writeSlice("suppressScript", supp) 865 866 // There is only one deprecated script in CLDR. This value is hard-coded. 867 // We check here if the code must be updated. 868 for _, a := range b.supp.Metadata.Alias.ScriptAlias { 869 if a.Type != "Qaai" { 870 log.Panicf("unexpected deprecated stript %q", a.Type) 871 } 872 } 873} 874 875func parseM49(s string) int16 { 876 if len(s) == 0 { 877 return 0 878 } 879 v, err := strconv.ParseUint(s, 10, 10) 880 failOnError(err) 881 return int16(v) 882} 883 884var regionConsts = []string{ 885 "001", "419", "BR", "CA", "ES", "GB", "MD", "PT", "UK", "US", 886 "ZZ", "XA", "XC", "XK", // Unofficial tag for Kosovo. 887} 888 889func (b *builder) writeRegion() { 890 b.writeConsts(b.region.index, regionConsts...) 891 892 isoOffset := b.region.index("AA") 893 m49map := make([]int16, len(b.region.slice())) 894 fromM49map := make(map[int16]int) 895 altRegionISO3 := "" 896 altRegionIDs := []uint16{} 897 898 b.writeConst("isoRegionOffset", isoOffset) 899 900 // 2-letter region lookup and mapping to numeric codes. 901 regionISO := b.region.clone() 902 regionISO.s = regionISO.s[isoOffset:] 903 regionISO.sorted = false 904 905 regionTypes := make([]byte, len(b.region.s)) 906 907 // Is the region valid BCP 47? 908 for s, e := range b.registry { 909 if len(s) == 2 && s == strings.ToUpper(s) { 910 i := b.region.index(s) 911 for _, d := range e.description { 912 if strings.Contains(d, "Private use") { 913 regionTypes[i] = iso3166UserAssigned 914 } 915 } 916 regionTypes[i] |= bcp47Region 917 } 918 } 919 920 // Is the region a valid ccTLD? 921 r := gen.OpenIANAFile("domains/root/db") 922 defer r.Close() 923 924 buf, err := ioutil.ReadAll(r) 925 failOnError(err) 926 re := regexp.MustCompile(`"/domains/root/db/([a-z]{2}).html"`) 927 for _, m := range re.FindAllSubmatch(buf, -1) { 928 i := b.region.index(strings.ToUpper(string(m[1]))) 929 regionTypes[i] |= ccTLD 930 } 931 932 b.writeSlice("regionTypes", regionTypes) 933 934 iso3Set := make(map[string]int) 935 update := func(iso2, iso3 string) { 936 i := regionISO.index(iso2) 937 if j, ok := iso3Set[iso3]; !ok && iso3[0] == iso2[0] { 938 regionISO.s[i] += iso3[1:] 939 iso3Set[iso3] = -1 940 } else { 941 if ok && j >= 0 { 942 regionISO.s[i] += string([]byte{0, byte(j)}) 943 } else { 944 iso3Set[iso3] = len(altRegionISO3) 945 regionISO.s[i] += string([]byte{0, byte(len(altRegionISO3))}) 946 altRegionISO3 += iso3 947 altRegionIDs = append(altRegionIDs, uint16(isoOffset+i)) 948 } 949 } 950 } 951 for _, tc := range b.supp.CodeMappings.TerritoryCodes { 952 i := regionISO.index(tc.Type) + isoOffset 953 if d := m49map[i]; d != 0 { 954 log.Panicf("%s found as a duplicate UN.M49 code of %03d", tc.Numeric, d) 955 } 956 m49 := parseM49(tc.Numeric) 957 m49map[i] = m49 958 if r := fromM49map[m49]; r == 0 { 959 fromM49map[m49] = i 960 } else if r != i { 961 dep := b.registry[regionISO.s[r-isoOffset]].deprecated 962 if t := b.registry[tc.Type]; t != nil && dep != "" && (t.deprecated == "" || t.deprecated > dep) { 963 fromM49map[m49] = i 964 } 965 } 966 } 967 for _, ta := range b.supp.Metadata.Alias.TerritoryAlias { 968 if len(ta.Type) == 3 && ta.Type[0] <= '9' && len(ta.Replacement) == 2 { 969 from := parseM49(ta.Type) 970 if r := fromM49map[from]; r == 0 { 971 fromM49map[from] = regionISO.index(ta.Replacement) + isoOffset 972 } 973 } 974 } 975 for _, tc := range b.supp.CodeMappings.TerritoryCodes { 976 if len(tc.Alpha3) == 3 { 977 update(tc.Type, tc.Alpha3) 978 } 979 } 980 // This entries are not included in territoryCodes. Mostly 3-letter variants 981 // of deleted codes and an entry for QU. 982 for _, m := range []struct{ iso2, iso3 string }{ 983 {"CT", "CTE"}, 984 {"DY", "DHY"}, 985 {"HV", "HVO"}, 986 {"JT", "JTN"}, 987 {"MI", "MID"}, 988 {"NH", "NHB"}, 989 {"NQ", "ATN"}, 990 {"PC", "PCI"}, 991 {"PU", "PUS"}, 992 {"PZ", "PCZ"}, 993 {"RH", "RHO"}, 994 {"VD", "VDR"}, 995 {"WK", "WAK"}, 996 // These three-letter codes are used for others as well. 997 {"FQ", "ATF"}, 998 } { 999 update(m.iso2, m.iso3) 1000 } 1001 for i, s := range regionISO.s { 1002 if len(s) != 4 { 1003 regionISO.s[i] = s + " " 1004 } 1005 } 1006 b.writeConst("regionISO", tag.Index(regionISO.join())) 1007 b.writeConst("altRegionISO3", altRegionISO3) 1008 b.writeSlice("altRegionIDs", altRegionIDs) 1009 1010 // Create list of deprecated regions. 1011 // TODO: consider inserting SF -> FI. Not included by CLDR, but is the only 1012 // Transitionally-reserved mapping not included. 1013 regionOldMap := stringSet{} 1014 // Include regions in territoryAlias (not all are in the IANA registry!) 1015 for _, reg := range b.supp.Metadata.Alias.TerritoryAlias { 1016 if len(reg.Type) == 2 && reg.Reason == "deprecated" && len(reg.Replacement) == 2 { 1017 regionOldMap.add(reg.Type) 1018 regionOldMap.updateLater(reg.Type, reg.Replacement) 1019 i, _ := regionISO.find(reg.Type) 1020 j, _ := regionISO.find(reg.Replacement) 1021 if k := m49map[i+isoOffset]; k == 0 { 1022 m49map[i+isoOffset] = m49map[j+isoOffset] 1023 } 1024 } 1025 } 1026 b.writeSortedMap("regionOldMap", ®ionOldMap, func(s string) uint16 { 1027 return uint16(b.region.index(s)) 1028 }) 1029 // 3-digit region lookup, groupings. 1030 for i := 1; i < isoOffset; i++ { 1031 m := parseM49(b.region.s[i]) 1032 m49map[i] = m 1033 fromM49map[m] = i 1034 } 1035 b.writeSlice("m49", m49map) 1036 1037 const ( 1038 searchBits = 7 1039 regionBits = 9 1040 ) 1041 if len(m49map) >= 1<<regionBits { 1042 log.Fatalf("Maximum number of regions exceeded: %d > %d", len(m49map), 1<<regionBits) 1043 } 1044 m49Index := [9]int16{} 1045 fromM49 := []uint16{} 1046 m49 := []int{} 1047 for k, _ := range fromM49map { 1048 m49 = append(m49, int(k)) 1049 } 1050 sort.Ints(m49) 1051 for _, k := range m49[1:] { 1052 val := (k & (1<<searchBits - 1)) << regionBits 1053 fromM49 = append(fromM49, uint16(val|fromM49map[int16(k)])) 1054 m49Index[1:][k>>searchBits] = int16(len(fromM49)) 1055 } 1056 b.writeSlice("m49Index", m49Index) 1057 b.writeSlice("fromM49", fromM49) 1058} 1059 1060const ( 1061 // TODO: put these lists in regionTypes as user data? Could be used for 1062 // various optimizations and refinements and could be exposed in the API. 1063 iso3166Except = "AC CP DG EA EU FX IC SU TA UK" 1064 iso3166Trans = "AN BU CS NT TP YU ZR" // SF is not in our set of Regions. 1065 // DY and RH are actually not deleted, but indeterminately reserved. 1066 iso3166DelCLDR = "CT DD DY FQ HV JT MI NH NQ PC PU PZ RH VD WK YD" 1067) 1068 1069const ( 1070 iso3166UserAssigned = 1 << iota 1071 ccTLD 1072 bcp47Region 1073) 1074 1075func find(list []string, s string) int { 1076 for i, t := range list { 1077 if t == s { 1078 return i 1079 } 1080 } 1081 return -1 1082} 1083 1084// writeVariants generates per-variant information and creates a map from variant 1085// name to index value. We assign index values such that sorting multiple 1086// variants by index value will result in the correct order. 1087// There are two types of variants: specialized and general. Specialized variants 1088// are only applicable to certain language or language-script pairs. Generalized 1089// variants apply to any language. Generalized variants always sort after 1090// specialized variants. We will therefore always assign a higher index value 1091// to a generalized variant than any other variant. Generalized variants are 1092// sorted alphabetically among themselves. 1093// Specialized variants may also sort after other specialized variants. Such 1094// variants will be ordered after any of the variants they may follow. 1095// We assume that if a variant x is followed by a variant y, then for any prefix 1096// p of x, p-x is a prefix of y. This allows us to order tags based on the 1097// maximum of the length of any of its prefixes. 1098// TODO: it is possible to define a set of Prefix values on variants such that 1099// a total order cannot be defined to the point that this algorithm breaks. 1100// In other words, we cannot guarantee the same order of variants for the 1101// future using the same algorithm or for non-compliant combinations of 1102// variants. For this reason, consider using simple alphabetic sorting 1103// of variants and ignore Prefix restrictions altogether. 1104func (b *builder) writeVariant() { 1105 generalized := stringSet{} 1106 specialized := stringSet{} 1107 specializedExtend := stringSet{} 1108 // Collate the variants by type and check assumptions. 1109 for _, v := range b.variant.slice() { 1110 e := b.registry[v] 1111 if len(e.prefix) == 0 { 1112 generalized.add(v) 1113 continue 1114 } 1115 c := strings.Split(e.prefix[0], "-") 1116 hasScriptOrRegion := false 1117 if len(c) > 1 { 1118 _, hasScriptOrRegion = b.script.find(c[1]) 1119 if !hasScriptOrRegion { 1120 _, hasScriptOrRegion = b.region.find(c[1]) 1121 1122 } 1123 } 1124 if len(c) == 1 || len(c) == 2 && hasScriptOrRegion { 1125 // Variant is preceded by a language. 1126 specialized.add(v) 1127 continue 1128 } 1129 // Variant is preceded by another variant. 1130 specializedExtend.add(v) 1131 prefix := c[0] + "-" 1132 if hasScriptOrRegion { 1133 prefix += c[1] 1134 } 1135 for _, p := range e.prefix { 1136 // Verify that the prefix minus the last element is a prefix of the 1137 // predecessor element. 1138 i := strings.LastIndex(p, "-") 1139 pred := b.registry[p[i+1:]] 1140 if find(pred.prefix, p[:i]) < 0 { 1141 log.Fatalf("prefix %q for variant %q not consistent with predecessor spec", p, v) 1142 } 1143 // The sorting used below does not work in the general case. It works 1144 // if we assume that variants that may be followed by others only have 1145 // prefixes of the same length. Verify this. 1146 count := strings.Count(p[:i], "-") 1147 for _, q := range pred.prefix { 1148 if c := strings.Count(q, "-"); c != count { 1149 log.Fatalf("variant %q preceding %q has a prefix %q of size %d; want %d", p[i+1:], v, q, c, count) 1150 } 1151 } 1152 if !strings.HasPrefix(p, prefix) { 1153 log.Fatalf("prefix %q of variant %q should start with %q", p, v, prefix) 1154 } 1155 } 1156 } 1157 1158 // Sort extended variants. 1159 a := specializedExtend.s 1160 less := func(v, w string) bool { 1161 // Sort by the maximum number of elements. 1162 maxCount := func(s string) (max int) { 1163 for _, p := range b.registry[s].prefix { 1164 if c := strings.Count(p, "-"); c > max { 1165 max = c 1166 } 1167 } 1168 return 1169 } 1170 if cv, cw := maxCount(v), maxCount(w); cv != cw { 1171 return cv < cw 1172 } 1173 // Sort by name as tie breaker. 1174 return v < w 1175 } 1176 sort.Sort(funcSorter{less, sort.StringSlice(a)}) 1177 specializedExtend.frozen = true 1178 1179 // Create index from variant name to index. 1180 variantIndex := make(map[string]uint8) 1181 add := func(s []string) { 1182 for _, v := range s { 1183 variantIndex[v] = uint8(len(variantIndex)) 1184 } 1185 } 1186 add(specialized.slice()) 1187 add(specializedExtend.s) 1188 numSpecialized := len(variantIndex) 1189 add(generalized.slice()) 1190 if n := len(variantIndex); n > 255 { 1191 log.Fatalf("maximum number of variants exceeded: was %d; want <= 255", n) 1192 } 1193 b.writeMap("variantIndex", variantIndex) 1194 b.writeConst("variantNumSpecialized", numSpecialized) 1195} 1196 1197func (b *builder) writeLanguageInfo() { 1198} 1199 1200// writeLikelyData writes tables that are used both for finding parent relations and for 1201// language matching. Each entry contains additional bits to indicate the status of the 1202// data to know when it cannot be used for parent relations. 1203func (b *builder) writeLikelyData() { 1204 const ( 1205 isList = 1 << iota 1206 scriptInFrom 1207 regionInFrom 1208 ) 1209 type ( // generated types 1210 likelyScriptRegion struct { 1211 region uint16 1212 script uint8 1213 flags uint8 1214 } 1215 likelyLangScript struct { 1216 lang uint16 1217 script uint8 1218 flags uint8 1219 } 1220 likelyLangRegion struct { 1221 lang uint16 1222 region uint16 1223 } 1224 // likelyTag is used for getting likely tags for group regions, where 1225 // the likely region might be a region contained in the group. 1226 likelyTag struct { 1227 lang uint16 1228 region uint16 1229 script uint8 1230 } 1231 ) 1232 var ( // generated variables 1233 likelyRegionGroup = make([]likelyTag, len(b.groups)) 1234 likelyLang = make([]likelyScriptRegion, len(b.lang.s)) 1235 likelyRegion = make([]likelyLangScript, len(b.region.s)) 1236 likelyScript = make([]likelyLangRegion, len(b.script.s)) 1237 likelyLangList = []likelyScriptRegion{} 1238 likelyRegionList = []likelyLangScript{} 1239 ) 1240 type fromTo struct { 1241 from, to []string 1242 } 1243 langToOther := map[int][]fromTo{} 1244 regionToOther := map[int][]fromTo{} 1245 for _, m := range b.supp.LikelySubtags.LikelySubtag { 1246 from := strings.Split(m.From, "_") 1247 to := strings.Split(m.To, "_") 1248 if len(to) != 3 { 1249 log.Fatalf("invalid number of subtags in %q: found %d, want 3", m.To, len(to)) 1250 } 1251 if len(from) > 3 { 1252 log.Fatalf("invalid number of subtags: found %d, want 1-3", len(from)) 1253 } 1254 if from[0] != to[0] && from[0] != "und" { 1255 log.Fatalf("unexpected language change in expansion: %s -> %s", from, to) 1256 } 1257 if len(from) == 3 { 1258 if from[2] != to[2] { 1259 log.Fatalf("unexpected region change in expansion: %s -> %s", from, to) 1260 } 1261 if from[0] != "und" { 1262 log.Fatalf("unexpected fully specified from tag: %s -> %s", from, to) 1263 } 1264 } 1265 if len(from) == 1 || from[0] != "und" { 1266 id := 0 1267 if from[0] != "und" { 1268 id = b.lang.index(from[0]) 1269 } 1270 langToOther[id] = append(langToOther[id], fromTo{from, to}) 1271 } else if len(from) == 2 && len(from[1]) == 4 { 1272 sid := b.script.index(from[1]) 1273 likelyScript[sid].lang = uint16(b.langIndex(to[0])) 1274 likelyScript[sid].region = uint16(b.region.index(to[2])) 1275 } else { 1276 r := b.region.index(from[len(from)-1]) 1277 if id, ok := b.groups[r]; ok { 1278 if from[0] != "und" { 1279 log.Fatalf("region changed unexpectedly: %s -> %s", from, to) 1280 } 1281 likelyRegionGroup[id].lang = uint16(b.langIndex(to[0])) 1282 likelyRegionGroup[id].script = uint8(b.script.index(to[1])) 1283 likelyRegionGroup[id].region = uint16(b.region.index(to[2])) 1284 } else { 1285 regionToOther[r] = append(regionToOther[r], fromTo{from, to}) 1286 } 1287 } 1288 } 1289 b.writeType(likelyLangRegion{}) 1290 b.writeSlice("likelyScript", likelyScript) 1291 1292 for id := range b.lang.s { 1293 list := langToOther[id] 1294 if len(list) == 1 { 1295 likelyLang[id].region = uint16(b.region.index(list[0].to[2])) 1296 likelyLang[id].script = uint8(b.script.index(list[0].to[1])) 1297 } else if len(list) > 1 { 1298 likelyLang[id].flags = isList 1299 likelyLang[id].region = uint16(len(likelyLangList)) 1300 likelyLang[id].script = uint8(len(list)) 1301 for _, x := range list { 1302 flags := uint8(0) 1303 if len(x.from) > 1 { 1304 if x.from[1] == x.to[2] { 1305 flags = regionInFrom 1306 } else { 1307 flags = scriptInFrom 1308 } 1309 } 1310 likelyLangList = append(likelyLangList, likelyScriptRegion{ 1311 region: uint16(b.region.index(x.to[2])), 1312 script: uint8(b.script.index(x.to[1])), 1313 flags: flags, 1314 }) 1315 } 1316 } 1317 } 1318 // TODO: merge suppressScript data with this table. 1319 b.writeType(likelyScriptRegion{}) 1320 b.writeSlice("likelyLang", likelyLang) 1321 b.writeSlice("likelyLangList", likelyLangList) 1322 1323 for id := range b.region.s { 1324 list := regionToOther[id] 1325 if len(list) == 1 { 1326 likelyRegion[id].lang = uint16(b.langIndex(list[0].to[0])) 1327 likelyRegion[id].script = uint8(b.script.index(list[0].to[1])) 1328 if len(list[0].from) > 2 { 1329 likelyRegion[id].flags = scriptInFrom 1330 } 1331 } else if len(list) > 1 { 1332 likelyRegion[id].flags = isList 1333 likelyRegion[id].lang = uint16(len(likelyRegionList)) 1334 likelyRegion[id].script = uint8(len(list)) 1335 for i, x := range list { 1336 if len(x.from) == 2 && i != 0 || i > 0 && len(x.from) != 3 { 1337 log.Fatalf("unspecified script must be first in list: %v at %d", x.from, i) 1338 } 1339 x := likelyLangScript{ 1340 lang: uint16(b.langIndex(x.to[0])), 1341 script: uint8(b.script.index(x.to[1])), 1342 } 1343 if len(list[0].from) > 2 { 1344 x.flags = scriptInFrom 1345 } 1346 likelyRegionList = append(likelyRegionList, x) 1347 } 1348 } 1349 } 1350 b.writeType(likelyLangScript{}) 1351 b.writeSlice("likelyRegion", likelyRegion) 1352 b.writeSlice("likelyRegionList", likelyRegionList) 1353 1354 b.writeType(likelyTag{}) 1355 b.writeSlice("likelyRegionGroup", likelyRegionGroup) 1356} 1357 1358func (b *builder) writeRegionInclusionData() { 1359 var ( 1360 // mm holds for each group the set of groups with a distance of 1. 1361 mm = make(map[int][]index) 1362 1363 // containment holds for each group the transitive closure of 1364 // containment of other groups. 1365 containment = make(map[index][]index) 1366 ) 1367 for _, g := range b.supp.TerritoryContainment.Group { 1368 // Skip UN and EURO zone as they are flattening the containment 1369 // relationship. 1370 if g.Type == "EZ" || g.Type == "UN" { 1371 continue 1372 } 1373 group := b.region.index(g.Type) 1374 groupIdx := b.groups[group] 1375 for _, mem := range strings.Split(g.Contains, " ") { 1376 r := b.region.index(mem) 1377 mm[r] = append(mm[r], groupIdx) 1378 if g, ok := b.groups[r]; ok { 1379 mm[group] = append(mm[group], g) 1380 containment[groupIdx] = append(containment[groupIdx], g) 1381 } 1382 } 1383 } 1384 1385 regionContainment := make([]uint64, len(b.groups)) 1386 for _, g := range b.groups { 1387 l := containment[g] 1388 1389 // Compute the transitive closure of containment. 1390 for i := 0; i < len(l); i++ { 1391 l = append(l, containment[l[i]]...) 1392 } 1393 1394 // Compute the bitmask. 1395 regionContainment[g] = 1 << g 1396 for _, v := range l { 1397 regionContainment[g] |= 1 << v 1398 } 1399 } 1400 b.writeSlice("regionContainment", regionContainment) 1401 1402 regionInclusion := make([]uint8, len(b.region.s)) 1403 bvs := make(map[uint64]index) 1404 // Make the first bitvector positions correspond with the groups. 1405 for r, i := range b.groups { 1406 bv := uint64(1 << i) 1407 for _, g := range mm[r] { 1408 bv |= 1 << g 1409 } 1410 bvs[bv] = i 1411 regionInclusion[r] = uint8(bvs[bv]) 1412 } 1413 for r := 1; r < len(b.region.s); r++ { 1414 if _, ok := b.groups[r]; !ok { 1415 bv := uint64(0) 1416 for _, g := range mm[r] { 1417 bv |= 1 << g 1418 } 1419 if bv == 0 { 1420 // Pick the world for unspecified regions. 1421 bv = 1 << b.groups[b.region.index("001")] 1422 } 1423 if _, ok := bvs[bv]; !ok { 1424 bvs[bv] = index(len(bvs)) 1425 } 1426 regionInclusion[r] = uint8(bvs[bv]) 1427 } 1428 } 1429 b.writeSlice("regionInclusion", regionInclusion) 1430 regionInclusionBits := make([]uint64, len(bvs)) 1431 for k, v := range bvs { 1432 regionInclusionBits[v] = uint64(k) 1433 } 1434 // Add bit vectors for increasingly large distances until a fixed point is reached. 1435 regionInclusionNext := []uint8{} 1436 for i := 0; i < len(regionInclusionBits); i++ { 1437 bits := regionInclusionBits[i] 1438 next := bits 1439 for i := uint(0); i < uint(len(b.groups)); i++ { 1440 if bits&(1<<i) != 0 { 1441 next |= regionInclusionBits[i] 1442 } 1443 } 1444 if _, ok := bvs[next]; !ok { 1445 bvs[next] = index(len(bvs)) 1446 regionInclusionBits = append(regionInclusionBits, next) 1447 } 1448 regionInclusionNext = append(regionInclusionNext, uint8(bvs[next])) 1449 } 1450 b.writeSlice("regionInclusionBits", regionInclusionBits) 1451 b.writeSlice("regionInclusionNext", regionInclusionNext) 1452} 1453 1454type parentRel struct { 1455 lang uint16 1456 script uint8 1457 maxScript uint8 1458 toRegion uint16 1459 fromRegion []uint16 1460} 1461 1462func (b *builder) writeParents() { 1463 b.writeType(parentRel{}) 1464 1465 parents := []parentRel{} 1466 1467 // Construct parent overrides. 1468 n := 0 1469 for _, p := range b.data.Supplemental().ParentLocales.ParentLocale { 1470 // Skipping non-standard scripts to root is implemented using addTags. 1471 if p.Parent == "root" { 1472 continue 1473 } 1474 1475 sub := strings.Split(p.Parent, "_") 1476 parent := parentRel{lang: b.langIndex(sub[0])} 1477 if len(sub) == 2 { 1478 // TODO: check that all undefined scripts are indeed Latn in these 1479 // cases. 1480 parent.maxScript = uint8(b.script.index("Latn")) 1481 parent.toRegion = uint16(b.region.index(sub[1])) 1482 } else { 1483 parent.script = uint8(b.script.index(sub[1])) 1484 parent.maxScript = parent.script 1485 parent.toRegion = uint16(b.region.index(sub[2])) 1486 } 1487 for _, c := range strings.Split(p.Locales, " ") { 1488 region := b.region.index(c[strings.LastIndex(c, "_")+1:]) 1489 parent.fromRegion = append(parent.fromRegion, uint16(region)) 1490 } 1491 parents = append(parents, parent) 1492 n += len(parent.fromRegion) 1493 } 1494 b.writeSliceAddSize("parents", n*2, parents) 1495} 1496 1497func main() { 1498 gen.Init() 1499 1500 gen.Repackage("gen_common.go", "common.go", "language") 1501 1502 w := gen.NewCodeWriter() 1503 defer w.WriteGoFile("tables.go", "language") 1504 1505 fmt.Fprintln(w, `import "golang.org/x/text/internal/tag"`) 1506 1507 b := newBuilder(w) 1508 gen.WriteCLDRVersion(w) 1509 1510 b.parseIndices() 1511 b.writeType(FromTo{}) 1512 b.writeLanguage() 1513 b.writeScript() 1514 b.writeRegion() 1515 b.writeVariant() 1516 // TODO: b.writeLocale() 1517 b.computeRegionGroups() 1518 b.writeLikelyData() 1519 b.writeRegionInclusionData() 1520 b.writeParents() 1521} 1522