1// Copyright 2013 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// +build ignore 6 7// Language tag table generator. 8// Data read from the web. 9 10package main 11 12import ( 13 "bufio" 14 "flag" 15 "fmt" 16 "io" 17 "io/ioutil" 18 "log" 19 "math" 20 "reflect" 21 "regexp" 22 "sort" 23 "strconv" 24 "strings" 25 26 "golang.org/x/text/internal/gen" 27 "golang.org/x/text/internal/tag" 28 "golang.org/x/text/unicode/cldr" 29) 30 31var ( 32 test = flag.Bool("test", 33 false, 34 "test existing tables; can be used to compare web data with package data.") 35 outputFile = flag.String("output", 36 "tables.go", 37 "output file for generated tables") 38) 39 40var comment = []string{ 41 ` 42lang holds an alphabetically sorted list of ISO-639 language identifiers. 43All entries are 4 bytes. The index of the identifier (divided by 4) is the language tag. 44For 2-byte language identifiers, the two successive bytes have the following meaning: 45 - if the first letter of the 2- and 3-letter ISO codes are the same: 46 the second and third letter of the 3-letter ISO code. 47 - otherwise: a 0 and a by 2 bits right-shifted index into altLangISO3. 48For 3-byte language identifiers the 4th byte is 0.`, 49 ` 50langNoIndex is a bit vector of all 3-letter language codes that are not used as an index 51in lookup tables. The language ids for these language codes are derived directly 52from the letters and are not consecutive.`, 53 ` 54altLangISO3 holds an alphabetically sorted list of 3-letter language code alternatives 55to 2-letter language codes that cannot be derived using the method described above. 56Each 3-letter code is followed by its 1-byte langID.`, 57 ` 58altLangIndex is used to convert indexes in altLangISO3 to langIDs.`, 59 ` 60langAliasMap maps langIDs to their suggested replacements.`, 61 ` 62script is an alphabetically sorted list of ISO 15924 codes. The index 63of the script in the string, divided by 4, is the internal scriptID.`, 64 ` 65isoRegionOffset needs to be added to the index of regionISO to obtain the regionID 66for 2-letter ISO codes. (The first isoRegionOffset regionIDs are reserved for 67the UN.M49 codes used for groups.)`, 68 ` 69regionISO holds a list of alphabetically sorted 2-letter ISO region codes. 70Each 2-letter codes is followed by two bytes with the following meaning: 71 - [A-Z}{2}: the first letter of the 2-letter code plus these two 72 letters form the 3-letter ISO code. 73 - 0, n: index into altRegionISO3.`, 74 ` 75regionTypes defines the status of a region for various standards.`, 76 ` 77m49 maps regionIDs to UN.M49 codes. The first isoRegionOffset entries are 78codes indicating collections of regions.`, 79 ` 80m49Index gives indexes into fromM49 based on the three most significant bits 81of a 10-bit UN.M49 code. To search an UN.M49 code in fromM49, search in 82 fromM49[m49Index[msb39(code)]:m49Index[msb3(code)+1]] 83for an entry where the first 7 bits match the 7 lsb of the UN.M49 code. 84The region code is stored in the 9 lsb of the indexed value.`, 85 ` 86fromM49 contains entries to map UN.M49 codes to regions. See m49Index for details.`, 87 ` 88altRegionISO3 holds a list of 3-letter region codes that cannot be 89mapped to 2-letter codes using the default algorithm. This is a short list.`, 90 ` 91altRegionIDs holds a list of regionIDs the positions of which match those 92of the 3-letter ISO codes in altRegionISO3.`, 93 ` 94variantNumSpecialized is the number of specialized variants in variants.`, 95 ` 96suppressScript is an index from langID to the dominant script for that language, 97if it exists. If a script is given, it should be suppressed from the language tag.`, 98 ` 99likelyLang is a lookup table, indexed by langID, for the most likely 100scripts and regions given incomplete information. If more entries exist for a 101given language, region and script are the index and size respectively 102of the list in likelyLangList.`, 103 ` 104likelyLangList holds lists info associated with likelyLang.`, 105 ` 106likelyRegion is a lookup table, indexed by regionID, for the most likely 107languages and scripts given incomplete information. If more entries exist 108for a given regionID, lang and script are the index and size respectively 109of the list in likelyRegionList. 110TODO: exclude containers and user-definable regions from the list.`, 111 ` 112likelyRegionList holds lists info associated with likelyRegion.`, 113 ` 114likelyScript is a lookup table, indexed by scriptID, for the most likely 115languages and regions given a script.`, 116 ` 117matchLang holds pairs of langIDs of base languages that are typically 118mutually intelligible. Each pair is associated with a confidence and 119whether the intelligibility goes one or both ways.`, 120 ` 121matchScript holds pairs of scriptIDs where readers of one script 122can typically also read the other. Each is associated with a confidence.`, 123 ` 124nRegionGroups is the number of region groups.`, 125 ` 126regionInclusion maps region identifiers to sets of regions in regionInclusionBits, 127where each set holds all groupings that are directly connected in a region 128containment graph.`, 129 ` 130regionInclusionBits is an array of bit vectors where every vector represents 131a set of region groupings. These sets are used to compute the distance 132between two regions for the purpose of language matching.`, 133 ` 134regionInclusionNext marks, for each entry in regionInclusionBits, the set of 135all groups that are reachable from the groups set in the respective entry.`, 136} 137 138// TODO: consider changing some of these structures to tries. This can reduce 139// memory, but may increase the need for memory allocations. This could be 140// mitigated if we can piggyback on language tags for common cases. 141 142func failOnError(e error) { 143 if e != nil { 144 log.Panic(e) 145 } 146} 147 148type setType int 149 150const ( 151 Indexed setType = 1 + iota // all elements must be of same size 152 Linear 153) 154 155type stringSet struct { 156 s []string 157 sorted, frozen bool 158 159 // We often need to update values after the creation of an index is completed. 160 // We include a convenience map for keeping track of this. 161 update map[string]string 162 typ setType // used for checking. 163} 164 165func (ss *stringSet) clone() stringSet { 166 c := *ss 167 c.s = append([]string(nil), c.s...) 168 return c 169} 170 171func (ss *stringSet) setType(t setType) { 172 if ss.typ != t && ss.typ != 0 { 173 log.Panicf("type %d cannot be assigned as it was already %d", t, ss.typ) 174 } 175} 176 177// parse parses a whitespace-separated string and initializes ss with its 178// components. 179func (ss *stringSet) parse(s string) { 180 scan := bufio.NewScanner(strings.NewReader(s)) 181 scan.Split(bufio.ScanWords) 182 for scan.Scan() { 183 ss.add(scan.Text()) 184 } 185} 186 187func (ss *stringSet) assertChangeable() { 188 if ss.frozen { 189 log.Panic("attempt to modify a frozen stringSet") 190 } 191} 192 193func (ss *stringSet) add(s string) { 194 ss.assertChangeable() 195 ss.s = append(ss.s, s) 196 ss.sorted = ss.frozen 197} 198 199func (ss *stringSet) freeze() { 200 ss.compact() 201 ss.frozen = true 202} 203 204func (ss *stringSet) compact() { 205 if ss.sorted { 206 return 207 } 208 a := ss.s 209 sort.Strings(a) 210 k := 0 211 for i := 1; i < len(a); i++ { 212 if a[k] != a[i] { 213 a[k+1] = a[i] 214 k++ 215 } 216 } 217 ss.s = a[:k+1] 218 ss.sorted = ss.frozen 219} 220 221type funcSorter struct { 222 fn func(a, b string) bool 223 sort.StringSlice 224} 225 226func (s funcSorter) Less(i, j int) bool { 227 return s.fn(s.StringSlice[i], s.StringSlice[j]) 228} 229 230func (ss *stringSet) sortFunc(f func(a, b string) bool) { 231 ss.compact() 232 sort.Sort(funcSorter{f, sort.StringSlice(ss.s)}) 233} 234 235func (ss *stringSet) remove(s string) { 236 ss.assertChangeable() 237 if i, ok := ss.find(s); ok { 238 copy(ss.s[i:], ss.s[i+1:]) 239 ss.s = ss.s[:len(ss.s)-1] 240 } 241} 242 243func (ss *stringSet) replace(ol, nu string) { 244 ss.s[ss.index(ol)] = nu 245 ss.sorted = ss.frozen 246} 247 248func (ss *stringSet) index(s string) int { 249 ss.setType(Indexed) 250 i, ok := ss.find(s) 251 if !ok { 252 if i < len(ss.s) { 253 log.Panicf("find: item %q is not in list. Closest match is %q.", s, ss.s[i]) 254 } 255 log.Panicf("find: item %q is not in list", s) 256 257 } 258 return i 259} 260 261func (ss *stringSet) find(s string) (int, bool) { 262 ss.compact() 263 i := sort.SearchStrings(ss.s, s) 264 return i, i != len(ss.s) && ss.s[i] == s 265} 266 267func (ss *stringSet) slice() []string { 268 ss.compact() 269 return ss.s 270} 271 272func (ss *stringSet) updateLater(v, key string) { 273 if ss.update == nil { 274 ss.update = map[string]string{} 275 } 276 ss.update[v] = key 277} 278 279// join joins the string and ensures that all entries are of the same length. 280func (ss *stringSet) join() string { 281 ss.setType(Indexed) 282 n := len(ss.s[0]) 283 for _, s := range ss.s { 284 if len(s) != n { 285 log.Panicf("join: not all entries are of the same length: %q", s) 286 } 287 } 288 ss.s = append(ss.s, strings.Repeat("\xff", n)) 289 return strings.Join(ss.s, "") 290} 291 292// ianaEntry holds information for an entry in the IANA Language Subtag Repository. 293// All types use the same entry. 294// See http://tools.ietf.org/html/bcp47#section-5.1 for a description of the various 295// fields. 296type ianaEntry struct { 297 typ string 298 description []string 299 scope string 300 added string 301 preferred string 302 deprecated string 303 suppressScript string 304 macro string 305 prefix []string 306} 307 308type builder struct { 309 w *gen.CodeWriter 310 hw io.Writer // MultiWriter for w and w.Hash 311 data *cldr.CLDR 312 supp *cldr.SupplementalData 313 314 // indices 315 locale stringSet // common locales 316 lang stringSet // canonical language ids (2 or 3 letter ISO codes) with data 317 langNoIndex stringSet // 3-letter ISO codes with no associated data 318 script stringSet // 4-letter ISO codes 319 region stringSet // 2-letter ISO or 3-digit UN M49 codes 320 variant stringSet // 4-8-alphanumeric variant code. 321 322 // Region codes that are groups with their corresponding group IDs. 323 groups map[int]index 324 325 // langInfo 326 registry map[string]*ianaEntry 327} 328 329type index uint 330 331func newBuilder(w *gen.CodeWriter) *builder { 332 r := gen.OpenCLDRCoreZip() 333 defer r.Close() 334 d := &cldr.Decoder{} 335 data, err := d.DecodeZip(r) 336 failOnError(err) 337 b := builder{ 338 w: w, 339 hw: io.MultiWriter(w, w.Hash), 340 data: data, 341 supp: data.Supplemental(), 342 } 343 b.parseRegistry() 344 return &b 345} 346 347func (b *builder) parseRegistry() { 348 r := gen.OpenIANAFile("assignments/language-subtag-registry") 349 defer r.Close() 350 b.registry = make(map[string]*ianaEntry) 351 352 scan := bufio.NewScanner(r) 353 scan.Split(bufio.ScanWords) 354 var record *ianaEntry 355 for more := scan.Scan(); more; { 356 key := scan.Text() 357 more = scan.Scan() 358 value := scan.Text() 359 switch key { 360 case "Type:": 361 record = &ianaEntry{typ: value} 362 case "Subtag:", "Tag:": 363 if s := strings.SplitN(value, "..", 2); len(s) > 1 { 364 for a := s[0]; a <= s[1]; a = inc(a) { 365 b.addToRegistry(a, record) 366 } 367 } else { 368 b.addToRegistry(value, record) 369 } 370 case "Suppress-Script:": 371 record.suppressScript = value 372 case "Added:": 373 record.added = value 374 case "Deprecated:": 375 record.deprecated = value 376 case "Macrolanguage:": 377 record.macro = value 378 case "Preferred-Value:": 379 record.preferred = value 380 case "Prefix:": 381 record.prefix = append(record.prefix, value) 382 case "Scope:": 383 record.scope = value 384 case "Description:": 385 buf := []byte(value) 386 for more = scan.Scan(); more; more = scan.Scan() { 387 b := scan.Bytes() 388 if b[0] == '%' || b[len(b)-1] == ':' { 389 break 390 } 391 buf = append(buf, ' ') 392 buf = append(buf, b...) 393 } 394 record.description = append(record.description, string(buf)) 395 continue 396 default: 397 continue 398 } 399 more = scan.Scan() 400 } 401 if scan.Err() != nil { 402 log.Panic(scan.Err()) 403 } 404} 405 406func (b *builder) addToRegistry(key string, entry *ianaEntry) { 407 if info, ok := b.registry[key]; ok { 408 if info.typ != "language" || entry.typ != "extlang" { 409 log.Fatalf("parseRegistry: tag %q already exists", key) 410 } 411 } else { 412 b.registry[key] = entry 413 } 414} 415 416var commentIndex = make(map[string]string) 417 418func init() { 419 for _, s := range comment { 420 key := strings.TrimSpace(strings.SplitN(s, " ", 2)[0]) 421 commentIndex[key] = s 422 } 423} 424 425func (b *builder) comment(name string) { 426 if s := commentIndex[name]; len(s) > 0 { 427 b.w.WriteComment(s) 428 } else { 429 fmt.Fprintln(b.w) 430 } 431} 432 433func (b *builder) pf(f string, x ...interface{}) { 434 fmt.Fprintf(b.hw, f, x...) 435 fmt.Fprint(b.hw, "\n") 436} 437 438func (b *builder) p(x ...interface{}) { 439 fmt.Fprintln(b.hw, x...) 440} 441 442func (b *builder) addSize(s int) { 443 b.w.Size += s 444 b.pf("// Size: %d bytes", s) 445} 446 447func (b *builder) writeConst(name string, x interface{}) { 448 b.comment(name) 449 b.w.WriteConst(name, x) 450} 451 452// writeConsts computes f(v) for all v in values and writes the results 453// as constants named _v to a single constant block. 454func (b *builder) writeConsts(f func(string) int, values ...string) { 455 b.pf("const (") 456 for _, v := range values { 457 b.pf("\t_%s = %v", v, f(v)) 458 } 459 b.pf(")") 460} 461 462// writeType writes the type of the given value, which must be a struct. 463func (b *builder) writeType(value interface{}) { 464 b.comment(reflect.TypeOf(value).Name()) 465 b.w.WriteType(value) 466} 467 468func (b *builder) writeSlice(name string, ss interface{}) { 469 b.writeSliceAddSize(name, 0, ss) 470} 471 472func (b *builder) writeSliceAddSize(name string, extraSize int, ss interface{}) { 473 b.comment(name) 474 b.w.Size += extraSize 475 v := reflect.ValueOf(ss) 476 t := v.Type().Elem() 477 b.pf("// Size: %d bytes, %d elements", v.Len()*int(t.Size())+extraSize, v.Len()) 478 479 fmt.Fprintf(b.w, "var %s = ", name) 480 b.w.WriteArray(ss) 481 b.p() 482} 483 484type fromTo struct { 485 from, to uint16 486} 487 488func (b *builder) writeSortedMap(name string, ss *stringSet, index func(s string) uint16) { 489 ss.sortFunc(func(a, b string) bool { 490 return index(a) < index(b) 491 }) 492 m := []fromTo{} 493 for _, s := range ss.s { 494 m = append(m, fromTo{index(s), index(ss.update[s])}) 495 } 496 b.writeSlice(name, m) 497} 498 499const base = 'z' - 'a' + 1 500 501func strToInt(s string) uint { 502 v := uint(0) 503 for i := 0; i < len(s); i++ { 504 v *= base 505 v += uint(s[i] - 'a') 506 } 507 return v 508} 509 510// converts the given integer to the original ASCII string passed to strToInt. 511// len(s) must match the number of characters obtained. 512func intToStr(v uint, s []byte) { 513 for i := len(s) - 1; i >= 0; i-- { 514 s[i] = byte(v%base) + 'a' 515 v /= base 516 } 517} 518 519func (b *builder) writeBitVector(name string, ss []string) { 520 vec := make([]uint8, int(math.Ceil(math.Pow(base, float64(len(ss[0])))/8))) 521 for _, s := range ss { 522 v := strToInt(s) 523 vec[v/8] |= 1 << (v % 8) 524 } 525 b.writeSlice(name, vec) 526} 527 528// TODO: convert this type into a list or two-stage trie. 529func (b *builder) writeMapFunc(name string, m map[string]string, f func(string) uint16) { 530 b.comment(name) 531 v := reflect.ValueOf(m) 532 sz := v.Len() * (2 + int(v.Type().Key().Size())) 533 for _, k := range m { 534 sz += len(k) 535 } 536 b.addSize(sz) 537 keys := []string{} 538 b.pf(`var %s = map[string]uint16{`, name) 539 for k := range m { 540 keys = append(keys, k) 541 } 542 sort.Strings(keys) 543 for _, k := range keys { 544 b.pf("\t%q: %v,", k, f(m[k])) 545 } 546 b.p("}") 547} 548 549func (b *builder) writeMap(name string, m interface{}) { 550 b.comment(name) 551 v := reflect.ValueOf(m) 552 sz := v.Len() * (2 + int(v.Type().Key().Size()) + int(v.Type().Elem().Size())) 553 b.addSize(sz) 554 f := strings.FieldsFunc(fmt.Sprintf("%#v", m), func(r rune) bool { 555 return strings.IndexRune("{}, ", r) != -1 556 }) 557 sort.Strings(f[1:]) 558 b.pf(`var %s = %s{`, name, f[0]) 559 for _, kv := range f[1:] { 560 b.pf("\t%s,", kv) 561 } 562 b.p("}") 563} 564 565func (b *builder) langIndex(s string) uint16 { 566 if s == "und" { 567 return 0 568 } 569 if i, ok := b.lang.find(s); ok { 570 return uint16(i) 571 } 572 return uint16(strToInt(s)) + uint16(len(b.lang.s)) 573} 574 575// inc advances the string to its lexicographical successor. 576func inc(s string) string { 577 const maxTagLength = 4 578 var buf [maxTagLength]byte 579 intToStr(strToInt(strings.ToLower(s))+1, buf[:len(s)]) 580 for i := 0; i < len(s); i++ { 581 if s[i] <= 'Z' { 582 buf[i] -= 'a' - 'A' 583 } 584 } 585 return string(buf[:len(s)]) 586} 587 588func (b *builder) parseIndices() { 589 meta := b.supp.Metadata 590 591 for k, v := range b.registry { 592 var ss *stringSet 593 switch v.typ { 594 case "language": 595 if len(k) == 2 || v.suppressScript != "" || v.scope == "special" { 596 b.lang.add(k) 597 continue 598 } else { 599 ss = &b.langNoIndex 600 } 601 case "region": 602 ss = &b.region 603 case "script": 604 ss = &b.script 605 case "variant": 606 ss = &b.variant 607 default: 608 continue 609 } 610 ss.add(k) 611 } 612 // Include any language for which there is data. 613 for _, lang := range b.data.Locales() { 614 if x := b.data.RawLDML(lang); false || 615 x.LocaleDisplayNames != nil || 616 x.Characters != nil || 617 x.Delimiters != nil || 618 x.Measurement != nil || 619 x.Dates != nil || 620 x.Numbers != nil || 621 x.Units != nil || 622 x.ListPatterns != nil || 623 x.Collations != nil || 624 x.Segmentations != nil || 625 x.Rbnf != nil || 626 x.Annotations != nil || 627 x.Metadata != nil { 628 629 from := strings.Split(lang, "_") 630 if lang := from[0]; lang != "root" { 631 b.lang.add(lang) 632 } 633 } 634 } 635 // Include locales for plural rules, which uses a different structure. 636 for _, plurals := range b.data.Supplemental().Plurals { 637 for _, rules := range plurals.PluralRules { 638 for _, lang := range strings.Split(rules.Locales, " ") { 639 if lang = strings.Split(lang, "_")[0]; lang != "root" { 640 b.lang.add(lang) 641 } 642 } 643 } 644 } 645 // Include languages in likely subtags. 646 for _, m := range b.supp.LikelySubtags.LikelySubtag { 647 from := strings.Split(m.From, "_") 648 b.lang.add(from[0]) 649 } 650 // Include ISO-639 alpha-3 bibliographic entries. 651 for _, a := range meta.Alias.LanguageAlias { 652 if a.Reason == "bibliographic" { 653 b.langNoIndex.add(a.Type) 654 } 655 } 656 // Include regions in territoryAlias (not all are in the IANA registry!) 657 for _, reg := range b.supp.Metadata.Alias.TerritoryAlias { 658 if len(reg.Type) == 2 { 659 b.region.add(reg.Type) 660 } 661 } 662 663 for _, s := range b.lang.s { 664 if len(s) == 3 { 665 b.langNoIndex.remove(s) 666 } 667 } 668 b.writeConst("numLanguages", len(b.lang.slice())+len(b.langNoIndex.slice())) 669 b.writeConst("numScripts", len(b.script.slice())) 670 b.writeConst("numRegions", len(b.region.slice())) 671 672 // Add dummy codes at the start of each list to represent "unspecified". 673 b.lang.add("---") 674 b.script.add("----") 675 b.region.add("---") 676 677 // common locales 678 b.locale.parse(meta.DefaultContent.Locales) 679} 680 681// TODO: region inclusion data will probably not be use used in future matchers. 682 683func (b *builder) computeRegionGroups() { 684 b.groups = make(map[int]index) 685 686 // Create group indices. 687 for i := 1; b.region.s[i][0] < 'A'; i++ { // Base M49 indices on regionID. 688 b.groups[i] = index(len(b.groups)) 689 } 690 for _, g := range b.supp.TerritoryContainment.Group { 691 // Skip UN and EURO zone as they are flattening the containment 692 // relationship. 693 if g.Type == "EZ" || g.Type == "UN" { 694 continue 695 } 696 group := b.region.index(g.Type) 697 if _, ok := b.groups[group]; !ok { 698 b.groups[group] = index(len(b.groups)) 699 } 700 } 701 if len(b.groups) > 64 { 702 log.Fatalf("only 64 groups supported, found %d", len(b.groups)) 703 } 704 b.writeConst("nRegionGroups", len(b.groups)) 705} 706 707var langConsts = []string{ 708 "af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", 709 "et", "fa", "fi", "fil", "fr", "gu", "he", "hi", "hr", "hu", "hy", "id", "is", 710 "it", "ja", "ka", "kk", "km", "kn", "ko", "ky", "lo", "lt", "lv", "mk", "ml", 711 "mn", "mo", "mr", "ms", "mul", "my", "nb", "ne", "nl", "no", "pa", "pl", "pt", 712 "ro", "ru", "sh", "si", "sk", "sl", "sq", "sr", "sv", "sw", "ta", "te", "th", 713 "tl", "tn", "tr", "uk", "ur", "uz", "vi", "zh", "zu", 714 715 // constants for grandfathered tags (if not already defined) 716 "jbo", "ami", "bnn", "hak", "tlh", "lb", "nv", "pwn", "tao", "tay", "tsu", 717 "nn", "sfb", "vgt", "sgg", "cmn", "nan", "hsn", 718} 719 720// writeLanguage generates all tables needed for language canonicalization. 721func (b *builder) writeLanguage() { 722 meta := b.supp.Metadata 723 724 b.writeConst("nonCanonicalUnd", b.lang.index("und")) 725 b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...) 726 b.writeConst("langPrivateStart", b.langIndex("qaa")) 727 b.writeConst("langPrivateEnd", b.langIndex("qtz")) 728 729 // Get language codes that need to be mapped (overlong 3-letter codes, 730 // deprecated 2-letter codes, legacy and grandfathered tags.) 731 langAliasMap := stringSet{} 732 aliasTypeMap := map[string]langAliasType{} 733 734 // altLangISO3 get the alternative ISO3 names that need to be mapped. 735 altLangISO3 := stringSet{} 736 // Add dummy start to avoid the use of index 0. 737 altLangISO3.add("---") 738 altLangISO3.updateLater("---", "aa") 739 740 lang := b.lang.clone() 741 for _, a := range meta.Alias.LanguageAlias { 742 if a.Replacement == "" { 743 a.Replacement = "und" 744 } 745 // TODO: support mapping to tags 746 repl := strings.SplitN(a.Replacement, "_", 2)[0] 747 if a.Reason == "overlong" { 748 if len(a.Replacement) == 2 && len(a.Type) == 3 { 749 lang.updateLater(a.Replacement, a.Type) 750 } 751 } else if len(a.Type) <= 3 { 752 switch a.Reason { 753 case "macrolanguage": 754 aliasTypeMap[a.Type] = langMacro 755 case "deprecated": 756 // handled elsewhere 757 continue 758 case "bibliographic", "legacy": 759 if a.Type == "no" { 760 continue 761 } 762 aliasTypeMap[a.Type] = langLegacy 763 default: 764 log.Fatalf("new %s alias: %s", a.Reason, a.Type) 765 } 766 langAliasMap.add(a.Type) 767 langAliasMap.updateLater(a.Type, repl) 768 } 769 } 770 // Manually add the mapping of "nb" (Norwegian) to its macro language. 771 // This can be removed if CLDR adopts this change. 772 langAliasMap.add("nb") 773 langAliasMap.updateLater("nb", "no") 774 aliasTypeMap["nb"] = langMacro 775 776 for k, v := range b.registry { 777 // Also add deprecated values for 3-letter ISO codes, which CLDR omits. 778 if v.typ == "language" && v.deprecated != "" && v.preferred != "" { 779 langAliasMap.add(k) 780 langAliasMap.updateLater(k, v.preferred) 781 aliasTypeMap[k] = langDeprecated 782 } 783 } 784 // Fix CLDR mappings. 785 lang.updateLater("tl", "tgl") 786 lang.updateLater("sh", "hbs") 787 lang.updateLater("mo", "mol") 788 lang.updateLater("no", "nor") 789 lang.updateLater("tw", "twi") 790 lang.updateLater("nb", "nob") 791 lang.updateLater("ak", "aka") 792 lang.updateLater("bh", "bih") 793 794 // Ensure that each 2-letter code is matched with a 3-letter code. 795 for _, v := range lang.s[1:] { 796 s, ok := lang.update[v] 797 if !ok { 798 if s, ok = lang.update[langAliasMap.update[v]]; !ok { 799 continue 800 } 801 lang.update[v] = s 802 } 803 if v[0] != s[0] { 804 altLangISO3.add(s) 805 altLangISO3.updateLater(s, v) 806 } 807 } 808 809 // Complete canonicalized language tags. 810 lang.freeze() 811 for i, v := range lang.s { 812 // We can avoid these manual entries by using the IANA registry directly. 813 // Seems easier to update the list manually, as changes are rare. 814 // The panic in this loop will trigger if we miss an entry. 815 add := "" 816 if s, ok := lang.update[v]; ok { 817 if s[0] == v[0] { 818 add = s[1:] 819 } else { 820 add = string([]byte{0, byte(altLangISO3.index(s))}) 821 } 822 } else if len(v) == 3 { 823 add = "\x00" 824 } else { 825 log.Panicf("no data for long form of %q", v) 826 } 827 lang.s[i] += add 828 } 829 b.writeConst("lang", tag.Index(lang.join())) 830 831 b.writeConst("langNoIndexOffset", len(b.lang.s)) 832 833 // space of all valid 3-letter language identifiers. 834 b.writeBitVector("langNoIndex", b.langNoIndex.slice()) 835 836 altLangIndex := []uint16{} 837 for i, s := range altLangISO3.slice() { 838 altLangISO3.s[i] += string([]byte{byte(len(altLangIndex))}) 839 if i > 0 { 840 idx := b.lang.index(altLangISO3.update[s]) 841 altLangIndex = append(altLangIndex, uint16(idx)) 842 } 843 } 844 b.writeConst("altLangISO3", tag.Index(altLangISO3.join())) 845 b.writeSlice("altLangIndex", altLangIndex) 846 847 b.writeSortedMap("langAliasMap", &langAliasMap, b.langIndex) 848 types := make([]langAliasType, len(langAliasMap.s)) 849 for i, s := range langAliasMap.s { 850 types[i] = aliasTypeMap[s] 851 } 852 b.writeSlice("langAliasTypes", types) 853} 854 855var scriptConsts = []string{ 856 "Latn", "Hani", "Hans", "Hant", "Qaaa", "Qaai", "Qabx", "Zinh", "Zyyy", 857 "Zzzz", 858} 859 860func (b *builder) writeScript() { 861 b.writeConsts(b.script.index, scriptConsts...) 862 b.writeConst("script", tag.Index(b.script.join())) 863 864 supp := make([]uint8, len(b.lang.slice())) 865 for i, v := range b.lang.slice()[1:] { 866 if sc := b.registry[v].suppressScript; sc != "" { 867 supp[i+1] = uint8(b.script.index(sc)) 868 } 869 } 870 b.writeSlice("suppressScript", supp) 871 872 // There is only one deprecated script in CLDR. This value is hard-coded. 873 // We check here if the code must be updated. 874 for _, a := range b.supp.Metadata.Alias.ScriptAlias { 875 if a.Type != "Qaai" { 876 log.Panicf("unexpected deprecated stript %q", a.Type) 877 } 878 } 879} 880 881func parseM49(s string) int16 { 882 if len(s) == 0 { 883 return 0 884 } 885 v, err := strconv.ParseUint(s, 10, 10) 886 failOnError(err) 887 return int16(v) 888} 889 890var regionConsts = []string{ 891 "001", "419", "BR", "CA", "ES", "GB", "MD", "PT", "UK", "US", 892 "ZZ", "XA", "XC", "XK", // Unofficial tag for Kosovo. 893} 894 895func (b *builder) writeRegion() { 896 b.writeConsts(b.region.index, regionConsts...) 897 898 isoOffset := b.region.index("AA") 899 m49map := make([]int16, len(b.region.slice())) 900 fromM49map := make(map[int16]int) 901 altRegionISO3 := "" 902 altRegionIDs := []uint16{} 903 904 b.writeConst("isoRegionOffset", isoOffset) 905 906 // 2-letter region lookup and mapping to numeric codes. 907 regionISO := b.region.clone() 908 regionISO.s = regionISO.s[isoOffset:] 909 regionISO.sorted = false 910 911 regionTypes := make([]byte, len(b.region.s)) 912 913 // Is the region valid BCP 47? 914 for s, e := range b.registry { 915 if len(s) == 2 && s == strings.ToUpper(s) { 916 i := b.region.index(s) 917 for _, d := range e.description { 918 if strings.Contains(d, "Private use") { 919 regionTypes[i] = iso3166UserAssigned 920 } 921 } 922 regionTypes[i] |= bcp47Region 923 } 924 } 925 926 // Is the region a valid ccTLD? 927 r := gen.OpenIANAFile("domains/root/db") 928 defer r.Close() 929 930 buf, err := ioutil.ReadAll(r) 931 failOnError(err) 932 re := regexp.MustCompile(`"/domains/root/db/([a-z]{2}).html"`) 933 for _, m := range re.FindAllSubmatch(buf, -1) { 934 i := b.region.index(strings.ToUpper(string(m[1]))) 935 regionTypes[i] |= ccTLD 936 } 937 938 b.writeSlice("regionTypes", regionTypes) 939 940 iso3Set := make(map[string]int) 941 update := func(iso2, iso3 string) { 942 i := regionISO.index(iso2) 943 if j, ok := iso3Set[iso3]; !ok && iso3[0] == iso2[0] { 944 regionISO.s[i] += iso3[1:] 945 iso3Set[iso3] = -1 946 } else { 947 if ok && j >= 0 { 948 regionISO.s[i] += string([]byte{0, byte(j)}) 949 } else { 950 iso3Set[iso3] = len(altRegionISO3) 951 regionISO.s[i] += string([]byte{0, byte(len(altRegionISO3))}) 952 altRegionISO3 += iso3 953 altRegionIDs = append(altRegionIDs, uint16(isoOffset+i)) 954 } 955 } 956 } 957 for _, tc := range b.supp.CodeMappings.TerritoryCodes { 958 i := regionISO.index(tc.Type) + isoOffset 959 if d := m49map[i]; d != 0 { 960 log.Panicf("%s found as a duplicate UN.M49 code of %03d", tc.Numeric, d) 961 } 962 m49 := parseM49(tc.Numeric) 963 m49map[i] = m49 964 if r := fromM49map[m49]; r == 0 { 965 fromM49map[m49] = i 966 } else if r != i { 967 dep := b.registry[regionISO.s[r-isoOffset]].deprecated 968 if t := b.registry[tc.Type]; t != nil && dep != "" && (t.deprecated == "" || t.deprecated > dep) { 969 fromM49map[m49] = i 970 } 971 } 972 } 973 for _, ta := range b.supp.Metadata.Alias.TerritoryAlias { 974 if len(ta.Type) == 3 && ta.Type[0] <= '9' && len(ta.Replacement) == 2 { 975 from := parseM49(ta.Type) 976 if r := fromM49map[from]; r == 0 { 977 fromM49map[from] = regionISO.index(ta.Replacement) + isoOffset 978 } 979 } 980 } 981 for _, tc := range b.supp.CodeMappings.TerritoryCodes { 982 if len(tc.Alpha3) == 3 { 983 update(tc.Type, tc.Alpha3) 984 } 985 } 986 // This entries are not included in territoryCodes. Mostly 3-letter variants 987 // of deleted codes and an entry for QU. 988 for _, m := range []struct{ iso2, iso3 string }{ 989 {"CT", "CTE"}, 990 {"DY", "DHY"}, 991 {"HV", "HVO"}, 992 {"JT", "JTN"}, 993 {"MI", "MID"}, 994 {"NH", "NHB"}, 995 {"NQ", "ATN"}, 996 {"PC", "PCI"}, 997 {"PU", "PUS"}, 998 {"PZ", "PCZ"}, 999 {"RH", "RHO"}, 1000 {"VD", "VDR"}, 1001 {"WK", "WAK"}, 1002 // These three-letter codes are used for others as well. 1003 {"FQ", "ATF"}, 1004 } { 1005 update(m.iso2, m.iso3) 1006 } 1007 for i, s := range regionISO.s { 1008 if len(s) != 4 { 1009 regionISO.s[i] = s + " " 1010 } 1011 } 1012 b.writeConst("regionISO", tag.Index(regionISO.join())) 1013 b.writeConst("altRegionISO3", altRegionISO3) 1014 b.writeSlice("altRegionIDs", altRegionIDs) 1015 1016 // Create list of deprecated regions. 1017 // TODO: consider inserting SF -> FI. Not included by CLDR, but is the only 1018 // Transitionally-reserved mapping not included. 1019 regionOldMap := stringSet{} 1020 // Include regions in territoryAlias (not all are in the IANA registry!) 1021 for _, reg := range b.supp.Metadata.Alias.TerritoryAlias { 1022 if len(reg.Type) == 2 && reg.Reason == "deprecated" && len(reg.Replacement) == 2 { 1023 regionOldMap.add(reg.Type) 1024 regionOldMap.updateLater(reg.Type, reg.Replacement) 1025 i, _ := regionISO.find(reg.Type) 1026 j, _ := regionISO.find(reg.Replacement) 1027 if k := m49map[i+isoOffset]; k == 0 { 1028 m49map[i+isoOffset] = m49map[j+isoOffset] 1029 } 1030 } 1031 } 1032 b.writeSortedMap("regionOldMap", ®ionOldMap, func(s string) uint16 { 1033 return uint16(b.region.index(s)) 1034 }) 1035 // 3-digit region lookup, groupings. 1036 for i := 1; i < isoOffset; i++ { 1037 m := parseM49(b.region.s[i]) 1038 m49map[i] = m 1039 fromM49map[m] = i 1040 } 1041 b.writeSlice("m49", m49map) 1042 1043 const ( 1044 searchBits = 7 1045 regionBits = 9 1046 ) 1047 if len(m49map) >= 1<<regionBits { 1048 log.Fatalf("Maximum number of regions exceeded: %d > %d", len(m49map), 1<<regionBits) 1049 } 1050 m49Index := [9]int16{} 1051 fromM49 := []uint16{} 1052 m49 := []int{} 1053 for k, _ := range fromM49map { 1054 m49 = append(m49, int(k)) 1055 } 1056 sort.Ints(m49) 1057 for _, k := range m49[1:] { 1058 val := (k & (1<<searchBits - 1)) << regionBits 1059 fromM49 = append(fromM49, uint16(val|fromM49map[int16(k)])) 1060 m49Index[1:][k>>searchBits] = int16(len(fromM49)) 1061 } 1062 b.writeSlice("m49Index", m49Index) 1063 b.writeSlice("fromM49", fromM49) 1064} 1065 1066const ( 1067 // TODO: put these lists in regionTypes as user data? Could be used for 1068 // various optimizations and refinements and could be exposed in the API. 1069 iso3166Except = "AC CP DG EA EU FX IC SU TA UK" 1070 iso3166Trans = "AN BU CS NT TP YU ZR" // SF is not in our set of Regions. 1071 // DY and RH are actually not deleted, but indeterminately reserved. 1072 iso3166DelCLDR = "CT DD DY FQ HV JT MI NH NQ PC PU PZ RH VD WK YD" 1073) 1074 1075const ( 1076 iso3166UserAssigned = 1 << iota 1077 ccTLD 1078 bcp47Region 1079) 1080 1081func find(list []string, s string) int { 1082 for i, t := range list { 1083 if t == s { 1084 return i 1085 } 1086 } 1087 return -1 1088} 1089 1090// writeVariants generates per-variant information and creates a map from variant 1091// name to index value. We assign index values such that sorting multiple 1092// variants by index value will result in the correct order. 1093// There are two types of variants: specialized and general. Specialized variants 1094// are only applicable to certain language or language-script pairs. Generalized 1095// variants apply to any language. Generalized variants always sort after 1096// specialized variants. We will therefore always assign a higher index value 1097// to a generalized variant than any other variant. Generalized variants are 1098// sorted alphabetically among themselves. 1099// Specialized variants may also sort after other specialized variants. Such 1100// variants will be ordered after any of the variants they may follow. 1101// We assume that if a variant x is followed by a variant y, then for any prefix 1102// p of x, p-x is a prefix of y. This allows us to order tags based on the 1103// maximum of the length of any of its prefixes. 1104// TODO: it is possible to define a set of Prefix values on variants such that 1105// a total order cannot be defined to the point that this algorithm breaks. 1106// In other words, we cannot guarantee the same order of variants for the 1107// future using the same algorithm or for non-compliant combinations of 1108// variants. For this reason, consider using simple alphabetic sorting 1109// of variants and ignore Prefix restrictions altogether. 1110func (b *builder) writeVariant() { 1111 generalized := stringSet{} 1112 specialized := stringSet{} 1113 specializedExtend := stringSet{} 1114 // Collate the variants by type and check assumptions. 1115 for _, v := range b.variant.slice() { 1116 e := b.registry[v] 1117 if len(e.prefix) == 0 { 1118 generalized.add(v) 1119 continue 1120 } 1121 c := strings.Split(e.prefix[0], "-") 1122 hasScriptOrRegion := false 1123 if len(c) > 1 { 1124 _, hasScriptOrRegion = b.script.find(c[1]) 1125 if !hasScriptOrRegion { 1126 _, hasScriptOrRegion = b.region.find(c[1]) 1127 1128 } 1129 } 1130 if len(c) == 1 || len(c) == 2 && hasScriptOrRegion { 1131 // Variant is preceded by a language. 1132 specialized.add(v) 1133 continue 1134 } 1135 // Variant is preceded by another variant. 1136 specializedExtend.add(v) 1137 prefix := c[0] + "-" 1138 if hasScriptOrRegion { 1139 prefix += c[1] 1140 } 1141 for _, p := range e.prefix { 1142 // Verify that the prefix minus the last element is a prefix of the 1143 // predecessor element. 1144 i := strings.LastIndex(p, "-") 1145 pred := b.registry[p[i+1:]] 1146 if find(pred.prefix, p[:i]) < 0 { 1147 log.Fatalf("prefix %q for variant %q not consistent with predecessor spec", p, v) 1148 } 1149 // The sorting used below does not work in the general case. It works 1150 // if we assume that variants that may be followed by others only have 1151 // prefixes of the same length. Verify this. 1152 count := strings.Count(p[:i], "-") 1153 for _, q := range pred.prefix { 1154 if c := strings.Count(q, "-"); c != count { 1155 log.Fatalf("variant %q preceding %q has a prefix %q of size %d; want %d", p[i+1:], v, q, c, count) 1156 } 1157 } 1158 if !strings.HasPrefix(p, prefix) { 1159 log.Fatalf("prefix %q of variant %q should start with %q", p, v, prefix) 1160 } 1161 } 1162 } 1163 1164 // Sort extended variants. 1165 a := specializedExtend.s 1166 less := func(v, w string) bool { 1167 // Sort by the maximum number of elements. 1168 maxCount := func(s string) (max int) { 1169 for _, p := range b.registry[s].prefix { 1170 if c := strings.Count(p, "-"); c > max { 1171 max = c 1172 } 1173 } 1174 return 1175 } 1176 if cv, cw := maxCount(v), maxCount(w); cv != cw { 1177 return cv < cw 1178 } 1179 // Sort by name as tie breaker. 1180 return v < w 1181 } 1182 sort.Sort(funcSorter{less, sort.StringSlice(a)}) 1183 specializedExtend.frozen = true 1184 1185 // Create index from variant name to index. 1186 variantIndex := make(map[string]uint8) 1187 add := func(s []string) { 1188 for _, v := range s { 1189 variantIndex[v] = uint8(len(variantIndex)) 1190 } 1191 } 1192 add(specialized.slice()) 1193 add(specializedExtend.s) 1194 numSpecialized := len(variantIndex) 1195 add(generalized.slice()) 1196 if n := len(variantIndex); n > 255 { 1197 log.Fatalf("maximum number of variants exceeded: was %d; want <= 255", n) 1198 } 1199 b.writeMap("variantIndex", variantIndex) 1200 b.writeConst("variantNumSpecialized", numSpecialized) 1201} 1202 1203func (b *builder) writeLanguageInfo() { 1204} 1205 1206// writeLikelyData writes tables that are used both for finding parent relations and for 1207// language matching. Each entry contains additional bits to indicate the status of the 1208// data to know when it cannot be used for parent relations. 1209func (b *builder) writeLikelyData() { 1210 const ( 1211 isList = 1 << iota 1212 scriptInFrom 1213 regionInFrom 1214 ) 1215 type ( // generated types 1216 likelyScriptRegion struct { 1217 region uint16 1218 script uint8 1219 flags uint8 1220 } 1221 likelyLangScript struct { 1222 lang uint16 1223 script uint8 1224 flags uint8 1225 } 1226 likelyLangRegion struct { 1227 lang uint16 1228 region uint16 1229 } 1230 // likelyTag is used for getting likely tags for group regions, where 1231 // the likely region might be a region contained in the group. 1232 likelyTag struct { 1233 lang uint16 1234 region uint16 1235 script uint8 1236 } 1237 ) 1238 var ( // generated variables 1239 likelyRegionGroup = make([]likelyTag, len(b.groups)) 1240 likelyLang = make([]likelyScriptRegion, len(b.lang.s)) 1241 likelyRegion = make([]likelyLangScript, len(b.region.s)) 1242 likelyScript = make([]likelyLangRegion, len(b.script.s)) 1243 likelyLangList = []likelyScriptRegion{} 1244 likelyRegionList = []likelyLangScript{} 1245 ) 1246 type fromTo struct { 1247 from, to []string 1248 } 1249 langToOther := map[int][]fromTo{} 1250 regionToOther := map[int][]fromTo{} 1251 for _, m := range b.supp.LikelySubtags.LikelySubtag { 1252 from := strings.Split(m.From, "_") 1253 to := strings.Split(m.To, "_") 1254 if len(to) != 3 { 1255 log.Fatalf("invalid number of subtags in %q: found %d, want 3", m.To, len(to)) 1256 } 1257 if len(from) > 3 { 1258 log.Fatalf("invalid number of subtags: found %d, want 1-3", len(from)) 1259 } 1260 if from[0] != to[0] && from[0] != "und" { 1261 log.Fatalf("unexpected language change in expansion: %s -> %s", from, to) 1262 } 1263 if len(from) == 3 { 1264 if from[2] != to[2] { 1265 log.Fatalf("unexpected region change in expansion: %s -> %s", from, to) 1266 } 1267 if from[0] != "und" { 1268 log.Fatalf("unexpected fully specified from tag: %s -> %s", from, to) 1269 } 1270 } 1271 if len(from) == 1 || from[0] != "und" { 1272 id := 0 1273 if from[0] != "und" { 1274 id = b.lang.index(from[0]) 1275 } 1276 langToOther[id] = append(langToOther[id], fromTo{from, to}) 1277 } else if len(from) == 2 && len(from[1]) == 4 { 1278 sid := b.script.index(from[1]) 1279 likelyScript[sid].lang = uint16(b.langIndex(to[0])) 1280 likelyScript[sid].region = uint16(b.region.index(to[2])) 1281 } else { 1282 r := b.region.index(from[len(from)-1]) 1283 if id, ok := b.groups[r]; ok { 1284 if from[0] != "und" { 1285 log.Fatalf("region changed unexpectedly: %s -> %s", from, to) 1286 } 1287 likelyRegionGroup[id].lang = uint16(b.langIndex(to[0])) 1288 likelyRegionGroup[id].script = uint8(b.script.index(to[1])) 1289 likelyRegionGroup[id].region = uint16(b.region.index(to[2])) 1290 } else { 1291 regionToOther[r] = append(regionToOther[r], fromTo{from, to}) 1292 } 1293 } 1294 } 1295 b.writeType(likelyLangRegion{}) 1296 b.writeSlice("likelyScript", likelyScript) 1297 1298 for id := range b.lang.s { 1299 list := langToOther[id] 1300 if len(list) == 1 { 1301 likelyLang[id].region = uint16(b.region.index(list[0].to[2])) 1302 likelyLang[id].script = uint8(b.script.index(list[0].to[1])) 1303 } else if len(list) > 1 { 1304 likelyLang[id].flags = isList 1305 likelyLang[id].region = uint16(len(likelyLangList)) 1306 likelyLang[id].script = uint8(len(list)) 1307 for _, x := range list { 1308 flags := uint8(0) 1309 if len(x.from) > 1 { 1310 if x.from[1] == x.to[2] { 1311 flags = regionInFrom 1312 } else { 1313 flags = scriptInFrom 1314 } 1315 } 1316 likelyLangList = append(likelyLangList, likelyScriptRegion{ 1317 region: uint16(b.region.index(x.to[2])), 1318 script: uint8(b.script.index(x.to[1])), 1319 flags: flags, 1320 }) 1321 } 1322 } 1323 } 1324 // TODO: merge suppressScript data with this table. 1325 b.writeType(likelyScriptRegion{}) 1326 b.writeSlice("likelyLang", likelyLang) 1327 b.writeSlice("likelyLangList", likelyLangList) 1328 1329 for id := range b.region.s { 1330 list := regionToOther[id] 1331 if len(list) == 1 { 1332 likelyRegion[id].lang = uint16(b.langIndex(list[0].to[0])) 1333 likelyRegion[id].script = uint8(b.script.index(list[0].to[1])) 1334 if len(list[0].from) > 2 { 1335 likelyRegion[id].flags = scriptInFrom 1336 } 1337 } else if len(list) > 1 { 1338 likelyRegion[id].flags = isList 1339 likelyRegion[id].lang = uint16(len(likelyRegionList)) 1340 likelyRegion[id].script = uint8(len(list)) 1341 for i, x := range list { 1342 if len(x.from) == 2 && i != 0 || i > 0 && len(x.from) != 3 { 1343 log.Fatalf("unspecified script must be first in list: %v at %d", x.from, i) 1344 } 1345 x := likelyLangScript{ 1346 lang: uint16(b.langIndex(x.to[0])), 1347 script: uint8(b.script.index(x.to[1])), 1348 } 1349 if len(list[0].from) > 2 { 1350 x.flags = scriptInFrom 1351 } 1352 likelyRegionList = append(likelyRegionList, x) 1353 } 1354 } 1355 } 1356 b.writeType(likelyLangScript{}) 1357 b.writeSlice("likelyRegion", likelyRegion) 1358 b.writeSlice("likelyRegionList", likelyRegionList) 1359 1360 b.writeType(likelyTag{}) 1361 b.writeSlice("likelyRegionGroup", likelyRegionGroup) 1362} 1363 1364type mutualIntelligibility struct { 1365 want, have uint16 1366 distance uint8 1367 oneway bool 1368} 1369 1370type scriptIntelligibility struct { 1371 wantLang, haveLang uint16 1372 wantScript, haveScript uint8 1373 distance uint8 1374 // Always oneway 1375} 1376 1377type regionIntelligibility struct { 1378 lang uint16 // compact language id 1379 script uint8 // 0 means any 1380 group uint8 // 0 means any; if bit 7 is set it means inverse 1381 distance uint8 1382 // Always twoway. 1383} 1384 1385// writeMatchData writes tables with languages and scripts for which there is 1386// mutual intelligibility. The data is based on CLDR's languageMatching data. 1387// Note that we use a different algorithm than the one defined by CLDR and that 1388// we slightly modify the data. For example, we convert scores to confidence levels. 1389// We also drop all region-related data as we use a different algorithm to 1390// determine region equivalence. 1391func (b *builder) writeMatchData() { 1392 lm := b.supp.LanguageMatching.LanguageMatches 1393 cldr.MakeSlice(&lm).SelectAnyOf("type", "written_new") 1394 1395 regionHierarchy := map[string][]string{} 1396 for _, g := range b.supp.TerritoryContainment.Group { 1397 regions := strings.Split(g.Contains, " ") 1398 regionHierarchy[g.Type] = append(regionHierarchy[g.Type], regions...) 1399 } 1400 regionToGroups := make([]uint8, len(b.region.s)) 1401 1402 idToIndex := map[string]uint8{} 1403 for i, mv := range lm[0].MatchVariable { 1404 if i > 6 { 1405 log.Fatalf("Too many groups: %d", i) 1406 } 1407 idToIndex[mv.Id] = uint8(i + 1) 1408 // TODO: also handle '-' 1409 for _, r := range strings.Split(mv.Value, "+") { 1410 todo := []string{r} 1411 for k := 0; k < len(todo); k++ { 1412 r := todo[k] 1413 regionToGroups[b.region.index(r)] |= 1 << uint8(i) 1414 todo = append(todo, regionHierarchy[r]...) 1415 } 1416 } 1417 } 1418 b.writeSlice("regionToGroups", regionToGroups) 1419 1420 // maps language id to in- and out-of-group region. 1421 paradigmLocales := [][3]uint16{} 1422 locales := strings.Split(lm[0].ParadigmLocales[0].Locales, " ") 1423 for i := 0; i < len(locales); i += 2 { 1424 x := [3]uint16{} 1425 for j := 0; j < 2; j++ { 1426 pc := strings.SplitN(locales[i+j], "-", 2) 1427 x[0] = b.langIndex(pc[0]) 1428 if len(pc) == 2 { 1429 x[1+j] = uint16(b.region.index(pc[1])) 1430 } 1431 } 1432 paradigmLocales = append(paradigmLocales, x) 1433 } 1434 b.writeSlice("paradigmLocales", paradigmLocales) 1435 1436 b.writeType(mutualIntelligibility{}) 1437 b.writeType(scriptIntelligibility{}) 1438 b.writeType(regionIntelligibility{}) 1439 1440 matchLang := []mutualIntelligibility{} 1441 matchScript := []scriptIntelligibility{} 1442 matchRegion := []regionIntelligibility{} 1443 // Convert the languageMatch entries in lists keyed by desired language. 1444 for _, m := range lm[0].LanguageMatch { 1445 // Different versions of CLDR use different separators. 1446 desired := strings.Replace(m.Desired, "-", "_", -1) 1447 supported := strings.Replace(m.Supported, "-", "_", -1) 1448 d := strings.Split(desired, "_") 1449 s := strings.Split(supported, "_") 1450 if len(d) != len(s) { 1451 log.Fatalf("not supported: desired=%q; supported=%q", desired, supported) 1452 continue 1453 } 1454 distance, _ := strconv.ParseInt(m.Distance, 10, 8) 1455 switch len(d) { 1456 case 2: 1457 if desired == supported && desired == "*_*" { 1458 continue 1459 } 1460 // language-script pair. 1461 matchScript = append(matchScript, scriptIntelligibility{ 1462 wantLang: uint16(b.langIndex(d[0])), 1463 haveLang: uint16(b.langIndex(s[0])), 1464 wantScript: uint8(b.script.index(d[1])), 1465 haveScript: uint8(b.script.index(s[1])), 1466 distance: uint8(distance), 1467 }) 1468 if m.Oneway != "true" { 1469 matchScript = append(matchScript, scriptIntelligibility{ 1470 wantLang: uint16(b.langIndex(s[0])), 1471 haveLang: uint16(b.langIndex(d[0])), 1472 wantScript: uint8(b.script.index(s[1])), 1473 haveScript: uint8(b.script.index(d[1])), 1474 distance: uint8(distance), 1475 }) 1476 } 1477 case 1: 1478 if desired == supported && desired == "*" { 1479 continue 1480 } 1481 if distance == 1 { 1482 // nb == no is already handled by macro mapping. Check there 1483 // really is only this case. 1484 if d[0] != "no" || s[0] != "nb" { 1485 log.Fatalf("unhandled equivalence %s == %s", s[0], d[0]) 1486 } 1487 continue 1488 } 1489 // TODO: consider dropping oneway field and just doubling the entry. 1490 matchLang = append(matchLang, mutualIntelligibility{ 1491 want: uint16(b.langIndex(d[0])), 1492 have: uint16(b.langIndex(s[0])), 1493 distance: uint8(distance), 1494 oneway: m.Oneway == "true", 1495 }) 1496 case 3: 1497 if desired == supported && desired == "*_*_*" { 1498 continue 1499 } 1500 if desired != supported { 1501 // This is now supported by CLDR, but only one case, which 1502 // should already be covered by paradigm locales. For instance, 1503 // test case "und, en, en-GU, en-IN, en-GB ; en-ZA ; en-GB" in 1504 // testdata/CLDRLocaleMatcherTest.txt tests this. 1505 if supported != "en_*_GB" { 1506 log.Fatalf("not supported: desired=%q; supported=%q", desired, supported) 1507 } 1508 continue 1509 } 1510 ri := regionIntelligibility{ 1511 lang: b.langIndex(d[0]), 1512 distance: uint8(distance), 1513 } 1514 if d[1] != "*" { 1515 ri.script = uint8(b.script.index(d[1])) 1516 } 1517 switch { 1518 case d[2] == "*": 1519 ri.group = 0x80 // not contained in anything 1520 case strings.HasPrefix(d[2], "$!"): 1521 ri.group = 0x80 1522 d[2] = "$" + d[2][len("$!"):] 1523 fallthrough 1524 case strings.HasPrefix(d[2], "$"): 1525 ri.group |= idToIndex[d[2]] 1526 } 1527 matchRegion = append(matchRegion, ri) 1528 default: 1529 log.Fatalf("not supported: desired=%q; supported=%q", desired, supported) 1530 } 1531 } 1532 sort.SliceStable(matchLang, func(i, j int) bool { 1533 return matchLang[i].distance < matchLang[j].distance 1534 }) 1535 b.writeSlice("matchLang", matchLang) 1536 1537 sort.SliceStable(matchScript, func(i, j int) bool { 1538 return matchScript[i].distance < matchScript[j].distance 1539 }) 1540 b.writeSlice("matchScript", matchScript) 1541 1542 sort.SliceStable(matchRegion, func(i, j int) bool { 1543 return matchRegion[i].distance < matchRegion[j].distance 1544 }) 1545 b.writeSlice("matchRegion", matchRegion) 1546} 1547 1548func (b *builder) writeRegionInclusionData() { 1549 var ( 1550 // mm holds for each group the set of groups with a distance of 1. 1551 mm = make(map[int][]index) 1552 1553 // containment holds for each group the transitive closure of 1554 // containment of other groups. 1555 containment = make(map[index][]index) 1556 ) 1557 for _, g := range b.supp.TerritoryContainment.Group { 1558 // Skip UN and EURO zone as they are flattening the containment 1559 // relationship. 1560 if g.Type == "EZ" || g.Type == "UN" { 1561 continue 1562 } 1563 group := b.region.index(g.Type) 1564 groupIdx := b.groups[group] 1565 for _, mem := range strings.Split(g.Contains, " ") { 1566 r := b.region.index(mem) 1567 mm[r] = append(mm[r], groupIdx) 1568 if g, ok := b.groups[r]; ok { 1569 mm[group] = append(mm[group], g) 1570 containment[groupIdx] = append(containment[groupIdx], g) 1571 } 1572 } 1573 } 1574 1575 regionContainment := make([]uint64, len(b.groups)) 1576 for _, g := range b.groups { 1577 l := containment[g] 1578 1579 // Compute the transitive closure of containment. 1580 for i := 0; i < len(l); i++ { 1581 l = append(l, containment[l[i]]...) 1582 } 1583 1584 // Compute the bitmask. 1585 regionContainment[g] = 1 << g 1586 for _, v := range l { 1587 regionContainment[g] |= 1 << v 1588 } 1589 } 1590 b.writeSlice("regionContainment", regionContainment) 1591 1592 regionInclusion := make([]uint8, len(b.region.s)) 1593 bvs := make(map[uint64]index) 1594 // Make the first bitvector positions correspond with the groups. 1595 for r, i := range b.groups { 1596 bv := uint64(1 << i) 1597 for _, g := range mm[r] { 1598 bv |= 1 << g 1599 } 1600 bvs[bv] = i 1601 regionInclusion[r] = uint8(bvs[bv]) 1602 } 1603 for r := 1; r < len(b.region.s); r++ { 1604 if _, ok := b.groups[r]; !ok { 1605 bv := uint64(0) 1606 for _, g := range mm[r] { 1607 bv |= 1 << g 1608 } 1609 if bv == 0 { 1610 // Pick the world for unspecified regions. 1611 bv = 1 << b.groups[b.region.index("001")] 1612 } 1613 if _, ok := bvs[bv]; !ok { 1614 bvs[bv] = index(len(bvs)) 1615 } 1616 regionInclusion[r] = uint8(bvs[bv]) 1617 } 1618 } 1619 b.writeSlice("regionInclusion", regionInclusion) 1620 regionInclusionBits := make([]uint64, len(bvs)) 1621 for k, v := range bvs { 1622 regionInclusionBits[v] = uint64(k) 1623 } 1624 // Add bit vectors for increasingly large distances until a fixed point is reached. 1625 regionInclusionNext := []uint8{} 1626 for i := 0; i < len(regionInclusionBits); i++ { 1627 bits := regionInclusionBits[i] 1628 next := bits 1629 for i := uint(0); i < uint(len(b.groups)); i++ { 1630 if bits&(1<<i) != 0 { 1631 next |= regionInclusionBits[i] 1632 } 1633 } 1634 if _, ok := bvs[next]; !ok { 1635 bvs[next] = index(len(bvs)) 1636 regionInclusionBits = append(regionInclusionBits, next) 1637 } 1638 regionInclusionNext = append(regionInclusionNext, uint8(bvs[next])) 1639 } 1640 b.writeSlice("regionInclusionBits", regionInclusionBits) 1641 b.writeSlice("regionInclusionNext", regionInclusionNext) 1642} 1643 1644type parentRel struct { 1645 lang uint16 1646 script uint8 1647 maxScript uint8 1648 toRegion uint16 1649 fromRegion []uint16 1650} 1651 1652func (b *builder) writeParents() { 1653 b.writeType(parentRel{}) 1654 1655 parents := []parentRel{} 1656 1657 // Construct parent overrides. 1658 n := 0 1659 for _, p := range b.data.Supplemental().ParentLocales.ParentLocale { 1660 // Skipping non-standard scripts to root is implemented using addTags. 1661 if p.Parent == "root" { 1662 continue 1663 } 1664 1665 sub := strings.Split(p.Parent, "_") 1666 parent := parentRel{lang: b.langIndex(sub[0])} 1667 if len(sub) == 2 { 1668 // TODO: check that all undefined scripts are indeed Latn in these 1669 // cases. 1670 parent.maxScript = uint8(b.script.index("Latn")) 1671 parent.toRegion = uint16(b.region.index(sub[1])) 1672 } else { 1673 parent.script = uint8(b.script.index(sub[1])) 1674 parent.maxScript = parent.script 1675 parent.toRegion = uint16(b.region.index(sub[2])) 1676 } 1677 for _, c := range strings.Split(p.Locales, " ") { 1678 region := b.region.index(c[strings.LastIndex(c, "_")+1:]) 1679 parent.fromRegion = append(parent.fromRegion, uint16(region)) 1680 } 1681 parents = append(parents, parent) 1682 n += len(parent.fromRegion) 1683 } 1684 b.writeSliceAddSize("parents", n*2, parents) 1685} 1686 1687func main() { 1688 gen.Init() 1689 1690 gen.Repackage("gen_common.go", "common.go", "language") 1691 1692 w := gen.NewCodeWriter() 1693 defer w.WriteGoFile("tables.go", "language") 1694 1695 fmt.Fprintln(w, `import "golang.org/x/text/internal/tag"`) 1696 1697 b := newBuilder(w) 1698 gen.WriteCLDRVersion(w) 1699 1700 b.parseIndices() 1701 b.writeType(fromTo{}) 1702 b.writeLanguage() 1703 b.writeScript() 1704 b.writeRegion() 1705 b.writeVariant() 1706 // TODO: b.writeLocale() 1707 b.computeRegionGroups() 1708 b.writeLikelyData() 1709 b.writeMatchData() 1710 b.writeRegionInclusionData() 1711 b.writeParents() 1712} 1713