1// Copyright 2013 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build ignore
6// +build ignore
7
8// Language tag table generator.
9// Data read from the web.
10
11package main
12
13import (
14	"bufio"
15	"flag"
16	"fmt"
17	"io"
18	"io/ioutil"
19	"log"
20	"math"
21	"reflect"
22	"regexp"
23	"sort"
24	"strconv"
25	"strings"
26
27	"golang.org/x/text/internal/gen"
28	"golang.org/x/text/internal/tag"
29	"golang.org/x/text/unicode/cldr"
30)
31
// Command-line flags of the generator.
var (
	// test is the -test flag; see its usage string for the intended meaning.
	test = flag.Bool("test", false,
		"test existing tables; can be used to compare web data with package data.")
	// outputFile is the -output flag naming the file for the generated tables.
	outputFile = flag.String("output", "tables.go", "output file for generated tables")
)
40
// comment holds the documentation emitted above the generated tables. The
// first word of each entry is the name of the table or constant it documents;
// init uses that word as the key of commentIndex.
var comment = []string{
	`
lang holds an alphabetically sorted list of ISO-639 language identifiers.
All entries are 4 bytes. The index of the identifier (divided by 4) is the language tag.
For 2-byte language identifiers, the two successive bytes have the following meaning:
    - if the first letter of the 2- and 3-letter ISO codes are the same:
      the second and third letter of the 3-letter ISO code.
    - otherwise: a 0 and a by 2 bits right-shifted index into altLangISO3.
For 3-byte language identifiers the 4th byte is 0.`,
	`
langNoIndex is a bit vector of all 3-letter language codes that are not used as an index
in lookup tables. The language ids for these language codes are derived directly
from the letters and are not consecutive.`,
	`
altLangISO3 holds an alphabetically sorted list of 3-letter language code alternatives
to 2-letter language codes that cannot be derived using the method described above.
Each 3-letter code is followed by its 1-byte langID.`,
	`
altLangIndex is used to convert indexes in altLangISO3 to langIDs.`,
	`
AliasMap maps langIDs to their suggested replacements.`,
	`
script is an alphabetically sorted list of ISO 15924 codes. The index
of the script in the string, divided by 4, is the internal scriptID.`,
	`
isoRegionOffset needs to be added to the index of regionISO to obtain the regionID
for 2-letter ISO codes. (The first isoRegionOffset regionIDs are reserved for
the UN.M49 codes used for groups.)`,
	`
regionISO holds a list of alphabetically sorted 2-letter ISO region codes.
Each 2-letter code is followed by two bytes with the following meaning:
    - [A-Z]{2}: the first letter of the 2-letter code plus these two
                letters form the 3-letter ISO code.
    - 0, n:     index into altRegionISO3.`,
	`
regionTypes defines the status of a region for various standards.`,
	`
m49 maps regionIDs to UN.M49 codes. The first isoRegionOffset entries are
codes indicating collections of regions.`,
	`
m49Index gives indexes into fromM49 based on the three most significant bits
of a 10-bit UN.M49 code. To search an UN.M49 code in fromM49, search in
   fromM49[m49Index[msb3(code)]:m49Index[msb3(code)+1]]
for an entry where the first 7 bits match the 7 lsb of the UN.M49 code.
The region code is stored in the 9 lsb of the indexed value.`,
	`
fromM49 contains entries to map UN.M49 codes to regions. See m49Index for details.`,
	`
altRegionISO3 holds a list of 3-letter region codes that cannot be
mapped to 2-letter codes using the default algorithm. This is a short list.`,
	`
altRegionIDs holds a list of regionIDs the positions of which match those
of the 3-letter ISO codes in altRegionISO3.`,
	`
variantNumSpecialized is the number of specialized variants in variants.`,
	`
suppressScript is an index from langID to the dominant script for that language,
if it exists.  If a script is given, it should be suppressed from the language tag.`,
	`
likelyLang is a lookup table, indexed by langID, for the most likely
scripts and regions given incomplete information. If more entries exist for a
given language, region and script are the index and size respectively
of the list in likelyLangList.`,
	`
likelyLangList holds lists info associated with likelyLang.`,
	`
likelyRegion is a lookup table, indexed by regionID, for the most likely
languages and scripts given incomplete information. If more entries exist
for a given regionID, lang and script are the index and size respectively
of the list in likelyRegionList.
TODO: exclude containers and user-definable regions from the list.`,
	`
likelyRegionList holds lists info associated with likelyRegion.`,
	`
likelyScript is a lookup table, indexed by scriptID, for the most likely
languages and regions given a script.`,
	`
nRegionGroups is the number of region groups.`,
	`
regionInclusion maps region identifiers to sets of regions in regionInclusionBits,
where each set holds all groupings that are directly connected in a region
containment graph.`,
	`
regionInclusionBits is an array of bit vectors where every vector represents
a set of region groupings.  These sets are used to compute the distance
between two regions for the purpose of language matching.`,
	`
regionInclusionNext marks, for each entry in regionInclusionBits, the set of
all groups that are reachable from the groups set in the respective entry.`,
}
131
132// TODO: consider changing some of these structures to tries. This can reduce
133// memory, but may increase the need for memory allocations. This could be
134// mitigated if we can piggyback on language tags for common cases.
135
// failOnError does nothing for a nil error and otherwise aborts by passing
// the error to log.Panic.
func failOnError(e error) {
	if e == nil {
		return
	}
	log.Panic(e)
}
141
// setType labels the way a stringSet is used. The zero value means that no
// type has been assigned.
type setType int

const (
	_       setType = iota
	Indexed         // all elements must be of same size
	Linear
)
148
149type stringSet struct {
150	s              []string
151	sorted, frozen bool
152
153	// We often need to update values after the creation of an index is completed.
154	// We include a convenience map for keeping track of this.
155	update map[string]string
156	typ    setType // used for checking.
157}
158
159func (ss *stringSet) clone() stringSet {
160	c := *ss
161	c.s = append([]string(nil), c.s...)
162	return c
163}
164
165func (ss *stringSet) setType(t setType) {
166	if ss.typ != t && ss.typ != 0 {
167		log.Panicf("type %d cannot be assigned as it was already %d", t, ss.typ)
168	}
169}
170
171// parse parses a whitespace-separated string and initializes ss with its
172// components.
173func (ss *stringSet) parse(s string) {
174	scan := bufio.NewScanner(strings.NewReader(s))
175	scan.Split(bufio.ScanWords)
176	for scan.Scan() {
177		ss.add(scan.Text())
178	}
179}
180
181func (ss *stringSet) assertChangeable() {
182	if ss.frozen {
183		log.Panic("attempt to modify a frozen stringSet")
184	}
185}
186
187func (ss *stringSet) add(s string) {
188	ss.assertChangeable()
189	ss.s = append(ss.s, s)
190	ss.sorted = ss.frozen
191}
192
193func (ss *stringSet) freeze() {
194	ss.compact()
195	ss.frozen = true
196}
197
198func (ss *stringSet) compact() {
199	if ss.sorted {
200		return
201	}
202	a := ss.s
203	sort.Strings(a)
204	k := 0
205	for i := 1; i < len(a); i++ {
206		if a[k] != a[i] {
207			a[k+1] = a[i]
208			k++
209		}
210	}
211	ss.s = a[:k+1]
212	ss.sorted = ss.frozen
213}
214
// funcSorter orders the embedded string slice with a caller-supplied
// comparison function instead of the default string ordering.
type funcSorter struct {
	fn func(a, b string) bool
	sort.StringSlice
}

// Less reports whether the element at index i must sort before the element
// at index j according to fn.
func (s funcSorter) Less(i, j int) bool {
	x, y := s.StringSlice[i], s.StringSlice[j]
	return s.fn(x, y)
}
223
224func (ss *stringSet) sortFunc(f func(a, b string) bool) {
225	ss.compact()
226	sort.Sort(funcSorter{f, sort.StringSlice(ss.s)})
227}
228
229func (ss *stringSet) remove(s string) {
230	ss.assertChangeable()
231	if i, ok := ss.find(s); ok {
232		copy(ss.s[i:], ss.s[i+1:])
233		ss.s = ss.s[:len(ss.s)-1]
234	}
235}
236
237func (ss *stringSet) replace(ol, nu string) {
238	ss.s[ss.index(ol)] = nu
239	ss.sorted = ss.frozen
240}
241
242func (ss *stringSet) index(s string) int {
243	ss.setType(Indexed)
244	i, ok := ss.find(s)
245	if !ok {
246		if i < len(ss.s) {
247			log.Panicf("find: item %q is not in list. Closest match is %q.", s, ss.s[i])
248		}
249		log.Panicf("find: item %q is not in list", s)
250
251	}
252	return i
253}
254
255func (ss *stringSet) find(s string) (int, bool) {
256	ss.compact()
257	i := sort.SearchStrings(ss.s, s)
258	return i, i != len(ss.s) && ss.s[i] == s
259}
260
261func (ss *stringSet) slice() []string {
262	ss.compact()
263	return ss.s
264}
265
266func (ss *stringSet) updateLater(v, key string) {
267	if ss.update == nil {
268		ss.update = map[string]string{}
269	}
270	ss.update[v] = key
271}
272
273// join joins the string and ensures that all entries are of the same length.
274func (ss *stringSet) join() string {
275	ss.setType(Indexed)
276	n := len(ss.s[0])
277	for _, s := range ss.s {
278		if len(s) != n {
279			log.Panicf("join: not all entries are of the same length: %q", s)
280		}
281	}
282	ss.s = append(ss.s, strings.Repeat("\xff", n))
283	return strings.Join(ss.s, "")
284}
285
// ianaEntry is one record of the IANA Language Subtag Registry. The same
// struct is used for every record type.
// The fields are described in http://tools.ietf.org/html/bcp47#section-5.1.
type ianaEntry struct {
	typ            string   // value of "Type:"
	description    []string // one element per "Description:" field
	scope          string   // value of "Scope:"
	added          string   // value of "Added:"
	preferred      string   // value of "Preferred-Value:"
	deprecated     string   // value of "Deprecated:"
	suppressScript string   // value of "Suppress-Script:"
	macro          string   // value of "Macrolanguage:"
	prefix         []string // one element per "Prefix:" field
}
301
302type builder struct {
303	w    *gen.CodeWriter
304	hw   io.Writer // MultiWriter for w and w.Hash
305	data *cldr.CLDR
306	supp *cldr.SupplementalData
307
308	// indices
309	locale      stringSet // common locales
310	lang        stringSet // canonical language ids (2 or 3 letter ISO codes) with data
311	langNoIndex stringSet // 3-letter ISO codes with no associated data
312	script      stringSet // 4-letter ISO codes
313	region      stringSet // 2-letter ISO or 3-digit UN M49 codes
314	variant     stringSet // 4-8-alphanumeric variant code.
315
316	// Region codes that are groups with their corresponding group IDs.
317	groups map[int]index
318
319	// langInfo
320	registry map[string]*ianaEntry
321}
322
323type index uint
324
325func newBuilder(w *gen.CodeWriter) *builder {
326	r := gen.OpenCLDRCoreZip()
327	defer r.Close()
328	d := &cldr.Decoder{}
329	data, err := d.DecodeZip(r)
330	failOnError(err)
331	b := builder{
332		w:    w,
333		hw:   io.MultiWriter(w, w.Hash),
334		data: data,
335		supp: data.Supplemental(),
336	}
337	b.parseRegistry()
338	return &b
339}
340
341func (b *builder) parseRegistry() {
342	r := gen.OpenIANAFile("assignments/language-subtag-registry")
343	defer r.Close()
344	b.registry = make(map[string]*ianaEntry)
345
346	scan := bufio.NewScanner(r)
347	scan.Split(bufio.ScanWords)
348	var record *ianaEntry
349	for more := scan.Scan(); more; {
350		key := scan.Text()
351		more = scan.Scan()
352		value := scan.Text()
353		switch key {
354		case "Type:":
355			record = &ianaEntry{typ: value}
356		case "Subtag:", "Tag:":
357			if s := strings.SplitN(value, "..", 2); len(s) > 1 {
358				for a := s[0]; a <= s[1]; a = inc(a) {
359					b.addToRegistry(a, record)
360				}
361			} else {
362				b.addToRegistry(value, record)
363			}
364		case "Suppress-Script:":
365			record.suppressScript = value
366		case "Added:":
367			record.added = value
368		case "Deprecated:":
369			record.deprecated = value
370		case "Macrolanguage:":
371			record.macro = value
372		case "Preferred-Value:":
373			record.preferred = value
374		case "Prefix:":
375			record.prefix = append(record.prefix, value)
376		case "Scope:":
377			record.scope = value
378		case "Description:":
379			buf := []byte(value)
380			for more = scan.Scan(); more; more = scan.Scan() {
381				b := scan.Bytes()
382				if b[0] == '%' || b[len(b)-1] == ':' {
383					break
384				}
385				buf = append(buf, ' ')
386				buf = append(buf, b...)
387			}
388			record.description = append(record.description, string(buf))
389			continue
390		default:
391			continue
392		}
393		more = scan.Scan()
394	}
395	if scan.Err() != nil {
396		log.Panic(scan.Err())
397	}
398}
399
400func (b *builder) addToRegistry(key string, entry *ianaEntry) {
401	if info, ok := b.registry[key]; ok {
402		if info.typ != "language" || entry.typ != "extlang" {
403			log.Fatalf("parseRegistry: tag %q already exists", key)
404		}
405	} else {
406		b.registry[key] = entry
407	}
408}
409
410var commentIndex = make(map[string]string)
411
412func init() {
413	for _, s := range comment {
414		key := strings.TrimSpace(strings.SplitN(s, " ", 2)[0])
415		commentIndex[key] = s
416	}
417}
418
419func (b *builder) comment(name string) {
420	if s := commentIndex[name]; len(s) > 0 {
421		b.w.WriteComment(s)
422	} else {
423		fmt.Fprintln(b.w)
424	}
425}
426
427func (b *builder) pf(f string, x ...interface{}) {
428	fmt.Fprintf(b.hw, f, x...)
429	fmt.Fprint(b.hw, "\n")
430}
431
432func (b *builder) p(x ...interface{}) {
433	fmt.Fprintln(b.hw, x...)
434}
435
436func (b *builder) addSize(s int) {
437	b.w.Size += s
438	b.pf("// Size: %d bytes", s)
439}
440
441func (b *builder) writeConst(name string, x interface{}) {
442	b.comment(name)
443	b.w.WriteConst(name, x)
444}
445
446// writeConsts computes f(v) for all v in values and writes the results
447// as constants named _v to a single constant block.
448func (b *builder) writeConsts(f func(string) int, values ...string) {
449	b.pf("const (")
450	for _, v := range values {
451		b.pf("\t_%s = %v", v, f(v))
452	}
453	b.pf(")")
454}
455
456// writeType writes the type of the given value, which must be a struct.
457func (b *builder) writeType(value interface{}) {
458	b.comment(reflect.TypeOf(value).Name())
459	b.w.WriteType(value)
460}
461
462func (b *builder) writeSlice(name string, ss interface{}) {
463	b.writeSliceAddSize(name, 0, ss)
464}
465
466func (b *builder) writeSliceAddSize(name string, extraSize int, ss interface{}) {
467	b.comment(name)
468	b.w.Size += extraSize
469	v := reflect.ValueOf(ss)
470	t := v.Type().Elem()
471	b.pf("// Size: %d bytes, %d elements", v.Len()*int(t.Size())+extraSize, v.Len())
472
473	fmt.Fprintf(b.w, "var %s = ", name)
474	b.w.WriteArray(ss)
475	b.p()
476}
477
// FromTo is a pair of 16-bit values; writeSortedMap emits a slice of them.
type FromTo struct {
	From uint16
	To   uint16
}
481
482func (b *builder) writeSortedMap(name string, ss *stringSet, index func(s string) uint16) {
483	ss.sortFunc(func(a, b string) bool {
484		return index(a) < index(b)
485	})
486	m := []FromTo{}
487	for _, s := range ss.s {
488		m = append(m, FromTo{index(s), index(ss.update[s])})
489	}
490	b.writeSlice(name, m)
491}
492
// base is the number of letters from 'a' through 'z'.
const base = 'z' - 'a' + 1

// strToInt interprets s as a big-endian number in the given base, taking each
// byte's digit value to be its offset from 'a'.
func strToInt(s string) uint {
	var v uint
	for _, c := range []byte(s) {
		v = v*base + uint(c-'a')
	}
	return v
}
503
504// converts the given integer to the original ASCII string passed to strToInt.
505// len(s) must match the number of characters obtained.
506func intToStr(v uint, s []byte) {
507	for i := len(s) - 1; i >= 0; i-- {
508		s[i] = byte(v%base) + 'a'
509		v /= base
510	}
511}
512
513func (b *builder) writeBitVector(name string, ss []string) {
514	vec := make([]uint8, int(math.Ceil(math.Pow(base, float64(len(ss[0])))/8)))
515	for _, s := range ss {
516		v := strToInt(s)
517		vec[v/8] |= 1 << (v % 8)
518	}
519	b.writeSlice(name, vec)
520}
521
522// TODO: convert this type into a list or two-stage trie.
523func (b *builder) writeMapFunc(name string, m map[string]string, f func(string) uint16) {
524	b.comment(name)
525	v := reflect.ValueOf(m)
526	sz := v.Len() * (2 + int(v.Type().Key().Size()))
527	for _, k := range m {
528		sz += len(k)
529	}
530	b.addSize(sz)
531	keys := []string{}
532	b.pf(`var %s = map[string]uint16{`, name)
533	for k := range m {
534		keys = append(keys, k)
535	}
536	sort.Strings(keys)
537	for _, k := range keys {
538		b.pf("\t%q: %v,", k, f(m[k]))
539	}
540	b.p("}")
541}
542
543func (b *builder) writeMap(name string, m interface{}) {
544	b.comment(name)
545	v := reflect.ValueOf(m)
546	sz := v.Len() * (2 + int(v.Type().Key().Size()) + int(v.Type().Elem().Size()))
547	b.addSize(sz)
548	f := strings.FieldsFunc(fmt.Sprintf("%#v", m), func(r rune) bool {
549		return strings.IndexRune("{}, ", r) != -1
550	})
551	sort.Strings(f[1:])
552	b.pf(`var %s = %s{`, name, f[0])
553	for _, kv := range f[1:] {
554		b.pf("\t%s,", kv)
555	}
556	b.p("}")
557}
558
559func (b *builder) langIndex(s string) uint16 {
560	if s == "und" {
561		return 0
562	}
563	if i, ok := b.lang.find(s); ok {
564		return uint16(i)
565	}
566	return uint16(strToInt(s)) + uint16(len(b.lang.s))
567}
568
569// inc advances the string to its lexicographical successor.
570func inc(s string) string {
571	const maxTagLength = 4
572	var buf [maxTagLength]byte
573	intToStr(strToInt(strings.ToLower(s))+1, buf[:len(s)])
574	for i := 0; i < len(s); i++ {
575		if s[i] <= 'Z' {
576			buf[i] -= 'a' - 'A'
577		}
578	}
579	return string(buf[:len(s)])
580}
581
// parseIndices fills the lang, langNoIndex, script, region, variant and
// locale sets from the IANA registry and the CLDR data, and writes the
// NumLanguages, NumScripts and NumRegions constants.
func (b *builder) parseIndices() {
	meta := b.supp.Metadata

	// Distribute the registry entries over the sets according to their type.
	for k, v := range b.registry {
		var ss *stringSet
		switch v.typ {
		case "language":
			// 2-letter codes, codes with a suppress-script and special-scope
			// codes are indexed languages; the rest go to langNoIndex.
			if len(k) == 2 || v.suppressScript != "" || v.scope == "special" {
				b.lang.add(k)
				continue
			} else {
				ss = &b.langNoIndex
			}
		case "region":
			ss = &b.region
		case "script":
			ss = &b.script
		case "variant":
			ss = &b.variant
		default:
			continue
		}
		ss.add(k)
	}
	// Include any language for which there is data.
	for _, lang := range b.data.Locales() {
		if x := b.data.RawLDML(lang); false ||
			x.LocaleDisplayNames != nil ||
			x.Characters != nil ||
			x.Delimiters != nil ||
			x.Measurement != nil ||
			x.Dates != nil ||
			x.Numbers != nil ||
			x.Units != nil ||
			x.ListPatterns != nil ||
			x.Collations != nil ||
			x.Segmentations != nil ||
			x.Rbnf != nil ||
			x.Annotations != nil ||
			x.Metadata != nil {

			// Only the part before the first underscore is the language.
			from := strings.Split(lang, "_")
			if lang := from[0]; lang != "root" {
				b.lang.add(lang)
			}
		}
	}
	// Include locales for plural rules, which uses a different structure.
	for _, plurals := range b.data.Supplemental().Plurals {
		for _, rules := range plurals.PluralRules {
			for _, lang := range strings.Split(rules.Locales, " ") {
				if lang = strings.Split(lang, "_")[0]; lang != "root" {
					b.lang.add(lang)
				}
			}
		}
	}
	// Include languages in likely subtags.
	for _, m := range b.supp.LikelySubtags.LikelySubtag {
		from := strings.Split(m.From, "_")
		b.lang.add(from[0])
	}
	// Include ISO-639 alpha-3 bibliographic entries.
	for _, a := range meta.Alias.LanguageAlias {
		if a.Reason == "bibliographic" {
			b.langNoIndex.add(a.Type)
		}
	}
	// Include regions in territoryAlias (not all are in the IANA registry!)
	for _, reg := range b.supp.Metadata.Alias.TerritoryAlias {
		if len(reg.Type) == 2 {
			b.region.add(reg.Type)
		}
	}

	// A 3-letter code that ended up in lang must not also be in langNoIndex.
	for _, s := range b.lang.s {
		if len(s) == 3 {
			b.langNoIndex.remove(s)
		}
	}
	// The counts are written before the dummy entries below are added.
	b.writeConst("NumLanguages", len(b.lang.slice())+len(b.langNoIndex.slice()))
	b.writeConst("NumScripts", len(b.script.slice()))
	b.writeConst("NumRegions", len(b.region.slice()))

	// Add dummy codes at the start of each list to represent "unspecified".
	b.lang.add("---")
	b.script.add("----")
	b.region.add("---")

	// common locales
	b.locale.parse(meta.DefaultContent.Locales)
}
674
// TODO: region inclusion data will probably not be used in future matchers.
676
677func (b *builder) computeRegionGroups() {
678	b.groups = make(map[int]index)
679
680	// Create group indices.
681	for i := 1; b.region.s[i][0] < 'A'; i++ { // Base M49 indices on regionID.
682		b.groups[i] = index(len(b.groups))
683	}
684	for _, g := range b.supp.TerritoryContainment.Group {
685		// Skip UN and EURO zone as they are flattening the containment
686		// relationship.
687		if g.Type == "EZ" || g.Type == "UN" {
688			continue
689		}
690		group := b.region.index(g.Type)
691		if _, ok := b.groups[group]; !ok {
692			b.groups[group] = index(len(b.groups))
693		}
694	}
695	if len(b.groups) > 64 {
696		log.Fatalf("only 64 groups supported, found %d", len(b.groups))
697	}
698	b.writeConst("nRegionGroups", len(b.groups))
699}
700
// langConsts lists the language codes for which writeLanguage emits a
// constant named _<code> holding the language index.
var langConsts = []string{
	"af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
	"et", "fa", "fi", "fil", "fr", "gu", "he", "hi", "hr", "hu", "hy", "id", "is",
	"it", "ja", "ka", "kk", "km", "kn", "ko", "ky", "lo", "lt", "lv", "mk", "ml",
	"mn", "mo", "mr", "ms", "mul", "my", "nb", "ne", "nl", "no", "pa", "pl", "pt",
	"ro", "ru", "sh", "si", "sk", "sl", "sq", "sr", "sv", "sw", "ta", "te", "th",
	"tl", "tn", "tr", "uk", "ur", "uz", "vi", "zh", "zu",

	// constants for grandfathered tags (if not already defined)
	"jbo", "ami", "bnn", "hak", "tlh", "lb", "nv", "pwn", "tao", "tay", "tsu",
	"nn", "sfb", "vgt", "sgg", "cmn", "nan", "hsn",
}
713
// writeLanguage generates all tables needed for language canonicalization:
// the language constants, the lang, langNoIndex, altLangISO3 and altLangIndex
// tables, and the AliasMap and AliasTypes tables.
func (b *builder) writeLanguage() {
	meta := b.supp.Metadata

	b.writeConst("nonCanonicalUnd", b.lang.index("und"))
	b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...)
	b.writeConst("langPrivateStart", b.langIndex("qaa"))
	b.writeConst("langPrivateEnd", b.langIndex("qtz"))

	// Get language codes that need to be mapped (overlong 3-letter codes,
	// deprecated 2-letter codes, legacy and grandfathered tags.)
	langAliasMap := stringSet{}
	aliasTypeMap := map[string]AliasType{}

	// altLangISO3 collects the alternative ISO3 names that need to be mapped.
	altLangISO3 := stringSet{}
	// Add dummy start to avoid the use of index 0.
	altLangISO3.add("---")
	altLangISO3.updateLater("---", "aa")

	// Work on a copy so that the entries of b.lang keep their original form.
	lang := b.lang.clone()
	for _, a := range meta.Alias.LanguageAlias {
		if a.Replacement == "" {
			a.Replacement = "und"
		}
		// TODO: support mapping to tags
		repl := strings.SplitN(a.Replacement, "_", 2)[0]
		if a.Reason == "overlong" {
			// Record the 3-letter form of a 2-letter code.
			if len(a.Replacement) == 2 && len(a.Type) == 3 {
				lang.updateLater(a.Replacement, a.Type)
			}
		} else if len(a.Type) <= 3 {
			switch a.Reason {
			case "macrolanguage":
				aliasTypeMap[a.Type] = Macro
			case "deprecated":
				// handled elsewhere
				continue
			case "bibliographic", "legacy":
				if a.Type == "no" {
					continue
				}
				aliasTypeMap[a.Type] = Legacy
			default:
				// An unknown reason requires this generator to be updated.
				log.Fatalf("new %s alias: %s", a.Reason, a.Type)
			}
			langAliasMap.add(a.Type)
			langAliasMap.updateLater(a.Type, repl)
		}
	}
	// Manually add the mapping of "nb" (Norwegian) to its macro language.
	// This can be removed if CLDR adopts this change.
	langAliasMap.add("nb")
	langAliasMap.updateLater("nb", "no")
	aliasTypeMap["nb"] = Macro

	for k, v := range b.registry {
		// Also add deprecated values for 3-letter ISO codes, which CLDR omits.
		if v.typ == "language" && v.deprecated != "" && v.preferred != "" {
			langAliasMap.add(k)
			langAliasMap.updateLater(k, v.preferred)
			aliasTypeMap[k] = Deprecated
		}
	}
	// Fix CLDR mappings.
	lang.updateLater("tl", "tgl")
	lang.updateLater("sh", "hbs")
	lang.updateLater("mo", "mol")
	lang.updateLater("no", "nor")
	lang.updateLater("tw", "twi")
	lang.updateLater("nb", "nob")
	lang.updateLater("ak", "aka")
	lang.updateLater("bh", "bih")

	// Ensure that each 2-letter code is matched with a 3-letter code.
	for _, v := range lang.s[1:] {
		s, ok := lang.update[v]
		if !ok {
			// Fall back to the 3-letter form of the code that v is an alias
			// for, if there is one.
			if s, ok = lang.update[langAliasMap.update[v]]; !ok {
				continue
			}
			lang.update[v] = s
		}
		// A 3-letter form with a different first letter cannot be derived
		// from the 2-letter code and is stored in altLangISO3 instead.
		if v[0] != s[0] {
			altLangISO3.add(s)
			altLangISO3.updateLater(s, v)
		}
	}

	// Complete canonicalized language tags.
	lang.freeze()
	for i, v := range lang.s {
		// We can avoid these manual entries by using the IANA registry directly.
		// Seems easier to update the list manually, as changes are rare.
		// The panic in this loop will trigger if we miss an entry.
		add := ""
		if s, ok := lang.update[v]; ok {
			if s[0] == v[0] {
				// Same first letter: append the rest of the 3-letter code.
				add = s[1:]
			} else {
				// Otherwise: a zero byte and the index into altLangISO3.
				add = string([]byte{0, byte(altLangISO3.index(s))})
			}
		} else if len(v) == 3 {
			// 3-letter entries are padded to 4 bytes with a zero byte.
			add = "\x00"
		} else {
			log.Panicf("no data for long form of %q", v)
		}
		lang.s[i] += add
	}
	b.writeConst("lang", tag.Index(lang.join()))

	b.writeConst("langNoIndexOffset", len(b.lang.s))

	// space of all valid 3-letter language identifiers.
	b.writeBitVector("langNoIndex", b.langNoIndex.slice())

	altLangIndex := []uint16{}
	for i, s := range altLangISO3.slice() {
		// Append the current length of altLangIndex as a single byte to each
		// 3-letter code.
		altLangISO3.s[i] += string([]byte{byte(len(altLangIndex))})
		// The first entry is the dummy "---" and has no language index.
		if i > 0 {
			idx := b.lang.index(altLangISO3.update[s])
			altLangIndex = append(altLangIndex, uint16(idx))
		}
	}
	b.writeConst("altLangISO3", tag.Index(altLangISO3.join()))
	b.writeSlice("altLangIndex", altLangIndex)

	// writeSortedMap reorders langAliasMap.s, so the alias types below are
	// emitted in the same order as the entries of AliasMap.
	b.writeSortedMap("AliasMap", &langAliasMap, b.langIndex)
	types := make([]AliasType, len(langAliasMap.s))
	for i, s := range langAliasMap.s {
		types[i] = aliasTypeMap[s]
	}
	b.writeSlice("AliasTypes", types)
}
848
// scriptConsts lists the script codes for which writeScript emits a constant
// named _<code> holding the script index.
var scriptConsts = []string{
	"Latn", "Hani", "Hans", "Hant", "Qaaa", "Qaai", "Qabx", "Zinh", "Zyyy",
	"Zzzz",
}
853
854func (b *builder) writeScript() {
855	b.writeConsts(b.script.index, scriptConsts...)
856	b.writeConst("script", tag.Index(b.script.join()))
857
858	supp := make([]uint8, len(b.lang.slice()))
859	for i, v := range b.lang.slice()[1:] {
860		if sc := b.registry[v].suppressScript; sc != "" {
861			supp[i+1] = uint8(b.script.index(sc))
862		}
863	}
864	b.writeSlice("suppressScript", supp)
865
866	// There is only one deprecated script in CLDR. This value is hard-coded.
867	// We check here if the code must be updated.
868	for _, a := range b.supp.Metadata.Alias.ScriptAlias {
869		if a.Type != "Qaai" {
870			log.Panicf("unexpected deprecated stript %q", a.Type)
871		}
872	}
873}
874
875func parseM49(s string) int16 {
876	if len(s) == 0 {
877		return 0
878	}
879	v, err := strconv.ParseUint(s, 10, 10)
880	failOnError(err)
881	return int16(v)
882}
883
// regionConsts lists the region codes for which writeRegion emits a constant
// named _<code> holding the region index.
var regionConsts = []string{
	"001", "419", "BR", "CA", "ES", "GB", "MD", "PT", "UK", "US",
	"ZZ", "XA", "XC", "XK", // Unofficial tag for Kosovo.
}
888
// writeRegion writes the region constants and the region tables:
// isoRegionOffset, regionTypes, regionISO, altRegionISO3, altRegionIDs,
// regionOldMap, m49, m49Index and fromM49.
func (b *builder) writeRegion() {
	b.writeConsts(b.region.index, regionConsts...)

	// isoOffset is the index of the first 2-letter code in b.region.
	isoOffset := b.region.index("AA")
	m49map := make([]int16, len(b.region.slice()))
	fromM49map := make(map[int16]int)
	altRegionISO3 := ""
	altRegionIDs := []uint16{}

	b.writeConst("isoRegionOffset", isoOffset)

	// 2-letter region lookup and mapping to numeric codes.
	regionISO := b.region.clone()
	regionISO.s = regionISO.s[isoOffset:]
	regionISO.sorted = false

	regionTypes := make([]byte, len(b.region.s))

	// Is the region valid BCP 47?
	for s, e := range b.registry {
		if len(s) == 2 && s == strings.ToUpper(s) {
			i := b.region.index(s)
			for _, d := range e.description {
				if strings.Contains(d, "Private use") {
					regionTypes[i] = iso3166UserAssigned
				}
			}
			regionTypes[i] |= bcp47Region
		}
	}

	// Is the region a valid ccTLD?
	r := gen.OpenIANAFile("domains/root/db")
	defer r.Close()

	buf, err := ioutil.ReadAll(r)
	failOnError(err)
	// Every 2-letter domain page linked from the root zone database marks the
	// corresponding region as a ccTLD.
	re := regexp.MustCompile(`"/domains/root/db/([a-z]{2}).html"`)
	for _, m := range re.FindAllSubmatch(buf, -1) {
		i := b.region.index(strings.ToUpper(string(m[1])))
		regionTypes[i] |= ccTLD
	}

	b.writeSlice("regionTypes", regionTypes)

	// iso3Set maps a 3-letter code that has been seen to its offset in
	// altRegionISO3, or to -1 if it was stored inline in regionISO.
	iso3Set := make(map[string]int)
	// update appends the encoding of iso3 to the regionISO entry of iso2.
	update := func(iso2, iso3 string) {
		i := regionISO.index(iso2)
		if j, ok := iso3Set[iso3]; !ok && iso3[0] == iso2[0] {
			// First use and same first letter: store the last two letters.
			regionISO.s[i] += iso3[1:]
			iso3Set[iso3] = -1
		} else {
			if ok && j >= 0 {
				// Already in altRegionISO3: reuse its offset.
				regionISO.s[i] += string([]byte{0, byte(j)})
			} else {
				// Append the code to altRegionISO3 and refer to its offset.
				iso3Set[iso3] = len(altRegionISO3)
				regionISO.s[i] += string([]byte{0, byte(len(altRegionISO3))})
				altRegionISO3 += iso3
				altRegionIDs = append(altRegionIDs, uint16(isoOffset+i))
			}
		}
	}
	for _, tc := range b.supp.CodeMappings.TerritoryCodes {
		i := regionISO.index(tc.Type) + isoOffset
		if d := m49map[i]; d != 0 {
			log.Panicf("%s found as a duplicate UN.M49 code of %03d", tc.Numeric, d)
		}
		m49 := parseM49(tc.Numeric)
		m49map[i] = m49
		if r := fromM49map[m49]; r == 0 {
			fromM49map[m49] = i
		} else if r != i {
			// Two regions share this code. The new region replaces the
			// recorded one only if the recorded one is deprecated and the new
			// one is not deprecated or has a later deprecation value.
			dep := b.registry[regionISO.s[r-isoOffset]].deprecated
			if t := b.registry[tc.Type]; t != nil && dep != "" && (t.deprecated == "" || t.deprecated > dep) {
				fromM49map[m49] = i
			}
		}
	}
	// Numeric aliases with a 2-letter replacement fill in codes that have no
	// mapping yet.
	for _, ta := range b.supp.Metadata.Alias.TerritoryAlias {
		if len(ta.Type) == 3 && ta.Type[0] <= '9' && len(ta.Replacement) == 2 {
			from := parseM49(ta.Type)
			if r := fromM49map[from]; r == 0 {
				fromM49map[from] = regionISO.index(ta.Replacement) + isoOffset
			}
		}
	}
	for _, tc := range b.supp.CodeMappings.TerritoryCodes {
		if len(tc.Alpha3) == 3 {
			update(tc.Type, tc.Alpha3)
		}
	}
	// These entries are not included in territoryCodes. Mostly 3-letter variants
	// of deleted codes and an entry for QU.
	for _, m := range []struct{ iso2, iso3 string }{
		{"CT", "CTE"},
		{"DY", "DHY"},
		{"HV", "HVO"},
		{"JT", "JTN"},
		{"MI", "MID"},
		{"NH", "NHB"},
		{"NQ", "ATN"},
		{"PC", "PCI"},
		{"PU", "PUS"},
		{"PZ", "PCZ"},
		{"RH", "RHO"},
		{"VD", "VDR"},
		{"WK", "WAK"},
		// These three-letter codes are used for others as well.
		{"FQ", "ATF"},
	} {
		update(m.iso2, m.iso3)
	}
	// Pad the entries that did not receive a 3-letter encoding to 4 bytes.
	for i, s := range regionISO.s {
		if len(s) != 4 {
			regionISO.s[i] = s + "  "
		}
	}
	b.writeConst("regionISO", tag.Index(regionISO.join()))
	b.writeConst("altRegionISO3", altRegionISO3)
	b.writeSlice("altRegionIDs", altRegionIDs)

	// Create list of deprecated regions.
	// TODO: consider inserting SF -> FI. Not included by CLDR, but is the only
	// Transitionally-reserved mapping not included.
	regionOldMap := stringSet{}
	// Include regions in territoryAlias (not all are in the IANA registry!)
	for _, reg := range b.supp.Metadata.Alias.TerritoryAlias {
		if len(reg.Type) == 2 && reg.Reason == "deprecated" && len(reg.Replacement) == 2 {
			regionOldMap.add(reg.Type)
			regionOldMap.updateLater(reg.Type, reg.Replacement)
			// NOTE(review): the entries of regionISO.s are 4 bytes long by
			// now, so an exact match on a 2-letter code is not expected; the
			// found flag is ignored and the search position is used directly.
			// Confirm that this is intended.
			i, _ := regionISO.find(reg.Type)
			j, _ := regionISO.find(reg.Replacement)
			// A deprecated region without a code takes the code of its
			// replacement.
			if k := m49map[i+isoOffset]; k == 0 {
				m49map[i+isoOffset] = m49map[j+isoOffset]
			}
		}
	}
	b.writeSortedMap("regionOldMap", &regionOldMap, func(s string) uint16 {
		return uint16(b.region.index(s))
	})
	// 3-digit region lookup, groupings.
	for i := 1; i < isoOffset; i++ {
		m := parseM49(b.region.s[i])
		m49map[i] = m
		fromM49map[m] = i
	}
	b.writeSlice("m49", m49map)

	const (
		searchBits = 7
		regionBits = 9
	)
	if len(m49map) >= 1<<regionBits {
		log.Fatalf("Maximum number of regions exceeded: %d > %d", len(m49map), 1<<regionBits)
	}
	m49Index := [9]int16{}
	fromM49 := []uint16{}
	m49 := []int{}
	for k, _ := range fromM49map {
		m49 = append(m49, int(k))
	}
	sort.Ints(m49)
	// The smallest code is skipped. Every other code is packed as its low
	// searchBits bits followed by the regionBits bits of the region index;
	// m49Index records the end of each block of codes that share the same
	// high bits.
	for _, k := range m49[1:] {
		val := (k & (1<<searchBits - 1)) << regionBits
		fromM49 = append(fromM49, uint16(val|fromM49map[int16(k)]))
		m49Index[1:][k>>searchBits] = int16(len(fromM49))
	}
	b.writeSlice("m49Index", m49Index)
	b.writeSlice("fromM49", fromM49)
}
1059
// Space-separated lists of two-letter region codes, grouped by status.
// NOTE(review): judging by the names, iso3166Except holds the ISO 3166
// exceptionally reserved codes, iso3166Trans the transitionally reserved
// codes, and iso3166DelCLDR deleted codes that are still needed for CLDR;
// confirm against the code that consumes these lists.
const (
	// TODO: put these lists in regionTypes as user data? Could be used for
	// various optimizations and refinements and could be exposed in the API.
	iso3166Except = "AC CP DG EA EU FX IC SU TA UK"
	iso3166Trans  = "AN BU CS NT TP YU ZR" // SF is not in our set of Regions.
	// DY and RH are actually not deleted, but indeterminately reserved.
	iso3166DelCLDR = "CT DD DY FQ HV JT MI NH NQ PC PU PZ RH VD WK YD"
)
1068
// Bit flags with the values 1, 2 and 4; each occupies its own bit so that
// they can be OR-ed together.
// NOTE(review): presumably these classify a region (user-assigned ISO 3166
// code, country-code top-level domain, valid BCP 47 region) — confirm
// against the code that sets and reads them.
const (
	iso3166UserAssigned = 1 << iota
	ccTLD
	bcp47Region
)
1074
// find reports the position of the first element of list that is equal to s.
// It returns -1 when list contains no such element (including when list is
// nil or empty).
func find(list []string, s string) int {
	pos := -1
	for i := range list {
		if list[i] != s {
			continue
		}
		pos = i
		break
	}
	return pos
}
1083
1084// writeVariants generates per-variant information and creates a map from variant
1085// name to index value. We assign index values such that sorting multiple
1086// variants by index value will result in the correct order.
1087// There are two types of variants: specialized and general. Specialized variants
1088// are only applicable to certain language or language-script pairs. Generalized
1089// variants apply to any language. Generalized variants always sort after
1090// specialized variants.  We will therefore always assign a higher index value
1091// to a generalized variant than any other variant. Generalized variants are
1092// sorted alphabetically among themselves.
1093// Specialized variants may also sort after other specialized variants. Such
1094// variants will be ordered after any of the variants they may follow.
1095// We assume that if a variant x is followed by a variant y, then for any prefix
1096// p of x, p-x is a prefix of y. This allows us to order tags based on the
1097// maximum of the length of any of its prefixes.
1098// TODO: it is possible to define a set of Prefix values on variants such that
1099// a total order cannot be defined to the point that this algorithm breaks.
1100// In other words, we cannot guarantee the same order of variants for the
1101// future using the same algorithm or for non-compliant combinations of
1102// variants. For this reason, consider using simple alphabetic sorting
1103// of variants and ignore Prefix restrictions altogether.
1104func (b *builder) writeVariant() {
1105	generalized := stringSet{}
1106	specialized := stringSet{}
1107	specializedExtend := stringSet{}
1108	// Collate the variants by type and check assumptions.
1109	for _, v := range b.variant.slice() {
1110		e := b.registry[v]
1111		if len(e.prefix) == 0 {
1112			generalized.add(v)
1113			continue
1114		}
1115		c := strings.Split(e.prefix[0], "-")
1116		hasScriptOrRegion := false
1117		if len(c) > 1 {
1118			_, hasScriptOrRegion = b.script.find(c[1])
1119			if !hasScriptOrRegion {
1120				_, hasScriptOrRegion = b.region.find(c[1])
1121
1122			}
1123		}
1124		if len(c) == 1 || len(c) == 2 && hasScriptOrRegion {
1125			// Variant is preceded by a language.
1126			specialized.add(v)
1127			continue
1128		}
1129		// Variant is preceded by another variant.
1130		specializedExtend.add(v)
1131		prefix := c[0] + "-"
1132		if hasScriptOrRegion {
1133			prefix += c[1]
1134		}
1135		for _, p := range e.prefix {
1136			// Verify that the prefix minus the last element is a prefix of the
1137			// predecessor element.
1138			i := strings.LastIndex(p, "-")
1139			pred := b.registry[p[i+1:]]
1140			if find(pred.prefix, p[:i]) < 0 {
1141				log.Fatalf("prefix %q for variant %q not consistent with predecessor spec", p, v)
1142			}
1143			// The sorting used below does not work in the general case. It works
1144			// if we assume that variants that may be followed by others only have
1145			// prefixes of the same length. Verify this.
1146			count := strings.Count(p[:i], "-")
1147			for _, q := range pred.prefix {
1148				if c := strings.Count(q, "-"); c != count {
1149					log.Fatalf("variant %q preceding %q has a prefix %q of size %d; want %d", p[i+1:], v, q, c, count)
1150				}
1151			}
1152			if !strings.HasPrefix(p, prefix) {
1153				log.Fatalf("prefix %q of variant %q should start with %q", p, v, prefix)
1154			}
1155		}
1156	}
1157
1158	// Sort extended variants.
1159	a := specializedExtend.s
1160	less := func(v, w string) bool {
1161		// Sort by the maximum number of elements.
1162		maxCount := func(s string) (max int) {
1163			for _, p := range b.registry[s].prefix {
1164				if c := strings.Count(p, "-"); c > max {
1165					max = c
1166				}
1167			}
1168			return
1169		}
1170		if cv, cw := maxCount(v), maxCount(w); cv != cw {
1171			return cv < cw
1172		}
1173		// Sort by name as tie breaker.
1174		return v < w
1175	}
1176	sort.Sort(funcSorter{less, sort.StringSlice(a)})
1177	specializedExtend.frozen = true
1178
1179	// Create index from variant name to index.
1180	variantIndex := make(map[string]uint8)
1181	add := func(s []string) {
1182		for _, v := range s {
1183			variantIndex[v] = uint8(len(variantIndex))
1184		}
1185	}
1186	add(specialized.slice())
1187	add(specializedExtend.s)
1188	numSpecialized := len(variantIndex)
1189	add(generalized.slice())
1190	if n := len(variantIndex); n > 255 {
1191		log.Fatalf("maximum number of variants exceeded: was %d; want <= 255", n)
1192	}
1193	b.writeMap("variantIndex", variantIndex)
1194	b.writeConst("variantNumSpecialized", numSpecialized)
1195}
1196
// writeLanguageInfo is currently a no-op: its body is empty, so calling it
// writes nothing.
func (b *builder) writeLanguageInfo() {
}
1199
1200// writeLikelyData writes tables that are used both for finding parent relations and for
1201// language matching.  Each entry contains additional bits to indicate the status of the
1202// data to know when it cannot be used for parent relations.
1203func (b *builder) writeLikelyData() {
1204	const (
1205		isList = 1 << iota
1206		scriptInFrom
1207		regionInFrom
1208	)
1209	type ( // generated types
1210		likelyScriptRegion struct {
1211			region uint16
1212			script uint8
1213			flags  uint8
1214		}
1215		likelyLangScript struct {
1216			lang   uint16
1217			script uint8
1218			flags  uint8
1219		}
1220		likelyLangRegion struct {
1221			lang   uint16
1222			region uint16
1223		}
1224		// likelyTag is used for getting likely tags for group regions, where
1225		// the likely region might be a region contained in the group.
1226		likelyTag struct {
1227			lang   uint16
1228			region uint16
1229			script uint8
1230		}
1231	)
1232	var ( // generated variables
1233		likelyRegionGroup = make([]likelyTag, len(b.groups))
1234		likelyLang        = make([]likelyScriptRegion, len(b.lang.s))
1235		likelyRegion      = make([]likelyLangScript, len(b.region.s))
1236		likelyScript      = make([]likelyLangRegion, len(b.script.s))
1237		likelyLangList    = []likelyScriptRegion{}
1238		likelyRegionList  = []likelyLangScript{}
1239	)
1240	type fromTo struct {
1241		from, to []string
1242	}
1243	langToOther := map[int][]fromTo{}
1244	regionToOther := map[int][]fromTo{}
1245	for _, m := range b.supp.LikelySubtags.LikelySubtag {
1246		from := strings.Split(m.From, "_")
1247		to := strings.Split(m.To, "_")
1248		if len(to) != 3 {
1249			log.Fatalf("invalid number of subtags in %q: found %d, want 3", m.To, len(to))
1250		}
1251		if len(from) > 3 {
1252			log.Fatalf("invalid number of subtags: found %d, want 1-3", len(from))
1253		}
1254		if from[0] != to[0] && from[0] != "und" {
1255			log.Fatalf("unexpected language change in expansion: %s -> %s", from, to)
1256		}
1257		if len(from) == 3 {
1258			if from[2] != to[2] {
1259				log.Fatalf("unexpected region change in expansion: %s -> %s", from, to)
1260			}
1261			if from[0] != "und" {
1262				log.Fatalf("unexpected fully specified from tag: %s -> %s", from, to)
1263			}
1264		}
1265		if len(from) == 1 || from[0] != "und" {
1266			id := 0
1267			if from[0] != "und" {
1268				id = b.lang.index(from[0])
1269			}
1270			langToOther[id] = append(langToOther[id], fromTo{from, to})
1271		} else if len(from) == 2 && len(from[1]) == 4 {
1272			sid := b.script.index(from[1])
1273			likelyScript[sid].lang = uint16(b.langIndex(to[0]))
1274			likelyScript[sid].region = uint16(b.region.index(to[2]))
1275		} else {
1276			r := b.region.index(from[len(from)-1])
1277			if id, ok := b.groups[r]; ok {
1278				if from[0] != "und" {
1279					log.Fatalf("region changed unexpectedly: %s -> %s", from, to)
1280				}
1281				likelyRegionGroup[id].lang = uint16(b.langIndex(to[0]))
1282				likelyRegionGroup[id].script = uint8(b.script.index(to[1]))
1283				likelyRegionGroup[id].region = uint16(b.region.index(to[2]))
1284			} else {
1285				regionToOther[r] = append(regionToOther[r], fromTo{from, to})
1286			}
1287		}
1288	}
1289	b.writeType(likelyLangRegion{})
1290	b.writeSlice("likelyScript", likelyScript)
1291
1292	for id := range b.lang.s {
1293		list := langToOther[id]
1294		if len(list) == 1 {
1295			likelyLang[id].region = uint16(b.region.index(list[0].to[2]))
1296			likelyLang[id].script = uint8(b.script.index(list[0].to[1]))
1297		} else if len(list) > 1 {
1298			likelyLang[id].flags = isList
1299			likelyLang[id].region = uint16(len(likelyLangList))
1300			likelyLang[id].script = uint8(len(list))
1301			for _, x := range list {
1302				flags := uint8(0)
1303				if len(x.from) > 1 {
1304					if x.from[1] == x.to[2] {
1305						flags = regionInFrom
1306					} else {
1307						flags = scriptInFrom
1308					}
1309				}
1310				likelyLangList = append(likelyLangList, likelyScriptRegion{
1311					region: uint16(b.region.index(x.to[2])),
1312					script: uint8(b.script.index(x.to[1])),
1313					flags:  flags,
1314				})
1315			}
1316		}
1317	}
1318	// TODO: merge suppressScript data with this table.
1319	b.writeType(likelyScriptRegion{})
1320	b.writeSlice("likelyLang", likelyLang)
1321	b.writeSlice("likelyLangList", likelyLangList)
1322
1323	for id := range b.region.s {
1324		list := regionToOther[id]
1325		if len(list) == 1 {
1326			likelyRegion[id].lang = uint16(b.langIndex(list[0].to[0]))
1327			likelyRegion[id].script = uint8(b.script.index(list[0].to[1]))
1328			if len(list[0].from) > 2 {
1329				likelyRegion[id].flags = scriptInFrom
1330			}
1331		} else if len(list) > 1 {
1332			likelyRegion[id].flags = isList
1333			likelyRegion[id].lang = uint16(len(likelyRegionList))
1334			likelyRegion[id].script = uint8(len(list))
1335			for i, x := range list {
1336				if len(x.from) == 2 && i != 0 || i > 0 && len(x.from) != 3 {
1337					log.Fatalf("unspecified script must be first in list: %v at %d", x.from, i)
1338				}
1339				x := likelyLangScript{
1340					lang:   uint16(b.langIndex(x.to[0])),
1341					script: uint8(b.script.index(x.to[1])),
1342				}
1343				if len(list[0].from) > 2 {
1344					x.flags = scriptInFrom
1345				}
1346				likelyRegionList = append(likelyRegionList, x)
1347			}
1348		}
1349	}
1350	b.writeType(likelyLangScript{})
1351	b.writeSlice("likelyRegion", likelyRegion)
1352	b.writeSlice("likelyRegionList", likelyRegionList)
1353
1354	b.writeType(likelyTag{})
1355	b.writeSlice("likelyRegionGroup", likelyRegionGroup)
1356}
1357
1358func (b *builder) writeRegionInclusionData() {
1359	var (
1360		// mm holds for each group the set of groups with a distance of 1.
1361		mm = make(map[int][]index)
1362
1363		// containment holds for each group the transitive closure of
1364		// containment of other groups.
1365		containment = make(map[index][]index)
1366	)
1367	for _, g := range b.supp.TerritoryContainment.Group {
1368		// Skip UN and EURO zone as they are flattening the containment
1369		// relationship.
1370		if g.Type == "EZ" || g.Type == "UN" {
1371			continue
1372		}
1373		group := b.region.index(g.Type)
1374		groupIdx := b.groups[group]
1375		for _, mem := range strings.Split(g.Contains, " ") {
1376			r := b.region.index(mem)
1377			mm[r] = append(mm[r], groupIdx)
1378			if g, ok := b.groups[r]; ok {
1379				mm[group] = append(mm[group], g)
1380				containment[groupIdx] = append(containment[groupIdx], g)
1381			}
1382		}
1383	}
1384
1385	regionContainment := make([]uint64, len(b.groups))
1386	for _, g := range b.groups {
1387		l := containment[g]
1388
1389		// Compute the transitive closure of containment.
1390		for i := 0; i < len(l); i++ {
1391			l = append(l, containment[l[i]]...)
1392		}
1393
1394		// Compute the bitmask.
1395		regionContainment[g] = 1 << g
1396		for _, v := range l {
1397			regionContainment[g] |= 1 << v
1398		}
1399	}
1400	b.writeSlice("regionContainment", regionContainment)
1401
1402	regionInclusion := make([]uint8, len(b.region.s))
1403	bvs := make(map[uint64]index)
1404	// Make the first bitvector positions correspond with the groups.
1405	for r, i := range b.groups {
1406		bv := uint64(1 << i)
1407		for _, g := range mm[r] {
1408			bv |= 1 << g
1409		}
1410		bvs[bv] = i
1411		regionInclusion[r] = uint8(bvs[bv])
1412	}
1413	for r := 1; r < len(b.region.s); r++ {
1414		if _, ok := b.groups[r]; !ok {
1415			bv := uint64(0)
1416			for _, g := range mm[r] {
1417				bv |= 1 << g
1418			}
1419			if bv == 0 {
1420				// Pick the world for unspecified regions.
1421				bv = 1 << b.groups[b.region.index("001")]
1422			}
1423			if _, ok := bvs[bv]; !ok {
1424				bvs[bv] = index(len(bvs))
1425			}
1426			regionInclusion[r] = uint8(bvs[bv])
1427		}
1428	}
1429	b.writeSlice("regionInclusion", regionInclusion)
1430	regionInclusionBits := make([]uint64, len(bvs))
1431	for k, v := range bvs {
1432		regionInclusionBits[v] = uint64(k)
1433	}
1434	// Add bit vectors for increasingly large distances until a fixed point is reached.
1435	regionInclusionNext := []uint8{}
1436	for i := 0; i < len(regionInclusionBits); i++ {
1437		bits := regionInclusionBits[i]
1438		next := bits
1439		for i := uint(0); i < uint(len(b.groups)); i++ {
1440			if bits&(1<<i) != 0 {
1441				next |= regionInclusionBits[i]
1442			}
1443		}
1444		if _, ok := bvs[next]; !ok {
1445			bvs[next] = index(len(bvs))
1446			regionInclusionBits = append(regionInclusionBits, next)
1447		}
1448		regionInclusionNext = append(regionInclusionNext, uint8(bvs[next]))
1449	}
1450	b.writeSlice("regionInclusionBits", regionInclusionBits)
1451	b.writeSlice("regionInclusionNext", regionInclusionNext)
1452}
1453
// parentRel describes one parent-locale override as assembled by
// writeParents:
//   - lang is the language index of the parent locale;
//   - script is the script index of the parent locale and is only set when
//     the parent names a script explicitly;
//   - maxScript equals script when the parent names a script, and is the
//     index of "Latn" otherwise;
//   - toRegion is the region index of the parent locale;
//   - fromRegion lists the region indexes of the locales that are mapped to
//     this parent.
type parentRel struct {
	lang       uint16
	script     uint8
	maxScript  uint8
	toRegion   uint16
	fromRegion []uint16
}
1461
1462func (b *builder) writeParents() {
1463	b.writeType(parentRel{})
1464
1465	parents := []parentRel{}
1466
1467	// Construct parent overrides.
1468	n := 0
1469	for _, p := range b.data.Supplemental().ParentLocales.ParentLocale {
1470		// Skipping non-standard scripts to root is implemented using addTags.
1471		if p.Parent == "root" {
1472			continue
1473		}
1474
1475		sub := strings.Split(p.Parent, "_")
1476		parent := parentRel{lang: b.langIndex(sub[0])}
1477		if len(sub) == 2 {
1478			// TODO: check that all undefined scripts are indeed Latn in these
1479			// cases.
1480			parent.maxScript = uint8(b.script.index("Latn"))
1481			parent.toRegion = uint16(b.region.index(sub[1]))
1482		} else {
1483			parent.script = uint8(b.script.index(sub[1]))
1484			parent.maxScript = parent.script
1485			parent.toRegion = uint16(b.region.index(sub[2]))
1486		}
1487		for _, c := range strings.Split(p.Locales, " ") {
1488			region := b.region.index(c[strings.LastIndex(c, "_")+1:])
1489			parent.fromRegion = append(parent.fromRegion, uint16(region))
1490		}
1491		parents = append(parents, parent)
1492		n += len(parent.fromRegion)
1493	}
1494	b.writeSliceAddSize("parents", n*2, parents)
1495}
1496
1497func main() {
1498	gen.Init()
1499
1500	gen.Repackage("gen_common.go", "common.go", "language")
1501
1502	w := gen.NewCodeWriter()
1503	defer w.WriteGoFile("tables.go", "language")
1504
1505	fmt.Fprintln(w, `import "golang.org/x/text/internal/tag"`)
1506
1507	b := newBuilder(w)
1508	gen.WriteCLDRVersion(w)
1509
1510	b.parseIndices()
1511	b.writeType(FromTo{})
1512	b.writeLanguage()
1513	b.writeScript()
1514	b.writeRegion()
1515	b.writeVariant()
1516	// TODO: b.writeLocale()
1517	b.computeRegionGroups()
1518	b.writeLikelyData()
1519	b.writeRegionInclusionData()
1520	b.writeParents()
1521}
1522