1// Copyright 2013 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build ignore
6
7// Language tag table generator.
8// Data read from the web.
9
10package main
11
12import (
13	"bufio"
14	"flag"
15	"fmt"
16	"io"
17	"io/ioutil"
18	"log"
19	"math"
20	"reflect"
21	"regexp"
22	"sort"
23	"strconv"
24	"strings"
25
26	"golang.org/x/text/internal/gen"
27	"golang.org/x/text/internal/tag"
28	"golang.org/x/text/unicode/cldr"
29)
30
// Command-line flags of the generator.
var (
	// test is the -test flag; see its usage string for what it enables.
	test = flag.Bool("test", false,
		"test existing tables; can be used to compare web data with package data.")

	// outputFile is the -output flag: the file the generated tables go to.
	outputFile = flag.String("output", "tables.go",
		"output file for generated tables")
)
39
// comment holds the documentation that accompanies the generated
// declarations. Each entry starts with a newline followed by the name of the
// identifier it documents; that first word is used as the lookup key when
// commentIndex is built.
//
// NOTE(review): the regionISO entry ("[A-Z}{2}") and the m49Index entry
// ("msb39") look like typos. They are left untouched here because the text
// is emitted verbatim into the generated file.
var comment = []string{
	`
lang holds an alphabetically sorted list of ISO-639 language identifiers.
All entries are 4 bytes. The index of the identifier (divided by 4) is the language tag.
For 2-byte language identifiers, the two successive bytes have the following meaning:
    - if the first letter of the 2- and 3-letter ISO codes are the same:
      the second and third letter of the 3-letter ISO code.
    - otherwise: a 0 and a by 2 bits right-shifted index into altLangISO3.
For 3-byte language identifiers the 4th byte is 0.`,
	`
langNoIndex is a bit vector of all 3-letter language codes that are not used as an index
in lookup tables. The language ids for these language codes are derived directly
from the letters and are not consecutive.`,
	`
altLangISO3 holds an alphabetically sorted list of 3-letter language code alternatives
to 2-letter language codes that cannot be derived using the method described above.
Each 3-letter code is followed by its 1-byte langID.`,
	`
altLangIndex is used to convert indexes in altLangISO3 to langIDs.`,
	`
AliasMap maps langIDs to their suggested replacements.`,
	`
script is an alphabetically sorted list of ISO 15924 codes. The index
of the script in the string, divided by 4, is the internal scriptID.`,
	`
isoRegionOffset needs to be added to the index of regionISO to obtain the regionID
for 2-letter ISO codes. (The first isoRegionOffset regionIDs are reserved for
the UN.M49 codes used for groups.)`,
	`
regionISO holds a list of alphabetically sorted 2-letter ISO region codes.
Each 2-letter codes is followed by two bytes with the following meaning:
    - [A-Z}{2}: the first letter of the 2-letter code plus these two
                letters form the 3-letter ISO code.
    - 0, n:     index into altRegionISO3.`,
	`
regionTypes defines the status of a region for various standards.`,
	`
m49 maps regionIDs to UN.M49 codes. The first isoRegionOffset entries are
codes indicating collections of regions.`,
	`
m49Index gives indexes into fromM49 based on the three most significant bits
of a 10-bit UN.M49 code. To search an UN.M49 code in fromM49, search in
   fromM49[m49Index[msb39(code)]:m49Index[msb3(code)+1]]
for an entry where the first 7 bits match the 7 lsb of the UN.M49 code.
The region code is stored in the 9 lsb of the indexed value.`,
	`
fromM49 contains entries to map UN.M49 codes to regions. See m49Index for details.`,
	`
altRegionISO3 holds a list of 3-letter region codes that cannot be
mapped to 2-letter codes using the default algorithm. This is a short list.`,
	`
altRegionIDs holds a list of regionIDs the positions of which match those
of the 3-letter ISO codes in altRegionISO3.`,
	`
variantNumSpecialized is the number of specialized variants in variants.`,
	`
suppressScript is an index from langID to the dominant script for that language,
if it exists.  If a script is given, it should be suppressed from the language tag.`,
	`
likelyLang is a lookup table, indexed by langID, for the most likely
scripts and regions given incomplete information. If more entries exist for a
given language, region and script are the index and size respectively
of the list in likelyLangList.`,
	`
likelyLangList holds lists info associated with likelyLang.`,
	`
likelyRegion is a lookup table, indexed by regionID, for the most likely
languages and scripts given incomplete information. If more entries exist
for a given regionID, lang and script are the index and size respectively
of the list in likelyRegionList.
TODO: exclude containers and user-definable regions from the list.`,
	`
likelyRegionList holds lists info associated with likelyRegion.`,
	`
likelyScript is a lookup table, indexed by scriptID, for the most likely
languages and regions given a script.`,
	`
nRegionGroups is the number of region groups.`,
	`
regionInclusion maps region identifiers to sets of regions in regionInclusionBits,
where each set holds all groupings that are directly connected in a region
containment graph.`,
	`
regionInclusionBits is an array of bit vectors where every vector represents
a set of region groupings.  These sets are used to compute the distance
between two regions for the purpose of language matching.`,
	`
regionInclusionNext marks, for each entry in regionInclusionBits, the set of
all groups that are reachable from the groups set in the respective entry.`,
}
130
131// TODO: consider changing some of these structures to tries. This can reduce
132// memory, but may increase the need for memory allocations. This could be
133// mitigated if we can piggyback on language tags for common cases.
134
// failOnError logs e and panics if e is non-nil; a nil error is a no-op.
func failOnError(e error) {
	if e == nil {
		return
	}
	log.Panic(e)
}
140
// setType identifies the way a stringSet is used. It exists so that
// conflicting uses of the same set can be detected.
type setType int

// The zero value of setType means that no use has been recorded yet.
const (
	Indexed setType = iota + 1 // all elements must be of same size
	Linear
)
147
148type stringSet struct {
149	s              []string
150	sorted, frozen bool
151
152	// We often need to update values after the creation of an index is completed.
153	// We include a convenience map for keeping track of this.
154	update map[string]string
155	typ    setType // used for checking.
156}
157
158func (ss *stringSet) clone() stringSet {
159	c := *ss
160	c.s = append([]string(nil), c.s...)
161	return c
162}
163
164func (ss *stringSet) setType(t setType) {
165	if ss.typ != t && ss.typ != 0 {
166		log.Panicf("type %d cannot be assigned as it was already %d", t, ss.typ)
167	}
168}
169
170// parse parses a whitespace-separated string and initializes ss with its
171// components.
172func (ss *stringSet) parse(s string) {
173	scan := bufio.NewScanner(strings.NewReader(s))
174	scan.Split(bufio.ScanWords)
175	for scan.Scan() {
176		ss.add(scan.Text())
177	}
178}
179
180func (ss *stringSet) assertChangeable() {
181	if ss.frozen {
182		log.Panic("attempt to modify a frozen stringSet")
183	}
184}
185
186func (ss *stringSet) add(s string) {
187	ss.assertChangeable()
188	ss.s = append(ss.s, s)
189	ss.sorted = ss.frozen
190}
191
192func (ss *stringSet) freeze() {
193	ss.compact()
194	ss.frozen = true
195}
196
197func (ss *stringSet) compact() {
198	if ss.sorted {
199		return
200	}
201	a := ss.s
202	sort.Strings(a)
203	k := 0
204	for i := 1; i < len(a); i++ {
205		if a[k] != a[i] {
206			a[k+1] = a[i]
207			k++
208		}
209	}
210	ss.s = a[:k+1]
211	ss.sorted = ss.frozen
212}
213
// funcSorter is a sort.Interface over a slice of strings. Len and Swap come
// from the embedded sort.StringSlice; the ordering is given by fn.
type funcSorter struct {
	fn func(a, b string) bool
	sort.StringSlice
}

// Less reports whether fn orders element i before element j.
func (s funcSorter) Less(i, j int) bool {
	lhs, rhs := s.StringSlice[i], s.StringSlice[j]
	return s.fn(lhs, rhs)
}
222
223func (ss *stringSet) sortFunc(f func(a, b string) bool) {
224	ss.compact()
225	sort.Sort(funcSorter{f, sort.StringSlice(ss.s)})
226}
227
228func (ss *stringSet) remove(s string) {
229	ss.assertChangeable()
230	if i, ok := ss.find(s); ok {
231		copy(ss.s[i:], ss.s[i+1:])
232		ss.s = ss.s[:len(ss.s)-1]
233	}
234}
235
236func (ss *stringSet) replace(ol, nu string) {
237	ss.s[ss.index(ol)] = nu
238	ss.sorted = ss.frozen
239}
240
241func (ss *stringSet) index(s string) int {
242	ss.setType(Indexed)
243	i, ok := ss.find(s)
244	if !ok {
245		if i < len(ss.s) {
246			log.Panicf("find: item %q is not in list. Closest match is %q.", s, ss.s[i])
247		}
248		log.Panicf("find: item %q is not in list", s)
249
250	}
251	return i
252}
253
254func (ss *stringSet) find(s string) (int, bool) {
255	ss.compact()
256	i := sort.SearchStrings(ss.s, s)
257	return i, i != len(ss.s) && ss.s[i] == s
258}
259
260func (ss *stringSet) slice() []string {
261	ss.compact()
262	return ss.s
263}
264
265func (ss *stringSet) updateLater(v, key string) {
266	if ss.update == nil {
267		ss.update = map[string]string{}
268	}
269	ss.update[v] = key
270}
271
272// join joins the string and ensures that all entries are of the same length.
273func (ss *stringSet) join() string {
274	ss.setType(Indexed)
275	n := len(ss.s[0])
276	for _, s := range ss.s {
277		if len(s) != n {
278			log.Panicf("join: not all entries are of the same length: %q", s)
279		}
280	}
281	ss.s = append(ss.s, strings.Repeat("\xff", n))
282	return strings.Join(ss.s, "")
283}
284
// ianaEntry holds information for an entry in the IANA Language Subtag Repository.
// All types use the same entry.
// See http://tools.ietf.org/html/bcp47#section-5.1 for a description of the various
// fields.
type ianaEntry struct {
	typ            string   // value of the "Type:" field
	description    []string // one element per "Description:" field
	scope          string   // value of the "Scope:" field
	added          string   // value of the "Added:" field
	preferred      string   // value of the "Preferred-Value:" field
	deprecated     string   // value of the "Deprecated:" field
	suppressScript string   // value of the "Suppress-Script:" field
	macro          string   // value of the "Macrolanguage:" field
	prefix         []string // one element per "Prefix:" field
}
300
301type builder struct {
302	w    *gen.CodeWriter
303	hw   io.Writer // MultiWriter for w and w.Hash
304	data *cldr.CLDR
305	supp *cldr.SupplementalData
306
307	// indices
308	locale      stringSet // common locales
309	lang        stringSet // canonical language ids (2 or 3 letter ISO codes) with data
310	langNoIndex stringSet // 3-letter ISO codes with no associated data
311	script      stringSet // 4-letter ISO codes
312	region      stringSet // 2-letter ISO or 3-digit UN M49 codes
313	variant     stringSet // 4-8-alphanumeric variant code.
314
315	// Region codes that are groups with their corresponding group IDs.
316	groups map[int]index
317
318	// langInfo
319	registry map[string]*ianaEntry
320}
321
322type index uint
323
324func newBuilder(w *gen.CodeWriter) *builder {
325	r := gen.OpenCLDRCoreZip()
326	defer r.Close()
327	d := &cldr.Decoder{}
328	data, err := d.DecodeZip(r)
329	failOnError(err)
330	b := builder{
331		w:    w,
332		hw:   io.MultiWriter(w, w.Hash),
333		data: data,
334		supp: data.Supplemental(),
335	}
336	b.parseRegistry()
337	return &b
338}
339
340func (b *builder) parseRegistry() {
341	r := gen.OpenIANAFile("assignments/language-subtag-registry")
342	defer r.Close()
343	b.registry = make(map[string]*ianaEntry)
344
345	scan := bufio.NewScanner(r)
346	scan.Split(bufio.ScanWords)
347	var record *ianaEntry
348	for more := scan.Scan(); more; {
349		key := scan.Text()
350		more = scan.Scan()
351		value := scan.Text()
352		switch key {
353		case "Type:":
354			record = &ianaEntry{typ: value}
355		case "Subtag:", "Tag:":
356			if s := strings.SplitN(value, "..", 2); len(s) > 1 {
357				for a := s[0]; a <= s[1]; a = inc(a) {
358					b.addToRegistry(a, record)
359				}
360			} else {
361				b.addToRegistry(value, record)
362			}
363		case "Suppress-Script:":
364			record.suppressScript = value
365		case "Added:":
366			record.added = value
367		case "Deprecated:":
368			record.deprecated = value
369		case "Macrolanguage:":
370			record.macro = value
371		case "Preferred-Value:":
372			record.preferred = value
373		case "Prefix:":
374			record.prefix = append(record.prefix, value)
375		case "Scope:":
376			record.scope = value
377		case "Description:":
378			buf := []byte(value)
379			for more = scan.Scan(); more; more = scan.Scan() {
380				b := scan.Bytes()
381				if b[0] == '%' || b[len(b)-1] == ':' {
382					break
383				}
384				buf = append(buf, ' ')
385				buf = append(buf, b...)
386			}
387			record.description = append(record.description, string(buf))
388			continue
389		default:
390			continue
391		}
392		more = scan.Scan()
393	}
394	if scan.Err() != nil {
395		log.Panic(scan.Err())
396	}
397}
398
399func (b *builder) addToRegistry(key string, entry *ianaEntry) {
400	if info, ok := b.registry[key]; ok {
401		if info.typ != "language" || entry.typ != "extlang" {
402			log.Fatalf("parseRegistry: tag %q already exists", key)
403		}
404	} else {
405		b.registry[key] = entry
406	}
407}
408
409var commentIndex = make(map[string]string)
410
411func init() {
412	for _, s := range comment {
413		key := strings.TrimSpace(strings.SplitN(s, " ", 2)[0])
414		commentIndex[key] = s
415	}
416}
417
418func (b *builder) comment(name string) {
419	if s := commentIndex[name]; len(s) > 0 {
420		b.w.WriteComment(s)
421	} else {
422		fmt.Fprintln(b.w)
423	}
424}
425
426func (b *builder) pf(f string, x ...interface{}) {
427	fmt.Fprintf(b.hw, f, x...)
428	fmt.Fprint(b.hw, "\n")
429}
430
431func (b *builder) p(x ...interface{}) {
432	fmt.Fprintln(b.hw, x...)
433}
434
435func (b *builder) addSize(s int) {
436	b.w.Size += s
437	b.pf("// Size: %d bytes", s)
438}
439
440func (b *builder) writeConst(name string, x interface{}) {
441	b.comment(name)
442	b.w.WriteConst(name, x)
443}
444
445// writeConsts computes f(v) for all v in values and writes the results
446// as constants named _v to a single constant block.
447func (b *builder) writeConsts(f func(string) int, values ...string) {
448	b.pf("const (")
449	for _, v := range values {
450		b.pf("\t_%s = %v", v, f(v))
451	}
452	b.pf(")")
453}
454
455// writeType writes the type of the given value, which must be a struct.
456func (b *builder) writeType(value interface{}) {
457	b.comment(reflect.TypeOf(value).Name())
458	b.w.WriteType(value)
459}
460
461func (b *builder) writeSlice(name string, ss interface{}) {
462	b.writeSliceAddSize(name, 0, ss)
463}
464
465func (b *builder) writeSliceAddSize(name string, extraSize int, ss interface{}) {
466	b.comment(name)
467	b.w.Size += extraSize
468	v := reflect.ValueOf(ss)
469	t := v.Type().Elem()
470	b.pf("// Size: %d bytes, %d elements", v.Len()*int(t.Size())+extraSize, v.Len())
471
472	fmt.Fprintf(b.w, "var %s = ", name)
473	b.w.WriteArray(ss)
474	b.p()
475}
476
// FromTo is one entry of a table written by writeSortedMap: the index of an
// item and the index of the item it maps to.
type FromTo struct {
	From uint16
	To   uint16
}
480
481func (b *builder) writeSortedMap(name string, ss *stringSet, index func(s string) uint16) {
482	ss.sortFunc(func(a, b string) bool {
483		return index(a) < index(b)
484	})
485	m := []FromTo{}
486	for _, s := range ss.s {
487		m = append(m, FromTo{index(s), index(ss.update[s])})
488	}
489	b.writeSlice(name, m)
490}
491
// base is the number of letters from 'a' to 'z'. It is the radix used by
// strToInt and intToStr.
const base = 'z' - 'a' + 1

// strToInt interprets s as a number written in radix base, most significant
// digit first, where 'a' is the digit 0. Bytes are not validated.
func strToInt(s string) uint {
	var v uint
	for _, c := range []byte(s) {
		v = v*base + uint(c-'a')
	}
	return v
}

// intToStr converts the given integer to the original ASCII string passed to
// strToInt. len(s) must match the number of characters obtained; digits that
// do not fit in s are dropped.
func intToStr(v uint, s []byte) {
	for n := len(s); n > 0; n-- {
		s[n-1] = byte(v%base) + 'a'
		v /= base
	}
}
511
512func (b *builder) writeBitVector(name string, ss []string) {
513	vec := make([]uint8, int(math.Ceil(math.Pow(base, float64(len(ss[0])))/8)))
514	for _, s := range ss {
515		v := strToInt(s)
516		vec[v/8] |= 1 << (v % 8)
517	}
518	b.writeSlice(name, vec)
519}
520
521// TODO: convert this type into a list or two-stage trie.
522func (b *builder) writeMapFunc(name string, m map[string]string, f func(string) uint16) {
523	b.comment(name)
524	v := reflect.ValueOf(m)
525	sz := v.Len() * (2 + int(v.Type().Key().Size()))
526	for _, k := range m {
527		sz += len(k)
528	}
529	b.addSize(sz)
530	keys := []string{}
531	b.pf(`var %s = map[string]uint16{`, name)
532	for k := range m {
533		keys = append(keys, k)
534	}
535	sort.Strings(keys)
536	for _, k := range keys {
537		b.pf("\t%q: %v,", k, f(m[k]))
538	}
539	b.p("}")
540}
541
542func (b *builder) writeMap(name string, m interface{}) {
543	b.comment(name)
544	v := reflect.ValueOf(m)
545	sz := v.Len() * (2 + int(v.Type().Key().Size()) + int(v.Type().Elem().Size()))
546	b.addSize(sz)
547	f := strings.FieldsFunc(fmt.Sprintf("%#v", m), func(r rune) bool {
548		return strings.IndexRune("{}, ", r) != -1
549	})
550	sort.Strings(f[1:])
551	b.pf(`var %s = %s{`, name, f[0])
552	for _, kv := range f[1:] {
553		b.pf("\t%s,", kv)
554	}
555	b.p("}")
556}
557
558func (b *builder) langIndex(s string) uint16 {
559	if s == "und" {
560		return 0
561	}
562	if i, ok := b.lang.find(s); ok {
563		return uint16(i)
564	}
565	return uint16(strToInt(s)) + uint16(len(b.lang.s))
566}
567
568// inc advances the string to its lexicographical successor.
569func inc(s string) string {
570	const maxTagLength = 4
571	var buf [maxTagLength]byte
572	intToStr(strToInt(strings.ToLower(s))+1, buf[:len(s)])
573	for i := 0; i < len(s); i++ {
574		if s[i] <= 'Z' {
575			buf[i] -= 'a' - 'A'
576		}
577	}
578	return string(buf[:len(s)])
579}
580
581func (b *builder) parseIndices() {
582	meta := b.supp.Metadata
583
584	for k, v := range b.registry {
585		var ss *stringSet
586		switch v.typ {
587		case "language":
588			if len(k) == 2 || v.suppressScript != "" || v.scope == "special" {
589				b.lang.add(k)
590				continue
591			} else {
592				ss = &b.langNoIndex
593			}
594		case "region":
595			ss = &b.region
596		case "script":
597			ss = &b.script
598		case "variant":
599			ss = &b.variant
600		default:
601			continue
602		}
603		ss.add(k)
604	}
605	// Include any language for which there is data.
606	for _, lang := range b.data.Locales() {
607		if x := b.data.RawLDML(lang); false ||
608			x.LocaleDisplayNames != nil ||
609			x.Characters != nil ||
610			x.Delimiters != nil ||
611			x.Measurement != nil ||
612			x.Dates != nil ||
613			x.Numbers != nil ||
614			x.Units != nil ||
615			x.ListPatterns != nil ||
616			x.Collations != nil ||
617			x.Segmentations != nil ||
618			x.Rbnf != nil ||
619			x.Annotations != nil ||
620			x.Metadata != nil {
621
622			from := strings.Split(lang, "_")
623			if lang := from[0]; lang != "root" {
624				b.lang.add(lang)
625			}
626		}
627	}
628	// Include locales for plural rules, which uses a different structure.
629	for _, plurals := range b.data.Supplemental().Plurals {
630		for _, rules := range plurals.PluralRules {
631			for _, lang := range strings.Split(rules.Locales, " ") {
632				if lang = strings.Split(lang, "_")[0]; lang != "root" {
633					b.lang.add(lang)
634				}
635			}
636		}
637	}
638	// Include languages in likely subtags.
639	for _, m := range b.supp.LikelySubtags.LikelySubtag {
640		from := strings.Split(m.From, "_")
641		b.lang.add(from[0])
642	}
643	// Include ISO-639 alpha-3 bibliographic entries.
644	for _, a := range meta.Alias.LanguageAlias {
645		if a.Reason == "bibliographic" {
646			b.langNoIndex.add(a.Type)
647		}
648	}
649	// Include regions in territoryAlias (not all are in the IANA registry!)
650	for _, reg := range b.supp.Metadata.Alias.TerritoryAlias {
651		if len(reg.Type) == 2 {
652			b.region.add(reg.Type)
653		}
654	}
655
656	for _, s := range b.lang.s {
657		if len(s) == 3 {
658			b.langNoIndex.remove(s)
659		}
660	}
661	b.writeConst("NumLanguages", len(b.lang.slice())+len(b.langNoIndex.slice()))
662	b.writeConst("NumScripts", len(b.script.slice()))
663	b.writeConst("NumRegions", len(b.region.slice()))
664
665	// Add dummy codes at the start of each list to represent "unspecified".
666	b.lang.add("---")
667	b.script.add("----")
668	b.region.add("---")
669
670	// common locales
671	b.locale.parse(meta.DefaultContent.Locales)
672}
673
// TODO: region inclusion data will probably not be used in future matchers.
675
676func (b *builder) computeRegionGroups() {
677	b.groups = make(map[int]index)
678
679	// Create group indices.
680	for i := 1; b.region.s[i][0] < 'A'; i++ { // Base M49 indices on regionID.
681		b.groups[i] = index(len(b.groups))
682	}
683	for _, g := range b.supp.TerritoryContainment.Group {
684		// Skip UN and EURO zone as they are flattening the containment
685		// relationship.
686		if g.Type == "EZ" || g.Type == "UN" {
687			continue
688		}
689		group := b.region.index(g.Type)
690		if _, ok := b.groups[group]; !ok {
691			b.groups[group] = index(len(b.groups))
692		}
693	}
694	if len(b.groups) > 64 {
695		log.Fatalf("only 64 groups supported, found %d", len(b.groups))
696	}
697	b.writeConst("nRegionGroups", len(b.groups))
698}
699
// langConsts lists the languages for which writeLanguage emits a constant
// (named after the code, with a leading underscore) holding the language ID.
var langConsts = []string{
	"af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de",
	"el", "en", "es", "et", "fa", "fi", "fil", "fr", "gu", "he",
	"hi", "hr", "hu", "hy", "id", "is", "it", "ja", "ka", "kk",
	"km", "kn", "ko", "ky", "lo", "lt", "lv", "mk", "ml", "mn",
	"mo", "mr", "ms", "mul", "my", "nb", "ne", "nl", "no", "pa",
	"pl", "pt", "ro", "ru", "sh", "si", "sk", "sl", "sq", "sr",
	"sv", "sw", "ta", "te", "th", "tl", "tn", "tr", "uk", "ur",
	"uz", "vi", "zh", "zu",

	// constants for grandfathered tags (if not already defined)
	"jbo", "ami", "bnn", "hak", "tlh", "lb", "nv", "pwn", "tao",
	"tay", "tsu", "nn", "sfb", "vgt", "sgg", "cmn", "nan", "hsn",
}
712
713// writeLanguage generates all tables needed for language canonicalization.
714func (b *builder) writeLanguage() {
715	meta := b.supp.Metadata
716
717	b.writeConst("nonCanonicalUnd", b.lang.index("und"))
718	b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...)
719	b.writeConst("langPrivateStart", b.langIndex("qaa"))
720	b.writeConst("langPrivateEnd", b.langIndex("qtz"))
721
722	// Get language codes that need to be mapped (overlong 3-letter codes,
723	// deprecated 2-letter codes, legacy and grandfathered tags.)
724	langAliasMap := stringSet{}
725	aliasTypeMap := map[string]AliasType{}
726
727	// altLangISO3 get the alternative ISO3 names that need to be mapped.
728	altLangISO3 := stringSet{}
729	// Add dummy start to avoid the use of index 0.
730	altLangISO3.add("---")
731	altLangISO3.updateLater("---", "aa")
732
733	lang := b.lang.clone()
734	for _, a := range meta.Alias.LanguageAlias {
735		if a.Replacement == "" {
736			a.Replacement = "und"
737		}
738		// TODO: support mapping to tags
739		repl := strings.SplitN(a.Replacement, "_", 2)[0]
740		if a.Reason == "overlong" {
741			if len(a.Replacement) == 2 && len(a.Type) == 3 {
742				lang.updateLater(a.Replacement, a.Type)
743			}
744		} else if len(a.Type) <= 3 {
745			switch a.Reason {
746			case "macrolanguage":
747				aliasTypeMap[a.Type] = Macro
748			case "deprecated":
749				// handled elsewhere
750				continue
751			case "bibliographic", "legacy":
752				if a.Type == "no" {
753					continue
754				}
755				aliasTypeMap[a.Type] = Legacy
756			default:
757				log.Fatalf("new %s alias: %s", a.Reason, a.Type)
758			}
759			langAliasMap.add(a.Type)
760			langAliasMap.updateLater(a.Type, repl)
761		}
762	}
763	// Manually add the mapping of "nb" (Norwegian) to its macro language.
764	// This can be removed if CLDR adopts this change.
765	langAliasMap.add("nb")
766	langAliasMap.updateLater("nb", "no")
767	aliasTypeMap["nb"] = Macro
768
769	for k, v := range b.registry {
770		// Also add deprecated values for 3-letter ISO codes, which CLDR omits.
771		if v.typ == "language" && v.deprecated != "" && v.preferred != "" {
772			langAliasMap.add(k)
773			langAliasMap.updateLater(k, v.preferred)
774			aliasTypeMap[k] = Deprecated
775		}
776	}
777	// Fix CLDR mappings.
778	lang.updateLater("tl", "tgl")
779	lang.updateLater("sh", "hbs")
780	lang.updateLater("mo", "mol")
781	lang.updateLater("no", "nor")
782	lang.updateLater("tw", "twi")
783	lang.updateLater("nb", "nob")
784	lang.updateLater("ak", "aka")
785	lang.updateLater("bh", "bih")
786
787	// Ensure that each 2-letter code is matched with a 3-letter code.
788	for _, v := range lang.s[1:] {
789		s, ok := lang.update[v]
790		if !ok {
791			if s, ok = lang.update[langAliasMap.update[v]]; !ok {
792				continue
793			}
794			lang.update[v] = s
795		}
796		if v[0] != s[0] {
797			altLangISO3.add(s)
798			altLangISO3.updateLater(s, v)
799		}
800	}
801
802	// Complete canonicalized language tags.
803	lang.freeze()
804	for i, v := range lang.s {
805		// We can avoid these manual entries by using the IANA registry directly.
806		// Seems easier to update the list manually, as changes are rare.
807		// The panic in this loop will trigger if we miss an entry.
808		add := ""
809		if s, ok := lang.update[v]; ok {
810			if s[0] == v[0] {
811				add = s[1:]
812			} else {
813				add = string([]byte{0, byte(altLangISO3.index(s))})
814			}
815		} else if len(v) == 3 {
816			add = "\x00"
817		} else {
818			log.Panicf("no data for long form of %q", v)
819		}
820		lang.s[i] += add
821	}
822	b.writeConst("lang", tag.Index(lang.join()))
823
824	b.writeConst("langNoIndexOffset", len(b.lang.s))
825
826	// space of all valid 3-letter language identifiers.
827	b.writeBitVector("langNoIndex", b.langNoIndex.slice())
828
829	altLangIndex := []uint16{}
830	for i, s := range altLangISO3.slice() {
831		altLangISO3.s[i] += string([]byte{byte(len(altLangIndex))})
832		if i > 0 {
833			idx := b.lang.index(altLangISO3.update[s])
834			altLangIndex = append(altLangIndex, uint16(idx))
835		}
836	}
837	b.writeConst("altLangISO3", tag.Index(altLangISO3.join()))
838	b.writeSlice("altLangIndex", altLangIndex)
839
840	b.writeSortedMap("AliasMap", &langAliasMap, b.langIndex)
841	types := make([]AliasType, len(langAliasMap.s))
842	for i, s := range langAliasMap.s {
843		types[i] = aliasTypeMap[s]
844	}
845	b.writeSlice("AliasTypes", types)
846}
847
// scriptConsts lists the scripts for which writeScript emits a constant
// (named after the code, with a leading underscore) holding the script ID.
var scriptConsts = []string{
	"Latn", "Hani", "Hans", "Hant", "Qaaa",
	"Qaai", "Qabx", "Zinh", "Zyyy", "Zzzz",
}
852
853func (b *builder) writeScript() {
854	b.writeConsts(b.script.index, scriptConsts...)
855	b.writeConst("script", tag.Index(b.script.join()))
856
857	supp := make([]uint8, len(b.lang.slice()))
858	for i, v := range b.lang.slice()[1:] {
859		if sc := b.registry[v].suppressScript; sc != "" {
860			supp[i+1] = uint8(b.script.index(sc))
861		}
862	}
863	b.writeSlice("suppressScript", supp)
864
865	// There is only one deprecated script in CLDR. This value is hard-coded.
866	// We check here if the code must be updated.
867	for _, a := range b.supp.Metadata.Alias.ScriptAlias {
868		if a.Type != "Qaai" {
869			log.Panicf("unexpected deprecated stript %q", a.Type)
870		}
871	}
872}
873
874func parseM49(s string) int16 {
875	if len(s) == 0 {
876		return 0
877	}
878	v, err := strconv.ParseUint(s, 10, 10)
879	failOnError(err)
880	return int16(v)
881}
882
// regionConsts lists the regions for which writeRegion emits a constant
// (named after the code, with a leading underscore) holding the region ID.
var regionConsts = []string{
	"001", "419",
	"BR", "CA", "ES", "GB", "MD", "PT", "UK", "US",
	"ZZ", "XA", "XC",
	"XK", // Unofficial tag for Kosovo.
}
887
888func (b *builder) writeRegion() {
889	b.writeConsts(b.region.index, regionConsts...)
890
891	isoOffset := b.region.index("AA")
892	m49map := make([]int16, len(b.region.slice()))
893	fromM49map := make(map[int16]int)
894	altRegionISO3 := ""
895	altRegionIDs := []uint16{}
896
897	b.writeConst("isoRegionOffset", isoOffset)
898
899	// 2-letter region lookup and mapping to numeric codes.
900	regionISO := b.region.clone()
901	regionISO.s = regionISO.s[isoOffset:]
902	regionISO.sorted = false
903
904	regionTypes := make([]byte, len(b.region.s))
905
906	// Is the region valid BCP 47?
907	for s, e := range b.registry {
908		if len(s) == 2 && s == strings.ToUpper(s) {
909			i := b.region.index(s)
910			for _, d := range e.description {
911				if strings.Contains(d, "Private use") {
912					regionTypes[i] = iso3166UserAssigned
913				}
914			}
915			regionTypes[i] |= bcp47Region
916		}
917	}
918
919	// Is the region a valid ccTLD?
920	r := gen.OpenIANAFile("domains/root/db")
921	defer r.Close()
922
923	buf, err := ioutil.ReadAll(r)
924	failOnError(err)
925	re := regexp.MustCompile(`"/domains/root/db/([a-z]{2}).html"`)
926	for _, m := range re.FindAllSubmatch(buf, -1) {
927		i := b.region.index(strings.ToUpper(string(m[1])))
928		regionTypes[i] |= ccTLD
929	}
930
931	b.writeSlice("regionTypes", regionTypes)
932
933	iso3Set := make(map[string]int)
934	update := func(iso2, iso3 string) {
935		i := regionISO.index(iso2)
936		if j, ok := iso3Set[iso3]; !ok && iso3[0] == iso2[0] {
937			regionISO.s[i] += iso3[1:]
938			iso3Set[iso3] = -1
939		} else {
940			if ok && j >= 0 {
941				regionISO.s[i] += string([]byte{0, byte(j)})
942			} else {
943				iso3Set[iso3] = len(altRegionISO3)
944				regionISO.s[i] += string([]byte{0, byte(len(altRegionISO3))})
945				altRegionISO3 += iso3
946				altRegionIDs = append(altRegionIDs, uint16(isoOffset+i))
947			}
948		}
949	}
950	for _, tc := range b.supp.CodeMappings.TerritoryCodes {
951		i := regionISO.index(tc.Type) + isoOffset
952		if d := m49map[i]; d != 0 {
953			log.Panicf("%s found as a duplicate UN.M49 code of %03d", tc.Numeric, d)
954		}
955		m49 := parseM49(tc.Numeric)
956		m49map[i] = m49
957		if r := fromM49map[m49]; r == 0 {
958			fromM49map[m49] = i
959		} else if r != i {
960			dep := b.registry[regionISO.s[r-isoOffset]].deprecated
961			if t := b.registry[tc.Type]; t != nil && dep != "" && (t.deprecated == "" || t.deprecated > dep) {
962				fromM49map[m49] = i
963			}
964		}
965	}
966	for _, ta := range b.supp.Metadata.Alias.TerritoryAlias {
967		if len(ta.Type) == 3 && ta.Type[0] <= '9' && len(ta.Replacement) == 2 {
968			from := parseM49(ta.Type)
969			if r := fromM49map[from]; r == 0 {
970				fromM49map[from] = regionISO.index(ta.Replacement) + isoOffset
971			}
972		}
973	}
974	for _, tc := range b.supp.CodeMappings.TerritoryCodes {
975		if len(tc.Alpha3) == 3 {
976			update(tc.Type, tc.Alpha3)
977		}
978	}
979	// This entries are not included in territoryCodes. Mostly 3-letter variants
980	// of deleted codes and an entry for QU.
981	for _, m := range []struct{ iso2, iso3 string }{
982		{"CT", "CTE"},
983		{"DY", "DHY"},
984		{"HV", "HVO"},
985		{"JT", "JTN"},
986		{"MI", "MID"},
987		{"NH", "NHB"},
988		{"NQ", "ATN"},
989		{"PC", "PCI"},
990		{"PU", "PUS"},
991		{"PZ", "PCZ"},
992		{"RH", "RHO"},
993		{"VD", "VDR"},
994		{"WK", "WAK"},
995		// These three-letter codes are used for others as well.
996		{"FQ", "ATF"},
997	} {
998		update(m.iso2, m.iso3)
999	}
1000	for i, s := range regionISO.s {
1001		if len(s) != 4 {
1002			regionISO.s[i] = s + "  "
1003		}
1004	}
1005	b.writeConst("regionISO", tag.Index(regionISO.join()))
1006	b.writeConst("altRegionISO3", altRegionISO3)
1007	b.writeSlice("altRegionIDs", altRegionIDs)
1008
1009	// Create list of deprecated regions.
1010	// TODO: consider inserting SF -> FI. Not included by CLDR, but is the only
1011	// Transitionally-reserved mapping not included.
1012	regionOldMap := stringSet{}
1013	// Include regions in territoryAlias (not all are in the IANA registry!)
1014	for _, reg := range b.supp.Metadata.Alias.TerritoryAlias {
1015		if len(reg.Type) == 2 && reg.Reason == "deprecated" && len(reg.Replacement) == 2 {
1016			regionOldMap.add(reg.Type)
1017			regionOldMap.updateLater(reg.Type, reg.Replacement)
1018			i, _ := regionISO.find(reg.Type)
1019			j, _ := regionISO.find(reg.Replacement)
1020			if k := m49map[i+isoOffset]; k == 0 {
1021				m49map[i+isoOffset] = m49map[j+isoOffset]
1022			}
1023		}
1024	}
1025	b.writeSortedMap("regionOldMap", &regionOldMap, func(s string) uint16 {
1026		return uint16(b.region.index(s))
1027	})
1028	// 3-digit region lookup, groupings.
1029	for i := 1; i < isoOffset; i++ {
1030		m := parseM49(b.region.s[i])
1031		m49map[i] = m
1032		fromM49map[m] = i
1033	}
1034	b.writeSlice("m49", m49map)
1035
1036	const (
1037		searchBits = 7
1038		regionBits = 9
1039	)
1040	if len(m49map) >= 1<<regionBits {
1041		log.Fatalf("Maximum number of regions exceeded: %d > %d", len(m49map), 1<<regionBits)
1042	}
1043	m49Index := [9]int16{}
1044	fromM49 := []uint16{}
1045	m49 := []int{}
1046	for k, _ := range fromM49map {
1047		m49 = append(m49, int(k))
1048	}
1049	sort.Ints(m49)
1050	for _, k := range m49[1:] {
1051		val := (k & (1<<searchBits - 1)) << regionBits
1052		fromM49 = append(fromM49, uint16(val|fromM49map[int16(k)]))
1053		m49Index[1:][k>>searchBits] = int16(len(fromM49))
1054	}
1055	b.writeSlice("m49Index", m49Index)
1056	b.writeSlice("fromM49", fromM49)
1057}
1058
// Space-separated lists of two-letter region codes. Judging by the constant
// names and the inline notes, they group codes by ISO 3166 reservation status
// (exceptional, transitional, deleted).
// NOTE(review): confirm against the code that consumes these lists, which is
// not in this part of the file.
const (
	// TODO: put these lists in regionTypes as user data? Could be used for
	// various optimizations and refinements and could be exposed in the API.
	iso3166Except = "AC CP DG EA EU FX IC SU TA UK"
	iso3166Trans  = "AN BU CS NT TP YU ZR" // SF is not in our set of Regions.
	// DY and RH are actually not deleted, but indeterminately reserved.
	iso3166DelCLDR = "CT DD DY FQ HV JT MI NH NQ PC PU PZ RH VD WK YD"
)
1067
// Single-bit flags: iso3166UserAssigned is 1, ccTLD is 2 and bcp47Region is 4,
// so the values can be combined with bitwise OR.
// NOTE(review): presumably these classify region codes; confirm against their
// users elsewhere in the file.
const (
	iso3166UserAssigned = 1 << iota
	ccTLD
	bcp47Region
)
1073
// find reports the position of the first element of list that equals s, or -1
// when no element matches (including when list is empty or nil).
func find(list []string, s string) int {
	for pos := 0; pos < len(list); pos++ {
		if list[pos] != s {
			continue
		}
		return pos
	}
	return -1
}
1082
1083// writeVariants generates per-variant information and creates a map from variant
1084// name to index value. We assign index values such that sorting multiple
1085// variants by index value will result in the correct order.
1086// There are two types of variants: specialized and general. Specialized variants
1087// are only applicable to certain language or language-script pairs. Generalized
1088// variants apply to any language. Generalized variants always sort after
1089// specialized variants.  We will therefore always assign a higher index value
1090// to a generalized variant than any other variant. Generalized variants are
1091// sorted alphabetically among themselves.
1092// Specialized variants may also sort after other specialized variants. Such
1093// variants will be ordered after any of the variants they may follow.
1094// We assume that if a variant x is followed by a variant y, then for any prefix
1095// p of x, p-x is a prefix of y. This allows us to order tags based on the
1096// maximum of the length of any of its prefixes.
1097// TODO: it is possible to define a set of Prefix values on variants such that
1098// a total order cannot be defined to the point that this algorithm breaks.
1099// In other words, we cannot guarantee the same order of variants for the
1100// future using the same algorithm or for non-compliant combinations of
1101// variants. For this reason, consider using simple alphabetic sorting
1102// of variants and ignore Prefix restrictions altogether.
1103func (b *builder) writeVariant() {
1104	generalized := stringSet{}
1105	specialized := stringSet{}
1106	specializedExtend := stringSet{}
1107	// Collate the variants by type and check assumptions.
1108	for _, v := range b.variant.slice() {
1109		e := b.registry[v]
1110		if len(e.prefix) == 0 {
1111			generalized.add(v)
1112			continue
1113		}
1114		c := strings.Split(e.prefix[0], "-")
1115		hasScriptOrRegion := false
1116		if len(c) > 1 {
1117			_, hasScriptOrRegion = b.script.find(c[1])
1118			if !hasScriptOrRegion {
1119				_, hasScriptOrRegion = b.region.find(c[1])
1120
1121			}
1122		}
1123		if len(c) == 1 || len(c) == 2 && hasScriptOrRegion {
1124			// Variant is preceded by a language.
1125			specialized.add(v)
1126			continue
1127		}
1128		// Variant is preceded by another variant.
1129		specializedExtend.add(v)
1130		prefix := c[0] + "-"
1131		if hasScriptOrRegion {
1132			prefix += c[1]
1133		}
1134		for _, p := range e.prefix {
1135			// Verify that the prefix minus the last element is a prefix of the
1136			// predecessor element.
1137			i := strings.LastIndex(p, "-")
1138			pred := b.registry[p[i+1:]]
1139			if find(pred.prefix, p[:i]) < 0 {
1140				log.Fatalf("prefix %q for variant %q not consistent with predecessor spec", p, v)
1141			}
1142			// The sorting used below does not work in the general case. It works
1143			// if we assume that variants that may be followed by others only have
1144			// prefixes of the same length. Verify this.
1145			count := strings.Count(p[:i], "-")
1146			for _, q := range pred.prefix {
1147				if c := strings.Count(q, "-"); c != count {
1148					log.Fatalf("variant %q preceding %q has a prefix %q of size %d; want %d", p[i+1:], v, q, c, count)
1149				}
1150			}
1151			if !strings.HasPrefix(p, prefix) {
1152				log.Fatalf("prefix %q of variant %q should start with %q", p, v, prefix)
1153			}
1154		}
1155	}
1156
1157	// Sort extended variants.
1158	a := specializedExtend.s
1159	less := func(v, w string) bool {
1160		// Sort by the maximum number of elements.
1161		maxCount := func(s string) (max int) {
1162			for _, p := range b.registry[s].prefix {
1163				if c := strings.Count(p, "-"); c > max {
1164					max = c
1165				}
1166			}
1167			return
1168		}
1169		if cv, cw := maxCount(v), maxCount(w); cv != cw {
1170			return cv < cw
1171		}
1172		// Sort by name as tie breaker.
1173		return v < w
1174	}
1175	sort.Sort(funcSorter{less, sort.StringSlice(a)})
1176	specializedExtend.frozen = true
1177
1178	// Create index from variant name to index.
1179	variantIndex := make(map[string]uint8)
1180	add := func(s []string) {
1181		for _, v := range s {
1182			variantIndex[v] = uint8(len(variantIndex))
1183		}
1184	}
1185	add(specialized.slice())
1186	add(specializedExtend.s)
1187	numSpecialized := len(variantIndex)
1188	add(generalized.slice())
1189	if n := len(variantIndex); n > 255 {
1190		log.Fatalf("maximum number of variants exceeded: was %d; want <= 255", n)
1191	}
1192	b.writeMap("variantIndex", variantIndex)
1193	b.writeConst("variantNumSpecialized", numSpecialized)
1194}
1195
// writeLanguageInfo is an empty placeholder: its body contains no statements,
// so calling it writes nothing.
func (b *builder) writeLanguageInfo() {
}
1198
// writeLikelyData writes tables that are used both for finding parent relations and for
// language matching.  Each entry contains additional bits to indicate the status of the
// data to know when it cannot be used for parent relations.
//
// The tables are derived from the entries in b.supp.LikelySubtags.LikelySubtag
// and are written in this order: likelyScript, likelyLang, likelyLangList,
// likelyRegion, likelyRegionList and likelyRegionGroup. The declaration of an
// element type is written before the first table that uses it.
func (b *builder) writeLikelyData() {
	// Bit values stored in the flags field of the generated entries:
	// isList marks an entry whose other fields describe a range in the
	// matching *List table; scriptInFrom and regionInFrom record that the
	// source tag of the mapping carried a script or a region, respectively.
	const (
		isList = 1 << iota
		scriptInFrom
		regionInFrom
	)
	type ( // generated types
		likelyScriptRegion struct {
			region uint16
			script uint8
			flags  uint8
		}
		likelyLangScript struct {
			lang   uint16
			script uint8
			flags  uint8
		}
		likelyLangRegion struct {
			lang   uint16
			region uint16
		}
		// likelyTag is used for getting likely tags for group regions, where
		// the likely region might be a region contained in the group.
		likelyTag struct {
			lang   uint16
			region uint16
			script uint8
		}
	)
	// The fixed-size tables are indexed by group id, language index, region
	// index and script index respectively; the two list tables grow as needed.
	var ( // generated variables
		likelyRegionGroup = make([]likelyTag, len(b.groups))
		likelyLang        = make([]likelyScriptRegion, len(b.lang.s))
		likelyRegion      = make([]likelyLangScript, len(b.region.s))
		likelyScript      = make([]likelyLangRegion, len(b.script.s))
		likelyLangList    = []likelyScriptRegion{}
		likelyRegionList  = []likelyLangScript{}
	)
	// fromTo holds one mapping, with both tags split into their subtags.
	type fromTo struct {
		from, to []string
	}
	// Mappings collected per language index and per (non-group) region index.
	langToOther := map[int][]fromTo{}
	regionToOther := map[int][]fromTo{}
	for _, m := range b.supp.LikelySubtags.LikelySubtag {
		from := strings.Split(m.From, "_")
		to := strings.Split(m.To, "_")
		// Sanity checks: the target must be a full lang_script_region tag,
		// and an expansion may not change a language or region that the
		// source tag already specified.
		if len(to) != 3 {
			log.Fatalf("invalid number of subtags in %q: found %d, want 3", m.To, len(to))
		}
		if len(from) > 3 {
			log.Fatalf("invalid number of subtags: found %d, want 1-3", len(from))
		}
		if from[0] != to[0] && from[0] != "und" {
			log.Fatalf("unexpected language change in expansion: %s -> %s", from, to)
		}
		if len(from) == 3 {
			if from[2] != to[2] {
				log.Fatalf("unexpected region change in expansion: %s -> %s", from, to)
			}
			if from[0] != "und" {
				log.Fatalf("unexpected fully specified from tag: %s -> %s", from, to)
			}
		}
		if len(from) == 1 || from[0] != "und" {
			// Keyed by language. A bare "und" source is filed under id 0.
			id := 0
			if from[0] != "und" {
				id = b.lang.index(from[0])
			}
			langToOther[id] = append(langToOther[id], fromTo{from, to})
		} else if len(from) == 2 && len(from[1]) == 4 {
			// "und" plus a 4-letter second subtag: keyed by script.
			sid := b.script.index(from[1])
			likelyScript[sid].lang = uint16(b.langIndex(to[0]))
			likelyScript[sid].region = uint16(b.region.index(to[2]))
		} else {
			// Remaining "und" sources are keyed by their last subtag, a
			// region. Regions present in b.groups go to the group table.
			r := b.region.index(from[len(from)-1])
			if id, ok := b.groups[r]; ok {
				if from[0] != "und" {
					log.Fatalf("region changed unexpectedly: %s -> %s", from, to)
				}
				likelyRegionGroup[id].lang = uint16(b.langIndex(to[0]))
				likelyRegionGroup[id].script = uint8(b.script.index(to[1]))
				likelyRegionGroup[id].region = uint16(b.region.index(to[2]))
			} else {
				regionToOther[r] = append(regionToOther[r], fromTo{from, to})
			}
		}
	}
	b.writeType(likelyLangRegion{})
	b.writeSlice("likelyScript", likelyScript)

	for id := range b.lang.s {
		list := langToOther[id]
		if len(list) == 1 {
			// A single mapping is stored inline.
			likelyLang[id].region = uint16(b.region.index(list[0].to[2]))
			likelyLang[id].script = uint8(b.script.index(list[0].to[1]))
		} else if len(list) > 1 {
			// Several mappings: the region field is reused as the start
			// offset into likelyLangList and the script field as the
			// number of entries.
			likelyLang[id].flags = isList
			likelyLang[id].region = uint16(len(likelyLangList))
			likelyLang[id].script = uint8(len(list))
			for _, x := range list {
				flags := uint8(0)
				if len(x.from) > 1 {
					// The second source subtag is taken to be a region
					// when it equals the target region, and a script
					// otherwise.
					if x.from[1] == x.to[2] {
						flags = regionInFrom
					} else {
						flags = scriptInFrom
					}
				}
				likelyLangList = append(likelyLangList, likelyScriptRegion{
					region: uint16(b.region.index(x.to[2])),
					script: uint8(b.script.index(x.to[1])),
					flags:  flags,
				})
			}
		}
	}
	// TODO: merge suppressScript data with this table.
	b.writeType(likelyScriptRegion{})
	b.writeSlice("likelyLang", likelyLang)
	b.writeSlice("likelyLangList", likelyLangList)

	for id := range b.region.s {
		list := regionToOther[id]
		if len(list) == 1 {
			// A single mapping is stored inline; a three-subtag source
			// means it carried a script.
			likelyRegion[id].lang = uint16(b.langIndex(list[0].to[0]))
			likelyRegion[id].script = uint8(b.script.index(list[0].to[1]))
			if len(list[0].from) > 2 {
				likelyRegion[id].flags = scriptInFrom
			}
		} else if len(list) > 1 {
			// Several mappings: the lang field is reused as the start
			// offset into likelyRegionList and the script field as the
			// number of entries.
			likelyRegion[id].flags = isList
			likelyRegion[id].lang = uint16(len(likelyRegionList))
			likelyRegion[id].script = uint8(len(list))
			for i, x := range list {
				// Only the first entry may have a two-subtag source; every
				// later entry must have exactly three subtags.
				if len(x.from) == 2 && i != 0 || i > 0 && len(x.from) != 3 {
					log.Fatalf("unspecified script must be first in list: %v at %d", x.from, i)
				}
				// From here on x is the generated entry; it shadows the
				// fromTo loop variable.
				x := likelyLangScript{
					lang:   uint16(b.langIndex(x.to[0])),
					script: uint8(b.script.index(x.to[1])),
				}
				// NOTE(review): this tests the first list entry rather than
				// the current one, so every entry of a list gets the same
				// flag value. Confirm whether list[i] was intended.
				if len(list[0].from) > 2 {
					x.flags = scriptInFrom
				}
				likelyRegionList = append(likelyRegionList, x)
			}
		}
	}
	b.writeType(likelyLangScript{})
	b.writeSlice("likelyRegion", likelyRegion)
	b.writeSlice("likelyRegionList", likelyRegionList)

	b.writeType(likelyTag{})
	b.writeSlice("likelyRegionGroup", likelyRegionGroup)
}
1356
// writeRegionInclusionData derives region containment tables from the groups in
// b.supp.TerritoryContainment.Group and writes four slices: regionContainment,
// regionInclusion, regionInclusionBits and regionInclusionNext.
//
// Sets of groups are represented as uint64 bit vectors in which bit i stands
// for the group with index i (the values of b.groups), so the encoding relies
// on every group index being below 64.
func (b *builder) writeRegionInclusionData() {
	var (
		// mm holds for each group the set of groups with a distance of 1.
		// It is keyed by region index: every member of a group records the
		// index of the containing group, and a group's own region
		// additionally records the indexes of the groups it directly contains.
		mm = make(map[int][]index)

		// containment holds for each group the groups it directly contains.
		// The transitive closure of containment is computed from it below.
		containment = make(map[index][]index)
	)
	for _, g := range b.supp.TerritoryContainment.Group {
		// Skip UN and EURO zone as they are flattening the containment
		// relationship.
		if g.Type == "EZ" || g.Type == "UN" {
			continue
		}
		group := b.region.index(g.Type)
		groupIdx := b.groups[group]
		for _, mem := range strings.Split(g.Contains, " ") {
			r := b.region.index(mem)
			mm[r] = append(mm[r], groupIdx)
			// Here g shadows the outer loop variable and is the group index
			// of a member that is itself a group.
			if g, ok := b.groups[r]; ok {
				mm[group] = append(mm[group], g)
				containment[groupIdx] = append(containment[groupIdx], g)
			}
		}
	}

	// regionContainment[g] has a bit set for g itself and for every group
	// that g contains directly or indirectly.
	regionContainment := make([]uint64, len(b.groups))
	for _, g := range b.groups {
		l := containment[g]

		// Compute the transitive closure of containment.
		// l grows while it is being traversed, so groups appended here are
		// expanded in turn. Duplicates are harmless because the list is only
		// folded into a bitmask.
		for i := 0; i < len(l); i++ {
			l = append(l, containment[l[i]]...)
		}

		// Compute the bitmask.
		regionContainment[g] = 1 << g
		for _, v := range l {
			regionContainment[g] |= 1 << v
		}
	}
	b.writeSlice("regionContainment", regionContainment)

	// regionInclusion maps each region index to the index of its bit vector;
	// bvs maps a bit vector to that index.
	regionInclusion := make([]uint8, len(b.region.s))
	bvs := make(map[uint64]index)
	// Make the first bitvector positions correspond with the groups.
	for r, i := range b.groups {
		// A group's vector holds its own bit plus the bits of its distance-1
		// groups and is stored at the group's own index.
		bv := uint64(1 << i)
		for _, g := range mm[r] {
			bv |= 1 << g
		}
		bvs[bv] = i
		regionInclusion[r] = uint8(bvs[bv])
	}
	// Non-group regions get the vector of the groups that contain them. New
	// vectors are numbered in order of first appearance, after the groups.
	for r := 1; r < len(b.region.s); r++ {
		if _, ok := b.groups[r]; !ok {
			bv := uint64(0)
			for _, g := range mm[r] {
				bv |= 1 << g
			}
			if bv == 0 {
				// Pick the world for unspecified regions.
				bv = 1 << b.groups[b.region.index("001")]
			}
			if _, ok := bvs[bv]; !ok {
				bvs[bv] = index(len(bvs))
			}
			regionInclusion[r] = uint8(bvs[bv])
		}
	}
	b.writeSlice("regionInclusion", regionInclusion)
	// Invert bvs so that the vectors can be looked up by index.
	regionInclusionBits := make([]uint64, len(bvs))
	for k, v := range bvs {
		regionInclusionBits[v] = uint64(k)
	}
	// Add bit vectors for increasingly large distances until a fixed point is reached.
	// regionInclusionNext[i] is the index of the vector obtained by adding to
	// vector i the vectors of all groups whose bit is set in it. Vectors
	// appended here are visited by later iterations of the same loop.
	regionInclusionNext := []uint8{}
	for i := 0; i < len(regionInclusionBits); i++ {
		bits := regionInclusionBits[i]
		next := bits
		// This i shadows the outer index and ranges over group bit
		// positions; regionInclusionBits[i] is the vector of group i because
		// group vectors were stored at their own index above.
		for i := uint(0); i < uint(len(b.groups)); i++ {
			if bits&(1<<i) != 0 {
				next |= regionInclusionBits[i]
			}
		}
		if _, ok := bvs[next]; !ok {
			bvs[next] = index(len(bvs))
			regionInclusionBits = append(regionInclusionBits, next)
		}
		regionInclusionNext = append(regionInclusionNext, uint8(bvs[next]))
	}
	b.writeSlice("regionInclusionBits", regionInclusionBits)
	b.writeSlice("regionInclusionNext", regionInclusionNext)
}
1452
// parentRel describes one parent override as built by writeParents: the locales
// with the given language whose region is listed in fromRegion get the parent
// identified by lang, script and toRegion. Its declaration is also written to
// the generated file through writeType.
type parentRel struct {
	// lang is the language index of the parent locale.
	lang uint16
	// script is the script index of the parent locale when the parent
	// specifies a script, and is left zero otherwise.
	script uint8
	// maxScript equals script when the parent specifies one; otherwise it is
	// the index of "Latn".
	maxScript uint8
	// toRegion is the region index of the parent locale.
	toRegion uint16
	// fromRegion lists the region indexes of the child locales.
	fromRegion []uint16
}
1460
1461func (b *builder) writeParents() {
1462	b.writeType(parentRel{})
1463
1464	parents := []parentRel{}
1465
1466	// Construct parent overrides.
1467	n := 0
1468	for _, p := range b.data.Supplemental().ParentLocales.ParentLocale {
1469		// Skipping non-standard scripts to root is implemented using addTags.
1470		if p.Parent == "root" {
1471			continue
1472		}
1473
1474		sub := strings.Split(p.Parent, "_")
1475		parent := parentRel{lang: b.langIndex(sub[0])}
1476		if len(sub) == 2 {
1477			// TODO: check that all undefined scripts are indeed Latn in these
1478			// cases.
1479			parent.maxScript = uint8(b.script.index("Latn"))
1480			parent.toRegion = uint16(b.region.index(sub[1]))
1481		} else {
1482			parent.script = uint8(b.script.index(sub[1]))
1483			parent.maxScript = parent.script
1484			parent.toRegion = uint16(b.region.index(sub[2]))
1485		}
1486		for _, c := range strings.Split(p.Locales, " ") {
1487			region := b.region.index(c[strings.LastIndex(c, "_")+1:])
1488			parent.fromRegion = append(parent.fromRegion, uint16(region))
1489		}
1490		parents = append(parents, parent)
1491		n += len(parent.fromRegion)
1492	}
1493	b.writeSliceAddSize("parents", n*2, parents)
1494}
1495
// main generates the language tables. After gen.Init and a gen.Repackage call
// for gen_common.go and common.go, it creates a code writer, writes the import
// of the internal tag package and the CLDR version, and then runs the builder
// steps in a fixed order: indices, the FromTo type, language, script, region,
// variant, region groups, likely data, region inclusion data and parents.
//
// The deferred WriteGoFile call emits tables.go when main returns. The builder
// steps report bad input through log.Fatalf, which exits the process without
// running deferred calls, so presumably no output file is written in that case.
//
// NOTE(review): the output name is hard-coded here; the -output flag
// (outputFile) declared at the top of this file is not consulted in this
// function. Confirm whether that is intended.
func main() {
	gen.Init()

	gen.Repackage("gen_common.go", "common.go", "language")

	w := gen.NewCodeWriter()
	defer w.WriteGoFile("tables.go", "language")

	fmt.Fprintln(w, `import "golang.org/x/text/internal/tag"`)

	b := newBuilder(w)
	gen.WriteCLDRVersion(w)

	b.parseIndices()
	b.writeType(FromTo{})
	b.writeLanguage()
	b.writeScript()
	b.writeRegion()
	b.writeVariant()
	// TODO: b.writeLocale()
	// Region groups must be computed before the likely and inclusion tables,
	// which read b.groups.
	b.computeRegionGroups()
	b.writeLikelyData()
	b.writeRegionInclusionData()
	b.writeParents()
}
1521