// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build ignore

// Language tag table generator.
// Data read from the web.

package main

import (
	"bufio"
	"flag"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"math"
	"reflect"
	"regexp"
	"sort"
	"strconv"
	"strings"

	"golang.org/x/text/internal/gen"
	"golang.org/x/text/internal/tag"
	"golang.org/x/text/unicode/cldr"
)

var (
	test = flag.Bool("test",
		false,
		"test existing tables; can be used to compare web data with package data.")
	outputFile = flag.String("output",
		"tables.go",
		"output file for generated tables")
)

var comment = []string{
	`
lang holds an alphabetically sorted list of ISO-639 language identifiers.
All entries are 4 bytes. The index of the identifier (divided by 4) is the langID.
For 2-byte language identifiers, the two successive bytes have the following meaning:
    - if the first letter of the 2- and 3-letter ISO codes are the same:
      the second and third letter of the 3-letter ISO code.
    - otherwise: a 0 and an index into altLangISO3 (the byte offset in that string divided by 4).
For 3-byte language identifiers the 4th byte is 0.`,
	`
langNoIndex is a bit vector of all 3-letter language codes that are not used as an index
in lookup tables. The language ids for these language codes are derived directly
from the letters and are not consecutive.`,
	`
altLangISO3 holds an alphabetically sorted list of 3-letter language code alternatives
to 2-letter language codes that cannot be derived using the method described above.
Each 3-letter code is followed by its 1-byte langID.`,
	`
altLangIndex is used to convert indexes in altLangISO3 to langIDs.`,
	`
langAliasMap maps langIDs to their suggested replacements.`,
	`
script is an alphabetically sorted list of ISO 15924 codes. The index
of the script in the string, divided by 4, is the internal scriptID.`,
	`
isoRegionOffset needs to be added to the index of regionISO to obtain the regionID
for 2-letter ISO codes. (The first isoRegionOffset regionIDs are reserved for
the UN.M49 codes used for groups.)`,
	`
regionISO holds a list of alphabetically sorted 2-letter ISO region codes.
Each 2-letter code is followed by two bytes with the following meaning:
    - [A-Z]{2}: the first letter of the 2-letter code plus these two
                letters form the 3-letter ISO code.
    - 0, n:     index into altRegionISO3.`,
	`
regionTypes defines the status of a region for various standards.`,
	`
m49 maps regionIDs to UN.M49 codes. The first isoRegionOffset entries are
codes indicating collections of regions.`,
	`
m49Index gives indexes into fromM49 based on the three most significant bits
of a 10-bit UN.M49 code. To search a UN.M49 code in fromM49, search in
   fromM49[m49Index[msb3(code)]:m49Index[msb3(code)+1]]
for an entry where the first 7 bits match the 7 lsb of the UN.M49 code.
The region code is stored in the 9 lsb of the indexed value.`,
	`
fromM49 contains entries to map UN.M49 codes to regions. See m49Index for details.`,
	`
altRegionISO3 holds a list of 3-letter region codes that cannot be
mapped to 2-letter codes using the default algorithm. This is a short list.`,
	`
altRegionIDs holds a list of regionIDs the positions of which match those
of the 3-letter ISO codes in altRegionISO3.`,
	`
variantNumSpecialized is the number of specialized variants in variants.`,
	`
suppressScript is an index from langID to the dominant script for that language,
if it exists.  If a script is given, it should be suppressed from the language tag.`,
	`
likelyLang is a lookup table, indexed by langID, for the most likely
scripts and regions given incomplete information. If more entries exist for a
given language, region and script are the index and size respectively
of the list in likelyLangList.`,
	`
likelyLangList holds lists of information associated with likelyLang.`,
	`
likelyRegion is a lookup table, indexed by regionID, for the most likely
languages and scripts given incomplete information. If more entries exist
for a given regionID, lang and script are the index and size respectively
of the list in likelyRegionList.
TODO: exclude containers and user-definable regions from the list.`,
	`
likelyRegionList holds lists of information associated with likelyRegion.`,
	`
likelyScript is a lookup table, indexed by scriptID, for the most likely
languages and regions given a script.`,
	`
matchLang holds pairs of langIDs of base languages that are typically
mutually intelligible. Each pair is associated with a confidence and
whether the intelligibility goes one or both ways.`,
	`
matchScript holds pairs of scriptIDs where readers of one script
can typically also read the other. Each is associated with a confidence.`,
	`
nRegionGroups is the number of region groups.`,
	`
regionInclusion maps region identifiers to sets of regions in regionInclusionBits,
where each set holds all groupings that are directly connected in a region
containment graph.`,
	`
regionInclusionBits is an array of bit vectors where every vector represents
a set of region groupings.  These sets are used to compute the distance
between two regions for the purpose of language matching.`,
	`
regionInclusionNext marks, for each entry in regionInclusionBits, the set of
all groups that are reachable from the groups set in the respective entry.`,
}
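
// A sketch of the encoding described in the lang comment above: a 2-letter
// code such as "no", whose 3-letter form "nor" shares its first letter, is
// stored as the 4-byte entry "no" + "or"; a code whose 3-letter form starts
// with a different letter (for example "sh" with "hbs") is stored as the
// 2-letter code followed by a 0 byte and an index into altLangISO3.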

// TODO: consider changing some of these structures to tries. This can reduce
// memory, but may increase the need for memory allocations. This could be
// mitigated if we can piggyback on language tags for common cases.

func failOnError(e error) {
	if e != nil {
		log.Panic(e)
	}
}

type setType int

const (
	Indexed setType = 1 + iota // all elements must be of same size
	Linear
)

type stringSet struct {
	s              []string
	sorted, frozen bool

	// We often need to update values after the creation of an index is completed.
	// We include a convenience map for keeping track of this.
	update map[string]string
	typ    setType // used for checking.
}

func (ss *stringSet) clone() stringSet {
	c := *ss
	c.s = append([]string(nil), c.s...)
	return c
}

func (ss *stringSet) setType(t setType) {
	if ss.typ != t && ss.typ != 0 {
		log.Panicf("type %d cannot be assigned as it was already %d", t, ss.typ)
	}
}

// parse parses a whitespace-separated string and initializes ss with its
// components.
func (ss *stringSet) parse(s string) {
	scan := bufio.NewScanner(strings.NewReader(s))
	scan.Split(bufio.ScanWords)
	for scan.Scan() {
		ss.add(scan.Text())
	}
}

func (ss *stringSet) assertChangeable() {
	if ss.frozen {
		log.Panic("attempt to modify a frozen stringSet")
	}
}

func (ss *stringSet) add(s string) {
	ss.assertChangeable()
	ss.s = append(ss.s, s)
	ss.sorted = ss.frozen
}

func (ss *stringSet) freeze() {
	ss.compact()
	ss.frozen = true
}

func (ss *stringSet) compact() {
	if ss.sorted {
		return
	}
	a := ss.s
	sort.Strings(a)
	k := 0
	for i := 1; i < len(a); i++ {
		if a[k] != a[i] {
			a[k+1] = a[i]
			k++
		}
	}
	ss.s = a[:k+1]
	ss.sorted = ss.frozen
}

type funcSorter struct {
	fn func(a, b string) bool
	sort.StringSlice
}

func (s funcSorter) Less(i, j int) bool {
	return s.fn(s.StringSlice[i], s.StringSlice[j])
}

func (ss *stringSet) sortFunc(f func(a, b string) bool) {
	ss.compact()
	sort.Sort(funcSorter{f, sort.StringSlice(ss.s)})
}

func (ss *stringSet) remove(s string) {
	ss.assertChangeable()
	if i, ok := ss.find(s); ok {
		copy(ss.s[i:], ss.s[i+1:])
		ss.s = ss.s[:len(ss.s)-1]
	}
}

func (ss *stringSet) replace(ol, nu string) {
	ss.s[ss.index(ol)] = nu
	ss.sorted = ss.frozen
}

func (ss *stringSet) index(s string) int {
	ss.setType(Indexed)
	i, ok := ss.find(s)
	if !ok {
		if i < len(ss.s) {
			log.Panicf("find: item %q is not in list. Closest match is %q.", s, ss.s[i])
		}
		log.Panicf("find: item %q is not in list", s)

	}
	return i
}

func (ss *stringSet) find(s string) (int, bool) {
	ss.compact()
	i := sort.SearchStrings(ss.s, s)
	return i, i != len(ss.s) && ss.s[i] == s
}

func (ss *stringSet) slice() []string {
	ss.compact()
	return ss.s
}

func (ss *stringSet) updateLater(v, key string) {
	if ss.update == nil {
		ss.update = map[string]string{}
	}
	ss.update[v] = key
}

// join joins the strings in the set and ensures that all entries are of the
// same length.
func (ss *stringSet) join() string {
	ss.setType(Indexed)
	n := len(ss.s[0])
	for _, s := range ss.s {
		if len(s) != n {
			log.Panicf("join: not all entries are of the same length: %q", s)
		}
	}
	ss.s = append(ss.s, strings.Repeat("\xff", n))
	return strings.Join(ss.s, "")
}
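
// For illustration: joining an Indexed set of 2-byte entries {"ab", "cd"}
// yields "abcd\xff\xff", i.e. the concatenated entries followed by one extra
// entry of 0xff bytes (a hypothetical example, not actual table data).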

// ianaEntry holds information for an entry in the IANA Language Subtag Repository.
// All types use the same entry.
// See http://tools.ietf.org/html/bcp47#section-5.1 for a description of the various
// fields.
type ianaEntry struct {
	typ            string
	description    []string
	scope          string
	added          string
	preferred      string
	deprecated     string
	suppressScript string
	macro          string
	prefix         []string
}

type builder struct {
	w    *gen.CodeWriter
	hw   io.Writer // MultiWriter for w and w.Hash
	data *cldr.CLDR
	supp *cldr.SupplementalData

	// indices
	locale      stringSet // common locales
	lang        stringSet // canonical language ids (2 or 3 letter ISO codes) with data
	langNoIndex stringSet // 3-letter ISO codes with no associated data
	script      stringSet // 4-letter ISO codes
	region      stringSet // 2-letter ISO or 3-digit UN M49 codes
	variant     stringSet // 4-8-alphanumeric variant code.

	// Region codes that are groups with their corresponding group IDs.
	groups map[int]index

	// langInfo
	registry map[string]*ianaEntry
}

type index uint

func newBuilder(w *gen.CodeWriter) *builder {
	r := gen.OpenCLDRCoreZip()
	defer r.Close()
	d := &cldr.Decoder{}
	data, err := d.DecodeZip(r)
	failOnError(err)
	b := builder{
		w:    w,
		hw:   io.MultiWriter(w, w.Hash),
		data: data,
		supp: data.Supplemental(),
	}
	b.parseRegistry()
	return &b
}

func (b *builder) parseRegistry() {
	r := gen.OpenIANAFile("assignments/language-subtag-registry")
	defer r.Close()
	b.registry = make(map[string]*ianaEntry)

	scan := bufio.NewScanner(r)
	scan.Split(bufio.ScanWords)
	var record *ianaEntry
	for more := scan.Scan(); more; {
		key := scan.Text()
		more = scan.Scan()
		value := scan.Text()
		switch key {
		case "Type:":
			record = &ianaEntry{typ: value}
		case "Subtag:", "Tag:":
			if s := strings.SplitN(value, "..", 2); len(s) > 1 {
				for a := s[0]; a <= s[1]; a = inc(a) {
					b.addToRegistry(a, record)
				}
			} else {
				b.addToRegistry(value, record)
			}
		case "Suppress-Script:":
			record.suppressScript = value
		case "Added:":
			record.added = value
		case "Deprecated:":
			record.deprecated = value
		case "Macrolanguage:":
			record.macro = value
		case "Preferred-Value:":
			record.preferred = value
		case "Prefix:":
			record.prefix = append(record.prefix, value)
		case "Scope:":
			record.scope = value
		case "Description:":
			buf := []byte(value)
			for more = scan.Scan(); more; more = scan.Scan() {
				b := scan.Bytes()
				if b[0] == '%' || b[len(b)-1] == ':' {
					break
				}
				buf = append(buf, ' ')
				buf = append(buf, b...)
			}
			record.description = append(record.description, string(buf))
			continue
		default:
			continue
		}
		more = scan.Scan()
	}
	if scan.Err() != nil {
		log.Panic(scan.Err())
	}
}

func (b *builder) addToRegistry(key string, entry *ianaEntry) {
	if info, ok := b.registry[key]; ok {
		if info.typ != "language" || entry.typ != "extlang" {
			log.Fatalf("parseRegistry: tag %q already exists", key)
		}
	} else {
		b.registry[key] = entry
	}
}

var commentIndex = make(map[string]string)

func init() {
	for _, s := range comment {
		key := strings.TrimSpace(strings.SplitN(s, " ", 2)[0])
		commentIndex[key] = s
	}
}

func (b *builder) comment(name string) {
	if s := commentIndex[name]; len(s) > 0 {
		b.w.WriteComment(s)
	} else {
		fmt.Fprintln(b.w)
	}
}

func (b *builder) pf(f string, x ...interface{}) {
	fmt.Fprintf(b.hw, f, x...)
	fmt.Fprint(b.hw, "\n")
}

func (b *builder) p(x ...interface{}) {
	fmt.Fprintln(b.hw, x...)
}

func (b *builder) addSize(s int) {
	b.w.Size += s
	b.pf("// Size: %d bytes", s)
}

func (b *builder) writeConst(name string, x interface{}) {
	b.comment(name)
	b.w.WriteConst(name, x)
}

// writeConsts computes f(v) for all v in values and writes the results
// as constants named _v to a single constant block.
func (b *builder) writeConsts(f func(string) int, values ...string) {
	b.pf("const (")
	for _, v := range values {
		b.pf("\t_%s = %v", v, f(v))
	}
	b.pf(")")
}

// writeType writes the type of the given value, which must be a struct.
func (b *builder) writeType(value interface{}) {
	b.comment(reflect.TypeOf(value).Name())
	b.w.WriteType(value)
}

func (b *builder) writeSlice(name string, ss interface{}) {
	b.writeSliceAddSize(name, 0, ss)
}

func (b *builder) writeSliceAddSize(name string, extraSize int, ss interface{}) {
	b.comment(name)
	b.w.Size += extraSize
	v := reflect.ValueOf(ss)
	t := v.Type().Elem()
	b.pf("// Size: %d bytes, %d elements", v.Len()*int(t.Size())+extraSize, v.Len())

	fmt.Fprintf(b.w, "var %s = ", name)
	b.w.WriteArray(ss)
	b.p()
}

type fromTo struct {
	from, to uint16
}

func (b *builder) writeSortedMap(name string, ss *stringSet, index func(s string) uint16) {
	ss.sortFunc(func(a, b string) bool {
		return index(a) < index(b)
	})
	m := []fromTo{}
	for _, s := range ss.s {
		m = append(m, fromTo{index(s), index(ss.update[s])})
	}
	b.writeSlice(name, m)
}

const base = 'z' - 'a' + 1

func strToInt(s string) uint {
	v := uint(0)
	for i := 0; i < len(s); i++ {
		v *= base
		v += uint(s[i] - 'a')
	}
	return v
}

// intToStr converts the given integer back to the ASCII string originally
// passed to strToInt. len(s) must match the number of characters obtained.
func intToStr(v uint, s []byte) {
	for i := len(s) - 1; i >= 0; i-- {
		s[i] = byte(v%base) + 'a'
		v /= base
	}
}
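
// A quick illustration (not used by the generator itself): strToInt treats a
// lowercase ASCII string as a base-26 number, so strToInt("a") == 0,
// strToInt("b") == 1 and strToInt("ba") == 26. intToStr is its inverse for a
// fixed length:
//
//	buf := make([]byte, 2)
//	intToStr(26, buf) // buf now holds "ba"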

func (b *builder) writeBitVector(name string, ss []string) {
	vec := make([]uint8, int(math.Ceil(math.Pow(base, float64(len(ss[0])))/8)))
	for _, s := range ss {
		v := strToInt(s)
		vec[v/8] |= 1 << (v % 8)
	}
	b.writeSlice(name, vec)
}
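
// Size note derived from writeBitVector above: the vector spans the full space
// of possible codes, so for 3-letter codes that is 26^3 = 17576 bits, i.e.
// 2197 bytes (assuming all entries in ss have length 3).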

// TODO: convert this type into a list or two-stage trie.
func (b *builder) writeMapFunc(name string, m map[string]string, f func(string) uint16) {
	b.comment(name)
	v := reflect.ValueOf(m)
	sz := v.Len() * (2 + int(v.Type().Key().Size()))
	for _, k := range m {
		sz += len(k)
	}
	b.addSize(sz)
	keys := []string{}
	b.pf(`var %s = map[string]uint16{`, name)
	for k := range m {
		keys = append(keys, k)
	}
	sort.Strings(keys)
	for _, k := range keys {
		b.pf("\t%q: %v,", k, f(m[k]))
	}
	b.p("}")
}

func (b *builder) writeMap(name string, m interface{}) {
	b.comment(name)
	v := reflect.ValueOf(m)
	sz := v.Len() * (2 + int(v.Type().Key().Size()) + int(v.Type().Elem().Size()))
	b.addSize(sz)
	f := strings.FieldsFunc(fmt.Sprintf("%#v", m), func(r rune) bool {
		return strings.IndexRune("{}, ", r) != -1
	})
	sort.Strings(f[1:])
	b.pf(`var %s = %s{`, name, f[0])
	for _, kv := range f[1:] {
		b.pf("\t%s,", kv)
	}
	b.p("}")
}

func (b *builder) langIndex(s string) uint16 {
	if s == "und" {
		return 0
	}
	if i, ok := b.lang.find(s); ok {
		return uint16(i)
	}
	return uint16(strToInt(s)) + uint16(len(b.lang.s))
}
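
// A note on the compact language ID scheme implemented above: "und" maps to 0,
// codes present in b.lang map to their index in that set, and any remaining
// (3-letter) code is mapped to its base-26 value offset by len(b.lang.s),
// which is also written out as langNoIndexOffset.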

// inc advances the string to its lexicographical successor.
func inc(s string) string {
	const maxTagLength = 4
	var buf [maxTagLength]byte
	intToStr(strToInt(strings.ToLower(s))+1, buf[:len(s)])
	for i := 0; i < len(s); i++ {
		if s[i] <= 'Z' {
			buf[i] -= 'a' - 'A'
		}
	}
	return string(buf[:len(s)])
}
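
// For example, inc("az") == "ba", and since the original case is restored per
// position, inc("BG") == "BH". (Illustrative values, not taken from the data.)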

func (b *builder) parseIndices() {
	meta := b.supp.Metadata

	for k, v := range b.registry {
		var ss *stringSet
		switch v.typ {
		case "language":
			if len(k) == 2 || v.suppressScript != "" || v.scope == "special" {
				b.lang.add(k)
				continue
			} else {
				ss = &b.langNoIndex
			}
		case "region":
			ss = &b.region
		case "script":
			ss = &b.script
		case "variant":
			ss = &b.variant
		default:
			continue
		}
		ss.add(k)
	}
	// Include any language for which there is data.
	for _, lang := range b.data.Locales() {
		if x := b.data.RawLDML(lang); false ||
			x.LocaleDisplayNames != nil ||
			x.Characters != nil ||
			x.Delimiters != nil ||
			x.Measurement != nil ||
			x.Dates != nil ||
			x.Numbers != nil ||
			x.Units != nil ||
			x.ListPatterns != nil ||
			x.Collations != nil ||
			x.Segmentations != nil ||
			x.Rbnf != nil ||
			x.Annotations != nil ||
			x.Metadata != nil {

			from := strings.Split(lang, "_")
			if lang := from[0]; lang != "root" {
				b.lang.add(lang)
			}
		}
	}
	// Include locales for plural rules, which use a different structure.
	for _, plurals := range b.data.Supplemental().Plurals {
		for _, rules := range plurals.PluralRules {
			for _, lang := range strings.Split(rules.Locales, " ") {
				if lang = strings.Split(lang, "_")[0]; lang != "root" {
					b.lang.add(lang)
				}
			}
		}
	}
	// Include languages in likely subtags.
	for _, m := range b.supp.LikelySubtags.LikelySubtag {
		from := strings.Split(m.From, "_")
		b.lang.add(from[0])
	}
	// Include ISO-639 alpha-3 bibliographic entries.
	for _, a := range meta.Alias.LanguageAlias {
		if a.Reason == "bibliographic" {
			b.langNoIndex.add(a.Type)
		}
	}
	// Include regions in territoryAlias (not all are in the IANA registry!)
	for _, reg := range b.supp.Metadata.Alias.TerritoryAlias {
		if len(reg.Type) == 2 {
			b.region.add(reg.Type)
		}
	}

	for _, s := range b.lang.s {
		if len(s) == 3 {
			b.langNoIndex.remove(s)
		}
	}
	b.writeConst("numLanguages", len(b.lang.slice())+len(b.langNoIndex.slice()))
	b.writeConst("numScripts", len(b.script.slice()))
	b.writeConst("numRegions", len(b.region.slice()))

	// Add dummy codes at the start of each list to represent "unspecified".
	b.lang.add("---")
	b.script.add("----")
	b.region.add("---")

	// common locales
	b.locale.parse(meta.DefaultContent.Locales)
}

// TODO: region inclusion data will probably not be used in future matchers.

func (b *builder) computeRegionGroups() {
	b.groups = make(map[int]index)

	// Create group indices.
	for i := 1; b.region.s[i][0] < 'A'; i++ { // Base M49 indices on regionID.
		b.groups[i] = index(len(b.groups))
	}
	for _, g := range b.supp.TerritoryContainment.Group {
		// Skip UN and EURO zone as they are flattening the containment
		// relationship.
		if g.Type == "EZ" || g.Type == "UN" {
			continue
		}
		group := b.region.index(g.Type)
		if _, ok := b.groups[group]; !ok {
			b.groups[group] = index(len(b.groups))
		}
	}
	if len(b.groups) > 64 {
		log.Fatalf("only 64 groups supported, found %d", len(b.groups))
	}
	b.writeConst("nRegionGroups", len(b.groups))
}

var langConsts = []string{
	"af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
	"et", "fa", "fi", "fil", "fr", "gu", "he", "hi", "hr", "hu", "hy", "id", "is",
	"it", "ja", "ka", "kk", "km", "kn", "ko", "ky", "lo", "lt", "lv", "mk", "ml",
	"mn", "mo", "mr", "ms", "mul", "my", "nb", "ne", "nl", "no", "pa", "pl", "pt",
	"ro", "ru", "sh", "si", "sk", "sl", "sq", "sr", "sv", "sw", "ta", "te", "th",
	"tl", "tn", "tr", "uk", "ur", "uz", "vi", "zh", "zu",

	// constants for grandfathered tags (if not already defined)
	"jbo", "ami", "bnn", "hak", "tlh", "lb", "nv", "pwn", "tao", "tay", "tsu",
	"nn", "sfb", "vgt", "sgg", "cmn", "nan", "hsn",
}

// writeLanguage generates all tables needed for language canonicalization.
func (b *builder) writeLanguage() {
	meta := b.supp.Metadata

	b.writeConst("nonCanonicalUnd", b.lang.index("und"))
	b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...)
	b.writeConst("langPrivateStart", b.langIndex("qaa"))
	b.writeConst("langPrivateEnd", b.langIndex("qtz"))

	// Get language codes that need to be mapped (overlong 3-letter codes,
	// deprecated 2-letter codes, legacy and grandfathered tags.)
	langAliasMap := stringSet{}
	aliasTypeMap := map[string]langAliasType{}

	// altLangISO3 collects the alternative ISO3 names that need to be mapped.
	altLangISO3 := stringSet{}
	// Add dummy start to avoid the use of index 0.
	altLangISO3.add("---")
	altLangISO3.updateLater("---", "aa")

	lang := b.lang.clone()
	for _, a := range meta.Alias.LanguageAlias {
		if a.Replacement == "" {
			a.Replacement = "und"
		}
		// TODO: support mapping to tags
		repl := strings.SplitN(a.Replacement, "_", 2)[0]
		if a.Reason == "overlong" {
			if len(a.Replacement) == 2 && len(a.Type) == 3 {
				lang.updateLater(a.Replacement, a.Type)
			}
		} else if len(a.Type) <= 3 {
			switch a.Reason {
			case "macrolanguage":
				aliasTypeMap[a.Type] = langMacro
			case "deprecated":
				// handled elsewhere
				continue
			case "bibliographic", "legacy":
				if a.Type == "no" {
					continue
				}
				aliasTypeMap[a.Type] = langLegacy
			default:
				log.Fatalf("new %s alias: %s", a.Reason, a.Type)
			}
			langAliasMap.add(a.Type)
			langAliasMap.updateLater(a.Type, repl)
		}
	}
	// Manually add the mapping of "nb" (Norwegian) to its macro language.
	// This can be removed if CLDR adopts this change.
	langAliasMap.add("nb")
	langAliasMap.updateLater("nb", "no")
	aliasTypeMap["nb"] = langMacro

	for k, v := range b.registry {
		// Also add deprecated values for 3-letter ISO codes, which CLDR omits.
		if v.typ == "language" && v.deprecated != "" && v.preferred != "" {
			langAliasMap.add(k)
			langAliasMap.updateLater(k, v.preferred)
			aliasTypeMap[k] = langDeprecated
		}
	}
	// Fix CLDR mappings.
	lang.updateLater("tl", "tgl")
	lang.updateLater("sh", "hbs")
	lang.updateLater("mo", "mol")
	lang.updateLater("no", "nor")
	lang.updateLater("tw", "twi")
	lang.updateLater("nb", "nob")
	lang.updateLater("ak", "aka")
	lang.updateLater("bh", "bih")

	// Ensure that each 2-letter code is matched with a 3-letter code.
	for _, v := range lang.s[1:] {
		s, ok := lang.update[v]
		if !ok {
			if s, ok = lang.update[langAliasMap.update[v]]; !ok {
				continue
			}
			lang.update[v] = s
		}
		if v[0] != s[0] {
			altLangISO3.add(s)
			altLangISO3.updateLater(s, v)
		}
	}

	// Complete canonicalized language tags.
	lang.freeze()
	for i, v := range lang.s {
		// We can avoid these manual entries by using the IANA registry directly.
		// Seems easier to update the list manually, as changes are rare.
		// The panic in this loop will trigger if we miss an entry.
		add := ""
		if s, ok := lang.update[v]; ok {
			if s[0] == v[0] {
				add = s[1:]
			} else {
				add = string([]byte{0, byte(altLangISO3.index(s))})
			}
		} else if len(v) == 3 {
			add = "\x00"
		} else {
			log.Panicf("no data for long form of %q", v)
		}
		lang.s[i] += add
	}
	b.writeConst("lang", tag.Index(lang.join()))

	b.writeConst("langNoIndexOffset", len(b.lang.s))

	// The bit vector spans the space of all valid 3-letter language identifiers.
	b.writeBitVector("langNoIndex", b.langNoIndex.slice())

	altLangIndex := []uint16{}
	for i, s := range altLangISO3.slice() {
		altLangISO3.s[i] += string([]byte{byte(len(altLangIndex))})
		if i > 0 {
			idx := b.lang.index(altLangISO3.update[s])
			altLangIndex = append(altLangIndex, uint16(idx))
		}
	}
	b.writeConst("altLangISO3", tag.Index(altLangISO3.join()))
	b.writeSlice("altLangIndex", altLangIndex)

	b.writeSortedMap("langAliasMap", &langAliasMap, b.langIndex)
	types := make([]langAliasType, len(langAliasMap.s))
	for i, s := range langAliasMap.s {
		types[i] = aliasTypeMap[s]
	}
	b.writeSlice("langAliasTypes", types)
}

var scriptConsts = []string{
	"Latn", "Hani", "Hans", "Hant", "Qaaa", "Qaai", "Qabx", "Zinh", "Zyyy",
	"Zzzz",
}

func (b *builder) writeScript() {
	b.writeConsts(b.script.index, scriptConsts...)
	b.writeConst("script", tag.Index(b.script.join()))

	supp := make([]uint8, len(b.lang.slice()))
	for i, v := range b.lang.slice()[1:] {
		if sc := b.registry[v].suppressScript; sc != "" {
			supp[i+1] = uint8(b.script.index(sc))
		}
	}
	b.writeSlice("suppressScript", supp)

	// There is only one deprecated script in CLDR. This value is hard-coded.
	// We check here if the code must be updated.
	for _, a := range b.supp.Metadata.Alias.ScriptAlias {
		if a.Type != "Qaai" {
			log.Panicf("unexpected deprecated script %q", a.Type)
		}
	}
}

func parseM49(s string) int16 {
	if len(s) == 0 {
		return 0
	}
	v, err := strconv.ParseUint(s, 10, 10)
	failOnError(err)
	return int16(v)
}

var regionConsts = []string{
	"001", "419", "BR", "CA", "ES", "GB", "MD", "PT", "UK", "US",
	"ZZ", "XA", "XC", "XK", // Unofficial tag for Kosovo.
}

func (b *builder) writeRegion() {
	b.writeConsts(b.region.index, regionConsts...)

	isoOffset := b.region.index("AA")
	m49map := make([]int16, len(b.region.slice()))
	fromM49map := make(map[int16]int)
	altRegionISO3 := ""
	altRegionIDs := []uint16{}

	b.writeConst("isoRegionOffset", isoOffset)

	// 2-letter region lookup and mapping to numeric codes.
	regionISO := b.region.clone()
	regionISO.s = regionISO.s[isoOffset:]
	regionISO.sorted = false

	regionTypes := make([]byte, len(b.region.s))

	// Is the region valid BCP 47?
	for s, e := range b.registry {
		if len(s) == 2 && s == strings.ToUpper(s) {
			i := b.region.index(s)
			for _, d := range e.description {
				if strings.Contains(d, "Private use") {
					regionTypes[i] = iso3166UserAssigned
				}
			}
			regionTypes[i] |= bcp47Region
		}
	}

	// Is the region a valid ccTLD?
	r := gen.OpenIANAFile("domains/root/db")
	defer r.Close()

	buf, err := ioutil.ReadAll(r)
	failOnError(err)
	re := regexp.MustCompile(`"/domains/root/db/([a-z]{2}).html"`)
	for _, m := range re.FindAllSubmatch(buf, -1) {
		i := b.region.index(strings.ToUpper(string(m[1])))
		regionTypes[i] |= ccTLD
	}

	b.writeSlice("regionTypes", regionTypes)

	iso3Set := make(map[string]int)
	update := func(iso2, iso3 string) {
		i := regionISO.index(iso2)
		if j, ok := iso3Set[iso3]; !ok && iso3[0] == iso2[0] {
			regionISO.s[i] += iso3[1:]
			iso3Set[iso3] = -1
		} else {
			if ok && j >= 0 {
				regionISO.s[i] += string([]byte{0, byte(j)})
			} else {
				iso3Set[iso3] = len(altRegionISO3)
				regionISO.s[i] += string([]byte{0, byte(len(altRegionISO3))})
				altRegionISO3 += iso3
				altRegionIDs = append(altRegionIDs, uint16(isoOffset+i))
			}
		}
	}
	for _, tc := range b.supp.CodeMappings.TerritoryCodes {
		i := regionISO.index(tc.Type) + isoOffset
		if d := m49map[i]; d != 0 {
			log.Panicf("%s found as a duplicate UN.M49 code of %03d", tc.Numeric, d)
		}
		m49 := parseM49(tc.Numeric)
		m49map[i] = m49
		if r := fromM49map[m49]; r == 0 {
			fromM49map[m49] = i
		} else if r != i {
			dep := b.registry[regionISO.s[r-isoOffset]].deprecated
			if t := b.registry[tc.Type]; t != nil && dep != "" && (t.deprecated == "" || t.deprecated > dep) {
				fromM49map[m49] = i
			}
		}
	}
	for _, ta := range b.supp.Metadata.Alias.TerritoryAlias {
		if len(ta.Type) == 3 && ta.Type[0] <= '9' && len(ta.Replacement) == 2 {
			from := parseM49(ta.Type)
			if r := fromM49map[from]; r == 0 {
				fromM49map[from] = regionISO.index(ta.Replacement) + isoOffset
			}
		}
	}
	for _, tc := range b.supp.CodeMappings.TerritoryCodes {
		if len(tc.Alpha3) == 3 {
			update(tc.Type, tc.Alpha3)
		}
	}
	// These entries are not included in territoryCodes. Mostly 3-letter variants
	// of deleted codes and an entry for QU.
	for _, m := range []struct{ iso2, iso3 string }{
		{"CT", "CTE"},
		{"DY", "DHY"},
		{"HV", "HVO"},
		{"JT", "JTN"},
		{"MI", "MID"},
		{"NH", "NHB"},
		{"NQ", "ATN"},
		{"PC", "PCI"},
		{"PU", "PUS"},
		{"PZ", "PCZ"},
		{"RH", "RHO"},
		{"VD", "VDR"},
		{"WK", "WAK"},
		// These three-letter codes are used for others as well.
		{"FQ", "ATF"},
	} {
		update(m.iso2, m.iso3)
	}
	for i, s := range regionISO.s {
		if len(s) != 4 {
			regionISO.s[i] = s + "  "
		}
	}
	b.writeConst("regionISO", tag.Index(regionISO.join()))
	b.writeConst("altRegionISO3", altRegionISO3)
	b.writeSlice("altRegionIDs", altRegionIDs)

	// Create list of deprecated regions.
	// TODO: consider inserting SF -> FI. Not included by CLDR, but is the only
	// Transitionally-reserved mapping not included.
	regionOldMap := stringSet{}
	// Include regions in territoryAlias (not all are in the IANA registry!)
	for _, reg := range b.supp.Metadata.Alias.TerritoryAlias {
		if len(reg.Type) == 2 && reg.Reason == "deprecated" && len(reg.Replacement) == 2 {
			regionOldMap.add(reg.Type)
			regionOldMap.updateLater(reg.Type, reg.Replacement)
			i, _ := regionISO.find(reg.Type)
			j, _ := regionISO.find(reg.Replacement)
			if k := m49map[i+isoOffset]; k == 0 {
				m49map[i+isoOffset] = m49map[j+isoOffset]
			}
		}
	}
	b.writeSortedMap("regionOldMap", &regionOldMap, func(s string) uint16 {
		return uint16(b.region.index(s))
	})
	// 3-digit region lookup, groupings.
	for i := 1; i < isoOffset; i++ {
		m := parseM49(b.region.s[i])
		m49map[i] = m
		fromM49map[m] = i
	}
	b.writeSlice("m49", m49map)

	const (
		searchBits = 7
		regionBits = 9
	)
	if len(m49map) >= 1<<regionBits {
		log.Fatalf("Maximum number of regions exceeded: %d > %d", len(m49map), 1<<regionBits)
	}
	m49Index := [9]int16{}
	fromM49 := []uint16{}
	m49 := []int{}
	for k := range fromM49map {
		m49 = append(m49, int(k))
	}
	sort.Ints(m49)
	for _, k := range m49[1:] {
		val := (k & (1<<searchBits - 1)) << regionBits
		fromM49 = append(fromM49, uint16(val|fromM49map[int16(k)]))
		m49Index[1:][k>>searchBits] = int16(len(fromM49))
	}
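	// Each fromM49 entry packs the 7 least significant bits of the UN.M49 code
	// into its upper bits and the region ID into its lower 9 bits. For example
	// (with a hypothetical region ID of 5), code 276 would be stored as
	// (276&0x7f)<<9 | 5.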
	b.writeSlice("m49Index", m49Index)
	b.writeSlice("fromM49", fromM49)
}

const (
	// TODO: put these lists in regionTypes as user data? Could be used for
	// various optimizations and refinements and could be exposed in the API.
	iso3166Except = "AC CP DG EA EU FX IC SU TA UK"
	iso3166Trans  = "AN BU CS NT TP YU ZR" // SF is not in our set of Regions.
	// DY and RH are actually not deleted, but indeterminately reserved.
	iso3166DelCLDR = "CT DD DY FQ HV JT MI NH NQ PC PU PZ RH VD WK YD"
)

const (
	iso3166UserAssigned = 1 << iota
	ccTLD
	bcp47Region
)

func find(list []string, s string) int {
	for i, t := range list {
		if t == s {
			return i
		}
	}
	return -1
}

// writeVariant generates per-variant information and creates a map from variant
// name to index value. We assign index values such that sorting multiple
// variants by index value will result in the correct order.
// There are two types of variants: specialized and general. Specialized variants
// are only applicable to certain language or language-script pairs. Generalized
// variants apply to any language. Generalized variants always sort after
// specialized variants.  We will therefore always assign a higher index value
// to a generalized variant than any other variant. Generalized variants are
// sorted alphabetically among themselves.
// Specialized variants may also sort after other specialized variants. Such
// variants will be ordered after any of the variants they may follow.
// We assume that if a variant x is followed by a variant y, then for any prefix
// p of x, p-x is a prefix of y. This allows us to order tags based on the
// maximum of the length of any of its prefixes.
// TODO: it is possible to define a set of Prefix values on variants such that
// no total order can be defined, at which point this algorithm breaks.
// In other words, we cannot guarantee the same order of variants in the
// future using the same algorithm or for non-compliant combinations of
// variants. For this reason, consider using simple alphabetic sorting
// of variants and ignore Prefix restrictions altogether.
func (b *builder) writeVariant() {
	generalized := stringSet{}
	specialized := stringSet{}
	specializedExtend := stringSet{}
	// Collate the variants by type and check assumptions.
	for _, v := range b.variant.slice() {
		e := b.registry[v]
		if len(e.prefix) == 0 {
			generalized.add(v)
			continue
		}
		c := strings.Split(e.prefix[0], "-")
		hasScriptOrRegion := false
		if len(c) > 1 {
			_, hasScriptOrRegion = b.script.find(c[1])
			if !hasScriptOrRegion {
				_, hasScriptOrRegion = b.region.find(c[1])
			}
		}
		if len(c) == 1 || len(c) == 2 && hasScriptOrRegion {
			// Variant is preceded by a language.
			specialized.add(v)
			continue
		}
		// Variant is preceded by another variant.
		specializedExtend.add(v)
		prefix := c[0] + "-"
		if hasScriptOrRegion {
			prefix += c[1]
		}
		for _, p := range e.prefix {
			// Verify that the prefix minus the last element is a prefix of the
			// predecessor element.
			i := strings.LastIndex(p, "-")
			pred := b.registry[p[i+1:]]
			if find(pred.prefix, p[:i]) < 0 {
				log.Fatalf("prefix %q for variant %q not consistent with predecessor spec", p, v)
			}
			// The sorting used below does not work in the general case. It works
			// if we assume that variants that may be followed by others only have
			// prefixes of the same length. Verify this.
			count := strings.Count(p[:i], "-")
			for _, q := range pred.prefix {
				if c := strings.Count(q, "-"); c != count {
					log.Fatalf("variant %q preceding %q has a prefix %q of size %d; want %d", p[i+1:], v, q, c, count)
				}
			}
			if !strings.HasPrefix(p, prefix) {
				log.Fatalf("prefix %q of variant %q should start with %q", p, v, prefix)
			}
		}
	}

	// Sort extended variants.
	a := specializedExtend.s
	less := func(v, w string) bool {
		// Sort by the maximum number of elements.
		maxCount := func(s string) (max int) {
			for _, p := range b.registry[s].prefix {
				if c := strings.Count(p, "-"); c > max {
					max = c
				}
			}
			return
		}
		if cv, cw := maxCount(v), maxCount(w); cv != cw {
			return cv < cw
		}
		// Sort by name as tie breaker.
		return v < w
	}
	sort.Sort(funcSorter{less, sort.StringSlice(a)})
	specializedExtend.frozen = true

	// Create a map from variant name to index.
	variantIndex := make(map[string]uint8)
	add := func(s []string) {
		for _, v := range s {
			variantIndex[v] = uint8(len(variantIndex))
		}
	}
	add(specialized.slice())
	add(specializedExtend.s)
	numSpecialized := len(variantIndex)
	add(generalized.slice())
	if n := len(variantIndex); n > 255 {
		log.Fatalf("maximum number of variants exceeded: was %d; want <= 255", n)
	}
	b.writeMap("variantIndex", variantIndex)
	b.writeConst("variantNumSpecialized", numSpecialized)
}

func (b *builder) writeLanguageInfo() {
}

// writeLikelyData writes tables that are used both for finding parent relations and for
// language matching.  Each entry contains additional bits to indicate the status of the
// data to know when it cannot be used for parent relations.
func (b *builder) writeLikelyData() {
	const (
		isList = 1 << iota
		scriptInFrom
		regionInFrom
	)
	type ( // generated types
		likelyScriptRegion struct {
			region uint16
			script uint8
			flags  uint8
		}
		likelyLangScript struct {
			lang   uint16
			script uint8
			flags  uint8
		}
		likelyLangRegion struct {
			lang   uint16
			region uint16
		}
		// likelyTag is used for getting likely tags for group regions, where
		// the likely region might be a region contained in the group.
		likelyTag struct {
			lang   uint16
			region uint16
			script uint8
		}
	)
	var ( // generated variables
		likelyRegionGroup = make([]likelyTag, len(b.groups))
		likelyLang        = make([]likelyScriptRegion, len(b.lang.s))
		likelyRegion      = make([]likelyLangScript, len(b.region.s))
		likelyScript      = make([]likelyLangRegion, len(b.script.s))
		likelyLangList    = []likelyScriptRegion{}
		likelyRegionList  = []likelyLangScript{}
	)
	type fromTo struct {
		from, to []string
	}
	langToOther := map[int][]fromTo{}
	regionToOther := map[int][]fromTo{}
	for _, m := range b.supp.LikelySubtags.LikelySubtag {
		from := strings.Split(m.From, "_")
		to := strings.Split(m.To, "_")
		if len(to) != 3 {
			log.Fatalf("invalid number of subtags in %q: found %d, want 3", m.To, len(to))
		}
		if len(from) > 3 {
			log.Fatalf("invalid number of subtags: found %d, want 1-3", len(from))
		}
		if from[0] != to[0] && from[0] != "und" {
			log.Fatalf("unexpected language change in expansion: %s -> %s", from, to)
		}
		if len(from) == 3 {
			if from[2] != to[2] {
				log.Fatalf("unexpected region change in expansion: %s -> %s", from, to)
			}
			if from[0] != "und" {
				log.Fatalf("unexpected fully specified from tag: %s -> %s", from, to)
			}
		}
		if len(from) == 1 || from[0] != "und" {
			id := 0
			if from[0] != "und" {
				id = b.lang.index(from[0])
			}
			langToOther[id] = append(langToOther[id], fromTo{from, to})
		} else if len(from) == 2 && len(from[1]) == 4 {
			sid := b.script.index(from[1])
			likelyScript[sid].lang = uint16(b.langIndex(to[0]))
			likelyScript[sid].region = uint16(b.region.index(to[2]))
		} else {
			r := b.region.index(from[len(from)-1])
			if id, ok := b.groups[r]; ok {
				if from[0] != "und" {
					log.Fatalf("region changed unexpectedly: %s -> %s", from, to)
				}
				likelyRegionGroup[id].lang = uint16(b.langIndex(to[0]))
				likelyRegionGroup[id].script = uint8(b.script.index(to[1]))
				likelyRegionGroup[id].region = uint16(b.region.index(to[2]))
			} else {
				regionToOther[r] = append(regionToOther[r], fromTo{from, to})
			}
		}
	}
	b.writeType(likelyLangRegion{})
	b.writeSlice("likelyScript", likelyScript)

	for id := range b.lang.s {
		list := langToOther[id]
		if len(list) == 1 {
			likelyLang[id].region = uint16(b.region.index(list[0].to[2]))
			likelyLang[id].script = uint8(b.script.index(list[0].to[1]))
		} else if len(list) > 1 {
			likelyLang[id].flags = isList
			likelyLang[id].region = uint16(len(likelyLangList))
			likelyLang[id].script = uint8(len(list))
			for _, x := range list {
				flags := uint8(0)
				if len(x.from) > 1 {
					if x.from[1] == x.to[2] {
						flags = regionInFrom
					} else {
						flags = scriptInFrom
					}
				}
				likelyLangList = append(likelyLangList, likelyScriptRegion{
					region: uint16(b.region.index(x.to[2])),
					script: uint8(b.script.index(x.to[1])),
					flags:  flags,
				})
			}
		}
	}
	// TODO: merge suppressScript data with this table.
	b.writeType(likelyScriptRegion{})
	b.writeSlice("likelyLang", likelyLang)
	b.writeSlice("likelyLangList", likelyLangList)

	for id := range b.region.s {
		list := regionToOther[id]
		if len(list) == 1 {
			likelyRegion[id].lang = uint16(b.langIndex(list[0].to[0]))
			likelyRegion[id].script = uint8(b.script.index(list[0].to[1]))
			if len(list[0].from) > 2 {
				likelyRegion[id].flags = scriptInFrom
			}
		} else if len(list) > 1 {
			likelyRegion[id].flags = isList
			likelyRegion[id].lang = uint16(len(likelyRegionList))
			likelyRegion[id].script = uint8(len(list))
			for i, x := range list {
				if len(x.from) == 2 && i != 0 || i > 0 && len(x.from) != 3 {
					log.Fatalf("unspecified script must be first in list: %v at %d", x.from, i)
				}
				x := likelyLangScript{
					lang:   uint16(b.langIndex(x.to[0])),
					script: uint8(b.script.index(x.to[1])),
				}
				if len(list[0].from) > 2 {
					x.flags = scriptInFrom
				}
				likelyRegionList = append(likelyRegionList, x)
			}
		}
	}
	b.writeType(likelyLangScript{})
	b.writeSlice("likelyRegion", likelyRegion)
	b.writeSlice("likelyRegionList", likelyRegionList)

	b.writeType(likelyTag{})
	b.writeSlice("likelyRegionGroup", likelyRegionGroup)
}

type mutualIntelligibility struct {
	want, have uint16
	distance   uint8
	oneway     bool
}

type scriptIntelligibility struct {
	wantLang, haveLang     uint16
	wantScript, haveScript uint8
	distance               uint8
	// Always oneway
}

type regionIntelligibility struct {
	lang     uint16 // compact language id
	script   uint8  // 0 means any
	group    uint8  // 0 means any; if bit 7 is set it means inverse
	distance uint8
	// Always twoway.
}

// writeMatchData writes tables with languages and scripts for which there is
// mutual intelligibility. The data is based on CLDR's languageMatching data.
// Note that we use a different algorithm than the one defined by CLDR and that
// we slightly modify the data. For example, we convert scores to confidence levels.
// We also drop all region-related data as we use a different algorithm to
// determine region equivalence.
func (b *builder) writeMatchData() {
	lm := b.supp.LanguageMatching.LanguageMatches
	cldr.MakeSlice(&lm).SelectAnyOf("type", "written_new")

	regionHierarchy := map[string][]string{}
	for _, g := range b.supp.TerritoryContainment.Group {
		regions := strings.Split(g.Contains, " ")
		regionHierarchy[g.Type] = append(regionHierarchy[g.Type], regions...)
	}
	regionToGroups := make([]uint8, len(b.region.s))

	idToIndex := map[string]uint8{}
	for i, mv := range lm[0].MatchVariable {
		if i > 6 {
			log.Fatalf("Too many groups: %d", i)
		}
		idToIndex[mv.Id] = uint8(i + 1)
		// TODO: also handle '-'
		for _, r := range strings.Split(mv.Value, "+") {
			todo := []string{r}
			for k := 0; k < len(todo); k++ {
				r := todo[k]
				regionToGroups[b.region.index(r)] |= 1 << uint8(i)
				todo = append(todo, regionHierarchy[r]...)
			}
		}
	}
	b.writeSlice("regionToGroups", regionToGroups)

	// maps language id to in- and out-of-group region.
	paradigmLocales := [][3]uint16{}
	locales := strings.Split(lm[0].ParadigmLocales[0].Locales, " ")
	for i := 0; i < len(locales); i += 2 {
		x := [3]uint16{}
		for j := 0; j < 2; j++ {
			pc := strings.SplitN(locales[i+j], "-", 2)
			x[0] = b.langIndex(pc[0])
			if len(pc) == 2 {
				x[1+j] = uint16(b.region.index(pc[1]))
			}
		}
		paradigmLocales = append(paradigmLocales, x)
	}
	b.writeSlice("paradigmLocales", paradigmLocales)

	b.writeType(mutualIntelligibility{})
	b.writeType(scriptIntelligibility{})
	b.writeType(regionIntelligibility{})

	matchLang := []mutualIntelligibility{}
	matchScript := []scriptIntelligibility{}
	matchRegion := []regionIntelligibility{}
	// Convert the languageMatch entries into lists keyed by desired language.
	for _, m := range lm[0].LanguageMatch {
		// Different versions of CLDR use different separators.
		desired := strings.Replace(m.Desired, "-", "_", -1)
		supported := strings.Replace(m.Supported, "-", "_", -1)
		d := strings.Split(desired, "_")
		s := strings.Split(supported, "_")
		if len(d) != len(s) {
			log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
			continue
		}
		distance, _ := strconv.ParseInt(m.Distance, 10, 8)
		switch len(d) {
		case 2:
			if desired == supported && desired == "*_*" {
				continue
			}
			// language-script pair.
			matchScript = append(matchScript, scriptIntelligibility{
				wantLang:   uint16(b.langIndex(d[0])),
				haveLang:   uint16(b.langIndex(s[0])),
				wantScript: uint8(b.script.index(d[1])),
				haveScript: uint8(b.script.index(s[1])),
				distance:   uint8(distance),
			})
			if m.Oneway != "true" {
				matchScript = append(matchScript, scriptIntelligibility{
					wantLang:   uint16(b.langIndex(s[0])),
					haveLang:   uint16(b.langIndex(d[0])),
					wantScript: uint8(b.script.index(s[1])),
					haveScript: uint8(b.script.index(d[1])),
					distance:   uint8(distance),
				})
			}
		case 1:
			if desired == supported && desired == "*" {
				continue
			}
			if distance == 1 {
				// nb == no is already handled by macro mapping. Check there
				// really is only this case.
				if d[0] != "no" || s[0] != "nb" {
					log.Fatalf("unhandled equivalence %s == %s", s[0], d[0])
				}
				continue
			}
			// TODO: consider dropping oneway field and just doubling the entry.
			matchLang = append(matchLang, mutualIntelligibility{
				want:     uint16(b.langIndex(d[0])),
				have:     uint16(b.langIndex(s[0])),
				distance: uint8(distance),
				oneway:   m.Oneway == "true",
			})
		case 3:
			if desired == supported && desired == "*_*_*" {
				continue
			}
			if desired != supported {
				// This is now supported by CLDR, but only one case, which
				// should already be covered by paradigm locales. For instance,
				// test case "und, en, en-GU, en-IN, en-GB ; en-ZA ; en-GB" in
				// testdata/CLDRLocaleMatcherTest.txt tests this.
				if supported != "en_*_GB" {
					log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
				}
				continue
			}
			ri := regionIntelligibility{
				lang:     b.langIndex(d[0]),
				distance: uint8(distance),
			}
			if d[1] != "*" {
				ri.script = uint8(b.script.index(d[1]))
			}
			switch {
			case d[2] == "*":
				ri.group = 0x80 // not contained in anything
			case strings.HasPrefix(d[2], "$!"):
				ri.group = 0x80
				d[2] = "$" + d[2][len("$!"):]
				fallthrough
			case strings.HasPrefix(d[2], "$"):
				ri.group |= idToIndex[d[2]]
			}
			matchRegion = append(matchRegion, ri)
		default:
			log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
		}
	}
	sort.SliceStable(matchLang, func(i, j int) bool {
		return matchLang[i].distance < matchLang[j].distance
	})
	b.writeSlice("matchLang", matchLang)

	sort.SliceStable(matchScript, func(i, j int) bool {
		return matchScript[i].distance < matchScript[j].distance
	})
	b.writeSlice("matchScript", matchScript)

	sort.SliceStable(matchRegion, func(i, j int) bool {
		return matchRegion[i].distance < matchRegion[j].distance
	})
	b.writeSlice("matchRegion", matchRegion)
}

func (b *builder) writeRegionInclusionData() {
	var (
		// mm holds for each group the set of groups with a distance of 1.
		mm = make(map[int][]index)

		// containment holds for each group the transitive closure of
		// containment of other groups.
		containment = make(map[index][]index)
	)
	for _, g := range b.supp.TerritoryContainment.Group {
		// Skip UN and EURO zone as they are flattening the containment
		// relationship.
		if g.Type == "EZ" || g.Type == "UN" {
			continue
		}
		group := b.region.index(g.Type)
		groupIdx := b.groups[group]
		for _, mem := range strings.Split(g.Contains, " ") {
			r := b.region.index(mem)
			mm[r] = append(mm[r], groupIdx)
			if g, ok := b.groups[r]; ok {
				mm[group] = append(mm[group], g)
				containment[groupIdx] = append(containment[groupIdx], g)
			}
		}
	}

	regionContainment := make([]uint64, len(b.groups))
	for _, g := range b.groups {
		l := containment[g]

		// Compute the transitive closure of containment.
		for i := 0; i < len(l); i++ {
			l = append(l, containment[l[i]]...)
		}

		// Compute the bitmask.
		regionContainment[g] = 1 << g
		for _, v := range l {
			regionContainment[g] |= 1 << v
		}
	}
	b.writeSlice("regionContainment", regionContainment)

	regionInclusion := make([]uint8, len(b.region.s))
	bvs := make(map[uint64]index)
	// Make the first bitvector positions correspond with the groups.
	for r, i := range b.groups {
		bv := uint64(1 << i)
		for _, g := range mm[r] {
			bv |= 1 << g
		}
		bvs[bv] = i
		regionInclusion[r] = uint8(bvs[bv])
	}
	for r := 1; r < len(b.region.s); r++ {
		if _, ok := b.groups[r]; !ok {
			bv := uint64(0)
			for _, g := range mm[r] {
				bv |= 1 << g
			}
			if bv == 0 {
				// Pick the world for unspecified regions.
				bv = 1 << b.groups[b.region.index("001")]
			}
			if _, ok := bvs[bv]; !ok {
				bvs[bv] = index(len(bvs))
			}
			regionInclusion[r] = uint8(bvs[bv])
		}
	}
	b.writeSlice("regionInclusion", regionInclusion)
	regionInclusionBits := make([]uint64, len(bvs))
	for k, v := range bvs {
		regionInclusionBits[v] = uint64(k)
	}
	// Add bit vectors for increasingly large distances until a fixed point is reached.
	regionInclusionNext := []uint8{}
	for i := 0; i < len(regionInclusionBits); i++ {
		bits := regionInclusionBits[i]
		next := bits
		for i := uint(0); i < uint(len(b.groups)); i++ {
			if bits&(1<<i) != 0 {
				next |= regionInclusionBits[i]
			}
		}
		if _, ok := bvs[next]; !ok {
			bvs[next] = index(len(bvs))
			regionInclusionBits = append(regionInclusionBits, next)
		}
		regionInclusionNext = append(regionInclusionNext, uint8(bvs[next]))
	}
	b.writeSlice("regionInclusionBits", regionInclusionBits)
	b.writeSlice("regionInclusionNext", regionInclusionNext)
}

type parentRel struct {
	lang       uint16
	script     uint8
	maxScript  uint8
	toRegion   uint16
	fromRegion []uint16
}

func (b *builder) writeParents() {
	b.writeType(parentRel{})

	parents := []parentRel{}

	// Construct parent overrides.
	n := 0
	for _, p := range b.data.Supplemental().ParentLocales.ParentLocale {
		// Skipping non-standard scripts to root is implemented using addTags.
		if p.Parent == "root" {
			continue
		}

		sub := strings.Split(p.Parent, "_")
		parent := parentRel{lang: b.langIndex(sub[0])}
		if len(sub) == 2 {
			// TODO: check that all undefined scripts are indeed Latn in these
			// cases.
			parent.maxScript = uint8(b.script.index("Latn"))
			parent.toRegion = uint16(b.region.index(sub[1]))
		} else {
			parent.script = uint8(b.script.index(sub[1]))
			parent.maxScript = parent.script
			parent.toRegion = uint16(b.region.index(sub[2]))
		}
		for _, c := range strings.Split(p.Locales, " ") {
			region := b.region.index(c[strings.LastIndex(c, "_")+1:])
			parent.fromRegion = append(parent.fromRegion, uint16(region))
		}
		parents = append(parents, parent)
		n += len(parent.fromRegion)
	}
	b.writeSliceAddSize("parents", n*2, parents)
}

func main() {
	gen.Init()

	gen.Repackage("gen_common.go", "common.go", "language")

	w := gen.NewCodeWriter()
	defer w.WriteGoFile("tables.go", "language")

	fmt.Fprintln(w, `import "golang.org/x/text/internal/tag"`)

	b := newBuilder(w)
	gen.WriteCLDRVersion(w)

	b.parseIndices()
	b.writeType(fromTo{})
	b.writeLanguage()
	b.writeScript()
	b.writeRegion()
	b.writeVariant()
	// TODO: b.writeLocale()
	b.computeRegionGroups()
	b.writeLikelyData()
	b.writeMatchData()
	b.writeRegionInclusionData()
	b.writeParents()
}