1// Copyright 2013 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build ignore
6
7// Language tag table generator.
8// Data read from the web.
9
10package main
11
12import (
13	"flag"
14	"fmt"
15	"io"
16	"log"
17	"sort"
18	"strconv"
19	"strings"
20
21	"golang.org/x/text/internal/gen"
22	"golang.org/x/text/internal/language"
23	"golang.org/x/text/unicode/cldr"
24)
25
26var (
27	test = flag.Bool("test",
28		false,
29		"test existing tables; can be used to compare web data with package data.")
30	outputFile = flag.String("output",
31		"tables.go",
32		"output file for generated tables")
33)
34
35func main() {
36	gen.Init()
37
38	w := gen.NewCodeWriter()
39	defer w.WriteGoFile("tables.go", "language")
40
41	b := newBuilder(w)
42	gen.WriteCLDRVersion(w)
43
44	b.writeConstants()
45	b.writeMatchData()
46}
47
48type builder struct {
49	w    *gen.CodeWriter
50	hw   io.Writer // MultiWriter for w and w.Hash
51	data *cldr.CLDR
52	supp *cldr.SupplementalData
53}
54
55func (b *builder) langIndex(s string) uint16 {
56	return uint16(language.MustParseBase(s))
57}
58
59func (b *builder) regionIndex(s string) int {
60	return int(language.MustParseRegion(s))
61}
62
63func (b *builder) scriptIndex(s string) int {
64	return int(language.MustParseScript(s))
65}
66
67func newBuilder(w *gen.CodeWriter) *builder {
68	r := gen.OpenCLDRCoreZip()
69	defer r.Close()
70	d := &cldr.Decoder{}
71	data, err := d.DecodeZip(r)
72	if err != nil {
73		log.Fatal(err)
74	}
75	b := builder{
76		w:    w,
77		hw:   io.MultiWriter(w, w.Hash),
78		data: data,
79		supp: data.Supplemental(),
80	}
81	return &b
82}
83
84// writeConsts computes f(v) for all v in values and writes the results
85// as constants named _v to a single constant block.
86func (b *builder) writeConsts(f func(string) int, values ...string) {
87	fmt.Fprintln(b.w, "const (")
88	for _, v := range values {
89		fmt.Fprintf(b.w, "\t_%s = %v\n", v, f(v))
90	}
91	fmt.Fprintln(b.w, ")")
92}
93
// TODO: region inclusion data will probably not be used in future matchers.
95
// langConsts lists the base language tags for which writeConstants emits a
// _<tag> constant.
var langConsts = []string{
	"de",
	"en",
	"fr",
	"it",
	"mo",
	"no",
	"nb",
	"pt",
	"sh",
	"mul",
	"und",
}
99
// scriptConsts lists the script tags for which writeConstants emits a
// _<tag> constant.
var scriptConsts = []string{
	"Latn",
	"Hani",
	"Hans",
	"Hant",
	"Qaaa",
	"Qaai",
	"Qabx",
	"Zinh",
	"Zyyy",
	"Zzzz",
}
104
// regionConsts lists the region tags for which writeConstants emits a
// _<tag> constant.
var regionConsts = []string{
	"001",
	"419",
	"BR",
	"CA",
	"ES",
	"GB",
	"MD",
	"PT",
	"UK",
	"US",
	"ZZ",
	"XA",
	"XC",
	"XK", // Unofficial tag for Kosovo.
}
109
110func (b *builder) writeConstants() {
111	b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...)
112	b.writeConsts(b.regionIndex, regionConsts...)
113	b.writeConsts(b.scriptIndex, scriptConsts...)
114}
115
// mutualIntelligibility is one entry of the matchLang table built by
// writeMatchData. The type itself is also emitted into the generated file via
// WriteType, so field names, order and types are part of the output.
type mutualIntelligibility struct {
	// want is the language index of the desired language.
	want uint16
	// have is the language index of the supported language.
	have uint16
	// distance is the parsed distance attribute of the match entry.
	distance uint8
	// oneway is set when the match entry's oneway attribute is "true".
	oneway bool
}
121
// scriptIntelligibility is one entry of the matchScript table built by
// writeMatchData. Entries are always one-way; a two-way match is stored as
// two entries. The type is also emitted into the generated file via
// WriteType, so field names, order and types are part of the output.
type scriptIntelligibility struct {
	// wantLang is the language index of the desired language.
	wantLang uint16
	// haveLang is the language index of the supported language.
	haveLang uint16
	// wantScript is the script index of the desired script.
	wantScript uint8
	// haveScript is the script index of the supported script.
	haveScript uint8
	// distance is the parsed distance attribute of the match entry.
	distance uint8
}
128
// regionIntelligibility is one entry of the matchRegion table built by
// writeMatchData. Entries are always two-way. The type is also emitted into
// the generated file via WriteType, so field names, order and types are part
// of the output.
type regionIntelligibility struct {
	// lang is the compact language id.
	lang uint16
	// script is the script index; 0 means any.
	script uint8
	// group is the region group; 0 means any, and a set bit 7 means inverse.
	group uint8
	// distance is the parsed distance attribute of the match entry.
	distance uint8
}
136
137// writeMatchData writes tables with languages and scripts for which there is
138// mutual intelligibility. The data is based on CLDR's languageMatching data.
139// Note that we use a different algorithm than the one defined by CLDR and that
140// we slightly modify the data. For example, we convert scores to confidence levels.
141// We also drop all region-related data as we use a different algorithm to
142// determine region equivalence.
143func (b *builder) writeMatchData() {
144	lm := b.supp.LanguageMatching.LanguageMatches
145	cldr.MakeSlice(&lm).SelectAnyOf("type", "written_new")
146
147	regionHierarchy := map[string][]string{}
148	for _, g := range b.supp.TerritoryContainment.Group {
149		regions := strings.Split(g.Contains, " ")
150		regionHierarchy[g.Type] = append(regionHierarchy[g.Type], regions...)
151	}
152	regionToGroups := make([]uint8, language.NumRegions)
153
154	idToIndex := map[string]uint8{}
155	for i, mv := range lm[0].MatchVariable {
156		if i > 6 {
157			log.Fatalf("Too many groups: %d", i)
158		}
159		idToIndex[mv.Id] = uint8(i + 1)
160		// TODO: also handle '-'
161		for _, r := range strings.Split(mv.Value, "+") {
162			todo := []string{r}
163			for k := 0; k < len(todo); k++ {
164				r := todo[k]
165				regionToGroups[b.regionIndex(r)] |= 1 << uint8(i)
166				todo = append(todo, regionHierarchy[r]...)
167			}
168		}
169	}
170	b.w.WriteVar("regionToGroups", regionToGroups)
171
172	// maps language id to in- and out-of-group region.
173	paradigmLocales := [][3]uint16{}
174	locales := strings.Split(lm[0].ParadigmLocales[0].Locales, " ")
175	for i := 0; i < len(locales); i += 2 {
176		x := [3]uint16{}
177		for j := 0; j < 2; j++ {
178			pc := strings.SplitN(locales[i+j], "-", 2)
179			x[0] = b.langIndex(pc[0])
180			if len(pc) == 2 {
181				x[1+j] = uint16(b.regionIndex(pc[1]))
182			}
183		}
184		paradigmLocales = append(paradigmLocales, x)
185	}
186	b.w.WriteVar("paradigmLocales", paradigmLocales)
187
188	b.w.WriteType(mutualIntelligibility{})
189	b.w.WriteType(scriptIntelligibility{})
190	b.w.WriteType(regionIntelligibility{})
191
192	matchLang := []mutualIntelligibility{}
193	matchScript := []scriptIntelligibility{}
194	matchRegion := []regionIntelligibility{}
195	// Convert the languageMatch entries in lists keyed by desired language.
196	for _, m := range lm[0].LanguageMatch {
197		// Different versions of CLDR use different separators.
198		desired := strings.Replace(m.Desired, "-", "_", -1)
199		supported := strings.Replace(m.Supported, "-", "_", -1)
200		d := strings.Split(desired, "_")
201		s := strings.Split(supported, "_")
202		if len(d) != len(s) {
203			log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
204			continue
205		}
206		distance, _ := strconv.ParseInt(m.Distance, 10, 8)
207		switch len(d) {
208		case 2:
209			if desired == supported && desired == "*_*" {
210				continue
211			}
212			// language-script pair.
213			matchScript = append(matchScript, scriptIntelligibility{
214				wantLang:   uint16(b.langIndex(d[0])),
215				haveLang:   uint16(b.langIndex(s[0])),
216				wantScript: uint8(b.scriptIndex(d[1])),
217				haveScript: uint8(b.scriptIndex(s[1])),
218				distance:   uint8(distance),
219			})
220			if m.Oneway != "true" {
221				matchScript = append(matchScript, scriptIntelligibility{
222					wantLang:   uint16(b.langIndex(s[0])),
223					haveLang:   uint16(b.langIndex(d[0])),
224					wantScript: uint8(b.scriptIndex(s[1])),
225					haveScript: uint8(b.scriptIndex(d[1])),
226					distance:   uint8(distance),
227				})
228			}
229		case 1:
230			if desired == supported && desired == "*" {
231				continue
232			}
233			if distance == 1 {
234				// nb == no is already handled by macro mapping. Check there
235				// really is only this case.
236				if d[0] != "no" || s[0] != "nb" {
237					log.Fatalf("unhandled equivalence %s == %s", s[0], d[0])
238				}
239				continue
240			}
241			// TODO: consider dropping oneway field and just doubling the entry.
242			matchLang = append(matchLang, mutualIntelligibility{
243				want:     uint16(b.langIndex(d[0])),
244				have:     uint16(b.langIndex(s[0])),
245				distance: uint8(distance),
246				oneway:   m.Oneway == "true",
247			})
248		case 3:
249			if desired == supported && desired == "*_*_*" {
250				continue
251			}
252			if desired != supported {
253				// This is now supported by CLDR, but only one case, which
254				// should already be covered by paradigm locales. For instance,
255				// test case "und, en, en-GU, en-IN, en-GB ; en-ZA ; en-GB" in
256				// testdata/CLDRLocaleMatcherTest.txt tests this.
257				if supported != "en_*_GB" {
258					log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
259				}
260				continue
261			}
262			ri := regionIntelligibility{
263				lang:     b.langIndex(d[0]),
264				distance: uint8(distance),
265			}
266			if d[1] != "*" {
267				ri.script = uint8(b.scriptIndex(d[1]))
268			}
269			switch {
270			case d[2] == "*":
271				ri.group = 0x80 // not contained in anything
272			case strings.HasPrefix(d[2], "$!"):
273				ri.group = 0x80
274				d[2] = "$" + d[2][len("$!"):]
275				fallthrough
276			case strings.HasPrefix(d[2], "$"):
277				ri.group |= idToIndex[d[2]]
278			}
279			matchRegion = append(matchRegion, ri)
280		default:
281			log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
282		}
283	}
284	sort.SliceStable(matchLang, func(i, j int) bool {
285		return matchLang[i].distance < matchLang[j].distance
286	})
287	b.w.WriteComment(`
288		matchLang holds pairs of langIDs of base languages that are typically
289		mutually intelligible. Each pair is associated with a confidence and
290		whether the intelligibility goes one or both ways.`)
291	b.w.WriteVar("matchLang", matchLang)
292
293	b.w.WriteComment(`
294		matchScript holds pairs of scriptIDs where readers of one script
295		can typically also read the other. Each is associated with a confidence.`)
296	sort.SliceStable(matchScript, func(i, j int) bool {
297		return matchScript[i].distance < matchScript[j].distance
298	})
299	b.w.WriteVar("matchScript", matchScript)
300
301	sort.SliceStable(matchRegion, func(i, j int) bool {
302		return matchRegion[i].distance < matchRegion[j].distance
303	})
304	b.w.WriteVar("matchRegion", matchRegion)
305}
306