1// Copyright 2013 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build ignore
6// +build ignore
7
8// Language tag table generator.
9// Data read from the web.
10
11package main
12
13import (
14	"flag"
15	"fmt"
16	"io"
17	"log"
18	"sort"
19	"strconv"
20	"strings"
21
22	"golang.org/x/text/internal/gen"
23	"golang.org/x/text/internal/language"
24	"golang.org/x/text/unicode/cldr"
25)
26
27var (
28	test = flag.Bool("test",
29		false,
30		"test existing tables; can be used to compare web data with package data.")
31	outputFile = flag.String("output",
32		"tables.go",
33		"output file for generated tables")
34)
35
36func main() {
37	gen.Init()
38
39	w := gen.NewCodeWriter()
40	defer w.WriteGoFile("tables.go", "language")
41
42	b := newBuilder(w)
43	gen.WriteCLDRVersion(w)
44
45	b.writeConstants()
46	b.writeMatchData()
47}
48
49type builder struct {
50	w    *gen.CodeWriter
51	hw   io.Writer // MultiWriter for w and w.Hash
52	data *cldr.CLDR
53	supp *cldr.SupplementalData
54}
55
56func (b *builder) langIndex(s string) uint16 {
57	return uint16(language.MustParseBase(s))
58}
59
60func (b *builder) regionIndex(s string) int {
61	return int(language.MustParseRegion(s))
62}
63
64func (b *builder) scriptIndex(s string) int {
65	return int(language.MustParseScript(s))
66}
67
68func newBuilder(w *gen.CodeWriter) *builder {
69	r := gen.OpenCLDRCoreZip()
70	defer r.Close()
71	d := &cldr.Decoder{}
72	data, err := d.DecodeZip(r)
73	if err != nil {
74		log.Fatal(err)
75	}
76	b := builder{
77		w:    w,
78		hw:   io.MultiWriter(w, w.Hash),
79		data: data,
80		supp: data.Supplemental(),
81	}
82	return &b
83}
84
85// writeConsts computes f(v) for all v in values and writes the results
86// as constants named _v to a single constant block.
87func (b *builder) writeConsts(f func(string) int, values ...string) {
88	fmt.Fprintln(b.w, "const (")
89	for _, v := range values {
90		fmt.Fprintf(b.w, "\t_%s = %v\n", v, f(v))
91	}
92	fmt.Fprintln(b.w, ")")
93}
94
// TODO: region inclusion data will probably not be used in future matchers.
96
// langConsts lists the base languages for which writeConstants emits a
// constant.
var langConsts = []string{
	"de",
	"en",
	"fr",
	"it",
	"mo",
	"no",
	"nb",
	"pt",
	"sh",
	"mul",
	"und",
}
100
// scriptConsts lists the scripts for which writeConstants emits a constant.
var scriptConsts = []string{
	"Latn",
	"Hani", "Hans", "Hant",
	"Qaaa", "Qaai", "Qabx",
	"Zinh", "Zyyy", "Zzzz",
}
105
// regionConsts lists the regions for which writeConstants emits a constant.
var regionConsts = []string{
	"001", "419",
	"BR", "CA", "ES", "GB", "MD", "PT", "UK", "US",
	"ZZ", "XA", "XC",
	"XK", // Unofficial tag for Kosovo.
}
110
111func (b *builder) writeConstants() {
112	b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...)
113	b.writeConsts(b.regionIndex, regionConsts...)
114	b.writeConsts(b.scriptIndex, scriptConsts...)
115}
116
// mutualIntelligibility is one matchLang entry: a desired (want) and a
// supported (have) base language, the distance between them and whether the
// relation holds in one direction only.
type mutualIntelligibility struct {
	want     uint16
	have     uint16
	distance uint8
	oneway   bool
}
122
// scriptIntelligibility is one matchScript entry: a desired and a supported
// language-script pair together with the distance between them.
// Entries are always oneway.
type scriptIntelligibility struct {
	wantLang   uint16
	haveLang   uint16
	wantScript uint8
	haveScript uint8
	distance   uint8
}
129
// regionIntelligibility is one matchRegion entry: the distance that applies
// to a language, optionally narrowed to a script and a region group.
// Entries are always twoway.
type regionIntelligibility struct {
	// lang is the compact language id.
	lang uint16
	// script is the script id; 0 means any.
	script uint8
	// group is the region group; 0 means any. If bit 7 is set it means
	// inverse.
	group    uint8
	distance uint8
}
137
138// writeMatchData writes tables with languages and scripts for which there is
139// mutual intelligibility. The data is based on CLDR's languageMatching data.
140// Note that we use a different algorithm than the one defined by CLDR and that
141// we slightly modify the data. For example, we convert scores to confidence levels.
142// We also drop all region-related data as we use a different algorithm to
143// determine region equivalence.
144func (b *builder) writeMatchData() {
145	lm := b.supp.LanguageMatching.LanguageMatches
146	cldr.MakeSlice(&lm).SelectAnyOf("type", "written_new")
147
148	regionHierarchy := map[string][]string{}
149	for _, g := range b.supp.TerritoryContainment.Group {
150		regions := strings.Split(g.Contains, " ")
151		regionHierarchy[g.Type] = append(regionHierarchy[g.Type], regions...)
152	}
153	// Regions start at 1, so the slice must be one larger than the number of
154	// regions.
155	regionToGroups := make([]uint8, language.NumRegions+1)
156
157	idToIndex := map[string]uint8{}
158	for i, mv := range lm[0].MatchVariable {
159		if i > 6 {
160			log.Fatalf("Too many groups: %d", i)
161		}
162		idToIndex[mv.Id] = uint8(i + 1)
163		// TODO: also handle '-'
164		for _, r := range strings.Split(mv.Value, "+") {
165			todo := []string{r}
166			for k := 0; k < len(todo); k++ {
167				r := todo[k]
168				regionToGroups[b.regionIndex(r)] |= 1 << uint8(i)
169				todo = append(todo, regionHierarchy[r]...)
170			}
171		}
172	}
173	b.w.WriteVar("regionToGroups", regionToGroups)
174
175	// maps language id to in- and out-of-group region.
176	paradigmLocales := [][3]uint16{}
177	locales := strings.Split(lm[0].ParadigmLocales[0].Locales, " ")
178	for i := 0; i < len(locales); i += 2 {
179		x := [3]uint16{}
180		for j := 0; j < 2; j++ {
181			pc := strings.SplitN(locales[i+j], "-", 2)
182			x[0] = b.langIndex(pc[0])
183			if len(pc) == 2 {
184				x[1+j] = uint16(b.regionIndex(pc[1]))
185			}
186		}
187		paradigmLocales = append(paradigmLocales, x)
188	}
189	b.w.WriteVar("paradigmLocales", paradigmLocales)
190
191	b.w.WriteType(mutualIntelligibility{})
192	b.w.WriteType(scriptIntelligibility{})
193	b.w.WriteType(regionIntelligibility{})
194
195	matchLang := []mutualIntelligibility{}
196	matchScript := []scriptIntelligibility{}
197	matchRegion := []regionIntelligibility{}
198	// Convert the languageMatch entries in lists keyed by desired language.
199	for _, m := range lm[0].LanguageMatch {
200		// Different versions of CLDR use different separators.
201		desired := strings.Replace(m.Desired, "-", "_", -1)
202		supported := strings.Replace(m.Supported, "-", "_", -1)
203		d := strings.Split(desired, "_")
204		s := strings.Split(supported, "_")
205		if len(d) != len(s) {
206			log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
207			continue
208		}
209		distance, _ := strconv.ParseInt(m.Distance, 10, 8)
210		switch len(d) {
211		case 2:
212			if desired == supported && desired == "*_*" {
213				continue
214			}
215			// language-script pair.
216			matchScript = append(matchScript, scriptIntelligibility{
217				wantLang:   uint16(b.langIndex(d[0])),
218				haveLang:   uint16(b.langIndex(s[0])),
219				wantScript: uint8(b.scriptIndex(d[1])),
220				haveScript: uint8(b.scriptIndex(s[1])),
221				distance:   uint8(distance),
222			})
223			if m.Oneway != "true" {
224				matchScript = append(matchScript, scriptIntelligibility{
225					wantLang:   uint16(b.langIndex(s[0])),
226					haveLang:   uint16(b.langIndex(d[0])),
227					wantScript: uint8(b.scriptIndex(s[1])),
228					haveScript: uint8(b.scriptIndex(d[1])),
229					distance:   uint8(distance),
230				})
231			}
232		case 1:
233			if desired == supported && desired == "*" {
234				continue
235			}
236			if distance == 1 {
237				// nb == no is already handled by macro mapping. Check there
238				// really is only this case.
239				if d[0] != "no" || s[0] != "nb" {
240					log.Fatalf("unhandled equivalence %s == %s", s[0], d[0])
241				}
242				continue
243			}
244			// TODO: consider dropping oneway field and just doubling the entry.
245			matchLang = append(matchLang, mutualIntelligibility{
246				want:     uint16(b.langIndex(d[0])),
247				have:     uint16(b.langIndex(s[0])),
248				distance: uint8(distance),
249				oneway:   m.Oneway == "true",
250			})
251		case 3:
252			if desired == supported && desired == "*_*_*" {
253				continue
254			}
255			if desired != supported {
256				// This is now supported by CLDR, but only one case, which
257				// should already be covered by paradigm locales. For instance,
258				// test case "und, en, en-GU, en-IN, en-GB ; en-ZA ; en-GB" in
259				// testdata/CLDRLocaleMatcherTest.txt tests this.
260				if supported != "en_*_GB" {
261					log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
262				}
263				continue
264			}
265			ri := regionIntelligibility{
266				lang:     b.langIndex(d[0]),
267				distance: uint8(distance),
268			}
269			if d[1] != "*" {
270				ri.script = uint8(b.scriptIndex(d[1]))
271			}
272			switch {
273			case d[2] == "*":
274				ri.group = 0x80 // not contained in anything
275			case strings.HasPrefix(d[2], "$!"):
276				ri.group = 0x80
277				d[2] = "$" + d[2][len("$!"):]
278				fallthrough
279			case strings.HasPrefix(d[2], "$"):
280				ri.group |= idToIndex[d[2]]
281			}
282			matchRegion = append(matchRegion, ri)
283		default:
284			log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
285		}
286	}
287	sort.SliceStable(matchLang, func(i, j int) bool {
288		return matchLang[i].distance < matchLang[j].distance
289	})
290	b.w.WriteComment(`
291		matchLang holds pairs of langIDs of base languages that are typically
292		mutually intelligible. Each pair is associated with a confidence and
293		whether the intelligibility goes one or both ways.`)
294	b.w.WriteVar("matchLang", matchLang)
295
296	b.w.WriteComment(`
297		matchScript holds pairs of scriptIDs where readers of one script
298		can typically also read the other. Each is associated with a confidence.`)
299	sort.SliceStable(matchScript, func(i, j int) bool {
300		return matchScript[i].distance < matchScript[j].distance
301	})
302	b.w.WriteVar("matchScript", matchScript)
303
304	sort.SliceStable(matchRegion, func(i, j int) bool {
305		return matchRegion[i].distance < matchRegion[j].distance
306	})
307	b.w.WriteVar("matchRegion", matchRegion)
308}
309