1// Copyright 2014 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build ignore
6// +build ignore
7
8// Generator for display name tables.
9
10package main
11
12import (
13	"bytes"
14	"flag"
15	"fmt"
16	"log"
17	"reflect"
18	"sort"
19	"strings"
20
21	"golang.org/x/text/internal/gen"
22	"golang.org/x/text/language"
23	"golang.org/x/text/unicode/cldr"
24)
25
26var (
27	test = flag.Bool("test", false,
28		"test existing tables; can be used to compare web data with package data.")
29	outputFile = flag.String("output", "tables.go", "output file")
30
31	stats = flag.Bool("stats", false, "prints statistics to stderr")
32
33	short = flag.Bool("short", false, `Use "short" alternatives, when available.`)
34	draft = flag.String("draft",
35		"contributed",
36		`Minimal draft requirements (approved, contributed, provisional, unconfirmed).`)
37	pkg = flag.String("package",
38		"display",
39		"the name of the package in which the generated file is to be included")
40
41	tags = newTagSet("tags",
42		[]language.Tag{},
43		"space-separated list of tags to include or empty for all")
44	dict = newTagSet("dict",
45		dictTags(),
46		"space-separated list or tags for which to include a Dictionary. "+
47			`"" means the common list from go.text/language.`)
48)
49
50func dictTags() (tag []language.Tag) {
51	// TODO: replace with language.Common.Tags() once supported.
52	const str = "af am ar ar-001 az bg bn ca cs da de el en en-US en-GB " +
53		"es es-ES es-419 et fa fi fil fr fr-CA gu he hi hr hu hy id is it ja " +
54		"ka kk km kn ko ky lo lt lv mk ml mn mr ms my ne nl no pa pl pt pt-BR " +
55		"pt-PT ro ru si sk sl sq sr sr-Latn sv sw ta te th tr uk ur uz vi " +
56		"zh zh-Hans zh-Hant zu"
57
58	for _, s := range strings.Split(str, " ") {
59		tag = append(tag, language.MustParse(s))
60	}
61	return tag
62}
63
64func main() {
65	gen.Init()
66
67	// Read the CLDR zip file.
68	r := gen.OpenCLDRCoreZip()
69	defer r.Close()
70
71	d := &cldr.Decoder{}
72	d.SetDirFilter("main", "supplemental")
73	d.SetSectionFilter("localeDisplayNames")
74	data, err := d.DecodeZip(r)
75	if err != nil {
76		log.Fatalf("DecodeZip: %v", err)
77	}
78
79	w := gen.NewCodeWriter()
80	defer w.WriteGoFile(*outputFile, "display")
81
82	gen.WriteCLDRVersion(w)
83
84	b := builder{
85		w:     w,
86		data:  data,
87		group: make(map[string]*group),
88	}
89	b.generate()
90}
91
// tagForm is the language parser configuration (language.All) used for every
// tag this generator parses from flags and from CLDR data.
const tagForm = language.All
93
// tagSet is used to parse command line flags of tags. It implements the
// flag.Value interface. Membership is recorded by mapping a tag to true; an
// empty set is treated by contains as matching every tag.
type tagSet map[language.Tag]bool
97
98func newTagSet(name string, tags []language.Tag, usage string) tagSet {
99	f := tagSet(make(map[language.Tag]bool))
100	for _, t := range tags {
101		f[t] = true
102	}
103	flag.Var(f, name, usage)
104	return f
105}
106
107// String implements the String method of the flag.Value interface.
108func (f tagSet) String() string {
109	tags := []string{}
110	for t := range f {
111		tags = append(tags, t.String())
112	}
113	sort.Strings(tags)
114	return strings.Join(tags, " ")
115}
116
117// Set implements Set from the flag.Value interface.
118func (f tagSet) Set(s string) error {
119	if s != "" {
120		for _, s := range strings.Split(s, " ") {
121			if s != "" {
122				tag, err := tagForm.Parse(s)
123				if err != nil {
124					return err
125				}
126				f[tag] = true
127			}
128		}
129	}
130	return nil
131}
132
133func (f tagSet) contains(t language.Tag) bool {
134	if len(f) == 0 {
135		return true
136	}
137	return f[t]
138}
139
// builder is used to create all tables with display name information.
type builder struct {
	// w receives all generated code.
	w *gen.CodeWriter

	// data is the decoded CLDR data the tables are derived from.
	data *cldr.CLDR

	// fromLocs is reset to nil by setData.
	// NOTE(review): not read anywhere in the code visible here.
	fromLocs []string

	// destination tags for the current locale.
	// Both fields are reset by setData.
	// NOTE(review): not read anywhere in the code visible here.
	toTags     []string
	toTagIndex map[string]int

	// list of supported tags
	// Filled by makeSupported and replaced by generate before the "self"
	// group is written.
	supported []language.Tag

	// key-value pairs per group
	// Keyed by group name ("lang", "script", "region", "self").
	group map[string]*group

	// statistics
	// sizeIndex and sizeData are zeroed by setData.
	sizeIndex int // total size of all indexes of headers
	sizeData  int // total size of all data of headers
	totalSize int
}
163
// group holds the collected names of one kind (for example "lang" or
// "region") together with the tables derived from them by writeGroup.
type group struct {
	// Maps from a given language to the Namer data for this language.
	lang    map[language.Tag]keyValues
	// headers has one entry per supported tag; allocated by writeGroup.
	headers []header

	// toTags is the de-duplicated list of keys found across all locales,
	// ordered by tagsBySize.
	toTags        []string
	// NOTE(review): threeStart and fourPlusStart are not referenced in the
	// code visible here.
	threeStart    int
	fourPlusStart int
}
173
174// set sets the typ to the name for locale loc.
175func (g *group) set(t language.Tag, typ, name string) {
176	kv := g.lang[t]
177	if kv == nil {
178		kv = make(keyValues)
179		g.lang[t] = kv
180	}
181	if kv[typ] == "" {
182		kv[typ] = name
183	}
184}
185
// keyValues maps a key (the string form of a language tag, script, or region)
// to its display name.
type keyValues map[string]string
187
// header holds the display names of one supported tag for one group, in the
// form built by writeGroup.
type header struct {
	// tag is the supported locale the names belong to.
	tag   language.Tag
	// data is the concatenation of all names, in the order of the group's
	// toTags.
	data  string
	// index holds the start offset in data of each name, followed by the
	// total length of data; trailing repeated offsets are trimmed.
	index []uint16
}
193
// versionInfo is the format string generate uses to emit the deprecated
// Version constant into the generated file; its single verb receives
// cldr.Version.
var versionInfo = `// Version is deprecated. Use CLDRVersion.
const Version = %#v

`
198
// self is the tag ("mul") under which generate stores the names of locales
// in their own language; it is the only supported tag of the "self" group.
var self = language.MustParse("mul")
200
// generate builds and writes all tables.
//
// It emits the deprecated Version constant, filters the CLDR data, collects
// the "lang", "script" and "region" groups, derives the list of supported
// tags, and writes the parents table, the three groups, the supported-tag
// constants and the dictionaries. Finally it collects and writes the "self"
// group, which holds the names of locales in their own language.
func (b *builder) generate() {
	fmt.Fprintf(b.w, versionInfo, cldr.Version)

	b.filter()
	// Language names, keyed by the string form of the parsed language tag.
	// Only tags admitted by the -tags flag are recorded.
	b.setData("lang", func(g *group, loc language.Tag, ldn *cldr.LocaleDisplayNames) {
		if ldn.Languages != nil {
			for _, v := range ldn.Languages.Language {
				lang := v.Type
				if lang == "root" {
					// We prefer the data from "und"
					// TODO: allow both the data for root and und somehow.
					continue
				}
				tag := tagForm.MustParse(lang)
				if tags.contains(tag) {
					g.set(loc, tag.String(), v.Data())
				}
			}
		}
	})
	// Script names, keyed by script code. Private use scripts are skipped.
	b.setData("script", func(g *group, loc language.Tag, ldn *cldr.LocaleDisplayNames) {
		if ldn.Scripts != nil {
			for _, v := range ldn.Scripts.Script {
				code := language.MustParseScript(v.Type)
				if code.IsPrivateUse() { // Qaaa..Qabx
					// TODO: data currently appears to be very meager.
					// Reconsider if we have data for English.
					if loc == language.English {
						log.Fatal("Consider including data for private use scripts.")
					}
					continue
				}
				g.set(loc, code.String(), v.Data())
			}
		}
	})
	// Region names, keyed by region code.
	b.setData("region", func(g *group, loc language.Tag, ldn *cldr.LocaleDisplayNames) {
		if ldn.Territories != nil {
			for _, v := range ldn.Territories.Territory {
				g.set(loc, language.MustParseRegion(v.Type).String(), v.Data())
			}
		}
	})

	// The supported list is derived from the groups collected so far and
	// must exist before any table that is indexed by it is written.
	b.makeSupported()

	b.writeParents()

	b.writeGroup("lang")
	b.writeGroup("script")
	b.writeGroup("region")

	// The supported tags are emitted as one string in which every tag is
	// followed by "|".
	b.w.WriteConst("numSupported", len(b.supported))
	buf := bytes.Buffer{}
	for _, tag := range b.supported {
		fmt.Fprint(&buf, tag.String(), "|")
	}
	b.w.WriteConst("supported", buf.String())

	b.writeDictionaries()

	// From here on only the "self" group is built. All of its entries are
	// stored under the single tag self, so that is the only supported tag.
	b.supported = []language.Tag{self}

	// Compute the names of locales in their own language. Some of these names
	// may be specified in their parent locales. The loop makes four passes
	// (i = 0..3); in pass i each key is compared against the locale after
	// taking its parent i times. Because set does not overwrite a value, a
	// match found in an earlier pass takes precedence over later ones.
	for i := 0; i < 4; i++ {
		b.setData("self", func(g *group, tag language.Tag, ldn *cldr.LocaleDisplayNames) {
			// After the first pass, a locale with a script but no region is
			// also matched by the name of its bare base language.
			// Note that b below shadows the builder receiver.
			parent := tag
			if b, s, r := tag.Raw(); i > 0 && (s != language.Script{} && r == language.Region{}) {
				parent, _ = language.Raw.Compose(b)
			}
			if ldn.Languages != nil {
				for _, v := range ldn.Languages.Language {
					key := tagForm.MustParse(v.Type)
					saved := key
					// The locale names itself (or, see above, its base
					// language).
					if key == parent {
						g.set(self, tag.String(), v.Data())
					}
					// Replace key by its i-th ancestor. If that is the
					// current locale, this locale supplies the name of the
					// original key.
					for k := 0; k < i; k++ {
						key = key.Parent()
					}
					if key == tag {
						g.set(self, saved.String(), v.Data()) // set does not overwrite a value.
					}
				}
			}
		})
	}

	b.writeGroup("self")
}
295
296func (b *builder) setData(name string, f func(*group, language.Tag, *cldr.LocaleDisplayNames)) {
297	b.sizeIndex = 0
298	b.sizeData = 0
299	b.toTags = nil
300	b.fromLocs = nil
301	b.toTagIndex = make(map[string]int)
302
303	g := b.group[name]
304	if g == nil {
305		g = &group{lang: make(map[language.Tag]keyValues)}
306		b.group[name] = g
307	}
308	for _, loc := range b.data.Locales() {
309		// We use RawLDML instead of LDML as we are managing our own inheritance
310		// in this implementation.
311		ldml := b.data.RawLDML(loc)
312
313		// We do not support the POSIX variant (it is not a supported BCP 47
314		// variant). This locale also doesn't happen to contain any data, so
315		// we'll skip it by checking for this.
316		tag, err := tagForm.Parse(loc)
317		if err != nil {
318			if ldml.LocaleDisplayNames != nil {
319				log.Fatalf("setData: %v", err)
320			}
321			continue
322		}
323		if ldml.LocaleDisplayNames != nil && tags.contains(tag) {
324			f(g, tag, ldml.LocaleDisplayNames)
325		}
326	}
327}
328
329func (b *builder) filter() {
330	filter := func(s *cldr.Slice) {
331		if *short {
332			s.SelectOnePerGroup("alt", []string{"short", ""})
333		} else {
334			s.SelectOnePerGroup("alt", []string{"stand-alone", ""})
335		}
336		d, err := cldr.ParseDraft(*draft)
337		if err != nil {
338			log.Fatalf("filter: %v", err)
339		}
340		s.SelectDraft(d)
341	}
342	for _, loc := range b.data.Locales() {
343		if ldn := b.data.RawLDML(loc).LocaleDisplayNames; ldn != nil {
344			if ldn.Languages != nil {
345				s := cldr.MakeSlice(&ldn.Languages.Language)
346				if filter(&s); len(ldn.Languages.Language) == 0 {
347					ldn.Languages = nil
348				}
349			}
350			if ldn.Scripts != nil {
351				s := cldr.MakeSlice(&ldn.Scripts.Script)
352				if filter(&s); len(ldn.Scripts.Script) == 0 {
353					ldn.Scripts = nil
354				}
355			}
356			if ldn.Territories != nil {
357				s := cldr.MakeSlice(&ldn.Territories.Territory)
358				if filter(&s); len(ldn.Territories.Territory) == 0 {
359					ldn.Territories = nil
360				}
361			}
362		}
363	}
364}
365
366// makeSupported creates a list of all supported locales.
367func (b *builder) makeSupported() {
368	// tags across groups
369	for _, g := range b.group {
370		for t, _ := range g.lang {
371			b.supported = append(b.supported, t)
372		}
373	}
374	b.supported = b.supported[:unique(tagsSorter(b.supported))]
375
376}
377
378type tagsSorter []language.Tag
379
380func (a tagsSorter) Len() int           { return len(a) }
381func (a tagsSorter) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
382func (a tagsSorter) Less(i, j int) bool { return a[i].String() < a[j].String() }
383
384func (b *builder) writeGroup(name string) {
385	g := b.group[name]
386
387	for _, kv := range g.lang {
388		for t, _ := range kv {
389			g.toTags = append(g.toTags, t)
390		}
391	}
392	g.toTags = g.toTags[:unique(tagsBySize(g.toTags))]
393
394	// Allocate header per supported value.
395	g.headers = make([]header, len(b.supported))
396	for i, sup := range b.supported {
397		kv, ok := g.lang[sup]
398		if !ok {
399			g.headers[i].tag = sup
400			continue
401		}
402		data := []byte{}
403		index := make([]uint16, len(g.toTags), len(g.toTags)+1)
404		for j, t := range g.toTags {
405			index[j] = uint16(len(data))
406			data = append(data, kv[t]...)
407		}
408		index = append(index, uint16(len(data)))
409
410		// Trim the tail of the index.
411		// TODO: indexes can be reduced in size quite a bit more.
412		n := len(index)
413		for ; n >= 2 && index[n-2] == index[n-1]; n-- {
414		}
415		index = index[:n]
416
417		// Workaround for a bug in CLDR 26.
418		// See https://unicode.org/cldr/trac/ticket/8042.
419		if cldr.Version == "26" && sup.String() == "hsb" {
420			data = bytes.Replace(data, []byte{'"'}, nil, 1)
421		}
422		g.headers[i] = header{sup, string(data), index}
423	}
424	g.writeTable(b.w, name)
425}
426
// tagsBySize orders strings so that short entries are grouped by length. Its
// methods match those of sort.Interface.
type tagsBySize []string

// Len returns the number of entries.
func (l tagsBySize) Len() int {
	return len(l)
}

// Swap exchanges the entries at positions i and j.
func (l tagsBySize) Swap(i, j int) {
	l[j], l[i] = l[i], l[j]
}

// Less reports whether entry i sorts before entry j. Entries of equal length,
// and any two entries that are both longer than four bytes, are compared
// alphabetically; otherwise the shorter entry comes first.
func (l tagsBySize) Less(i, j int) bool {
	x, y := l[i], l[j]
	if len(x) == len(y) || (len(x) > 4 && len(y) > 4) {
		return x < y
	}
	return len(x) < len(y)
}
439
440// parentIndices returns slice a of len(tags) where tags[a[i]] is the parent
441// of tags[i].
442func parentIndices(tags []language.Tag) []int16 {
443	index := make(map[language.Tag]int16)
444	for i, t := range tags {
445		index[t] = int16(i)
446	}
447
448	// Construct default parents.
449	parents := make([]int16, len(tags))
450	for i, t := range tags {
451		parents[i] = -1
452		for t = t.Parent(); t != language.Und; t = t.Parent() {
453			if j, ok := index[t]; ok {
454				parents[i] = j
455				break
456			}
457		}
458	}
459	return parents
460}
461
462func (b *builder) writeParents() {
463	parents := parentIndices(b.supported)
464	fmt.Fprintf(b.w, "var parents = ")
465	b.w.WriteArray(parents)
466}
467
468// writeKeys writes keys to a special index used by the display package.
469// tags are assumed to be sorted by length.
470func writeKeys(w *gen.CodeWriter, name string, keys []string) {
471	w.Size += int(3 * reflect.TypeOf("").Size())
472	w.WriteComment("Number of keys: %d", len(keys))
473	fmt.Fprintf(w, "var (\n\t%sIndex = tagIndex{\n", name)
474	for i := 2; i <= 4; i++ {
475		sub := []string{}
476		for _, t := range keys {
477			if len(t) != i {
478				break
479			}
480			sub = append(sub, t)
481		}
482		s := strings.Join(sub, "")
483		w.WriteString(s)
484		fmt.Fprintf(w, ",\n")
485		keys = keys[len(sub):]
486	}
487	fmt.Fprintln(w, "\t}")
488	if len(keys) > 0 {
489		w.Size += int(reflect.TypeOf([]string{}).Size())
490		fmt.Fprintf(w, "\t%sTagsLong = ", name)
491		w.WriteSlice(keys)
492	}
493	fmt.Fprintln(w, ")\n")
494}
495
496// identifier creates an identifier from the given tag.
497func identifier(t language.Tag) string {
498	return strings.Replace(t.String(), "-", "", -1)
499}
500
501func (h *header) writeEntry(w *gen.CodeWriter, name string) {
502	if len(dict) > 0 && dict.contains(h.tag) {
503		fmt.Fprintf(w, "\t{ // %s\n", h.tag)
504		fmt.Fprintf(w, "\t\t%[1]s%[2]sStr,\n\t\t%[1]s%[2]sIdx,\n", identifier(h.tag), name)
505		fmt.Fprintln(w, "\t},")
506	} else if len(h.data) == 0 {
507		fmt.Fprintln(w, "\t\t{}, //", h.tag)
508	} else {
509		fmt.Fprintf(w, "\t{ // %s\n", h.tag)
510		w.WriteString(h.data)
511		fmt.Fprintln(w, ",")
512		w.WriteSlice(h.index)
513		fmt.Fprintln(w, ",\n\t},")
514	}
515}
516
517// write the data for the given header as single entries. The size for this data
518// was already accounted for in writeEntry.
519func (h *header) writeSingle(w *gen.CodeWriter, name string) {
520	if len(dict) > 0 && dict.contains(h.tag) {
521		tag := identifier(h.tag)
522		w.WriteConst(tag+name+"Str", h.data)
523
524		// Note that we create a slice instead of an array. If we use an array
525		// we need to refer to it as a[:] in other tables, which will cause the
526		// array to always be included by the linker. See Issue 7651.
527		w.WriteVar(tag+name+"Idx", h.index)
528	}
529}
530
531// WriteTable writes an entry for a single Namer.
532func (g *group) writeTable(w *gen.CodeWriter, name string) {
533	start := w.Size
534	writeKeys(w, name, g.toTags)
535	w.Size += len(g.headers) * int(reflect.ValueOf(g.headers[0]).Type().Size())
536
537	fmt.Fprintf(w, "var %sHeaders = [%d]header{\n", name, len(g.headers))
538
539	title := strings.Title(name)
540	for _, h := range g.headers {
541		h.writeEntry(w, title)
542	}
543	fmt.Fprintln(w, "}\n")
544
545	for _, h := range g.headers {
546		h.writeSingle(w, title)
547	}
548	n := w.Size - start
549	fmt.Fprintf(w, "// Total size for %s: %d bytes (%d KB)\n\n", name, n, n/1000)
550}
551
552func (b *builder) writeDictionaries() {
553	fmt.Fprintln(b.w, "// Dictionary entries of frequent languages")
554	fmt.Fprintln(b.w, "var (")
555	parents := parentIndices(b.supported)
556
557	for i, t := range b.supported {
558		if dict.contains(t) {
559			ident := identifier(t)
560			fmt.Fprintf(b.w, "\t%s = Dictionary{ // %s\n", ident, t)
561			if p := parents[i]; p == -1 {
562				fmt.Fprintln(b.w, "\t\tnil,")
563			} else {
564				fmt.Fprintf(b.w, "\t\t&%s,\n", identifier(b.supported[p]))
565			}
566			fmt.Fprintf(b.w, "\t\theader{%[1]sLangStr, %[1]sLangIdx},\n", ident)
567			fmt.Fprintf(b.w, "\t\theader{%[1]sScriptStr, %[1]sScriptIdx},\n", ident)
568			fmt.Fprintf(b.w, "\t\theader{%[1]sRegionStr, %[1]sRegionIdx},\n", ident)
569			fmt.Fprintln(b.w, "\t}")
570		}
571	}
572	fmt.Fprintln(b.w, ")")
573
574	var s string
575	var a []uint16
576	sz := reflect.TypeOf(s).Size()
577	sz += reflect.TypeOf(a).Size()
578	sz *= 3
579	sz += reflect.TypeOf(&a).Size()
580	n := int(sz) * len(dict)
581	fmt.Fprintf(b.w, "// Total size for %d entries: %d bytes (%d KB)\n\n", len(dict), n, n/1000)
582
583	b.w.Size += n
584}
585
// unique sorts a and moves one representative of each distinct value to the
// front, keeping sorted order. Duplicates end up behind the distinct values.
// It returns the number of distinct values; two elements count as equal when
// the earlier one is not Less than the later one after sorting.
func unique(a sort.Interface) int {
	n := a.Len()
	if n == 0 {
		return 0
	}
	sort.Sort(a)

	// last is the position of the most recently kept distinct value.
	last := 0
	for i := 1; i < n; i++ {
		if !a.Less(last, i) {
			continue // duplicate of the value at last
		}
		last++
		if last != i {
			a.Swap(last, i)
		}
	}
	return last + 1
}
604