1// Copyright 2014 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build ignore
6
7// Generator for display name tables.
8
9package main
10
11import (
12	"bytes"
13	"flag"
14	"fmt"
15	"log"
16	"reflect"
17	"sort"
18	"strings"
19
20	"golang.org/x/text/internal/gen"
21	"golang.org/x/text/language"
22	"golang.org/x/text/unicode/cldr"
23)
24
25var (
26	test = flag.Bool("test", false,
27		"test existing tables; can be used to compare web data with package data.")
28	outputFile = flag.String("output", "tables.go", "output file")
29
30	stats = flag.Bool("stats", false, "prints statistics to stderr")
31
32	short = flag.Bool("short", false, `Use "short" alternatives, when available.`)
33	draft = flag.String("draft",
34		"contributed",
35		`Minimal draft requirements (approved, contributed, provisional, unconfirmed).`)
36	pkg = flag.String("package",
37		"display",
38		"the name of the package in which the generated file is to be included")
39
40	tags = newTagSet("tags",
41		[]language.Tag{},
42		"space-separated list of tags to include or empty for all")
43	dict = newTagSet("dict",
44		dictTags(),
45		"space-separated list or tags for which to include a Dictionary. "+
46			`"" means the common list from go.text/language.`)
47)
48
49func dictTags() (tag []language.Tag) {
50	// TODO: replace with language.Common.Tags() once supported.
51	const str = "af am ar ar-001 az bg bn ca cs da de el en en-US en-GB " +
52		"es es-ES es-419 et fa fi fil fr fr-CA gu he hi hr hu hy id is it ja " +
53		"ka kk km kn ko ky lo lt lv mk ml mn mr ms my ne nl no pa pl pt pt-BR " +
54		"pt-PT ro ru si sk sl sq sr sr-Latn sv sw ta te th tr uk ur uz vi " +
55		"zh zh-Hans zh-Hant zu"
56
57	for _, s := range strings.Split(str, " ") {
58		tag = append(tag, language.MustParse(s))
59	}
60	return tag
61}
62
63func main() {
64	gen.Init()
65
66	// Read the CLDR zip file.
67	r := gen.OpenCLDRCoreZip()
68	defer r.Close()
69
70	d := &cldr.Decoder{}
71	d.SetDirFilter("main", "supplemental")
72	d.SetSectionFilter("localeDisplayNames")
73	data, err := d.DecodeZip(r)
74	if err != nil {
75		log.Fatalf("DecodeZip: %v", err)
76	}
77
78	w := gen.NewCodeWriter()
79	defer w.WriteGoFile(*outputFile, "display")
80
81	gen.WriteCLDRVersion(w)
82
83	b := builder{
84		w:     w,
85		data:  data,
86		group: make(map[string]*group),
87	}
88	b.generate()
89}
90
91const tagForm = language.All
92
93// tagSet is used to parse command line flags of tags. It implements the
94// flag.Value interface.
95type tagSet map[language.Tag]bool
96
97func newTagSet(name string, tags []language.Tag, usage string) tagSet {
98	f := tagSet(make(map[language.Tag]bool))
99	for _, t := range tags {
100		f[t] = true
101	}
102	flag.Var(f, name, usage)
103	return f
104}
105
106// String implements the String method of the flag.Value interface.
107func (f tagSet) String() string {
108	tags := []string{}
109	for t := range f {
110		tags = append(tags, t.String())
111	}
112	sort.Strings(tags)
113	return strings.Join(tags, " ")
114}
115
116// Set implements Set from the flag.Value interface.
117func (f tagSet) Set(s string) error {
118	if s != "" {
119		for _, s := range strings.Split(s, " ") {
120			if s != "" {
121				tag, err := tagForm.Parse(s)
122				if err != nil {
123					return err
124				}
125				f[tag] = true
126			}
127		}
128	}
129	return nil
130}
131
132func (f tagSet) contains(t language.Tag) bool {
133	if len(f) == 0 {
134		return true
135	}
136	return f[t]
137}
138
// builder is used to create all tables with display name information.
type builder struct {
	// w receives all generated code.
	w *gen.CodeWriter

	// data is the decoded CLDR data the tables are built from.
	data *cldr.CLDR

	// fromLocs is cleared by setData; nothing in the visible code fills it.
	fromLocs []string

	// destination tags for the current locale.
	// Both fields are reset at the start of every setData call.
	toTags     []string
	toTagIndex map[string]int

	// list of supported tags
	// It is filled by makeSupported; generate later replaces it with the
	// single tag self before writing the "self" group.
	supported []language.Tag

	// key-value pairs per group
	// The map is keyed by group name ("lang", "script", "region", "self").
	group map[string]*group

	// statistics
	// sizeIndex and sizeData are reset to zero by setData.
	sizeIndex int // total size of all indexes of headers
	sizeData  int // total size of all data of headers
	totalSize int
}
162
163type group struct {
164	// Maps from a given language to the Namer data for this language.
165	lang    map[language.Tag]keyValues
166	headers []header
167
168	toTags        []string
169	threeStart    int
170	fourPlusStart int
171}
172
173// set sets the typ to the name for locale loc.
174func (g *group) set(t language.Tag, typ, name string) {
175	kv := g.lang[t]
176	if kv == nil {
177		kv = make(keyValues)
178		g.lang[t] = kv
179	}
180	if kv[typ] == "" {
181		kv[typ] = name
182	}
183}
184
185type keyValues map[string]string
186
// header holds the names of one group for a single supported tag, in the
// packed form built by writeGroup. writeGroup fills it with a positional
// literal, so the field order matters.
type header struct {
	// tag is the supported tag this header was built for.
	tag language.Tag
	// data is the concatenation of the names of all keys of the group, in key
	// order.
	data string
	// index[i] is the offset in data at which the name of the i-th key starts.
	// A final entry marks the end of data; trailing entries that repeat the
	// same offset are trimmed.
	index []uint16
}
192
// versionInfo is the format string for the deprecated Version constant that
// generate writes first; its single verb receives cldr.Version.
var versionInfo = `// Version is deprecated. Use CLDRVersion.
const Version = %#v

`

// self is the tag under which generate collects the "self" group, i.e. the
// names of languages as given in the data of their own locale.
var self = language.MustParse("mul")
199
200// generate builds and writes all tables.
201func (b *builder) generate() {
202	fmt.Fprintf(b.w, versionInfo, cldr.Version)
203
204	b.filter()
205	b.setData("lang", func(g *group, loc language.Tag, ldn *cldr.LocaleDisplayNames) {
206		if ldn.Languages != nil {
207			for _, v := range ldn.Languages.Language {
208				lang := v.Type
209				if lang == "root" {
210					// We prefer the data from "und"
211					// TODO: allow both the data for root and und somehow.
212					continue
213				}
214				tag := tagForm.MustParse(lang)
215				if tags.contains(tag) {
216					g.set(loc, tag.String(), v.Data())
217				}
218			}
219		}
220	})
221	b.setData("script", func(g *group, loc language.Tag, ldn *cldr.LocaleDisplayNames) {
222		if ldn.Scripts != nil {
223			for _, v := range ldn.Scripts.Script {
224				code := language.MustParseScript(v.Type)
225				if code.IsPrivateUse() { // Qaaa..Qabx
226					// TODO: data currently appears to be very meager.
227					// Reconsider if we have data for English.
228					if loc == language.English {
229						log.Fatal("Consider including data for private use scripts.")
230					}
231					continue
232				}
233				g.set(loc, code.String(), v.Data())
234			}
235		}
236	})
237	b.setData("region", func(g *group, loc language.Tag, ldn *cldr.LocaleDisplayNames) {
238		if ldn.Territories != nil {
239			for _, v := range ldn.Territories.Territory {
240				g.set(loc, language.MustParseRegion(v.Type).String(), v.Data())
241			}
242		}
243	})
244
245	b.makeSupported()
246
247	b.writeParents()
248
249	b.writeGroup("lang")
250	b.writeGroup("script")
251	b.writeGroup("region")
252
253	b.w.WriteConst("numSupported", len(b.supported))
254	buf := bytes.Buffer{}
255	for _, tag := range b.supported {
256		fmt.Fprint(&buf, tag.String(), "|")
257	}
258	b.w.WriteConst("supported", buf.String())
259
260	b.writeDictionaries()
261
262	b.supported = []language.Tag{self}
263
264	// Compute the names of locales in their own language. Some of these names
265	// may be specified in their parent locales. We iterate the maximum depth
266	// of the parent three times to match successive parents of tags until a
267	// possible match is found.
268	for i := 0; i < 4; i++ {
269		b.setData("self", func(g *group, tag language.Tag, ldn *cldr.LocaleDisplayNames) {
270			parent := tag
271			if b, s, r := tag.Raw(); i > 0 && (s != language.Script{} && r == language.Region{}) {
272				parent, _ = language.Raw.Compose(b)
273			}
274			if ldn.Languages != nil {
275				for _, v := range ldn.Languages.Language {
276					key := tagForm.MustParse(v.Type)
277					saved := key
278					if key == parent {
279						g.set(self, tag.String(), v.Data())
280					}
281					for k := 0; k < i; k++ {
282						key = key.Parent()
283					}
284					if key == tag {
285						g.set(self, saved.String(), v.Data()) // set does not overwrite a value.
286					}
287				}
288			}
289		})
290	}
291
292	b.writeGroup("self")
293}
294
295func (b *builder) setData(name string, f func(*group, language.Tag, *cldr.LocaleDisplayNames)) {
296	b.sizeIndex = 0
297	b.sizeData = 0
298	b.toTags = nil
299	b.fromLocs = nil
300	b.toTagIndex = make(map[string]int)
301
302	g := b.group[name]
303	if g == nil {
304		g = &group{lang: make(map[language.Tag]keyValues)}
305		b.group[name] = g
306	}
307	for _, loc := range b.data.Locales() {
308		// We use RawLDML instead of LDML as we are managing our own inheritance
309		// in this implementation.
310		ldml := b.data.RawLDML(loc)
311
312		// We do not support the POSIX variant (it is not a supported BCP 47
313		// variant). This locale also doesn't happen to contain any data, so
314		// we'll skip it by checking for this.
315		tag, err := tagForm.Parse(loc)
316		if err != nil {
317			if ldml.LocaleDisplayNames != nil {
318				log.Fatalf("setData: %v", err)
319			}
320			continue
321		}
322		if ldml.LocaleDisplayNames != nil && tags.contains(tag) {
323			f(g, tag, ldml.LocaleDisplayNames)
324		}
325	}
326}
327
328func (b *builder) filter() {
329	filter := func(s *cldr.Slice) {
330		if *short {
331			s.SelectOnePerGroup("alt", []string{"short", ""})
332		} else {
333			s.SelectOnePerGroup("alt", []string{"stand-alone", ""})
334		}
335		d, err := cldr.ParseDraft(*draft)
336		if err != nil {
337			log.Fatalf("filter: %v", err)
338		}
339		s.SelectDraft(d)
340	}
341	for _, loc := range b.data.Locales() {
342		if ldn := b.data.RawLDML(loc).LocaleDisplayNames; ldn != nil {
343			if ldn.Languages != nil {
344				s := cldr.MakeSlice(&ldn.Languages.Language)
345				if filter(&s); len(ldn.Languages.Language) == 0 {
346					ldn.Languages = nil
347				}
348			}
349			if ldn.Scripts != nil {
350				s := cldr.MakeSlice(&ldn.Scripts.Script)
351				if filter(&s); len(ldn.Scripts.Script) == 0 {
352					ldn.Scripts = nil
353				}
354			}
355			if ldn.Territories != nil {
356				s := cldr.MakeSlice(&ldn.Territories.Territory)
357				if filter(&s); len(ldn.Territories.Territory) == 0 {
358					ldn.Territories = nil
359				}
360			}
361		}
362	}
363}
364
365// makeSupported creates a list of all supported locales.
366func (b *builder) makeSupported() {
367	// tags across groups
368	for _, g := range b.group {
369		for t, _ := range g.lang {
370			b.supported = append(b.supported, t)
371		}
372	}
373	b.supported = b.supported[:unique(tagsSorter(b.supported))]
374
375}
376
377type tagsSorter []language.Tag
378
379func (a tagsSorter) Len() int           { return len(a) }
380func (a tagsSorter) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
381func (a tagsSorter) Less(i, j int) bool { return a[i].String() < a[j].String() }
382
383func (b *builder) writeGroup(name string) {
384	g := b.group[name]
385
386	for _, kv := range g.lang {
387		for t, _ := range kv {
388			g.toTags = append(g.toTags, t)
389		}
390	}
391	g.toTags = g.toTags[:unique(tagsBySize(g.toTags))]
392
393	// Allocate header per supported value.
394	g.headers = make([]header, len(b.supported))
395	for i, sup := range b.supported {
396		kv, ok := g.lang[sup]
397		if !ok {
398			g.headers[i].tag = sup
399			continue
400		}
401		data := []byte{}
402		index := make([]uint16, len(g.toTags), len(g.toTags)+1)
403		for j, t := range g.toTags {
404			index[j] = uint16(len(data))
405			data = append(data, kv[t]...)
406		}
407		index = append(index, uint16(len(data)))
408
409		// Trim the tail of the index.
410		// TODO: indexes can be reduced in size quite a bit more.
411		n := len(index)
412		for ; n >= 2 && index[n-2] == index[n-1]; n-- {
413		}
414		index = index[:n]
415
416		// Workaround for a bug in CLDR 26.
417		// See http://unicode.org/cldr/trac/ticket/8042.
418		if cldr.Version == "26" && sup.String() == "hsb" {
419			data = bytes.Replace(data, []byte{'"'}, nil, 1)
420		}
421		g.headers[i] = header{sup, string(data), index}
422	}
423	g.writeTable(b.w, name)
424}
425
// tagsBySize implements sort.Interface. Strings of at most four bytes sort by
// length first and come before all longer strings; everything else sorts
// alphabetically.
type tagsBySize []string

// Len returns the number of strings.
func (l tagsBySize) Len() int {
	return len(l)
}

// Swap exchanges the strings at positions i and j.
func (l tagsBySize) Swap(i, j int) {
	l[j], l[i] = l[i], l[j]
}

// Less reports whether the string at i sorts before the string at j.
func (l tagsBySize) Less(i, j int) bool {
	x, y := l[i], l[j]
	// Sort single-tag entries based on size first. Otherwise alphabetic.
	hasShort := len(x) <= 4 || len(y) <= 4
	if hasShort && len(x) != len(y) {
		return len(x) < len(y)
	}
	return x < y
}
438
439// parentIndices returns slice a of len(tags) where tags[a[i]] is the parent
440// of tags[i].
441func parentIndices(tags []language.Tag) []int16 {
442	index := make(map[language.Tag]int16)
443	for i, t := range tags {
444		index[t] = int16(i)
445	}
446
447	// Construct default parents.
448	parents := make([]int16, len(tags))
449	for i, t := range tags {
450		parents[i] = -1
451		for t = t.Parent(); t != language.Und; t = t.Parent() {
452			if j, ok := index[t]; ok {
453				parents[i] = j
454				break
455			}
456		}
457	}
458	return parents
459}
460
461func (b *builder) writeParents() {
462	parents := parentIndices(b.supported)
463	fmt.Fprintf(b.w, "var parents = ")
464	b.w.WriteArray(parents)
465}
466
467// writeKeys writes keys to a special index used by the display package.
468// tags are assumed to be sorted by length.
469func writeKeys(w *gen.CodeWriter, name string, keys []string) {
470	w.Size += int(3 * reflect.TypeOf("").Size())
471	w.WriteComment("Number of keys: %d", len(keys))
472	fmt.Fprintf(w, "var (\n\t%sIndex = tagIndex{\n", name)
473	for i := 2; i <= 4; i++ {
474		sub := []string{}
475		for _, t := range keys {
476			if len(t) != i {
477				break
478			}
479			sub = append(sub, t)
480		}
481		s := strings.Join(sub, "")
482		w.WriteString(s)
483		fmt.Fprintf(w, ",\n")
484		keys = keys[len(sub):]
485	}
486	fmt.Fprintln(w, "\t}")
487	if len(keys) > 0 {
488		w.Size += int(reflect.TypeOf([]string{}).Size())
489		fmt.Fprintf(w, "\t%sTagsLong = ", name)
490		w.WriteSlice(keys)
491	}
492	fmt.Fprintln(w, ")\n")
493}
494
495// identifier creates an identifier from the given tag.
496func identifier(t language.Tag) string {
497	return strings.Replace(t.String(), "-", "", -1)
498}
499
500func (h *header) writeEntry(w *gen.CodeWriter, name string) {
501	if len(dict) > 0 && dict.contains(h.tag) {
502		fmt.Fprintf(w, "\t{ // %s\n", h.tag)
503		fmt.Fprintf(w, "\t\t%[1]s%[2]sStr,\n\t\t%[1]s%[2]sIdx,\n", identifier(h.tag), name)
504		fmt.Fprintln(w, "\t},")
505	} else if len(h.data) == 0 {
506		fmt.Fprintln(w, "\t\t{}, //", h.tag)
507	} else {
508		fmt.Fprintf(w, "\t{ // %s\n", h.tag)
509		w.WriteString(h.data)
510		fmt.Fprintln(w, ",")
511		w.WriteSlice(h.index)
512		fmt.Fprintln(w, ",\n\t},")
513	}
514}
515
516// write the data for the given header as single entries. The size for this data
517// was already accounted for in writeEntry.
518func (h *header) writeSingle(w *gen.CodeWriter, name string) {
519	if len(dict) > 0 && dict.contains(h.tag) {
520		tag := identifier(h.tag)
521		w.WriteConst(tag+name+"Str", h.data)
522
523		// Note that we create a slice instead of an array. If we use an array
524		// we need to refer to it as a[:] in other tables, which will cause the
525		// array to always be included by the linker. See Issue 7651.
526		w.WriteVar(tag+name+"Idx", h.index)
527	}
528}
529
530// WriteTable writes an entry for a single Namer.
531func (g *group) writeTable(w *gen.CodeWriter, name string) {
532	start := w.Size
533	writeKeys(w, name, g.toTags)
534	w.Size += len(g.headers) * int(reflect.ValueOf(g.headers[0]).Type().Size())
535
536	fmt.Fprintf(w, "var %sHeaders = [%d]header{\n", name, len(g.headers))
537
538	title := strings.Title(name)
539	for _, h := range g.headers {
540		h.writeEntry(w, title)
541	}
542	fmt.Fprintln(w, "}\n")
543
544	for _, h := range g.headers {
545		h.writeSingle(w, title)
546	}
547	n := w.Size - start
548	fmt.Fprintf(w, "// Total size for %s: %d bytes (%d KB)\n\n", name, n, n/1000)
549}
550
551func (b *builder) writeDictionaries() {
552	fmt.Fprintln(b.w, "// Dictionary entries of frequent languages")
553	fmt.Fprintln(b.w, "var (")
554	parents := parentIndices(b.supported)
555
556	for i, t := range b.supported {
557		if dict.contains(t) {
558			ident := identifier(t)
559			fmt.Fprintf(b.w, "\t%s = Dictionary{ // %s\n", ident, t)
560			if p := parents[i]; p == -1 {
561				fmt.Fprintln(b.w, "\t\tnil,")
562			} else {
563				fmt.Fprintf(b.w, "\t\t&%s,\n", identifier(b.supported[p]))
564			}
565			fmt.Fprintf(b.w, "\t\theader{%[1]sLangStr, %[1]sLangIdx},\n", ident)
566			fmt.Fprintf(b.w, "\t\theader{%[1]sScriptStr, %[1]sScriptIdx},\n", ident)
567			fmt.Fprintf(b.w, "\t\theader{%[1]sRegionStr, %[1]sRegionIdx},\n", ident)
568			fmt.Fprintln(b.w, "\t}")
569		}
570	}
571	fmt.Fprintln(b.w, ")")
572
573	var s string
574	var a []uint16
575	sz := reflect.TypeOf(s).Size()
576	sz += reflect.TypeOf(a).Size()
577	sz *= 3
578	sz += reflect.TypeOf(&a).Size()
579	n := int(sz) * len(dict)
580	fmt.Fprintf(b.w, "// Total size for %d entries: %d bytes (%d KB)\n\n", len(dict), n, n/1000)
581
582	b.w.Size += n
583}
584
// unique sorts the given list and moves one representative of every distinct
// value to the front, swapping duplicates past position k, where k is the
// number of unique values. It returns k. Two elements count as duplicates if
// neither is Less than the other.
func unique(a sort.Interface) int {
	n := a.Len()
	if n == 0 {
		return 0
	}
	sort.Sort(a)

	// Elements [0, kept) are the distinct values found so far, in order.
	kept := 1
	for next := 1; next < n; next++ {
		if !a.Less(kept-1, next) {
			// Same value as the last one kept: a duplicate.
			continue
		}
		if next != kept {
			a.Swap(kept, next)
		}
		kept++
	}
	return kept
}
603