1// Copyright 2012 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build ignore
6// +build ignore
7
8// Collation table generator.
9// Data read from the web.
10
11package main
12
13import (
14	"archive/zip"
15	"bufio"
16	"bytes"
17	"flag"
18	"fmt"
19	"io"
20	"io/ioutil"
21	"log"
22	"os"
23	"regexp"
24	"sort"
25	"strconv"
26	"strings"
27	"unicode/utf8"
28
29	"golang.org/x/text/collate"
30	"golang.org/x/text/collate/build"
31	"golang.org/x/text/internal/colltab"
32	"golang.org/x/text/internal/gen"
33	"golang.org/x/text/language"
34	"golang.org/x/text/unicode/cldr"
35)
36
37var (
38	test = flag.Bool("test", false,
39		"test existing tables; can be used to compare web data with package data.")
40	short = flag.Bool("short", false, `Use "short" alternatives, when available.`)
41	draft = flag.Bool("draft", false, `Use draft versions, when available.`)
42	tags  = flag.String("tags", "", "build tags to be included after +build directive")
43	pkg   = flag.String("package", "collate",
44		"the name of the package in which the generated file is to be included")
45
46	tables = flagStringSetAllowAll("tables", "collate", "collate,chars",
47		"comma-spearated list of tables to generate.")
48	exclude = flagStringSet("exclude", "zh2", "",
49		"comma-separated list of languages to exclude.")
50	include = flagStringSet("include", "", "",
51		"comma-separated list of languages to include. Include trumps exclude.")
52	// TODO: Not included: unihan gb2312han zhuyin big5han (for size reasons)
53	// TODO: Not included: traditional (buggy for Bengali)
54	types = flagStringSetAllowAll("types", "standard,phonebook,phonetic,reformed,pinyin,stroke", "",
55		"comma-separated list of types that should be included.")
56)
57
// stringSet implements an ordered set based on a list.  It implements flag.Value
// to allow a set to be specified as a comma-separated list.
type stringSet struct {
	s        []string   // the values; sorted and free of duplicates after compact has run
	allowed  *stringSet // if non-nil, the only values that Set accepts
	dirty    bool       // needs compaction if true
	all      bool       // selected by Set("all"); contains then reports true for every value
	allowAll bool       // whether Set gives the value "all" its special meaning
}
67
68func flagStringSet(name, def, allowed, usage string) *stringSet {
69	ss := &stringSet{}
70	if allowed != "" {
71		usage += fmt.Sprintf(" (allowed values: any of %s)", allowed)
72		ss.allowed = &stringSet{}
73		failOnError(ss.allowed.Set(allowed))
74	}
75	ss.Set(def)
76	flag.Var(ss, name, usage)
77	return ss
78}
79
80func flagStringSetAllowAll(name, def, allowed, usage string) *stringSet {
81	ss := &stringSet{allowAll: true}
82	if allowed == "" {
83		flag.Var(ss, name, usage+fmt.Sprintf(` Use "all" to select all.`))
84	} else {
85		ss.allowed = &stringSet{}
86		failOnError(ss.allowed.Set(allowed))
87		flag.Var(ss, name, usage+fmt.Sprintf(` (allowed values: "all" or any of %s)`, allowed))
88	}
89	ss.Set(def)
90	return ss
91}
92
93func (ss stringSet) Len() int {
94	return len(ss.s)
95}
96
97func (ss stringSet) String() string {
98	return strings.Join(ss.s, ",")
99}
100
101func (ss *stringSet) Set(s string) error {
102	if ss.allowAll && s == "all" {
103		ss.s = nil
104		ss.all = true
105		return nil
106	}
107	ss.s = ss.s[:0]
108	for _, s := range strings.Split(s, ",") {
109		if s := strings.TrimSpace(s); s != "" {
110			if ss.allowed != nil && !ss.allowed.contains(s) {
111				return fmt.Errorf("unsupported value %q; must be one of %s", s, ss.allowed)
112			}
113			ss.add(s)
114		}
115	}
116	ss.compact()
117	return nil
118}
119
120func (ss *stringSet) add(s string) {
121	ss.s = append(ss.s, s)
122	ss.dirty = true
123}
124
125func (ss *stringSet) values() []string {
126	ss.compact()
127	return ss.s
128}
129
130func (ss *stringSet) contains(s string) bool {
131	if ss.all {
132		return true
133	}
134	for _, v := range ss.s {
135		if v == s {
136			return true
137		}
138	}
139	return false
140}
141
142func (ss *stringSet) compact() {
143	if !ss.dirty {
144		return
145	}
146	a := ss.s
147	sort.Strings(a)
148	k := 0
149	for i := 1; i < len(a); i++ {
150		if a[k] != a[i] {
151			a[k+1] = a[i]
152			k++
153		}
154	}
155	ss.s = a[:k+1]
156	ss.dirty = false
157}
158
159func skipLang(l string) bool {
160	if include.Len() > 0 {
161		return !include.contains(l)
162	}
163	return exclude.contains(l)
164}
165
166// altInclude returns a list of alternatives (for the LDML alt attribute)
167// in order of preference.  An empty string in this list indicates the
168// default entry.
169func altInclude() []string {
170	l := []string{}
171	if *short {
172		l = append(l, "short")
173	}
174	l = append(l, "")
175	// TODO: handle draft using cldr.SetDraftLevel
176	if *draft {
177		l = append(l, "proposed")
178	}
179	return l
180}
181
// failOnError logs e and panics if e is non-nil; a nil error is ignored.
func failOnError(e error) {
	if e == nil {
		return
	}
	log.Panic(e)
}
187
188func openArchive() *zip.Reader {
189	f := gen.OpenCLDRCoreZip()
190	buffer, err := ioutil.ReadAll(f)
191	f.Close()
192	failOnError(err)
193	archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer)))
194	failOnError(err)
195	return archive
196}
197
198// parseUCA parses a Default Unicode Collation Element Table of the format
199// specified in https://www.unicode.org/reports/tr10/#File_Format.
200// It returns the variable top.
201func parseUCA(builder *build.Builder) {
202	var r io.ReadCloser
203	var err error
204	for _, f := range openArchive().File {
205		if strings.HasSuffix(f.Name, "allkeys_CLDR.txt") {
206			r, err = f.Open()
207		}
208	}
209	if r == nil {
210		log.Fatal("File allkeys_CLDR.txt not found in archive.")
211	}
212	failOnError(err)
213	defer r.Close()
214	scanner := bufio.NewScanner(r)
215	colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`)
216	for i := 1; scanner.Scan(); i++ {
217		line := scanner.Text()
218		if len(line) == 0 || line[0] == '#' {
219			continue
220		}
221		if line[0] == '@' {
222			// parse properties
223			switch {
224			case strings.HasPrefix(line[1:], "version "):
225				a := strings.Split(line[1:], " ")
226				if a[1] != gen.UnicodeVersion() {
227					log.Fatalf("incompatible version %s; want %s", a[1], gen.UnicodeVersion())
228				}
229			case strings.HasPrefix(line[1:], "backwards "):
230				log.Fatalf("%d: unsupported option backwards", i)
231			default:
232				log.Printf("%d: unknown option %s", i, line[1:])
233			}
234		} else {
235			// parse entries
236			part := strings.Split(line, " ; ")
237			if len(part) != 2 {
238				log.Fatalf("%d: production rule without ';': %v", i, line)
239			}
240			lhs := []rune{}
241			for _, v := range strings.Split(part[0], " ") {
242				if v == "" {
243					continue
244				}
245				lhs = append(lhs, rune(convHex(i, v)))
246			}
247			var n int
248			var vars []int
249			rhs := [][]int{}
250			for i, m := range colelem.FindAllStringSubmatch(part[1], -1) {
251				n += len(m[0])
252				elem := []int{}
253				for _, h := range strings.Split(m[2], ".") {
254					elem = append(elem, convHex(i, h))
255				}
256				if m[1] == "*" {
257					vars = append(vars, i)
258				}
259				rhs = append(rhs, elem)
260			}
261			if len(part[1]) < n+3 || part[1][n+1] != '#' {
262				log.Fatalf("%d: expected comment; found %s", i, part[1][n:])
263			}
264			if *test {
265				testInput.add(string(lhs))
266			}
267			failOnError(builder.Add(lhs, rhs, vars))
268		}
269	}
270	if scanner.Err() != nil {
271		log.Fatal(scanner.Err())
272	}
273}
274
// convHex parses s as a hexadecimal number that fits in 32 bits and returns
// its value. On failure it terminates the program with a message prefixed by
// line.
func convHex(line int, s string) int {
	const base, bits = 16, 32
	v, err := strconv.ParseInt(s, base, bits)
	if err != nil {
		log.Fatalf("%d: %v", line, err)
	}
	return int(v)
}
282
// testInput collects the strings whose collation keys testCollator compares.
// parseUCA and processor.Insert add to it when the -test flag is set.
var testInput = stringSet{}

// charRe matches a hexadecimal XML character reference such as &#x41; and
// captures its (upper-case) hex digits.
var charRe = regexp.MustCompile(`&#x([0-9A-F]*);`)

// tagRe matches an empty XML element such as <name />, with at least one
// space before the "/>", and captures the element name.
var tagRe = regexp.MustCompile(`<([a-z_]*)  */>`)

// mainLocales lists the locales that have an entry in localeChars, in the
// order in which parseMain first recorded them.
var mainLocales = []string{}
289
// charSets holds a list of exemplar characters per category.
type charSets map[string][]string

// fprint writes p to w as the body of a Go composite literal of type
// [exN]string. Each non-empty category becomes one indexed element holding
// its entries joined by single spaces; empty categories are omitted.
func (p charSets) fprint(w io.Writer) {
	// The position of a category in this list is the index it is written
	// under.
	categories := [...]string{"", "contractions", "punctuation", "auxiliary", "currencySymbol", "index"}

	fmt.Fprintln(w, "[exN]string{")
	for idx, category := range categories {
		set := p[category]
		if len(set) == 0 {
			continue
		}
		fmt.Fprintf(w, "\t\t%d: %q,\n", idx, strings.Join(set, " "))
	}
	fmt.Fprintln(w, "\t},")
}
302
// localeChars maps a locale name to the exemplar character sets that
// parseMain recorded for it.
var localeChars = make(map[string]charSets)

// exemplarHeader is the Go source that printExemplarCharacters emits ahead of
// the exemplar character table. It declares the exemplarType constants; their
// order matches the category order used by charSets.fprint, and exN is the
// length of the generated arrays.
const exemplarHeader = `
type exemplarType int
const (
	exCharacters exemplarType = iota
	exContractions
	exPunctuation
	exAuxiliary
	exCurrency
	exIndex
	exN
)
`
317
318func printExemplarCharacters(w io.Writer) {
319	fmt.Fprintln(w, exemplarHeader)
320	fmt.Fprintln(w, "var exemplarCharacters = map[string][exN]string{")
321	for _, loc := range mainLocales {
322		fmt.Fprintf(w, "\t%q: ", loc)
323		localeChars[loc].fprint(w)
324	}
325	fmt.Fprintln(w, "}")
326}
327
328func decodeCLDR(d *cldr.Decoder) *cldr.CLDR {
329	r := gen.OpenCLDRCoreZip()
330	data, err := d.DecodeZip(r)
331	failOnError(err)
332	return data
333}
334
335// parseMain parses XML files in the main directory of the CLDR core.zip file.
336func parseMain() {
337	d := &cldr.Decoder{}
338	d.SetDirFilter("main")
339	d.SetSectionFilter("characters")
340	data := decodeCLDR(d)
341	for _, loc := range data.Locales() {
342		x := data.RawLDML(loc)
343		if skipLang(x.Identity.Language.Type) {
344			continue
345		}
346		if x.Characters != nil {
347			x, _ = data.LDML(loc)
348			loc = language.Make(loc).String()
349			for _, ec := range x.Characters.ExemplarCharacters {
350				if ec.Draft != "" {
351					continue
352				}
353				if _, ok := localeChars[loc]; !ok {
354					mainLocales = append(mainLocales, loc)
355					localeChars[loc] = make(charSets)
356				}
357				localeChars[loc][ec.Type] = parseCharacters(ec.Data())
358			}
359		}
360	}
361}
362
// parseCharacters expands an exemplar character set such as "[a-c {ch} \-]"
// into the list of strings it denotes.
//
// Surrounding white space and one enclosing pair of square brackets are
// removed. Within the set:
//   - {...} denotes a character sequence and yields one multi-rune string;
//   - x-y denotes the range from x through y, each rune yielding one string;
//   - a backslash escapes the rune that follows it, so that "\-" is a literal
//     hyphen; a backslash at the very end stands for itself;
//   - spaces separate entries and are otherwise ignored.
//
// An empty set yields an empty, non-nil list. A space inside a sequence and a
// '-' that has no character on both sides terminate the program.
func parseCharacters(chars string) []string {
	// parseSingle decodes one rune, possibly preceded by a backslash, from
	// the front of the non-empty string s.
	parseSingle := func(s string) (rune, string, bool) {
		if s[0] == '\\' && len(s) > 1 {
			// Decode a full rune after the backslash; the escaped character
			// need not be ASCII.
			r, sz := utf8.DecodeRuneInString(s[1:])
			return r, s[1+sz:], true
		}
		r, sz := utf8.DecodeRuneInString(s)
		return r, s[sz:], false
	}
	chars = strings.TrimSpace(chars)
	// n >= 1 also guards against indexing into an empty string.
	if n := len(chars) - 1; n >= 1 && chars[0] == '[' && chars[n] == ']' {
		chars = chars[1:n]
	}
	list := []string{}
	var last rune // previous single character, or 0 if a range cannot start here
	for len(chars) > 0 {
		if chars[0] == '{' { // character sequence
			var seq []rune
			for chars = chars[1:]; len(chars) > 0; {
				var r rune
				r, chars, _ = parseSingle(chars)
				if r == '}' {
					break
				}
				if r == ' ' {
					log.Fatalf("space not supported in sequence %q", chars)
				}
				seq = append(seq, r)
			}
			list = append(list, string(seq))
			last = 0
			continue
		}
		// single character
		var r rune
		var escaped bool
		r, chars, escaped = parseSingle(chars)
		switch {
		case r == ' ':
			// separator
		case r == '-' && !escaped:
			if last == 0 {
				log.Fatal("'-' should be preceded by a character")
			}
			if len(chars) == 0 {
				log.Fatal("'-' should be followed by a character")
			}
			var end rune
			end, chars, _ = parseSingle(chars)
			// The first rune of the range is already in the list, so start
			// with its successor.
			for c := last + 1; c <= end; c++ {
				list = append(list, string(c))
			}
			last = 0
		default:
			list = append(list, string(r))
			last = r
		}
	}
	return list
}
414
// fileRe matches a path containing "/collation/<name>.xml" and captures
// <name>.
// NOTE(review): not referenced in the visible part of this file — confirm
// whether it is still needed.
var fileRe = regexp.MustCompile(`.*/collation/(.*)\.xml`)
416
// typeMap translates legacy type keys to their BCP47 equivalent.
// parseCollation consults it for collation types longer than 8 bytes, so it
// lists the legacy names of that length that have a shorter BCP47 form.
var typeMap = map[string]string{
	"dictionary":  "dict",
	"gb2312han":   "gb2312",
	"phonebook":   "phonebk",
	"traditional": "trad",
}
422
423// parseCollation parses XML files in the collation directory of the CLDR core.zip file.
424func parseCollation(b *build.Builder) {
425	d := &cldr.Decoder{}
426	d.SetDirFilter("collation")
427	data := decodeCLDR(d)
428	for _, loc := range data.Locales() {
429		x, err := data.LDML(loc)
430		failOnError(err)
431		if skipLang(x.Identity.Language.Type) {
432			continue
433		}
434		cs := x.Collations.Collation
435		sl := cldr.MakeSlice(&cs)
436		if len(types.s) == 0 {
437			sl.SelectAnyOf("type", x.Collations.Default())
438		} else if !types.all {
439			sl.SelectAnyOf("type", types.s...)
440		}
441		sl.SelectOnePerGroup("alt", altInclude())
442
443		for _, c := range cs {
444			id, err := language.Parse(loc)
445			if err != nil {
446				fmt.Fprintf(os.Stderr, "invalid locale: %q", err)
447				continue
448			}
449			// Support both old- and new-style defaults.
450			d := c.Type
451			if x.Collations.DefaultCollation == nil {
452				d = x.Collations.Default()
453			} else {
454				d = x.Collations.DefaultCollation.Data()
455			}
456			// We assume tables are being built either for search or collation,
457			// but not both. For search the default is always "search".
458			if d != c.Type && c.Type != "search" {
459				typ := c.Type
460				if len(c.Type) > 8 {
461					typ = typeMap[c.Type]
462				}
463				id, err = id.SetTypeForKey("co", typ)
464				failOnError(err)
465			}
466			t := b.Tailoring(id)
467			c.Process(processor{t})
468		}
469	}
470}
471
// processor receives the callbacks made by c.Process in parseCollation and
// applies them to a single tailoring.
type processor struct {
	t *build.Tailoring // the tailoring that Reset and Insert operate on
}
475
476func (p processor) Reset(anchor string, before int) (err error) {
477	if before != 0 {
478		err = p.t.SetAnchorBefore(anchor)
479	} else {
480		err = p.t.SetAnchor(anchor)
481	}
482	failOnError(err)
483	return nil
484}
485
486func (p processor) Insert(level int, str, context, extend string) error {
487	str = context + str
488	if *test {
489		testInput.add(str)
490	}
491	// TODO: mimic bug in old maketables: remove.
492	err := p.t.Insert(colltab.Level(level-1), str, context+extend)
493	failOnError(err)
494	return nil
495}
496
// Index ignores id and does nothing.
// NOTE(review): presumably present only so that processor provides the full
// set of callbacks expected by c.Process — confirm against the cldr package.
func (p processor) Index(id string) {
}
499
500func testCollator(c *collate.Collator) {
501	c0 := collate.New(language.Und)
502
503	// iterator over all characters for all locales and check
504	// whether Key is equal.
505	buf := collate.Buffer{}
506
507	// Add all common and not too uncommon runes to the test set.
508	for i := rune(0); i < 0x30000; i++ {
509		testInput.add(string(i))
510	}
511	for i := rune(0xE0000); i < 0xF0000; i++ {
512		testInput.add(string(i))
513	}
514	for _, str := range testInput.values() {
515		k0 := c0.KeyFromString(&buf, str)
516		k := c.KeyFromString(&buf, str)
517		if !bytes.Equal(k0, k) {
518			failOnError(fmt.Errorf("test:%U: keys differ (%x vs %x)", []rune(str), k0, k))
519		}
520		buf.Reset()
521	}
522	fmt.Println("PASS")
523}
524
525func main() {
526	gen.Init()
527	b := build.NewBuilder()
528	parseUCA(b)
529	if tables.contains("chars") {
530		parseMain()
531	}
532	parseCollation(b)
533
534	c, err := b.Build()
535	failOnError(err)
536
537	if *test {
538		testCollator(collate.NewFromTable(c))
539	} else {
540		w := &bytes.Buffer{}
541
542		gen.WriteUnicodeVersion(w)
543		gen.WriteCLDRVersion(w)
544
545		if tables.contains("collate") {
546			_, err = b.Print(w)
547			failOnError(err)
548		}
549		if tables.contains("chars") {
550			printExemplarCharacters(w)
551		}
552		gen.WriteGoFile("tables.go", *pkg, w.Bytes())
553	}
554}
555