1// Copyright 2012 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build ignore
6
7// Collation table generator.
8// Data read from the web.
9
10package main
11
12import (
13	"archive/zip"
14	"bufio"
15	"bytes"
16	"flag"
17	"fmt"
18	"io"
19	"io/ioutil"
20	"log"
21	"os"
22	"regexp"
23	"sort"
24	"strconv"
25	"strings"
26	"unicode/utf8"
27
28	"golang.org/x/text/collate"
29	"golang.org/x/text/collate/build"
30	"golang.org/x/text/internal/colltab"
31	"golang.org/x/text/internal/gen"
32	"golang.org/x/text/language"
33	"golang.org/x/text/unicode/cldr"
34)
35
36var (
37	test = flag.Bool("test", false,
38		"test existing tables; can be used to compare web data with package data.")
39	short = flag.Bool("short", false, `Use "short" alternatives, when available.`)
40	draft = flag.Bool("draft", false, `Use draft versions, when available.`)
41	tags  = flag.String("tags", "", "build tags to be included after +build directive")
42	pkg   = flag.String("package", "collate",
43		"the name of the package in which the generated file is to be included")
44
45	tables = flagStringSetAllowAll("tables", "collate", "collate,chars",
46		"comma-spearated list of tables to generate.")
47	exclude = flagStringSet("exclude", "zh2", "",
48		"comma-separated list of languages to exclude.")
49	include = flagStringSet("include", "", "",
50		"comma-separated list of languages to include. Include trumps exclude.")
51	// TODO: Not included: unihan gb2312han zhuyin big5han (for size reasons)
52	// TODO: Not included: traditional (buggy for Bengali)
53	types = flagStringSetAllowAll("types", "standard,phonebook,phonetic,reformed,pinyin,stroke", "",
54		"comma-separated list of types that should be included.")
55)
56
// stringSet is an ordered set of strings kept in a slice. Together with its
// methods it implements flag.Value, so a set can be given on the command line
// as a comma-separated list.
type stringSet struct {
	// s holds the members. It is sorted and free of duplicates whenever
	// dirty is false.
	s []string
	// allowed, when non-nil, lists the only values Set accepts.
	allowed *stringSet
	// dirty is set when s needs compaction.
	dirty bool
	// all makes contains report true for every value.
	all bool
	// allowAll lets Set interpret the value "all" as "select everything".
	allowAll bool
}
66
67func flagStringSet(name, def, allowed, usage string) *stringSet {
68	ss := &stringSet{}
69	if allowed != "" {
70		usage += fmt.Sprintf(" (allowed values: any of %s)", allowed)
71		ss.allowed = &stringSet{}
72		failOnError(ss.allowed.Set(allowed))
73	}
74	ss.Set(def)
75	flag.Var(ss, name, usage)
76	return ss
77}
78
79func flagStringSetAllowAll(name, def, allowed, usage string) *stringSet {
80	ss := &stringSet{allowAll: true}
81	if allowed == "" {
82		flag.Var(ss, name, usage+fmt.Sprintf(` Use "all" to select all.`))
83	} else {
84		ss.allowed = &stringSet{}
85		failOnError(ss.allowed.Set(allowed))
86		flag.Var(ss, name, usage+fmt.Sprintf(` (allowed values: "all" or any of %s)`, allowed))
87	}
88	ss.Set(def)
89	return ss
90}
91
92func (ss stringSet) Len() int {
93	return len(ss.s)
94}
95
96func (ss stringSet) String() string {
97	return strings.Join(ss.s, ",")
98}
99
100func (ss *stringSet) Set(s string) error {
101	if ss.allowAll && s == "all" {
102		ss.s = nil
103		ss.all = true
104		return nil
105	}
106	ss.s = ss.s[:0]
107	for _, s := range strings.Split(s, ",") {
108		if s := strings.TrimSpace(s); s != "" {
109			if ss.allowed != nil && !ss.allowed.contains(s) {
110				return fmt.Errorf("unsupported value %q; must be one of %s", s, ss.allowed)
111			}
112			ss.add(s)
113		}
114	}
115	ss.compact()
116	return nil
117}
118
119func (ss *stringSet) add(s string) {
120	ss.s = append(ss.s, s)
121	ss.dirty = true
122}
123
124func (ss *stringSet) values() []string {
125	ss.compact()
126	return ss.s
127}
128
129func (ss *stringSet) contains(s string) bool {
130	if ss.all {
131		return true
132	}
133	for _, v := range ss.s {
134		if v == s {
135			return true
136		}
137	}
138	return false
139}
140
141func (ss *stringSet) compact() {
142	if !ss.dirty {
143		return
144	}
145	a := ss.s
146	sort.Strings(a)
147	k := 0
148	for i := 1; i < len(a); i++ {
149		if a[k] != a[i] {
150			a[k+1] = a[i]
151			k++
152		}
153	}
154	ss.s = a[:k+1]
155	ss.dirty = false
156}
157
158func skipLang(l string) bool {
159	if include.Len() > 0 {
160		return !include.contains(l)
161	}
162	return exclude.contains(l)
163}
164
165// altInclude returns a list of alternatives (for the LDML alt attribute)
166// in order of preference.  An empty string in this list indicates the
167// default entry.
168func altInclude() []string {
169	l := []string{}
170	if *short {
171		l = append(l, "short")
172	}
173	l = append(l, "")
174	// TODO: handle draft using cldr.SetDraftLevel
175	if *draft {
176		l = append(l, "proposed")
177	}
178	return l
179}
180
// failOnError logs e and panics if e is non-nil; a nil error is ignored.
func failOnError(e error) {
	if e == nil {
		return
	}
	log.Panic(e)
}
186
187func openArchive() *zip.Reader {
188	f := gen.OpenCLDRCoreZip()
189	buffer, err := ioutil.ReadAll(f)
190	f.Close()
191	failOnError(err)
192	archive, err := zip.NewReader(bytes.NewReader(buffer), int64(len(buffer)))
193	failOnError(err)
194	return archive
195}
196
197// parseUCA parses a Default Unicode Collation Element Table of the format
198// specified in http://www.unicode.org/reports/tr10/#File_Format.
199// It returns the variable top.
200func parseUCA(builder *build.Builder) {
201	var r io.ReadCloser
202	var err error
203	for _, f := range openArchive().File {
204		if strings.HasSuffix(f.Name, "allkeys_CLDR.txt") {
205			r, err = f.Open()
206		}
207	}
208	if r == nil {
209		log.Fatal("File allkeys_CLDR.txt not found in archive.")
210	}
211	failOnError(err)
212	defer r.Close()
213	scanner := bufio.NewScanner(r)
214	colelem := regexp.MustCompile(`\[([.*])([0-9A-F.]+)\]`)
215	for i := 1; scanner.Scan(); i++ {
216		line := scanner.Text()
217		if len(line) == 0 || line[0] == '#' {
218			continue
219		}
220		if line[0] == '@' {
221			// parse properties
222			switch {
223			case strings.HasPrefix(line[1:], "version "):
224				a := strings.Split(line[1:], " ")
225				if a[1] != gen.UnicodeVersion() {
226					log.Fatalf("incompatible version %s; want %s", a[1], gen.UnicodeVersion())
227				}
228			case strings.HasPrefix(line[1:], "backwards "):
229				log.Fatalf("%d: unsupported option backwards", i)
230			default:
231				log.Printf("%d: unknown option %s", i, line[1:])
232			}
233		} else {
234			// parse entries
235			part := strings.Split(line, " ; ")
236			if len(part) != 2 {
237				log.Fatalf("%d: production rule without ';': %v", i, line)
238			}
239			lhs := []rune{}
240			for _, v := range strings.Split(part[0], " ") {
241				if v == "" {
242					continue
243				}
244				lhs = append(lhs, rune(convHex(i, v)))
245			}
246			var n int
247			var vars []int
248			rhs := [][]int{}
249			for i, m := range colelem.FindAllStringSubmatch(part[1], -1) {
250				n += len(m[0])
251				elem := []int{}
252				for _, h := range strings.Split(m[2], ".") {
253					elem = append(elem, convHex(i, h))
254				}
255				if m[1] == "*" {
256					vars = append(vars, i)
257				}
258				rhs = append(rhs, elem)
259			}
260			if len(part[1]) < n+3 || part[1][n+1] != '#' {
261				log.Fatalf("%d: expected comment; found %s", i, part[1][n:])
262			}
263			if *test {
264				testInput.add(string(lhs))
265			}
266			failOnError(builder.Add(lhs, rhs, vars))
267		}
268	}
269	if scanner.Err() != nil {
270		log.Fatal(scanner.Err())
271	}
272}
273
// convHex parses s as a hexadecimal number of at most 32 bits and returns its
// value. A string that cannot be parsed is fatal; line only labels the error
// message.
func convHex(line int, s string) int {
	v, err := strconv.ParseInt(s, 16, 32)
	if err == nil {
		return int(v)
	}
	log.Fatalf("%d: %v", line, err)
	return int(v) // only reached if log.Fatalf returns
}
281
282var testInput = stringSet{}
283
284var charRe = regexp.MustCompile(`&#x([0-9A-F]*);`)
285var tagRe = regexp.MustCompile(`<([a-z_]*)  */>`)
286
287var mainLocales = []string{}
288
// charSets holds a list of exemplar characters per category, keyed by the
// category name.
type charSets map[string][]string

// fprint writes p to w as the Go composite literal of one entry of the
// exemplar table. Each non-empty category is written at the index given by
// its position in the fixed category order; other keys of p are ignored.
func (p charSets) fprint(w io.Writer) {
	categories := [...]string{"", "contractions", "punctuation", "auxiliary", "currencySymbol", "index"}
	fmt.Fprintln(w, "[exN]string{")
	for idx, name := range categories {
		members := p[name]
		if len(members) == 0 {
			continue
		}
		fmt.Fprintf(w, "\t\t%d: %q,\n", idx, strings.Join(members, " "))
	}
	fmt.Fprintln(w, "\t},")
}
301
302var localeChars = make(map[string]charSets)
303
304const exemplarHeader = `
305type exemplarType int
306const (
307	exCharacters exemplarType = iota
308	exContractions
309	exPunctuation
310	exAuxiliary
311	exCurrency
312	exIndex
313	exN
314)
315`
316
317func printExemplarCharacters(w io.Writer) {
318	fmt.Fprintln(w, exemplarHeader)
319	fmt.Fprintln(w, "var exemplarCharacters = map[string][exN]string{")
320	for _, loc := range mainLocales {
321		fmt.Fprintf(w, "\t%q: ", loc)
322		localeChars[loc].fprint(w)
323	}
324	fmt.Fprintln(w, "}")
325}
326
327func decodeCLDR(d *cldr.Decoder) *cldr.CLDR {
328	r := gen.OpenCLDRCoreZip()
329	data, err := d.DecodeZip(r)
330	failOnError(err)
331	return data
332}
333
334// parseMain parses XML files in the main directory of the CLDR core.zip file.
335func parseMain() {
336	d := &cldr.Decoder{}
337	d.SetDirFilter("main")
338	d.SetSectionFilter("characters")
339	data := decodeCLDR(d)
340	for _, loc := range data.Locales() {
341		x := data.RawLDML(loc)
342		if skipLang(x.Identity.Language.Type) {
343			continue
344		}
345		if x.Characters != nil {
346			x, _ = data.LDML(loc)
347			loc = language.Make(loc).String()
348			for _, ec := range x.Characters.ExemplarCharacters {
349				if ec.Draft != "" {
350					continue
351				}
352				if _, ok := localeChars[loc]; !ok {
353					mainLocales = append(mainLocales, loc)
354					localeChars[loc] = make(charSets)
355				}
356				localeChars[loc][ec.Type] = parseCharacters(ec.Data())
357			}
358		}
359	}
360}
361
// parseCharacters expands an exemplar character set into the list of its
// members, in the order they appear.
//
// Surrounding white space and one pair of enclosing square brackets are
// removed. Within the set, spaces separate items, {...} denotes a sequence
// that becomes a single member, a-b denotes the inclusive range from a to b,
// and a backslash makes the following character literal. Empty input yields
// an empty list.
//
// A space inside a sequence and a '-' that does not sit between two
// characters are fatal.
func parseCharacters(chars string) []string {
	// parseSingle consumes one character, possibly backslash-escaped, from
	// the non-empty string s.
	parseSingle := func(s string) (r rune, tail string, escaped bool) {
		if s[0] == '\\' && len(s) > 1 {
			// Decode a whole rune after the backslash so that an escaped
			// non-ASCII character is not cut into bytes.
			c, sz := utf8.DecodeRuneInString(s[1:])
			return c, s[1+sz:], true
		}
		// This includes a lone trailing backslash, which is kept literally.
		c, sz := utf8.DecodeRuneInString(s)
		return c, s[sz:], false
	}
	chars = strings.TrimSpace(chars)
	if n := len(chars) - 1; n >= 1 && chars[0] == '[' && chars[n] == ']' {
		chars = chars[1:n]
	}
	list := []string{}
	// last is the most recent single character, the possible start of a
	// range; 0 means there is none.
	var last rune
	for len(chars) > 0 {
		if chars[0] == '{' { // character sequence
			buf := []rune{}
			for chars = chars[1:]; len(chars) > 0; {
				var r rune
				var esc bool
				r, chars, esc = parseSingle(chars)
				// Only an unescaped brace ends the sequence.
				if r == '}' && !esc {
					break
				}
				if r == ' ' {
					log.Fatalf("space not supported in sequence %q", chars)
				}
				buf = append(buf, r)
			}
			list = append(list, string(buf))
			last = 0
			continue
		}
		// single character
		var r rune
		var escaped bool
		r, chars, escaped = parseSingle(chars)
		switch {
		case r == ' ':
			// separator
		case r == '-' && !escaped:
			if last == 0 {
				log.Fatal("'-' should be preceded by a character")
			}
			if len(chars) == 0 {
				log.Fatal("'-' should be followed by a character")
			}
			var end rune
			end, chars, _ = parseSingle(chars)
			// The start of the range was already added when it was read,
			// so continue with the character after it.
			for c := last + 1; c <= end; c++ {
				list = append(list, string(c))
			}
			last = 0
		default:
			list = append(list, string(r))
			last = r
		}
	}
	return list
}
413
// fileRe matches the path of an XML file in a collation directory and
// captures the file name without its extension.
var fileRe = regexp.MustCompile(`.*/collation/(.*)\.xml`)

// typeMap translates legacy type keys to their BCP47 equivalent. It needs an
// entry for every legacy collation type longer than 8 characters, as such a
// name cannot be used as a BCP47 type as is.
var typeMap = map[string]string{
	"dictionary":  "dict",
	"gb2312han":   "gb2312",
	"phonebook":   "phonebk",
	"traditional": "trad",
}
421
422// parseCollation parses XML files in the collation directory of the CLDR core.zip file.
423func parseCollation(b *build.Builder) {
424	d := &cldr.Decoder{}
425	d.SetDirFilter("collation")
426	data := decodeCLDR(d)
427	for _, loc := range data.Locales() {
428		x, err := data.LDML(loc)
429		failOnError(err)
430		if skipLang(x.Identity.Language.Type) {
431			continue
432		}
433		cs := x.Collations.Collation
434		sl := cldr.MakeSlice(&cs)
435		if len(types.s) == 0 {
436			sl.SelectAnyOf("type", x.Collations.Default())
437		} else if !types.all {
438			sl.SelectAnyOf("type", types.s...)
439		}
440		sl.SelectOnePerGroup("alt", altInclude())
441
442		for _, c := range cs {
443			id, err := language.Parse(loc)
444			if err != nil {
445				fmt.Fprintf(os.Stderr, "invalid locale: %q", err)
446				continue
447			}
448			// Support both old- and new-style defaults.
449			d := c.Type
450			if x.Collations.DefaultCollation == nil {
451				d = x.Collations.Default()
452			} else {
453				d = x.Collations.DefaultCollation.Data()
454			}
455			// We assume tables are being built either for search or collation,
456			// but not both. For search the default is always "search".
457			if d != c.Type && c.Type != "search" {
458				typ := c.Type
459				if len(c.Type) > 8 {
460					typ = typeMap[c.Type]
461				}
462				id, err = id.SetTypeForKey("co", typ)
463				failOnError(err)
464			}
465			t := b.Tailoring(id)
466			c.Process(processor{t})
467		}
468	}
469}
470
471type processor struct {
472	t *build.Tailoring
473}
474
475func (p processor) Reset(anchor string, before int) (err error) {
476	if before != 0 {
477		err = p.t.SetAnchorBefore(anchor)
478	} else {
479		err = p.t.SetAnchor(anchor)
480	}
481	failOnError(err)
482	return nil
483}
484
485func (p processor) Insert(level int, str, context, extend string) error {
486	str = context + str
487	if *test {
488		testInput.add(str)
489	}
490	// TODO: mimic bug in old maketables: remove.
491	err := p.t.Insert(colltab.Level(level-1), str, context+extend)
492	failOnError(err)
493	return nil
494}
495
496func (p processor) Index(id string) {
497}
498
499func testCollator(c *collate.Collator) {
500	c0 := collate.New(language.Und)
501
502	// iterator over all characters for all locales and check
503	// whether Key is equal.
504	buf := collate.Buffer{}
505
506	// Add all common and not too uncommon runes to the test set.
507	for i := rune(0); i < 0x30000; i++ {
508		testInput.add(string(i))
509	}
510	for i := rune(0xE0000); i < 0xF0000; i++ {
511		testInput.add(string(i))
512	}
513	for _, str := range testInput.values() {
514		k0 := c0.KeyFromString(&buf, str)
515		k := c.KeyFromString(&buf, str)
516		if !bytes.Equal(k0, k) {
517			failOnError(fmt.Errorf("test:%U: keys differ (%x vs %x)", []rune(str), k0, k))
518		}
519		buf.Reset()
520	}
521	fmt.Println("PASS")
522}
523
524func main() {
525	gen.Init()
526	b := build.NewBuilder()
527	parseUCA(b)
528	if tables.contains("chars") {
529		parseMain()
530	}
531	parseCollation(b)
532
533	c, err := b.Build()
534	failOnError(err)
535
536	if *test {
537		testCollator(collate.NewFromTable(c))
538	} else {
539		w := &bytes.Buffer{}
540
541		gen.WriteUnicodeVersion(w)
542		gen.WriteCLDRVersion(w)
543
544		if tables.contains("collate") {
545			_, err = b.Print(w)
546			failOnError(err)
547		}
548		if tables.contains("chars") {
549			printExemplarCharacters(w)
550		}
551		gen.WriteGoFile("tables.go", *pkg, w.Bytes())
552	}
553}
554