1// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build ignore
6
7// Unicode table generator.
8// Data read from the web.
9
10package main
11
12import (
13	"flag"
14	"fmt"
15	"log"
16	"os"
17	"regexp"
18	"sort"
19	"strings"
20	"unicode"
21
22	"golang.org/x/text/internal/gen"
23	"golang.org/x/text/internal/ucd"
24	"golang.org/x/text/unicode/rangetable"
25)
26
27func main() {
28	flag.Parse()
29	setupOutput()
30	loadChars() // always needed
31	loadCasefold()
32	printCategories()
33	printScriptOrProperty(false)
34	printScriptOrProperty(true)
35	printCases()
36	printLatinProperties()
37	printCasefold()
38	printSizes()
39	flushOutput()
40}
41
42func defaultVersion() string {
43	if v := os.Getenv("UNICODE_VERSION"); v != "" {
44		return v
45	}
46	return unicode.Version
47}
48
49var tablelist = flag.String("tables",
50	"all",
51	"comma-separated list of which tables to generate; can be letter")
52var scriptlist = flag.String("scripts",
53	"all",
54	"comma-separated list of which script tables to generate")
55var proplist = flag.String("props",
56	"all",
57	"comma-separated list of which property tables to generate")
58var cases = flag.Bool("cases",
59	true,
60	"generate case tables")
61var test = flag.Bool("test",
62	false,
63	"test existing tables; can be used to compare web data with package data")
64
65var scriptRe = regexp.MustCompile(`^([0-9A-F]+)(\.\.[0-9A-F]+)? *; ([A-Za-z_]+)$`)
66var logger = log.New(os.Stderr, "", log.Lshortfile)
67
68var output *gen.CodeWriter
69
70func setupOutput() {
71	output = gen.NewCodeWriter()
72}
73
74func flushOutput() {
75	output.WriteGoFile("tables.go", "unicode")
76}
77
78func printf(format string, args ...interface{}) {
79	fmt.Fprintf(output, format, args...)
80}
81
82func print(args ...interface{}) {
83	fmt.Fprint(output, args...)
84}
85
86func println(args ...interface{}) {
87	fmt.Fprintln(output, args...)
88}
89
90var category = map[string]bool{
91	// Nd Lu etc.
92	// We use one-character names to identify merged categories
93	"L": true, // Lu Ll Lt Lm Lo
94	"P": true, // Pc Pd Ps Pe Pu Pf Po
95	"M": true, // Mn Mc Me
96	"N": true, // Nd Nl No
97	"S": true, // Sm Sc Sk So
98	"Z": true, // Zs Zl Zp
99	"C": true, // Cc Cf Cs Co Cn
100}
101
102// This contains only the properties we're interested in.
103type Char struct {
104	codePoint rune // if zero, this index is not a valid code point.
105	category  string
106	upperCase rune
107	lowerCase rune
108	titleCase rune
109	foldCase  rune // simple case folding
110	caseOrbit rune // next in simple case folding orbit
111}
112
113const MaxChar = 0x10FFFF
114
115var chars = make([]Char, MaxChar+1)
116var scripts = make(map[string][]rune)
117var props = make(map[string][]rune) // a property looks like a script; can share the format
118
119func allCategories() []string {
120	a := make([]string, 0, len(category))
121	for k := range category {
122		a = append(a, k)
123	}
124	sort.Strings(a)
125	return a
126}
127
128func all(scripts map[string][]rune) []string {
129	a := make([]string, 0, len(scripts))
130	for k := range scripts {
131		a = append(a, k)
132	}
133	sort.Strings(a)
134	return a
135}
136
137func allCatFold(m map[string]map[rune]bool) []string {
138	a := make([]string, 0, len(m))
139	for k := range m {
140		a = append(a, k)
141	}
142	sort.Strings(a)
143	return a
144}
145
146func categoryOp(code rune, class uint8) bool {
147	category := chars[code].category
148	return len(category) > 0 && category[0] == class
149}
150
151func loadChars() {
152	ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
153		c := Char{codePoint: p.Rune(0)}
154
155		getRune := func(field int) rune {
156			if p.String(field) == "" {
157				return 0
158			}
159			return p.Rune(field)
160		}
161
162		c.category = p.String(ucd.GeneralCategory)
163		category[c.category] = true
164		switch c.category {
165		case "Nd":
166			// Decimal digit
167			p.Int(ucd.NumericValue)
168		case "Lu":
169			c.upperCase = getRune(ucd.CodePoint)
170			c.lowerCase = getRune(ucd.SimpleLowercaseMapping)
171			c.titleCase = getRune(ucd.SimpleTitlecaseMapping)
172		case "Ll":
173			c.upperCase = getRune(ucd.SimpleUppercaseMapping)
174			c.lowerCase = getRune(ucd.CodePoint)
175			c.titleCase = getRune(ucd.SimpleTitlecaseMapping)
176		case "Lt":
177			c.upperCase = getRune(ucd.SimpleUppercaseMapping)
178			c.lowerCase = getRune(ucd.SimpleLowercaseMapping)
179			c.titleCase = getRune(ucd.CodePoint)
180		default:
181			c.upperCase = getRune(ucd.SimpleUppercaseMapping)
182			c.lowerCase = getRune(ucd.SimpleLowercaseMapping)
183			c.titleCase = getRune(ucd.SimpleTitlecaseMapping)
184		}
185
186		chars[c.codePoint] = c
187	})
188}
189
190func loadCasefold() {
191	ucd.Parse(gen.OpenUCDFile("CaseFolding.txt"), func(p *ucd.Parser) {
192		kind := p.String(1)
193		if kind != "C" && kind != "S" {
194			// Only care about 'common' and 'simple' foldings.
195			return
196		}
197		p1 := p.Rune(0)
198		p2 := p.Rune(2)
199		chars[p1].foldCase = rune(p2)
200	})
201}
202
203var categoryMapping = map[string]string{
204	"Lu": "Letter, uppercase",
205	"Ll": "Letter, lowercase",
206	"Lt": "Letter, titlecase",
207	"Lm": "Letter, modifier",
208	"Lo": "Letter, other",
209	"Mn": "Mark, nonspacing",
210	"Mc": "Mark, spacing combining",
211	"Me": "Mark, enclosing",
212	"Nd": "Number, decimal digit",
213	"Nl": "Number, letter",
214	"No": "Number, other",
215	"Pc": "Punctuation, connector",
216	"Pd": "Punctuation, dash",
217	"Ps": "Punctuation, open",
218	"Pe": "Punctuation, close",
219	"Pi": "Punctuation, initial quote",
220	"Pf": "Punctuation, final quote",
221	"Po": "Punctuation, other",
222	"Sm": "Symbol, math",
223	"Sc": "Symbol, currency",
224	"Sk": "Symbol, modifier",
225	"So": "Symbol, other",
226	"Zs": "Separator, space",
227	"Zl": "Separator, line",
228	"Zp": "Separator, paragraph",
229	"Cc": "Other, control",
230	"Cf": "Other, format",
231	"Cs": "Other, surrogate",
232	"Co": "Other, private use",
233	"Cn": "Other, not assigned",
234}
235
236func printCategories() {
237	if *tablelist == "" {
238		return
239	}
240	// Find out which categories to dump
241	list := strings.Split(*tablelist, ",")
242	if *tablelist == "all" {
243		list = allCategories()
244	}
245	if *test {
246		fullCategoryTest(list)
247		return
248	}
249
250	println("// Version is the Unicode edition from which the tables are derived.")
251	printf("const Version = %q\n\n", gen.UnicodeVersion())
252
253	if *tablelist == "all" {
254		println("// Categories is the set of Unicode category tables.")
255		println("var Categories = map[string] *RangeTable {")
256		for _, k := range allCategories() {
257			printf("\t%q: %s,\n", k, k)
258		}
259		print("}\n\n")
260	}
261
262	decl := make(sort.StringSlice, len(list))
263	ndecl := 0
264	for _, name := range list {
265		if _, ok := category[name]; !ok {
266			logger.Fatal("unknown category", name)
267		}
268		// We generate an UpperCase name to serve as concise documentation and an _UnderScored
269		// name to store the data. This stops godoc dumping all the tables but keeps them
270		// available to clients.
271		// Cases deserving special comments
272		varDecl := ""
273		switch name {
274		case "C":
275			varDecl = "\tOther = _C;	// Other/C is the set of Unicode control and special characters, category C.\n"
276			varDecl += "\tC = _C\n"
277		case "L":
278			varDecl = "\tLetter = _L;	// Letter/L is the set of Unicode letters, category L.\n"
279			varDecl += "\tL = _L\n"
280		case "M":
281			varDecl = "\tMark = _M;	// Mark/M is the set of Unicode mark characters, category M.\n"
282			varDecl += "\tM = _M\n"
283		case "N":
284			varDecl = "\tNumber = _N;	// Number/N is the set of Unicode number characters, category N.\n"
285			varDecl += "\tN = _N\n"
286		case "P":
287			varDecl = "\tPunct = _P;	// Punct/P is the set of Unicode punctuation characters, category P.\n"
288			varDecl += "\tP = _P\n"
289		case "S":
290			varDecl = "\tSymbol = _S;	// Symbol/S is the set of Unicode symbol characters, category S.\n"
291			varDecl += "\tS = _S\n"
292		case "Z":
293			varDecl = "\tSpace = _Z;	// Space/Z is the set of Unicode space characters, category Z.\n"
294			varDecl += "\tZ = _Z\n"
295		case "Nd":
296			varDecl = "\tDigit = _Nd;	// Digit is the set of Unicode characters with the \"decimal digit\" property.\n"
297		case "Lu":
298			varDecl = "\tUpper = _Lu;	// Upper is the set of Unicode upper case letters.\n"
299		case "Ll":
300			varDecl = "\tLower = _Ll;	// Lower is the set of Unicode lower case letters.\n"
301		case "Lt":
302			varDecl = "\tTitle = _Lt;	// Title is the set of Unicode title case letters.\n"
303		}
304		if len(name) > 1 {
305			desc, ok := categoryMapping[name]
306			if ok {
307				varDecl += fmt.Sprintf(
308					"\t%s = _%s;	// %s is the set of Unicode characters in category %s (%s).\n",
309					name, name, name, name, desc)
310			} else {
311				varDecl += fmt.Sprintf(
312					"\t%s = _%s;	// %s is the set of Unicode characters in category %s.\n",
313					name, name, name, name)
314			}
315		}
316		decl[ndecl] = varDecl
317		ndecl++
318		if len(name) == 1 { // unified categories
319			dumpRange(
320				"_"+name,
321				func(code rune) bool { return categoryOp(code, name[0]) })
322			continue
323		}
324		dumpRange("_"+name,
325			func(code rune) bool { return chars[code].category == name })
326	}
327	decl.Sort()
328	println("// These variables have type *RangeTable.")
329	println("var (")
330	for _, d := range decl {
331		print(d)
332	}
333	print(")\n\n")
334}
335
336type Op func(code rune) bool
337
338func dumpRange(name string, inCategory Op) {
339	runes := []rune{}
340	for i := range chars {
341		r := rune(i)
342		if inCategory(r) {
343			runes = append(runes, r)
344		}
345	}
346	printRangeTable(name, runes)
347}
348
349func printRangeTable(name string, runes []rune) {
350	rt := rangetable.New(runes...)
351	printf("var %s = &RangeTable{\n", name)
352	println("\tR16: []Range16{")
353	for _, r := range rt.R16 {
354		printf("\t\t{%#04x, %#04x, %d},\n", r.Lo, r.Hi, r.Stride)
355		range16Count++
356	}
357	println("\t},")
358	if len(rt.R32) > 0 {
359		println("\tR32: []Range32{")
360		for _, r := range rt.R32 {
361			printf("\t\t{%#x, %#x, %d},\n", r.Lo, r.Hi, r.Stride)
362			range32Count++
363		}
364		println("\t},")
365	}
366	if rt.LatinOffset > 0 {
367		printf("\tLatinOffset: %d,\n", rt.LatinOffset)
368	}
369	printf("}\n\n")
370}
371
372func fullCategoryTest(list []string) {
373	for _, name := range list {
374		if _, ok := category[name]; !ok {
375			logger.Fatal("unknown category", name)
376		}
377		r, ok := unicode.Categories[name]
378		if !ok && len(name) > 1 {
379			logger.Fatalf("unknown table %q", name)
380		}
381		if len(name) == 1 {
382			verifyRange(name, func(code rune) bool { return categoryOp(code, name[0]) }, r)
383		} else {
384			verifyRange(
385				name,
386				func(code rune) bool { return chars[code].category == name },
387				r)
388		}
389	}
390}
391
392func verifyRange(name string, inCategory Op, table *unicode.RangeTable) {
393	count := 0
394	for j := range chars {
395		i := rune(j)
396		web := inCategory(i)
397		pkg := unicode.Is(table, i)
398		if web != pkg {
399			fmt.Fprintf(os.Stderr, "%s: %U: web=%t pkg=%t\n", name, i, web, pkg)
400			count++
401			if count > 10 {
402				break
403			}
404		}
405	}
406}
407
408func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]rune) {
409	for _, name := range list {
410		if _, ok := scripts[name]; !ok {
411			logger.Fatal("unknown script", name)
412		}
413		_, ok := installed[name]
414		if !ok {
415			logger.Fatal("unknown table", name)
416		}
417		for _, r := range scripts[name] {
418			if !unicode.Is(installed[name], rune(r)) {
419				fmt.Fprintf(os.Stderr, "%U: not in script %s\n", r, name)
420			}
421		}
422	}
423}
424
425var deprecatedAliases = map[string]string{
426	"Sentence_Terminal": "STerm",
427}
428
429// PropList.txt has the same format as Scripts.txt so we can share its parser.
430func printScriptOrProperty(doProps bool) {
431	flaglist := *scriptlist
432	file := "Scripts.txt"
433	table := scripts
434	installed := unicode.Scripts
435	if doProps {
436		flaglist = *proplist
437		file = "PropList.txt"
438		table = props
439		installed = unicode.Properties
440	}
441	if flaglist == "" {
442		return
443	}
444	ucd.Parse(gen.OpenUCDFile(file), func(p *ucd.Parser) {
445		name := p.String(1)
446		table[name] = append(table[name], p.Rune(0))
447	})
448	// Find out which scripts to dump
449	list := strings.Split(flaglist, ",")
450	if flaglist == "all" {
451		list = all(table)
452	}
453	if *test {
454		fullScriptTest(list, installed, table)
455		return
456	}
457
458	if flaglist == "all" {
459		if doProps {
460			println("// Properties is the set of Unicode property tables.")
461			println("var Properties = map[string] *RangeTable{")
462		} else {
463			println("// Scripts is the set of Unicode script tables.")
464			println("var Scripts = map[string] *RangeTable{")
465		}
466		for _, k := range all(table) {
467			printf("\t%q: %s,\n", k, k)
468			if alias, ok := deprecatedAliases[k]; ok {
469				printf("\t%q: %s,\n", alias, k)
470			}
471		}
472		print("}\n\n")
473	}
474
475	decl := make(sort.StringSlice, len(list)+len(deprecatedAliases))
476	ndecl := 0
477	for _, name := range list {
478		if doProps {
479			decl[ndecl] = fmt.Sprintf(
480				"\t%s = _%s;\t// %s is the set of Unicode characters with property %s.\n",
481				name, name, name, name)
482		} else {
483			decl[ndecl] = fmt.Sprintf(
484				"\t%s = _%s;\t// %s is the set of Unicode characters in script %s.\n",
485				name, name, name, name)
486		}
487		ndecl++
488		if alias, ok := deprecatedAliases[name]; ok {
489			decl[ndecl] = fmt.Sprintf(
490				"\t%[1]s = _%[2]s;\t// %[1]s is an alias for %[2]s.\n",
491				alias, name)
492			ndecl++
493		}
494		printRangeTable("_"+name, table[name])
495	}
496	decl.Sort()
497	println("// These variables have type *RangeTable.")
498	println("var (")
499	for _, d := range decl {
500		print(d)
501	}
502	print(")\n\n")
503}
504
505const (
506	CaseUpper = 1 << iota
507	CaseLower
508	CaseTitle
509	CaseNone    = 0  // must be zero
510	CaseMissing = -1 // character not present; not a valid case state
511)
512
513type caseState struct {
514	point        rune
515	_case        int
516	deltaToUpper rune
517	deltaToLower rune
518	deltaToTitle rune
519}
520
521// Is d a continuation of the state of c?
522func (c *caseState) adjacent(d *caseState) bool {
523	if d.point < c.point {
524		c, d = d, c
525	}
526	switch {
527	case d.point != c.point+1: // code points not adjacent (shouldn't happen)
528		return false
529	case d._case != c._case: // different cases
530		return c.upperLowerAdjacent(d)
531	case c._case == CaseNone:
532		return false
533	case c._case == CaseMissing:
534		return false
535	case d.deltaToUpper != c.deltaToUpper:
536		return false
537	case d.deltaToLower != c.deltaToLower:
538		return false
539	case d.deltaToTitle != c.deltaToTitle:
540		return false
541	}
542	return true
543}
544
545// Is d the same as c, but opposite in upper/lower case? this would make it
546// an element of an UpperLower sequence.
547func (c *caseState) upperLowerAdjacent(d *caseState) bool {
548	// check they're a matched case pair.  we know they have adjacent values
549	switch {
550	case c._case == CaseUpper && d._case != CaseLower:
551		return false
552	case c._case == CaseLower && d._case != CaseUpper:
553		return false
554	}
555	// matched pair (at least in upper/lower).  make the order Upper Lower
556	if c._case == CaseLower {
557		c, d = d, c
558	}
559	// for an Upper Lower sequence the deltas have to be in order
560	//	c: 0 1 0
561	//	d: -1 0 -1
562	switch {
563	case c.deltaToUpper != 0:
564		return false
565	case c.deltaToLower != 1:
566		return false
567	case c.deltaToTitle != 0:
568		return false
569	case d.deltaToUpper != -1:
570		return false
571	case d.deltaToLower != 0:
572		return false
573	case d.deltaToTitle != -1:
574		return false
575	}
576	return true
577}
578
579// Does this character start an UpperLower sequence?
580func (c *caseState) isUpperLower() bool {
581	// for an Upper Lower sequence the deltas have to be in order
582	//	c: 0 1 0
583	switch {
584	case c.deltaToUpper != 0:
585		return false
586	case c.deltaToLower != 1:
587		return false
588	case c.deltaToTitle != 0:
589		return false
590	}
591	return true
592}
593
594// Does this character start a LowerUpper sequence?
595func (c *caseState) isLowerUpper() bool {
596	// for an Upper Lower sequence the deltas have to be in order
597	//	c: -1 0 -1
598	switch {
599	case c.deltaToUpper != -1:
600		return false
601	case c.deltaToLower != 0:
602		return false
603	case c.deltaToTitle != -1:
604		return false
605	}
606	return true
607}
608
609func getCaseState(i rune) (c *caseState) {
610	c = &caseState{point: i, _case: CaseNone}
611	ch := &chars[i]
612	switch ch.codePoint {
613	case 0:
614		c._case = CaseMissing // Will get NUL wrong but that doesn't matter
615		return
616	case ch.upperCase:
617		c._case = CaseUpper
618	case ch.lowerCase:
619		c._case = CaseLower
620	case ch.titleCase:
621		c._case = CaseTitle
622	}
623	// Some things such as roman numeral U+2161 don't describe themselves
624	// as upper case, but have a lower case. Second-guess them.
625	if c._case == CaseNone && ch.lowerCase != 0 {
626		c._case = CaseUpper
627	}
628	// Same in the other direction.
629	if c._case == CaseNone && ch.upperCase != 0 {
630		c._case = CaseLower
631	}
632
633	if ch.upperCase != 0 {
634		c.deltaToUpper = ch.upperCase - i
635	}
636	if ch.lowerCase != 0 {
637		c.deltaToLower = ch.lowerCase - i
638	}
639	if ch.titleCase != 0 {
640		c.deltaToTitle = ch.titleCase - i
641	}
642	return
643}
644
645func printCases() {
646	if *test {
647		fullCaseTest()
648		return
649	}
650	printf(
651		"// CaseRanges is the table describing case mappings for all letters with\n" +
652			"// non-self mappings.\n" +
653			"var CaseRanges = _CaseRanges\n" +
654			"var _CaseRanges = []CaseRange {\n")
655
656	var startState *caseState    // the start of a run; nil for not active
657	var prevState = &caseState{} // the state of the previous character
658	for i := range chars {
659		state := getCaseState(rune(i))
660		if state.adjacent(prevState) {
661			prevState = state
662			continue
663		}
664		// end of run (possibly)
665		printCaseRange(startState, prevState)
666		startState = nil
667		if state._case != CaseMissing && state._case != CaseNone {
668			startState = state
669		}
670		prevState = state
671	}
672	print("}\n")
673}
674
675func printCaseRange(lo, hi *caseState) {
676	if lo == nil {
677		return
678	}
679	if lo.deltaToUpper == 0 && lo.deltaToLower == 0 && lo.deltaToTitle == 0 {
680		// character represents itself in all cases - no need to mention it
681		return
682	}
683	switch {
684	case hi.point > lo.point && lo.isUpperLower():
685		printf("\t{0x%04X, 0x%04X, d{UpperLower, UpperLower, UpperLower}},\n",
686			lo.point, hi.point)
687	case hi.point > lo.point && lo.isLowerUpper():
688		logger.Fatalf("LowerUpper sequence: should not happen: %U.  If it's real, need to fix To()", lo.point)
689		printf("\t{0x%04X, 0x%04X, d{LowerUpper, LowerUpper, LowerUpper}},\n",
690			lo.point, hi.point)
691	default:
692		printf("\t{0x%04X, 0x%04X, d{%d, %d, %d}},\n",
693			lo.point, hi.point,
694			lo.deltaToUpper, lo.deltaToLower, lo.deltaToTitle)
695	}
696}
697
698// If the cased value in the Char is 0, it means use the rune itself.
699func caseIt(r, cased rune) rune {
700	if cased == 0 {
701		return r
702	}
703	return cased
704}
705
706func fullCaseTest() {
707	for j, c := range chars {
708		i := rune(j)
709		lower := unicode.ToLower(i)
710		want := caseIt(i, c.lowerCase)
711		if lower != want {
712			fmt.Fprintf(os.Stderr, "lower %U should be %U is %U\n", i, want, lower)
713		}
714		upper := unicode.ToUpper(i)
715		want = caseIt(i, c.upperCase)
716		if upper != want {
717			fmt.Fprintf(os.Stderr, "upper %U should be %U is %U\n", i, want, upper)
718		}
719		title := unicode.ToTitle(i)
720		want = caseIt(i, c.titleCase)
721		if title != want {
722			fmt.Fprintf(os.Stderr, "title %U should be %U is %U\n", i, want, title)
723		}
724	}
725}
726
727func printLatinProperties() {
728	if *test {
729		return
730	}
731	println("var properties = [MaxLatin1+1]uint8{")
732	for code := 0; code <= unicode.MaxLatin1; code++ {
733		var property string
734		switch chars[code].category {
735		case "Cc", "": // NUL has no category.
736			property = "pC"
737		case "Cf": // soft hyphen, unique category, not printable.
738			property = "0"
739		case "Ll":
740			property = "pLl | pp"
741		case "Lo":
742			property = "pLo | pp"
743		case "Lu":
744			property = "pLu | pp"
745		case "Nd", "No":
746			property = "pN | pp"
747		case "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps":
748			property = "pP | pp"
749		case "Sc", "Sk", "Sm", "So":
750			property = "pS | pp"
751		case "Zs":
752			property = "pZ"
753		default:
754			logger.Fatalf("%U has unknown category %q", code, chars[code].category)
755		}
756		// Special case
757		if code == ' ' {
758			property = "pZ | pp"
759		}
760		printf("\t0x%02X: %s, // %q\n", code, property, code)
761	}
762	printf("}\n\n")
763}
764
765func printCasefold() {
766	// Build list of case-folding groups attached to each canonical folded char (typically lower case).
767	var caseOrbit = make([][]rune, MaxChar+1)
768	for j := range chars {
769		i := rune(j)
770		c := &chars[i]
771		if c.foldCase == 0 {
772			continue
773		}
774		orb := caseOrbit[c.foldCase]
775		if orb == nil {
776			orb = append(orb, c.foldCase)
777		}
778		caseOrbit[c.foldCase] = append(orb, i)
779	}
780
781	// Insert explicit 1-element groups when assuming [lower, upper] would be wrong.
782	for j := range chars {
783		i := rune(j)
784		c := &chars[i]
785		f := c.foldCase
786		if f == 0 {
787			f = i
788		}
789		orb := caseOrbit[f]
790		if orb == nil && (c.upperCase != 0 && c.upperCase != i || c.lowerCase != 0 && c.lowerCase != i) {
791			// Default assumption of [upper, lower] is wrong.
792			caseOrbit[i] = []rune{i}
793		}
794	}
795
796	// Delete the groups for which assuming [lower, upper] or [upper, lower] is right.
797	for i, orb := range caseOrbit {
798		if len(orb) == 2 && chars[orb[0]].upperCase == orb[1] && chars[orb[1]].lowerCase == orb[0] {
799			caseOrbit[i] = nil
800		}
801		if len(orb) == 2 && chars[orb[1]].upperCase == orb[0] && chars[orb[0]].lowerCase == orb[1] {
802			caseOrbit[i] = nil
803		}
804	}
805
806	// Record orbit information in chars.
807	for _, orb := range caseOrbit {
808		if orb == nil {
809			continue
810		}
811		sort.Slice(orb, func(i, j int) bool {
812			return orb[i] < orb[j]
813		})
814		c := orb[len(orb)-1]
815		for _, d := range orb {
816			chars[c].caseOrbit = d
817			c = d
818		}
819	}
820
821	printAsciiFold()
822	printCaseOrbit()
823
824	// Tables of category and script folding exceptions: code points
825	// that must be added when interpreting a particular category/script
826	// in a case-folding context.
827	cat := make(map[string]map[rune]bool)
828	for name := range category {
829		if x := foldExceptions(inCategory(name)); len(x) > 0 {
830			cat[name] = x
831		}
832	}
833
834	scr := make(map[string]map[rune]bool)
835	for name := range scripts {
836		if x := foldExceptions(scripts[name]); len(x) > 0 {
837			scr[name] = x
838		}
839	}
840
841	printCatFold("FoldCategory", cat)
842	printCatFold("FoldScript", scr)
843}
844
845// inCategory returns a list of all the runes in the category.
846func inCategory(name string) []rune {
847	var x []rune
848	for j := range chars {
849		i := rune(j)
850		c := &chars[i]
851		if c.category == name || len(name) == 1 && len(c.category) > 1 && c.category[0] == name[0] {
852			x = append(x, i)
853		}
854	}
855	return x
856}
857
858// foldExceptions returns a list of all the runes fold-equivalent
859// to runes in class but not in class themselves.
860func foldExceptions(class []rune) map[rune]bool {
861	// Create map containing class and all fold-equivalent chars.
862	m := make(map[rune]bool)
863	for _, r := range class {
864		c := &chars[r]
865		if c.caseOrbit == 0 {
866			// Just upper and lower.
867			if u := c.upperCase; u != 0 {
868				m[u] = true
869			}
870			if l := c.lowerCase; l != 0 {
871				m[l] = true
872			}
873			m[r] = true
874			continue
875		}
876		// Otherwise walk orbit.
877		r0 := r
878		for {
879			m[r] = true
880			r = chars[r].caseOrbit
881			if r == r0 {
882				break
883			}
884		}
885	}
886
887	// Remove class itself.
888	for _, r := range class {
889		delete(m, r)
890	}
891
892	// What's left is the exceptions.
893	return m
894}
895
896var comment = map[string]string{
897	"FoldCategory": "// FoldCategory maps a category name to a table of\n" +
898		"// code points outside the category that are equivalent under\n" +
899		"// simple case folding to code points inside the category.\n" +
900		"// If there is no entry for a category name, there are no such points.\n",
901
902	"FoldScript": "// FoldScript maps a script name to a table of\n" +
903		"// code points outside the script that are equivalent under\n" +
904		"// simple case folding to code points inside the script.\n" +
905		"// If there is no entry for a script name, there are no such points.\n",
906}
907
908func printAsciiFold() {
909	printf("var asciiFold = [MaxASCII + 1]uint16{\n")
910	for i := rune(0); i <= unicode.MaxASCII; i++ {
911		c := chars[i]
912		f := c.caseOrbit
913		if f == 0 {
914			if c.lowerCase != i && c.lowerCase != 0 {
915				f = c.lowerCase
916			} else if c.upperCase != i && c.upperCase != 0 {
917				f = c.upperCase
918			} else {
919				f = i
920			}
921		}
922		printf("\t0x%04X,\n", f)
923	}
924	printf("}\n\n")
925}
926
927func printCaseOrbit() {
928	if *test {
929		for j := range chars {
930			i := rune(j)
931			c := &chars[i]
932			f := c.caseOrbit
933			if f == 0 {
934				if c.lowerCase != i && c.lowerCase != 0 {
935					f = c.lowerCase
936				} else if c.upperCase != i && c.upperCase != 0 {
937					f = c.upperCase
938				} else {
939					f = i
940				}
941			}
942			if g := unicode.SimpleFold(i); g != f {
943				fmt.Fprintf(os.Stderr, "unicode.SimpleFold(%#U) = %#U, want %#U\n", i, g, f)
944			}
945		}
946		return
947	}
948
949	printf("var caseOrbit = []foldPair{\n")
950	for i := range chars {
951		c := &chars[i]
952		if c.caseOrbit != 0 {
953			printf("\t{0x%04X, 0x%04X},\n", i, c.caseOrbit)
954			foldPairCount++
955		}
956	}
957	printf("}\n\n")
958}
959
960func printCatFold(name string, m map[string]map[rune]bool) {
961	if *test {
962		var pkgMap map[string]*unicode.RangeTable
963		if name == "FoldCategory" {
964			pkgMap = unicode.FoldCategory
965		} else {
966			pkgMap = unicode.FoldScript
967		}
968		if len(pkgMap) != len(m) {
969			fmt.Fprintf(os.Stderr, "unicode.%s has %d elements, want %d\n", name, len(pkgMap), len(m))
970			return
971		}
972		for k, v := range m {
973			t, ok := pkgMap[k]
974			if !ok {
975				fmt.Fprintf(os.Stderr, "unicode.%s[%q] missing\n", name, k)
976				continue
977			}
978			n := 0
979			for _, r := range t.R16 {
980				for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) {
981					if !v[c] {
982						fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
983					}
984					n++
985				}
986			}
987			for _, r := range t.R32 {
988				for c := rune(r.Lo); c <= rune(r.Hi); c += rune(r.Stride) {
989					if !v[c] {
990						fmt.Fprintf(os.Stderr, "unicode.%s[%q] contains %#U, should not\n", name, k, c)
991					}
992					n++
993				}
994			}
995			if n != len(v) {
996				fmt.Fprintf(os.Stderr, "unicode.%s[%q] has %d code points, want %d\n", name, k, n, len(v))
997			}
998		}
999		return
1000	}
1001
1002	print(comment[name])
1003	printf("var %s = map[string]*RangeTable{\n", name)
1004	for _, name := range allCatFold(m) {
1005		printf("\t%q: fold%s,\n", name, name)
1006	}
1007	printf("}\n\n")
1008	for _, name := range allCatFold(m) {
1009		class := m[name]
1010		dumpRange("fold"+name, func(code rune) bool { return class[code] })
1011	}
1012}
1013
1014var range16Count = 0  // Number of entries in the 16-bit range tables.
1015var range32Count = 0  // Number of entries in the 32-bit range tables.
1016var foldPairCount = 0 // Number of fold pairs in the exception tables.
1017
1018func printSizes() {
1019	if *test {
1020		return
1021	}
1022	println()
1023	printf("// Range entries: %d 16-bit, %d 32-bit, %d total.\n", range16Count, range32Count, range16Count+range32Count)
1024	range16Bytes := range16Count * 3 * 2
1025	range32Bytes := range32Count * 3 * 4
1026	printf("// Range bytes: %d 16-bit, %d 32-bit, %d total.\n", range16Bytes, range32Bytes, range16Bytes+range32Bytes)
1027	println()
1028	printf("// Fold orbit bytes: %d pairs, %d bytes\n", foldPairCount, foldPairCount*2*2)
1029}
1030