1// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build ignore
6
7// Normalization table generator.
8// Data read from the web.
9// See forminfo.go for a description of the trie values associated with each rune.
10
11package main
12
13import (
14	"bytes"
15	"encoding/binary"
16	"flag"
17	"fmt"
18	"io"
19	"log"
20	"sort"
21	"strconv"
22	"strings"
23
24	"golang.org/x/text/internal/gen"
25	"golang.org/x/text/internal/triegen"
26	"golang.org/x/text/internal/ucd"
27)
28
29func main() {
30	gen.Init()
31	loadUnicodeData()
32	compactCCC()
33	loadCompositionExclusions()
34	completeCharFields(FCanonical)
35	completeCharFields(FCompatibility)
36	computeNonStarterCounts()
37	verifyComputed()
38	printChars()
39	testDerived()
40	printTestdata()
41	makeTables()
42}
43
44var (
45	tablelist = flag.String("tables",
46		"all",
47		"comma-separated list of which tables to generate; "+
48			"can be 'decomp', 'recomp', 'info' and 'all'")
49	test = flag.Bool("test",
50		false,
51		"test existing tables against DerivedNormalizationProps and generate test data for regression testing")
52	verbose = flag.Bool("verbose",
53		false,
54		"write data to stdout as it is parsed")
55)
56
57const MaxChar = 0x10FFFF // anything above this shouldn't exist
58
59// Quick Check properties of runes allow us to quickly
60// determine whether a rune may occur in a normal form.
61// For a given normal form, a rune may be guaranteed to occur
62// verbatim (QC=Yes), may or may not combine with another
63// rune (QC=Maybe), or may not occur (QC=No).
64type QCResult int
65
66const (
67	QCUnknown QCResult = iota
68	QCYes
69	QCNo
70	QCMaybe
71)
72
73func (r QCResult) String() string {
74	switch r {
75	case QCYes:
76		return "Yes"
77	case QCNo:
78		return "No"
79	case QCMaybe:
80		return "Maybe"
81	}
82	return "***UNKNOWN***"
83}
84
85const (
86	FCanonical     = iota // NFC or NFD
87	FCompatibility        // NFKC or NFKD
88	FNumberOfFormTypes
89)
90
91const (
92	MComposed   = iota // NFC or NFKC
93	MDecomposed        // NFD or NFKD
94	MNumberOfModes
95)
96
97// This contains only the properties we're interested in.
98type Char struct {
99	name          string
100	codePoint     rune  // if zero, this index is not a valid code point.
101	ccc           uint8 // canonical combining class
102	origCCC       uint8
103	excludeInComp bool // from CompositionExclusions.txt
104	compatDecomp  bool // it has a compatibility expansion
105
106	nTrailingNonStarters uint8
107	nLeadingNonStarters  uint8 // must be equal to trailing if non-zero
108
109	forms [FNumberOfFormTypes]FormInfo // For FCanonical and FCompatibility
110
111	state State
112}
113
114var chars = make([]Char, MaxChar+1)
115var cccMap = make(map[uint8]uint8)
116
117func (c Char) String() string {
118	buf := new(bytes.Buffer)
119
120	fmt.Fprintf(buf, "%U [%s]:\n", c.codePoint, c.name)
121	fmt.Fprintf(buf, "  ccc: %v\n", c.ccc)
122	fmt.Fprintf(buf, "  excludeInComp: %v\n", c.excludeInComp)
123	fmt.Fprintf(buf, "  compatDecomp: %v\n", c.compatDecomp)
124	fmt.Fprintf(buf, "  state: %v\n", c.state)
125	fmt.Fprintf(buf, "  NFC:\n")
126	fmt.Fprint(buf, c.forms[FCanonical])
127	fmt.Fprintf(buf, "  NFKC:\n")
128	fmt.Fprint(buf, c.forms[FCompatibility])
129
130	return buf.String()
131}
132
133// In UnicodeData.txt, some ranges are marked like this:
134//	3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
135//	4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
136// parseCharacter keeps a state variable indicating the weirdness.
137type State int
138
139const (
140	SNormal State = iota // known to be zero for the type
141	SFirst
142	SLast
143	SMissing
144)
145
146var lastChar = rune('\u0000')
147
148func (c Char) isValid() bool {
149	return c.codePoint != 0 && c.state != SMissing
150}
151
152type FormInfo struct {
153	quickCheck [MNumberOfModes]QCResult // index: MComposed or MDecomposed
154	verified   [MNumberOfModes]bool     // index: MComposed or MDecomposed
155
156	combinesForward  bool // May combine with rune on the right
157	combinesBackward bool // May combine with rune on the left
158	isOneWay         bool // Never appears in result
159	inDecomp         bool // Some decompositions result in this char.
160	decomp           Decomposition
161	expandedDecomp   Decomposition
162}
163
164func (f FormInfo) String() string {
165	buf := bytes.NewBuffer(make([]byte, 0))
166
167	fmt.Fprintf(buf, "    quickCheck[C]: %v\n", f.quickCheck[MComposed])
168	fmt.Fprintf(buf, "    quickCheck[D]: %v\n", f.quickCheck[MDecomposed])
169	fmt.Fprintf(buf, "    cmbForward: %v\n", f.combinesForward)
170	fmt.Fprintf(buf, "    cmbBackward: %v\n", f.combinesBackward)
171	fmt.Fprintf(buf, "    isOneWay: %v\n", f.isOneWay)
172	fmt.Fprintf(buf, "    inDecomp: %v\n", f.inDecomp)
173	fmt.Fprintf(buf, "    decomposition: %X\n", f.decomp)
174	fmt.Fprintf(buf, "    expandedDecomp: %X\n", f.expandedDecomp)
175
176	return buf.String()
177}
178
179type Decomposition []rune
180
181func parseDecomposition(s string, skipfirst bool) (a []rune, err error) {
182	decomp := strings.Split(s, " ")
183	if len(decomp) > 0 && skipfirst {
184		decomp = decomp[1:]
185	}
186	for _, d := range decomp {
187		point, err := strconv.ParseUint(d, 16, 64)
188		if err != nil {
189			return a, err
190		}
191		a = append(a, rune(point))
192	}
193	return a, nil
194}
195
196func loadUnicodeData() {
197	f := gen.OpenUCDFile("UnicodeData.txt")
198	defer f.Close()
199	p := ucd.New(f)
200	for p.Next() {
201		r := p.Rune(ucd.CodePoint)
202		char := &chars[r]
203
204		char.ccc = uint8(p.Uint(ucd.CanonicalCombiningClass))
205		decmap := p.String(ucd.DecompMapping)
206
207		exp, err := parseDecomposition(decmap, false)
208		isCompat := false
209		if err != nil {
210			if len(decmap) > 0 {
211				exp, err = parseDecomposition(decmap, true)
212				if err != nil {
213					log.Fatalf(`%U: bad decomp |%v|: "%s"`, r, decmap, err)
214				}
215				isCompat = true
216			}
217		}
218
219		char.name = p.String(ucd.Name)
220		char.codePoint = r
221		char.forms[FCompatibility].decomp = exp
222		if !isCompat {
223			char.forms[FCanonical].decomp = exp
224		} else {
225			char.compatDecomp = true
226		}
227		if len(decmap) > 0 {
228			char.forms[FCompatibility].decomp = exp
229		}
230	}
231	if err := p.Err(); err != nil {
232		log.Fatal(err)
233	}
234}
235
236// compactCCC converts the sparse set of CCC values to a continguous one,
237// reducing the number of bits needed from 8 to 6.
238func compactCCC() {
239	m := make(map[uint8]uint8)
240	for i := range chars {
241		c := &chars[i]
242		m[c.ccc] = 0
243	}
244	cccs := []int{}
245	for v, _ := range m {
246		cccs = append(cccs, int(v))
247	}
248	sort.Ints(cccs)
249	for i, c := range cccs {
250		cccMap[uint8(i)] = uint8(c)
251		m[uint8(c)] = uint8(i)
252	}
253	for i := range chars {
254		c := &chars[i]
255		c.origCCC = c.ccc
256		c.ccc = m[c.ccc]
257	}
258	if len(m) >= 1<<6 {
259		log.Fatalf("too many difference CCC values: %d >= 64", len(m))
260	}
261}
262
263// CompositionExclusions.txt has form:
264// 0958    # ...
265// See https://unicode.org/reports/tr44/ for full explanation
266func loadCompositionExclusions() {
267	f := gen.OpenUCDFile("CompositionExclusions.txt")
268	defer f.Close()
269	p := ucd.New(f)
270	for p.Next() {
271		c := &chars[p.Rune(0)]
272		if c.excludeInComp {
273			log.Fatalf("%U: Duplicate entry in exclusions.", c.codePoint)
274		}
275		c.excludeInComp = true
276	}
277	if e := p.Err(); e != nil {
278		log.Fatal(e)
279	}
280}
281
282// hasCompatDecomp returns true if any of the recursive
283// decompositions contains a compatibility expansion.
284// In this case, the character may not occur in NFK*.
285func hasCompatDecomp(r rune) bool {
286	c := &chars[r]
287	if c.compatDecomp {
288		return true
289	}
290	for _, d := range c.forms[FCompatibility].decomp {
291		if hasCompatDecomp(d) {
292			return true
293		}
294	}
295	return false
296}
297
298// Hangul related constants.
299const (
300	HangulBase = 0xAC00
301	HangulEnd  = 0xD7A4 // hangulBase + Jamo combinations (19 * 21 * 28)
302
303	JamoLBase = 0x1100
304	JamoLEnd  = 0x1113
305	JamoVBase = 0x1161
306	JamoVEnd  = 0x1176
307	JamoTBase = 0x11A8
308	JamoTEnd  = 0x11C3
309
310	JamoLVTCount = 19 * 21 * 28
311	JamoTCount   = 28
312)
313
314func isHangul(r rune) bool {
315	return HangulBase <= r && r < HangulEnd
316}
317
318func isHangulWithoutJamoT(r rune) bool {
319	if !isHangul(r) {
320		return false
321	}
322	r -= HangulBase
323	return r < JamoLVTCount && r%JamoTCount == 0
324}
325
326func ccc(r rune) uint8 {
327	return chars[r].ccc
328}
329
330// Insert a rune in a buffer, ordered by Canonical Combining Class.
331func insertOrdered(b Decomposition, r rune) Decomposition {
332	n := len(b)
333	b = append(b, 0)
334	cc := ccc(r)
335	if cc > 0 {
336		// Use bubble sort.
337		for ; n > 0; n-- {
338			if ccc(b[n-1]) <= cc {
339				break
340			}
341			b[n] = b[n-1]
342		}
343	}
344	b[n] = r
345	return b
346}
347
348// Recursively decompose.
349func decomposeRecursive(form int, r rune, d Decomposition) Decomposition {
350	dcomp := chars[r].forms[form].decomp
351	if len(dcomp) == 0 {
352		return insertOrdered(d, r)
353	}
354	for _, c := range dcomp {
355		d = decomposeRecursive(form, c, d)
356	}
357	return d
358}
359
360func completeCharFields(form int) {
361	// Phase 0: pre-expand decomposition.
362	for i := range chars {
363		f := &chars[i].forms[form]
364		if len(f.decomp) == 0 {
365			continue
366		}
367		exp := make(Decomposition, 0)
368		for _, c := range f.decomp {
369			exp = decomposeRecursive(form, c, exp)
370		}
371		f.expandedDecomp = exp
372	}
373
374	// Phase 1: composition exclusion, mark decomposition.
375	for i := range chars {
376		c := &chars[i]
377		f := &c.forms[form]
378
379		// Marks script-specific exclusions and version restricted.
380		f.isOneWay = c.excludeInComp
381
382		// Singletons
383		f.isOneWay = f.isOneWay || len(f.decomp) == 1
384
385		// Non-starter decompositions
386		if len(f.decomp) > 1 {
387			chk := c.ccc != 0 || chars[f.decomp[0]].ccc != 0
388			f.isOneWay = f.isOneWay || chk
389		}
390
391		// Runes that decompose into more than two runes.
392		f.isOneWay = f.isOneWay || len(f.decomp) > 2
393
394		if form == FCompatibility {
395			f.isOneWay = f.isOneWay || hasCompatDecomp(c.codePoint)
396		}
397
398		for _, r := range f.decomp {
399			chars[r].forms[form].inDecomp = true
400		}
401	}
402
403	// Phase 2: forward and backward combining.
404	for i := range chars {
405		c := &chars[i]
406		f := &c.forms[form]
407
408		if !f.isOneWay && len(f.decomp) == 2 {
409			f0 := &chars[f.decomp[0]].forms[form]
410			f1 := &chars[f.decomp[1]].forms[form]
411			if !f0.isOneWay {
412				f0.combinesForward = true
413			}
414			if !f1.isOneWay {
415				f1.combinesBackward = true
416			}
417		}
418		if isHangulWithoutJamoT(rune(i)) {
419			f.combinesForward = true
420		}
421	}
422
423	// Phase 3: quick check values.
424	for i := range chars {
425		c := &chars[i]
426		f := &c.forms[form]
427
428		switch {
429		case len(f.decomp) > 0:
430			f.quickCheck[MDecomposed] = QCNo
431		case isHangul(rune(i)):
432			f.quickCheck[MDecomposed] = QCNo
433		default:
434			f.quickCheck[MDecomposed] = QCYes
435		}
436		switch {
437		case f.isOneWay:
438			f.quickCheck[MComposed] = QCNo
439		case (i & 0xffff00) == JamoLBase:
440			f.quickCheck[MComposed] = QCYes
441			if JamoLBase <= i && i < JamoLEnd {
442				f.combinesForward = true
443			}
444			if JamoVBase <= i && i < JamoVEnd {
445				f.quickCheck[MComposed] = QCMaybe
446				f.combinesBackward = true
447				f.combinesForward = true
448			}
449			if JamoTBase <= i && i < JamoTEnd {
450				f.quickCheck[MComposed] = QCMaybe
451				f.combinesBackward = true
452			}
453		case !f.combinesBackward:
454			f.quickCheck[MComposed] = QCYes
455		default:
456			f.quickCheck[MComposed] = QCMaybe
457		}
458	}
459}
460
461func computeNonStarterCounts() {
462	// Phase 4: leading and trailing non-starter count
463	for i := range chars {
464		c := &chars[i]
465
466		runes := []rune{rune(i)}
467		// We always use FCompatibility so that the CGJ insertion points do not
468		// change for repeated normalizations with different forms.
469		if exp := c.forms[FCompatibility].expandedDecomp; len(exp) > 0 {
470			runes = exp
471		}
472		// We consider runes that combine backwards to be non-starters for the
473		// purpose of Stream-Safe Text Processing.
474		for _, r := range runes {
475			if cr := &chars[r]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward {
476				break
477			}
478			c.nLeadingNonStarters++
479		}
480		for i := len(runes) - 1; i >= 0; i-- {
481			if cr := &chars[runes[i]]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward {
482				break
483			}
484			c.nTrailingNonStarters++
485		}
486		if c.nTrailingNonStarters > 3 {
487			log.Fatalf("%U: Decomposition with more than 3 (%d) trailing modifiers (%U)", i, c.nTrailingNonStarters, runes)
488		}
489
490		if isHangul(rune(i)) {
491			c.nTrailingNonStarters = 2
492			if isHangulWithoutJamoT(rune(i)) {
493				c.nTrailingNonStarters = 1
494			}
495		}
496
497		if l, t := c.nLeadingNonStarters, c.nTrailingNonStarters; l > 0 && l != t {
498			log.Fatalf("%U: number of leading and trailing non-starters should be equal (%d vs %d)", i, l, t)
499		}
500		if t := c.nTrailingNonStarters; t > 3 {
501			log.Fatalf("%U: number of trailing non-starters is %d > 3", t)
502		}
503	}
504}
505
506func printBytes(w io.Writer, b []byte, name string) {
507	fmt.Fprintf(w, "// %s: %d bytes\n", name, len(b))
508	fmt.Fprintf(w, "var %s = [...]byte {", name)
509	for i, c := range b {
510		switch {
511		case i%64 == 0:
512			fmt.Fprintf(w, "\n// Bytes %x - %x\n", i, i+63)
513		case i%8 == 0:
514			fmt.Fprintf(w, "\n")
515		}
516		fmt.Fprintf(w, "0x%.2X, ", c)
517	}
518	fmt.Fprint(w, "\n}\n\n")
519}
520
521// See forminfo.go for format.
522func makeEntry(f *FormInfo, c *Char) uint16 {
523	e := uint16(0)
524	if r := c.codePoint; HangulBase <= r && r < HangulEnd {
525		e |= 0x40
526	}
527	if f.combinesForward {
528		e |= 0x20
529	}
530	if f.quickCheck[MDecomposed] == QCNo {
531		e |= 0x4
532	}
533	switch f.quickCheck[MComposed] {
534	case QCYes:
535	case QCNo:
536		e |= 0x10
537	case QCMaybe:
538		e |= 0x18
539	default:
540		log.Fatalf("Illegal quickcheck value %v.", f.quickCheck[MComposed])
541	}
542	e |= uint16(c.nTrailingNonStarters)
543	return e
544}
545
546// decompSet keeps track of unique decompositions, grouped by whether
547// the decomposition is followed by a trailing and/or leading CCC.
548type decompSet [7]map[string]bool
549
550const (
551	normalDecomp = iota
552	firstMulti
553	firstCCC
554	endMulti
555	firstLeadingCCC
556	firstCCCZeroExcept
557	firstStarterWithNLead
558	lastDecomp
559)
560
561var cname = []string{"firstMulti", "firstCCC", "endMulti", "firstLeadingCCC", "firstCCCZeroExcept", "firstStarterWithNLead", "lastDecomp"}
562
563func makeDecompSet() decompSet {
564	m := decompSet{}
565	for i := range m {
566		m[i] = make(map[string]bool)
567	}
568	return m
569}
570func (m *decompSet) insert(key int, s string) {
571	m[key][s] = true
572}
573
574func printCharInfoTables(w io.Writer) int {
575	mkstr := func(r rune, f *FormInfo) (int, string) {
576		d := f.expandedDecomp
577		s := string([]rune(d))
578		if max := 1 << 6; len(s) >= max {
579			const msg = "%U: too many bytes in decomposition: %d >= %d"
580			log.Fatalf(msg, r, len(s), max)
581		}
582		head := uint8(len(s))
583		if f.quickCheck[MComposed] != QCYes {
584			head |= 0x40
585		}
586		if f.combinesForward {
587			head |= 0x80
588		}
589		s = string([]byte{head}) + s
590
591		lccc := ccc(d[0])
592		tccc := ccc(d[len(d)-1])
593		cc := ccc(r)
594		if cc != 0 && lccc == 0 && tccc == 0 {
595			log.Fatalf("%U: trailing and leading ccc are 0 for non-zero ccc %d", r, cc)
596		}
597		if tccc < lccc && lccc != 0 {
598			const msg = "%U: lccc (%d) must be <= tcc (%d)"
599			log.Fatalf(msg, r, lccc, tccc)
600		}
601		index := normalDecomp
602		nTrail := chars[r].nTrailingNonStarters
603		nLead := chars[r].nLeadingNonStarters
604		if tccc > 0 || lccc > 0 || nTrail > 0 {
605			tccc <<= 2
606			tccc |= nTrail
607			s += string([]byte{tccc})
608			index = endMulti
609			for _, r := range d[1:] {
610				if ccc(r) == 0 {
611					index = firstCCC
612				}
613			}
614			if lccc > 0 || nLead > 0 {
615				s += string([]byte{lccc})
616				if index == firstCCC {
617					log.Fatalf("%U: multi-segment decomposition not supported for decompositions with leading CCC != 0", r)
618				}
619				index = firstLeadingCCC
620			}
621			if cc != lccc {
622				if cc != 0 {
623					log.Fatalf("%U: for lccc != ccc, expected ccc to be 0; was %d", r, cc)
624				}
625				index = firstCCCZeroExcept
626			}
627		} else if len(d) > 1 {
628			index = firstMulti
629		}
630		return index, s
631	}
632
633	decompSet := makeDecompSet()
634	const nLeadStr = "\x00\x01" // 0-byte length and tccc with nTrail.
635	decompSet.insert(firstStarterWithNLead, nLeadStr)
636
637	// Store the uniqued decompositions in a byte buffer,
638	// preceded by their byte length.
639	for _, c := range chars {
640		for _, f := range c.forms {
641			if len(f.expandedDecomp) == 0 {
642				continue
643			}
644			if f.combinesBackward {
645				log.Fatalf("%U: combinesBackward and decompose", c.codePoint)
646			}
647			index, s := mkstr(c.codePoint, &f)
648			decompSet.insert(index, s)
649		}
650	}
651
652	decompositions := bytes.NewBuffer(make([]byte, 0, 10000))
653	size := 0
654	positionMap := make(map[string]uint16)
655	decompositions.WriteString("\000")
656	fmt.Fprintln(w, "const (")
657	for i, m := range decompSet {
658		sa := []string{}
659		for s := range m {
660			sa = append(sa, s)
661		}
662		sort.Strings(sa)
663		for _, s := range sa {
664			p := decompositions.Len()
665			decompositions.WriteString(s)
666			positionMap[s] = uint16(p)
667		}
668		if cname[i] != "" {
669			fmt.Fprintf(w, "%s = 0x%X\n", cname[i], decompositions.Len())
670		}
671	}
672	fmt.Fprintln(w, "maxDecomp = 0x8000")
673	fmt.Fprintln(w, ")")
674	b := decompositions.Bytes()
675	printBytes(w, b, "decomps")
676	size += len(b)
677
678	varnames := []string{"nfc", "nfkc"}
679	for i := 0; i < FNumberOfFormTypes; i++ {
680		trie := triegen.NewTrie(varnames[i])
681
682		for r, c := range chars {
683			f := c.forms[i]
684			d := f.expandedDecomp
685			if len(d) != 0 {
686				_, key := mkstr(c.codePoint, &f)
687				trie.Insert(rune(r), uint64(positionMap[key]))
688				if c.ccc != ccc(d[0]) {
689					// We assume the lead ccc of a decomposition !=0 in this case.
690					if ccc(d[0]) == 0 {
691						log.Fatalf("Expected leading CCC to be non-zero; ccc is %d", c.ccc)
692					}
693				}
694			} else if c.nLeadingNonStarters > 0 && len(f.expandedDecomp) == 0 && c.ccc == 0 && !f.combinesBackward {
695				// Handle cases where it can't be detected that the nLead should be equal
696				// to nTrail.
697				trie.Insert(c.codePoint, uint64(positionMap[nLeadStr]))
698			} else if v := makeEntry(&f, &c)<<8 | uint16(c.ccc); v != 0 {
699				trie.Insert(c.codePoint, uint64(0x8000|v))
700			}
701		}
702		sz, err := trie.Gen(w, triegen.Compact(&normCompacter{name: varnames[i]}))
703		if err != nil {
704			log.Fatal(err)
705		}
706		size += sz
707	}
708	return size
709}
710
711func contains(sa []string, s string) bool {
712	for _, a := range sa {
713		if a == s {
714			return true
715		}
716	}
717	return false
718}
719
720func makeTables() {
721	w := &bytes.Buffer{}
722
723	size := 0
724	if *tablelist == "" {
725		return
726	}
727	list := strings.Split(*tablelist, ",")
728	if *tablelist == "all" {
729		list = []string{"recomp", "info"}
730	}
731
732	// Compute maximum decomposition size.
733	max := 0
734	for _, c := range chars {
735		if n := len(string(c.forms[FCompatibility].expandedDecomp)); n > max {
736			max = n
737		}
738	}
739	fmt.Fprintln(w, `import "sync"`)
740	fmt.Fprintln(w)
741
742	fmt.Fprintln(w, "const (")
743	fmt.Fprintln(w, "\t// Version is the Unicode edition from which the tables are derived.")
744	fmt.Fprintf(w, "\tVersion = %q\n", gen.UnicodeVersion())
745	fmt.Fprintln(w)
746	fmt.Fprintln(w, "\t// MaxTransformChunkSize indicates the maximum number of bytes that Transform")
747	fmt.Fprintln(w, "\t// may need to write atomically for any Form. Making a destination buffer at")
748	fmt.Fprintln(w, "\t// least this size ensures that Transform can always make progress and that")
749	fmt.Fprintln(w, "\t// the user does not need to grow the buffer on an ErrShortDst.")
750	fmt.Fprintf(w, "\tMaxTransformChunkSize = %d+maxNonStarters*4\n", len(string(0x034F))+max)
751	fmt.Fprintln(w, ")\n")
752
753	// Print the CCC remap table.
754	size += len(cccMap)
755	fmt.Fprintf(w, "var ccc = [%d]uint8{", len(cccMap))
756	for i := 0; i < len(cccMap); i++ {
757		if i%8 == 0 {
758			fmt.Fprintln(w)
759		}
760		fmt.Fprintf(w, "%3d, ", cccMap[uint8(i)])
761	}
762	fmt.Fprintln(w, "\n}\n")
763
764	if contains(list, "info") {
765		size += printCharInfoTables(w)
766	}
767
768	if contains(list, "recomp") {
769		// Note that we use 32 bit keys, instead of 64 bit.
770		// This clips the bits of three entries, but we know
771		// this won't cause a collision. The compiler will catch
772		// any changes made to UnicodeData.txt that introduces
773		// a collision.
774		// Note that the recomposition map for NFC and NFKC
775		// are identical.
776
777		// Recomposition map
778		nrentries := 0
779		for _, c := range chars {
780			f := c.forms[FCanonical]
781			if !f.isOneWay && len(f.decomp) > 0 {
782				nrentries++
783			}
784		}
785		sz := nrentries * 8
786		size += sz
787		fmt.Fprintf(w, "// recompMap: %d bytes (entries only)\n", sz)
788		fmt.Fprintln(w, "var recompMap map[uint32]rune")
789		fmt.Fprintln(w, "var recompMapOnce sync.Once\n")
790		fmt.Fprintln(w, `const recompMapPacked = "" +`)
791		var buf [8]byte
792		for i, c := range chars {
793			f := c.forms[FCanonical]
794			d := f.decomp
795			if !f.isOneWay && len(d) > 0 {
796				key := uint32(uint16(d[0]))<<16 + uint32(uint16(d[1]))
797				binary.BigEndian.PutUint32(buf[:4], key)
798				binary.BigEndian.PutUint32(buf[4:], uint32(i))
799				fmt.Fprintf(w, "\t\t%q + // 0x%.8X: 0x%.8X\n", string(buf[:]), key, uint32(i))
800			}
801		}
802		// hack so we don't have to special case the trailing plus sign
803		fmt.Fprintf(w, `	""`)
804		fmt.Fprintln(w)
805	}
806
807	fmt.Fprintf(w, "// Total size of tables: %dKB (%d bytes)\n", (size+512)/1024, size)
808	gen.WriteVersionedGoFile("tables.go", "norm", w.Bytes())
809}
810
811func printChars() {
812	if *verbose {
813		for _, c := range chars {
814			if !c.isValid() || c.state == SMissing {
815				continue
816			}
817			fmt.Println(c)
818		}
819	}
820}
821
822// verifyComputed does various consistency tests.
823func verifyComputed() {
824	for i, c := range chars {
825		for _, f := range c.forms {
826			isNo := (f.quickCheck[MDecomposed] == QCNo)
827			if (len(f.decomp) > 0) != isNo && !isHangul(rune(i)) {
828				log.Fatalf("%U: NF*D QC must be No if rune decomposes", i)
829			}
830
831			isMaybe := f.quickCheck[MComposed] == QCMaybe
832			if f.combinesBackward != isMaybe {
833				log.Fatalf("%U: NF*C QC must be Maybe if combinesBackward", i)
834			}
835			if len(f.decomp) > 0 && f.combinesForward && isMaybe {
836				log.Fatalf("%U: NF*C QC must be Yes or No if combinesForward and decomposes", i)
837			}
838
839			if len(f.expandedDecomp) != 0 {
840				continue
841			}
842			if a, b := c.nLeadingNonStarters > 0, (c.ccc > 0 || f.combinesBackward); a != b {
843				// We accept these runes to be treated differently (it only affects
844				// segment breaking in iteration, most likely on improper use), but
845				// reconsider if more characters are added.
846				// U+FF9E HALFWIDTH KATAKANA VOICED SOUND MARK;Lm;0;L;<narrow> 3099;;;;N;;;;;
847				// U+FF9F HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK;Lm;0;L;<narrow> 309A;;;;N;;;;;
848				// U+3133 HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<compat> 11AA;;;;N;HANGUL LETTER GIYEOG SIOS;;;;
849				// U+318E HANGUL LETTER ARAEAE;Lo;0;L;<compat> 11A1;;;;N;HANGUL LETTER ALAE AE;;;;
850				// U+FFA3 HALFWIDTH HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<narrow> 3133;;;;N;HALFWIDTH HANGUL LETTER GIYEOG SIOS;;;;
851				// U+FFDC HALFWIDTH HANGUL LETTER I;Lo;0;L;<narrow> 3163;;;;N;;;;;
852				if i != 0xFF9E && i != 0xFF9F && !(0x3133 <= i && i <= 0x318E) && !(0xFFA3 <= i && i <= 0xFFDC) {
853					log.Fatalf("%U: nLead was %v; want %v", i, a, b)
854				}
855			}
856		}
857		nfc := c.forms[FCanonical]
858		nfkc := c.forms[FCompatibility]
859		if nfc.combinesBackward != nfkc.combinesBackward {
860			log.Fatalf("%U: Cannot combine combinesBackward\n", c.codePoint)
861		}
862	}
863}
864
865// Use values in DerivedNormalizationProps.txt to compare against the
866// values we computed.
867// DerivedNormalizationProps.txt has form:
868// 00C0..00C5    ; NFD_QC; N # ...
869// 0374          ; NFD_QC; N # ...
870// See https://unicode.org/reports/tr44/ for full explanation
871func testDerived() {
872	f := gen.OpenUCDFile("DerivedNormalizationProps.txt")
873	defer f.Close()
874	p := ucd.New(f)
875	for p.Next() {
876		r := p.Rune(0)
877		c := &chars[r]
878
879		var ftype, mode int
880		qt := p.String(1)
881		switch qt {
882		case "NFC_QC":
883			ftype, mode = FCanonical, MComposed
884		case "NFD_QC":
885			ftype, mode = FCanonical, MDecomposed
886		case "NFKC_QC":
887			ftype, mode = FCompatibility, MComposed
888		case "NFKD_QC":
889			ftype, mode = FCompatibility, MDecomposed
890		default:
891			continue
892		}
893		var qr QCResult
894		switch p.String(2) {
895		case "Y":
896			qr = QCYes
897		case "N":
898			qr = QCNo
899		case "M":
900			qr = QCMaybe
901		default:
902			log.Fatalf(`Unexpected quick check value "%s"`, p.String(2))
903		}
904		if got := c.forms[ftype].quickCheck[mode]; got != qr {
905			log.Printf("%U: FAILED %s (was %v need %v)\n", r, qt, got, qr)
906		}
907		c.forms[ftype].verified[mode] = true
908	}
909	if err := p.Err(); err != nil {
910		log.Fatal(err)
911	}
912	// Any unspecified value must be QCYes. Verify this.
913	for i, c := range chars {
914		for j, fd := range c.forms {
915			for k, qr := range fd.quickCheck {
916				if !fd.verified[k] && qr != QCYes {
917					m := "%U: FAIL F:%d M:%d (was %v need Yes) %s\n"
918					log.Printf(m, i, j, k, qr, c.name)
919				}
920			}
921		}
922	}
923}
924
925var testHeader = `const (
926	Yes = iota
927	No
928	Maybe
929)
930
931type formData struct {
932	qc              uint8
933	combinesForward bool
934	decomposition   string
935}
936
937type runeData struct {
938	r      rune
939	ccc    uint8
940	nLead  uint8
941	nTrail uint8
942	f      [2]formData // 0: canonical; 1: compatibility
943}
944
945func f(qc uint8, cf bool, dec string) [2]formData {
946	return [2]formData{{qc, cf, dec}, {qc, cf, dec}}
947}
948
949func g(qc, qck uint8, cf, cfk bool, d, dk string) [2]formData {
950	return [2]formData{{qc, cf, d}, {qck, cfk, dk}}
951}
952
953var testData = []runeData{
954`
955
956func printTestdata() {
957	type lastInfo struct {
958		ccc    uint8
959		nLead  uint8
960		nTrail uint8
961		f      string
962	}
963
964	last := lastInfo{}
965	w := &bytes.Buffer{}
966	fmt.Fprintf(w, testHeader)
967	for r, c := range chars {
968		f := c.forms[FCanonical]
969		qc, cf, d := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp)
970		f = c.forms[FCompatibility]
971		qck, cfk, dk := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp)
972		s := ""
973		if d == dk && qc == qck && cf == cfk {
974			s = fmt.Sprintf("f(%s, %v, %q)", qc, cf, d)
975		} else {
976			s = fmt.Sprintf("g(%s, %s, %v, %v, %q, %q)", qc, qck, cf, cfk, d, dk)
977		}
978		current := lastInfo{c.ccc, c.nLeadingNonStarters, c.nTrailingNonStarters, s}
979		if last != current {
980			fmt.Fprintf(w, "\t{0x%x, %d, %d, %d, %s},\n", r, c.origCCC, c.nLeadingNonStarters, c.nTrailingNonStarters, s)
981			last = current
982		}
983	}
984	fmt.Fprintln(w, "}")
985	gen.WriteVersionedGoFile("data_test.go", "norm", w.Bytes())
986}
987