1// Copyright 2014 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build ignore
6// +build ignore
7
8// This program generates the trie for casing operations. The Unicode casing
9// algorithm requires the lookup of various properties and mappings for each
10// rune. The table generated by this generator combines several of the most
11// frequently used of these into a single trie so that they can be accessed
12// with a single lookup.
13package main
14
15import (
16	"bytes"
17	"fmt"
18	"io"
19	"io/ioutil"
20	"log"
21	"reflect"
22	"strconv"
23	"strings"
24	"unicode"
25
26	"golang.org/x/text/internal/gen"
27	"golang.org/x/text/internal/triegen"
28	"golang.org/x/text/internal/ucd"
29	"golang.org/x/text/unicode/norm"
30)
31
32func main() {
33	gen.Init()
34	genTables()
35	genTablesTest()
36	gen.Repackage("gen_trieval.go", "trieval.go", "cases")
37}
38
39// runeInfo contains all information for a rune that we care about for casing
40// operations.
41type runeInfo struct {
42	Rune rune
43
44	entry info // trie value for this rune.
45
46	CaseMode info
47
48	// Simple case mappings.
49	Simple [1 + maxCaseMode][]rune
50
51	// Special casing
52	HasSpecial  bool
53	Conditional bool
54	Special     [1 + maxCaseMode][]rune
55
56	// Folding
57	FoldSimple  rune
58	FoldSpecial rune
59	FoldFull    []rune
60
61	// TODO: FC_NFKC, or equivalent data.
62
63	// Properties
64	SoftDotted     bool
65	CaseIgnorable  bool
66	Cased          bool
67	DecomposeGreek bool
68	BreakType      string
69	BreakCat       breakCategory
70
71	// We care mostly about 0, Above, and IotaSubscript.
72	CCC byte
73}
74
75type breakCategory int
76
77const (
78	breakBreak breakCategory = iota
79	breakLetter
80	breakMid
81)
82
83// mapping returns the case mapping for the given case type.
84func (r *runeInfo) mapping(c info) string {
85	if r.HasSpecial {
86		return string(r.Special[c])
87	}
88	if len(r.Simple[c]) != 0 {
89		return string(r.Simple[c])
90	}
91	return string(r.Rune)
92}
93
94func parse(file string, f func(p *ucd.Parser)) {
95	ucd.Parse(gen.OpenUCDFile(file), f)
96}
97
98func parseUCD() []runeInfo {
99	chars := make([]runeInfo, unicode.MaxRune)
100
101	get := func(r rune) *runeInfo {
102		c := &chars[r]
103		c.Rune = r
104		return c
105	}
106
107	parse("UnicodeData.txt", func(p *ucd.Parser) {
108		ri := get(p.Rune(0))
109		ri.CCC = byte(p.Int(ucd.CanonicalCombiningClass))
110		ri.Simple[cLower] = p.Runes(ucd.SimpleLowercaseMapping)
111		ri.Simple[cUpper] = p.Runes(ucd.SimpleUppercaseMapping)
112		ri.Simple[cTitle] = p.Runes(ucd.SimpleTitlecaseMapping)
113		if p.String(ucd.GeneralCategory) == "Lt" {
114			ri.CaseMode = cTitle
115		}
116	})
117
118	// <code>; <property>
119	parse("PropList.txt", func(p *ucd.Parser) {
120		if p.String(1) == "Soft_Dotted" {
121			chars[p.Rune(0)].SoftDotted = true
122		}
123	})
124
125	// <code>; <word break type>
126	parse("DerivedCoreProperties.txt", func(p *ucd.Parser) {
127		ri := get(p.Rune(0))
128		switch p.String(1) {
129		case "Case_Ignorable":
130			ri.CaseIgnorable = true
131		case "Cased":
132			ri.Cased = true
133		case "Lowercase":
134			ri.CaseMode = cLower
135		case "Uppercase":
136			ri.CaseMode = cUpper
137		}
138	})
139
140	// <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)?
141	parse("SpecialCasing.txt", func(p *ucd.Parser) {
142		// We drop all conditional special casing and deal with them manually in
143		// the language-specific case mappers. Rune 0x03A3 is the only one with
144		// a conditional formatting that is not language-specific. However,
145		// dealing with this letter is tricky, especially in a streaming
146		// context, so we deal with it in the Caser for Greek specifically.
147		ri := get(p.Rune(0))
148		if p.String(4) == "" {
149			ri.HasSpecial = true
150			ri.Special[cLower] = p.Runes(1)
151			ri.Special[cTitle] = p.Runes(2)
152			ri.Special[cUpper] = p.Runes(3)
153		} else {
154			ri.Conditional = true
155		}
156	})
157
158	// TODO: Use text breaking according to UAX #29.
159	// <code>; <word break type>
160	parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) {
161		ri := get(p.Rune(0))
162		ri.BreakType = p.String(1)
163
164		// We collapse the word breaking properties onto the categories we need.
165		switch p.String(1) { // TODO: officially we need to canonicalize.
166		case "MidLetter", "MidNumLet", "Single_Quote":
167			ri.BreakCat = breakMid
168			if !ri.CaseIgnorable {
169				// finalSigma relies on the fact that all breakMid runes are
170				// also a Case_Ignorable. Revisit this code when this changes.
171				log.Fatalf("Rune %U, which has a break category mid, is not a case ignorable", ri)
172			}
173		case "ALetter", "Hebrew_Letter", "Numeric", "Extend", "ExtendNumLet", "Format", "ZWJ":
174			ri.BreakCat = breakLetter
175		}
176	})
177
178	// <code>; <type>; <mapping>
179	parse("CaseFolding.txt", func(p *ucd.Parser) {
180		ri := get(p.Rune(0))
181		switch p.String(1) {
182		case "C":
183			ri.FoldSimple = p.Rune(2)
184			ri.FoldFull = p.Runes(2)
185		case "S":
186			ri.FoldSimple = p.Rune(2)
187		case "T":
188			ri.FoldSpecial = p.Rune(2)
189		case "F":
190			ri.FoldFull = p.Runes(2)
191		default:
192			log.Fatalf("%U: unknown type: %s", p.Rune(0), p.String(1))
193		}
194	})
195
196	return chars
197}
198
199func genTables() {
200	chars := parseUCD()
201	verifyProperties(chars)
202
203	t := triegen.NewTrie("case")
204	for i := range chars {
205		c := &chars[i]
206		makeEntry(c)
207		t.Insert(rune(i), uint64(c.entry))
208	}
209
210	w := gen.NewCodeWriter()
211	defer w.WriteVersionedGoFile("tables.go", "cases")
212
213	gen.WriteUnicodeVersion(w)
214
215	// TODO: write CLDR version after adding a mechanism to detect that the
216	// tables on which the manually created locale-sensitive casing code is
217	// based hasn't changed.
218
219	w.WriteVar("xorData", string(xorData))
220	w.WriteVar("exceptions", string(exceptionData))
221
222	sz, err := t.Gen(w, triegen.Compact(&sparseCompacter{}))
223	if err != nil {
224		log.Fatal(err)
225	}
226	w.Size += sz
227}
228
229func makeEntry(ri *runeInfo) {
230	if ri.CaseIgnorable {
231		if ri.Cased {
232			ri.entry = cIgnorableCased
233		} else {
234			ri.entry = cIgnorableUncased
235		}
236	} else {
237		ri.entry = ri.CaseMode
238	}
239
240	// TODO: handle soft-dotted.
241
242	ccc := cccOther
243	switch ri.CCC {
244	case 0: // Not_Reordered
245		ccc = cccZero
246	case above: // Above
247		ccc = cccAbove
248	}
249	switch ri.BreakCat {
250	case breakBreak:
251		ccc = cccBreak
252	case breakMid:
253		ri.entry |= isMidBit
254	}
255
256	ri.entry |= ccc
257
258	if ri.CaseMode == cUncased {
259		return
260	}
261
262	// Need to do something special.
263	if ri.CaseMode == cTitle || ri.HasSpecial || ri.mapping(cTitle) != ri.mapping(cUpper) {
264		makeException(ri)
265		return
266	}
267	if f := string(ri.FoldFull); len(f) > 0 && f != ri.mapping(cUpper) && f != ri.mapping(cLower) {
268		makeException(ri)
269		return
270	}
271
272	// Rune is either lowercase or uppercase.
273
274	orig := string(ri.Rune)
275	mapped := ""
276	if ri.CaseMode == cUpper {
277		mapped = ri.mapping(cLower)
278	} else {
279		mapped = ri.mapping(cUpper)
280	}
281
282	if len(orig) != len(mapped) {
283		makeException(ri)
284		return
285	}
286
287	if string(ri.FoldFull) == ri.mapping(cUpper) {
288		ri.entry |= inverseFoldBit
289	}
290
291	n := len(orig)
292
293	// Create per-byte XOR mask.
294	var b []byte
295	for i := 0; i < n; i++ {
296		b = append(b, orig[i]^mapped[i])
297	}
298
299	// Remove leading 0 bytes, but keep at least one byte.
300	for ; len(b) > 1 && b[0] == 0; b = b[1:] {
301	}
302
303	if len(b) == 1 && b[0]&0xc0 == 0 {
304		ri.entry |= info(b[0]) << xorShift
305		return
306	}
307
308	key := string(b)
309	x, ok := xorCache[key]
310	if !ok {
311		xorData = append(xorData, 0) // for detecting start of sequence
312		xorData = append(xorData, b...)
313
314		x = len(xorData) - 1
315		xorCache[key] = x
316	}
317	ri.entry |= info(x<<xorShift) | xorIndexBit
318}
319
320var xorCache = map[string]int{}
321
322// xorData contains byte-wise XOR data for the least significant bytes of a
323// UTF-8 encoded rune. An index points to the last byte. The sequence starts
324// with a zero terminator.
325var xorData = []byte{}
326
327// See the comments in gen_trieval.go re "the exceptions slice".
328var exceptionData = []byte{0}
329
330// makeException encodes case mappings that cannot be expressed in a simple
331// XOR diff.
332func makeException(ri *runeInfo) {
333	ccc := ri.entry & cccMask
334	// Set exception bit and retain case type.
335	ri.entry &= 0x0007
336	ri.entry |= exceptionBit
337
338	if len(exceptionData) >= 1<<numExceptionBits {
339		log.Fatalf("%U:exceptionData too large %#x > %d bits", ri.Rune, len(exceptionData), numExceptionBits)
340	}
341
342	// Set the offset in the exceptionData array.
343	ri.entry |= info(len(exceptionData) << exceptionShift)
344
345	orig := string(ri.Rune)
346	tc := ri.mapping(cTitle)
347	uc := ri.mapping(cUpper)
348	lc := ri.mapping(cLower)
349	ff := string(ri.FoldFull)
350
351	// addString sets the length of a string and adds it to the expansions array.
352	addString := func(s string, b *byte) {
353		if len(s) == 0 {
354			// Zero-length mappings exist, but only for conditional casing,
355			// which we are representing outside of this table.
356			log.Fatalf("%U: has zero-length mapping.", ri.Rune)
357		}
358		*b <<= 3
359		if s != orig || ri.CaseMode == cLower {
360			n := len(s)
361			if n > 7 {
362				log.Fatalf("%U: mapping larger than 7 (%d)", ri.Rune, n)
363			}
364			*b |= byte(n)
365			exceptionData = append(exceptionData, s...)
366		}
367	}
368
369	// byte 0:
370	exceptionData = append(exceptionData, byte(ccc)|byte(len(ff)))
371
372	// byte 1:
373	p := len(exceptionData)
374	exceptionData = append(exceptionData, 0)
375
376	if len(ff) > 7 { // May be zero-length.
377		log.Fatalf("%U: fold string larger than 7 (%d)", ri.Rune, len(ff))
378	}
379	exceptionData = append(exceptionData, ff...)
380	ct := ri.CaseMode
381	if ct != cLower {
382		addString(lc, &exceptionData[p])
383	}
384	if ct != cUpper {
385		addString(uc, &exceptionData[p])
386	}
387	if ct != cTitle {
388		addString(tc, &exceptionData[p])
389	}
390}
391
392// sparseCompacter is a trie value block Compacter. There are many cases where
393// successive runes alternate between lower- and upper-case. This Compacter
394// exploits this by adding a special case type where the case value is obtained
395// from or-ing it with the least-significant bit of the rune, creating large
396// ranges of equal case values that compress well.
397type sparseCompacter struct {
398	sparseBlocks  [][]uint16
399	sparseOffsets []uint16
400	sparseCount   int
401}
402
403// makeSparse returns the number of elements that compact block would contain
404// as well as the modified values.
405func makeSparse(vals []uint64) ([]uint16, int) {
406	// Copy the values.
407	values := make([]uint16, len(vals))
408	for i, v := range vals {
409		values[i] = uint16(v)
410	}
411
412	alt := func(i int, v uint16) uint16 {
413		if cm := info(v & fullCasedMask); cm == cUpper || cm == cLower {
414			// Convert cLower or cUpper to cXORCase value, which has the form 11x.
415			xor := v
416			xor &^= 1
417			xor |= uint16(i&1) ^ (v & 1)
418			xor |= 0x4
419			return xor
420		}
421		return v
422	}
423
424	var count int
425	var previous uint16
426	for i, v := range values {
427		if v != 0 {
428			// Try if the unmodified value is equal to the previous.
429			if v == previous {
430				continue
431			}
432
433			// Try if the xor-ed value is equal to the previous value.
434			a := alt(i, v)
435			if a == previous {
436				values[i] = a
437				continue
438			}
439
440			// This is a new value.
441			count++
442
443			// Use the xor-ed value if it will be identical to the next value.
444			if p := i + 1; p < len(values) && alt(p, values[p]) == a {
445				values[i] = a
446				v = a
447			}
448		}
449		previous = v
450	}
451	return values, count
452}
453
454func (s *sparseCompacter) Size(v []uint64) (int, bool) {
455	_, n := makeSparse(v)
456
457	// We limit using this method to having 16 entries.
458	if n > 16 {
459		return 0, false
460	}
461
462	return 2 + int(reflect.TypeOf(valueRange{}).Size())*n, true
463}
464
465func (s *sparseCompacter) Store(v []uint64) uint32 {
466	h := uint32(len(s.sparseOffsets))
467	values, sz := makeSparse(v)
468	s.sparseBlocks = append(s.sparseBlocks, values)
469	s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount))
470	s.sparseCount += sz
471	return h
472}
473
474func (s *sparseCompacter) Handler() string {
475	// The sparse global variable and its lookup method is defined in gen_trieval.go.
476	return "sparse.lookup"
477}
478
479func (s *sparseCompacter) Print(w io.Writer) (retErr error) {
480	p := func(format string, args ...interface{}) {
481		_, err := fmt.Fprintf(w, format, args...)
482		if retErr == nil && err != nil {
483			retErr = err
484		}
485	}
486
487	ls := len(s.sparseBlocks)
488	if ls == len(s.sparseOffsets) {
489		s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount))
490	}
491	p("// sparseOffsets: %d entries, %d bytes\n", ls+1, (ls+1)*2)
492	p("var sparseOffsets = %#v\n\n", s.sparseOffsets)
493
494	ns := s.sparseCount
495	p("// sparseValues: %d entries, %d bytes\n", ns, ns*4)
496	p("var sparseValues = [%d]valueRange {", ns)
497	for i, values := range s.sparseBlocks {
498		p("\n// Block %#x, offset %#x", i, s.sparseOffsets[i])
499		var v uint16
500		for i, nv := range values {
501			if nv != v {
502				if v != 0 {
503					p(",hi:%#02x},", 0x80+i-1)
504				}
505				if nv != 0 {
506					p("\n{value:%#04x,lo:%#02x", nv, 0x80+i)
507				}
508			}
509			v = nv
510		}
511		if v != 0 {
512			p(",hi:%#02x},", 0x80+len(values)-1)
513		}
514	}
515	p("\n}\n\n")
516	return
517}
518
519// verifyProperties that properties of the runes that are relied upon in the
520// implementation. Each property is marked with an identifier that is referred
521// to in the places where it is used.
522func verifyProperties(chars []runeInfo) {
523	for i, c := range chars {
524		r := rune(i)
525
526		// Rune properties.
527
528		// A.1: modifier never changes on lowercase. [ltLower]
529		if c.CCC > 0 && unicode.ToLower(r) != r {
530			log.Fatalf("%U: non-starter changes when lowercased", r)
531		}
532
533		// A.2: properties of decompositions starting with I or J. [ltLower]
534		d := norm.NFD.PropertiesString(string(r)).Decomposition()
535		if len(d) > 0 {
536			if d[0] == 'I' || d[0] == 'J' {
537				// A.2.1: we expect at least an ASCII character and a modifier.
538				if len(d) < 3 {
539					log.Fatalf("%U: length of decomposition was %d; want >= 3", r, len(d))
540				}
541
542				// All subsequent runes are modifiers and all have the same CCC.
543				runes := []rune(string(d[1:]))
544				ccc := chars[runes[0]].CCC
545
546				for _, mr := range runes[1:] {
547					mc := chars[mr]
548
549					// A.2.2: all modifiers have a CCC of Above or less.
550					if ccc == 0 || ccc > above {
551						log.Fatalf("%U: CCC of successive rune (%U) was %d; want (0,230]", r, mr, ccc)
552					}
553
554					// A.2.3: a sequence of modifiers all have the same CCC.
555					if mc.CCC != ccc {
556						log.Fatalf("%U: CCC of follow-up modifier (%U) was %d; want %d", r, mr, mc.CCC, ccc)
557					}
558
559					// A.2.4: for each trailing r, r in [0x300, 0x311] <=> CCC == Above.
560					if (ccc == above) != (0x300 <= mr && mr <= 0x311) {
561						log.Fatalf("%U: modifier %U in [U+0300, U+0311] != ccc(%U) == 230", r, mr, mr)
562					}
563
564					if i += len(string(mr)); i >= len(d) {
565						break
566					}
567				}
568			}
569		}
570
571		// A.3: no U+0307 in decomposition of Soft-Dotted rune. [ltUpper]
572		if unicode.Is(unicode.Soft_Dotted, r) && strings.Contains(string(d), "\u0307") {
573			log.Fatalf("%U: decomposition of soft-dotted rune may not contain U+0307", r)
574		}
575
576		// A.4: only rune U+0345 may be of CCC Iota_Subscript. [elUpper]
577		if c.CCC == iotaSubscript && r != 0x0345 {
578			log.Fatalf("%U: only rune U+0345 may have CCC Iota_Subscript", r)
579		}
580
581		// A.5: soft-dotted runes do not have exceptions.
582		if c.SoftDotted && c.entry&exceptionBit != 0 {
583			log.Fatalf("%U: soft-dotted has exception", r)
584		}
585
586		// A.6: Greek decomposition. [elUpper]
587		if unicode.Is(unicode.Greek, r) {
588			if b := norm.NFD.PropertiesString(string(r)).Decomposition(); b != nil {
589				runes := []rune(string(b))
590				// A.6.1: If a Greek rune decomposes and the first rune of the
591				// decomposition is greater than U+00FF, the rune is always
592				// great and not a modifier.
593				if f := runes[0]; unicode.IsMark(f) || f > 0xFF && !unicode.Is(unicode.Greek, f) {
594					log.Fatalf("%U: expected first rune of Greek decomposition to be letter, found %U", r, f)
595				}
596				// A.6.2: Any follow-up rune in a Greek decomposition is a
597				// modifier of which the first should be gobbled in
598				// decomposition.
599				for _, m := range runes[1:] {
600					switch m {
601					case 0x0313, 0x0314, 0x0301, 0x0300, 0x0306, 0x0342, 0x0308, 0x0304, 0x345:
602					default:
603						log.Fatalf("%U: modifier %U is outside of expected Greek modifier set", r, m)
604					}
605				}
606			}
607		}
608
609		// Breaking properties.
610
611		// B.1: all runes with CCC > 0 are of break type Extend.
612		if c.CCC > 0 && c.BreakType != "Extend" {
613			log.Fatalf("%U: CCC == %d, but got break type %s; want Extend", r, c.CCC, c.BreakType)
614		}
615
616		// B.2: all cased runes with c.CCC == 0 are of break type ALetter.
617		if c.CCC == 0 && c.Cased && c.BreakType != "ALetter" {
618			log.Fatalf("%U: cased, but got break type %s; want ALetter", r, c.BreakType)
619		}
620
621		// B.3: letter category.
622		if c.CCC == 0 && c.BreakCat != breakBreak && !c.CaseIgnorable {
623			if c.BreakCat != breakLetter {
624				log.Fatalf("%U: check for letter break type gave %d; want %d", r, c.BreakCat, breakLetter)
625			}
626		}
627	}
628}
629
630func genTablesTest() {
631	w := &bytes.Buffer{}
632
633	fmt.Fprintln(w, "var (")
634	printProperties(w, "DerivedCoreProperties.txt", "Case_Ignorable", verifyIgnore)
635
636	// We discard the output as we know we have perfect functions. We run them
637	// just to verify the properties are correct.
638	n := printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Cased", verifyCased)
639	n += printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Lowercase", verifyLower)
640	n += printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Uppercase", verifyUpper)
641	if n > 0 {
642		log.Fatalf("One of the discarded properties does not have a perfect filter.")
643	}
644
645	// <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)?
646	fmt.Fprintln(w, "\tspecial = map[rune]struct{ toLower, toTitle, toUpper string }{")
647	parse("SpecialCasing.txt", func(p *ucd.Parser) {
648		// Skip conditional entries.
649		if p.String(4) != "" {
650			return
651		}
652		r := p.Rune(0)
653		fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n",
654			r, string(p.Runes(1)), string(p.Runes(2)), string(p.Runes(3)))
655	})
656	fmt.Fprint(w, "\t}\n\n")
657
658	// <code>; <type>; <runes>
659	table := map[rune]struct{ simple, full, special string }{}
660	parse("CaseFolding.txt", func(p *ucd.Parser) {
661		r := p.Rune(0)
662		t := p.String(1)
663		v := string(p.Runes(2))
664		if t != "T" && v == string(unicode.ToLower(r)) {
665			return
666		}
667		x := table[r]
668		switch t {
669		case "C":
670			x.full = v
671			x.simple = v
672		case "S":
673			x.simple = v
674		case "F":
675			x.full = v
676		case "T":
677			x.special = v
678		}
679		table[r] = x
680	})
681	fmt.Fprintln(w, "\tfoldMap = map[rune]struct{ simple, full, special string }{")
682	for r := rune(0); r < 0x10FFFF; r++ {
683		x, ok := table[r]
684		if !ok {
685			continue
686		}
687		fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n", r, x.simple, x.full, x.special)
688	}
689	fmt.Fprint(w, "\t}\n\n")
690
691	// Break property
692	notBreak := map[rune]bool{}
693	parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) {
694		switch p.String(1) {
695		case "Extend", "Format", "MidLetter", "MidNumLet", "Single_Quote",
696			"ALetter", "Hebrew_Letter", "Numeric", "ExtendNumLet", "ZWJ":
697			notBreak[p.Rune(0)] = true
698		}
699	})
700
701	fmt.Fprintln(w, "\tbreakProp = []struct{ lo, hi rune }{")
702	inBreak := false
703	for r := rune(0); r <= lastRuneForTesting; r++ {
704		if isBreak := !notBreak[r]; isBreak != inBreak {
705			if isBreak {
706				fmt.Fprintf(w, "\t\t{0x%x, ", r)
707			} else {
708				fmt.Fprintf(w, "0x%x},\n", r-1)
709			}
710			inBreak = isBreak
711		}
712	}
713	if inBreak {
714		fmt.Fprintf(w, "0x%x},\n", lastRuneForTesting)
715	}
716	fmt.Fprint(w, "\t}\n\n")
717
718	// Word break test
719	// Filter out all samples that do not contain cased characters.
720	cased := map[rune]bool{}
721	parse("DerivedCoreProperties.txt", func(p *ucd.Parser) {
722		if p.String(1) == "Cased" {
723			cased[p.Rune(0)] = true
724		}
725	})
726
727	fmt.Fprintln(w, "\tbreakTest = []string{")
728	parse("auxiliary/WordBreakTest.txt", func(p *ucd.Parser) {
729		c := strings.Split(p.String(0), " ")
730
731		const sep = '|'
732		numCased := 0
733		test := ""
734		for ; len(c) >= 2; c = c[2:] {
735			if c[0] == "÷" && test != "" {
736				test += string(sep)
737			}
738			i, err := strconv.ParseUint(c[1], 16, 32)
739			r := rune(i)
740			if err != nil {
741				log.Fatalf("Invalid rune %q.", c[1])
742			}
743			if r == sep {
744				log.Fatalf("Separator %q not allowed in test data. Pick another one.", sep)
745			}
746			if cased[r] {
747				numCased++
748			}
749			test += string(r)
750		}
751		if numCased > 1 {
752			fmt.Fprintf(w, "\t\t%q,\n", test)
753		}
754	})
755	fmt.Fprintln(w, "\t}")
756
757	fmt.Fprintln(w, ")")
758
759	gen.WriteVersionedGoFile("tables_test.go", "cases", w.Bytes())
760}
761
762// These functions are just used for verification that their definition have not
763// changed in the Unicode Standard.
764
765func verifyCased(r rune) bool {
766	return verifyLower(r) || verifyUpper(r) || unicode.IsTitle(r)
767}
768
769func verifyLower(r rune) bool {
770	return unicode.IsLower(r) || unicode.Is(unicode.Other_Lowercase, r)
771}
772
773func verifyUpper(r rune) bool {
774	return unicode.IsUpper(r) || unicode.Is(unicode.Other_Uppercase, r)
775}
776
777// verifyIgnore is an approximation of the Case_Ignorable property using the
778// core unicode package. It is used to reduce the size of the test data.
779func verifyIgnore(r rune) bool {
780	props := []*unicode.RangeTable{
781		unicode.Mn,
782		unicode.Me,
783		unicode.Cf,
784		unicode.Lm,
785		unicode.Sk,
786	}
787	for _, p := range props {
788		if unicode.Is(p, r) {
789			return true
790		}
791	}
792	return false
793}
794
795// printProperties prints tables of rune properties from the given UCD file.
796// A filter func f can be given to exclude certain values. A rune r will have
797// the indicated property if it is in the generated table or if f(r).
798func printProperties(w io.Writer, file, property string, f func(r rune) bool) int {
799	verify := map[rune]bool{}
800	n := 0
801	varNameParts := strings.Split(property, "_")
802	varNameParts[0] = strings.ToLower(varNameParts[0])
803	fmt.Fprintf(w, "\t%s = map[rune]bool{\n", strings.Join(varNameParts, ""))
804	parse(file, func(p *ucd.Parser) {
805		if p.String(1) == property {
806			r := p.Rune(0)
807			verify[r] = true
808			if !f(r) {
809				n++
810				fmt.Fprintf(w, "\t\t0x%.4x: true,\n", r)
811			}
812		}
813	})
814	fmt.Fprint(w, "\t}\n\n")
815
816	// Verify that f is correct, that is, it represents a subset of the property.
817	for r := rune(0); r <= lastRuneForTesting; r++ {
818		if !verify[r] && f(r) {
819			log.Fatalf("Incorrect filter func for property %q.", property)
820		}
821	}
822	return n
823}
824
825// The newCaseTrie, sparseValues and sparseOffsets definitions below are
826// placeholders referred to by gen_trieval.go. The real definitions are
827// generated by this program and written to tables.go.
828
829func newCaseTrie(int) int { return 0 }
830
831var (
832	sparseValues  [0]valueRange
833	sparseOffsets [0]uint16
834)
835