1// Copyright 2014 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build ignore
6
7// This program generates the trie for casing operations. The Unicode casing
8// algorithm requires the lookup of various properties and mappings for each
9// rune. The table generated by this generator combines several of the most
10// frequently used of these into a single trie so that they can be accessed
11// with a single lookup.
12package main
13
14import (
15	"bytes"
16	"fmt"
17	"io"
18	"io/ioutil"
19	"log"
20	"reflect"
21	"strconv"
22	"strings"
23	"unicode"
24
25	"golang.org/x/text/internal/gen"
26	"golang.org/x/text/internal/triegen"
27	"golang.org/x/text/internal/ucd"
28	"golang.org/x/text/unicode/norm"
29)
30
// main drives the generator. It initializes the gen package, writes the
// production tables (genTables) and the test tables (genTablesTest), and
// finally hands gen_trieval.go to gen.Repackage with trieval.go and package
// "cases" as the target (presumably producing the copy used by the real
// package; confirm against the gen package).
func main() {
	gen.Init()
	genTables()
	genTablesTest()
	gen.Repackage("gen_trieval.go", "trieval.go", "cases")
}
37
// runeInfo contains all information for a rune that we care about for casing
// operations. parseUCD fills in one runeInfo per code point from the UCD
// files; makeEntry then derives the trie value from it.
type runeInfo struct {
	// Rune is the code point this record describes.
	Rune rune

	entry info // trie value for this rune.

	// CaseMode is the case type of the rune: cTitle for general category Lt,
	// cLower or cUpper for the Lowercase and Uppercase derived properties,
	// and the zero value otherwise.
	CaseMode info

	// Simple case mappings.
	// Indexed by case type (cLower, cUpper, cTitle), read from UnicodeData.txt.
	Simple [1 + maxCaseMode][]rune

	// Special casing
	// HasSpecial is set for unconditional SpecialCasing.txt entries, whose
	// mappings are stored in Special. Conditional marks runes that have a
	// conditional entry; those mappings are not recorded here.
	HasSpecial  bool
	Conditional bool
	Special     [1 + maxCaseMode][]rune

	// Folding
	// FoldSimple holds the mapping of type C or S, FoldSpecial that of type T
	// and FoldFull that of type C or F from CaseFolding.txt.
	FoldSimple  rune
	FoldSpecial rune
	FoldFull    []rune

	// TODO: FC_NFKC, or equivalent data.

	// Properties
	SoftDotted     bool // Soft_Dotted in PropList.txt.
	CaseIgnorable  bool // Case_Ignorable in DerivedCoreProperties.txt.
	Cased          bool // Cased in DerivedCoreProperties.txt.
	DecomposeGreek bool // NOTE(review): not assigned anywhere in this file.
	BreakType      string        // Raw word break property value.
	BreakCat       breakCategory // BreakType collapsed to the categories we need.

	// We care mostly about 0, Above, and IotaSubscript.
	CCC byte
}
73
// breakCategory is the coarse word-break classification that the word break
// property of a rune is collapsed onto in parseUCD.
type breakCategory int

const (
	// breakBreak is the zero value; it applies to every rune whose word
	// break type is not mapped to one of the other two categories.
	breakBreak breakCategory = iota
	// breakLetter covers ALetter, Hebrew_Letter, Numeric, Extend,
	// ExtendNumLet, Format and ZWJ.
	breakLetter
	// breakMid covers MidLetter, MidNumLet and Single_Quote.
	breakMid
)
81
82// mapping returns the case mapping for the given case type.
83func (r *runeInfo) mapping(c info) string {
84	if r.HasSpecial {
85		return string(r.Special[c])
86	}
87	if len(r.Simple[c]) != 0 {
88		return string(r.Simple[c])
89	}
90	return string(r.Rune)
91}
92
93func parse(file string, f func(p *ucd.Parser)) {
94	ucd.Parse(gen.OpenUCDFile(file), f)
95}
96
97func parseUCD() []runeInfo {
98	chars := make([]runeInfo, unicode.MaxRune)
99
100	get := func(r rune) *runeInfo {
101		c := &chars[r]
102		c.Rune = r
103		return c
104	}
105
106	parse("UnicodeData.txt", func(p *ucd.Parser) {
107		ri := get(p.Rune(0))
108		ri.CCC = byte(p.Int(ucd.CanonicalCombiningClass))
109		ri.Simple[cLower] = p.Runes(ucd.SimpleLowercaseMapping)
110		ri.Simple[cUpper] = p.Runes(ucd.SimpleUppercaseMapping)
111		ri.Simple[cTitle] = p.Runes(ucd.SimpleTitlecaseMapping)
112		if p.String(ucd.GeneralCategory) == "Lt" {
113			ri.CaseMode = cTitle
114		}
115	})
116
117	// <code>; <property>
118	parse("PropList.txt", func(p *ucd.Parser) {
119		if p.String(1) == "Soft_Dotted" {
120			chars[p.Rune(0)].SoftDotted = true
121		}
122	})
123
124	// <code>; <word break type>
125	parse("DerivedCoreProperties.txt", func(p *ucd.Parser) {
126		ri := get(p.Rune(0))
127		switch p.String(1) {
128		case "Case_Ignorable":
129			ri.CaseIgnorable = true
130		case "Cased":
131			ri.Cased = true
132		case "Lowercase":
133			ri.CaseMode = cLower
134		case "Uppercase":
135			ri.CaseMode = cUpper
136		}
137	})
138
139	// <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)?
140	parse("SpecialCasing.txt", func(p *ucd.Parser) {
141		// We drop all conditional special casing and deal with them manually in
142		// the language-specific case mappers. Rune 0x03A3 is the only one with
143		// a conditional formatting that is not language-specific. However,
144		// dealing with this letter is tricky, especially in a streaming
145		// context, so we deal with it in the Caser for Greek specifically.
146		ri := get(p.Rune(0))
147		if p.String(4) == "" {
148			ri.HasSpecial = true
149			ri.Special[cLower] = p.Runes(1)
150			ri.Special[cTitle] = p.Runes(2)
151			ri.Special[cUpper] = p.Runes(3)
152		} else {
153			ri.Conditional = true
154		}
155	})
156
157	// TODO: Use text breaking according to UAX #29.
158	// <code>; <word break type>
159	parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) {
160		ri := get(p.Rune(0))
161		ri.BreakType = p.String(1)
162
163		// We collapse the word breaking properties onto the categories we need.
164		switch p.String(1) { // TODO: officially we need to canonicalize.
165		case "MidLetter", "MidNumLet", "Single_Quote":
166			ri.BreakCat = breakMid
167			if !ri.CaseIgnorable {
168				// finalSigma relies on the fact that all breakMid runes are
169				// also a Case_Ignorable. Revisit this code when this changes.
170				log.Fatalf("Rune %U, which has a break category mid, is not a case ignorable", ri)
171			}
172		case "ALetter", "Hebrew_Letter", "Numeric", "Extend", "ExtendNumLet", "Format", "ZWJ":
173			ri.BreakCat = breakLetter
174		}
175	})
176
177	// <code>; <type>; <mapping>
178	parse("CaseFolding.txt", func(p *ucd.Parser) {
179		ri := get(p.Rune(0))
180		switch p.String(1) {
181		case "C":
182			ri.FoldSimple = p.Rune(2)
183			ri.FoldFull = p.Runes(2)
184		case "S":
185			ri.FoldSimple = p.Rune(2)
186		case "T":
187			ri.FoldSpecial = p.Rune(2)
188		case "F":
189			ri.FoldFull = p.Runes(2)
190		default:
191			log.Fatalf("%U: unknown type: %s", p.Rune(0), p.String(1))
192		}
193	})
194
195	return chars
196}
197
198func genTables() {
199	chars := parseUCD()
200	verifyProperties(chars)
201
202	t := triegen.NewTrie("case")
203	for i := range chars {
204		c := &chars[i]
205		makeEntry(c)
206		t.Insert(rune(i), uint64(c.entry))
207	}
208
209	w := gen.NewCodeWriter()
210	defer w.WriteVersionedGoFile("tables.go", "cases")
211
212	gen.WriteUnicodeVersion(w)
213
214	// TODO: write CLDR version after adding a mechanism to detect that the
215	// tables on which the manually created locale-sensitive casing code is
216	// based hasn't changed.
217
218	w.WriteVar("xorData", string(xorData))
219	w.WriteVar("exceptions", string(exceptionData))
220
221	sz, err := t.Gen(w, triegen.Compact(&sparseCompacter{}))
222	if err != nil {
223		log.Fatal(err)
224	}
225	w.Size += sz
226}
227
228func makeEntry(ri *runeInfo) {
229	if ri.CaseIgnorable {
230		if ri.Cased {
231			ri.entry = cIgnorableCased
232		} else {
233			ri.entry = cIgnorableUncased
234		}
235	} else {
236		ri.entry = ri.CaseMode
237	}
238
239	// TODO: handle soft-dotted.
240
241	ccc := cccOther
242	switch ri.CCC {
243	case 0: // Not_Reordered
244		ccc = cccZero
245	case above: // Above
246		ccc = cccAbove
247	}
248	switch ri.BreakCat {
249	case breakBreak:
250		ccc = cccBreak
251	case breakMid:
252		ri.entry |= isMidBit
253	}
254
255	ri.entry |= ccc
256
257	if ri.CaseMode == cUncased {
258		return
259	}
260
261	// Need to do something special.
262	if ri.CaseMode == cTitle || ri.HasSpecial || ri.mapping(cTitle) != ri.mapping(cUpper) {
263		makeException(ri)
264		return
265	}
266	if f := string(ri.FoldFull); len(f) > 0 && f != ri.mapping(cUpper) && f != ri.mapping(cLower) {
267		makeException(ri)
268		return
269	}
270
271	// Rune is either lowercase or uppercase.
272
273	orig := string(ri.Rune)
274	mapped := ""
275	if ri.CaseMode == cUpper {
276		mapped = ri.mapping(cLower)
277	} else {
278		mapped = ri.mapping(cUpper)
279	}
280
281	if len(orig) != len(mapped) {
282		makeException(ri)
283		return
284	}
285
286	if string(ri.FoldFull) == ri.mapping(cUpper) {
287		ri.entry |= inverseFoldBit
288	}
289
290	n := len(orig)
291
292	// Create per-byte XOR mask.
293	var b []byte
294	for i := 0; i < n; i++ {
295		b = append(b, orig[i]^mapped[i])
296	}
297
298	// Remove leading 0 bytes, but keep at least one byte.
299	for ; len(b) > 1 && b[0] == 0; b = b[1:] {
300	}
301
302	if len(b) == 1 && b[0]&0xc0 == 0 {
303		ri.entry |= info(b[0]) << xorShift
304		return
305	}
306
307	key := string(b)
308	x, ok := xorCache[key]
309	if !ok {
310		xorData = append(xorData, 0) // for detecting start of sequence
311		xorData = append(xorData, b...)
312
313		x = len(xorData) - 1
314		xorCache[key] = x
315	}
316	ri.entry |= info(x<<xorShift) | xorIndexBit
317}
318
// xorCache maps an XOR pattern (as a string of bytes) to the index in xorData
// under which makeEntry stored it, so that each pattern is stored only once.
var xorCache = map[string]int{}

// xorData contains byte-wise XOR data for the least significant bytes of a
// UTF-8 encoded rune. An index points to the last byte. The sequence starts
// with a zero terminator.
var xorData = []byte{}

// See the comments in gen_trieval.go re "the exceptions slice".
// The slice starts out with a single zero byte, so the first entry that
// makeException appends is at offset 1.
var exceptionData = []byte{0}
328
329// makeException encodes case mappings that cannot be expressed in a simple
330// XOR diff.
331func makeException(ri *runeInfo) {
332	ccc := ri.entry & cccMask
333	// Set exception bit and retain case type.
334	ri.entry &= 0x0007
335	ri.entry |= exceptionBit
336
337	if len(exceptionData) >= 1<<numExceptionBits {
338		log.Fatalf("%U:exceptionData too large %#x > %d bits", ri.Rune, len(exceptionData), numExceptionBits)
339	}
340
341	// Set the offset in the exceptionData array.
342	ri.entry |= info(len(exceptionData) << exceptionShift)
343
344	orig := string(ri.Rune)
345	tc := ri.mapping(cTitle)
346	uc := ri.mapping(cUpper)
347	lc := ri.mapping(cLower)
348	ff := string(ri.FoldFull)
349
350	// addString sets the length of a string and adds it to the expansions array.
351	addString := func(s string, b *byte) {
352		if len(s) == 0 {
353			// Zero-length mappings exist, but only for conditional casing,
354			// which we are representing outside of this table.
355			log.Fatalf("%U: has zero-length mapping.", ri.Rune)
356		}
357		*b <<= 3
358		if s != orig || ri.CaseMode == cLower {
359			n := len(s)
360			if n > 7 {
361				log.Fatalf("%U: mapping larger than 7 (%d)", ri.Rune, n)
362			}
363			*b |= byte(n)
364			exceptionData = append(exceptionData, s...)
365		}
366	}
367
368	// byte 0:
369	exceptionData = append(exceptionData, byte(ccc)|byte(len(ff)))
370
371	// byte 1:
372	p := len(exceptionData)
373	exceptionData = append(exceptionData, 0)
374
375	if len(ff) > 7 { // May be zero-length.
376		log.Fatalf("%U: fold string larger than 7 (%d)", ri.Rune, len(ff))
377	}
378	exceptionData = append(exceptionData, ff...)
379	ct := ri.CaseMode
380	if ct != cLower {
381		addString(lc, &exceptionData[p])
382	}
383	if ct != cUpper {
384		addString(uc, &exceptionData[p])
385	}
386	if ct != cTitle {
387		addString(tc, &exceptionData[p])
388	}
389}
390
// sparseCompacter is a trie value block Compacter. There are many cases where
// successive runes alternate between lower- and upper-case. This Compacter
// exploits this by adding a special case type where the case value is obtained
// from or-ing it with the least-significant bit of the rune, creating large
// ranges of equal case values that compress well.
type sparseCompacter struct {
	// sparseBlocks holds the (possibly rewritten) values of each stored block.
	sparseBlocks [][]uint16
	// sparseOffsets holds, per stored block, the value of sparseCount at the
	// time the block was stored.
	sparseOffsets []uint16
	// sparseCount is the total number of value ranges of all stored blocks.
	sparseCount int
}
401
402// makeSparse returns the number of elements that compact block would contain
403// as well as the modified values.
404func makeSparse(vals []uint64) ([]uint16, int) {
405	// Copy the values.
406	values := make([]uint16, len(vals))
407	for i, v := range vals {
408		values[i] = uint16(v)
409	}
410
411	alt := func(i int, v uint16) uint16 {
412		if cm := info(v & fullCasedMask); cm == cUpper || cm == cLower {
413			// Convert cLower or cUpper to cXORCase value, which has the form 11x.
414			xor := v
415			xor &^= 1
416			xor |= uint16(i&1) ^ (v & 1)
417			xor |= 0x4
418			return xor
419		}
420		return v
421	}
422
423	var count int
424	var previous uint16
425	for i, v := range values {
426		if v != 0 {
427			// Try if the unmodified value is equal to the previous.
428			if v == previous {
429				continue
430			}
431
432			// Try if the xor-ed value is equal to the previous value.
433			a := alt(i, v)
434			if a == previous {
435				values[i] = a
436				continue
437			}
438
439			// This is a new value.
440			count++
441
442			// Use the xor-ed value if it will be identical to the next value.
443			if p := i + 1; p < len(values) && alt(p, values[p]) == a {
444				values[i] = a
445				v = a
446			}
447		}
448		previous = v
449	}
450	return values, count
451}
452
453func (s *sparseCompacter) Size(v []uint64) (int, bool) {
454	_, n := makeSparse(v)
455
456	// We limit using this method to having 16 entries.
457	if n > 16 {
458		return 0, false
459	}
460
461	return 2 + int(reflect.TypeOf(valueRange{}).Size())*n, true
462}
463
464func (s *sparseCompacter) Store(v []uint64) uint32 {
465	h := uint32(len(s.sparseOffsets))
466	values, sz := makeSparse(v)
467	s.sparseBlocks = append(s.sparseBlocks, values)
468	s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount))
469	s.sparseCount += sz
470	return h
471}
472
473func (s *sparseCompacter) Handler() string {
474	// The sparse global variable and its lookup method is defined in gen_trieval.go.
475	return "sparse.lookup"
476}
477
478func (s *sparseCompacter) Print(w io.Writer) (retErr error) {
479	p := func(format string, args ...interface{}) {
480		_, err := fmt.Fprintf(w, format, args...)
481		if retErr == nil && err != nil {
482			retErr = err
483		}
484	}
485
486	ls := len(s.sparseBlocks)
487	if ls == len(s.sparseOffsets) {
488		s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount))
489	}
490	p("// sparseOffsets: %d entries, %d bytes\n", ls+1, (ls+1)*2)
491	p("var sparseOffsets = %#v\n\n", s.sparseOffsets)
492
493	ns := s.sparseCount
494	p("// sparseValues: %d entries, %d bytes\n", ns, ns*4)
495	p("var sparseValues = [%d]valueRange {", ns)
496	for i, values := range s.sparseBlocks {
497		p("\n// Block %#x, offset %#x", i, s.sparseOffsets[i])
498		var v uint16
499		for i, nv := range values {
500			if nv != v {
501				if v != 0 {
502					p(",hi:%#02x},", 0x80+i-1)
503				}
504				if nv != 0 {
505					p("\n{value:%#04x,lo:%#02x", nv, 0x80+i)
506				}
507			}
508			v = nv
509		}
510		if v != 0 {
511			p(",hi:%#02x},", 0x80+len(values)-1)
512		}
513	}
514	p("\n}\n\n")
515	return
516}
517
518// verifyProperties that properties of the runes that are relied upon in the
519// implementation. Each property is marked with an identifier that is referred
520// to in the places where it is used.
521func verifyProperties(chars []runeInfo) {
522	for i, c := range chars {
523		r := rune(i)
524
525		// Rune properties.
526
527		// A.1: modifier never changes on lowercase. [ltLower]
528		if c.CCC > 0 && unicode.ToLower(r) != r {
529			log.Fatalf("%U: non-starter changes when lowercased", r)
530		}
531
532		// A.2: properties of decompositions starting with I or J. [ltLower]
533		d := norm.NFD.PropertiesString(string(r)).Decomposition()
534		if len(d) > 0 {
535			if d[0] == 'I' || d[0] == 'J' {
536				// A.2.1: we expect at least an ASCII character and a modifier.
537				if len(d) < 3 {
538					log.Fatalf("%U: length of decomposition was %d; want >= 3", r, len(d))
539				}
540
541				// All subsequent runes are modifiers and all have the same CCC.
542				runes := []rune(string(d[1:]))
543				ccc := chars[runes[0]].CCC
544
545				for _, mr := range runes[1:] {
546					mc := chars[mr]
547
548					// A.2.2: all modifiers have a CCC of Above or less.
549					if ccc == 0 || ccc > above {
550						log.Fatalf("%U: CCC of successive rune (%U) was %d; want (0,230]", r, mr, ccc)
551					}
552
553					// A.2.3: a sequence of modifiers all have the same CCC.
554					if mc.CCC != ccc {
555						log.Fatalf("%U: CCC of follow-up modifier (%U) was %d; want %d", r, mr, mc.CCC, ccc)
556					}
557
558					// A.2.4: for each trailing r, r in [0x300, 0x311] <=> CCC == Above.
559					if (ccc == above) != (0x300 <= mr && mr <= 0x311) {
560						log.Fatalf("%U: modifier %U in [U+0300, U+0311] != ccc(%U) == 230", r, mr, mr)
561					}
562
563					if i += len(string(mr)); i >= len(d) {
564						break
565					}
566				}
567			}
568		}
569
570		// A.3: no U+0307 in decomposition of Soft-Dotted rune. [ltUpper]
571		if unicode.Is(unicode.Soft_Dotted, r) && strings.Contains(string(d), "\u0307") {
572			log.Fatalf("%U: decomposition of soft-dotted rune may not contain U+0307", r)
573		}
574
575		// A.4: only rune U+0345 may be of CCC Iota_Subscript. [elUpper]
576		if c.CCC == iotaSubscript && r != 0x0345 {
577			log.Fatalf("%U: only rune U+0345 may have CCC Iota_Subscript", r)
578		}
579
580		// A.5: soft-dotted runes do not have exceptions.
581		if c.SoftDotted && c.entry&exceptionBit != 0 {
582			log.Fatalf("%U: soft-dotted has exception", r)
583		}
584
585		// A.6: Greek decomposition. [elUpper]
586		if unicode.Is(unicode.Greek, r) {
587			if b := norm.NFD.PropertiesString(string(r)).Decomposition(); b != nil {
588				runes := []rune(string(b))
589				// A.6.1: If a Greek rune decomposes and the first rune of the
590				// decomposition is greater than U+00FF, the rune is always
591				// great and not a modifier.
592				if f := runes[0]; unicode.IsMark(f) || f > 0xFF && !unicode.Is(unicode.Greek, f) {
593					log.Fatalf("%U: expected first rune of Greek decomposition to be letter, found %U", r, f)
594				}
595				// A.6.2: Any follow-up rune in a Greek decomposition is a
596				// modifier of which the first should be gobbled in
597				// decomposition.
598				for _, m := range runes[1:] {
599					switch m {
600					case 0x0313, 0x0314, 0x0301, 0x0300, 0x0306, 0x0342, 0x0308, 0x0304, 0x345:
601					default:
602						log.Fatalf("%U: modifier %U is outside of expected Greek modifier set", r, m)
603					}
604				}
605			}
606		}
607
608		// Breaking properties.
609
610		// B.1: all runes with CCC > 0 are of break type Extend.
611		if c.CCC > 0 && c.BreakType != "Extend" {
612			log.Fatalf("%U: CCC == %d, but got break type %s; want Extend", r, c.CCC, c.BreakType)
613		}
614
615		// B.2: all cased runes with c.CCC == 0 are of break type ALetter.
616		if c.CCC == 0 && c.Cased && c.BreakType != "ALetter" {
617			log.Fatalf("%U: cased, but got break type %s; want ALetter", r, c.BreakType)
618		}
619
620		// B.3: letter category.
621		if c.CCC == 0 && c.BreakCat != breakBreak && !c.CaseIgnorable {
622			if c.BreakCat != breakLetter {
623				log.Fatalf("%U: check for letter break type gave %d; want %d", r, c.BreakCat, breakLetter)
624			}
625		}
626	}
627}
628
629func genTablesTest() {
630	w := &bytes.Buffer{}
631
632	fmt.Fprintln(w, "var (")
633	printProperties(w, "DerivedCoreProperties.txt", "Case_Ignorable", verifyIgnore)
634
635	// We discard the output as we know we have perfect functions. We run them
636	// just to verify the properties are correct.
637	n := printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Cased", verifyCased)
638	n += printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Lowercase", verifyLower)
639	n += printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Uppercase", verifyUpper)
640	if n > 0 {
641		log.Fatalf("One of the discarded properties does not have a perfect filter.")
642	}
643
644	// <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)?
645	fmt.Fprintln(w, "\tspecial = map[rune]struct{ toLower, toTitle, toUpper string }{")
646	parse("SpecialCasing.txt", func(p *ucd.Parser) {
647		// Skip conditional entries.
648		if p.String(4) != "" {
649			return
650		}
651		r := p.Rune(0)
652		fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n",
653			r, string(p.Runes(1)), string(p.Runes(2)), string(p.Runes(3)))
654	})
655	fmt.Fprint(w, "\t}\n\n")
656
657	// <code>; <type>; <runes>
658	table := map[rune]struct{ simple, full, special string }{}
659	parse("CaseFolding.txt", func(p *ucd.Parser) {
660		r := p.Rune(0)
661		t := p.String(1)
662		v := string(p.Runes(2))
663		if t != "T" && v == string(unicode.ToLower(r)) {
664			return
665		}
666		x := table[r]
667		switch t {
668		case "C":
669			x.full = v
670			x.simple = v
671		case "S":
672			x.simple = v
673		case "F":
674			x.full = v
675		case "T":
676			x.special = v
677		}
678		table[r] = x
679	})
680	fmt.Fprintln(w, "\tfoldMap = map[rune]struct{ simple, full, special string }{")
681	for r := rune(0); r < 0x10FFFF; r++ {
682		x, ok := table[r]
683		if !ok {
684			continue
685		}
686		fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n", r, x.simple, x.full, x.special)
687	}
688	fmt.Fprint(w, "\t}\n\n")
689
690	// Break property
691	notBreak := map[rune]bool{}
692	parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) {
693		switch p.String(1) {
694		case "Extend", "Format", "MidLetter", "MidNumLet", "Single_Quote",
695			"ALetter", "Hebrew_Letter", "Numeric", "ExtendNumLet", "ZWJ":
696			notBreak[p.Rune(0)] = true
697		}
698	})
699
700	fmt.Fprintln(w, "\tbreakProp = []struct{ lo, hi rune }{")
701	inBreak := false
702	for r := rune(0); r <= lastRuneForTesting; r++ {
703		if isBreak := !notBreak[r]; isBreak != inBreak {
704			if isBreak {
705				fmt.Fprintf(w, "\t\t{0x%x, ", r)
706			} else {
707				fmt.Fprintf(w, "0x%x},\n", r-1)
708			}
709			inBreak = isBreak
710		}
711	}
712	if inBreak {
713		fmt.Fprintf(w, "0x%x},\n", lastRuneForTesting)
714	}
715	fmt.Fprint(w, "\t}\n\n")
716
717	// Word break test
718	// Filter out all samples that do not contain cased characters.
719	cased := map[rune]bool{}
720	parse("DerivedCoreProperties.txt", func(p *ucd.Parser) {
721		if p.String(1) == "Cased" {
722			cased[p.Rune(0)] = true
723		}
724	})
725
726	fmt.Fprintln(w, "\tbreakTest = []string{")
727	parse("auxiliary/WordBreakTest.txt", func(p *ucd.Parser) {
728		c := strings.Split(p.String(0), " ")
729
730		const sep = '|'
731		numCased := 0
732		test := ""
733		for ; len(c) >= 2; c = c[2:] {
734			if c[0] == "÷" && test != "" {
735				test += string(sep)
736			}
737			i, err := strconv.ParseUint(c[1], 16, 32)
738			r := rune(i)
739			if err != nil {
740				log.Fatalf("Invalid rune %q.", c[1])
741			}
742			if r == sep {
743				log.Fatalf("Separator %q not allowed in test data. Pick another one.", sep)
744			}
745			if cased[r] {
746				numCased++
747			}
748			test += string(r)
749		}
750		if numCased > 1 {
751			fmt.Fprintf(w, "\t\t%q,\n", test)
752		}
753	})
754	fmt.Fprintln(w, "\t}")
755
756	fmt.Fprintln(w, ")")
757
758	gen.WriteVersionedGoFile("tables_test.go", "cases", w.Bytes())
759}
760
// These functions are just used for verification that their definitions have
// not changed in the Unicode Standard.
763
764func verifyCased(r rune) bool {
765	return verifyLower(r) || verifyUpper(r) || unicode.IsTitle(r)
766}
767
// verifyLower reports whether r is a lowercase letter or has the
// Other_Lowercase property.
func verifyLower(r rune) bool {
	if unicode.IsLower(r) {
		return true
	}
	return unicode.Is(unicode.Other_Lowercase, r)
}
771
// verifyUpper reports whether r is an uppercase letter or has the
// Other_Uppercase property.
func verifyUpper(r rune) bool {
	if unicode.IsUpper(r) {
		return true
	}
	return unicode.Is(unicode.Other_Uppercase, r)
}
775
// verifyIgnore is an approximation of the Case_Ignorable property using the
// core unicode package. It is used to reduce the size of the test data. A
// rune matches if it is in one of the categories Mn, Me, Cf, Lm or Sk.
func verifyIgnore(r rune) bool {
	return unicode.In(r,
		unicode.Mn,
		unicode.Me,
		unicode.Cf,
		unicode.Lm,
		unicode.Sk,
	)
}
793
794// printProperties prints tables of rune properties from the given UCD file.
795// A filter func f can be given to exclude certain values. A rune r will have
796// the indicated property if it is in the generated table or if f(r).
797func printProperties(w io.Writer, file, property string, f func(r rune) bool) int {
798	verify := map[rune]bool{}
799	n := 0
800	varNameParts := strings.Split(property, "_")
801	varNameParts[0] = strings.ToLower(varNameParts[0])
802	fmt.Fprintf(w, "\t%s = map[rune]bool{\n", strings.Join(varNameParts, ""))
803	parse(file, func(p *ucd.Parser) {
804		if p.String(1) == property {
805			r := p.Rune(0)
806			verify[r] = true
807			if !f(r) {
808				n++
809				fmt.Fprintf(w, "\t\t0x%.4x: true,\n", r)
810			}
811		}
812	})
813	fmt.Fprint(w, "\t}\n\n")
814
815	// Verify that f is correct, that is, it represents a subset of the property.
816	for r := rune(0); r <= lastRuneForTesting; r++ {
817		if !verify[r] && f(r) {
818			log.Fatalf("Incorrect filter func for property %q.", property)
819		}
820	}
821	return n
822}
823
// The newCaseTrie, sparseValues and sparseOffsets definitions below are
// placeholders referred to by gen_trieval.go. The real definitions are
// generated by this program and written to tables.go.

// newCaseTrie is a placeholder that ignores its argument and returns 0.
func newCaseTrie(int) int { return 0 }

// Empty placeholder tables.
var (
	sparseValues  [0]valueRange
	sparseOffsets [0]uint16
)
834