1// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package norm
6
7import (
8	"bytes"
9	"flag"
10	"fmt"
11	"io"
12	"io/ioutil"
13	"log"
14	"os"
15	"os/exec"
16	"path/filepath"
17	"runtime"
18	"strings"
19	"testing"
20	"unicode/utf8"
21
22	"golang.org/x/text/internal/testtext"
23	"golang.org/x/text/transform"
24)
25
26var (
27	testn = flag.Int("testn", -1, "specific test number to run or -1 for all")
28)
29
30// pc replaces any rune r that is repeated n times, for n > 1, with r{n}.
31func pc(s string) []byte {
32	b := bytes.NewBuffer(make([]byte, 0, len(s)))
33	for i := 0; i < len(s); {
34		r, sz := utf8.DecodeRuneInString(s[i:])
35		n := 0
36		if sz == 1 {
37			// Special-case single bytes to handle repetition of invalid UTF-8.
38			for c := s[i]; i+n < len(s) && s[i+n] == c; n++ {
39			}
40		} else {
41			for _, r2 := range s[i:] {
42				if r2 != r {
43					break
44				}
45				n++
46			}
47		}
48		b.WriteString(s[i : i+sz])
49		if n > 1 {
50			fmt.Fprintf(b, "{%d}", n)
51		}
52		i += sz * n
53	}
54	return b.Bytes()
55}
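
// A hedged illustration (not part of the upstream suite) of the notation pc
// produces: repeated runes are compacted to r{n}, and the one-byte branch
// above compacts runs of invalid UTF-8 bytes the same way.
func TestPCSketch(t *testing.T) {
	if got, want := string(pc("aaab")), "a{3}b"; got != want {
		t.Errorf("pc(%q) = %q; want %q", "aaab", got, want)
	}
	if got, want := string(pc("\x80\x80")), "\x80{2}"; got != want {
		t.Errorf("pc(%+q) = %+q; want %+q", "\x80\x80", got, want)
	}
}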
56
57// pidx finds the index at which two strings start to differ, backing up a few
58// bytes to include context. It returns that index, and "..." if it is nonzero.
59func pidx(a, b string) (i int, prefix string) {
60	for ; i < len(a) && i < len(b) && a[i] == b[i]; i++ {
61	}
62	if i < 8 {
63		return 0, ""
64	}
65	i -= 3 // ensure taking at least one full rune before the difference.
66	for k := i - 7; i > k && !utf8.RuneStart(a[i]); i-- {
67	}
68	return i, "..."
69}
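
// A hedged sketch of pidx's contract: differences near the start are reported
// from position 0 without a prefix, while later differences are reported a few
// bytes early (at a rune boundary) together with an ellipsis.
func TestPidxSketch(t *testing.T) {
	if i, p := pidx("abc", "abd"); i != 0 || p != "" {
		t.Errorf(`pidx("abc", "abd") = %d, %q; want 0, ""`, i, p)
	}
	if i, p := pidx("abcdefghij", "abcdefghXY"); i != 5 || p != "..." {
		t.Errorf(`pidx("abcdefghij", "abcdefghXY") = %d, %q; want 5, "..."`, i, p)
	}
}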
70
71type PositionTest struct {
72	input  string
73	pos    int
74	buffer string // expected contents of reorderBuffer, if applicable
75}
76
77type positionFunc func(rb *reorderBuffer, s string) (int, []byte)
78
79func runPosTests(t *testing.T, name string, f Form, fn positionFunc, tests []PositionTest) {
80	rb := reorderBuffer{}
81	rb.init(f, nil)
82	for i, test := range tests {
83		rb.reset()
84		rb.src = inputString(test.input)
85		rb.nsrc = len(test.input)
86		pos, out := fn(&rb, test.input)
87		if pos != test.pos {
88			t.Errorf("%s:%d: position is %d; want %d", name, i, pos, test.pos)
89		}
90		if outs := string(out); outs != test.buffer {
91			k, pfx := pidx(outs, test.buffer)
92			t.Errorf("%s:%d: buffer \nwas  %s%+q; \nwant %s%+q", name, i, pfx, pc(outs[k:]), pfx, pc(test.buffer[k:]))
93		}
94	}
95}
96
97func grave(n int) string {
98	return rep(0x0300, n)
99}
100
101func rep(r rune, n int) string {
102	return strings.Repeat(string(r), n)
103}
104
105const segSize = maxByteBufferSize
106
107var cgj = GraphemeJoiner
108
109var decomposeSegmentTests = []PositionTest{
110	// illegal runes
111	{"\xC2", 0, ""},
112	{"\xC0", 1, "\xC0"},
113	{"\u00E0\x80", 2, "\u0061\u0300"},
114	// starter
115	{"a", 1, "a"},
116	{"ab", 1, "a"},
117	// starter + composing
118	{"a\u0300", 3, "a\u0300"},
119	{"a\u0300b", 3, "a\u0300"},
120	// with decomposition
121	{"\u00C0", 2, "A\u0300"},
122	{"\u00C0b", 2, "A\u0300"},
123	// long
124	{grave(31), 60, grave(30) + cgj},
125	{"a" + grave(31), 61, "a" + grave(30) + cgj},
126
127	// Stability tests: see https://www.unicode.org/review/pr-29.html.
128	// U+0300 COMBINING GRAVE ACCENT;Mn;230;NSM;;;;;N;NON-SPACING GRAVE;;;;
129	// U+0B47 ORIYA VOWEL SIGN E;Mc;0;L;;;;;N;;;;;
130	// U+0B3E ORIYA VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
131	// U+1100 HANGUL CHOSEONG KIYEOK;Lo;0;L;;;;;N;;;;;
132	// U+1161 HANGUL JUNGSEONG A;Lo;0;L;;;;;N;;;;;
133	{"\u0B47\u0300\u0B3E", 8, "\u0B47\u0300\u0B3E"},
134	{"\u1100\u0300\u1161", 8, "\u1100\u0300\u1161"},
135	{"\u0B47\u0B3E", 6, "\u0B47\u0B3E"},
136	{"\u1100\u1161", 6, "\u1100\u1161"},
137
138	// U+0D4A MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;
139	// Sequence of decomposing characters that are starters and modifiers.
140	{"\u0d4a" + strings.Repeat("\u0d3e", 31), 90, "\u0d46" + strings.Repeat("\u0d3e", 30) + cgj},
141
142	{grave(30), 60, grave(30)},
143	// U+FF9E is a starter, but decomposes to U+3099, which is not.
144	{grave(30) + "\uff9e", 60, grave(30) + cgj},
145	// ends with incomplete UTF-8 encoding
146	{"\xCC", 0, ""},
147	{"\u0300\xCC", 2, "\u0300"},
148}
149
150func decomposeSegmentF(rb *reorderBuffer, s string) (int, []byte) {
151	rb.initString(NFD, s)
152	rb.setFlusher(nil, appendFlush)
153	p := decomposeSegment(rb, 0, true)
154	return p, rb.out
155}
156
157func TestDecomposeSegment(t *testing.T) {
158	runPosTests(t, "TestDecomposeSegment", NFC, decomposeSegmentF, decomposeSegmentTests)
159}
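
// A hedged sketch of the stability (PR-29) cases above through the public API:
// an intervening U+0300 blocks composition of U+0B47 + U+0B3E, while the
// unblocked pair still composes to U+0B4B (see also appendTestsNFC below).
func TestStabilitySketch(t *testing.T) {
	if got := NFC.String("\u0b47\u0300\u0b3e"); got != "\u0b47\u0300\u0b3e" {
		t.Errorf("NFC.String(%+q) = %+q; want the input unchanged", "\u0b47\u0300\u0b3e", got)
	}
	if got, want := NFC.String("\u0b47\u0b3e"), "\u0b4b"; got != want {
		t.Errorf("NFC.String(%+q) = %+q; want %+q", "\u0b47\u0b3e", got, want)
	}
}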
160
161var firstBoundaryTests = []PositionTest{
162	// no boundary
163	{"", -1, ""},
164	{"\u0300", -1, ""},
165	{"\x80\x80", -1, ""},
166	// illegal runes
167	{"\xff", 0, ""},
168	{"\u0300\xff", 2, ""},
169	{"\u0300\xc0\x80\x80", 2, ""},
170	// boundaries
171	{"a", 0, ""},
172	{"\u0300a", 2, ""},
173	// Hangul
174	{"\u1103\u1161", 0, ""},
175	{"\u110B\u1173\u11B7", 0, ""},
176	{"\u1161\u110B\u1173\u11B7", 3, ""},
177	{"\u1173\u11B7\u1103\u1161", 6, ""},
178	// too many combining characters.
179	{grave(maxNonStarters - 1), -1, ""},
180	{grave(maxNonStarters), 60, ""},
181	{grave(maxNonStarters + 1), 60, ""},
182}
183
184func firstBoundaryF(rb *reorderBuffer, s string) (int, []byte) {
185	return rb.f.form.FirstBoundary([]byte(s)), nil
186}
187
188func firstBoundaryStringF(rb *reorderBuffer, s string) (int, []byte) {
189	return rb.f.form.FirstBoundaryInString(s), nil
190}
191
192func TestFirstBoundary(t *testing.T) {
193	runPosTests(t, "TestFirstBoundary", NFC, firstBoundaryF, firstBoundaryTests)
194	runPosTests(t, "TestFirstBoundaryInString", NFC, firstBoundaryStringF, firstBoundaryTests)
195}
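
// A hedged usage sketch for the exported boundary API exercised above: the
// first boundary of "\u0300a" is at the 'a', and a string consisting only of
// non-starters has no boundary at all.
func TestFirstBoundarySketch(t *testing.T) {
	if n := NFC.FirstBoundary([]byte("\u0300a")); n != 2 {
		t.Errorf("FirstBoundary = %d; want 2", n)
	}
	if n := NFC.FirstBoundaryInString("\u0300"); n != -1 {
		t.Errorf("FirstBoundaryInString = %d; want -1", n)
	}
}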
196
197func TestNextBoundary(t *testing.T) {
198	testCases := []struct {
199		input string
200		atEOF bool
201		want  int
202	}{
203		// no boundary
204		{"", true, 0},
205		{"", false, -1},
206		{"\u0300", true, 2},
207		{"\u0300", false, -1},
208		{"\x80\x80", true, 1},
209		{"\x80\x80", false, 1},
210		// illegal runes
211		{"\xff", false, 1},
212		{"\u0300\xff", false, 2},
213		{"\u0300\xc0\x80\x80", false, 2},
214		{"\xc2\x80\x80", false, 2},
215		{"\xc2", false, -1},
216		{"\xc2", true, 1},
217		{"a\u0300\xc2", false, -1},
218		{"a\u0300\xc2", true, 3},
219		// boundaries
220		{"a", true, 1},
221		{"a", false, -1},
222		{"aa", false, 1},
223		{"\u0300", true, 2},
224		{"\u0300", false, -1},
225		{"\u0300a", false, 2},
226		// Hangul
227		{"\u1103\u1161", true, 6},
228		{"\u1103\u1161", false, -1},
229		{"\u110B\u1173\u11B7", false, -1},
230		{"\u110B\u1173\u11B7\u110B\u1173\u11B7", false, 9},
231		{"\u1161\u110B\u1173\u11B7", false, 3},
232		{"\u1173\u11B7\u1103\u1161", false, 6},
233		// too many combining characters.
234		{grave(maxNonStarters - 1), false, -1},
235		{grave(maxNonStarters), false, 60},
236		{grave(maxNonStarters + 1), false, 60},
237	}
238
239	for _, tc := range testCases {
240		if got := NFC.NextBoundary([]byte(tc.input), tc.atEOF); got != tc.want {
241			t.Errorf("NextBoundary(%+q, %v) = %d; want %d", tc.input, tc.atEOF, got, tc.want)
242		}
243		if got := NFC.NextBoundaryInString(tc.input, tc.atEOF); got != tc.want {
244			t.Errorf("NextBoundaryInString(%+q, %v) = %d; want %d", tc.input, tc.atEOF, got, tc.want)
245		}
246	}
247}
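
// A hedged sketch of the atEOF semantics in the table above: with atEOF false
// the trailing incomplete rune may still be completed by later input, so no
// boundary can be reported yet; with atEOF true it must start a new segment.
func TestNextBoundaryAtEOFSketch(t *testing.T) {
	if n := NFC.NextBoundary([]byte("a\u0300\xc2"), false); n != -1 {
		t.Errorf("NextBoundary(atEOF=false) = %d; want -1", n)
	}
	if n := NFC.NextBoundaryInString("a\u0300\xc2", true); n != 3 {
		t.Errorf("NextBoundaryInString(atEOF=true) = %d; want 3", n)
	}
}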
248
249var decomposeToLastTests = []PositionTest{
250	// ends with inert character
251	{"Hello!", 6, ""},
252	{"\u0632", 2, ""},
253	{"a\u0301\u0635", 5, ""},
254	// ends with non-inert starter
255	{"a", 0, "a"},
256	{"a\u0301a", 3, "a"},
257	{"a\u0301\u03B9", 3, "\u03B9"},
258	{"a\u0327", 0, "a\u0327"},
259	// illegal runes
260	{"\xFF", 1, ""},
261	{"aa\xFF", 3, ""},
262	{"\xC0\x80\x80", 3, ""},
263	{"\xCC\x80\x80", 3, ""},
264	// ends with incomplete UTF-8 encoding
265	{"a\xCC", 2, ""},
266	// ends with combining characters
267	{"\u0300\u0301", 0, "\u0300\u0301"},
268	{"a\u0300\u0301", 0, "a\u0300\u0301"},
269	{"a\u0301\u0308", 0, "a\u0301\u0308"},
270	{"a\u0308\u0301", 0, "a\u0308\u0301"},
271	{"aaaa\u0300\u0301", 3, "a\u0300\u0301"},
272	{"\u0300a\u0300\u0301", 2, "a\u0300\u0301"},
273	{"\u00C0", 0, "A\u0300"},
274	{"a\u00C0", 1, "A\u0300"},
275	// decomposing
276	{"a\u0300\u00E0", 3, "a\u0300"},
277	// multisegment decompositions (flushes leading segments)
278	{"a\u0300\uFDC0", 7, "\u064A"},
279	{"\uFDC0" + grave(29), 4, "\u064A" + grave(29)},
280	{"\uFDC0" + grave(30), 4, "\u064A" + grave(30)},
281	{"\uFDC0" + grave(31), 5, grave(30)},
282	{"\uFDFA" + grave(14), 31, "\u0645" + grave(14)},
283	// Overflow
284	{"\u00E0" + grave(29), 0, "a" + grave(30)},
285	{"\u00E0" + grave(30), 2, grave(30)},
286	// Hangul
287	{"a\u1103", 1, "\u1103"},
288	{"a\u110B", 1, "\u110B"},
289	{"a\u110B\u1173", 1, "\u110B\u1173"},
290	// See comment in composition.go:compBoundaryAfter.
291	{"a\u110B\u1173\u11B7", 1, "\u110B\u1173\u11B7"},
292	{"a\uC73C", 1, "\u110B\u1173"},
293	{"다음", 3, "\u110B\u1173\u11B7"},
294	{"다", 0, "\u1103\u1161"},
295	{"\u1103\u1161\u110B\u1173\u11B7", 6, "\u110B\u1173\u11B7"},
296	{"\u110B\u1173\u11B7\u1103\u1161", 9, "\u1103\u1161"},
297	{"다음음", 6, "\u110B\u1173\u11B7"},
298	{"음다다", 6, "\u1103\u1161"},
299	// maximized buffer
300	{"a" + grave(30), 0, "a" + grave(30)},
301	// Buffer overflow
302	{"a" + grave(31), 3, grave(30)},
303	// weird UTF-8
304	{"a\u0300\u11B7", 0, "a\u0300\u11B7"},
305}
306
307func decomposeToLast(rb *reorderBuffer, s string) (int, []byte) {
308	rb.setFlusher([]byte(s), appendFlush)
309	decomposeToLastBoundary(rb)
310	buf := rb.flush(nil)
311	return len(rb.out), buf
312}
313
314func TestDecomposeToLastBoundary(t *testing.T) {
315	runPosTests(t, "TestDecomposeToLastBoundary", NFKC, decomposeToLast, decomposeToLastTests)
316}
317
318var lastBoundaryTests = []PositionTest{
319	// ends with inert character
320	{"Hello!", 6, ""},
321	{"\u0632", 2, ""},
322	// ends with non-inert starter
323	{"a", 0, ""},
324	// illegal runes
325	{"\xff", 1, ""},
326	{"aa\xff", 3, ""},
327	{"a\xff\u0300", 1, ""}, // TODO: should probably be 2.
328	{"\xc0\x80\x80", 3, ""},
329	{"\xc0\x80\x80\u0300", 3, ""},
330	// ends with incomplete UTF-8 encoding
331	{"\xCC", -1, ""},
332	{"\xE0\x80", -1, ""},
333	{"\xF0\x80\x80", -1, ""},
334	{"a\xCC", 0, ""},
335	{"\x80\xCC", 1, ""},
336	{"\xCC\xCC", 1, ""},
337	// ends with combining characters
338	{"a\u0300\u0301", 0, ""},
339	{"aaaa\u0300\u0301", 3, ""},
340	{"\u0300a\u0300\u0301", 2, ""},
341	{"\u00C2", 0, ""},
342	{"a\u00C2", 1, ""},
343	// decomposition may recombine
344	{"\u0226", 0, ""},
345	// no boundary
346	{"", -1, ""},
347	{"\u0300\u0301", -1, ""},
348	{"\u0300", -1, ""},
349	{"\x80\x80", -1, ""},
350	{"\x80\x80\u0301", -1, ""},
351	// Hangul
352	{"다음", 3, ""},
353	{"다", 0, ""},
354	{"\u1103\u1161\u110B\u1173\u11B7", 6, ""},
355	{"\u110B\u1173\u11B7\u1103\u1161", 9, ""},
356	// too many combining characters.
357	{grave(maxNonStarters - 1), -1, ""},
358	// May still be preceded by a non-starter.
359	{grave(maxNonStarters), -1, ""},
360	// May still need to insert a cgj after the last combiner.
361	{grave(maxNonStarters + 1), 2, ""},
362	{grave(maxNonStarters + 2), 4, ""},
363
364	{"a" + grave(maxNonStarters-1), 0, ""},
365	{"a" + grave(maxNonStarters), 0, ""},
366	// May still need to insert a cgj after the last combiner.
367	{"a" + grave(maxNonStarters+1), 3, ""},
368	{"a" + grave(maxNonStarters+2), 5, ""},
369}
370
371func lastBoundaryF(rb *reorderBuffer, s string) (int, []byte) {
372	return rb.f.form.LastBoundary([]byte(s)), nil
373}
374
375func TestLastBoundary(t *testing.T) {
376	runPosTests(t, "TestLastBoundary", NFC, lastBoundaryF, lastBoundaryTests)
377}
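
// A hedged sketch of LastBoundary for the two common shapes above: text ending
// in an inert rune reports the end of the input, while text ending in
// combining characters reports the start of that final segment.
func TestLastBoundarySketch(t *testing.T) {
	if n := NFC.LastBoundary([]byte("Hello!")); n != 6 {
		t.Errorf("LastBoundary = %d; want 6", n)
	}
	if n := NFC.LastBoundary([]byte("a\u0300\u0301")); n != 0 {
		t.Errorf("LastBoundary = %d; want 0", n)
	}
}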
378
379type spanTest struct {
380	input string
381	atEOF bool
382	n     int
383	err   error
384}
385
386var quickSpanTests = []spanTest{
387	{"", true, 0, nil},
388	// starters
389	{"a", true, 1, nil},
390	{"abc", true, 3, nil},
391	{"\u043Eb", true, 3, nil},
392	// incomplete last rune.
393	{"\xCC", true, 1, nil},
394	{"\xCC", false, 0, transform.ErrShortSrc},
395	{"a\xCC", true, 2, nil},
396	{"a\xCC", false, 0, transform.ErrShortSrc}, // TODO: could be 1 for NFD
397	// incorrectly ordered combining characters
398	{"\u0300\u0316", true, 0, transform.ErrEndOfSpan},
399	{"\u0300\u0316", false, 0, transform.ErrEndOfSpan},
400	{"\u0300\u0316cd", true, 0, transform.ErrEndOfSpan},
401	{"\u0300\u0316cd", false, 0, transform.ErrEndOfSpan},
402	// there is a maximum number of combining characters per segment.
403	{rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
404	{"a" + rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
405	{"Ɵ" + rep(0x035D, 30) + "\u035B", true, 0, transform.ErrEndOfSpan},
406	{"aa" + rep(0x035D, 30) + "\u035B", true, 1, transform.ErrEndOfSpan},
407	{rep(0x035D, 30) + cgj + "\u035B", true, 64, nil},
408	{"a" + rep(0x035D, 30) + cgj + "\u035B", true, 65, nil},
409	{"Ɵ" + rep(0x035D, 30) + cgj + "\u035B", true, 66, nil},
410	{"aa" + rep(0x035D, 30) + cgj + "\u035B", true, 66, nil},
411
412	{"a" + rep(0x035D, 30) + cgj + "\u035B", false, 61, transform.ErrShortSrc},
413	{"Ɵ" + rep(0x035D, 30) + cgj + "\u035B", false, 62, transform.ErrShortSrc},
414	{"aa" + rep(0x035D, 30) + cgj + "\u035B", false, 62, transform.ErrShortSrc},
415}
416
417var quickSpanNFDTests = []spanTest{
418	// needs decomposing
419	{"\u00C0", true, 0, transform.ErrEndOfSpan},
420	{"abc\u00C0", true, 3, transform.ErrEndOfSpan},
421	// correctly ordered combining characters
422	{"\u0300", true, 2, nil},
423	{"ab\u0300", true, 4, nil},
424	{"ab\u0300cd", true, 6, nil},
425	{"\u0300cd", true, 4, nil},
426	{"\u0316\u0300", true, 4, nil},
427	{"ab\u0316\u0300", true, 6, nil},
428	{"ab\u0316\u0300cd", true, 8, nil},
429	{"ab\u0316\u0300\u00C0", true, 6, transform.ErrEndOfSpan},
430	{"\u0316\u0300cd", true, 6, nil},
431	{"\u043E\u0308b", true, 5, nil},
432	// incorrectly ordered combining characters
433	{"ab\u0300\u0316", true, 1, transform.ErrEndOfSpan}, // TODO: we could skip 'b' as well.
434	{"ab\u0300\u0316cd", true, 1, transform.ErrEndOfSpan},
435	// Hangul
436	{"같은", true, 0, transform.ErrEndOfSpan},
437}
438
439var quickSpanNFCTests = []spanTest{
440	// okay composed
441	{"\u00C0", true, 2, nil},
442	{"abc\u00C0", true, 5, nil},
443	// correctly ordered combining characters
444	// TODO: b may combine with modifiers, which is why this fails. We could
445	// make a more precise test that actually checks whether the last
446	// character combines. Probably not worth it.
447	{"ab\u0300", true, 1, transform.ErrEndOfSpan},
448	{"ab\u0300cd", true, 1, transform.ErrEndOfSpan},
449	{"ab\u0316\u0300", true, 1, transform.ErrEndOfSpan},
450	{"ab\u0316\u0300cd", true, 1, transform.ErrEndOfSpan},
451	{"\u00C0\u035D", true, 4, nil},
452	// we do not special case leading combining characters
453	{"\u0300cd", true, 0, transform.ErrEndOfSpan},
454	{"\u0300", true, 0, transform.ErrEndOfSpan},
455	{"\u0316\u0300", true, 0, transform.ErrEndOfSpan},
456	{"\u0316\u0300cd", true, 0, transform.ErrEndOfSpan},
457	// incorrectly ordered combining characters
458	{"ab\u0300\u0316", true, 1, transform.ErrEndOfSpan},
459	{"ab\u0300\u0316cd", true, 1, transform.ErrEndOfSpan},
460	// Hangul
461	{"같은", true, 6, nil},
462	{"같은", false, 3, transform.ErrShortSrc},
463	// We return the start of the violating segment in case of overflow.
464	{grave(30) + "\uff9e", true, 0, transform.ErrEndOfSpan},
465	{grave(30), true, 0, transform.ErrEndOfSpan},
466}
467
468func runSpanTests(t *testing.T, name string, f Form, testCases []spanTest) {
469	for i, tc := range testCases {
470		s := fmt.Sprintf("Bytes/%s/%d=%+q/atEOF=%v", name, i, pc(tc.input), tc.atEOF)
471		ok := testtext.Run(t, s, func(t *testing.T) {
472			n, err := f.Span([]byte(tc.input), tc.atEOF)
473			if n != tc.n || err != tc.err {
474				t.Errorf("\n got %d, %v;\nwant %d, %v", n, err, tc.n, tc.err)
475			}
476		})
477		if !ok {
478			continue // Don't do the String variant if the Bytes variant failed.
479		}
480		s = fmt.Sprintf("String/%s/%d=%+q/atEOF=%v", name, i, pc(tc.input), tc.atEOF)
481		testtext.Run(t, s, func(t *testing.T) {
482			n, err := f.SpanString(tc.input, tc.atEOF)
483			if n != tc.n || err != tc.err {
484				t.Errorf("\n got %d, %v;\nwant %d, %v", n, err, tc.n, tc.err)
485			}
486		})
487	}
488}
489
490func TestSpan(t *testing.T) {
491	runSpanTests(t, "NFD", NFD, quickSpanTests)
492	runSpanTests(t, "NFD", NFD, quickSpanNFDTests)
493	runSpanTests(t, "NFC", NFC, quickSpanTests)
494	runSpanTests(t, "NFC", NFC, quickSpanNFCTests)
495}
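
// A hedged sketch of the Span contract tested above: a prefix already in the
// requested form is spanned in full, while a violation reports the length of
// the normalized prefix together with transform.ErrEndOfSpan.
func TestSpanSketch(t *testing.T) {
	if n, err := NFC.SpanString("abc\u00C0", true); n != 5 || err != nil {
		t.Errorf("NFC.SpanString = %d, %v; want 5, nil", n, err)
	}
	if n, err := NFD.SpanString("abc\u00C0", true); n != 3 || err != transform.ErrEndOfSpan {
		t.Errorf("NFD.SpanString = %d, %v; want 3, %v", n, err, transform.ErrEndOfSpan)
	}
}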
496
497var isNormalTests = []PositionTest{
498	{"", 1, ""},
499	// illegal runes
500	{"\xff", 1, ""},
501	// starters
502	{"a", 1, ""},
503	{"abc", 1, ""},
504	{"\u043Eb", 1, ""},
505	// incorrectly ordered combining characters
506	{"\u0300\u0316", 0, ""},
507	{"ab\u0300\u0316", 0, ""},
508	{"ab\u0300\u0316cd", 0, ""},
509	{"\u0300\u0316cd", 0, ""},
510}
511var isNormalNFDTests = []PositionTest{
512	// needs decomposing
513	{"\u00C0", 0, ""},
514	{"abc\u00C0", 0, ""},
515	// correctly ordered combining characters
516	{"\u0300", 1, ""},
517	{"ab\u0300", 1, ""},
518	{"ab\u0300cd", 1, ""},
519	{"\u0300cd", 1, ""},
520	{"\u0316\u0300", 1, ""},
521	{"ab\u0316\u0300", 1, ""},
522	{"ab\u0316\u0300cd", 1, ""},
523	{"\u0316\u0300cd", 1, ""},
524	{"\u043E\u0308b", 1, ""},
525	// Hangul
526	{"같은", 0, ""},
527}
528var isNormalNFCTests = []PositionTest{
529	// okay composed
530	{"\u00C0", 1, ""},
531	{"abc\u00C0", 1, ""},
532	// need reordering
533	{"a\u0300", 0, ""},
534	{"a\u0300cd", 0, ""},
535	{"a\u0316\u0300", 0, ""},
536	{"a\u0316\u0300cd", 0, ""},
537	// correctly ordered combining characters
538	{"ab\u0300", 1, ""},
539	{"ab\u0300cd", 1, ""},
540	{"ab\u0316\u0300", 1, ""},
541	{"ab\u0316\u0300cd", 1, ""},
542	{"\u00C0\u035D", 1, ""},
543	{"\u0300", 1, ""},
544	{"\u0316\u0300cd", 1, ""},
545	// Hangul
546	{"같은", 1, ""},
547}
548
549var isNormalNFKXTests = []PositionTest{
550	// Special case.
551	{"\u00BC", 0, ""},
552}
553
554func isNormalF(rb *reorderBuffer, s string) (int, []byte) {
555	if rb.f.form.IsNormal([]byte(s)) {
556		return 1, nil
557	}
558	return 0, nil
559}
560
561func isNormalStringF(rb *reorderBuffer, s string) (int, []byte) {
562	if rb.f.form.IsNormalString(s) {
563		return 1, nil
564	}
565	return 0, nil
566}
567
568func TestIsNormal(t *testing.T) {
569	runPosTests(t, "TestIsNormalNFD1", NFD, isNormalF, isNormalTests)
570	runPosTests(t, "TestIsNormalNFD2", NFD, isNormalF, isNormalNFDTests)
571	runPosTests(t, "TestIsNormalNFC1", NFC, isNormalF, isNormalTests)
572	runPosTests(t, "TestIsNormalNFC2", NFC, isNormalF, isNormalNFCTests)
573	runPosTests(t, "TestIsNormalNFKD1", NFKD, isNormalF, isNormalTests)
574	runPosTests(t, "TestIsNormalNFKD2", NFKD, isNormalF, isNormalNFDTests)
575	runPosTests(t, "TestIsNormalNFKD3", NFKD, isNormalF, isNormalNFKXTests)
576	runPosTests(t, "TestIsNormalNFKC1", NFKC, isNormalF, isNormalTests)
577	runPosTests(t, "TestIsNormalNFKC2", NFKC, isNormalF, isNormalNFCTests)
578	runPosTests(t, "TestIsNormalNFKC3", NFKC, isNormalF, isNormalNFKXTests)
579}
580
581func TestIsNormalString(t *testing.T) {
582	runPosTests(t, "TestIsNormalNFD1", NFD, isNormalStringF, isNormalTests)
583	runPosTests(t, "TestIsNormalNFD2", NFD, isNormalStringF, isNormalNFDTests)
584	runPosTests(t, "TestIsNormalNFC1", NFC, isNormalStringF, isNormalTests)
585	runPosTests(t, "TestIsNormalNFC2", NFC, isNormalStringF, isNormalNFCTests)
586}
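
// A hedged sketch relating the tables above to the exported predicates: the
// precomposed \u00C0 is NFC-normal but not NFD-normal, and its decomposition
// "A\u0300" is NFD-normal.
func TestIsNormalSketch(t *testing.T) {
	if !NFC.IsNormalString("\u00C0") {
		t.Error(`NFC.IsNormalString("\u00C0") = false; want true`)
	}
	if NFD.IsNormal([]byte("\u00C0")) {
		t.Error(`NFD.IsNormal("\u00C0") = true; want false`)
	}
	if !NFD.IsNormalString("A\u0300") {
		t.Error(`NFD.IsNormalString("A\u0300") = false; want true`)
	}
}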
587
588type AppendTest struct {
589	left  string
590	right string
591	out   string
592}
593
594type appendFunc func(f Form, out []byte, s string) []byte
595
596var fstr = []string{"NFC", "NFD", "NFKC", "NFKD"}
597
598func runNormTests(t *testing.T, name string, fn appendFunc) {
599	for f := NFC; f <= NFKD; f++ {
600		runAppendTests(t, name, f, fn, normTests[f])
601	}
602}
603
604func runAppendTests(t *testing.T, name string, f Form, fn appendFunc, tests []AppendTest) {
605	for i, test := range tests {
606		t.Run(fmt.Sprintf("%s/%d", fstr[f], i), func(t *testing.T) {
607			id := pc(test.left + test.right)
608			if *testn >= 0 && i != *testn {
609				return
610			}
611			t.Run("fn", func(t *testing.T) {
612				out := []byte(test.left)
613				have := string(fn(f, out, test.right))
614				if len(have) != len(test.out) {
615					t.Errorf("%+q: length is %d; want %d (%+q vs %+q)", id, len(have), len(test.out), pc(have), pc(test.out))
616				}
617				if have != test.out {
618					k, pf := pidx(have, test.out)
619					t.Errorf("%+q:\nwas  %s%+q; \nwant %s%+q", id, pf, pc(have[k:]), pf, pc(test.out[k:]))
620				}
621			})
622
623			// Bootstrap by normalizing input. Ensures that the various variants
624			// behave the same.
625			for g := NFC; g <= NFKD; g++ {
626				if f == g {
627					continue
628				}
629				t.Run(fstr[g], func(t *testing.T) {
630					want := g.String(test.left + test.right)
631					have := string(fn(g, g.AppendString(nil, test.left), test.right))
632					if len(have) != len(want) {
633						t.Errorf("%+q: length is %d; want %d (%+q vs %+q)", id, len(have), len(want), pc(have), pc(want))
634					}
635					if have != want {
636						k, pf := pidx(have, want)
637						t.Errorf("%+q:\nwas  %s%+q; \nwant %s%+q", id, pf, pc(have[k:]), pf, pc(want[k:]))
638					}
639				})
640			}
641		})
642	}
643}
644
645var normTests = [][]AppendTest{
646	appendTestsNFC,
647	appendTestsNFD,
648	appendTestsNFKC,
649	appendTestsNFKD,
650}
651
652var appendTestsNFC = []AppendTest{
653	{"", ascii, ascii},
654	{"", txt_all, txt_all},
655	{"\uff9e", grave(30), "\uff9e" + grave(29) + cgj + grave(1)},
656	{grave(30), "\uff9e", grave(30) + cgj + "\uff9e"},
657
658	// Tests designed for Iter.
659	{ // ordering of non-composing combining characters
660		"",
661		"\u0305\u0316",
662		"\u0316\u0305",
663	},
664	{ // segment overflow
665		"",
666		"a" + rep(0x0305, maxNonStarters+4) + "\u0316",
667		"a" + rep(0x0305, maxNonStarters) + cgj + "\u0316" + rep(0x305, 4),
668	},
669
670	{ // Combine across non-blocking non-starters.
671		// U+0327 COMBINING CEDILLA;Mn;202;NSM;;;;;N;NON-SPACING CEDILLA;;;;
672		// U+0325 COMBINING RING BELOW;Mn;220;NSM;;;;;N;NON-SPACING RING BELOW;;;;
673		"", "a\u0327\u0325", "\u1e01\u0327",
674	},
675
676	{ // Jamo V+T does not combine.
677		"",
678		"\u1161\u11a8",
679		"\u1161\u11a8",
680	},
681
682	// Stability tests: see https://www.unicode.org/review/pr-29.html.
683	{"", "\u0b47\u0300\u0b3e", "\u0b47\u0300\u0b3e"},
684	{"", "\u1100\u0300\u1161", "\u1100\u0300\u1161"},
685	{"", "\u0b47\u0b3e", "\u0b4b"},
686	{"", "\u1100\u1161", "\uac00"},
687
688	// U+0D4A MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;
689	{ // 0d4a starts a new segment.
690		"",
691		"\u0d4a" + strings.Repeat("\u0d3e", 15) + "\u0d4a" + strings.Repeat("\u0d3e", 15),
692		"\u0d4a" + strings.Repeat("\u0d3e", 15) + "\u0d4a" + strings.Repeat("\u0d3e", 15),
693	},
694
695	{ // Split combining characters.
696		// TODO: don't insert CGJ before starters.
697		"",
698		"\u0d46" + strings.Repeat("\u0d3e", 31),
699		"\u0d4a" + strings.Repeat("\u0d3e", 29) + cgj + "\u0d3e",
700	},
701
702	{ // Split combining characters.
703		"",
704		"\u0d4a" + strings.Repeat("\u0d3e", 30),
705		"\u0d4a" + strings.Repeat("\u0d3e", 29) + cgj + "\u0d3e",
706	},
707
708	{ //  https://golang.org/issues/20079
709		"",
710		"\xeb\u0344",
711		"\xeb\u0308\u0301",
712	},
713
714	{ //  https://golang.org/issues/20079
715		"",
716		"\uac00" + strings.Repeat("\u0300", 30),
717		"\uac00" + strings.Repeat("\u0300", 29) + "\u034f\u0300",
718	},
719
720	{ //  https://golang.org/issues/20079
721		"",
722		"\xeb" + strings.Repeat("\u0300", 31),
723		"\xeb" + strings.Repeat("\u0300", 30) + "\u034f\u0300",
724	},
725}
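
// A hedged restatement of the \uff9e entry above through the public API. U+FF9E
// appears to count as carrying a trailing non-starter for segmentation (cf. the
// U+FF9E comment in decomposeSegmentTests), so only 29 further combining marks
// fit before a CGJ (GraphemeJoiner) is inserted.
func TestAppendCGJSketch(t *testing.T) {
	got := string(NFC.AppendString([]byte("\uff9e"), grave(30)))
	want := "\uff9e" + grave(29) + cgj + grave(1)
	if got != want {
		t.Errorf("NFC.AppendString = %+q; want %+q", pc(got), pc(want))
	}
}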
726
727var appendTestsNFD = []AppendTest{
728	// TODO: Move some of the tests here.
729}
730
731var appendTestsNFKC = []AppendTest{
732	// empty buffers
733	{"", "", ""},
734	{"a", "", "a"},
735	{"", "a", "a"},
736	{"", "\u0041\u0307\u0304", "\u01E0"},
737	// segment split across buffers
738	{"", "a\u0300b", "\u00E0b"},
739	{"a", "\u0300b", "\u00E0b"},
740	{"a", "\u0300\u0316", "\u00E0\u0316"},
741	{"a", "\u0316\u0300", "\u00E0\u0316"},
742	{"a", "\u0300a\u0300", "\u00E0\u00E0"},
743	{"a", "\u0300a\u0300a\u0300", "\u00E0\u00E0\u00E0"},
744	{"a", "\u0300aaa\u0300aaa\u0300", "\u00E0aa\u00E0aa\u00E0"},
745	{"a\u0300", "\u0327", "\u00E0\u0327"},
746	{"a\u0327", "\u0300", "\u00E0\u0327"},
747	{"a\u0316", "\u0300", "\u00E0\u0316"},
748	{"\u0041\u0307", "\u0304", "\u01E0"},
749	// Hangul
750	{"", "\u110B\u1173", "\uC73C"},
751	{"", "\u1103\u1161", "\uB2E4"},
752	{"", "\u110B\u1173\u11B7", "\uC74C"},
753	{"", "\u320E", "\x28\uAC00\x29"},
754	{"", "\x28\u1100\u1161\x29", "\x28\uAC00\x29"},
755	{"\u1103", "\u1161", "\uB2E4"},
756	{"\u110B", "\u1173\u11B7", "\uC74C"},
757	{"\u110B\u1173", "\u11B7", "\uC74C"},
758	{"\uC73C", "\u11B7", "\uC74C"},
759	// UTF-8 encoding split across buffers
760	{"a\xCC", "\x80", "\u00E0"},
761	{"a\xCC", "\x80b", "\u00E0b"},
762	{"a\xCC", "\x80a\u0300", "\u00E0\u00E0"},
763	{"a\xCC", "\x80\x80", "\u00E0\x80"},
764	{"a\xCC", "\x80\xCC", "\u00E0\xCC"},
765	{"a\u0316\xCC", "\x80a\u0316\u0300", "\u00E0\u0316\u00E0\u0316"},
766	// ending in incomplete UTF-8 encoding
767	{"", "\xCC", "\xCC"},
768	{"a", "\xCC", "a\xCC"},
769	{"a", "b\xCC", "ab\xCC"},
770	{"\u0226", "\xCC", "\u0226\xCC"},
771	// illegal runes
772	{"", "\x80", "\x80"},
773	{"", "\x80\x80\x80", "\x80\x80\x80"},
774	{"", "\xCC\x80\x80\x80", "\xCC\x80\x80\x80"},
775	{"", "a\x80", "a\x80"},
776	{"", "a\x80\x80\x80", "a\x80\x80\x80"},
777	{"", "a\x80\x80\x80\x80\x80\x80", "a\x80\x80\x80\x80\x80\x80"},
778	{"a", "\x80\x80\x80", "a\x80\x80\x80"},
779	// overflow
780	{"", strings.Repeat("\x80", 33), strings.Repeat("\x80", 33)},
781	{strings.Repeat("\x80", 33), "", strings.Repeat("\x80", 33)},
782	{strings.Repeat("\x80", 33), strings.Repeat("\x80", 33), strings.Repeat("\x80", 66)},
783	// overflow of combining characters
784	{"", grave(34), grave(30) + cgj + grave(4)},
785	{"", grave(36), grave(30) + cgj + grave(6)},
786	{grave(29), grave(5), grave(30) + cgj + grave(4)},
787	{grave(30), grave(4), grave(30) + cgj + grave(4)},
788	{grave(30), grave(3), grave(30) + cgj + grave(3)},
789	{grave(30) + "\xCC", "\x80", grave(30) + cgj + grave(1)},
790	{"", "\uFDFA" + grave(14), "\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 \u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645" + grave(14)},
791	{"", "\uFDFA" + grave(28) + "\u0316", "\u0635\u0644\u0649 \u0627\u0644\u0644\u0647 \u0639\u0644\u064a\u0647 \u0648\u0633\u0644\u0645\u0316" + grave(28)},
792	// - First rune has a trailing non-starter.
793	{"\u00d5", grave(30), "\u00d5" + grave(29) + cgj + grave(1)},
794	// - U+FF9E decomposes into a non-starter in compatibility mode. A CGJ must be
795	//   inserted even when FF9E starts a new segment.
796	{"\uff9e", grave(30), "\u3099" + grave(29) + cgj + grave(1)},
797	{grave(30), "\uff9e", grave(30) + cgj + "\u3099"},
798	// - Many non-starter decompositions in a row causing overflow.
799	{"", rep(0x340, 31), rep(0x300, 30) + cgj + "\u0300"},
800	{"", rep(0xFF9E, 31), rep(0x3099, 30) + cgj + "\u3099"},
801
802	{"", "\u0644\u0625" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + "\u0300\u0300"},
803	{"", "\ufef9" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + rep(0x0300, 2)},
804	{"", "\ufef9" + rep(0x300, 31), "\u0644\u0625" + rep(0x300, 29) + cgj + rep(0x0300, 2)},
805
806	// U+0F81 TIBETAN VOWEL SIGN REVERSED II splits into two modifiers.
807	{"", "\u0f7f" + rep(0xf71, 29) + "\u0f81", "\u0f7f" + rep(0xf71, 29) + cgj + "\u0f71\u0f80"},
808	{"", "\u0f7f" + rep(0xf71, 28) + "\u0f81", "\u0f7f" + rep(0xf71, 29) + "\u0f80"},
809	{"", "\u0f7f" + rep(0xf81, 16), "\u0f7f" + rep(0xf71, 15) + rep(0xf80, 15) + cgj + "\u0f71\u0f80"},
810
811	// weird UTF-8
812	{"\u00E0\xE1", "\x86", "\u00E0\xE1\x86"},
813	{"a\u0300\u11B7", "\u0300", "\u00E0\u11B7\u0300"},
814	{"a\u0300\u11B7\u0300", "\u0300", "\u00E0\u11B7\u0300\u0300"},
815	{"\u0300", "\xF8\x80\x80\x80\x80\u0300", "\u0300\xF8\x80\x80\x80\x80\u0300"},
816	{"\u0300", "\xFC\x80\x80\x80\x80\x80\u0300", "\u0300\xFC\x80\x80\x80\x80\x80\u0300"},
817	{"\xF8\x80\x80\x80\x80\u0300", "\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},
818	{"\xFC\x80\x80\x80\x80\x80\u0300", "\u0300", "\xFC\x80\x80\x80\x80\x80\u0300\u0300"},
819	{"\xF8\x80\x80\x80", "\x80\u0300\u0300", "\xF8\x80\x80\x80\x80\u0300\u0300"},
820
821	{"", strings.Repeat("a\u0316\u0300", 6), strings.Repeat("\u00E0\u0316", 6)},
822	// large input.
823	{"", strings.Repeat("a\u0300\u0316", 31), strings.Repeat("\u00E0\u0316", 31)},
824	{"", strings.Repeat("a\u0300\u0316", 4000), strings.Repeat("\u00E0\u0316", 4000)},
825	{"", strings.Repeat("\x80\x80", 4000), strings.Repeat("\x80\x80", 4000)},
826	{"", "\u0041\u0307\u0304", "\u01E0"},
827}
828
829var appendTestsNFKD = []AppendTest{
830	{"", "a" + grave(64), "a" + grave(30) + cgj + grave(30) + cgj + grave(4)},
831
832	{ // segment overflow on unchanged character
833		"",
834		"a" + grave(64) + "\u0316",
835		"a" + grave(30) + cgj + grave(30) + cgj + "\u0316" + grave(4),
836	},
837	{ // segment overflow on unchanged character + start value
838		"",
839		"a" + grave(98) + "\u0316",
840		"a" + grave(30) + cgj + grave(30) + cgj + grave(30) + cgj + "\u0316" + grave(8),
841	},
842	{ // segment overflow on decomposition. (U+0340 decomposes to U+0300.)
843		"",
844		"a" + grave(59) + "\u0340",
845		"a" + grave(30) + cgj + grave(30),
846	},
847	{ // segment overflow on non-starter decomposition
848		"",
849		"a" + grave(33) + "\u0340" + grave(30) + "\u0320",
850		"a" + grave(30) + cgj + grave(30) + cgj + "\u0320" + grave(4),
851	},
852	{ // start value after ASCII overflow
853		"",
854		rep('a', segSize) + grave(32) + "\u0320",
855		rep('a', segSize) + grave(30) + cgj + "\u0320" + grave(2),
856	},
857	{ // Jamo overflow
858		"",
859		"\u1100\u1161" + grave(30) + "\u0320" + grave(2),
860		"\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
861	},
862	{ // Hangul
863		"",
864		"\uac00",
865		"\u1100\u1161",
866	},
867	{ // Hangul overflow
868		"",
869		"\uac00" + grave(32) + "\u0320",
870		"\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
871	},
872	{ // Hangul overflow in Hangul mode.
873		"",
874		"\uac00\uac00" + grave(32) + "\u0320",
875		"\u1100\u1161\u1100\u1161" + grave(29) + cgj + "\u0320" + grave(3),
876	},
877	{ // Hangul overflow in Hangul mode.
878		"",
879		strings.Repeat("\uac00", 3) + grave(32) + "\u0320",
880		strings.Repeat("\u1100\u1161", 3) + grave(29) + cgj + "\u0320" + grave(3),
881	},
882	{ // start value after cc=0
883		"",
884		"您您" + grave(34) + "\u0320",
885		"您您" + grave(30) + cgj + "\u0320" + grave(4),
886	},
887	{ // start value after normalization
888		"",
889		"\u0300\u0320a" + grave(34) + "\u0320",
890		"\u0320\u0300a" + grave(30) + cgj + "\u0320" + grave(4),
891	},
892	{
893		// U+0F81 TIBETAN VOWEL SIGN REVERSED II splits into two modifiers.
894		"",
895		"a\u0f7f" + rep(0xf71, 29) + "\u0f81",
896		"a\u0f7f" + rep(0xf71, 29) + cgj + "\u0f71\u0f80",
897	},
898}
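
// A hedged sketch of the Hangul cases above via the public API: the decomposing
// forms split a precomposed syllable into its Jamo, and NFC recombines them.
func TestHangulSketch(t *testing.T) {
	if got, want := NFKD.String("\uac00"), "\u1100\u1161"; got != want {
		t.Errorf("NFKD.String(%+q) = %+q; want %+q", "\uac00", got, want)
	}
	if got, want := NFC.String("\u1100\u1161"), "\uac00"; got != want {
		t.Errorf("NFC.String = %+q; want %+q", got, want)
	}
}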
899
900func TestAppend(t *testing.T) {
901	runNormTests(t, "Append", func(f Form, out []byte, s string) []byte {
902		return f.Append(out, []byte(s)...)
903	})
904}
905
906func TestAppendString(t *testing.T) {
907	runNormTests(t, "AppendString", func(f Form, out []byte, s string) []byte {
908		return f.AppendString(out, s)
909	})
910}
911
912func TestBytes(t *testing.T) {
913	runNormTests(t, "Bytes", func(f Form, out []byte, s string) []byte {
914		buf := []byte{}
915		buf = append(buf, out...)
916		buf = append(buf, s...)
917		return f.Bytes(buf)
918	})
919}
920
921func TestString(t *testing.T) {
922	runNormTests(t, "String", func(f Form, out []byte, s string) []byte {
923		outs := string(out) + s
924		return []byte(f.String(outs))
925	})
926}
927
928func runNM(code string) (string, error) {
929	// Write the file.
930	tmpdir, err := ioutil.TempDir(os.TempDir(), "normalize_test")
931	if err != nil {
932		return "", fmt.Errorf("failed to create tmpdir: %v", err)
933	}
934	defer os.RemoveAll(tmpdir)
935	goTool := filepath.Join(runtime.GOROOT(), "bin", "go")
936	filename := filepath.Join(tmpdir, "main.go")
937	if err := ioutil.WriteFile(filename, []byte(code), 0644); err != nil {
938		return "", fmt.Errorf("failed to write main.go: %v", err)
939	}
940	outputFile := filepath.Join(tmpdir, "main")
941
942	// Build the binary.
943	out, err := exec.Command(goTool, "build", "-o", outputFile, filename).CombinedOutput()
944	if err != nil {
945		return "", fmt.Errorf("failed to execute command: %v", err)
946	}
947
948	// Get the symbols.
949	out, err = exec.Command(goTool, "tool", "nm", outputFile).CombinedOutput()
950	return string(out), err
951}
952
953func TestLinking(t *testing.T) {
954	const prog = `
955	package main
956	import "fmt"
957	import "golang.org/x/text/unicode/norm"
958	func main() { fmt.Println(norm.%s) }
959	`
960
961	baseline, errB := runNM(fmt.Sprintf(prog, "MaxSegmentSize"))
962	withTables, errT := runNM(fmt.Sprintf(prog, `NFC.String("")`))
963	if errB != nil || errT != nil {
964		t.Skipf("TestLinking failed: %v and %v", errB, errT)
965	}
966
967	symbols := []string{"norm.formTable", "norm.nfkcValues", "norm.decomps"}
968	for _, symbol := range symbols {
969		if strings.Contains(baseline, symbol) {
970			t.Errorf("found: %q unexpectedly", symbol)
971		}
972		if !strings.Contains(withTables, symbol) {
973			t.Errorf("didn't find: %q unexpectedly", symbol)
974		}
975	}
976}
977
978func appendBench(f Form, in []byte) func() {
979	buf := make([]byte, 0, 4*len(in))
980	return func() {
981		f.Append(buf, in...)
982	}
983}
984
985func bytesBench(f Form, in []byte) func() {
986	return func() {
987		f.Bytes(in)
988	}
989}
990
991func iterBench(f Form, in []byte) func() {
992	iter := Iter{}
993	return func() {
994		iter.Init(f, in)
995		for !iter.Done() {
996			iter.Next()
997		}
998	}
999}
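
// A hedged sketch of the Iter contract relied on above: concatenating the
// chunks returned by Next over the whole input should reproduce the output of
// Bytes for the same form.
func TestIterConcatSketch(t *testing.T) {
	in := []byte("a\u0316\u0300cd\u0300\u0316")
	var it Iter
	it.Init(NFC, in)
	var out []byte
	for !it.Done() {
		out = append(out, it.Next()...)
	}
	if want := NFC.Bytes(in); !bytes.Equal(out, want) {
		t.Errorf("Iter output %+q; want %+q", out, want)
	}
}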
1000
1001func transformBench(f Form, in []byte) func() {
1002	buf := make([]byte, 4*len(in))
1003	return func() {
1004		if _, n, err := f.Transform(buf, in, true); err != nil || len(in) != n {
1005			log.Panic(n, len(in), err)
1006		}
1007	}
1008}
1009
1010func readerBench(f Form, in []byte) func() {
1011	buf := make([]byte, 4*len(in))
1012	return func() {
1013		r := f.Reader(bytes.NewReader(in))
1014		var err error
1015		for err == nil {
1016			_, err = r.Read(buf)
1017		}
1018		if err != io.EOF {
1019			panic("")
1020		}
1021	}
1022}
1023
1024func writerBench(f Form, in []byte) func() {
1025	buf := make([]byte, 0, 4*len(in))
1026	return func() {
1027		r := f.Writer(bytes.NewBuffer(buf))
1028		if _, err := r.Write(in); err != nil {
1029			panic("")
1030		}
1031	}
1032}
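
// A hedged usage sketch for the Reader wrapper benchmarked above: reading
// through NFC.Reader should yield the same bytes as normalizing the input
// directly with NFC.String.
func TestReaderSketch(t *testing.T) {
	const s = "No\u0308rmalization"
	b, err := ioutil.ReadAll(NFC.Reader(strings.NewReader(s)))
	if err != nil {
		t.Fatalf("ReadAll: %v", err)
	}
	if got, want := string(b), NFC.String(s); got != want {
		t.Errorf("Reader = %+q; want %+q", got, want)
	}
}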
1033
1034func appendBenchmarks(bm []func(), f Form, in []byte) []func() {
1035	bm = append(bm, appendBench(f, in))
1036	bm = append(bm, iterBench(f, in))
1037	bm = append(bm, transformBench(f, in))
1038	bm = append(bm, readerBench(f, in))
1039	bm = append(bm, writerBench(f, in))
1040	return bm
1041}
1042
1043func doFormBenchmark(b *testing.B, inf, f Form, s string) {
1044	b.StopTimer()
1045	in := inf.Bytes([]byte(s))
1046	bm := appendBenchmarks(nil, f, in)
1047	b.SetBytes(int64(len(in) * len(bm)))
1048	b.StartTimer()
1049	for i := 0; i < b.N; i++ {
1050		for _, fn := range bm {
1051			fn()
1052		}
1053	}
1054}
1055
1056func doSingle(b *testing.B, f func(Form, []byte) func(), s []byte) {
1057	b.StopTimer()
1058	fn := f(NFC, s)
1059	b.SetBytes(int64(len(s)))
1060	b.StartTimer()
1061	for i := 0; i < b.N; i++ {
1062		fn()
1063	}
1064}
1065
1066var (
1067	smallNoChange = []byte("nörmalization")
1068	smallChange   = []byte("No\u0308rmalization")
1069	ascii         = strings.Repeat("There is nothing to change here! ", 500)
1070)
1071
1072func lowerBench(f Form, in []byte) func() {
1073	// Use package strings instead of bytes as it doesn't allocate memory
1074	// if there aren't any changes.
1075	s := string(in)
1076	return func() {
1077		strings.ToLower(s)
1078	}
1079}
1080
1081func BenchmarkLowerCaseNoChange(b *testing.B) {
1082	doSingle(b, lowerBench, smallNoChange)
1083}
1084func BenchmarkLowerCaseChange(b *testing.B) {
1085	doSingle(b, lowerBench, smallChange)
1086}
1087
1088func quickSpanBench(f Form, in []byte) func() {
1089	return func() {
1090		f.QuickSpan(in)
1091	}
1092}
1093
1094func BenchmarkQuickSpanChangeNFC(b *testing.B) {
1095	doSingle(b, quickSpanBench, smallNoChange)
1096}
1097
1098func BenchmarkBytesNoChangeNFC(b *testing.B) {
1099	doSingle(b, bytesBench, smallNoChange)
1100}
1101func BenchmarkBytesChangeNFC(b *testing.B) {
1102	doSingle(b, bytesBench, smallChange)
1103}
1104
1105func BenchmarkAppendNoChangeNFC(b *testing.B) {
1106	doSingle(b, appendBench, smallNoChange)
1107}
1108func BenchmarkAppendChangeNFC(b *testing.B) {
1109	doSingle(b, appendBench, smallChange)
1110}
1111func BenchmarkAppendLargeNFC(b *testing.B) {
1112	doSingle(b, appendBench, txt_all_bytes)
1113}
1114
1115func BenchmarkIterNoChangeNFC(b *testing.B) {
1116	doSingle(b, iterBench, smallNoChange)
1117}
1118func BenchmarkIterChangeNFC(b *testing.B) {
1119	doSingle(b, iterBench, smallChange)
1120}
1121func BenchmarkIterLargeNFC(b *testing.B) {
1122	doSingle(b, iterBench, txt_all_bytes)
1123}
1124
1125func BenchmarkTransformNoChangeNFC(b *testing.B) {
1126	doSingle(b, transformBench, smallNoChange)
1127}
1128func BenchmarkTransformChangeNFC(b *testing.B) {
1129	doSingle(b, transformBench, smallChange)
1130}
1131func BenchmarkTransformLargeNFC(b *testing.B) {
1132	doSingle(b, transformBench, txt_all_bytes)
1133}
1134
1135func BenchmarkNormalizeAsciiNFC(b *testing.B) {
1136	doFormBenchmark(b, NFC, NFC, ascii)
1137}
1138func BenchmarkNormalizeAsciiNFD(b *testing.B) {
1139	doFormBenchmark(b, NFC, NFD, ascii)
1140}
1141func BenchmarkNormalizeAsciiNFKC(b *testing.B) {
1142	doFormBenchmark(b, NFC, NFKC, ascii)
1143}
1144func BenchmarkNormalizeAsciiNFKD(b *testing.B) {
1145	doFormBenchmark(b, NFC, NFKD, ascii)
1146}
1147
1148func BenchmarkNormalizeNFC2NFC(b *testing.B) {
1149	doFormBenchmark(b, NFC, NFC, txt_all)
1150}
1151func BenchmarkNormalizeNFC2NFD(b *testing.B) {
1152	doFormBenchmark(b, NFC, NFD, txt_all)
1153}
1154func BenchmarkNormalizeNFD2NFC(b *testing.B) {
1155	doFormBenchmark(b, NFD, NFC, txt_all)
1156}
1157func BenchmarkNormalizeNFD2NFD(b *testing.B) {
1158	doFormBenchmark(b, NFD, NFD, txt_all)
1159}
1160
1161// Hangul is often special-cased, so we test it separately.
1162func BenchmarkNormalizeHangulNFC2NFC(b *testing.B) {
1163	doFormBenchmark(b, NFC, NFC, txt_kr)
1164}
1165func BenchmarkNormalizeHangulNFC2NFD(b *testing.B) {
1166	doFormBenchmark(b, NFC, NFD, txt_kr)
1167}
1168func BenchmarkNormalizeHangulNFD2NFC(b *testing.B) {
1169	doFormBenchmark(b, NFD, NFC, txt_kr)
1170}
1171func BenchmarkNormalizeHangulNFD2NFD(b *testing.B) {
1172	doFormBenchmark(b, NFD, NFD, txt_kr)
1173}
1174
1175var forms = []Form{NFC, NFD, NFKC, NFKD}
1176
1177func doTextBenchmark(b *testing.B, s string) {
1178	b.StopTimer()
1179	in := []byte(s)
1180	bm := []func(){}
1181	for _, f := range forms {
1182		bm = appendBenchmarks(bm, f, in)
1183	}
1184	b.SetBytes(int64(len(s) * len(bm)))
1185	b.StartTimer()
1186	for i := 0; i < b.N; i++ {
1187		for _, f := range bm {
1188			f()
1189		}
1190	}
1191}
1192
1193func BenchmarkCanonicalOrdering(b *testing.B) {
1194	doTextBenchmark(b, txt_canon)
1195}
1196func BenchmarkExtendedLatin(b *testing.B) {
1197	doTextBenchmark(b, txt_vn)
1198}
1199func BenchmarkMiscTwoByteUtf8(b *testing.B) {
1200	doTextBenchmark(b, twoByteUtf8)
1201}
1202func BenchmarkMiscThreeByteUtf8(b *testing.B) {
1203	doTextBenchmark(b, threeByteUtf8)
1204}
1205func BenchmarkHangul(b *testing.B) {
1206	doTextBenchmark(b, txt_kr)
1207}
1208func BenchmarkJapanese(b *testing.B) {
1209	doTextBenchmark(b, txt_jp)
1210}
1211func BenchmarkChinese(b *testing.B) {
1212	doTextBenchmark(b, txt_cn)
1213}
1214func BenchmarkOverflow(b *testing.B) {
1215	doTextBenchmark(b, overflow)
1216}
1217
1218var overflow = string(bytes.Repeat([]byte("\u035D"), 4096)) + "\u035B"
1219
1220// Tests sampled from the Canonical ordering tests (Part 2) of
1221// https://unicode.org/Public/UNIDATA/NormalizationTest.txt
1222const txt_canon = `\u0061\u0315\u0300\u05AE\u0300\u0062 \u0061\u0300\u0315\u0300\u05AE\u0062
1223\u0061\u0302\u0315\u0300\u05AE\u0062 \u0061\u0307\u0315\u0300\u05AE\u0062
1224\u0061\u0315\u0300\u05AE\u030A\u0062 \u0061\u059A\u0316\u302A\u031C\u0062
1225\u0061\u032E\u059A\u0316\u302A\u0062 \u0061\u0338\u093C\u0334\u0062
1226\u0061\u059A\u0316\u302A\u0339       \u0061\u0341\u0315\u0300\u05AE\u0062
1227\u0061\u0348\u059A\u0316\u302A\u0062 \u0061\u0361\u0345\u035D\u035C\u0062
1228\u0061\u0366\u0315\u0300\u05AE\u0062 \u0061\u0315\u0300\u05AE\u0486\u0062
1229\u0061\u05A4\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0613\u0062
1230\u0061\u0315\u0300\u05AE\u0615\u0062 \u0061\u0617\u0315\u0300\u05AE\u0062
1231\u0061\u0619\u0618\u064D\u064E\u0062 \u0061\u0315\u0300\u05AE\u0654\u0062
1232\u0061\u0315\u0300\u05AE\u06DC\u0062 \u0061\u0733\u0315\u0300\u05AE\u0062
1233\u0061\u0744\u059A\u0316\u302A\u0062 \u0061\u0315\u0300\u05AE\u0745\u0062
1234\u0061\u09CD\u05B0\u094D\u3099\u0062 \u0061\u0E38\u0E48\u0E38\u0C56\u0062
1235\u0061\u0EB8\u0E48\u0E38\u0E49\u0062 \u0061\u0F72\u0F71\u0EC8\u0F71\u0062
1236\u0061\u1039\u05B0\u094D\u3099\u0062 \u0061\u05B0\u094D\u3099\u1A60\u0062
1237\u0061\u3099\u093C\u0334\u1BE6\u0062 \u0061\u3099\u093C\u0334\u1C37\u0062
1238\u0061\u1CD9\u059A\u0316\u302A\u0062 \u0061\u2DED\u0315\u0300\u05AE\u0062
1239\u0061\u2DEF\u0315\u0300\u05AE\u0062 \u0061\u302D\u302E\u059A\u0316\u0062`
1240
1241// Taken from http://creativecommons.org/licenses/by-sa/3.0/vn/
1242const txt_vn = `Với các điều kiện sau: Ghi nhận công của tác giả.
1243Nếu bạn sử dụng, chuyển đổi, hoặc xây dựng dự án từ
1244nội dung được chia sẻ này, bạn phải áp dụng giấy phép này hoặc
1245một giấy phép khác có các điều khoản tương tự như giấy phép này
1246cho dự án của bạn. Hiểu rằng: Miễn — Bất kỳ các điều kiện nào
1247trên đây cũng có thể được miễn bỏ nếu bạn được sự cho phép của
1248người sở hữu bản quyền. Phạm vi công chúng — Khi tác phẩm hoặc
1249bất kỳ chương nào của tác phẩm đã trong vùng dành cho công
1250chúng theo quy định của pháp luật thì tình trạng của nó không
1251bị ảnh hưởng bởi giấy phép trong bất kỳ trường hợp nào.`
1252
1253// Taken from http://creativecommons.org/licenses/by-sa/1.0/deed.ru
1254const txt_ru = `При обязательном соблюдении следующих условий:
1255Attribution — Вы должны атрибутировать произведение (указывать
1256автора и источник) в порядке, предусмотренном автором или
1257лицензиаром (но только так, чтобы никоим образом не подразумевалось,
1258что они поддерживают вас или использование вами данного произведения).
1259Υπό τις ακόλουθες προϋποθέσεις:`
1260
1261// Taken from http://creativecommons.org/licenses/by-sa/3.0/gr/
1262const txt_gr = `Αναφορά Δημιουργού — Θα πρέπει να κάνετε την αναφορά στο έργο με τον
1263τρόπο που έχει οριστεί από το δημιουργό ή το χορηγούντο την άδεια
1264(χωρίς όμως να εννοείται με οποιονδήποτε τρόπο ότι εγκρίνουν εσάς ή
1265τη χρήση του έργου από εσάς). Παρόμοια Διανομή — Εάν αλλοιώσετε,
1266τροποποιήσετε ή δημιουργήσετε περαιτέρω βασισμένοι στο έργο θα
1267μπορείτε να διανέμετε το έργο που θα προκύψει μόνο με την ίδια ή
1268παρόμοια άδεια.`
1269
1270// Taken from http://creativecommons.org/licenses/by-sa/3.0/deed.ar
1271const txt_ar = `بموجب الشروط التالية نسب المصنف — يجب عليك أن
1272تنسب العمل بالطريقة التي تحددها المؤلف أو المرخص (ولكن ليس بأي حال من
1273الأحوال أن توحي وتقترح بتحول أو استخدامك للعمل).
1274المشاركة على قدم المساواة — إذا كنت يعدل ، والتغيير ، أو الاستفادة
1275من هذا العمل ، قد ينتج عن توزيع العمل إلا في ظل تشابه او تطابق فى واحد
1276لهذا الترخيص.`
1277
1278// Taken from http://creativecommons.org/licenses/by-sa/1.0/il/
1279const txt_il = `בכפוף לתנאים הבאים: ייחוס — עליך לייחס את היצירה (לתת קרדיט) באופן
1280המצויין על-ידי היוצר או מעניק הרישיון (אך לא בשום אופן המרמז על כך
1281שהם תומכים בך או בשימוש שלך ביצירה). שיתוף זהה — אם תחליט/י לשנות,
1282לעבד או ליצור יצירה נגזרת בהסתמך על יצירה זו, תוכל/י להפיץ את יצירתך
1283החדשה רק תחת אותו הרישיון או רישיון דומה לרישיון זה.`
1284
1285const twoByteUtf8 = txt_ru + txt_gr + txt_ar + txt_il
1286
1287// Taken from http://creativecommons.org/licenses/by-sa/2.0/kr/
1288const txt_kr = `다음과 같은 조건을 따라야 합니다: 저작자표시
1289(Attribution) — 저작자나 이용허락자가 정한 방법으로 저작물의
1290원저작자를 표시하여야 합니다(그러나 원저작자가 이용자나 이용자의
1291이용을 보증하거나 추천한다는 의미로 표시해서는 안됩니다).
1292동일조건변경허락 — 이 저작물을 이용하여 만든 이차적 저작물에는 본
1293라이선스와 동일한 라이선스를 적용해야 합니다.`
1294
1295// Taken from http://creativecommons.org/licenses/by-sa/3.0/th/
1296const txt_th = `ภายใต้เงื่อนไข ดังต่อไปนี้ : แสดงที่มา — คุณต้องแสดงที่
1297มาของงานดังกล่าว ตามรูปแบบที่ผู้สร้างสรรค์หรือผู้อนุญาตกำหนด (แต่
1298ไม่ใช่ในลักษณะที่ว่า พวกเขาสนับสนุนคุณหรือสนับสนุนการที่
1299คุณนำงานไปใช้) อนุญาตแบบเดียวกัน — หากคุณดัดแปลง เปลี่ยนรูป หรื
1300อต่อเติมงานนี้ คุณต้องใช้สัญญาอนุญาตแบบเดียวกันหรือแบบที่เหมื
1301อนกับสัญญาอนุญาตที่ใช้กับงานนี้เท่านั้น`
1302
1303const threeByteUtf8 = txt_th
1304
1305// Taken from http://creativecommons.org/licenses/by-sa/2.0/jp/
1306const txt_jp = `あなたの従うべき条件は以下の通りです。
1307表示 — あなたは原著作者のクレジットを表示しなければなりません。
1308継承 — もしあなたがこの作品を改変、変形または加工した場合、
1309あなたはその結果生じた作品をこの作品と同一の許諾条件の下でのみ
1310頒布することができます。`
1311
1312// http://creativecommons.org/licenses/by-sa/2.5/cn/
1313const txt_cn = `您可以自由: 复制、发行、展览、表演、放映、
1314广播或通过信息网络传播本作品 创作演绎作品
1315对本作品进行商业性使用 惟须遵守下列条件:
1316署名 — 您必须按照作者或者许可人指定的方式对作品进行署名。
1317相同方式共享 — 如果您改变、转换本作品或者以本作品为基础进行创作,
1318您只能采用与本协议相同的许可协议发布基于本作品的演绎作品。`
1319
1320const txt_cjk = txt_cn + txt_jp + txt_kr
1321const txt_all = txt_vn + twoByteUtf8 + threeByteUtf8 + txt_cjk
1322
1323var txt_all_bytes = []byte(txt_all)
1324