1// Copyright 2014 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package cases
6
7import (
8	"bytes"
9	"fmt"
10	"path"
11	"strings"
12	"testing"
13	"unicode/utf8"
14
15	"golang.org/x/text/internal/testtext"
16	"golang.org/x/text/language"
17	"golang.org/x/text/transform"
18	"golang.org/x/text/unicode/norm"
19)
20
// testCase is one entry of the testCases table used by TestCaseMappings: a
// source text together with the expected title, lower and upper case mappings.
// A text given as a single string is split on spaces and compared word by
// word; a []string is compared element by element.
type testCase struct {
	// lang is a space-separated list of language tags; the entry is run once
	// for each of them.
	lang  string
	src   interface{} // string, []string, or nil to skip test
	title interface{} // string, []string, or nil to skip test
	lower interface{} // string, []string, or nil to skip test
	upper interface{} // string, []string, or nil to skip test
	// opts is handed to the caser constructors for every mapping of the entry.
	opts  options
}
29
30var testCases = []testCase{
31	0: {
32		lang:  "und",
33		src:   "abc aBc ABC abC İsıI ΕΣΆΣ",
34		title: "Abc Abc Abc Abc İsıi Εσάσ",
35		lower: "abc abc abc abc i\u0307sıi εσάσ",
36		upper: "ABC ABC ABC ABC İSII ΕΣΆΣ",
37		opts:  getOpts(HandleFinalSigma(false)),
38	},
39
40	1: {
41		lang:  "und",
42		src:   "abc aBc ABC abC İsıI ΕΣΆΣ Σ _Σ -Σ",
43		title: "Abc Abc Abc Abc İsıi Εσάς Σ _Σ -Σ",
44		lower: "abc abc abc abc i\u0307sıi εσάς σ _σ -σ",
45		upper: "ABC ABC ABC ABC İSII ΕΣΆΣ Σ _Σ -Σ",
46		opts:  getOpts(HandleFinalSigma(true)),
47	},
48
49	2: { // Title cased runes.
50		lang:  supported,
51		src:   "DžA",
52		title: "Dža",
53		lower: "dža",
54		upper: "DŽA",
55	},
56
57	3: {
58		// Title breaking.
59		lang: supported,
60		src: []string{
61			"FOO CASE TEST",
62			"DON'T DO THiS",
63			"χωΡΊΣ χωΡΊΣ^a χωΡΊΣ:a χωΡΊΣ:^a χωΡΊΣ^ όμΩΣ Σ",
64			"with-hyphens",
65			"49ers 49ers",
66			`"capitalize a^a -hyphen 0X _u a_u:a`,
67			"MidNumLet a.b\u2018c\u2019d\u2024e\ufe52f\uff07f\uff0eg",
68			"MidNum a,b;c\u037ed\u0589e\u060cf\u2044g\ufe50h",
69			"\u0345 x\u3031x x\u05d0x \u05d0x a'.a a.a a4,a",
70		},
71		title: []string{
72			"Foo Case Test",
73			"Don't Do This",
74			"Χωρίς Χωρίσ^A Χωρίσ:a Χωρίσ:^A Χωρίς^ Όμως Σ",
75			"With-Hyphens",
76			// Note that 49Ers is correct according to the spec.
77			// TODO: provide some option to the user to treat different
78			// characters as cased.
79			"49Ers 49Ers",
80			`"Capitalize A^A -Hyphen 0X _U A_u:a`,
81			"Midnumlet A.b\u2018c\u2019d\u2024e\ufe52f\uff07f\uff0eg",
82			"Midnum A,B;C\u037eD\u0589E\u060cF\u2044G\ufe50H",
83			"\u0399 X\u3031X X\u05d0x \u05d0X A'.A A.a A4,A",
84		},
85	},
86
	// TODO: These are known deviations from the Unicode Word Breaking
	// Algorithm.
89	// {
90	// 	"und",
91	// 	"x_\u3031_x a4,4a",
92	// 	"X_\u3031_x A4,4a", // Currently is "X_\U3031_X A4,4A".
93	// 	"x_\u3031_x a4,4a",
94	// 	"X_\u3031_X A4,4A",
95	// 	options{},
96	// },
97
98	4: {
99		// Tests title options
100		lang:  "und",
101		src:   "abc aBc ABC abC İsıI o'Brien",
102		title: "Abc ABc ABC AbC İsıI O'Brien",
103		opts:  getOpts(NoLower),
104	},
105
106	5: {
107		lang:  "el",
108		src:   "aBc ΟΔΌΣ Οδός Σο ΣΟ Σ oΣ ΟΣ σ ἕξ \u03ac",
109		title: "Abc Οδός Οδός Σο Σο Σ Oς Ος Σ Ἕξ \u0386",
110		lower: "abc οδός οδός σο σο σ oς ος σ ἕξ \u03ac",
111		upper: "ABC ΟΔΟΣ ΟΔΟΣ ΣΟ ΣΟ Σ OΣ ΟΣ Σ ΕΞ \u0391", // Uppercase removes accents
112	},
113
114	6: {
115		lang:  "tr az",
116		src:   "Isiİ İsıI I\u0307sIiİ İsıI\u0307 I\u0300\u0307",
117		title: "Isii İsıı I\u0307sıii İsıi I\u0300\u0307",
118		lower: "ısii isıı isıii isıi \u0131\u0300\u0307",
119		upper: "ISİİ İSII I\u0307SIİİ İSII\u0307 I\u0300\u0307",
120	},
121
122	7: {
123		lang:  "lt",
124		src:   "I Ï J J̈ Į Į̈ Ì Í Ĩ xi̇̈ xj̇̈ xį̇̈ xi̇̀ xi̇́ xi̇̃ XI XÏ XJ XJ̈ XĮ XĮ̈ XI̟̤",
125		title: "I Ï J J̈ Į Į̈ Ì Í Ĩ Xi̇̈ Xj̇̈ Xį̇̈ Xi̇̀ Xi̇́ Xi̇̃ Xi Xi̇̈ Xj Xj̇̈ Xį Xį̇̈ Xi̟̤",
126		lower: "i i̇̈ j j̇̈ į į̇̈ i̇̀ i̇́ i̇̃ xi̇̈ xj̇̈ xį̇̈ xi̇̀ xi̇́ xi̇̃ xi xi̇̈ xj xj̇̈ xį xį̇̈ xi̟̤",
127		upper: "I Ï J J̈ Į Į̈ Ì Í Ĩ XÏ XJ̈ XĮ̈ XÌ XÍ XĨ XI XÏ XJ XJ̈ XĮ XĮ̈ XI̟̤",
128	},
129
130	8: {
131		lang:  "lt",
132		src:   "\u012e\u0300 \u00cc i\u0307\u0300 i\u0307\u0301 i\u0307\u0303 i\u0307\u0308 i\u0300\u0307",
133		title: "\u012e\u0300 \u00cc \u00cc \u00cd \u0128 \u00cf I\u0300\u0307",
134		lower: "\u012f\u0307\u0300 i\u0307\u0300 i\u0307\u0300 i\u0307\u0301 i\u0307\u0303 i\u0307\u0308 i\u0300\u0307",
135		upper: "\u012e\u0300 \u00cc \u00cc \u00cd \u0128 \u00cf I\u0300\u0307",
136	},
137
138	9: {
139		lang:  "nl",
140		src:   "ijs IJs Ij Ijs İJ İJs aa aA 'ns 'S",
141		title: "IJs IJs IJ IJs İj İjs Aa Aa 'ns 's",
142	},
143
144	// Note: this specification is not currently part of CLDR. The same holds
145	// for the leading apostrophe handling for Dutch.
146	// See https://unicode.org/cldr/trac/ticket/7078.
147	10: {
148		lang:  "af",
149		src:   "wag 'n bietjie",
150		title: "Wag 'n Bietjie",
151		lower: "wag 'n bietjie",
152		upper: "WAG 'N BIETJIE",
153	},
154}
155
156func TestCaseMappings(t *testing.T) {
157	for i, tt := range testCases {
158		src, ok := tt.src.([]string)
159		if !ok {
160			src = strings.Split(tt.src.(string), " ")
161		}
162
163		for _, lang := range strings.Split(tt.lang, " ") {
164			tag := language.MustParse(lang)
165			testEntry := func(name string, mk func(language.Tag, options) transform.SpanningTransformer, gold interface{}) {
166				c := Caser{mk(tag, tt.opts)}
167				if gold != nil {
168					wants, ok := gold.([]string)
169					if !ok {
170						wants = strings.Split(gold.(string), " ")
171					}
172					for j, want := range wants {
173						if got := c.String(src[j]); got != want {
174							t.Errorf("%d:%s:\n%s.String(%+q):\ngot  %+q;\nwant %+q", i, lang, name, src[j], got, want)
175						}
176					}
177				}
178				dst := make([]byte, 256) // big enough to hold any result
179				src := []byte(strings.Join(src, " "))
180				v := testtext.AllocsPerRun(20, func() {
181					c.Transform(dst, src, true)
182				})
183				if v > 1.1 {
184					t.Errorf("%d:%s:\n%s: number of allocs was %f; want 0", i, lang, name, v)
185				}
186			}
187			testEntry("Upper", makeUpper, tt.upper)
188			testEntry("Lower", makeLower, tt.lower)
189			testEntry("Title", makeTitle, tt.title)
190		}
191	}
192}
193
194// TestAlloc tests that some mapping methods should not cause any allocation.
195func TestAlloc(t *testing.T) {
196	dst := make([]byte, 256) // big enough to hold any result
197	src := []byte(txtNonASCII)
198
199	for i, f := range []func() Caser{
200		func() Caser { return Upper(language.Und) },
201		func() Caser { return Lower(language.Und) },
202		func() Caser { return Lower(language.Und, HandleFinalSigma(false)) },
203		// TODO: use a shared copy for these casers as well, in order of
204		// importance, starting with the most important:
205		// func() Caser { return Title(language.Und) },
206		// func() Caser { return Title(language.Und, HandleFinalSigma(false)) },
207	} {
208		testtext.Run(t, "", func(t *testing.T) {
209			var c Caser
210			v := testtext.AllocsPerRun(10, func() {
211				c = f()
212			})
213			if v > 0 {
214				// TODO: Right now only Upper has 1 allocation. Special-case Lower
215				// and Title as well to have less allocations for the root locale.
216				t.Errorf("%d:init: number of allocs was %f; want 0", i, v)
217			}
218			v = testtext.AllocsPerRun(2, func() {
219				c.Transform(dst, src, true)
220			})
221			if v > 0 {
222				t.Errorf("%d:transform: number of allocs was %f; want 0", i, v)
223			}
224		})
225	}
226}
227
228func testHandover(t *testing.T, c Caser, src string) {
229	want := c.String(src)
230	// Find the common prefix.
231	pSrc := 0
232	for ; pSrc < len(src) && pSrc < len(want) && want[pSrc] == src[pSrc]; pSrc++ {
233	}
234
235	// Test handover for each substring of the prefix.
236	for i := 0; i < pSrc; i++ {
237		testtext.Run(t, fmt.Sprint("interleave/", i), func(t *testing.T) {
238			dst := make([]byte, 4*len(src))
239			c.Reset()
240			nSpan, _ := c.Span([]byte(src[:i]), false)
241			copy(dst, src[:nSpan])
242			nTransform, _, _ := c.Transform(dst[nSpan:], []byte(src[nSpan:]), true)
243			got := string(dst[:nSpan+nTransform])
244			if got != want {
245				t.Errorf("full string: got %q; want %q", got, want)
246			}
247		})
248	}
249}
250
251func TestHandover(t *testing.T) {
252	testCases := []struct {
253		desc          string
254		t             Caser
255		first, second string
256	}{{
257		"title/nosigma/single midword",
258		Title(language.Und, HandleFinalSigma(false)),
259		"A.", "a",
260	}, {
261		"title/nosigma/single midword",
262		Title(language.Und, HandleFinalSigma(false)),
263		"A", ".a",
264	}, {
265		"title/nosigma/double midword",
266		Title(language.Und, HandleFinalSigma(false)),
267		"A..", "a",
268	}, {
269		"title/nosigma/double midword",
270		Title(language.Und, HandleFinalSigma(false)),
271		"A.", ".a",
272	}, {
273		"title/nosigma/double midword",
274		Title(language.Und, HandleFinalSigma(false)),
275		"A", "..a",
276	}, {
277		"title/sigma/single midword",
278		Title(language.Und),
279		"ΟΣ.", "a",
280	}, {
281		"title/sigma/single midword",
282		Title(language.Und),
283		"ΟΣ", ".a",
284	}, {
285		"title/sigma/double midword",
286		Title(language.Und),
287		"ΟΣ..", "a",
288	}, {
289		"title/sigma/double midword",
290		Title(language.Und),
291		"ΟΣ.", ".a",
292	}, {
293		"title/sigma/double midword",
294		Title(language.Und),
295		"ΟΣ", "..a",
296	}, {
297		"title/af/leading apostrophe",
298		Title(language.Afrikaans),
299		"'", "n bietje",
300	}}
301	for _, tc := range testCases {
302		testtext.Run(t, tc.desc, func(t *testing.T) {
303			src := tc.first + tc.second
304			want := tc.t.String(src)
305			tc.t.Reset()
306			n, _ := tc.t.Span([]byte(tc.first), false)
307
308			dst := make([]byte, len(want))
309			copy(dst, tc.first[:n])
310
311			nDst, _, _ := tc.t.Transform(dst[n:], []byte(src[n:]), true)
312			got := string(dst[:n+nDst])
313			if got != want {
314				t.Errorf("got %q; want %q", got, want)
315			}
316		})
317	}
318}
319
// minBufSize is the buffer size at which the casing operations in this
// package are guaranteed to make progress.
const minBufSize = norm.MaxSegmentSize
323
// bufferTest is one entry of bufferTests, consumed by
// TestShortBuffersAndOverflow.
//
// desc names the subtest. src is fed to t.Transform in chunks of at most
// srcSize bytes, with a destination buffer of dstSize bytes, and the
// concatenated output must equal want. firstErr is the error expected from
// the first Transform call (nil when none is expected).
type bufferTest struct {
	desc, src, want  string
	firstErr         error
	dstSize, srcSize int
	t                transform.SpanningTransformer
}
330
331var bufferTests []bufferTest
332
333func init() {
334	bufferTests = []bufferTest{{
335		desc:     "und/upper/short dst",
336		src:      "abcdefg",
337		want:     "ABCDEFG",
338		firstErr: transform.ErrShortDst,
339		dstSize:  3,
340		srcSize:  minBufSize,
341		t:        Upper(language.Und),
342	}, {
343		desc:     "und/upper/short src",
344		src:      "123é56",
345		want:     "123É56",
346		firstErr: transform.ErrShortSrc,
347		dstSize:  4,
348		srcSize:  4,
349		t:        Upper(language.Und),
350	}, {
351		desc:     "und/upper/no error on short",
352		src:      "12",
353		want:     "12",
354		firstErr: nil,
355		dstSize:  1,
356		srcSize:  1,
357		t:        Upper(language.Und),
358	}, {
359		desc:     "und/lower/short dst",
360		src:      "ABCDEFG",
361		want:     "abcdefg",
362		firstErr: transform.ErrShortDst,
363		dstSize:  3,
364		srcSize:  minBufSize,
365		t:        Lower(language.Und),
366	}, {
367		desc:     "und/lower/short src",
368		src:      "123É56",
369		want:     "123é56",
370		firstErr: transform.ErrShortSrc,
371		dstSize:  4,
372		srcSize:  4,
373		t:        Lower(language.Und),
374	}, {
375		desc:     "und/lower/no error on short",
376		src:      "12",
377		want:     "12",
378		firstErr: nil,
379		dstSize:  1,
380		srcSize:  1,
381		t:        Lower(language.Und),
382	}, {
383		desc:    "und/lower/simple (no final sigma)",
384		src:     "ΟΣ ΟΣΣ",
385		want:    "οσ οσσ",
386		dstSize: minBufSize,
387		srcSize: minBufSize,
388		t:       Lower(language.Und, HandleFinalSigma(false)),
389	}, {
390		desc:    "und/title/simple (no final sigma)",
391		src:     "ΟΣ ΟΣΣ",
392		want:    "Οσ Οσσ",
393		dstSize: minBufSize,
394		srcSize: minBufSize,
395		t:       Title(language.Und, HandleFinalSigma(false)),
396	}, {
397		desc:    "und/title/final sigma: no error",
398		src:     "ΟΣ",
399		want:    "Ος",
400		dstSize: minBufSize,
401		srcSize: minBufSize,
402		t:       Title(language.Und),
403	}, {
404		desc:     "und/title/final sigma: short source",
405		src:      "ΟΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣ",
406		want:     "Οσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσς",
407		firstErr: transform.ErrShortSrc,
408		dstSize:  minBufSize,
409		srcSize:  10,
410		t:        Title(language.Und),
411	}, {
412		desc:     "und/title/final sigma: short destination 1",
413		src:      "ΟΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣ",
414		want:     "Οσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσς",
415		firstErr: transform.ErrShortDst,
416		dstSize:  10,
417		srcSize:  minBufSize,
418		t:        Title(language.Und),
419	}, {
420		desc:     "und/title/final sigma: short destination 2",
421		src:      "ΟΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣ",
422		want:     "Οσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσς",
423		firstErr: transform.ErrShortDst,
424		dstSize:  9,
425		srcSize:  minBufSize,
426		t:        Title(language.Und),
427	}, {
428		desc:     "und/title/final sigma: short destination 3",
429		src:      "ΟΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣΣ",
430		want:     "Οσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσς",
431		firstErr: transform.ErrShortDst,
432		dstSize:  8,
433		srcSize:  minBufSize,
434		t:        Title(language.Und),
435	}, {
436		desc:     "und/title/clipped UTF-8 rune",
437		src:      "σσσσσσσσσσσ",
438		want:     "Σσσσσσσσσσσ",
439		firstErr: transform.ErrShortSrc,
440		dstSize:  minBufSize,
441		srcSize:  5,
442		t:        Title(language.Und),
443	}, {
444		desc:    "und/title/clipped UTF-8 rune atEOF",
445		src:     "σσσ" + string([]byte{0xCF}),
446		want:    "Σσσ" + string([]byte{0xCF}),
447		dstSize: minBufSize,
448		srcSize: minBufSize,
449		t:       Title(language.Und),
450	}, {
451		// Note: the choice to change the final sigma at the end in case of
452		// too many case ignorables is arbitrary. The main reason for this
453		// choice is that it results in simpler code.
454		desc:    "und/title/final sigma: max ignorables",
455		src:     "ΟΣ" + strings.Repeat(".", maxIgnorable) + "a",
456		want:    "Οσ" + strings.Repeat(".", maxIgnorable) + "A",
457		dstSize: minBufSize,
458		srcSize: minBufSize,
459		t:       Title(language.Und),
460	}, {
		// A word followed by more than maxIgnorable case-ignorable characters,
		// with a source buffer sized to end right after them. No sigma is
		// involved in this entry.
464		desc:    "und/title/long string",
465		src:     "AA" + strings.Repeat(".", maxIgnorable+1) + "a",
466		want:    "Aa" + strings.Repeat(".", maxIgnorable+1) + "A",
467		dstSize: minBufSize,
468		srcSize: len("AA" + strings.Repeat(".", maxIgnorable+1)),
469		t:       Title(language.Und),
470	}, {
471		// Note: the choice to change the final sigma at the end in case of
472		// too many case ignorables is arbitrary. The main reason for this
473		// choice is that it results in simpler code.
474		desc:    "und/title/final sigma: too many ignorables",
475		src:     "ΟΣ" + strings.Repeat(".", maxIgnorable+1) + "a",
476		want:    "Ος" + strings.Repeat(".", maxIgnorable+1) + "A",
477		dstSize: minBufSize,
478		srcSize: len("ΟΣ" + strings.Repeat(".", maxIgnorable+1)),
479		t:       Title(language.Und),
480	}, {
481		desc:    "und/title/final sigma: apostrophe",
482		src:     "ΟΣ''a",
483		want:    "Οσ''A",
484		dstSize: minBufSize,
485		srcSize: minBufSize,
486		t:       Title(language.Und),
487	}, {
488		desc:    "el/upper/max ignorables",
489		src:     "ο" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0313",
490		want:    "Ο" + strings.Repeat("\u0321", maxIgnorable-1),
491		dstSize: minBufSize,
492		srcSize: minBufSize,
493		t:       Upper(language.Greek),
494	}, {
495		desc:    "el/upper/too many ignorables",
496		src:     "ο" + strings.Repeat("\u0321", maxIgnorable) + "\u0313",
497		want:    "Ο" + strings.Repeat("\u0321", maxIgnorable) + "\u0313",
498		dstSize: minBufSize,
499		srcSize: len("ο" + strings.Repeat("\u0321", maxIgnorable)),
500		t:       Upper(language.Greek),
501	}, {
502		desc:     "el/upper/short dst",
503		src:      "123ο",
504		want:     "123Ο",
505		firstErr: transform.ErrShortDst,
506		dstSize:  3,
507		srcSize:  minBufSize,
508		t:        Upper(language.Greek),
509	}, {
510		desc:    "lt/lower/max ignorables",
511		src:     "I" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0300",
512		want:    "i" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0307\u0300",
513		dstSize: minBufSize,
514		srcSize: minBufSize,
515		t:       Lower(language.Lithuanian),
516	}, {
517		desc:    "lt/lower/too many ignorables",
518		src:     "I" + strings.Repeat("\u0321", maxIgnorable) + "\u0300",
519		want:    "i" + strings.Repeat("\u0321", maxIgnorable) + "\u0300",
520		dstSize: minBufSize,
521		srcSize: len("I" + strings.Repeat("\u0321", maxIgnorable)),
522		t:       Lower(language.Lithuanian),
523	}, {
524		desc:     "lt/lower/decomposition with short dst buffer 1",
525		src:      "aaaaa\u00cc", // U+00CC LATIN CAPITAL LETTER I GRAVE
526		firstErr: transform.ErrShortDst,
527		want:     "aaaaai\u0307\u0300",
528		dstSize:  5,
529		srcSize:  minBufSize,
530		t:        Lower(language.Lithuanian),
531	}, {
532		desc:     "lt/lower/decomposition with short dst buffer 2",
533		src:      "aaaa\u00cc", // U+00CC LATIN CAPITAL LETTER I GRAVE
534		firstErr: transform.ErrShortDst,
535		want:     "aaaai\u0307\u0300",
536		dstSize:  5,
537		srcSize:  minBufSize,
538		t:        Lower(language.Lithuanian),
539	}, {
540		desc:    "lt/upper/max ignorables",
541		src:     "i" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0307\u0300",
542		want:    "I" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0300",
543		dstSize: minBufSize,
544		srcSize: minBufSize,
545		t:       Upper(language.Lithuanian),
546	}, {
547		desc:    "lt/upper/too many ignorables",
548		src:     "i" + strings.Repeat("\u0321", maxIgnorable) + "\u0307\u0300",
549		want:    "I" + strings.Repeat("\u0321", maxIgnorable) + "\u0307\u0300",
550		dstSize: minBufSize,
551		srcSize: len("i" + strings.Repeat("\u0321", maxIgnorable)),
552		t:       Upper(language.Lithuanian),
553	}, {
554		desc:     "lt/upper/short dst",
555		src:      "12i\u0307\u0300",
556		want:     "12\u00cc",
557		firstErr: transform.ErrShortDst,
558		dstSize:  3,
559		srcSize:  minBufSize,
560		t:        Upper(language.Lithuanian),
561	}, {
562		desc:    "aztr/lower/max ignorables",
563		src:     "I" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0307\u0300",
564		want:    "i" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0300",
565		dstSize: minBufSize,
566		srcSize: minBufSize,
567		t:       Lower(language.Turkish),
568	}, {
569		desc:    "aztr/lower/too many ignorables",
570		src:     "I" + strings.Repeat("\u0321", maxIgnorable) + "\u0307\u0300",
571		want:    "\u0131" + strings.Repeat("\u0321", maxIgnorable) + "\u0307\u0300",
572		dstSize: minBufSize,
573		srcSize: len("I" + strings.Repeat("\u0321", maxIgnorable)),
574		t:       Lower(language.Turkish),
575	}, {
576		desc:     "nl/title/pre-IJ cutoff",
577		src:      "  ij",
578		want:     "  IJ",
579		firstErr: transform.ErrShortDst,
580		dstSize:  2,
581		srcSize:  minBufSize,
582		t:        Title(language.Dutch),
583	}, {
584		desc:     "nl/title/mid-IJ cutoff",
585		src:      "  ij",
586		want:     "  IJ",
587		firstErr: transform.ErrShortDst,
588		dstSize:  3,
589		srcSize:  minBufSize,
590		t:        Title(language.Dutch),
591	}, {
592		desc:     "af/title/apostrophe",
593		src:      "'n bietje",
594		want:     "'n Bietje",
595		firstErr: transform.ErrShortDst,
596		dstSize:  3,
597		srcSize:  minBufSize,
598		t:        Title(language.Afrikaans),
599	}}
600}
601
602func TestShortBuffersAndOverflow(t *testing.T) {
603	for i, tt := range bufferTests {
604		testtext.Run(t, tt.desc, func(t *testing.T) {
605			buf := make([]byte, tt.dstSize)
606			got := []byte{}
607			var nSrc, nDst int
608			var err error
609			for p := 0; p < len(tt.src); p += nSrc {
610				q := p + tt.srcSize
611				if q > len(tt.src) {
612					q = len(tt.src)
613				}
614				nDst, nSrc, err = tt.t.Transform(buf, []byte(tt.src[p:q]), q == len(tt.src))
615				got = append(got, buf[:nDst]...)
616
617				if p == 0 && err != tt.firstErr {
618					t.Errorf("%d:%s:\n error was %v; want %v", i, tt.desc, err, tt.firstErr)
619					break
620				}
621			}
622			if string(got) != tt.want {
623				t.Errorf("%d:%s:\ngot  %+q;\nwant %+q", i, tt.desc, got, tt.want)
624			}
625			testHandover(t, Caser{tt.t}, tt.src)
626		})
627	}
628}
629
630func TestSpan(t *testing.T) {
631	for _, tt := range []struct {
632		desc  string
633		src   string
634		want  string
635		atEOF bool
636		err   error
637		t     Caser
638	}{{
639		desc:  "und/upper/basic",
640		src:   "abcdefg",
641		want:  "",
642		atEOF: true,
643		err:   transform.ErrEndOfSpan,
644		t:     Upper(language.Und),
645	}, {
646		desc:  "und/upper/short src",
647		src:   "123É"[:4],
648		want:  "123",
649		atEOF: false,
650		err:   transform.ErrShortSrc,
651		t:     Upper(language.Und),
652	}, {
653		desc:  "und/upper/no error on short",
654		src:   "12",
655		want:  "12",
656		atEOF: false,
657		t:     Upper(language.Und),
658	}, {
659		desc:  "und/lower/basic",
660		src:   "ABCDEFG",
661		want:  "",
662		atEOF: true,
663		err:   transform.ErrEndOfSpan,
664		t:     Lower(language.Und),
665	}, {
666		desc:  "und/lower/short src num",
667		src:   "123é"[:4],
668		want:  "123",
669		atEOF: false,
670		err:   transform.ErrShortSrc,
671		t:     Lower(language.Und),
672	}, {
673		desc:  "und/lower/short src greek",
674		src:   "αβγé"[:7],
675		want:  "αβγ",
676		atEOF: false,
677		err:   transform.ErrShortSrc,
678		t:     Lower(language.Und),
679	}, {
680		desc:  "und/lower/no error on short",
681		src:   "12",
682		want:  "12",
683		atEOF: false,
684		t:     Lower(language.Und),
685	}, {
686		desc:  "und/lower/simple (no final sigma)",
687		src:   "ος οσσ",
688		want:  "οσ οσσ",
689		atEOF: true,
690		t:     Lower(language.Und, HandleFinalSigma(false)),
691	}, {
692		desc:  "und/title/simple (no final sigma)",
693		src:   "Οσ Οσσ",
694		want:  "Οσ Οσσ",
695		atEOF: true,
696		t:     Title(language.Und, HandleFinalSigma(false)),
697	}, {
698		desc: "und/lower/final sigma: no error",
699		src:  "οΣ", // Oς
700		want: "ο",  // Oς
701		err:  transform.ErrEndOfSpan,
702		t:    Lower(language.Und),
703	}, {
704		desc: "und/title/final sigma: no error",
705		src:  "ΟΣ", // Oς
706		want: "Ο",  // Oς
707		err:  transform.ErrEndOfSpan,
708		t:    Title(language.Und),
709	}, {
710		desc: "und/title/final sigma: no short source!",
711		src:  "ΟσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσΣ",
712		want: "Οσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσσ",
713		err:  transform.ErrEndOfSpan,
714		t:    Title(language.Und),
715	}, {
716		desc:  "und/title/clipped UTF-8 rune",
717		src:   "Σσ" + string([]byte{0xCF}),
718		want:  "Σσ",
719		atEOF: false,
720		err:   transform.ErrShortSrc,
721		t:     Title(language.Und),
722	}, {
723		desc:  "und/title/clipped UTF-8 rune atEOF",
724		src:   "Σσσ" + string([]byte{0xCF}),
725		want:  "Σσσ" + string([]byte{0xCF}),
726		atEOF: true,
727		t:     Title(language.Und),
728	}, {
729		// Note: the choice to change the final sigma at the end in case of
730		// too many case ignorables is arbitrary. The main reason for this
731		// choice is that it results in simpler code.
732		desc: "und/title/long string",
733		src:  "A" + strings.Repeat("a", maxIgnorable+5),
734		want: "A" + strings.Repeat("a", maxIgnorable+5),
735		t:    Title(language.Und),
736	}, {
737		// Note: the choice to change the final sigma at the end in case of
738		// too many case ignorables is arbitrary. The main reason for this
739		// choice is that it results in simpler code.
740		desc:  "und/title/cyrillic",
741		src:   "При",
742		want:  "При",
743		atEOF: true,
744		t:     Title(language.Und, HandleFinalSigma(false)),
745	}, {
746		// Note: the choice to change the final sigma at the end in case of
747		// too many case ignorables is arbitrary. The main reason for this
748		// choice is that it results in simpler code.
749		desc: "und/title/final sigma: max ignorables",
750		src:  "Οσ" + strings.Repeat(".", maxIgnorable) + "A",
751		want: "Οσ" + strings.Repeat(".", maxIgnorable) + "A",
752		t:    Title(language.Und),
753	}, {
754		desc: "el/upper/max ignorables - not implemented",
755		src:  "Ο" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0313",
756		want: "",
757		err:  transform.ErrEndOfSpan,
758		t:    Upper(language.Greek),
759	}, {
760		desc: "el/upper/too many ignorables - not implemented",
761		src:  "Ο" + strings.Repeat("\u0321", maxIgnorable) + "\u0313",
762		want: "",
763		err:  transform.ErrEndOfSpan,
764		t:    Upper(language.Greek),
765	}, {
766		desc: "el/upper/short dst",
767		src:  "123ο",
768		want: "",
769		err:  transform.ErrEndOfSpan,
770		t:    Upper(language.Greek),
771	}, {
772		desc: "lt/lower/max ignorables",
773		src:  "i" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0307\u0300",
774		want: "i" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0307\u0300",
775		t:    Lower(language.Lithuanian),
776	}, {
777		desc: "lt/lower/isLower",
778		src:  "I" + strings.Repeat("\u0321", maxIgnorable) + "\u0300",
779		want: "",
780		err:  transform.ErrEndOfSpan,
781		t:    Lower(language.Lithuanian),
782	}, {
783		desc: "lt/lower/not identical",
784		src:  "aaaaa\u00cc", // U+00CC LATIN CAPITAL LETTER I GRAVE
785		err:  transform.ErrEndOfSpan,
786		want: "aaaaa",
787		t:    Lower(language.Lithuanian),
788	}, {
789		desc: "lt/lower/identical",
790		src:  "aaaai\u0307\u0300", // U+00CC LATIN CAPITAL LETTER I GRAVE
791		want: "aaaai\u0307\u0300",
792		t:    Lower(language.Lithuanian),
793	}, {
794		desc: "lt/upper/not implemented",
795		src:  "I" + strings.Repeat("\u0321", maxIgnorable-1) + "\u0300",
796		want: "",
797		err:  transform.ErrEndOfSpan,
798		t:    Upper(language.Lithuanian),
799	}, {
800		desc: "lt/upper/not implemented, ascii",
801		src:  "AB",
802		want: "",
803		err:  transform.ErrEndOfSpan,
804		t:    Upper(language.Lithuanian),
805	}, {
806		desc: "nl/title/pre-IJ cutoff",
807		src:  "  IJ",
808		want: "  IJ",
809		t:    Title(language.Dutch),
810	}, {
811		desc: "nl/title/mid-IJ cutoff",
812		src:  "  Ia",
813		want: "  Ia",
814		t:    Title(language.Dutch),
815	}, {
816		desc: "af/title/apostrophe",
817		src:  "'n Bietje",
818		want: "'n Bietje",
819		t:    Title(language.Afrikaans),
820	}, {
821		desc: "af/title/apostrophe-incorrect",
822		src:  "'N Bietje",
823		// The Single_Quote (a MidWord), needs to be retained as unspanned so
824		// that a successive call to Transform can detect that N should not be
825		// capitalized.
826		want: "",
827		err:  transform.ErrEndOfSpan,
828		t:    Title(language.Afrikaans),
829	}} {
830		testtext.Run(t, tt.desc, func(t *testing.T) {
831			for p := 0; p < len(tt.want); p += utf8.RuneLen([]rune(tt.src[p:])[0]) {
832				tt.t.Reset()
833				n, err := tt.t.Span([]byte(tt.src[:p]), false)
834				if err != nil && err != transform.ErrShortSrc {
835					t.Errorf("early failure:Span(%+q): %v (%d < %d)", tt.src[:p], err, n, len(tt.want))
836					break
837				}
838			}
839			tt.t.Reset()
840			n, err := tt.t.Span([]byte(tt.src), tt.atEOF)
841			if n != len(tt.want) || err != tt.err {
842				t.Errorf("Span(%+q, %v): got %d, %v; want %d, %v", tt.src, tt.atEOF, n, err, len(tt.want), tt.err)
843			}
844			testHandover(t, tt.t, tt.src)
845		})
846	}
847}
848
// txtASCII is an ASCII-only sample text: one pangram sentence repeated 50
// times. BenchmarkCasers uses it as its "ascii" input.
var txtASCII = strings.Repeat("The quick brown fox jumps over the lazy dog. ", 50)
850
851// Taken from http://creativecommons.org/licenses/by-sa/3.0/vn/
852const txt_vn = `Vi các điu kin sau: Ghi nhn công ca tác giả.  Nếu bn s853dng, chuyn đổi, hoc xây dng dự án tni dung được chia snày, bn phi áp
854dng giy phép này hoc  mt giy phép khác có các điu khon tương tnhư giy
855phép này cho dự án ca bn. Hiu rng: MinBt kcác điu kin nào trên đây
856cũng có thể được min bnếu bn được scho phép ca người shu bn quyn.
857Phm vi công chúngKhi tác phm hoc bt kchương nào ca tác phm đã trong
858vùng dành cho công chúng theo quy định ca pháp lut thì tình trng ca nó không
859bị ảnh hưởng bi giy phép trong bt ktrường hp nào.`
860
// Taken from http://creativecommons.org/licenses/by-sa/2.5/cn/
862const txt_cn = `您可以自由: 复制、发行、展览、表演、放映、
863广播或通过信息网络传播本作品 创作演绎作品
864对本作品进行商业性使用 惟须遵守下列条件:
865署名 — 您必须按照作者或者许可人指定的方式对作品进行署名。
866相同方式共享 — 如果您改变、转换本作品或者以本作品为基础进行创作,
867您只能采用与本协议相同的许可协议发布基于本作品的演绎作品。`
868
869// Taken from http://creativecommons.org/licenses/by-sa/1.0/deed.ru
870const txt_ru = `При обязательном соблюдении следующих условий: Attribution — Вы
871должны атрибутировать произведение (указывать автора и источник) в порядке,
872предусмотренном автором или лицензиаром (но только так, чтобы никоим образом не
873подразумевалось, что они поддерживают вас или использование вами данного
874произведения). Υπό τις ακόλουθες προϋποθέσεις:`
875
876// Taken from http://creativecommons.org/licenses/by-sa/3.0/gr/
877const txt_gr = `Αναφορά Δημιουργού — Θα πρέπει να κάνετε την αναφορά στο έργο με
878τον τρόπο που έχει οριστεί από το δημιουργό ή το χορηγούντο την άδεια (χωρίς
879όμως να εννοείται με οποιονδήποτε τρόπο ότι εγκρίνουν εσάς ή τη χρήση του έργου
880από εσάς). Παρόμοια Διανομή — Εάν αλλοιώσετε, τροποποιήσετε ή δημιουργήσετε
881περαιτέρω βασισμένοι στο έργο θα μπορείτε να διανέμετε το έργο που θα προκύψει
882μόνο με την ίδια ή παρόμοια άδεια.`
883
// txtNonASCII is the concatenation of the txt_vn, txt_cn, txt_ru and txt_gr
// sample texts. TestAlloc and BenchmarkCasers use it as non-ASCII input.
const txtNonASCII = txt_vn + txt_cn + txt_ru + txt_gr
885
886// TODO: Improve ASCII performance.
887
888func BenchmarkCasers(b *testing.B) {
889	for _, s := range []struct{ name, text string }{
890		{"ascii", txtASCII},
891		{"nonASCII", txtNonASCII},
892		{"short", "При"},
893	} {
894		src := []byte(s.text)
895		// Measure case mappings in bytes package for comparison.
896		for _, f := range []struct {
897			name string
898			fn   func(b []byte) []byte
899		}{
900			{"lower", bytes.ToLower},
901			{"title", bytes.ToTitle},
902			{"upper", bytes.ToUpper},
903		} {
904			testtext.Bench(b, path.Join(s.name, "bytes", f.name), func(b *testing.B) {
905				b.SetBytes(int64(len(src)))
906				for i := 0; i < b.N; i++ {
907					f.fn(src)
908				}
909			})
910		}
911		for _, t := range []struct {
912			name  string
913			caser transform.SpanningTransformer
914		}{
915			{"fold/default", Fold()},
916			{"upper/default", Upper(language.Und)},
917			{"lower/sigma", Lower(language.Und)},
918			{"lower/simple", Lower(language.Und, HandleFinalSigma(false))},
919			{"title/sigma", Title(language.Und)},
920			{"title/simple", Title(language.Und, HandleFinalSigma(false))},
921		} {
922			c := Caser{t.caser}
923			dst := make([]byte, len(src))
924			testtext.Bench(b, path.Join(s.name, t.name, "transform"), func(b *testing.B) {
925				b.SetBytes(int64(len(src)))
926				for i := 0; i < b.N; i++ {
927					c.Reset()
928					c.Transform(dst, src, true)
929				}
930			})
931			// No need to check span for simple cases, as they will be the same
932			// as sigma.
933			if strings.HasSuffix(t.name, "/simple") {
934				continue
935			}
936			spanSrc := c.Bytes(src)
937			testtext.Bench(b, path.Join(s.name, t.name, "span"), func(b *testing.B) {
938				c.Reset()
939				if n, _ := c.Span(spanSrc, true); n < len(spanSrc) {
940					b.Fatalf("spanner is not recognizing text %q as done (at %d)", spanSrc, n)
941				}
942				b.SetBytes(int64(len(spanSrc)))
943				for i := 0; i < b.N; i++ {
944					c.Reset()
945					c.Span(spanSrc, true)
946				}
947			})
948		}
949	}
950}
951