1// Copyright 2016 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build icu
6
7package cases
8
9import (
10	"path"
11	"strings"
12	"testing"
13
14	"golang.org/x/text/internal/testtext"
15	"golang.org/x/text/language"
16	"golang.org/x/text/unicode/norm"
17)
18
19func TestICUConformance(t *testing.T) {
20	// Build test set.
21	input := []string{
22		"a.a a_a",
23		"a\u05d0a",
24		"\u05d0'a",
25		"a\u03084a",
26		"a\u0308a",
27		"a3\u30a3a",
28		"a\u303aa",
29		"a_\u303a_a",
30		"1_a..a",
31		"1_a.a",
32		"a..a.",
33		"a--a-",
34		"a-a-",
35		"a\u200ba",
36		"a\u200b\u200ba",
37		"a\u00ad\u00ada", // Format
38		"a\u00ada",
39		"a''a", // SingleQuote
40		"a'a",
41		"a::a", // MidLetter
42		"a:a",
43		"a..a", // MidNumLet
44		"a.a",
45		"a;;a", // MidNum
46		"a;a",
47		"a__a", // ExtendNumlet
48		"a_a",
49		"ΟΣ''a",
50	}
51	add := func(x interface{}) {
52		switch v := x.(type) {
53		case string:
54			input = append(input, v)
55		case []string:
56			for _, s := range v {
57				input = append(input, s)
58			}
59		}
60	}
61	for _, tc := range testCases {
62		add(tc.src)
63		add(tc.lower)
64		add(tc.upper)
65		add(tc.title)
66	}
67	for _, tc := range bufferTests {
68		add(tc.src)
69	}
70	for _, tc := range breakTest {
71		add(strings.Replace(tc, "|", "", -1))
72	}
73	for _, tc := range foldTestCases {
74		add(tc)
75	}
76
77	// Compare ICU to Go.
78	for _, c := range []string{"lower", "upper", "title", "fold"} {
79		for _, tag := range []string{
80			"und", "af", "az", "el", "lt", "nl", "tr",
81		} {
82			for _, s := range input {
83				if exclude(c, tag, s) {
84					continue
85				}
86				testtext.Run(t, path.Join(c, tag, s), func(t *testing.T) {
87					want := doICU(tag, c, s)
88					got := doGo(tag, c, s)
89					if norm.NFC.String(got) != norm.NFC.String(want) {
90						t.Errorf("\n    in %[3]q (%+[3]q)\n   got %[1]q (%+[1]q)\n  want %[2]q (%+[2]q)", got, want, s)
91					}
92				})
93			}
94		}
95	}
96}
97
// exclude reports whether the string s should be skipped when comparing the
// case mapping cm ("lower", "upper", "title" or "fold") for language tag.
//
// Each entry of the exclusion list holds a space-separated set of case
// mappings, a space-separated set of language tags, and a pattern. An empty
// set matches everything; otherwise cm (respectively tag) must occur as a
// substring of the set. An entry applies if both sets match and s contains
// the pattern as a substring. exclude returns true as soon as one entry
// applies and false if none does.
func exclude(cm, tag, s string) bool {
	list := []struct{ cm, tags, pattern string }{
		// TODO: Go does not handle certain esoteric breaks correctly. This will be
		// fixed once we have a real word break iterator. Alternatively, it
		// seems like we're not too far off from making it work, so we could
		// fix these last steps. But first verify that using a separate word
		// breaker does not hurt performance.
		{"title", "af nl", "a''a"},
		{"", "", "א'a"},

		// All the exclusions below seem to be issues with the ICU
		// implementation (at version 57) and thus are not marked as TODO.

		// ICU does not handle leading apostrophe for Dutch and
		// Afrikaans correctly. See https://unicode.org/cldr/trac/ticket/7078.
		{"title", "af nl", "'n"},
		{"title", "af nl", "'N"},

		// Go terminates the final sigma check after a fixed number of
		// ignorables have been found. This ensures that the algorithm can make
		// progress in a streaming scenario.
		{"lower title", "", "\u039f\u03a3...............................a"},
		// This also applies to upper in Greek.
		// NOTE: we could fix the following two cases by adding state to elUpper
		// and aztrLower. However, considering a modifier to not belong to the
		// preceding letter after the maximum modifiers count is reached is
		// consistent with the behavior of unicode/norm.
		{"upper", "el", "\u03bf" + strings.Repeat("\u0321", 29) + "\u0313"},
		{"lower", "az tr lt", "I" + strings.Repeat("\u0321", 30) + "\u0307\u0300"},
		{"upper", "lt", "i" + strings.Repeat("\u0321", 30) + "\u0307\u0300"},
		{"lower", "lt", "I" + strings.Repeat("\u0321", 30) + "\u0300"},

		// ICU title case seems to erroneously remove \u0307 from an upper case
		// I unconditionally, instead of only when lowercasing. The ICU
		// transform algorithm transforms these cases consistently with our
		// implementation.
		{"title", "az tr", "\u0307"},

		// The spec says to remove \u0307 after Soft_Dotted characters. ICU
		// transforms conform but ucasemap_utf8ToUpper does not.
		{"upper title", "lt", "i\u0307"},
		{"upper title", "lt", "i" + strings.Repeat("\u0321", 29) + "\u0307\u0300"},

		// Both Unicode and CLDR prescribe an extra explicit dot above after a
		// Soft_Dotted character if there are other modifiers.
		// ucasemap_utf8ToUpper does not do this; ICU transforms do.
		// The issue with ucasemap_utf8ToUpper seems to be that it does not
		// consider the modifiers that are part of composition in the evaluation
		// of More_Above. For instance, according to the More_Above rule for lt,
		// a dotted capital I (U+0130) becomes i\u0307\u0307 (a small i with
		// two additional dots). This seems odd, but is correct. ICU is
		// definitely not correct as it produces different results for different
		// normal forms. For instance, for an İ:
		//    \u0130  (NFC) -> i\u0307         (incorrect)
		//    I\u0307 (NFD) -> i\u0307\u0307   (correct)
		// We could argue that we should not add a \u0307 if there already is
		// one, but this may be hard to get correct and does not conform to the
		// standard.
		{"lower title", "lt", "\u0130"},
		{"lower title", "lt", "\u00cf"},

		// We would conform to ICU's ucasemap_utf8ToUpper if we removed support
		// for elUpper. However, this clearly does not conform to the spec.
		// Moreover, the ICU transforms _do_ implement this transform and
		// produce results consistent with our implementation. Note that we
		// still prefer to use ucasemap_utf8ToUpper instead of transforms as
		// the latter have inconsistencies in the word breaking algorithm.
		{"upper", "el", "\u0386"}, // GREEK CAPITAL LETTER ALPHA WITH TONOS
		{"upper", "el", "\u0389"}, // GREEK CAPITAL LETTER ETA WITH TONOS
		{"upper", "el", "\u038A"}, // GREEK CAPITAL LETTER IOTA WITH TONOS

		{"upper", "el", "\u0391"}, // GREEK CAPITAL LETTER ALPHA
		{"upper", "el", "\u0397"}, // GREEK CAPITAL LETTER ETA
		{"upper", "el", "\u0399"}, // GREEK CAPITAL LETTER IOTA

		{"upper", "el", "\u03AC"}, // GREEK SMALL LETTER ALPHA WITH TONOS
		{"upper", "el", "\u03AE"}, // GREEK SMALL LETTER ETA WITH TONOS
		{"upper", "el", "\u03AF"}, // GREEK SMALL LETTER IOTA WITH TONOS

		{"upper", "el", "\u03B1"}, // GREEK SMALL LETTER ALPHA
		{"upper", "el", "\u03B7"}, // GREEK SMALL LETTER ETA
		{"upper", "el", "\u03B9"}, // GREEK SMALL LETTER IOTA
	}
	for _, x := range list {
		// An empty cm or tags field acts as a wildcard.
		if x.cm != "" && !strings.Contains(x.cm, cm) {
			continue
		}
		if x.tags != "" && !strings.Contains(x.tags, tag) {
			continue
		}
		if strings.Contains(s, x.pattern) {
			return true
		}
	}
	return false
}
195
196func doGo(tag, caser, input string) string {
197	var c Caser
198	t := language.MustParse(tag)
199	switch caser {
200	case "lower":
201		c = Lower(t)
202	case "upper":
203		c = Upper(t)
204	case "title":
205		c = Title(t)
206	case "fold":
207		c = Fold()
208	}
209	return c.String(input)
210}
211