1// Copyright 2012 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package collate
6
7import (
8	"testing"
9
10	"golang.org/x/text/collate/build"
11	"golang.org/x/text/internal/colltab"
12	"golang.org/x/text/unicode/norm"
13)
14
// ColElems is a sequence of Weights. The tests in this file use it to spell
// out the collation elements expected from a table lookup.
type ColElems []Weights
16
// input is a single table entry: makeTable converts str to runes and adds it
// to the builder together with ces.
type input struct {
	str string  // text the entry matches
	ces [][]int // collation elements for str, one weight list per element
}
21
// check is one expectation evaluated by TestAppendNext against a built table.
type check struct {
	in  string   // text whose bytes are handed to AppendNext
	n   int      // expected number of bytes consumed
	out ColElems // expected collation elements
}
27
// tableTest pairs the entries used to build a table with the checks that are
// run against that table.
type tableTest struct {
	in  []input // entries passed to makeTable
	chk []check // expectations verified against the resulting table
}
32
33func w(ce ...int) Weights {
34	return W(ce...)
35}
36
// defaults is the Weights value returned by w(0). pt reads its Secondary field
// to fill in the secondary weight of the elements it builds.
// NOTE(review): presumably W substitutes default weights for the levels that
// are not passed explicitly - confirm against W.
var defaults = w(0)
38
39func pt(p, t int) []int {
40	return []int{p, defaults.Secondary, t}
41}
42
43func makeTable(in []input) (*Collator, error) {
44	b := build.NewBuilder()
45	for _, r := range in {
46		if e := b.Add([]rune(r.str), r.ces, nil); e != nil {
47			panic(e)
48		}
49	}
50	t, err := b.Build()
51	if err != nil {
52		return nil, err
53	}
54	return NewFromTable(t), nil
55}
56
// modSeq holds a sequence of modifiers in increasing order of CCC that is long
// enough to cause a segment overflow if not handled correctly. The last rune
// in this list has a CCC of 214.
var modSeq = []rune{
	0x05B1, 0x05B2, 0x05B3, 0x05B4, 0x05B5, 0x05B6, 0x05B7, 0x05B8, 0x05B9, 0x05BB,
	0x05BC, 0x05BD, 0x05BF, 0x05C1, 0x05C2, 0xFB1E, 0x064B, 0x064C, 0x064D, 0x064E,
	0x064F, 0x0650, 0x0651, 0x0652, 0x0670, 0x0711, 0x0C55, 0x0C56, 0x0E38, 0x0E48,
	0x0EB8, 0x0EC8, 0x0F71, 0x0F72, 0x0F74, 0x0321, 0x1DCE,
}
66
67var mods []input
68var modW = func() ColElems {
69	ws := ColElems{}
70	for _, r := range modSeq {
71		rune := norm.NFC.PropertiesString(string(r))
72		ws = append(ws, w(0, int(rune.CCC())))
73		mods = append(mods, input{string(r), [][]int{{0, int(rune.CCC())}}})
74	}
75	return ws
76}()
77
// appendNextTests lists the cases run by TestAppendNext. For every entry a
// table is built from the inputs; each check then states, for the text in,
// how many bytes AppendNext is expected to consume (n) and which collation
// elements it is expected to return (out).
var appendNextTests = []tableTest{
	{ // test getWeights
		[]input{
			{"a", [][]int{{100}}},
			{"b", [][]int{{105}}},
			{"c", [][]int{{110}}},
			{"ß", [][]int{{120}}},
		},
		[]check{
			{"a", 1, ColElems{w(100)}},
			{"b", 1, ColElems{w(105)}},
			{"c", 1, ColElems{w(110)}},
			// "d" has no entry in the table above.
			{"d", 1, ColElems{w(0x50064)}},
			{"ab", 1, ColElems{w(100)}},
			{"bc", 1, ColElems{w(105)}},
			{"dd", 1, ColElems{w(0x50064)}},
			// "ß" occupies two bytes in UTF-8, hence n is 2.
			{"ß", 2, ColElems{w(120)}},
		},
	},
	{ // test expansion
		[]input{
			{"u", [][]int{{100}}},
			{"U", [][]int{{100}, {0, 25}}},
			{"w", [][]int{{100}, {100}}},
			{"W", [][]int{{100}, {0, 25}, {100}, {0, 25}}},
		},
		[]check{
			{"u", 1, ColElems{w(100)}},
			{"U", 1, ColElems{w(100), w(0, 25)}},
			{"w", 1, ColElems{w(100), w(100)}},
			{"W", 1, ColElems{w(100), w(0, 25), w(100), w(0, 25)}},
		},
	},
	{ // test decompose
		[]input{
			{"D", [][]int{pt(104, 8)}},
			{"z", [][]int{pt(130, 8)}},
			{"\u030C", [][]int{{0, 40}}},                               // Caron
			{"\u01C5", [][]int{pt(104, 9), pt(130, 4), {0, 40, 0x1F}}}, // Dž = D+z+caron
		},
		[]check{
			{"\u01C5", 2, ColElems{w(pt(104, 9)...), w(pt(130, 4)...), w(0, 40, 0x1F)}},
		},
	},
	{ // test basic contraction
		[]input{
			{"a", [][]int{{100}}},
			{"ab", [][]int{{101}}},
			{"aab", [][]int{{101}, {101}}},
			{"abc", [][]int{{102}}},
			{"b", [][]int{{200}}},
			{"c", [][]int{{300}}},
			{"d", [][]int{{400}}},
		},
		[]check{
			{"a", 1, ColElems{w(100)}},
			{"aa", 1, ColElems{w(100)}},
			{"aac", 1, ColElems{w(100)}},
			{"d", 1, ColElems{w(400)}},
			{"ab", 2, ColElems{w(101)}},
			{"abb", 2, ColElems{w(101)}},
			{"aab", 3, ColElems{w(101), w(101)}},
			{"aaba", 3, ColElems{w(101), w(101)}},
			{"abc", 3, ColElems{w(102)}},
			{"abcd", 3, ColElems{w(102)}},
		},
	},
	{ // test discontinuous contraction
		append(mods, []input{
			// modifiers; secondary weight equals ccc
			{"\u0316", [][]int{{0, 220}}},
			{"\u0317", [][]int{{0, 220}, {0, 220}}},
			{"\u302D", [][]int{{0, 222}}},
			{"\u302E", [][]int{{0, 225}}}, // used as starter
			{"\u302F", [][]int{{0, 224}}}, // used as starter
			{"\u18A9", [][]int{{0, 228}}},
			{"\u0300", [][]int{{0, 230}}},
			{"\u0301", [][]int{{0, 230}}},
			{"\u0315", [][]int{{0, 232}}},
			{"\u031A", [][]int{{0, 232}}},
			{"\u035C", [][]int{{0, 233}}},
			{"\u035F", [][]int{{0, 233}}},
			{"\u035D", [][]int{{0, 234}}},
			{"\u035E", [][]int{{0, 234}}},
			{"\u0345", [][]int{{0, 240}}},

			// starters
			{"a", [][]int{{100}}},
			{"b", [][]int{{200}}},
			{"c", [][]int{{300}}},
			{"\u03B1", [][]int{{900}}},
			{"\x01", [][]int{{0, 0, 0, 0}}},

			// contractions
			{"a\u0300", [][]int{{101}}},
			{"a\u0301", [][]int{{102}}},
			{"a\u035E", [][]int{{110}}},
			{"a\u035Eb\u035E", [][]int{{115}}},
			{"ac\u035Eaca\u035E", [][]int{{116}}},
			{"a\u035Db\u035D", [][]int{{117}}},
			{"a\u0301\u035Db", [][]int{{120}}},
			{"a\u0301\u035F", [][]int{{121}}},
			{"a\u0301\u035Fb", [][]int{{119}}},
			{"\u03B1\u0345", [][]int{{901}, {902}}},
			{"\u302E\u302F", [][]int{{0, 131}, {0, 131}}},
			{"\u302F\u18A9", [][]int{{0, 130}}},
		}...),
		[]check{
			{"a\x01\u0300", 1, ColElems{w(100)}},
			{"ab", 1, ColElems{w(100)}},                              // closing segment
			{"a\u0316\u0300b", 5, ColElems{w(101), w(0, 220)}},       // closing segment
			{"a\u0316\u0300", 5, ColElems{w(101), w(0, 220)}},        // no closing segment
			{"a\u0316\u0300\u035Cb", 5, ColElems{w(101), w(0, 220)}}, // completes before segment end
			{"a\u0316\u0300\u035C", 5, ColElems{w(101), w(0, 220)}},  // completes before segment end

			{"a\u0316\u0301b", 5, ColElems{w(102), w(0, 220)}},       // closing segment
			{"a\u0316\u0301", 5, ColElems{w(102), w(0, 220)}},        // no closing segment
			{"a\u0316\u0301\u035Cb", 5, ColElems{w(102), w(0, 220)}}, // completes before segment end
			{"a\u0316\u0301\u035C", 5, ColElems{w(102), w(0, 220)}},  // completes before segment end

			// match blocked by modifier with same ccc
			{"a\u0301\u0315\u031A\u035Fb", 3, ColElems{w(102)}},

			// multiple gaps
			{"a\u0301\u035Db", 6, ColElems{w(120)}},
			{"a\u0301\u035F", 5, ColElems{w(121)}},
			{"a\u0301\u035Fb", 6, ColElems{w(119)}},
			{"a\u0316\u0301\u035F", 7, ColElems{w(121), w(0, 220)}},
			{"a\u0301\u0315\u035Fb", 7, ColElems{w(121), w(0, 232)}},
			{"a\u0316\u0301\u0315\u035Db", 5, ColElems{w(102), w(0, 220)}},
			{"a\u0316\u0301\u0315\u035F", 9, ColElems{w(121), w(0, 220), w(0, 232)}},
			{"a\u0316\u0301\u0315\u035Fb", 9, ColElems{w(121), w(0, 220), w(0, 232)}},
			{"a\u0316\u0301\u0315\u035F\u035D", 9, ColElems{w(121), w(0, 220), w(0, 232)}},
			{"a\u0316\u0301\u0315\u035F\u035Db", 9, ColElems{w(121), w(0, 220), w(0, 232)}},

			// handling of segment overflow
			{ // just fits within segment
				"a" + string(modSeq[:30]) + "\u0301",
				3 + len(string(modSeq[:30])),
				append(ColElems{w(102)}, modW[:30]...),
			},
			{"a" + string(modSeq[:31]) + "\u0301", 1, ColElems{w(100)}}, // overflow
			{"a" + string(modSeq) + "\u0301", 1, ColElems{w(100)}},
			{ // just fits within segment with two interstitial runes
				"a" + string(modSeq[:28]) + "\u0301\u0315\u035F",
				7 + len(string(modSeq[:28])),
				append(append(ColElems{w(121)}, modW[:28]...), w(0, 232)),
			},
			{ // second half does not fit within segment
				"a" + string(modSeq[:29]) + "\u0301\u0315\u035F",
				3 + len(string(modSeq[:29])),
				append(ColElems{w(102)}, modW[:29]...),
			},

			// discontinuity can only occur in last normalization segment
			{"a\u035Eb\u035E", 6, ColElems{w(115)}},
			{"a\u0316\u035Eb\u035E", 5, ColElems{w(110), w(0, 220)}},
			{"a\u035Db\u035D", 6, ColElems{w(117)}},
			{"a\u0316\u035Db\u035D", 1, ColElems{w(100)}},
			{"a\u035Eb\u0316\u035E", 8, ColElems{w(115), w(0, 220)}},
			{"a\u035Db\u0316\u035D", 8, ColElems{w(117), w(0, 220)}},
			{"ac\u035Eaca\u035E", 9, ColElems{w(116)}},
			{"a\u0316c\u035Eaca\u035E", 1, ColElems{w(100)}},
			{"ac\u035Eac\u0316a\u035E", 1, ColElems{w(100)}},

			// expanding contraction
			{"\u03B1\u0345", 4, ColElems{w(901), w(902)}},

			// Theoretical possibilities
			// contraction within a gap
			{"a\u302F\u18A9\u0301", 9, ColElems{w(102), w(0, 130)}},
			// expansion within a gap
			{"a\u0317\u0301", 5, ColElems{w(102), w(0, 220), w(0, 220)}},
			// repeating CCC blocks last modifier
			{"a\u302E\u302F\u0301", 1, ColElems{w(100)}},
			// The trailing combining characters (with lower CCC) should block the first one.
			// TODO: make the following pass.
			// {"a\u035E\u0316\u0316", 1, ColElems{w(100)}},
			{"a\u035F\u035Eb", 5, ColElems{w(110), w(0, 233)}},
			// Last combiner should match after normalization.
			// TODO: make the following pass.
			// {"a\u035D\u0301", 3, ColElems{w(102), w(0, 234)}},
			// The first combiner is blocking the second one as they have the same CCC.
			{"a\u035D\u035Eb", 1, ColElems{w(100)}},
		},
	},
}
265
266func TestAppendNext(t *testing.T) {
267	for i, tt := range appendNextTests {
268		c, err := makeTable(tt.in)
269		if err != nil {
270			t.Errorf("%d: error creating table: %v", i, err)
271			continue
272		}
273		for j, chk := range tt.chk {
274			ws, n := c.t.AppendNext(nil, []byte(chk.in))
275			if n != chk.n {
276				t.Errorf("%d:%d: bytes consumed was %d; want %d", i, j, n, chk.n)
277			}
278			out := convertFromWeights(chk.out)
279			if len(ws) != len(out) {
280				t.Errorf("%d:%d: len(ws) was %d; want %d (%X vs %X)\n%X", i, j, len(ws), len(out), ws, out, chk.in)
281				continue
282			}
283			for k, w := range ws {
284				w, _ = colltab.MakeElem(w.Primary(), w.Secondary(), int(w.Tertiary()), 0)
285				if w != out[k] {
286					t.Errorf("%d:%d: Weights %d was %X; want %X", i, j, k, w, out[k])
287				}
288			}
289		}
290	}
291}
292