1// Copyright 2015 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package search
6
7import (
8	"reflect"
9	"strings"
10	"testing"
11
12	"golang.org/x/text/language"
13)
14
15func TestCompile(t *testing.T) {
16	for i, tc := range []struct {
17		desc    string
18		pattern string
19		options []Option
20		n       int
21	}{{
22		desc:    "empty",
23		pattern: "",
24		n:       0,
25	}, {
26		desc:    "single",
27		pattern: "a",
28		n:       1,
29	}, {
30		desc:    "keep modifier",
31		pattern: "a\u0300", // U+0300: COMBINING GRAVE ACCENT
32		n:       2,
33	}, {
34		desc:    "remove modifier",
35		pattern: "a\u0300", // U+0300: COMBINING GRAVE ACCENT
36		options: []Option{IgnoreDiacritics},
37		n:       1,
38	}, {
39		desc:    "single with double collation element",
40		pattern: "ä",
41		n:       2,
42	}, {
43		desc:    "leading variable",
44		pattern: " a",
45		n:       2,
46	}, {
47		desc:    "trailing variable",
48		pattern: "aa ",
49		n:       3,
50	}, {
51		desc:    "leading and trailing variable",
52		pattern: " äb ",
53		n:       5,
54	}, {
55		desc:    "keep interior variable",
56		pattern: " ä b ",
57		n:       6,
58	}, {
59		desc:    "keep interior variables",
60		pattern: " b  ä ",
61		n:       7,
62	}, {
63		desc:    "remove ignoreables (zero-weights across the board)",
64		pattern: "\u009Db\u009Dä\u009D", // U+009D: OPERATING SYSTEM COMMAND
65		n:       3,
66	}} {
67		m := New(language.Und, tc.options...)
68		p := m.CompileString(tc.pattern)
69		if len(p.ce) != tc.n {
70			t.Errorf("%d:%s: Compile(%+q): got %d; want %d", i, tc.desc, tc.pattern, len(p.ce), tc.n)
71		}
72	}
73}
74
75func TestNorm(t *testing.T) {
76	// U+0300: COMBINING GRAVE ACCENT (CCC=230)
77	// U+031B: COMBINING HORN (CCC=216)
78	for _, tc := range []struct {
79		desc string
80		a    string
81		b    string
82		want bool // a and b compile into the same pattern?
83	}{{
84		"simple",
85		"eee\u0300\u031b",
86		"eee\u031b\u0300",
87		true,
88	}, {
89		"large number of modifiers in pattern",
90		strings.Repeat("\u0300", 29) + "\u0318",
91		"\u0318" + strings.Repeat("\u0300", 29),
92		true,
93	}, {
94		"modifier overflow in pattern",
95		strings.Repeat("\u0300", 30) + "\u0318",
96		"\u0318" + strings.Repeat("\u0300", 30),
97		false,
98	}} {
99		m := New(language.Und)
100		a := m.CompileString(tc.a)
101		b := m.CompileString(tc.b)
102		if got := reflect.DeepEqual(a, b); got != tc.want {
103			t.Errorf("Compile(a) == Compile(b) == %v; want %v", got, tc.want)
104		}
105	}
106}
107
108func TestForwardSearch(t *testing.T) {
109	for i, tc := range []struct {
110		desc    string
111		tag     string
112		options []Option
113		pattern string
114		text    string
115		want    []int
116	}{{
117		// The semantics of an empty search is to match nothing.
118		// TODO: change this to be in line with strings.Index? It is quite a
119		// different beast, so not sure yet.
120
121		desc:    "empty pattern and text",
122		tag:     "und",
123		pattern: "",
124		text:    "",
125		want:    nil, // TODO: consider: []int{0, 0},
126	}, {
127		desc:    "non-empty pattern and empty text",
128		tag:     "und",
129		pattern: " ",
130		text:    "",
131		want:    nil,
132	}, {
133		desc:    "empty pattern and non-empty text",
134		tag:     "und",
135		pattern: "",
136		text:    "abc",
137		want:    nil, // TODO: consider: []int{0, 0, 1, 1, 2, 2, 3, 3},
138	}, {
139		// Variable-only patterns. We don't support variables at the moment,
140		// but verify that, given this, the behavior is indeed as expected.
141
142		desc:    "exact match of variable",
143		tag:     "und",
144		pattern: " ",
145		text:    " ",
146		want:    []int{0, 1},
147	}, {
148		desc:    "variables not handled by default",
149		tag:     "und",
150		pattern: "- ",
151		text:    " -",
152		want:    nil, // Would be (1, 2) for a median match with variable}.
153	}, {
154		desc:    "multiple subsequent identical variables",
155		tag:     "und",
156		pattern: " ",
157		text:    "    ",
158		want:    []int{0, 1, 1, 2, 2, 3, 3, 4},
159	}, {
160		desc:    "text with variables",
161		tag:     "und",
162		options: []Option{IgnoreDiacritics},
163		pattern: "abc",
164		text:    "3 abc 3",
165		want:    []int{2, 5},
166	}, {
167		desc:    "pattern with interior variables",
168		tag:     "und",
169		options: []Option{IgnoreDiacritics},
170		pattern: "a b c",
171		text:    "3 a b c abc a  b  c 3",
172		want:    []int{2, 7}, // Would have 3 matches using variable.
173
174		// TODO: Different variable handling settings.
175	}, {
176		// Options.
177
178		desc:    "match all levels",
179		tag:     "und",
180		pattern: "Abc",
181		text:    "abcAbcABCÁbcábc",
182		want:    []int{3, 6},
183	}, {
184		desc:    "ignore diacritics in text",
185		tag:     "und",
186		options: []Option{IgnoreDiacritics},
187		pattern: "Abc",
188		text:    "Ábc",
189		want:    []int{0, 4},
190	}, {
191		desc:    "ignore diacritics in pattern",
192		tag:     "und",
193		options: []Option{IgnoreDiacritics},
194		pattern: "Ábc",
195		text:    "Abc",
196		want:    []int{0, 3},
197	}, {
198		desc:    "ignore diacritics",
199		tag:     "und",
200		options: []Option{IgnoreDiacritics},
201		pattern: "Abc",
202		text:    "abcAbcABCÁbcábc",
203		want:    []int{3, 6, 9, 13},
204	}, {
205		desc:    "ignore case",
206		tag:     "und",
207		options: []Option{IgnoreCase},
208		pattern: "Abc",
209		text:    "abcAbcABCÁbcábc",
210		want:    []int{0, 3, 3, 6, 6, 9},
211	}, {
212		desc:    "ignore case and diacritics",
213		tag:     "und",
214		options: []Option{IgnoreCase, IgnoreDiacritics},
215		pattern: "Abc",
216		text:    "abcAbcABCÁbcábc",
217		want:    []int{0, 3, 3, 6, 6, 9, 9, 13, 13, 17},
218	}, {
219		desc:    "ignore width to fullwidth",
220		tag:     "und",
221		options: []Option{IgnoreWidth},
222		pattern: "abc",
223		text:    "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C
224		want:    []int{4, 13},
225	}, {
226		// TODO: distinguish between case and width.
227		desc:    "don't ignore width to fullwidth, ignoring only case",
228		tag:     "und",
229		options: []Option{IgnoreCase},
230		pattern: "abc",
231		text:    "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C
232		want:    []int{4, 13},
233	}, {
234		desc:    "ignore width to fullwidth and diacritics",
235		tag:     "und",
236		options: []Option{IgnoreWidth, IgnoreDiacritics},
237		pattern: "abc",
238		text:    "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C
239		want:    []int{4, 13},
240	}, {
241		desc:    "whole grapheme, single rune",
242		tag:     "und",
243		pattern: "eee",
244		text:    "123 eeé 123",
245		want:    nil,
246	}, {
247		// Note: rules on when to apply contractions may, for certain languages,
248		// differ between search and collation. For example, "ch" is not
249		// considered a contraction for the purpose of searching in Spanish.
250		// Therefore, be careful picking this test.
251		desc:    "whole grapheme, contractions",
252		tag:     "da",
253		pattern: "aba",
254		// Fails at the primary level, because "aa" is a contraction.
255		text: "123 abaa 123",
256		want: []int{},
257	}, {
258		desc:    "whole grapheme, trailing modifier",
259		tag:     "und",
260		pattern: "eee",
261		text:    "123 eee\u0300 123", // U+0300: COMBINING GRAVE ACCENT
262		want:    nil,
263	}, {
264		// Language-specific matching.
265
266		desc:    "",
267		tag:     "da",
268		options: []Option{IgnoreCase},
269		pattern: "Århus",
270		text:    "AarhusÅrhus  Århus  ",
271		want:    []int{0, 6, 6, 12, 14, 20},
272	}, {
273		desc:    "",
274		tag:     "da",
275		options: []Option{IgnoreCase},
276		pattern: "Aarhus",
277		text:    "Århus Aarhus",
278		want:    []int{0, 6, 7, 13},
279	}, {
280		desc:    "",
281		tag:     "en", // Å does not match A for English.
282		options: []Option{IgnoreCase},
283		pattern: "Aarhus",
284		text:    "Århus",
285		want:    nil,
286	}, {
287		desc:    "ignore modifier in text",
288		options: []Option{IgnoreDiacritics},
289		tag:     "und",
290		pattern: "eee",
291		text:    "123 eee\u0300 123", // U+0300: COMBINING GRAVE ACCENT
292		want:    []int{4, 9},         // Matches on grapheme boundary.
293	}, {
294		desc:    "ignore multiple modifiers in text",
295		options: []Option{IgnoreDiacritics},
296		tag:     "und",
297		pattern: "eee",
298		text:    "123 eee\u0300\u0300 123", // U+0300: COMBINING GRAVE ACCENT
299		want:    []int{4, 11},              // Matches on grapheme boundary.
300	}, {
301		desc:    "ignore modifier in pattern",
302		options: []Option{IgnoreDiacritics},
303		tag:     "und",
304		pattern: "eee\u0300", // U+0300: COMBINING GRAVE ACCENT
305		text:    "123 eee 123",
306		want:    []int{4, 7},
307	}, {
308		desc:    "ignore multiple modifiers in pattern",
309		options: []Option{IgnoreDiacritics},
310		tag:     "und",
311		pattern: "eee\u0300\u0300", // U+0300: COMBINING GRAVE ACCENT
312		text:    "123 eee 123",
313		want:    []int{4, 7},
314	}, {
315		desc: "match non-normalized pattern",
316		tag:  "und",
317		// U+0300: COMBINING GRAVE ACCENT (CCC=230)
318		// U+031B: COMBINING HORN (CCC=216)
319		pattern: "eee\u0300\u031b",
320		text:    "123 eee\u031b\u0300 123",
321		want:    []int{4, 11},
322	}, {
323		desc: "match non-normalized text",
324		tag:  "und",
325		// U+0300: COMBINING GRAVE ACCENT (CCC=230)
326		// U+031B: COMBINING HORN (CCC=216)
327		pattern: "eee\u031b\u0300",
328		text:    "123 eee\u0300\u031b 123",
329		want:    []int{4, 11},
330	}} {
331		m := New(language.MustParse(tc.tag), tc.options...)
332		p := m.CompileString(tc.pattern)
333		for j := 0; j < len(tc.text); {
334			start, end := p.IndexString(tc.text[j:])
335			if start == -1 && end == -1 {
336				j++
337				continue
338			}
339			start += j
340			end += j
341			j = end
342			if len(tc.want) == 0 {
343				t.Errorf("%d:%s: found unexpected result [%d %d]", i, tc.desc, start, end)
344				break
345			}
346			if tc.want[0] != start || tc.want[1] != end {
347				t.Errorf("%d:%s: got [%d %d]; want %v", i, tc.desc, start, end, tc.want[:2])
348				tc.want = tc.want[2:]
349				break
350			}
351			tc.want = tc.want[2:]
352		}
353		if len(tc.want) != 0 {
354			t.Errorf("%d:%s: %d extra results", i, tc.desc, len(tc.want)/2)
355		}
356	}
357}
358