/* Copyright (c) 2014-2018 Dovecot authors, see the included COPYING file */

#include "lib.h"
#include "unichar.h"
#include "str.h"
#include "test-common.h"
#include "fts-tokenizer.h"
#include "fts-tokenizer-common.h"
#include "fts-tokenizer-private.h"
#include "fts-tokenizer-generic-private.h"

/* There should be a trailing space ' ' at the end of each string except the last one. */
#define TEST_INPUT_ADDRESS \
	"@invalid invalid@ Abc Dfg <abc.dfg@example.com>, " \
	"Bar Baz <bar@example.org>" \
	"Foo Bar (comment)foo.bar@host.example.org " \
	"foo, foo@domain " \
	"abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.tld " \
	"trailing, period@blue.com. " \
	"multi-trialing, mul@trail.com..... " \
	"m@s " \
	"hypen@hypen-hypen.com " \
	"hypen@hypen-hypen-sick.com.-"

static const char *test_inputs[] = {
	/* generic things and word truncation: */
	"hello world\r\n\nAnd there\twas: text galor\xC3\xA9\xE2\x80\xA7 "
	"abc@example.com, "
	"Bar Baz <bar@example.org>, "
	"foo@domain "
	"1234567890123456789012345678\xC3\xA4,"
	"12345678901234567890123456789\xC3\xA4,"
	"123456789012345678901234567890\xC3\xA4,"
	"and longlonglongabcdefghijklmnopqrstuvwxyz more.\n\n "
	"(\"Hello world\")3.14 3,14 last",

	"1.",

	"' ' '' ''' 'quoted text' 'word' 'hlo words' you're bad'''word '''pre post'''",

	"'1234567890123456789012345678\xC3\xA4,"
	"123456789012345678901234567x'\xC3\xA4,"
	"1234567890123456789012345678x're,"
	"1234567890123456789012345678x',"
	"1234567890123456789012345678x'',"
	"12345678901234567890123456789x',"
	"12345678901234567890123456789x'',"
	"123456789012345678901234567890x',"
	"123456789012345678901234567890x'',"

	/* \xE2\x80\x99 = U+2019 is a smart quote, sometimes used as an apostrophe */
#define SQ "\xE2\x80\x99"
	SQ " " SQ " " SQ SQ " " SQ SQ SQ " " SQ "quoted text" SQ SQ "word" SQ " " SQ "hlo words" SQ " you" SQ "re78901234567890123456789012 bad" SQ SQ SQ "word" SQ SQ SQ "pre post" SQ SQ SQ,

	"you" SQ "re" SQ "xyz",

	/* whitespace and breaks: UTF-8 U+FF01 (ef bc 81), U+2000 (e2 80 80),
	   U+205A (e2 81 9a) and U+205F (e2 81 9f) */
	"hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
	"galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n",

	/* TR29 MidNumLet U+FF0E at end: U+FF0E is EF BC 8E */
	"hello world\xEF\xBC\x8E",

	/* TR29 WB5a */
	"l" SQ "homme l" SQ "humanit\xC3\xA9 d" SQ "immixtions qu" SQ "il aujourd'hui que'euq"
};

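/* fts_tokenizer_find() must map the registered names to the built-in
   tokenizer definitions. */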
static void test_fts_tokenizer_find(void)
{
	test_begin("fts tokenizer find");
	test_assert(fts_tokenizer_find("email-address") == fts_tokenizer_email_address);
	test_assert(fts_tokenizer_find("generic") == fts_tokenizer_generic);
	test_end();
}

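/* Feed _input to the tokenizer three ways - all at once, one UTF-8
   character at a time, and in random-sized chunks - and verify that each
   pass produces exactly the expected_output tokens starting at first_outi.
   Returns the index just past the NULL that terminates this input's
   expected tokens. */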
static unsigned int
test_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input,
			   const char *const *expected_output,
			   unsigned int first_outi)
{
	const unsigned char *input = (const unsigned char *)_input;
	const char *token, *error;
	unsigned int i, outi, max, char_len;
	size_t input_len = strlen(_input);

	/* test all input at once */
	outi = first_outi;
	while (fts_tokenizer_next(tok, input, input_len, &token, &error) > 0) {
		test_assert_strcmp(token, expected_output[outi]);
		outi++;
	}
	while (fts_tokenizer_next(tok, NULL, 0, &token, &error) > 0) {
		test_assert_strcmp(token, expected_output[outi]);
		outi++;
	}
	test_assert_idx(expected_output[outi] == NULL, outi);

	/* test input one UTF-8 character at a time */
	outi = first_outi;
	for (i = 0; i < input_len; i += char_len) {
		char_len = uni_utf8_char_bytes(input[i]);
		while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
			test_assert_strcmp(token, expected_output[outi]);
			outi++;
		}
	}
	while (fts_tokenizer_final(tok, &token, &error) > 0) {
		test_assert_strcmp(token, expected_output[outi]);
		outi++;
	}
	test_assert_idx(expected_output[outi] == NULL, outi);

	/* test input in random chunks */
	outi = first_outi;
	for (i = 0; i < input_len; i += char_len) {
		max = i_rand_minmax(1, input_len - i);
		for (char_len = 0; char_len < max; )
			char_len += uni_utf8_char_bytes(input[i+char_len]);
		while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
			test_assert_strcmp(token, expected_output[outi]);
			outi++;
		}
	}
	while (fts_tokenizer_final(tok, &token, &error) > 0) {
		test_assert_strcmp(token, expected_output[outi]);
		outi++;
	}
	test_assert_idx(expected_output[outi] == NULL, outi);

	return outi+1;
}

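/* Run each input string through the tokenizer, walking through the shared
   expected_output array (one NULL-terminated group of tokens per input). */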
static void
test_tokenizer_inputs(struct fts_tokenizer *tok,
		      const char *const *inputs, unsigned int count,
		      const char *const *expected_output)
{
	unsigned int i, outi = 0;

	for (i = 0; i < count; i++) {
		outi = test_tokenizer_inputoutput(tok, inputs[i],
						  expected_output, outi);
	}
	test_assert_idx(expected_output[outi] == NULL, outi);
}

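/* Generic tokenizer with its default "simple" boundary algorithm. */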
static void test_fts_tokenizer_generic_only(void)
{
	static const char *const expected_output[] = {
		"hello", "world", "And",
		"there", "was", "text", "galor\xC3\xA9",
		"abc", "example", "com", "Bar", "Baz",
		"bar", "example", "org", "foo", "domain",
		"1234567890123456789012345678\xC3\xA4",
		"12345678901234567890123456789",
		"123456789012345678901234567890",
		"and", "longlonglongabcdefghijklmnopqr",
		"more", "Hello", "world", "3", "14", "3", "14", "last", NULL,

		"1", NULL,

		"quoted", "text", "word", "hlo", "words", "you're", "bad",
		"word", "pre", "post", NULL,

		"1234567890123456789012345678\xC3\xA4",
		"123456789012345678901234567x'",
		"1234567890123456789012345678x'",
		"1234567890123456789012345678x",
		"1234567890123456789012345678x",
		"12345678901234567890123456789x",
		"12345678901234567890123456789x",
		"123456789012345678901234567890",
		"123456789012345678901234567890",

		"quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
		"word", "pre", "post", NULL,

		"you're'xyz", NULL,

		"hello", "world", "And",
		"there", "was", "text", "galore",
		"and", "more", NULL,

		"hello", "world", NULL,

		"l'homme", "l'humanit\xC3\xA9", "d'immixtions", "qu'il", "aujourd'hui", "que'euq", NULL,

		NULL
	};
	struct fts_tokenizer *tok;
	const char *error;

	test_begin("fts tokenizer generic simple");
	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0);
	test_assert(((struct generic_fts_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE);

	test_tokenizer_inputs(tok, test_inputs, N_ELEMENTS(test_inputs), expected_output);
	fts_tokenizer_unref(&tok);
	test_end();
}

const char *const tr29_settings[] = {"algorithm", "tr29", NULL};

/* TODO: U+206F is in "Format" and is therefore currently not a word break.
   This definitely needs to be remapped. */
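/* Same inputs, but with the TR29 word-boundary algorithm; unlike the simple
   algorithm it keeps "3,14" together as a single token. */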
static void test_fts_tokenizer_generic_tr29_only(void)
{
	static const char *const expected_output[] = {
		"hello", "world", "And",
		"there", "was", "text", "galor\xC3\xA9",
		"abc", "example", "com", "Bar", "Baz",
		"bar", "example", "org", "foo", "domain",
		"1234567890123456789012345678\xC3\xA4",
		"12345678901234567890123456789",
		"123456789012345678901234567890",
		"and", "longlonglongabcdefghijklmnopqr",
		"more", "Hello", "world", "3", "14", "3,14", "last", NULL,

		"1", NULL,

		"quoted", "text", "word", "hlo", "words", "you're", "bad",
		"word", "pre", "post", NULL,

		"1234567890123456789012345678\xC3\xA4",
		"123456789012345678901234567x'",
		"1234567890123456789012345678x'",
		"1234567890123456789012345678x",
		"1234567890123456789012345678x",
		"12345678901234567890123456789x",
		"12345678901234567890123456789x",
		"123456789012345678901234567890",
		"123456789012345678901234567890",

		"quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
		"word", "pre", "post", NULL,

		"you're'xyz", NULL,

		"hello", "world", "And",
		"there", "was", "text", "galore",
		"and", "more", NULL,

		"hello", "world", NULL,

		"l'homme", "l'humanit\xC3\xA9", "d'immixtions", "qu'il", "aujourd'hui", "que'euq", NULL,
		NULL
	};
	struct fts_tokenizer *tok;
	const char *error;

	test_begin("fts tokenizer generic TR29");
	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
	test_tokenizer_inputs(tok, test_inputs, N_ELEMENTS(test_inputs), expected_output);
	fts_tokenizer_unref(&tok);
	test_end();
}

const char *const tr29_settings_wb5a[] = {"algorithm", "tr29", "wb5a", "yes", NULL};

/* TODO: U+206F is in "Format" and is therefore currently not a word break.
   This definitely needs to be remapped. */
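/* TR29 with the WB5a extension enabled: a leading elision prefix ending in
   an apostrophe (l', d', qu') is split off, so "l'homme" tokenizes as
   "l", "homme". */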
static void test_fts_tokenizer_generic_tr29_wb5a(void)
{
	static const char *const expected_output[] = {
		"hello", "world", "And",
		"there", "was", "text", "galor\xC3\xA9",
		"abc", "example", "com", "Bar", "Baz",
		"bar", "example", "org", "foo", "domain",
		"1234567890123456789012345678\xC3\xA4",
		"12345678901234567890123456789",
		"123456789012345678901234567890",
		"and", "longlonglongabcdefghijklmnopqr",
		"more", "Hello", "world", "3", "14", "3,14", "last", NULL,

		"1", NULL,

		"quoted", "text", "word", "hlo", "words", "you're", "bad",
		"word", "pre", "post", NULL,

		"1234567890123456789012345678\xC3\xA4",
		"123456789012345678901234567x'",
		"1234567890123456789012345678x'",
		"1234567890123456789012345678x",
		"1234567890123456789012345678x",
		"12345678901234567890123456789x",
		"12345678901234567890123456789x",
		"123456789012345678901234567890",
		"123456789012345678901234567890",

		"quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
		"word", "pre", "post", NULL,

		"you're'xyz", NULL,

		"hello", "world", "And",
		"there", "was", "text", "galore",
		"and", "more", NULL,

		"hello", "world", NULL,

		"l", "homme", "l", "humanit\xC3\xA9", "d", "immixtions", "qu", "il", "aujourd'hui", "que'euq", NULL,

		NULL
	};
	struct fts_tokenizer *tok;
	const char *error;

	test_begin("fts tokenizer generic TR29 with WB5a");
	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings_wb5a, &tok, &error) == 0);
	test_tokenizer_inputs(tok, test_inputs, N_ELEMENTS(test_inputs), expected_output);
	fts_tokenizer_unref(&tok);
	test_end();
}

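/* Address tokenizer without a parent: only complete e-mail addresses are
   returned, trailing dots and hyphens are dropped, and an overlong address
   is truncated to the default maximum token length. */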
static void test_fts_tokenizer_address_only(void)
{
	static const char input[] = TEST_INPUT_ADDRESS;
	static const char *const expected_output[] = {
		"abc.dfg@example.com", "bar@example.org",
		"foo.bar@host.example.org", "foo@domain",
		"abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
		"period@blue.com", /* trailing period '.' in email */
		"mul@trail.com",
		"m@s", /* one-letter local-part and domain name */
		"hypen@hypen-hypen.com",
		"hypen@hypen-hypen-sick.com",
		NULL
	};
	struct fts_tokenizer *tok;
	const char *error;

	test_begin("fts tokenizer email address only");
	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, NULL, &tok, &error) == 0);
	test_tokenizer_inputoutput(tok, input, expected_output, 0);
	fts_tokenizer_unref(&tok);
	test_end();
}

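/* Address tokenizer chained to a generic parent tokenizer: both the
   individual words and the complete address tokens are expected. */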
static void test_fts_tokenizer_address_parent(const char *name, const char * const *settings)
{
	static const char input[] = TEST_INPUT_ADDRESS;
	static const char *const expected_output[] = {
		"invalid", "invalid", "Abc", "Dfg", "abc", "dfg", "example", "com", "abc.dfg@example.com",
		"Bar", "Baz", "bar", "example", "org", "bar@example.org",
		"Foo", "Bar", "comment", "foo", "bar", "host", "example", "org", "foo.bar@host.example.org",
		"foo", "foo", "domain", "foo@domain", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyzabcde", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "tld", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
		"trailing", "period", "blue", "com", "period@blue.com",
		"multi", "trialing", "mul", "trail", "com", "mul@trail.com",
		"m", "s", "m@s",
		"hypen", "hypen", "hypen", "com", "hypen@hypen-hypen.com",
		"hypen", "hypen", "hypen", "sick", "com", "hypen@hypen-hypen-sick.com",
		NULL
	};
	struct fts_tokenizer *tok, *gen_tok;
	const char *error;

	test_begin(t_strdup_printf("fts tokenizer email address + parent %s", name));
	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, settings, &gen_tok, &error) == 0);
	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
	test_tokenizer_inputoutput(tok, input, expected_output, 0);
	fts_tokenizer_unref(&tok);
	fts_tokenizer_unref(&gen_tok);
	test_end();
}

const char *const simple_settings[] = {"algorithm", "simple", NULL};
static void test_fts_tokenizer_address_parent_simple(void)
{
	test_fts_tokenizer_address_parent("simple", simple_settings);
}

static void test_fts_tokenizer_address_parent_tr29(void)
{
	test_fts_tokenizer_address_parent("tr29", tr29_settings);
}

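/* Address tokenizer in search mode: the words making up an address are not
   emitted separately, only the complete address token. */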
static void test_fts_tokenizer_address_search(void)
{
	static const char input[] = TEST_INPUT_ADDRESS;
	static const char *const expected_output[] = {
		"invalid", "invalid", "Abc", "Dfg", "abc.dfg@example.com",
		"Bar", "Baz", "bar@example.org",
		"Foo", "Bar", "comment", "foo.bar@host.example.org",
		"foo", "foo@domain", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
		"trailing", "period@blue.com",
		"multi", "trialing", "mul@trail.com",
		"m@s",
		"hypen@hypen-hypen.com",
		"hypen@hypen-hypen-sick.com",
		NULL
	};
	static const char *const settings[] = { "search", "", NULL };
	struct fts_tokenizer *tok, *gen_tok;
	const char *token, *error;

	test_begin("fts tokenizer search email address + parent");
	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, settings, &tok, &error) == 0);
	test_tokenizer_inputoutput(tok, input, expected_output, 0);

	/* make sure state is forgotten at EOF */
	test_assert(fts_tokenizer_next(tok, (const void *)"foo", 3, &token, &error) == 0);
	test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
		    strcmp(token, "foo") == 0);
	test_assert(fts_tokenizer_final(tok, &token, &error) == 0);

	test_assert(fts_tokenizer_next(tok, (const void *)"bar@baz", 7, &token, &error) == 0);
	test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
		    strcmp(token, "bar@baz") == 0);
	test_assert(fts_tokenizer_final(tok, &token, &error) == 0);

	test_assert(fts_tokenizer_next(tok, (const void *)"foo@", 4, &token, &error) == 0);
	test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
		    strcmp(token, "foo") == 0);
	test_assert(fts_tokenizer_final(tok, &token, &error) == 0);

	/* test reset explicitly */
	test_assert(fts_tokenizer_next(tok, (const void *)"a", 1, &token, &error) == 0);
	fts_tokenizer_reset(tok);
	test_assert(fts_tokenizer_next(tok, (const void *)"b@c", 3, &token, &error) == 0);
	test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
		    strcmp(token, "b@c") == 0);
	test_assert(fts_tokenizer_final(tok, &token, &error) == 0);

	fts_tokenizer_unref(&tok);
	fts_tokenizer_unref(&gen_tok);
	test_end();
}

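/* fts_tokenizer_delete_trailing_partial_char() must keep complete UTF-8
   sequences and drop an incomplete sequence at the end of the buffer. */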
static void test_fts_tokenizer_delete_trailing_partial_char(void)
{
	static const struct {
		const char *str;
		unsigned int truncated_len;
	} tests[] = {
		/* non-truncated */
		{ "\x7f", 1 },
		{ "\xC2\x80", 2 },
		{ "\xE0\x80\x80", 3 },
		{ "\xF0\x80\x80\x80", 4 },

		/* truncated */
		{ "\xF0\x80\x80", 0 },
		{ "x\xF0\x80\x80", 1 },
	};
	unsigned int i;
	size_t size;

	test_begin("fts tokenizer delete trailing partial char");
	for (i = 0; i < N_ELEMENTS(tests); i++) {
		size = strlen(tests[i].str);
		fts_tokenizer_delete_trailing_partial_char((const unsigned char *)tests[i].str, &size);
		test_assert(size == tests[i].truncated_len);
	}
	test_end();
}

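/* Smoke test: an address with a multibyte character right at the maxlen
   limit is fed through the tokenizer; only successful completion is
   checked. */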
static void test_fts_tokenizer_address_maxlen(void)
{
	const char *const settings[] = {"maxlen", "5", NULL};
	const char *input = "...\357\277\275@a";
	struct fts_tokenizer *tok;
	const char *token, *error;

	test_begin("fts tokenizer address maxlen");
	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, settings, &tok, &error) == 0);

	while (fts_tokenizer_next(tok, (const unsigned char *)input,
				  strlen(input), &token, &error) > 0) ;
	while (fts_tokenizer_final(tok, &token, &error) > 0) ;
	fts_tokenizer_unref(&tok);
	test_end();
}

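/* Fuzz-style test: run 10000 random 10-byte inputs built from a small set
   of address-like bytes through the chained tokenizers; only checks that
   tokenization completes without crashing. */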
static void test_fts_tokenizer_random(void)
{
	const unsigned char test_chars[] = { 0, ' ', '.', 'a', 'b', 'c', '-', '@', '\xC3', '\xA4' };
	const char *const settings[] = {"algorithm", "simple", NULL};
	const char *const email_settings[] = {"maxlen", "9", NULL};
	unsigned int i;
	unsigned char addr[10] = { 0 };
	string_t *str = t_str_new(20);
	struct fts_tokenizer *tok, *gen_tok;
	const char *token, *error;

	test_begin("fts tokenizer random");
	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, settings, &gen_tok, &error) == 0);
	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, email_settings, &tok, &error) == 0);

	for (i = 0; i < 10000; i++) T_BEGIN {
		for (unsigned int j = 0; j < sizeof(addr); j++)
			addr[j] = test_chars[i_rand_limit(N_ELEMENTS(test_chars))];
		str_truncate(str, 0);
		if (uni_utf8_get_valid_data(addr, sizeof(addr), str))
			str_append_data(str, addr, sizeof(addr));
		while (fts_tokenizer_next(tok, str_data(str), str_len(str),
					  &token, &error) > 0) ;
		while (fts_tokenizer_final(tok, &token, &error) > 0) ;
	} T_END;
	fts_tokenizer_unref(&tok);
	fts_tokenizer_unref(&gen_tok);
	test_end();
}

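/* A trailing '*' is kept as an explicit prefix-match marker only when both
   the "search" and "explicitprefix" settings are enabled; all three
   algorithm variants are exercised. */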
static void
test_fts_tokenizer_explicit_prefix(void)
{
	const char *input = "* ** "
		"*pre *both* post* "
		"mid*dle *mid*dle* "
		"**twopre **twoboth** twopost**";
	const char *const expected_star[] = { "pre", "both*", "post*",
					      "mid*", "dle", "mid*", "dle*",
					      "twopre", "twoboth*", "twopost*",
					      NULL, NULL };
	const char *const expected_nostar[] = { "pre", "both", "post",
						"mid", "dle", "mid", "dle",
						"twopre", "twoboth", "twopost",
						NULL, NULL };

	const char *settings[9] = { "algorithm", "tr29", "wb5a", "yes" };
	const char **setptr;

	const char *algos[] = { ALGORITHM_SIMPLE_NAME,
				ALGORITHM_TR29_NAME,
				ALGORITHM_TR29_NAME "+wb5a" };
	const char *searches[] = { "indexing", "searching" };
	const char *prefixes[] = { "fixed", "prefix" };

	for (int algo = 2; algo >= 0; algo--) { /* We overwrite the settings over time */
		setptr = &settings[algo*2]; /* 4, 2, or 0 settings strings preserved */

		for (int search = 0; search < 2; search++) {
			const char **setptr2 = setptr;
			if (search > 0) { *setptr2++ = "search"; *setptr2++ = "yes"; }

			for (int explicitprefix = 0; explicitprefix < 2; explicitprefix++) {
				const char **setptr3 = setptr2;
				if (explicitprefix > 0) { *setptr3++ = "explicitprefix"; *setptr3++ = "y"; }

				*setptr3++ = NULL;

				test_begin(t_strdup_printf("prefix search %s:%s:%s",
							   algos[algo],
							   searches[search],
							   prefixes[explicitprefix]));
				struct fts_tokenizer *tok;
				const char *error;

				test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, settings,
								 &tok, &error) == 0);
				test_tokenizer_inputs(
					tok, &input, 1,
					(search != 0) && (explicitprefix != 0)
					? expected_star : expected_nostar);

				fts_tokenizer_unref(&tok);
				test_end();
			}
		}
	}
}

int main(void)
{
	static void (*const test_functions[])(void) = {
		test_fts_tokenizer_find,
		test_fts_tokenizer_generic_only,
		test_fts_tokenizer_generic_tr29_only,
		test_fts_tokenizer_generic_tr29_wb5a,
		test_fts_tokenizer_address_only,
		test_fts_tokenizer_address_parent_simple,
		test_fts_tokenizer_address_parent_tr29,
		test_fts_tokenizer_address_maxlen,
		test_fts_tokenizer_address_search,
		test_fts_tokenizer_delete_trailing_partial_char,
		test_fts_tokenizer_random,
		test_fts_tokenizer_explicit_prefix,
		NULL
	};
	int ret;

	fts_tokenizers_init();
	ret = test_run(test_functions);
	fts_tokenizers_deinit();

	return ret;
}