1 /* Copyright (c) 2014-2018 Dovecot authors, see the included COPYING file */
2
3 #include "lib.h"
4 #include "unichar.h"
5 #include "str.h"
6 #include "test-common.h"
7 #include "fts-tokenizer.h"
8 #include "fts-tokenizer-common.h"
9 #include "fts-tokenizer-private.h"
10 #include "fts-tokenizer-generic-private.h"
11
/* Shared input for the email-address tokenizer tests. Each fragment ends
   with a trailing space ' ' so the concatenated pieces stay separated --
   except the "Bar Baz <bar@example.org>" fragment (the closing '>' already
   terminates the address) and the last fragment. */
#define TEST_INPUT_ADDRESS \
	"@invalid invalid@ Abc Dfg <abc.dfg@example.com>, " \
	"Bar Baz <bar@example.org>" \
	"Foo Bar (comment)foo.bar@host.example.org " \
	"foo, foo@domain " \
	"abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.tld " \
	"trailing, period@blue.com. " \
	"multi-trialing, mul@trail.com..... " \
	"m@s " \
	"hypen@hypen-hypen.com " \
	"hypen@hypen-hypen-sick.com.-"
24
/* Shared inputs for the generic-tokenizer tests. Each array entry is fed
   to the tokenizer as a separate stream; the expected_output arrays in the
   test functions contain one NULL-terminated token run per entry. */
static const char *test_inputs[] = {
	/* generic things and word truncation: */
	"hello world\r\n\nAnd there\twas: text galor\xC3\xA9\xE2\x80\xA7 "
	"abc@example.com, "
	"Bar Baz <bar@example.org>, "
	"foo@domain "
	"1234567890123456789012345678\xC3\xA4,"
	"12345678901234567890123456789\xC3\xA4,"
	"123456789012345678901234567890\xC3\xA4,"
	"and longlonglongabcdefghijklmnopqrstuvwxyz more.\n\n "
	"(\"Hello world\")3.14 3,14 last",

	"1.",

	/* ASCII apostrophes in various positions */
	"' ' '' ''' 'quoted text' 'word' 'hlo words' you're bad'''word '''pre post'''",

	/* apostrophes combined with token-length truncation; note that this
	   entry continues below: the SQ strings are part of the same
	   concatenated array element */
	"'1234567890123456789012345678\xC3\xA4,"
	"123456789012345678901234567x'\xC3\xA4,"
	"1234567890123456789012345678x're,"
	"1234567890123456789012345678x',"
	"1234567890123456789012345678x'',"
	"12345678901234567890123456789x',"
	"12345678901234567890123456789x'',"
	"123456789012345678901234567890x',"
	"123456789012345678901234567890x'',"

	/* \xE2\x80\x99 = U+2019 is a smart quote, sometimes used as an apostrophe */
#define SQ "\xE2\x80\x99"
	SQ " " SQ " " SQ SQ " " SQ SQ SQ " " SQ "quoted text" SQ SQ "word" SQ " " SQ "hlo words" SQ " you" SQ "re78901234567890123456789012 bad" SQ SQ SQ "word" SQ SQ SQ "pre post" SQ SQ SQ,

	"you" SQ "re" SQ "xyz",

	/* whitespace: Unicode (UTF-8) U+FF01 (ef bc 81), U+2000 (e2 80 80),
	   U+205A (e2 81 9a) and U+205F (e2 81 9f) */
	"hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
	"galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n",

	/* TR29 MinNumLet U+FF0E at end: U+FF0E is EF BC 8E */
	"hello world\xEF\xBC\x8E",

	/* TR29 WB5a */
	"l" SQ "homme l" SQ "humanit\xC3\xA9 d" SQ "immixtions qu" SQ "il aujourd'hui que'euq"
};
68
test_fts_tokenizer_find(void)69 static void test_fts_tokenizer_find(void)
70 {
71 test_begin("fts tokenizer find");
72 test_assert(fts_tokenizer_find("email-address") == fts_tokenizer_email_address);
73 test_assert(fts_tokenizer_find("generic") == fts_tokenizer_generic);
74 test_end();
75 }
76
/* Feed _input to tok three ways -- all at once, one UTF-8 character at a
   time, and in random-sized chunks -- and check that each pass produces
   exactly the tokens expected_output[first_outi..] up to the run's
   terminating NULL entry. Returns the index just past that NULL, i.e. the
   first_outi to use for the next input string. */
static unsigned int
test_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input,
			   const char *const *expected_output,
			   unsigned int first_outi)
{
	const unsigned char *input = (const unsigned char *)_input;
	const char *token, *error;
	unsigned int i, outi, max, char_len;
	size_t input_len = strlen(_input);

	/* test all input at once */
	outi = first_outi;
	while (fts_tokenizer_next(tok, input, input_len, &token, &error) > 0) {
		test_assert_strcmp(token, expected_output[outi]);
		outi++;
	}
	/* zero-sized input flushes any token still buffered in the tokenizer */
	while (fts_tokenizer_next(tok, NULL, 0, &token, &error) > 0) {
		test_assert_strcmp(token, expected_output[outi]);
		outi++;
	}
	test_assert_idx(expected_output[outi] == NULL, outi);

	/* test input one character (possibly multi-byte) at a time */
	outi = first_outi;
	for (i = 0; i < input_len; i += char_len) {
		char_len = uni_utf8_char_bytes(input[i]);
		while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
			test_assert_strcmp(token, expected_output[outi]);
			outi++;
		}
	}
	while (fts_tokenizer_final(tok, &token, &error) > 0) {
		test_assert_strcmp(token, expected_output[outi]);
		outi++;
	}
	test_assert_idx(expected_output[outi] == NULL, outi);

	/* test input in random chunks */
	outi = first_outi;
	for (i = 0; i < input_len; i += char_len) {
		/* pick a random chunk size, then round it up to the next
		   UTF-8 character boundary so no character is split */
		max = i_rand_minmax(1, input_len - i);
		for (char_len = 0; char_len < max; )
			char_len += uni_utf8_char_bytes(input[i+char_len]);
		while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
			test_assert_strcmp(token, expected_output[outi]);
			outi++;
		}
	}
	while (fts_tokenizer_final(tok, &token, &error) > 0) {
		test_assert_strcmp(token, expected_output[outi]);
		outi++;
	}
	test_assert_idx(expected_output[outi] == NULL, outi);

	/* skip past this run's terminating NULL */
	return outi+1;
}
133
134 static void
test_tokenizer_inputs(struct fts_tokenizer * tok,const char * const * inputs,unsigned int count,const char * const * expected_output)135 test_tokenizer_inputs(struct fts_tokenizer *tok,
136 const char *const *inputs, unsigned int count,
137 const char *const *expected_output)
138 {
139 unsigned int i, outi = 0;
140
141 for (i = 0; i < count; i++) {
142 outi = test_tokenizer_inputoutput(tok, inputs[i],
143 expected_output, outi);
144 }
145 test_assert_idx(expected_output[outi] == NULL, outi);
146 }
147
/* Generic tokenizer with the default ("simple") word-boundary algorithm. */
static void test_fts_tokenizer_generic_only(void)
{
	/* one NULL-terminated token run per test_inputs[] entry */
	static const char *const expected_output[] = {
		"hello", "world", "And",
		"there", "was", "text", "galor\xC3\xA9",
		"abc", "example", "com", "Bar", "Baz",
		"bar", "example", "org", "foo", "domain",
		"1234567890123456789012345678\xC3\xA4",
		"12345678901234567890123456789",
		"123456789012345678901234567890",
		"and", "longlonglongabcdefghijklmnopqr",
		"more", "Hello", "world", "3", "14", "3", "14", "last", NULL,

		"1", NULL,

		"quoted", "text", "word", "hlo", "words", "you're", "bad",
		"word", "pre", "post", NULL,

		"1234567890123456789012345678\xC3\xA4",
		"123456789012345678901234567x'",
		"1234567890123456789012345678x'",
		"1234567890123456789012345678x",
		"1234567890123456789012345678x",
		"12345678901234567890123456789x",
		"12345678901234567890123456789x",
		"123456789012345678901234567890",
		"123456789012345678901234567890",

		"quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
		"word", "pre", "post", NULL,

		"you're'xyz", NULL,

		"hello", "world", "And",
		"there", "was", "text", "galore",
		"and", "more", NULL,

		"hello", "world", NULL,

		"l'homme", "l'humanit\xC3\xA9", "d'immixtions", "qu'il", "aujourd'hui", "que'euq", NULL,

		NULL
	};
	struct fts_tokenizer *tok;
	const char *error;

	test_begin("fts tokenizer generic simple");
	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0);
	/* with no settings the generic tokenizer must default to "simple" */
	test_assert(((struct generic_fts_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE);

	test_tokenizer_inputs(tok, test_inputs, N_ELEMENTS(test_inputs), expected_output);
	fts_tokenizer_unref(&tok);
	test_end();
}
202
203 const char *const tr29_settings[] = {"algorithm", "tr29", NULL};
204
/* TODO: U+206F is in "Format" and therefore currently not word break.
   This definitely needs to be remapped. */
/* Generic tokenizer with the TR29 word-boundary algorithm. The output
   differs from the simple algorithm for "3,14", which TR29 keeps as a
   single token. */
static void test_fts_tokenizer_generic_tr29_only(void)
{
	/* one NULL-terminated token run per test_inputs[] entry */
	static const char *const expected_output[] = {
		"hello", "world", "And",
		"there", "was", "text", "galor\xC3\xA9",
		"abc", "example", "com", "Bar", "Baz",
		"bar", "example", "org", "foo", "domain",
		"1234567890123456789012345678\xC3\xA4",
		"12345678901234567890123456789",
		"123456789012345678901234567890",
		"and", "longlonglongabcdefghijklmnopqr",
		"more", "Hello", "world", "3", "14", "3,14", "last", NULL,

		"1", NULL,

		"quoted", "text", "word", "hlo", "words", "you're", "bad",
		"word", "pre", "post", NULL,

		"1234567890123456789012345678\xC3\xA4",
		"123456789012345678901234567x'",
		"1234567890123456789012345678x'",
		"1234567890123456789012345678x",
		"1234567890123456789012345678x",
		"12345678901234567890123456789x",
		"12345678901234567890123456789x",
		"123456789012345678901234567890",
		"123456789012345678901234567890",

		"quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
		"word", "pre", "post", NULL,

		"you're'xyz", NULL,

		"hello", "world", "And",
		"there", "was", "text", "galore",
		"and", "more", NULL,

		"hello", "world", NULL,

		"l'homme", "l'humanit\xC3\xA9", "d'immixtions", "qu'il", "aujourd'hui", "que'euq", NULL,
		NULL
	};
	struct fts_tokenizer *tok;
	const char *error;

	test_begin("fts tokenizer generic TR29");
	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
	test_tokenizer_inputs(tok, test_inputs, N_ELEMENTS(test_inputs), expected_output);
	fts_tokenizer_unref(&tok);
	test_end();
}
258
259 const char *const tr29_settings_wb5a[] = {"algorithm", "tr29", "wb5a", "yes", NULL};
260
/* TODO: U+206F is in "Format" and therefore currently not word break.
   This definitely needs to be remapped. */
/* Generic tokenizer with TR29 + WB5a: identical to the plain TR29 output
   except the last entry, where apostrophe elisions (l'homme, qu'il, ...)
   are split into separate tokens. */
static void test_fts_tokenizer_generic_tr29_wb5a(void)
{
	/* one NULL-terminated token run per test_inputs[] entry */
	static const char *const expected_output[] = {
		"hello", "world", "And",
		"there", "was", "text", "galor\xC3\xA9",
		"abc", "example", "com", "Bar", "Baz",
		"bar", "example", "org", "foo", "domain",
		"1234567890123456789012345678\xC3\xA4",
		"12345678901234567890123456789",
		"123456789012345678901234567890",
		"and", "longlonglongabcdefghijklmnopqr",
		"more", "Hello", "world", "3", "14", "3,14", "last", NULL,

		"1", NULL,

		"quoted", "text", "word", "hlo", "words", "you're", "bad",
		"word", "pre", "post", NULL,

		"1234567890123456789012345678\xC3\xA4",
		"123456789012345678901234567x'",
		"1234567890123456789012345678x'",
		"1234567890123456789012345678x",
		"1234567890123456789012345678x",
		"12345678901234567890123456789x",
		"12345678901234567890123456789x",
		"123456789012345678901234567890",
		"123456789012345678901234567890",

		"quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
		"word", "pre", "post", NULL,

		"you're'xyz", NULL,

		"hello", "world", "And",
		"there", "was", "text", "galore",
		"and", "more", NULL,

		"hello", "world", NULL,

		"l", "homme", "l", "humanit\xC3\xA9", "d", "immixtions", "qu", "il", "aujourd'hui", "que'euq", NULL,

		NULL
	};
	struct fts_tokenizer *tok;
	const char *error;

	test_begin("fts tokenizer generic TR29 with WB5a");
	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings_wb5a, &tok, &error) == 0);
	test_tokenizer_inputs(tok, test_inputs, N_ELEMENTS(test_inputs), expected_output);
	fts_tokenizer_unref(&tok);
	test_end();
}
315
/* Email-address tokenizer with no parent tokenizer: only complete
   addresses are emitted; surrounding words and invalid addresses are
   dropped, and overlong addresses are truncated. */
static void test_fts_tokenizer_address_only(void)
{
	static const char input[] = TEST_INPUT_ADDRESS;
	static const char *const expected_output[] = {
		"abc.dfg@example.com", "bar@example.org",
		"foo.bar@host.example.org", "foo@domain",
		/* overlong address truncated by the tokenizer */
		"abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
		"period@blue.com", /* trailing period '.' in email */
		"mul@trail.com",
		"m@s", /* one letter local-part and domain name */
		"hypen@hypen-hypen.com",
		"hypen@hypen-hypen-sick.com",
		NULL
	};
	struct fts_tokenizer *tok;
	const char *error;

	test_begin("fts tokenizer email address only");
	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, NULL, &tok, &error) == 0);
	test_tokenizer_inputoutput(tok, input, expected_output, 0);
	fts_tokenizer_unref(&tok);
	test_end();
}
339
/* Email-address tokenizer chained on top of a generic parent tokenizer
   created with the given settings: both the parent's individual word
   tokens and the complete addresses are emitted. */
static void test_fts_tokenizer_address_parent(const char *name, const char * const *settings)
{
	static const char input[] = TEST_INPUT_ADDRESS;
	static const char *const expected_output[] = {
		"invalid", "invalid", "Abc", "Dfg", "abc", "dfg", "example", "com", "abc.dfg@example.com",
		"Bar", "Baz", "bar", "example", "org", "bar@example.org",
		"Foo", "Bar", "comment", "foo", "bar", "host", "example", "org", "foo.bar@host.example.org",
		"foo", "foo", "domain", "foo@domain", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyzabcde", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz","tld", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
		"trailing", "period", "blue", "com", "period@blue.com",
		"multi", "trialing", "mul", "trail", "com", "mul@trail.com",
		"m", "s", "m@s",
		"hypen", "hypen", "hypen", "com", "hypen@hypen-hypen.com",
		"hypen", "hypen", "hypen", "sick", "com", "hypen@hypen-hypen-sick.com",
		NULL
	};
	struct fts_tokenizer *tok, *gen_tok;
	const char *error;

	test_begin(t_strdup_printf("fts tokenizer email address + parent %s", name));
	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, settings, &gen_tok, &error) == 0);
	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
	test_tokenizer_inputoutput(tok, input, expected_output, 0);
	fts_tokenizer_unref(&tok);
	fts_tokenizer_unref(&gen_tok);
	test_end();
}
366
367 const char *const simple_settings[] = {"algorithm", "simple", NULL};
/* Chained address+generic test with the "simple" boundary algorithm. */
static void test_fts_tokenizer_address_parent_simple(void)
{
	test_fts_tokenizer_address_parent("simple", simple_settings);
}
372
/* Chained address+generic test with the TR29 boundary algorithm. */
static void test_fts_tokenizer_address_parent_tr29(void)
{
	test_fts_tokenizer_address_parent("tr29", tr29_settings);
}
377
/* Email-address tokenizer in search mode: for a complete address only the
   address token is emitted, not the parent's word tokens inside it, so a
   search for "bar@example.org" does not also match plain "bar". Also
   checks that tokenizer state is cleared at EOF and by explicit reset. */
static void test_fts_tokenizer_address_search(void)
{
	static const char input[] = TEST_INPUT_ADDRESS;
	static const char *const expected_output[] = {
		"invalid", "invalid", "Abc", "Dfg", "abc.dfg@example.com",
		"Bar", "Baz", "bar@example.org",
		"Foo", "Bar", "comment", "foo.bar@host.example.org",
		"foo", "foo@domain", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
		"trailing", "period@blue.com",
		"multi", "trialing", "mul@trail.com",
		"m@s",
		"hypen@hypen-hypen.com",
		"hypen@hypen-hypen-sick.com",
		NULL
	};
	static const char *const settings[] = { "search", "", NULL };
	struct fts_tokenizer *tok, *gen_tok;
	const char *token, *error;

	test_begin("fts tokenizer search email address + parent");
	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, settings, &tok, &error) == 0);
	test_tokenizer_inputoutput(tok, input, expected_output, 0);

	/* make sure state is forgotten at EOF */
	/* plain word: next() buffers it, final() flushes it exactly once */
	test_assert(fts_tokenizer_next(tok, (const void *)"foo", 3, &token, &error) == 0);
	test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
		    strcmp(token, "foo") == 0);
	test_assert(fts_tokenizer_final(tok, &token, &error) == 0);

	/* complete address flushed at EOF */
	test_assert(fts_tokenizer_next(tok, (const void *)"bar@baz", 7, &token, &error) == 0);
	test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
		    strcmp(token, "bar@baz") == 0);
	test_assert(fts_tokenizer_final(tok, &token, &error) == 0);

	/* trailing '@' with no domain: only the local part survives */
	test_assert(fts_tokenizer_next(tok, (const void *)"foo@", 4, &token, &error) == 0);
	test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
		    strcmp(token, "foo") == 0);
	test_assert(fts_tokenizer_final(tok, &token, &error) == 0);

	/* test reset explicitly: the buffered "a" must not leak into "b@c" */
	test_assert(fts_tokenizer_next(tok, (const void *)"a", 1, &token, &error) == 0);
	fts_tokenizer_reset(tok);
	test_assert(fts_tokenizer_next(tok, (const void *)"b@c", 3, &token, &error) == 0);
	test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
		    strcmp(token, "b@c") == 0);
	test_assert(fts_tokenizer_final(tok, &token, &error) == 0);

	fts_tokenizer_unref(&tok);
	fts_tokenizer_unref(&gen_tok);
	test_end();
}
430
/* fts_tokenizer_delete_trailing_partial_char() must drop an incomplete
   trailing UTF-8 sequence from the buffer and leave complete input alone. */
static void test_fts_tokenizer_delete_trailing_partial_char(void)
{
	static const struct {
		const char *str;
		unsigned int truncated_len;
	} tests[] = {
		/* non-truncated */
		{ "\x7f", 1 },
		{ "\xC2\x80", 2 },
		{ "\xE0\x80\x80", 3 },
		{ "\xF0\x80\x80\x80", 4 },

		/* truncated */
		{ "\xF0\x80\x80", 0 },
		{ "x\xF0\x80\x80", 1 },
	};

	test_begin("fts tokenizer delete trailing partial char");
	for (unsigned int idx = 0; idx < N_ELEMENTS(tests); idx++) {
		size_t len = strlen(tests[idx].str);

		fts_tokenizer_delete_trailing_partial_char(
			(const unsigned char *)tests[idx].str, &len);
		test_assert(len == tests[idx].truncated_len);
	}
	test_end();
}
458
test_fts_tokenizer_address_maxlen(void)459 static void test_fts_tokenizer_address_maxlen(void)
460 {
461 const char *const settings[] = {"maxlen", "5", NULL};
462 const char *input = "...\357\277\275@a";
463 struct fts_tokenizer *tok;
464 const char *token, *error;
465
466 test_begin("fts tokenizer address maxlen");
467 test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, settings, &tok, &error) == 0);
468
469 while (fts_tokenizer_next(tok, (const unsigned char *)input,
470 strlen(input), &token, &error) > 0) ;
471 while (fts_tokenizer_final(tok, &token, &error) > 0) ;
472 fts_tokenizer_unref(&tok);
473 test_end();
474 }
475
/* Fuzz the chained address+generic tokenizer: feed 10000 random 10-byte
   inputs built from a small alphabet of tricky bytes (NUL, separators,
   '@', partial UTF-8 bytes) and make sure tokenization never crashes or
   hits an internal assert. The produced tokens are not checked. */
static void test_fts_tokenizer_random(void)
{
	const unsigned char test_chars[] = { 0, ' ', '.', 'a', 'b', 'c', '-', '@', '\xC3', '\xA4' };
	const char *const settings[] = {"algorithm", "simple", NULL};
	const char *const email_settings[] = {"maxlen", "9", NULL};
	unsigned int i;
	unsigned char addr[10] = { 0 };
	string_t *str = t_str_new(20);
	struct fts_tokenizer *tok, *gen_tok;
	const char *token, *error;

	test_begin("fts tokenizer random");
	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, settings, &gen_tok, &error) == 0);
	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, email_settings, &tok, &error) == 0);

	for (i = 0; i < 10000; i++) T_BEGIN {
		for (unsigned int j = 0; j < sizeof(addr); j++)
			addr[j] = test_chars[i_rand_limit(N_ELEMENTS(test_chars))];
		str_truncate(str, 0);
		/* NOTE(review): appears to rely on uni_utf8_get_valid_data()
		   returning TRUE when addr is already valid UTF-8 (leaving
		   str untouched, so the raw bytes are appended), and FALSE
		   after writing a repaired copy into str -- confirm against
		   the unichar.h contract */
		if (uni_utf8_get_valid_data(addr, sizeof(addr), str))
			str_append_data(str, addr, sizeof(addr));
		while (fts_tokenizer_next(tok, str_data(str), str_len(str),
					  &token, &error) > 0) ;
		while (fts_tokenizer_final(tok, &token, &error) > 0) ;
	} T_END;
	fts_tokenizer_unref(&tok);
	fts_tokenizer_unref(&gen_tok);
	test_end();
}
505
506
/* Exercise the "explicitprefix" setting across all three boundary
   algorithm variants, in both indexing and searching mode. A trailing '*'
   on a word is kept (as a prefix-match marker) only when both "search"
   and "explicitprefix" are enabled; in every other combination it is
   stripped. */
static void
test_fts_tokenizer_explicit_prefix(void)
{
	const char *input = "* ** "
			    "*pre *both* post* "
			    "mid*dle *mid*dle* "
			    "**twopre **twoboth** twopost**";
	const char *const expected_star[] = { "pre", "both*", "post*",
					      "mid*", "dle", "mid*", "dle*",
					      "twopre", "twoboth*", "twopost*",
					      NULL, NULL };
	const char *const expected_nostar[] = { "pre", "both", "post",
						"mid", "dle", "mid", "dle",
						"twopre", "twoboth", "twopost",
						NULL, NULL };

	/* 4 fixed slots + optional "search"+value + optional
	   "explicitprefix"+value + terminating NULL = 9 entries max */
	const char *settings[9] = { "algorithm", "tr29", "wb5a", "yes" };
	const char **setptr;

	const char *algos[] = { ALGORITHM_SIMPLE_NAME,
				ALGORITHM_TR29_NAME,
				ALGORITHM_TR29_NAME "+wb5a" };
	const char *searches[] = { "indexing", "searching" };
	const char *prefixes[] = { "fixed", "prefix" };

	for (int algo = 2; algo >= 0; algo--) { /* We overwrite the settings over time */
		/* appending from &settings[algo*2] preserves 4, 2 or 0 of
		   the fixed strings, i.e. selects tr29+wb5a, tr29, or the
		   default (simple) algorithm respectively */
		setptr = &settings[algo*2]; /* 4, 2, or 0 settings strings preserved */

		for (int search = 0; search < 2; search++) {
			const char **setptr2 = setptr;
			if (search > 0) { *setptr2++ = "search"; *setptr2++ = "yes"; }

			for (int explicitprefix = 0; explicitprefix < 2; explicitprefix++) {
				const char **setptr3 = setptr2;
				if (explicitprefix > 0) { *setptr3++ = "explicitprefix"; *setptr3++ = "y"; }

				*setptr3++ = NULL;

				test_begin(t_strdup_printf("prefix search %s:%s:%s",
							   algos[algo],
							   searches[search],
							   prefixes[explicitprefix]));
				struct fts_tokenizer *tok;
				const char *error;

				test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, settings,
								 &tok, &error) == 0);
				/* '*' survives only when searching with explicitprefix */
				test_tokenizer_inputs(
					tok, &input, 1,
					(search!=0) && (explicitprefix!=0)
					? expected_star : expected_nostar);

				fts_tokenizer_unref(&tok);
				test_end();
			}
		}
	}
}
565
main(void)566 int main(void)
567 {
568 static void (*const test_functions[])(void) = {
569 test_fts_tokenizer_find,
570 test_fts_tokenizer_generic_only,
571 test_fts_tokenizer_generic_tr29_only,
572 test_fts_tokenizer_generic_tr29_wb5a,
573 test_fts_tokenizer_address_only,
574 test_fts_tokenizer_address_parent_simple,
575 test_fts_tokenizer_address_parent_tr29,
576 test_fts_tokenizer_address_maxlen,
577 test_fts_tokenizer_address_search,
578 test_fts_tokenizer_delete_trailing_partial_char,
579 test_fts_tokenizer_random,
580 test_fts_tokenizer_explicit_prefix,
581 NULL
582 };
583 int ret;
584
585 fts_tokenizers_init();
586 ret = test_run(test_functions);
587 fts_tokenizers_deinit();
588
589 return ret;
590 }
591