1 /*
2 ** 2014 May 31
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
10 **
11 ******************************************************************************
12 */
13
14
15 #include "fts5Int.h"
16
17 /**************************************************************************
18 ** Start of ascii tokenizer implementation.
19 */
20
/*
** Default token-character map for tokenizers with no "unicode" modifier.
** Entry i is 1 if ASCII byte i is part of a token and 0 if it separates
** tokens. By default the token characters are exactly the ASCII digits
** and letters.
**
** The table is only ever copied from (see fts5AsciiCreate()), so it is
** declared const.
*/
static const unsigned char aAsciiTokenChar[128] = {
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x00..0x0F */
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x10..0x1F */
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x20..0x2F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 0, 0, 0, 0, 0, 0,   /* 0x30..0x3F */
  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40..0x4F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,   /* 0x50..0x5F */
  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60..0x6F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,   /* 0x70..0x7F */
};
35
/*
** An "ascii" tokenizer instance. aTokenChar[c] is non-zero if ASCII byte
** c is treated as a token character by this instance.
*/
typedef struct AsciiTokenizer {
  unsigned char aTokenChar[128];
} AsciiTokenizer;
40
/*
** Override the token/separator classification of each ASCII character
** found in nul-terminated string zArg. Each such character is marked as a
** token character if bTokenChars is 1, or as a separator if it is 0.
** Bytes with the high bit set are ignored.
*/
static void fts5AsciiAddExceptions(
  AsciiTokenizer *p,
  const char *zArg,
  int bTokenChars
){
  const char *z;
  for(z=zArg; *z; z++){
    if( (*z & 0x80)!=0 ) continue;       /* not an ASCII byte - skip it */
    p->aTokenChar[(int)*z] = (unsigned char)bTokenChars;
  }
}
53
54 /*
55 ** Delete a "ascii" tokenizer.
56 */
fts5AsciiDelete(Fts5Tokenizer * p)57 static void fts5AsciiDelete(Fts5Tokenizer *p){
58 sqlite3_free(p);
59 }
60
61 /*
62 ** Create an "ascii" tokenizer.
63 */
fts5AsciiCreate(void * pUnused,const char ** azArg,int nArg,Fts5Tokenizer ** ppOut)64 static int fts5AsciiCreate(
65 void *pUnused,
66 const char **azArg, int nArg,
67 Fts5Tokenizer **ppOut
68 ){
69 int rc = SQLITE_OK;
70 AsciiTokenizer *p = 0;
71 UNUSED_PARAM(pUnused);
72 if( nArg%2 ){
73 rc = SQLITE_ERROR;
74 }else{
75 p = sqlite3_malloc(sizeof(AsciiTokenizer));
76 if( p==0 ){
77 rc = SQLITE_NOMEM;
78 }else{
79 int i;
80 memset(p, 0, sizeof(AsciiTokenizer));
81 memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar));
82 for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
83 const char *zArg = azArg[i+1];
84 if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
85 fts5AsciiAddExceptions(p, zArg, 1);
86 }else
87 if( 0==sqlite3_stricmp(azArg[i], "separators") ){
88 fts5AsciiAddExceptions(p, zArg, 0);
89 }else{
90 rc = SQLITE_ERROR;
91 }
92 }
93 if( rc!=SQLITE_OK ){
94 fts5AsciiDelete((Fts5Tokenizer*)p);
95 p = 0;
96 }
97 }
98 }
99
100 *ppOut = (Fts5Tokenizer*)p;
101 return rc;
102 }
103
104
/*
** Copy nByte bytes from aIn[] to aOut[], converting the ASCII upper case
** letters 'A'..'Z' to lower case along the way. All other bytes are
** copied unchanged.
*/
static void asciiFold(char *aOut, const char *aIn, int nByte){
  int i = 0;
  while( i<nByte ){
    char c = aIn[i];
    aOut[i] = (c>='A' && c<='Z') ? (char)(c + ('a' - 'A')) : c;
    i++;
  }
}
113
114 /*
115 ** Tokenize some text using the ascii tokenizer.
116 */
fts5AsciiTokenize(Fts5Tokenizer * pTokenizer,void * pCtx,int iUnused,const char * pText,int nText,int (* xToken)(void *,int,const char *,int nToken,int iStart,int iEnd))117 static int fts5AsciiTokenize(
118 Fts5Tokenizer *pTokenizer,
119 void *pCtx,
120 int iUnused,
121 const char *pText, int nText,
122 int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
123 ){
124 AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer;
125 int rc = SQLITE_OK;
126 int ie;
127 int is = 0;
128
129 char aFold[64];
130 int nFold = sizeof(aFold);
131 char *pFold = aFold;
132 unsigned char *a = p->aTokenChar;
133
134 UNUSED_PARAM(iUnused);
135
136 while( is<nText && rc==SQLITE_OK ){
137 int nByte;
138
139 /* Skip any leading divider characters. */
140 while( is<nText && ((pText[is]&0x80)==0 && a[(int)pText[is]]==0) ){
141 is++;
142 }
143 if( is==nText ) break;
144
145 /* Count the token characters */
146 ie = is+1;
147 while( ie<nText && ((pText[ie]&0x80) || a[(int)pText[ie]] ) ){
148 ie++;
149 }
150
151 /* Fold to lower case */
152 nByte = ie-is;
153 if( nByte>nFold ){
154 if( pFold!=aFold ) sqlite3_free(pFold);
155 pFold = sqlite3_malloc64((sqlite3_int64)nByte*2);
156 if( pFold==0 ){
157 rc = SQLITE_NOMEM;
158 break;
159 }
160 nFold = nByte*2;
161 }
162 asciiFold(pFold, &pText[is], nByte);
163
164 /* Invoke the token callback */
165 rc = xToken(pCtx, 0, pFold, nByte, is, ie);
166 is = ie+1;
167 }
168
169 if( pFold!=aFold ) sqlite3_free(pFold);
170 if( rc==SQLITE_DONE ) rc = SQLITE_OK;
171 return rc;
172 }
173
174 /**************************************************************************
175 ** Start of unicode61 tokenizer implementation.
176 */
177
178
179 /*
180 ** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
181 ** from the sqlite3 source file utf.c. If this file is compiled as part
182 ** of the amalgamation, they are not required.
183 */
184 #ifndef SQLITE_AMALGAMATION
185
/*
** Lookup table used by READ_UTF8 to obtain the initial codepoint bits
** from the first byte of a multi-byte UTF-8 sequence. It is indexed by
** (first byte - 0xC0), one row below per 16 possible first bytes.
*/
static const unsigned char sqlite3Utf8Trans1[] = {
  /* 0xC0..0xCF */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  /* 0xD0..0xDF */
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  /* 0xE0..0xEF */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  /* 0xF0..0xFF */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};
196
/*
** READ_UTF8(zIn, zTerm, c)
**
** Decode one UTF-8 encoded character starting at zIn, store the codepoint
** in c and advance zIn past the bytes consumed. Continuation bytes are
** read only while zIn!=zTerm. If the decoded value is less than 0x80, is
** a UTF-16 surrogate (0xD800..0xDFFF) or is 0xFFFE/0xFFFF, c is set to
** the replacement character 0xFFFD instead.
**
** This macro expands to more than one statement. It must be used as a
** complete statement inside a braced block, never as the unbraced body
** of an if/else/while.
*/
#define READ_UTF8(zIn, zTerm, c)                                   \
  (c) = *((zIn)++);                                                \
  if( (c)>=0xc0 ){                                                 \
    (c) = sqlite3Utf8Trans1[(c)-0xc0];                             \
    while( (zIn)!=(zTerm) && (*(zIn) & 0xc0)==0x80 ){              \
      (c) = ((c)<<6) + (0x3f & *((zIn)++));                        \
    }                                                              \
    if( (c)<0x80                                                   \
     || ((c)&0xFFFFF800)==0xD800                                   \
     || ((c)&0xFFFFFFFE)==0xFFFE                                   \
    ){                                                             \
      (c) = 0xFFFD;                                                \
    }                                                              \
  }
208
209
/*
** WRITE_UTF8(zOut, c)
**
** Append the UTF-8 encoding of codepoint c to the buffer at zOut and
** advance zOut past the 1 to 4 bytes written. Argument c is evaluated
** several times, so it must be free of side effects.
*/
#define WRITE_UTF8(zOut, c) {                                      \
  if( (c)<0x00080 ){                                               \
    *(zOut)++ = (unsigned char)((c)&0xFF);                         \
  }else if( (c)<0x00800 ){                                         \
    *(zOut)++ = 0xC0 + (unsigned char)(((c)>>6)&0x1F);             \
    *(zOut)++ = 0x80 + (unsigned char)((c) & 0x3F);                \
  }else if( (c)<0x10000 ){                                         \
    *(zOut)++ = 0xE0 + (unsigned char)(((c)>>12)&0x0F);            \
    *(zOut)++ = 0x80 + (unsigned char)(((c)>>6) & 0x3F);           \
    *(zOut)++ = 0x80 + (unsigned char)((c) & 0x3F);                \
  }else{                                                           \
    *(zOut)++ = 0xF0 + (unsigned char)(((c)>>18) & 0x07);          \
    *(zOut)++ = 0x80 + (unsigned char)(((c)>>12) & 0x3F);          \
    *(zOut)++ = 0x80 + (unsigned char)(((c)>>6) & 0x3F);           \
    *(zOut)++ = 0x80 + (unsigned char)((c) & 0x3F);                \
  }                                                                \
}
229
230 #endif /* ifndef SQLITE_AMALGAMATION */
231
/*
** A "unicode61" tokenizer instance.
*/
typedef struct Unicode61Tokenizer {
  unsigned char aTokenChar[128];  /* Non-zero for ASCII token characters */
  char *aFold;                    /* Buffer that tokens are folded into */
  int nFold;                      /* Allocated size of aFold[] in bytes */
  int eRemoveDiacritic;           /* An FTS5_REMOVE_DIACRITICS_XXX value */
  int nException;                 /* Number of entries in aiException[] */
  int *aiException;               /* Sorted array of exception codepoints */

  unsigned char aCategory[32];    /* Non-zero for token char categories */
} Unicode61Tokenizer;
243
/*
** Allowed values for Unicode61Tokenizer.eRemoveDiacritic. These are the
** digits accepted by the "remove_diacritics" tokenizer option and must
** match the internals of fts5_unicode2.c.
*/
#define FTS5_REMOVE_DIACRITICS_NONE     0
#define FTS5_REMOVE_DIACRITICS_SIMPLE   1
#define FTS5_REMOVE_DIACRITICS_COMPLEX  2
248
/*
** Apply a "tokenchars" (bTokenChars==1) or "separators" (bTokenChars==0)
** option to tokenizer p. Argument z is the nul-terminated UTF-8 option
** value.
**
** ASCII characters in z are recorded directly in p->aTokenChar[]. Any
** other codepoint is inserted into the sorted p->aiException[] array,
** but only if its category-based classification differs from the one
** requested and it is not a diacritic.
**
** Returns SQLITE_OK, or SQLITE_NOMEM if the exception array cannot be
** enlarged (in which case the tokenizer is left unmodified).
*/
static int fts5UnicodeAddExceptions(
  Unicode61Tokenizer *p,          /* Tokenizer object */
  const char *z,                  /* Characters to treat as exceptions */
  int bTokenChars                 /* 1 for 'tokenchars', 0 for 'separators' */
){
  int nByte = (int)strlen(z);
  const unsigned char *zIn;
  const unsigned char *zEnd;
  int *aExc;
  int nExc;

  if( nByte<=0 ) return SQLITE_OK;

  /* z holds at most nByte codepoints, so this is always large enough. */
  aExc = (int*)sqlite3_realloc64(p->aiException,
                                 (nByte+p->nException)*sizeof(int));
  if( aExc==0 ) return SQLITE_NOMEM;

  nExc = p->nException;
  zIn = (const unsigned char*)z;
  zEnd = (const unsigned char*)&z[nByte];
  while( zIn<zEnd ){
    u32 iCode;
    READ_UTF8(zIn, zEnd, iCode);
    if( iCode<128 ){
      p->aTokenChar[iCode] = (unsigned char)bTokenChars;
    }else{
      int bToken = p->aCategory[sqlite3Fts5UnicodeCategory(iCode)];
      assert( (bToken==0 || bToken==1) );
      assert( (bTokenChars==0 || bTokenChars==1) );
      if( bToken!=bTokenChars && sqlite3Fts5UnicodeIsdiacritic(iCode)==0 ){
        /* Insert iCode before the first entry greater than it, keeping
        ** the array sorted for fts5UnicodeIsException(). */
        int iIns = 0;
        while( iIns<nExc && (u32)aExc[iIns]<=iCode ) iIns++;
        memmove(&aExc[iIns+1], &aExc[iIns], (nExc-iIns)*sizeof(int));
        aExc[iIns] = iCode;
        nExc++;
      }
    }
  }

  p->aiException = aExc;
  p->nException = nExc;
  return SQLITE_OK;
}
295
296 /*
297 ** Return true if the p->aiException[] array contains the value iCode.
298 */
fts5UnicodeIsException(Unicode61Tokenizer * p,int iCode)299 static int fts5UnicodeIsException(Unicode61Tokenizer *p, int iCode){
300 if( p->nException>0 ){
301 int *a = p->aiException;
302 int iLo = 0;
303 int iHi = p->nException-1;
304
305 while( iHi>=iLo ){
306 int iTest = (iHi + iLo) / 2;
307 if( iCode==a[iTest] ){
308 return 1;
309 }else if( iCode>a[iTest] ){
310 iLo = iTest+1;
311 }else{
312 iHi = iTest-1;
313 }
314 }
315 }
316
317 return 0;
318 }
319
320 /*
321 ** Delete a "unicode61" tokenizer.
322 */
fts5UnicodeDelete(Fts5Tokenizer * pTok)323 static void fts5UnicodeDelete(Fts5Tokenizer *pTok){
324 if( pTok ){
325 Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTok;
326 sqlite3_free(p->aiException);
327 sqlite3_free(p->aFold);
328 sqlite3_free(p);
329 }
330 return;
331 }
332
/*
** Configure the token-character categories of tokenizer p from zCat, a
** list of category specifications separated by spaces and/or tabs. Each
** specification is handed to sqlite3Fts5UnicodeCatParse(), which updates
** p->aCategory[]. If any specification is rejected SQLITE_ERROR is
** returned immediately. Otherwise p->aTokenChar[] is derived from the
** categories by sqlite3Fts5UnicodeAscii() and SQLITE_OK is returned.
*/
static int unicodeSetCategories(Unicode61Tokenizer *p, const char *zCat){
  const char *zCsr = zCat;

  while( *zCsr!='\0' ){
    /* Skip whitespace before the next specification. */
    while( *zCsr==' ' || *zCsr=='\t' ) zCsr++;
    if( *zCsr!='\0' ){
      if( sqlite3Fts5UnicodeCatParse(zCsr, p->aCategory) ){
        return SQLITE_ERROR;
      }
      /* Advance to the whitespace or nul that ends the specification. */
      while( !(*zCsr==' ' || *zCsr=='\t' || *zCsr=='\0') ) zCsr++;
    }
  }

  sqlite3Fts5UnicodeAscii(p->aCategory, p->aTokenChar);
  return SQLITE_OK;
}
347
348 /*
349 ** Create a "unicode61" tokenizer.
350 */
fts5UnicodeCreate(void * pUnused,const char ** azArg,int nArg,Fts5Tokenizer ** ppOut)351 static int fts5UnicodeCreate(
352 void *pUnused,
353 const char **azArg, int nArg,
354 Fts5Tokenizer **ppOut
355 ){
356 int rc = SQLITE_OK; /* Return code */
357 Unicode61Tokenizer *p = 0; /* New tokenizer object */
358
359 UNUSED_PARAM(pUnused);
360
361 if( nArg%2 ){
362 rc = SQLITE_ERROR;
363 }else{
364 p = (Unicode61Tokenizer*)sqlite3_malloc(sizeof(Unicode61Tokenizer));
365 if( p ){
366 const char *zCat = "L* N* Co";
367 int i;
368 memset(p, 0, sizeof(Unicode61Tokenizer));
369
370 p->eRemoveDiacritic = FTS5_REMOVE_DIACRITICS_SIMPLE;
371 p->nFold = 64;
372 p->aFold = sqlite3_malloc64(p->nFold * sizeof(char));
373 if( p->aFold==0 ){
374 rc = SQLITE_NOMEM;
375 }
376
377 /* Search for a "categories" argument */
378 for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
379 if( 0==sqlite3_stricmp(azArg[i], "categories") ){
380 zCat = azArg[i+1];
381 }
382 }
383
384 if( rc==SQLITE_OK ){
385 rc = unicodeSetCategories(p, zCat);
386 }
387
388 for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
389 const char *zArg = azArg[i+1];
390 if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
391 if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){
392 rc = SQLITE_ERROR;
393 }else{
394 p->eRemoveDiacritic = (zArg[0] - '0');
395 assert( p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_NONE
396 || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_SIMPLE
397 || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_COMPLEX
398 );
399 }
400 }else
401 if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
402 rc = fts5UnicodeAddExceptions(p, zArg, 1);
403 }else
404 if( 0==sqlite3_stricmp(azArg[i], "separators") ){
405 rc = fts5UnicodeAddExceptions(p, zArg, 0);
406 }else
407 if( 0==sqlite3_stricmp(azArg[i], "categories") ){
408 /* no-op */
409 }else{
410 rc = SQLITE_ERROR;
411 }
412 }
413
414 }else{
415 rc = SQLITE_NOMEM;
416 }
417 if( rc!=SQLITE_OK ){
418 fts5UnicodeDelete((Fts5Tokenizer*)p);
419 p = 0;
420 }
421 *ppOut = (Fts5Tokenizer*)p;
422 }
423 return rc;
424 }
425
426 /*
427 ** Return true if, for the purposes of tokenizing with the tokenizer
428 ** passed as the first argument, codepoint iCode is considered a token
429 ** character (not a separator).
430 */
fts5UnicodeIsAlnum(Unicode61Tokenizer * p,int iCode)431 static int fts5UnicodeIsAlnum(Unicode61Tokenizer *p, int iCode){
432 return (
433 p->aCategory[sqlite3Fts5UnicodeCategory((u32)iCode)]
434 ^ fts5UnicodeIsException(p, iCode)
435 );
436 }
437
/*
** Tokenize the nText bytes of UTF-8 text at pText using the unicode61
** tokenizer.
**
** Each token is case folded (and has diacritics handled according to
** p->eRemoveDiacritic) into the p->aFold[] buffer, which is grown as
** required, and is then passed to xToken() along with the byte offsets of
** the original token text within pText. Tokenization stops when the input
** is exhausted or xToken() returns something other than SQLITE_OK. An
** SQLITE_DONE result is converted to SQLITE_OK before returning.
**
** Note that the separator-skipping loop jumps directly into the middle of
** the token-folding loop (labels ascii_tokenchar and non_ascii_tokenchar)
** once it has found the first character of a token.
*/
static int fts5UnicodeTokenize(
  Fts5Tokenizer *pTokenizer,
  void *pCtx,
  int iUnused,
  const char *pText, int nText,
  int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
){
  Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
  unsigned char *aAscii = p->aTokenChar;
  unsigned char *zBase = (unsigned char*)pText;
  unsigned char *zCsr = zBase;
  unsigned char *zTerm = (unsigned char*)&pText[nText];
  int rc = SQLITE_OK;

  /* Output buffer. pEnd leaves room for the largest single character that
  ** may be written after the "grow" check below has been passed. */
  char *aFold = p->aFold;
  int nFold = p->nFold;
  const char *pEnd = &aFold[nFold-6];

  UNUSED_PARAM(iUnused);

  /* Each iteration of this loop consumes a contiguous run of separators
  ** followed by a single token. */
  while( rc==SQLITE_OK ){
    u32 iCode;                    /* non-ASCII codepoint read from input */
    char *zOut = aFold;           /* Write cursor within aFold[] */
    int iTokStart;                /* Byte offset of start of token */
    int iTokEnd;                  /* Byte offset one past end of token */

    /* Skip any separator characters. */
    for(;;){
      if( zCsr>=zTerm ) goto tokenize_done;
      if( (*zCsr & 0x80)==0 ){
        if( aAscii[*zCsr] ){
          iTokStart = zCsr - zBase;
          goto ascii_tokenchar;
        }
        zCsr++;
      }else{
        /* A character outside of the ascii range. Skip past it if it is
        ** a separator. Or jump into the token loop if it is not. */
        iTokStart = zCsr - zBase;
        READ_UTF8(zCsr, zTerm, iCode);
        if( fts5UnicodeIsAlnum(p, iCode) ){
          goto non_ascii_tokenchar;
        }
      }
    }

    /* Run through the token characters, folding them into the output
    ** buffer along the way. */
    while( zCsr<zTerm ){

      /* Grow the output buffer so that there is sufficient space to fit
      ** the largest possible utf-8 character. */
      if( zOut>pEnd ){
        char *aNew = sqlite3_malloc64((sqlite3_int64)nFold*2);
        if( aNew==0 ){
          rc = SQLITE_NOMEM;
          goto tokenize_done;
        }
        zOut = &aNew[zOut - p->aFold];
        memcpy(aNew, p->aFold, nFold);
        sqlite3_free(p->aFold);
        aFold = aNew;
        p->aFold = aFold;
        nFold = nFold*2;
        p->nFold = nFold;
        pEnd = &aFold[nFold-6];
      }

      if( *zCsr & 0x80 ){
        /* A non-ascii-range character. Fold it into the output buffer if
        ** it is a token character or a diacritic, or end the token if it
        ** is neither. */
        READ_UTF8(zCsr, zTerm, iCode);
        if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
 non_ascii_tokenchar:
          iCode = sqlite3Fts5UnicodeFold(iCode, p->eRemoveDiacritic);
          if( iCode ){
            WRITE_UTF8(zOut, iCode);
          }
        }else{
          break;
        }
      }else if( aAscii[*zCsr]==0 ){
        /* An ascii-range separator character. End of token. */
        break;
      }else{
 ascii_tokenchar:
        if( *zCsr>='A' && *zCsr<='Z' ){
          *zOut++ = *zCsr + 32;
        }else{
          *zOut++ = *zCsr;
        }
        zCsr++;
      }
      iTokEnd = zCsr - zBase;
    }

    /* Invoke the token callback */
    rc = xToken(pCtx, 0, aFold, zOut-aFold, iTokStart, iTokEnd);
  }

 tokenize_done:
  if( rc==SQLITE_DONE ) rc = SQLITE_OK;
  return rc;
}
541
542 /**************************************************************************
543 ** Start of porter stemmer implementation.
544 */
545
/*
** Maximum size in bytes of a token that the porter stemmer will attempt
** to stem. Larger tokens are passed through unmodified.
*/
#define FTS5_PORTER_MAX_TOKEN 64
549
550 typedef struct PorterTokenizer PorterTokenizer;
551 struct PorterTokenizer {
552 fts5_tokenizer tokenizer; /* Parent tokenizer module */
553 Fts5Tokenizer *pTokenizer; /* Parent tokenizer instance */
554 char aBuf[FTS5_PORTER_MAX_TOKEN + 64];
555 };
556
557 /*
558 ** Delete a "porter" tokenizer.
559 */
fts5PorterDelete(Fts5Tokenizer * pTok)560 static void fts5PorterDelete(Fts5Tokenizer *pTok){
561 if( pTok ){
562 PorterTokenizer *p = (PorterTokenizer*)pTok;
563 if( p->pTokenizer ){
564 p->tokenizer.xDelete(p->pTokenizer);
565 }
566 sqlite3_free(p);
567 }
568 }
569
570 /*
571 ** Create a "porter" tokenizer.
572 */
fts5PorterCreate(void * pCtx,const char ** azArg,int nArg,Fts5Tokenizer ** ppOut)573 static int fts5PorterCreate(
574 void *pCtx,
575 const char **azArg, int nArg,
576 Fts5Tokenizer **ppOut
577 ){
578 fts5_api *pApi = (fts5_api*)pCtx;
579 int rc = SQLITE_OK;
580 PorterTokenizer *pRet;
581 void *pUserdata = 0;
582 const char *zBase = "unicode61";
583
584 if( nArg>0 ){
585 zBase = azArg[0];
586 }
587
588 pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer));
589 if( pRet ){
590 memset(pRet, 0, sizeof(PorterTokenizer));
591 rc = pApi->xFindTokenizer(pApi, zBase, &pUserdata, &pRet->tokenizer);
592 }else{
593 rc = SQLITE_NOMEM;
594 }
595 if( rc==SQLITE_OK ){
596 int nArg2 = (nArg>0 ? nArg-1 : 0);
597 const char **azArg2 = (nArg2 ? &azArg[1] : 0);
598 rc = pRet->tokenizer.xCreate(pUserdata, azArg2, nArg2, &pRet->pTokenizer);
599 }
600
601 if( rc!=SQLITE_OK ){
602 fts5PorterDelete((Fts5Tokenizer*)pRet);
603 pRet = 0;
604 }
605 *ppOut = (Fts5Tokenizer*)pRet;
606 return rc;
607 }
608
/*
** Context object passed to fts5PorterCb(). Stemmed tokens are built in
** aBuf[] and forwarded to xToken(), with pCtx as its first argument.
*/
typedef struct PorterContext {
  void *pCtx;
  int (*xToken)(void*, int, const char*, int, int, int);
  char *aBuf;
} PorterContext;
615
/*
** A single suffix rewriting rule: if the buffer ends with zSuffix and
** xCond (if not NULL) returns non-zero for the remaining stem, then the
** suffix is replaced by zOutput. nSuffix and nOutput are the lengths in
** bytes of the two strings.
*/
typedef struct PorterRule {
  const char *zSuffix;
  int nSuffix;
  int (*xCond)(char *zStem, int nStem);
  const char *zOutput;
  int nOutput;
} PorterRule;
624
#if 0
/*
** Disabled code. Apply the first rule in aRule[] (an array terminated by
** an entry with a NULL zSuffix) whose suffix matches the end of the nBuf
** byte buffer aBuf. If the rule's condition holds, rewrite the suffix,
** update *pnBuf and return the index of the rule. Otherwise return -1.
*/
static int fts5PorterApply(char *aBuf, int *pnBuf, PorterRule *aRule){
  int nBuf = *pnBuf;
  PorterRule *pRule = aRule;

  while( pRule->zSuffix ){
    assert( strlen(pRule->zSuffix)==pRule->nSuffix );
    assert( strlen(pRule->zOutput)==pRule->nOutput );
    if( nBuf>=pRule->nSuffix
     && 0==memcmp(&aBuf[nBuf - pRule->nSuffix], pRule->zSuffix, pRule->nSuffix)
    ){
      break;
    }
    pRule++;
  }

  if( pRule->zSuffix ){
    int nStem = nBuf - pRule->nSuffix;
    if( pRule->xCond==0 || pRule->xCond(aBuf, nStem) ){
      memcpy(&aBuf[nStem], pRule->zOutput, pRule->nOutput);
      *pnBuf = nStem + pRule->nOutput;
      return pRule - aRule;
    }
  }

  return -1;
}
#endif
650
/*
** Return 1 if character c is a vowel, or 0 otherwise. The characters
** 'a', 'e', 'i', 'o' and 'u' are always vowels. 'y' is a vowel only if
** bYIsVowel is non-zero.
*/
static int fts5PorterIsVowel(char c, int bYIsVowel){
  switch( c ){
    case 'a': case 'e': case 'i': case 'o': case 'u':
      return 1;
    case 'y':
      return bYIsVowel ? 1 : 0;
    default:
      return 0;
  }
}
656
/*
** Scan the nStem byte string zStem for one vowel followed (not
** necessarily immediately) by a consonant. If found, return the offset of
** the byte following that consonant. Otherwise return 0.
**
** A 'y' counts as a vowel only when the preceding character is a
** consonant. bPrevCons supplies that information for zStem[0]: it should
** be non-zero if the character before zStem[0] was a consonant.
*/
static int fts5PorterGobbleVC(char *zStem, int nStem, int bPrevCons){
  int bCons = bPrevCons;
  int i = 0;

  /* Advance i to the first vowel (or to nStem if there is none). */
  while( i<nStem ){
    bCons = !fts5PorterIsVowel(zStem[i], bCons);
    if( bCons==0 ) break;
    i++;
  }

  /* Starting after that vowel, look for a consonant. */
  i++;
  while( i<nStem ){
    bCons = !fts5PorterIsVowel(zStem[i], bCons);
    if( bCons ) return i+1;
    i++;
  }
  return 0;
}
672
/*
** porter rule condition: (m > 0)
**
** Return 1 if the stem contains at least one vowel-consonant sequence.
*/
static int fts5Porter_MGt0(char *zStem, int nStem){
  return fts5PorterGobbleVC(zStem, nStem, 0)!=0;
}
677
/*
** porter rule condition: (m > 1)
**
** Return 1 if the stem contains at least two vowel-consonant sequences.
*/
static int fts5Porter_MGt1(char *zStem, int nStem){
  int nFirst = fts5PorterGobbleVC(zStem, nStem, 0);
  if( nFirst==0 ) return 0;
  /* The first sequence ended on a consonant, hence bPrevCons==1. */
  return fts5PorterGobbleVC(&zStem[nFirst], nStem-nFirst, 1)!=0;
}
687
/*
** porter rule condition: (m = 1)
**
** Return 1 if the stem contains exactly one vowel-consonant sequence.
*/
static int fts5Porter_MEq1(char *zStem, int nStem){
  int nFirst = fts5PorterGobbleVC(zStem, nStem, 0);
  if( nFirst==0 ) return 0;
  /* The first sequence ended on a consonant, hence bPrevCons==1. */
  return fts5PorterGobbleVC(&zStem[nFirst], nStem-nFirst, 1)==0;
}
697
/*
** porter rule condition: (*o)
**
** Return 1 if the stem ends consonant-vowel-consonant and the final
** consonant is not 'w', 'x' or 'y'. Otherwise return 0. The caller must
** ensure that nStem is greater than zero.
**
** Each character is classified as a consonant (1) or vowel (0) and the
** results shifted into a mask. Only the three most recent
** classifications matter, so the mask is truncated to three bits on
** every iteration. This keeps the left shift well defined no matter how
** long the stem is; an untruncated signed mask would overflow for stems
** longer than 31 bytes.
*/
static int fts5Porter_Ostar(char *zStem, int nStem){
  if( zStem[nStem-1]=='w' || zStem[nStem-1]=='x' || zStem[nStem-1]=='y' ){
    return 0;
  }else{
    int i;
    unsigned int mask = 0;
    int bCons = 0;
    for(i=0; i<nStem; i++){
      bCons = !fts5PorterIsVowel(zStem[i], bCons);
      assert( bCons==0 || bCons==1 );
      mask = ((mask << 1) | (unsigned int)bCons) & 0x0007;
    }
    return (mask==0x0005);
  }
}
714
/*
** porter rule condition: (m > 1 and (*S or *T))
**
** Return 1 if the stem ends in 's' or 't' and contains at least two
** vowel-consonant sequences. nStem must be greater than zero.
*/
static int fts5Porter_MGt1_and_S_or_T(char *zStem, int nStem){
  char cLast;
  assert( nStem>0 );
  cLast = zStem[nStem-1];
  if( cLast!='s' && cLast!='t' ) return 0;
  return fts5Porter_MGt1(zStem, nStem);
}
721
/*
** porter rule condition: (*v*)
**
** Return 1 if the stem contains a vowel. A 'y' is accepted as a vowel
** anywhere except in the first position.
*/
static int fts5Porter_Vowel(char *zStem, int nStem){
  int i = 0;
  while( i<nStem ){
    if( fts5PorterIsVowel(zStem[i], i>0) ) return 1;
    i++;
  }
  return 0;
}
732
733
734 /**************************************************************************
735 ***************************************************************************
736 ** GENERATED CODE STARTS HERE (mkportersteps.tcl)
737 */
738
/*
** Porter stemmer step 4: remove a suffix if the remaining stem satisfies
** (m > 1). For the suffix "ion" the stem must also end in 's' or 't'.
**
** aBuf holds the *pnBuf byte word. The rules are tried in table order
** and only the first rule whose suffix matches (and is strictly shorter
** than the word) is considered, whether or not its condition holds. If
** the condition does hold *pnBuf is reduced to the stem length. Always
** returns 0.
**
** Rules are matched with bounded memcmp() calls only, so no byte before
** aBuf[0] is ever read, even if *pnBuf is less than 2.
**
** The order of the table matters where one suffix is the tail of another
** (e.g. "ement" must be tried before "ment", and "ment" before "ent").
*/
static int fts5PorterStep4(char *aBuf, int *pnBuf){
  static const struct {
    const char *zSuffix;          /* Suffix to remove */
    int nSuffix;                  /* Size of zSuffix in bytes */
    int bSorT;                    /* True if stem must end in 's' or 't' */
  } aRule[] = {
    { "al",    2, 0 },
    { "ance",  4, 0 },
    { "ence",  4, 0 },
    { "er",    2, 0 },
    { "ic",    2, 0 },
    { "able",  4, 0 },
    { "ible",  4, 0 },
    { "ant",   3, 0 },
    { "ement", 5, 0 },
    { "ment",  4, 0 },
    { "ent",   3, 0 },
    { "ion",   3, 1 },
    { "ou",    2, 0 },
    { "ism",   3, 0 },
    { "ate",   3, 0 },
    { "iti",   3, 0 },
    { "ous",   3, 0 },
    { "ive",   3, 0 },
    { "ize",   3, 0 },
  };
  int nBuf = *pnBuf;
  int i;

  for(i=0; i<(int)(sizeof(aRule)/sizeof(aRule[0])); i++){
    int nSuffix = aRule[i].nSuffix;
    if( nBuf>nSuffix
     && 0==memcmp(aRule[i].zSuffix, &aBuf[nBuf-nSuffix], nSuffix)
    ){
      int nStem = nBuf - nSuffix;
      int bOk;
      if( aRule[i].bSorT ){
        bOk = fts5Porter_MGt1_and_S_or_T(aBuf, nStem);
      }else{
        bOk = fts5Porter_MGt1(aBuf, nStem);
      }
      if( bOk ) *pnBuf = nStem;
      break;
    }
  }
  return 0;
}
871
872
/*
** Porter stemmer step 1b, second part: if the *pnBuf byte word in aBuf is
** longer than two bytes and ends in "at", "bl" or "iz", append an 'e'
** (giving "ate", "ble" or "ize"), increase *pnBuf by one and return 1.
** Otherwise leave the word unchanged and return 0.
**
** aBuf must have room for one extra byte. Rules are matched with bounded
** memcmp() calls only, so no byte before aBuf[0] is ever read, even if
** *pnBuf is less than 2.
*/
static int fts5PorterStep1B2(char *aBuf, int *pnBuf){
  static const struct {
    const char *zSuffix;          /* Two byte suffix to match */
    const char *zOutput;          /* Three byte replacement */
  } aRule[] = {
    { "at", "ate" },
    { "bl", "ble" },
    { "iz", "ize" },
  };
  int nBuf = *pnBuf;
  int i;

  if( nBuf>2 ){
    for(i=0; i<(int)(sizeof(aRule)/sizeof(aRule[0])); i++){
      if( 0==memcmp(aRule[i].zSuffix, &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], aRule[i].zOutput, 3);
        *pnBuf = nBuf - 2 + 3;
        return 1;
      }
    }
  }
  return 0;
}
905
906
/*
** Porter stemmer step 2: replace a suffix with a shorter (or equal
** length) one if the remaining stem satisfies (m > 0).
**
** aBuf holds the *pnBuf byte word. The rules are tried in table order
** and only the first rule whose suffix matches (and is strictly shorter
** than the word) is considered, whether or not the condition holds. If
** it does hold the suffix is overwritten and *pnBuf updated. Always
** returns 0.
**
** Rules are matched with bounded memcmp() calls only, so no byte before
** aBuf[0] is ever read, even if *pnBuf is less than 2.
**
** The order of the table matters where one suffix is the tail of another
** (e.g. "ational" before "tional", "ization" before "ation").
*/
static int fts5PorterStep2(char *aBuf, int *pnBuf){
  static const struct {
    const char *zSuffix;          /* Suffix to match */
    int nSuffix;                  /* Size of zSuffix in bytes */
    const char *zOutput;          /* Replacement text */
    int nOutput;                  /* Size of zOutput in bytes */
  } aRule[] = {
    { "ational", 7, "ate",  3 },
    { "tional",  6, "tion", 4 },
    { "enci",    4, "ence", 4 },
    { "anci",    4, "ance", 4 },
    { "izer",    4, "ize",  3 },
    { "logi",    4, "log",  3 },
    { "bli",     3, "ble",  3 },
    { "alli",    4, "al",   2 },
    { "entli",   5, "ent",  3 },
    { "eli",     3, "e",    1 },
    { "ousli",   5, "ous",  3 },
    { "ization", 7, "ize",  3 },
    { "ation",   5, "ate",  3 },
    { "ator",    4, "ate",  3 },
    { "alism",   5, "al",   2 },
    { "iveness", 7, "ive",  3 },
    { "fulness", 7, "ful",  3 },
    { "ousness", 7, "ous",  3 },
    { "aliti",   5, "al",   2 },
    { "iviti",   5, "ive",  3 },
    { "biliti",  6, "ble",  3 },
  };
  int nBuf = *pnBuf;
  int i;

  for(i=0; i<(int)(sizeof(aRule)/sizeof(aRule[0])); i++){
    int nSuffix = aRule[i].nSuffix;
    if( nBuf>nSuffix
     && 0==memcmp(aRule[i].zSuffix, &aBuf[nBuf-nSuffix], nSuffix)
    ){
      int nStem = nBuf - nSuffix;
      if( fts5Porter_MGt0(aBuf, nStem) ){
        memcpy(&aBuf[nStem], aRule[i].zOutput, aRule[i].nOutput);
        *pnBuf = nStem + aRule[i].nOutput;
      }
      break;
    }
  }
  return 0;
}
1052
1053
/*
** Porter stemmer step 3: replace or remove a suffix if the remaining stem
** satisfies (m > 0).
**
** aBuf holds the *pnBuf byte word. Only the first rule in the table whose
** suffix matches (and is strictly shorter than the word) is considered,
** whether or not the condition holds. If it does hold the suffix is
** overwritten with the replacement (which may be empty) and *pnBuf
** updated. Always returns 0.
**
** Rules are matched with bounded memcmp() calls only, so no byte before
** aBuf[0] is ever read, even if *pnBuf is less than 2.
*/
static int fts5PorterStep3(char *aBuf, int *pnBuf){
  static const struct {
    const char *zSuffix;          /* Suffix to match */
    int nSuffix;                  /* Size of zSuffix in bytes */
    const char *zOutput;          /* Replacement text (may be empty) */
    int nOutput;                  /* Size of zOutput in bytes */
  } aRule[] = {
    { "ical",  4, "ic", 2 },
    { "ness",  4, "",   0 },
    { "icate", 5, "ic", 2 },
    { "iciti", 5, "ic", 2 },
    { "ful",   3, "",   0 },
    { "ative", 5, "",   0 },
    { "alize", 5, "al", 2 },
  };
  int nBuf = *pnBuf;
  int i;

  for(i=0; i<(int)(sizeof(aRule)/sizeof(aRule[0])); i++){
    int nSuffix = aRule[i].nSuffix;
    if( nBuf>nSuffix
     && 0==memcmp(aRule[i].zSuffix, &aBuf[nBuf-nSuffix], nSuffix)
    ){
      int nStem = nBuf - nSuffix;
      if( fts5Porter_MGt0(aBuf, nStem) ){
        memcpy(&aBuf[nStem], aRule[i].zOutput, aRule[i].nOutput);
        *pnBuf = nStem + aRule[i].nOutput;
      }
      break;
    }
  }
  return 0;
}
1118
1119
/*
** Porter stemmer step 1b, first part. aBuf holds the *pnBuf byte word.
**
**   (m > 0) "eed" -> "ee"      returns 0
**   (*v*)   "ed"  -> ""        returns 1 if the suffix was removed
**   (*v*)   "ing" -> ""        returns 1 if the suffix was removed
**
** Only the first matching suffix is considered: a word that ends in
** "eed" but fails the (m > 0) test is not then tested against "ed". The
** return value is 1 if the caller should go on to apply the second part
** of step 1b, or 0 otherwise.
**
** Suffixes are matched with bounded memcmp() calls only, so no byte
** before aBuf[0] is ever read, even if *pnBuf is less than 2.
*/
static int fts5PorterStep1B(char *aBuf, int *pnBuf){
  int nBuf = *pnBuf;

  if( nBuf>3 && 0==memcmp("eed", &aBuf[nBuf-3], 3) ){
    if( fts5Porter_MGt0(aBuf, nBuf-3) ){
      memcpy(&aBuf[nBuf-3], "ee", 2);
      *pnBuf = nBuf - 3 + 2;
    }
  }else if( nBuf>2 && 0==memcmp("ed", &aBuf[nBuf-2], 2) ){
    if( fts5Porter_Vowel(aBuf, nBuf-2) ){
      *pnBuf = nBuf - 2;
      return 1;
    }
  }else if( nBuf>3 && 0==memcmp("ing", &aBuf[nBuf-3], 3) ){
    if( fts5Porter_Vowel(aBuf, nBuf-3) ){
      *pnBuf = nBuf - 3;
      return 1;
    }
  }
  return 0;
}
1151
1152 /*
1153 ** GENERATED CODE ENDS HERE (mkportersteps.tcl)
1154 ***************************************************************************
1155 **************************************************************************/
1156
/*
** Porter stemmer step 1a: strip plural suffixes from the *pnBuf byte word
** in aBuf by reducing *pnBuf. The word must be at least 2 bytes long.
**
**   "sses" -> "ss"      (word longer than 4 bytes)
**   "ies"  -> "i"       (word longer than 3 bytes)
**   "ss"   -> "ss"
**   "s"    -> ""
*/
static void fts5PorterStep1A(char *aBuf, int *pnBuf){
  int nBuf = *pnBuf;
  int nDrop;                      /* Number of trailing bytes to remove */

  if( aBuf[nBuf-1]!='s' ) return;

  if( aBuf[nBuf-2]=='e' ){
    int bSses = (nBuf>4 && aBuf[nBuf-4]=='s' && aBuf[nBuf-3]=='s');
    int bIes = (nBuf>3 && aBuf[nBuf-3]=='i');
    nDrop = (bSses || bIes) ? 2 : 1;
  }else if( aBuf[nBuf-2]!='s' ){
    nDrop = 1;
  }else{
    return;                       /* Ends in "ss" - leave it alone */
  }
  *pnBuf = nBuf - nDrop;
}
1174
fts5PorterCb(void * pCtx,int tflags,const char * pToken,int nToken,int iStart,int iEnd)1175 static int fts5PorterCb(
1176 void *pCtx,
1177 int tflags,
1178 const char *pToken,
1179 int nToken,
1180 int iStart,
1181 int iEnd
1182 ){
1183 PorterContext *p = (PorterContext*)pCtx;
1184
1185 char *aBuf;
1186 int nBuf;
1187
1188 if( nToken>FTS5_PORTER_MAX_TOKEN || nToken<3 ) goto pass_through;
1189 aBuf = p->aBuf;
1190 nBuf = nToken;
1191 memcpy(aBuf, pToken, nBuf);
1192
1193 /* Step 1. */
1194 fts5PorterStep1A(aBuf, &nBuf);
1195 if( fts5PorterStep1B(aBuf, &nBuf) ){
1196 if( fts5PorterStep1B2(aBuf, &nBuf)==0 ){
1197 char c = aBuf[nBuf-1];
1198 if( fts5PorterIsVowel(c, 0)==0
1199 && c!='l' && c!='s' && c!='z' && c==aBuf[nBuf-2]
1200 ){
1201 nBuf--;
1202 }else if( fts5Porter_MEq1(aBuf, nBuf) && fts5Porter_Ostar(aBuf, nBuf) ){
1203 aBuf[nBuf++] = 'e';
1204 }
1205 }
1206 }
1207
1208 /* Step 1C. */
1209 if( aBuf[nBuf-1]=='y' && fts5Porter_Vowel(aBuf, nBuf-1) ){
1210 aBuf[nBuf-1] = 'i';
1211 }
1212
1213 /* Steps 2 through 4. */
1214 fts5PorterStep2(aBuf, &nBuf);
1215 fts5PorterStep3(aBuf, &nBuf);
1216 fts5PorterStep4(aBuf, &nBuf);
1217
1218 /* Step 5a. */
1219 assert( nBuf>0 );
1220 if( aBuf[nBuf-1]=='e' ){
1221 if( fts5Porter_MGt1(aBuf, nBuf-1)
1222 || (fts5Porter_MEq1(aBuf, nBuf-1) && !fts5Porter_Ostar(aBuf, nBuf-1))
1223 ){
1224 nBuf--;
1225 }
1226 }
1227
1228 /* Step 5b. */
1229 if( nBuf>1 && aBuf[nBuf-1]=='l'
1230 && aBuf[nBuf-2]=='l' && fts5Porter_MGt1(aBuf, nBuf-1)
1231 ){
1232 nBuf--;
1233 }
1234
1235 return p->xToken(p->pCtx, tflags, aBuf, nBuf, iStart, iEnd);
1236
1237 pass_through:
1238 return p->xToken(p->pCtx, tflags, pToken, nToken, iStart, iEnd);
1239 }
1240
1241 /*
1242 ** Tokenize using the porter tokenizer.
1243 */
fts5PorterTokenize(Fts5Tokenizer * pTokenizer,void * pCtx,int flags,const char * pText,int nText,int (* xToken)(void *,int,const char *,int nToken,int iStart,int iEnd))1244 static int fts5PorterTokenize(
1245 Fts5Tokenizer *pTokenizer,
1246 void *pCtx,
1247 int flags,
1248 const char *pText, int nText,
1249 int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
1250 ){
1251 PorterTokenizer *p = (PorterTokenizer*)pTokenizer;
1252 PorterContext sCtx;
1253 sCtx.xToken = xToken;
1254 sCtx.pCtx = pCtx;
1255 sCtx.aBuf = p->aBuf;
1256 return p->tokenizer.xTokenize(
1257 p->pTokenizer, (void*)&sCtx, flags, pText, nText, fts5PorterCb
1258 );
1259 }
1260
1261 /**************************************************************************
1262 ** Start of trigram implementation.
1263 */
/*
** Configuration of one trigram tokenizer instance. Instances are allocated
** by fts5TriCreate() and released by fts5TriDelete().
*/
typedef struct TrigramTokenizer TrigramTokenizer;
struct TrigramTokenizer {
  int bFold; /* True to fold to lower-case (case_sensitive=0, the default) */
};
1268
/*
** Free a trigram tokenizer.
**
** Argument p is released with sqlite3_free(). fts5TriCreate() also calls
** this function to discard a tokenizer whose options failed to parse.
*/
static void fts5TriDelete(Fts5Tokenizer *p){
  sqlite3_free(p);
}
1275
1276 /*
1277 ** Allocate a trigram tokenizer.
1278 */
fts5TriCreate(void * pUnused,const char ** azArg,int nArg,Fts5Tokenizer ** ppOut)1279 static int fts5TriCreate(
1280 void *pUnused,
1281 const char **azArg,
1282 int nArg,
1283 Fts5Tokenizer **ppOut
1284 ){
1285 int rc = SQLITE_OK;
1286 TrigramTokenizer *pNew = (TrigramTokenizer*)sqlite3_malloc(sizeof(*pNew));
1287 UNUSED_PARAM(pUnused);
1288 if( pNew==0 ){
1289 rc = SQLITE_NOMEM;
1290 }else{
1291 int i;
1292 pNew->bFold = 1;
1293 for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
1294 const char *zArg = azArg[i+1];
1295 if( 0==sqlite3_stricmp(azArg[i], "case_sensitive") ){
1296 if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){
1297 rc = SQLITE_ERROR;
1298 }else{
1299 pNew->bFold = (zArg[0]=='0');
1300 }
1301 }else{
1302 rc = SQLITE_ERROR;
1303 }
1304 }
1305 if( rc!=SQLITE_OK ){
1306 fts5TriDelete((Fts5Tokenizer*)pNew);
1307 pNew = 0;
1308 }
1309 }
1310 *ppOut = (Fts5Tokenizer*)pNew;
1311 return rc;
1312 }
1313
1314 /*
1315 ** Trigram tokenizer tokenize routine.
1316 */
fts5TriTokenize(Fts5Tokenizer * pTok,void * pCtx,int unusedFlags,const char * pText,int nText,int (* xToken)(void *,int,const char *,int,int,int))1317 static int fts5TriTokenize(
1318 Fts5Tokenizer *pTok,
1319 void *pCtx,
1320 int unusedFlags,
1321 const char *pText, int nText,
1322 int (*xToken)(void*, int, const char*, int, int, int)
1323 ){
1324 TrigramTokenizer *p = (TrigramTokenizer*)pTok;
1325 int rc = SQLITE_OK;
1326 char aBuf[32];
1327 const unsigned char *zIn = (const unsigned char*)pText;
1328 const unsigned char *zEof = &zIn[nText];
1329 u32 iCode;
1330
1331 UNUSED_PARAM(unusedFlags);
1332 while( 1 ){
1333 char *zOut = aBuf;
1334 int iStart = zIn - (const unsigned char*)pText;
1335 const unsigned char *zNext;
1336
1337 READ_UTF8(zIn, zEof, iCode);
1338 if( iCode==0 ) break;
1339 zNext = zIn;
1340 if( zIn<zEof ){
1341 if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, 0);
1342 WRITE_UTF8(zOut, iCode);
1343 READ_UTF8(zIn, zEof, iCode);
1344 if( iCode==0 ) break;
1345 }else{
1346 break;
1347 }
1348 if( zIn<zEof ){
1349 if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, 0);
1350 WRITE_UTF8(zOut, iCode);
1351 READ_UTF8(zIn, zEof, iCode);
1352 if( iCode==0 ) break;
1353 if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, 0);
1354 WRITE_UTF8(zOut, iCode);
1355 }else{
1356 break;
1357 }
1358 rc = xToken(pCtx, 0, aBuf, zOut-aBuf, iStart, iStart + zOut-aBuf);
1359 if( rc!=SQLITE_OK ) break;
1360 zIn = zNext;
1361 }
1362
1363 return rc;
1364 }
1365
1366 /*
1367 ** Argument xCreate is a pointer to a constructor function for a tokenizer.
1368 ** pTok is a tokenizer previously created using the same method. This function
1369 ** returns one of FTS5_PATTERN_NONE, FTS5_PATTERN_LIKE or FTS5_PATTERN_GLOB
1370 ** indicating the style of pattern matching that the tokenizer can support.
1371 ** In practice, this is:
1372 **
1373 ** "trigram" tokenizer, case_sensitive=1 - FTS5_PATTERN_GLOB
1374 ** "trigram" tokenizer, case_sensitive=0 (the default) - FTS5_PATTERN_LIKE
1375 ** all other tokenizers - FTS5_PATTERN_NONE
1376 */
sqlite3Fts5TokenizerPattern(int (* xCreate)(void *,const char **,int,Fts5Tokenizer **),Fts5Tokenizer * pTok)1377 int sqlite3Fts5TokenizerPattern(
1378 int (*xCreate)(void*, const char**, int, Fts5Tokenizer**),
1379 Fts5Tokenizer *pTok
1380 ){
1381 if( xCreate==fts5TriCreate ){
1382 TrigramTokenizer *p = (TrigramTokenizer*)pTok;
1383 return p->bFold ? FTS5_PATTERN_LIKE : FTS5_PATTERN_GLOB;
1384 }
1385 return FTS5_PATTERN_NONE;
1386 }
1387
1388 /*
1389 ** Register all built-in tokenizers with FTS5.
1390 */
sqlite3Fts5TokenizerInit(fts5_api * pApi)1391 int sqlite3Fts5TokenizerInit(fts5_api *pApi){
1392 struct BuiltinTokenizer {
1393 const char *zName;
1394 fts5_tokenizer x;
1395 } aBuiltin[] = {
1396 { "unicode61", {fts5UnicodeCreate, fts5UnicodeDelete, fts5UnicodeTokenize}},
1397 { "ascii", {fts5AsciiCreate, fts5AsciiDelete, fts5AsciiTokenize }},
1398 { "porter", {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }},
1399 { "trigram", {fts5TriCreate, fts5TriDelete, fts5TriTokenize}},
1400 };
1401
1402 int rc = SQLITE_OK; /* Return code */
1403 int i; /* To iterate through builtin functions */
1404
1405 for(i=0; rc==SQLITE_OK && i<ArraySize(aBuiltin); i++){
1406 rc = pApi->xCreateTokenizer(pApi,
1407 aBuiltin[i].zName,
1408 (void*)pApi,
1409 &aBuiltin[i].x,
1410 0
1411 );
1412 }
1413
1414 return rc;
1415 }
1416