1 /*
2 ** 2014 May 31
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 ******************************************************************************
12 */
13 
14 
15 #include "fts5Int.h"
16 
17 /**************************************************************************
18 ** Start of ascii tokenizer implementation.
19 */
20 
21 /*
22 ** For tokenizers with no "unicode" modifier, the set of token characters
23 ** is the same as the set of ASCII range alphanumeric characters.
24 */
static unsigned char aAsciiTokenChar[128] = {
  /* 0x00..0x0F: no token characters */
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
  /* 0x10..0x1F: no token characters */
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
  /* 0x20..0x2F: no token characters */
  0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
  /* 0x30..0x3F: '0'..'9' are token characters */
  1,1,1,1, 1,1,1,1, 1,1,0,0, 0,0,0,0,
  /* 0x40..0x4F: 'A'..'O' are token characters, '@' is not */
  0,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0x50..0x5F: 'P'..'Z' are token characters */
  1,1,1,1, 1,1,1,1, 1,1,1,0, 0,0,0,0,
  /* 0x60..0x6F: 'a'..'o' are token characters, '`' is not */
  0,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0x70..0x7F: 'p'..'z' are token characters */
  1,1,1,1, 1,1,1,1, 1,1,1,0, 0,0,0,0
};
35 
/*
** An "ascii" tokenizer instance. Each instance owns a private copy of the
** ASCII token-character map so that the "tokenchars" and "separators"
** options can adjust it without touching the shared default table.
*/
typedef struct AsciiTokenizer {
  unsigned char aTokenChar[128];  /* aTokenChar[c]!=0 -> c is a token char */
} AsciiTokenizer;
40 
/*
** Apply a "tokenchars" or "separators" option to ascii tokenizer p.
**
** For every byte of the nul-terminated string zArg that lies in the ASCII
** range (high bit clear), store bTokenChars in the matching slot of
** p->aTokenChar[]. The callers in this file pass 1 for "tokenchars" and 0
** for "separators". Bytes with the high bit set are ignored.
*/
static void fts5AsciiAddExceptions(
  AsciiTokenizer *p,
  const char *zArg,
  int bTokenChars
){
  const unsigned char eFlag = (unsigned char)bTokenChars;
  const char *z;
  for(z=zArg; *z; z++){
    if( (*z & 0x80)!=0 ) continue;       /* Not an ASCII byte: skip it */
    p->aTokenChar[(int)*z] = eFlag;
  }
}
53 
54 /*
55 ** Delete a "ascii" tokenizer.
56 */
fts5AsciiDelete(Fts5Tokenizer * p)57 static void fts5AsciiDelete(Fts5Tokenizer *p){
58   sqlite3_free(p);
59 }
60 
61 /*
62 ** Create an "ascii" tokenizer.
63 */
fts5AsciiCreate(void * pUnused,const char ** azArg,int nArg,Fts5Tokenizer ** ppOut)64 static int fts5AsciiCreate(
65   void *pUnused,
66   const char **azArg, int nArg,
67   Fts5Tokenizer **ppOut
68 ){
69   int rc = SQLITE_OK;
70   AsciiTokenizer *p = 0;
71   UNUSED_PARAM(pUnused);
72   if( nArg%2 ){
73     rc = SQLITE_ERROR;
74   }else{
75     p = sqlite3_malloc(sizeof(AsciiTokenizer));
76     if( p==0 ){
77       rc = SQLITE_NOMEM;
78     }else{
79       int i;
80       memset(p, 0, sizeof(AsciiTokenizer));
81       memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar));
82       for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
83         const char *zArg = azArg[i+1];
84         if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
85           fts5AsciiAddExceptions(p, zArg, 1);
86         }else
87         if( 0==sqlite3_stricmp(azArg[i], "separators") ){
88           fts5AsciiAddExceptions(p, zArg, 0);
89         }else{
90           rc = SQLITE_ERROR;
91         }
92       }
93       if( rc!=SQLITE_OK ){
94         fts5AsciiDelete((Fts5Tokenizer*)p);
95         p = 0;
96       }
97     }
98   }
99 
100   *ppOut = (Fts5Tokenizer*)p;
101   return rc;
102 }
103 
104 
/*
** Copy nByte bytes from aIn[] to aOut[], converting the ASCII upper-case
** letters 'A'..'Z' to lower case. Every other byte is copied unchanged.
** No nul-terminator is written.
*/
static void asciiFold(char *aOut, const char *aIn, int nByte){
  int i = 0;
  while( i<nByte ){
    const char c = aIn[i];
    aOut[i] = (c>='A' && c<='Z') ? (char)(c + ('a' - 'A')) : c;
    i++;
  }
}
113 
114 /*
115 ** Tokenize some text using the ascii tokenizer.
116 */
fts5AsciiTokenize(Fts5Tokenizer * pTokenizer,void * pCtx,int iUnused,const char * pText,int nText,int (* xToken)(void *,int,const char *,int nToken,int iStart,int iEnd))117 static int fts5AsciiTokenize(
118   Fts5Tokenizer *pTokenizer,
119   void *pCtx,
120   int iUnused,
121   const char *pText, int nText,
122   int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
123 ){
124   AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer;
125   int rc = SQLITE_OK;
126   int ie;
127   int is = 0;
128 
129   char aFold[64];
130   int nFold = sizeof(aFold);
131   char *pFold = aFold;
132   unsigned char *a = p->aTokenChar;
133 
134   UNUSED_PARAM(iUnused);
135 
136   while( is<nText && rc==SQLITE_OK ){
137     int nByte;
138 
139     /* Skip any leading divider characters. */
140     while( is<nText && ((pText[is]&0x80)==0 && a[(int)pText[is]]==0) ){
141       is++;
142     }
143     if( is==nText ) break;
144 
145     /* Count the token characters */
146     ie = is+1;
147     while( ie<nText && ((pText[ie]&0x80) || a[(int)pText[ie]] ) ){
148       ie++;
149     }
150 
151     /* Fold to lower case */
152     nByte = ie-is;
153     if( nByte>nFold ){
154       if( pFold!=aFold ) sqlite3_free(pFold);
155       pFold = sqlite3_malloc64((sqlite3_int64)nByte*2);
156       if( pFold==0 ){
157         rc = SQLITE_NOMEM;
158         break;
159       }
160       nFold = nByte*2;
161     }
162     asciiFold(pFold, &pText[is], nByte);
163 
164     /* Invoke the token callback */
165     rc = xToken(pCtx, 0, pFold, nByte, is, ie);
166     is = ie+1;
167   }
168 
169   if( pFold!=aFold ) sqlite3_free(pFold);
170   if( rc==SQLITE_DONE ) rc = SQLITE_OK;
171   return rc;
172 }
173 
174 /**************************************************************************
175 ** Start of unicode61 tokenizer implementation.
176 */
177 
178 
179 /*
180 ** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
181 ** from the sqlite3 source file utf.c. If this file is compiled as part
182 ** of the amalgamation, they are not required.
183 */
184 #ifndef SQLITE_AMALGAMATION
185 
/*
** Lookup table used by READ_UTF8. It is indexed by (lead byte - 0xC0) and
** yields the initial value of the codepoint accumulator for that lead
** byte.
*/
static const unsigned char sqlite3Utf8Trans1[] = {
  /* Lead bytes 0xC0..0xDF */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  /* Lead bytes 0xE0..0xEF */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  /* Lead bytes 0xF0..0xF7 */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  /* Lead bytes 0xF8..0xFF */
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};

/*
** Read one utf-8 encoded character from zIn into c, advancing zIn past
** it. zTerm points just past the end of the input.
**
** The first byte is always consumed. If it is 0xC0 or greater, each
** following continuation byte (10xxxxxx) that lies before zTerm is
** consumed and folded into c. If the decoded value is less than 0x80,
** lies in the range 0xD800..0xDFFF, or is 0xFFFE or 0xFFFF, c is set to
** 0xFFFD instead. A first byte below 0xC0 is stored in c unchanged.
**
** This expands to a sequence of statements, not a single expression, and
** evaluates its arguments more than once. The users in this file pass a
** u32 variable for c.
*/
#define READ_UTF8(zIn, zTerm, c)                             \
  c = *(zIn++);                                              \
  if( c>=0xc0 ){                                             \
    c = sqlite3Utf8Trans1[c-0xc0];                           \
    while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){              \
      c = (c<<6) + (0x3f & *(zIn++));                        \
    }                                                        \
    if( c<0x80                                               \
     || (c&0xFFFFF800)==0xD800                               \
     || (c&0xFFFFFFFE)==0xFFFE                               \
    ){                                                       \
      c = 0xFFFD;                                            \
    }                                                        \
  }
208 
209 
/*
** Append the utf-8 encoding of codepoint c to the buffer at zOut,
** advancing zOut past the bytes written. One byte is written if c is less
** than 0x80, two if less than 0x800, three if less than 0x10000, and four
** otherwise. The arguments are evaluated more than once.
*/
#define WRITE_UTF8(zOut, c) {                            \
  if( c<0x00080 ){                                       \
    *zOut++ = (unsigned char)(c&0xFF);                   \
  }else if( c<0x00800 ){                                 \
    *zOut++ = 0xC0 + (unsigned char)((c>>6)&0x1F);       \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);          \
  }else if( c<0x10000 ){                                 \
    *zOut++ = 0xE0 + (unsigned char)((c>>12)&0x0F);      \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);     \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);          \
  }else{                                                 \
    *zOut++ = 0xF0 + (unsigned char)((c>>18) & 0x07);    \
    *zOut++ = 0x80 + (unsigned char)((c>>12) & 0x3F);    \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);     \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);          \
  }                                                      \
}
229 
230 #endif /* ifndef SQLITE_AMALGAMATION */
231 
/*
** A "unicode61" tokenizer instance.
**
** aiException[] holds the non-ASCII codepoints whose token/separator
** status is the opposite of what their category in aCategory[] implies.
** It is kept in ascending order so that it can be binary searched.
*/
typedef struct Unicode61Tokenizer {
  unsigned char aTokenChar[128];  /* ASCII range token characters */
  char *aFold;                    /* Buffer to fold text into */
  int nFold;                      /* Size of aFold[] in bytes */
  int eRemoveDiacritic;           /* An FTS5_REMOVE_DIACRITICS_* value */
  int nException;                 /* Number of entries in aiException[] */
  int *aiException;               /* Sorted array of exception codepoints */

  unsigned char aCategory[32];    /* True for token char categories */
} Unicode61Tokenizer;
243 
/*
** Values for Unicode61Tokenizer.eRemoveDiacritic. These must match the
** internals of fts5_unicode2.c. Each value equals the digit accepted by
** the "remove_diacritics" option of the unicode61 tokenizer.
*/
#define FTS5_REMOVE_DIACRITICS_NONE     0   /* remove_diacritics=0 */
#define FTS5_REMOVE_DIACRITICS_SIMPLE   1   /* remove_diacritics=1 (default) */
#define FTS5_REMOVE_DIACRITICS_COMPLEX  2   /* remove_diacritics=2 */
248 
/*
** Apply a "tokenchars" or "separators" option to unicode61 tokenizer p.
**
** z is a nul-terminated utf-8 string. For each codepoint in z:
**
**   * If it is below 128, bTokenChars is stored in p->aTokenChar[].
**
**   * Otherwise, if the status implied by its category in p->aCategory[]
**     differs from bTokenChars and the codepoint is not a diacritic, it
**     is inserted into the sorted p->aiException[] array.
**
** Returns SQLITE_OK on success, or SQLITE_NOMEM if the exception array
** cannot be enlarged, in which case the tokenizer is left unmodified.
*/
static int fts5UnicodeAddExceptions(
  Unicode61Tokenizer *p,          /* Tokenizer object */
  const char *z,                  /* Characters to treat as exceptions */
  int bTokenChars                 /* 1 for 'tokenchars', 0 for 'separators' */
){
  const int nByte = (int)strlen(z);
  const unsigned char *zIn;       /* Read cursor within z */
  const unsigned char *zEnd;      /* One past the last byte of z */
  int *aExc;                      /* Enlarged exception array */
  int nExc;                       /* Number of valid entries in aExc[] */

  if( nByte<=0 ) return SQLITE_OK;

  /* Each codepoint occupies at least one byte of z, so nByte extra slots
  ** is always enough room. */
  aExc = (int*)sqlite3_realloc64(p->aiException,
                                 (nByte+p->nException)*sizeof(int));
  if( aExc==0 ) return SQLITE_NOMEM;
  nExc = p->nException;

  zIn = (const unsigned char*)z;
  zEnd = (const unsigned char*)&z[nByte];
  while( zIn<zEnd ){
    u32 iCode;
    READ_UTF8(zIn, zEnd, iCode);
    if( iCode<128 ){
      p->aTokenChar[iCode] = (unsigned char)bTokenChars;
    }else{
      const int bToken = p->aCategory[sqlite3Fts5UnicodeCategory(iCode)];
      assert( (bToken==0 || bToken==1) );
      assert( (bTokenChars==0 || bTokenChars==1) );
      if( bToken!=bTokenChars && sqlite3Fts5UnicodeIsdiacritic(iCode)==0 ){
        /* Insert iCode ahead of the first entry that is larger than it */
        int iPos = 0;
        while( iPos<nExc && (u32)aExc[iPos]<=iCode ) iPos++;
        memmove(&aExc[iPos+1], &aExc[iPos], (nExc-iPos)*sizeof(int));
        aExc[iPos] = iCode;
        nExc++;
      }
    }
  }

  p->aiException = aExc;
  p->nException = nExc;
  return SQLITE_OK;
}
295 
296 /*
297 ** Return true if the p->aiException[] array contains the value iCode.
298 */
fts5UnicodeIsException(Unicode61Tokenizer * p,int iCode)299 static int fts5UnicodeIsException(Unicode61Tokenizer *p, int iCode){
300   if( p->nException>0 ){
301     int *a = p->aiException;
302     int iLo = 0;
303     int iHi = p->nException-1;
304 
305     while( iHi>=iLo ){
306       int iTest = (iHi + iLo) / 2;
307       if( iCode==a[iTest] ){
308         return 1;
309       }else if( iCode>a[iTest] ){
310         iLo = iTest+1;
311       }else{
312         iHi = iTest-1;
313       }
314     }
315   }
316 
317   return 0;
318 }
319 
320 /*
321 ** Delete a "unicode61" tokenizer.
322 */
fts5UnicodeDelete(Fts5Tokenizer * pTok)323 static void fts5UnicodeDelete(Fts5Tokenizer *pTok){
324   if( pTok ){
325     Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTok;
326     sqlite3_free(p->aiException);
327     sqlite3_free(p->aFold);
328     sqlite3_free(p);
329   }
330   return;
331 }
332 
/*
** Configure the token categories of tokenizer p from zCat, a list of
** category specifications separated by spaces and/or tabs.
**
** Each specification is handed to sqlite3Fts5UnicodeCatParse() along with
** p->aCategory[]. If that call returns non-zero, SQLITE_ERROR is returned
** immediately. Once the whole list has been processed,
** sqlite3Fts5UnicodeAscii() is invoked with p->aCategory[] and
** p->aTokenChar[] (presumably to derive the ASCII map from the categories
** - confirm against fts5_unicode2.c) and SQLITE_OK is returned.
*/
static int unicodeSetCategories(Unicode61Tokenizer *p, const char *zCat){
  const char *zSpec = zCat;

  for(;;){
    /* Skip the whitespace in front of the next specification */
    while( *zSpec==' ' || *zSpec=='\t' ) zSpec++;
    if( *zSpec=='\0' ) break;

    if( sqlite3Fts5UnicodeCatParse(zSpec, p->aCategory) ){
      return SQLITE_ERROR;
    }

    /* Step over the specification just parsed */
    while( *zSpec!='\0' && *zSpec!=' ' && *zSpec!='\t' ) zSpec++;
  }

  sqlite3Fts5UnicodeAscii(p->aCategory, p->aTokenChar);
  return SQLITE_OK;
}
347 
348 /*
349 ** Create a "unicode61" tokenizer.
350 */
fts5UnicodeCreate(void * pUnused,const char ** azArg,int nArg,Fts5Tokenizer ** ppOut)351 static int fts5UnicodeCreate(
352   void *pUnused,
353   const char **azArg, int nArg,
354   Fts5Tokenizer **ppOut
355 ){
356   int rc = SQLITE_OK;             /* Return code */
357   Unicode61Tokenizer *p = 0;      /* New tokenizer object */
358 
359   UNUSED_PARAM(pUnused);
360 
361   if( nArg%2 ){
362     rc = SQLITE_ERROR;
363   }else{
364     p = (Unicode61Tokenizer*)sqlite3_malloc(sizeof(Unicode61Tokenizer));
365     if( p ){
366       const char *zCat = "L* N* Co";
367       int i;
368       memset(p, 0, sizeof(Unicode61Tokenizer));
369 
370       p->eRemoveDiacritic = FTS5_REMOVE_DIACRITICS_SIMPLE;
371       p->nFold = 64;
372       p->aFold = sqlite3_malloc64(p->nFold * sizeof(char));
373       if( p->aFold==0 ){
374         rc = SQLITE_NOMEM;
375       }
376 
377       /* Search for a "categories" argument */
378       for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
379         if( 0==sqlite3_stricmp(azArg[i], "categories") ){
380           zCat = azArg[i+1];
381         }
382       }
383 
384       if( rc==SQLITE_OK ){
385         rc = unicodeSetCategories(p, zCat);
386       }
387 
388       for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
389         const char *zArg = azArg[i+1];
390         if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
391           if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){
392             rc = SQLITE_ERROR;
393           }else{
394             p->eRemoveDiacritic = (zArg[0] - '0');
395             assert( p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_NONE
396                  || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_SIMPLE
397                  || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_COMPLEX
398             );
399           }
400         }else
401         if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
402           rc = fts5UnicodeAddExceptions(p, zArg, 1);
403         }else
404         if( 0==sqlite3_stricmp(azArg[i], "separators") ){
405           rc = fts5UnicodeAddExceptions(p, zArg, 0);
406         }else
407         if( 0==sqlite3_stricmp(azArg[i], "categories") ){
408           /* no-op */
409         }else{
410           rc = SQLITE_ERROR;
411         }
412       }
413 
414     }else{
415       rc = SQLITE_NOMEM;
416     }
417     if( rc!=SQLITE_OK ){
418       fts5UnicodeDelete((Fts5Tokenizer*)p);
419       p = 0;
420     }
421     *ppOut = (Fts5Tokenizer*)p;
422   }
423   return rc;
424 }
425 
426 /*
427 ** Return true if, for the purposes of tokenizing with the tokenizer
428 ** passed as the first argument, codepoint iCode is considered a token
429 ** character (not a separator).
430 */
fts5UnicodeIsAlnum(Unicode61Tokenizer * p,int iCode)431 static int fts5UnicodeIsAlnum(Unicode61Tokenizer *p, int iCode){
432   return (
433     p->aCategory[sqlite3Fts5UnicodeCategory((u32)iCode)]
434     ^ fts5UnicodeIsException(p, iCode)
435   );
436 }
437 
/*
** Tokenize some text using the unicode61 tokenizer.
**
** The nText bytes at pText are scanned as utf-8. Each token is folded
** into the tokenizer's own buffer: ASCII upper-case letters are converted
** to lower case directly, and non-ASCII codepoints are passed through
** sqlite3Fts5UnicodeFold() with the configured eRemoveDiacritic setting.
** xToken is then invoked with pCtx, flags 0, the folded text, its size in
** bytes, and the byte offsets of the token's start and end within pText.
**
** Tokenization stops as soon as xToken returns anything other than
** SQLITE_OK. A result of SQLITE_DONE is reported to the caller as
** SQLITE_OK. SQLITE_NOMEM is returned if the fold buffer cannot be
** enlarged. iUnused is ignored.
*/
static int fts5UnicodeTokenize(
  Fts5Tokenizer *pTokenizer,
  void *pCtx,
  int iUnused,
  const char *pText, int nText,
  int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
){
  Unicode61Tokenizer *pTok = (Unicode61Tokenizer*)pTokenizer;
  unsigned char *aAscii = pTok->aTokenChar;

  unsigned char *zIn = (unsigned char*)pText;           /* Read cursor */
  unsigned char *zEof = (unsigned char*)&pText[nText];  /* End of input */

  /* Output buffer. pLimit is positioned so that, while zOut<=pLimit, the
  ** buffer has room for the largest possible utf-8 character. */
  char *aOut = pTok->aFold;
  int nOut = pTok->nFold;
  const char *pLimit = &aOut[nOut-6];

  int rc = SQLITE_OK;

  UNUSED_PARAM(iUnused);

  /* Each pass through this loop consumes a contiguous run of separators
  ** followed by one token. */
  while( rc==SQLITE_OK ){
    u32 iCode;                    /* non-ASCII codepoint read from input */
    char *zOut = aOut;            /* Write cursor within aOut[] */
    int iStart;                   /* Offset of the first byte of the token */
    int iEnd;                     /* Offset one past the end of the token */

    /* Skip separators. On finding the first token character, jump
    ** straight to the code below that folds it. The jump bypasses the
    ** buffer-size check, which is safe because zOut is still at the
    ** start of the buffer. */
    for(;;){
      if( zIn>=zEof ) goto tokenize_done;
      if( (*zIn & 0x80)==0 ){
        if( aAscii[*zIn] ){
          iStart = zIn - (unsigned char*)pText;
          goto ascii_tokenchar;
        }
        zIn++;
      }else{
        /* A character outside the ascii range. It is consumed here
        ** whether or not it turns out to be a token character. */
        iStart = zIn - (unsigned char*)pText;
        READ_UTF8(zIn, zEof, iCode);
        if( fts5UnicodeIsAlnum(pTok, iCode) ){
          goto non_ascii_tokenchar;
        }
      }
    }

    /* Consume the token characters, folding them into the output buffer
    ** along the way. */
    while( zIn<zEof ){

      /* Double the size of the output buffer if it is close to full. */
      if( zOut>pLimit ){
        char *aNew = sqlite3_malloc64((sqlite3_int64)nOut*2);
        if( aNew==0 ){
          rc = SQLITE_NOMEM;
          goto tokenize_done;
        }
        memcpy(aNew, aOut, nOut);
        zOut = &aNew[zOut - aOut];
        sqlite3_free(aOut);
        aOut = aNew;
        nOut = nOut*2;
        pTok->aFold = aOut;
        pTok->nFold = nOut;
        pLimit = &aOut[nOut-6];
      }

      if( *zIn & 0x80 ){
        /* A non-ascii-range character. Fold it into the output buffer if
        ** it is a token character or a diacritic. Otherwise the token is
        ** finished. */
        READ_UTF8(zIn, zEof, iCode);
        if( fts5UnicodeIsAlnum(pTok,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
 non_ascii_tokenchar:
          iCode = sqlite3Fts5UnicodeFold(iCode, pTok->eRemoveDiacritic);
          if( iCode ) WRITE_UTF8(zOut, iCode);
        }else{
          break;
        }
      }else if( aAscii[*zIn]==0 ){
        /* An ascii-range separator character. End of token. */
        break;
      }else{
 ascii_tokenchar:
        if( *zIn>='A' && *zIn<='Z' ){
          *zOut++ = *zIn + 32;
        }else{
          *zOut++ = *zIn;
        }
        zIn++;
      }

      /* Only reached after a token character was consumed, so iEnd never
      ** covers the separator that terminates the token. */
      iEnd = zIn - (unsigned char*)pText;
    }

    /* Invoke the token callback */
    rc = xToken(pCtx, 0, aOut, zOut-aOut, iStart, iEnd);
  }

 tokenize_done:
  if( rc==SQLITE_DONE ) rc = SQLITE_OK;
  return rc;
}
541 
542 /**************************************************************************
543 ** Start of porter stemmer implementation.
544 */
545 
546 /* Any tokens larger than this (in bytes) are passed through without
547 ** stemming. */
548 #define FTS5_PORTER_MAX_TOKEN 64
549 
550 typedef struct PorterTokenizer PorterTokenizer;
551 struct PorterTokenizer {
552   fts5_tokenizer tokenizer;       /* Parent tokenizer module */
553   Fts5Tokenizer *pTokenizer;      /* Parent tokenizer instance */
554   char aBuf[FTS5_PORTER_MAX_TOKEN + 64];
555 };
556 
557 /*
558 ** Delete a "porter" tokenizer.
559 */
fts5PorterDelete(Fts5Tokenizer * pTok)560 static void fts5PorterDelete(Fts5Tokenizer *pTok){
561   if( pTok ){
562     PorterTokenizer *p = (PorterTokenizer*)pTok;
563     if( p->pTokenizer ){
564       p->tokenizer.xDelete(p->pTokenizer);
565     }
566     sqlite3_free(p);
567   }
568 }
569 
570 /*
571 ** Create a "porter" tokenizer.
572 */
fts5PorterCreate(void * pCtx,const char ** azArg,int nArg,Fts5Tokenizer ** ppOut)573 static int fts5PorterCreate(
574   void *pCtx,
575   const char **azArg, int nArg,
576   Fts5Tokenizer **ppOut
577 ){
578   fts5_api *pApi = (fts5_api*)pCtx;
579   int rc = SQLITE_OK;
580   PorterTokenizer *pRet;
581   void *pUserdata = 0;
582   const char *zBase = "unicode61";
583 
584   if( nArg>0 ){
585     zBase = azArg[0];
586   }
587 
588   pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer));
589   if( pRet ){
590     memset(pRet, 0, sizeof(PorterTokenizer));
591     rc = pApi->xFindTokenizer(pApi, zBase, &pUserdata, &pRet->tokenizer);
592   }else{
593     rc = SQLITE_NOMEM;
594   }
595   if( rc==SQLITE_OK ){
596     int nArg2 = (nArg>0 ? nArg-1 : 0);
597     const char **azArg2 = (nArg2 ? &azArg[1] : 0);
598     rc = pRet->tokenizer.xCreate(pUserdata, azArg2, nArg2, &pRet->pTokenizer);
599   }
600 
601   if( rc!=SQLITE_OK ){
602     fts5PorterDelete((Fts5Tokenizer*)pRet);
603     pRet = 0;
604   }
605   *ppOut = (Fts5Tokenizer*)pRet;
606   return rc;
607 }
608 
/*
** Bundle of a token callback, its context pointer and a buffer.
** NOTE(review): presumably passed as the context of the parent
** tokenizer's xTokenize call so that tokens can be stemmed before being
** forwarded - the code that uses it is not in this part of the file.
*/
typedef struct PorterContext {
  void *pCtx;                                             /* Callback context */
  int (*xToken)(void*, int, const char*, int, int, int);  /* Token callback */
  char *aBuf;                                             /* Buffer */
} PorterContext;
615 
/*
** One suffix-rewriting rule as consumed by fts5PorterApply(): if the
** buffer ends with zSuffix and xCond (when not NULL) accepts the stem
** that precedes it, the suffix is replaced by zOutput. An array of rules
** is terminated by an entry with zSuffix==0.
*/
typedef struct PorterRule {
  const char *zSuffix;                  /* Suffix to match */
  int nSuffix;                          /* Length of zSuffix in bytes */
  int (*xCond)(char *zStem, int nStem); /* Condition on the stem, or NULL */
  const char *zOutput;                  /* Replacement for the suffix */
  int nOutput;                          /* Length of zOutput in bytes */
} PorterRule;
624 
#if 0
/*
** Apply the first rule of aRule[] whose suffix matches the end of the
** *pnBuf byte buffer aBuf[]. If that rule has no condition, or its
** condition accepts the stem, the suffix is replaced by the rule's
** output, *pnBuf is updated and the index of the rule is returned.
** Otherwise the buffer is left unmodified and -1 is returned.
**
** This function is currently compiled out.
*/
static int fts5PorterApply(char *aBuf, int *pnBuf, PorterRule *aRule){
  const int nBuf = *pnBuf;
  PorterRule *pRule = aRule;
  int nStem;

  /* Find the first rule with a matching suffix */
  while( pRule->zSuffix ){
    assert( strlen(pRule->zSuffix)==pRule->nSuffix );
    assert( strlen(pRule->zOutput)==pRule->nOutput );
    if( nBuf>=pRule->nSuffix
     && 0==memcmp(&aBuf[nBuf - pRule->nSuffix], pRule->zSuffix, pRule->nSuffix)
    ){
      break;
    }
    pRule++;
  }
  if( pRule->zSuffix==0 ) return -1;

  nStem = nBuf - pRule->nSuffix;
  if( pRule->xCond && !pRule->xCond(aBuf, nStem) ) return -1;

  memcpy(&aBuf[nStem], pRule->zOutput, pRule->nOutput);
  *pnBuf = nStem + pRule->nOutput;
  return pRule - aRule;
}
#endif
650 
/*
** Return 1 if c is one of the lower-case vowels 'a', 'e', 'i', 'o' or
** 'u', or if c is 'y' and bYIsVowel is non-zero. Return 0 otherwise.
*/
static int fts5PorterIsVowel(char c, int bYIsVowel){
  switch( c ){
    case 'a': case 'e': case 'i': case 'o': case 'u':
      return 1;
    case 'y':
      return bYIsVowel!=0;
    default:
      return 0;
  }
}
656 
/*
** Scan the nStem byte string zStem for a vowel that is followed, possibly
** after further vowels, by a consonant. If found, return the offset of
** the byte following that consonant. Otherwise return 0.
**
** A 'y' counts as a vowel only when the character before it is a
** consonant. bPrevCons says whether the character before zStem[0] was a
** consonant.
*/
static int fts5PorterGobbleVC(char *zStem, int nStem, int bPrevCons){
  int bCons = bPrevCons;          /* True if the previous char is a consonant */
  int i = 0;

  /* Advance to the first vowel */
  while( i<nStem ){
    bCons = !fts5PorterIsVowel(zStem[i], bCons);
    if( bCons==0 ) break;
    i++;
  }

  /* Starting just after that vowel, look for a consonant */
  for(i=i+1; i<nStem; i++){
    bCons = !fts5PorterIsVowel(zStem[i], bCons);
    if( bCons ) return i+1;
  }
  return 0;
}
672 
/* porter rule condition: (m > 0)
**
** True if the stem contains at least one vowel-consonant sequence. */
static int fts5Porter_MGt0(char *zStem, int nStem){
  return fts5PorterGobbleVC(zStem, nStem, 0)!=0;
}
677 
/* porter rule condition: (m > 1)
**
** True if a second vowel-consonant sequence can be found in what remains
** of the stem after the first one. */
static int fts5Porter_MGt1(char *zStem, int nStem){
  const int nFirst = fts5PorterGobbleVC(zStem, nStem, 0);
  if( nFirst==0 ) return 0;
  return fts5PorterGobbleVC(&zStem[nFirst], nStem-nFirst, 1)!=0;
}
687 
/* porter rule condition: (m = 1)
**
** True if the stem contains a vowel-consonant sequence and no second one
** can be found in what remains of the stem after it. */
static int fts5Porter_MEq1(char *zStem, int nStem){
  const int nFirst = fts5PorterGobbleVC(zStem, nStem, 0);
  if( nFirst==0 ) return 0;
  return fts5PorterGobbleVC(&zStem[nFirst], nStem-nFirst, 1)==0;
}
697 
/* porter rule condition: (*o)
**
** True if the stem ends consonant-vowel-consonant and the final consonant
** is not 'w', 'x' or 'y'. A 'y' counts as a vowel only when the character
** before it is a consonant, so the stem has to be classified from its
** first character.
**
** An empty stem (nStem<=0) never matches. It is rejected before
** zStem[nStem-1] is read, so no byte in front of the buffer is accessed.
*/
static int fts5Porter_Ostar(char *zStem, int nStem){
  int i;
  int mask = 0;                   /* Bit per char, 1==consonant; last 3 kept */
  int bCons = 0;                  /* True if the previous char is a consonant */
  char cLast;

  if( nStem<=0 ) return 0;
  cLast = zStem[nStem-1];
  if( cLast=='w' || cLast=='x' || cLast=='y' ) return 0;

  for(i=0; i<nStem; i++){
    bCons = !fts5PorterIsVowel(zStem[i], bCons);
    assert( bCons==0 || bCons==1 );
    /* Only the three most recent classifications matter. Discarding the
    ** older bits keeps the shift from overflowing a signed int when the
    ** stem is long. */
    mask = ((mask << 1) + bCons) & 0x0007;
  }
  return (mask==0x0005);
}
714 
/* porter rule condition: (m > 1 and (*S or *T))
**
** True if the stem ends in 's' or 't' and also satisfies (m > 1). The
** stem must not be empty. */
static int fts5Porter_MGt1_and_S_or_T(char *zStem, int nStem){
  char cLast;
  assert( nStem>0 );
  cLast = zStem[nStem-1];
  if( cLast!='s' && cLast!='t' ) return 0;
  return fts5Porter_MGt1(zStem, nStem);
}
721 
/* porter rule condition: (*v*)
**
** True if the stem contains a vowel. A 'y' counts as a vowel here unless
** it is the first character of the stem. */
static int fts5Porter_Vowel(char *zStem, int nStem){
  int i = 0;
  while( i<nStem && !fts5PorterIsVowel(zStem[i], i>0) ){
    i++;
  }
  return i<nStem;
}
732 
733 
734 /**************************************************************************
735 ***************************************************************************
736 ** GENERATED CODE STARTS HERE (mkportersteps.tcl)
737 */
738 
fts5PorterStep4(char * aBuf,int * pnBuf)739 static int fts5PorterStep4(char *aBuf, int *pnBuf){
740   int ret = 0;
741   int nBuf = *pnBuf;
742   switch( aBuf[nBuf-2] ){
743 
744     case 'a':
745       if( nBuf>2 && 0==memcmp("al", &aBuf[nBuf-2], 2) ){
746         if( fts5Porter_MGt1(aBuf, nBuf-2) ){
747           *pnBuf = nBuf - 2;
748         }
749       }
750       break;
751 
752     case 'c':
753       if( nBuf>4 && 0==memcmp("ance", &aBuf[nBuf-4], 4) ){
754         if( fts5Porter_MGt1(aBuf, nBuf-4) ){
755           *pnBuf = nBuf - 4;
756         }
757       }else if( nBuf>4 && 0==memcmp("ence", &aBuf[nBuf-4], 4) ){
758         if( fts5Porter_MGt1(aBuf, nBuf-4) ){
759           *pnBuf = nBuf - 4;
760         }
761       }
762       break;
763 
764     case 'e':
765       if( nBuf>2 && 0==memcmp("er", &aBuf[nBuf-2], 2) ){
766         if( fts5Porter_MGt1(aBuf, nBuf-2) ){
767           *pnBuf = nBuf - 2;
768         }
769       }
770       break;
771 
772     case 'i':
773       if( nBuf>2 && 0==memcmp("ic", &aBuf[nBuf-2], 2) ){
774         if( fts5Porter_MGt1(aBuf, nBuf-2) ){
775           *pnBuf = nBuf - 2;
776         }
777       }
778       break;
779 
780     case 'l':
781       if( nBuf>4 && 0==memcmp("able", &aBuf[nBuf-4], 4) ){
782         if( fts5Porter_MGt1(aBuf, nBuf-4) ){
783           *pnBuf = nBuf - 4;
784         }
785       }else if( nBuf>4 && 0==memcmp("ible", &aBuf[nBuf-4], 4) ){
786         if( fts5Porter_MGt1(aBuf, nBuf-4) ){
787           *pnBuf = nBuf - 4;
788         }
789       }
790       break;
791 
792     case 'n':
793       if( nBuf>3 && 0==memcmp("ant", &aBuf[nBuf-3], 3) ){
794         if( fts5Porter_MGt1(aBuf, nBuf-3) ){
795           *pnBuf = nBuf - 3;
796         }
797       }else if( nBuf>5 && 0==memcmp("ement", &aBuf[nBuf-5], 5) ){
798         if( fts5Porter_MGt1(aBuf, nBuf-5) ){
799           *pnBuf = nBuf - 5;
800         }
801       }else if( nBuf>4 && 0==memcmp("ment", &aBuf[nBuf-4], 4) ){
802         if( fts5Porter_MGt1(aBuf, nBuf-4) ){
803           *pnBuf = nBuf - 4;
804         }
805       }else if( nBuf>3 && 0==memcmp("ent", &aBuf[nBuf-3], 3) ){
806         if( fts5Porter_MGt1(aBuf, nBuf-3) ){
807           *pnBuf = nBuf - 3;
808         }
809       }
810       break;
811 
812     case 'o':
813       if( nBuf>3 && 0==memcmp("ion", &aBuf[nBuf-3], 3) ){
814         if( fts5Porter_MGt1_and_S_or_T(aBuf, nBuf-3) ){
815           *pnBuf = nBuf - 3;
816         }
817       }else if( nBuf>2 && 0==memcmp("ou", &aBuf[nBuf-2], 2) ){
818         if( fts5Porter_MGt1(aBuf, nBuf-2) ){
819           *pnBuf = nBuf - 2;
820         }
821       }
822       break;
823 
824     case 's':
825       if( nBuf>3 && 0==memcmp("ism", &aBuf[nBuf-3], 3) ){
826         if( fts5Porter_MGt1(aBuf, nBuf-3) ){
827           *pnBuf = nBuf - 3;
828         }
829       }
830       break;
831 
832     case 't':
833       if( nBuf>3 && 0==memcmp("ate", &aBuf[nBuf-3], 3) ){
834         if( fts5Porter_MGt1(aBuf, nBuf-3) ){
835           *pnBuf = nBuf - 3;
836         }
837       }else if( nBuf>3 && 0==memcmp("iti", &aBuf[nBuf-3], 3) ){
838         if( fts5Porter_MGt1(aBuf, nBuf-3) ){
839           *pnBuf = nBuf - 3;
840         }
841       }
842       break;
843 
844     case 'u':
845       if( nBuf>3 && 0==memcmp("ous", &aBuf[nBuf-3], 3) ){
846         if( fts5Porter_MGt1(aBuf, nBuf-3) ){
847           *pnBuf = nBuf - 3;
848         }
849       }
850       break;
851 
852     case 'v':
853       if( nBuf>3 && 0==memcmp("ive", &aBuf[nBuf-3], 3) ){
854         if( fts5Porter_MGt1(aBuf, nBuf-3) ){
855           *pnBuf = nBuf - 3;
856         }
857       }
858       break;
859 
860     case 'z':
861       if( nBuf>3 && 0==memcmp("ize", &aBuf[nBuf-3], 3) ){
862         if( fts5Porter_MGt1(aBuf, nBuf-3) ){
863           *pnBuf = nBuf - 3;
864         }
865       }
866       break;
867 
868   }
869   return ret;
870 }
871 
872 
/*
** Porter stemmer step 1b, second part.
**
** aBuf[] holds the *pnBuf byte word being stemmed. If the word is longer
** than two bytes and ends in "at", "bl" or "iz", an 'e' is appended
** (giving "ate", "ble" or "ize"), *pnBuf is increased by one and 1 is
** returned. Otherwise the word is left unmodified and 0 is returned.
**
** The caller must ensure there is room in aBuf[] for the extra byte.
*/
static int fts5PorterStep1B2(char *aBuf, int *pnBuf){
  const int nBuf = *pnBuf;
  const char *zNew = 0;           /* Replacement for the last two bytes */

  if( nBuf>2 ){
    const char c2 = aBuf[nBuf-2];
    const char c1 = aBuf[nBuf-1];
    if( c2=='a' && c1=='t' ){
      zNew = "ate";
    }else if( c2=='b' && c1=='l' ){
      zNew = "ble";
    }else if( c2=='i' && c1=='z' ){
      zNew = "ize";
    }
  }
  if( zNew==0 ) return 0;

  memcpy(&aBuf[nBuf-2], zNew, 3);
  *pnBuf = nBuf - 2 + 3;
  return 1;
}
905 
906 
fts5PorterStep2(char * aBuf,int * pnBuf)907 static int fts5PorterStep2(char *aBuf, int *pnBuf){
908   int ret = 0;
909   int nBuf = *pnBuf;
910   switch( aBuf[nBuf-2] ){
911 
912     case 'a':
913       if( nBuf>7 && 0==memcmp("ational", &aBuf[nBuf-7], 7) ){
914         if( fts5Porter_MGt0(aBuf, nBuf-7) ){
915           memcpy(&aBuf[nBuf-7], "ate", 3);
916           *pnBuf = nBuf - 7 + 3;
917         }
918       }else if( nBuf>6 && 0==memcmp("tional", &aBuf[nBuf-6], 6) ){
919         if( fts5Porter_MGt0(aBuf, nBuf-6) ){
920           memcpy(&aBuf[nBuf-6], "tion", 4);
921           *pnBuf = nBuf - 6 + 4;
922         }
923       }
924       break;
925 
926     case 'c':
927       if( nBuf>4 && 0==memcmp("enci", &aBuf[nBuf-4], 4) ){
928         if( fts5Porter_MGt0(aBuf, nBuf-4) ){
929           memcpy(&aBuf[nBuf-4], "ence", 4);
930           *pnBuf = nBuf - 4 + 4;
931         }
932       }else if( nBuf>4 && 0==memcmp("anci", &aBuf[nBuf-4], 4) ){
933         if( fts5Porter_MGt0(aBuf, nBuf-4) ){
934           memcpy(&aBuf[nBuf-4], "ance", 4);
935           *pnBuf = nBuf - 4 + 4;
936         }
937       }
938       break;
939 
940     case 'e':
941       if( nBuf>4 && 0==memcmp("izer", &aBuf[nBuf-4], 4) ){
942         if( fts5Porter_MGt0(aBuf, nBuf-4) ){
943           memcpy(&aBuf[nBuf-4], "ize", 3);
944           *pnBuf = nBuf - 4 + 3;
945         }
946       }
947       break;
948 
949     case 'g':
950       if( nBuf>4 && 0==memcmp("logi", &aBuf[nBuf-4], 4) ){
951         if( fts5Porter_MGt0(aBuf, nBuf-4) ){
952           memcpy(&aBuf[nBuf-4], "log", 3);
953           *pnBuf = nBuf - 4 + 3;
954         }
955       }
956       break;
957 
958     case 'l':
959       if( nBuf>3 && 0==memcmp("bli", &aBuf[nBuf-3], 3) ){
960         if( fts5Porter_MGt0(aBuf, nBuf-3) ){
961           memcpy(&aBuf[nBuf-3], "ble", 3);
962           *pnBuf = nBuf - 3 + 3;
963         }
964       }else if( nBuf>4 && 0==memcmp("alli", &aBuf[nBuf-4], 4) ){
965         if( fts5Porter_MGt0(aBuf, nBuf-4) ){
966           memcpy(&aBuf[nBuf-4], "al", 2);
967           *pnBuf = nBuf - 4 + 2;
968         }
969       }else if( nBuf>5 && 0==memcmp("entli", &aBuf[nBuf-5], 5) ){
970         if( fts5Porter_MGt0(aBuf, nBuf-5) ){
971           memcpy(&aBuf[nBuf-5], "ent", 3);
972           *pnBuf = nBuf - 5 + 3;
973         }
974       }else if( nBuf>3 && 0==memcmp("eli", &aBuf[nBuf-3], 3) ){
975         if( fts5Porter_MGt0(aBuf, nBuf-3) ){
976           memcpy(&aBuf[nBuf-3], "e", 1);
977           *pnBuf = nBuf - 3 + 1;
978         }
979       }else if( nBuf>5 && 0==memcmp("ousli", &aBuf[nBuf-5], 5) ){
980         if( fts5Porter_MGt0(aBuf, nBuf-5) ){
981           memcpy(&aBuf[nBuf-5], "ous", 3);
982           *pnBuf = nBuf - 5 + 3;
983         }
984       }
985       break;
986 
987     case 'o':
988       if( nBuf>7 && 0==memcmp("ization", &aBuf[nBuf-7], 7) ){
989         if( fts5Porter_MGt0(aBuf, nBuf-7) ){
990           memcpy(&aBuf[nBuf-7], "ize", 3);
991           *pnBuf = nBuf - 7 + 3;
992         }
993       }else if( nBuf>5 && 0==memcmp("ation", &aBuf[nBuf-5], 5) ){
994         if( fts5Porter_MGt0(aBuf, nBuf-5) ){
995           memcpy(&aBuf[nBuf-5], "ate", 3);
996           *pnBuf = nBuf - 5 + 3;
997         }
998       }else if( nBuf>4 && 0==memcmp("ator", &aBuf[nBuf-4], 4) ){
999         if( fts5Porter_MGt0(aBuf, nBuf-4) ){
1000           memcpy(&aBuf[nBuf-4], "ate", 3);
1001           *pnBuf = nBuf - 4 + 3;
1002         }
1003       }
1004       break;
1005 
1006     case 's':
1007       if( nBuf>5 && 0==memcmp("alism", &aBuf[nBuf-5], 5) ){
1008         if( fts5Porter_MGt0(aBuf, nBuf-5) ){
1009           memcpy(&aBuf[nBuf-5], "al", 2);
1010           *pnBuf = nBuf - 5 + 2;
1011         }
1012       }else if( nBuf>7 && 0==memcmp("iveness", &aBuf[nBuf-7], 7) ){
1013         if( fts5Porter_MGt0(aBuf, nBuf-7) ){
1014           memcpy(&aBuf[nBuf-7], "ive", 3);
1015           *pnBuf = nBuf - 7 + 3;
1016         }
1017       }else if( nBuf>7 && 0==memcmp("fulness", &aBuf[nBuf-7], 7) ){
1018         if( fts5Porter_MGt0(aBuf, nBuf-7) ){
1019           memcpy(&aBuf[nBuf-7], "ful", 3);
1020           *pnBuf = nBuf - 7 + 3;
1021         }
1022       }else if( nBuf>7 && 0==memcmp("ousness", &aBuf[nBuf-7], 7) ){
1023         if( fts5Porter_MGt0(aBuf, nBuf-7) ){
1024           memcpy(&aBuf[nBuf-7], "ous", 3);
1025           *pnBuf = nBuf - 7 + 3;
1026         }
1027       }
1028       break;
1029 
1030     case 't':
1031       if( nBuf>5 && 0==memcmp("aliti", &aBuf[nBuf-5], 5) ){
1032         if( fts5Porter_MGt0(aBuf, nBuf-5) ){
1033           memcpy(&aBuf[nBuf-5], "al", 2);
1034           *pnBuf = nBuf - 5 + 2;
1035         }
1036       }else if( nBuf>5 && 0==memcmp("iviti", &aBuf[nBuf-5], 5) ){
1037         if( fts5Porter_MGt0(aBuf, nBuf-5) ){
1038           memcpy(&aBuf[nBuf-5], "ive", 3);
1039           *pnBuf = nBuf - 5 + 3;
1040         }
1041       }else if( nBuf>6 && 0==memcmp("biliti", &aBuf[nBuf-6], 6) ){
1042         if( fts5Porter_MGt0(aBuf, nBuf-6) ){
1043           memcpy(&aBuf[nBuf-6], "ble", 3);
1044           *pnBuf = nBuf - 6 + 3;
1045         }
1046       }
1047       break;
1048 
1049   }
1050   return ret;
1051 }
1052 
1053 
/*
** Apply one suffix rewrite to the nBuf byte word in aBuf[], where nBuf is
** the value of *pnBuf on entry. The rules are:
**
**     ical  -> ic
**     ness  -> (removed)
**     icate -> ic
**     iciti -> ic
**     ful   -> (removed)
**     ative -> (removed)
**     alize -> al
**
** The second-to-last byte of the word selects the candidate rule. The rule
** is applied only if the word is strictly longer than the suffix, ends with
** it, and fts5Porter_MGt0() returns non-zero for the bytes that precede the
** suffix. When a rule is applied, aBuf[] is edited in place and *pnBuf is
** set to the new length. Otherwise both are left untouched.
**
** This function always returns 0.
*/
static int fts5PorterStep3(char *aBuf, int *pnBuf){
  const int nBuf = *pnBuf;
  const char *zOld = 0;           /* Candidate suffix, or NULL for none */
  const char *zNew = 0;           /* Replacement text (nNew bytes) */
  int nOld = 0;                   /* Size of zOld in bytes */
  int nNew = 0;                   /* Size of zNew in bytes. 0 to just trim */

  switch( aBuf[nBuf-2] ){
    case 'a':
      zOld = "ical";  nOld = 4;
      zNew = "ic";    nNew = 2;
      break;

    case 's':
      zOld = "ness";  nOld = 4;
      break;

    case 't':
      /* Two suffixes share this key. "icate" is tried first. Both are five
      ** bytes and both are replaced by "ic". */
      nOld = 5;
      zNew = "ic";    nNew = 2;
      if( nBuf>5 && 0==memcmp("icate", &aBuf[nBuf-5], 5) ){
        zOld = "icate";
      }else{
        zOld = "iciti";
      }
      break;

    case 'u':
      zOld = "ful";   nOld = 3;
      break;

    case 'v':
      zOld = "ative"; nOld = 5;
      break;

    case 'z':
      zOld = "alize"; nOld = 5;
      zNew = "al";    nNew = 2;
      break;

    default:
      break;
  }

  if( zOld!=0
   && nBuf>nOld
   && 0==memcmp(zOld, &aBuf[nBuf-nOld], nOld)
   && fts5Porter_MGt0(aBuf, nBuf-nOld)
  ){
    if( nNew>0 ) memcpy(&aBuf[nBuf-nOld], zNew, nNew);
    *pnBuf = nBuf - nOld + nNew;
  }
  return 0;
}
1118 
1119 
/*
** Handle the "eed", "ed" and "ing" suffixes of the nBuf byte word in aBuf[],
** where nBuf is the value of *pnBuf on entry:
**
**   * If the word is longer than 3 bytes and ends in "eed", it is shortened
**     by one byte (to end in "ee") provided fts5Porter_MGt0() is true for
**     the bytes before the suffix. The return value is 0 either way, and
**     the "ed" rule is not tried.
**
**   * Otherwise, if the word ends in "ed" (and is longer than 2 bytes) or
**     in "ing" (and is longer than 3 bytes), the suffix is removed provided
**     fts5Porter_Vowel() is true for the bytes before it. In this case 1 is
**     returned.
**
** In all other cases the word is left as is and 0 is returned.
*/
static int fts5PorterStep1B(char *aBuf, int *pnBuf){
  const int nBuf = *pnBuf;
  const char cKey = aBuf[nBuf-2]; /* Second-to-last byte selects the rule */
  int nDrop = 0;                  /* Size of matched "ed"/"ing" suffix */

  if( cKey=='e' ){
    if( nBuf>3 && 0==memcmp("eed", &aBuf[nBuf-3], 3) ){
      if( fts5Porter_MGt0(aBuf, nBuf-3) ){
        memcpy(&aBuf[nBuf-3], "ee", 2);
        *pnBuf = nBuf - 3 + 2;
      }
      return 0;
    }
    if( nBuf>2 && 0==memcmp("ed", &aBuf[nBuf-2], 2) ){
      nDrop = 2;
    }
  }else if( cKey=='n' ){
    if( nBuf>3 && 0==memcmp("ing", &aBuf[nBuf-3], 3) ){
      nDrop = 3;
    }
  }

  if( nDrop==0 ) return 0;
  if( !fts5Porter_Vowel(aBuf, nBuf-nDrop) ) return 0;
  *pnBuf = nBuf - nDrop;
  return 1;
}
1151 
1152 /*
1153 ** GENERATED CODE ENDS HERE (mkportersteps.tcl)
1154 ***************************************************************************
1155 **************************************************************************/
1156 
/*
** Strip a plural style suffix from the nBuf byte word in aBuf[], where nBuf
** is the value of *pnBuf on entry. Only *pnBuf is modified, never the
** buffer contents:
**
**     ...sses  ->  ...ss     (word longer than 4 bytes)
**     ...ies   ->  ...i      (word longer than 3 bytes)
**     ...es    ->  ...e      (any other word ending in "es")
**     ...ss    ->  ...ss     (unchanged)
**     ...s     ->  ...       (any other word ending in "s")
**
** Words that do not end in 's' are left alone.
*/
static void fts5PorterStep1A(char *aBuf, int *pnBuf){
  const int nBuf = *pnBuf;
  char cPrev;                     /* Byte immediately before the final 's' */

  if( aBuf[nBuf-1]!='s' ) return;
  cPrev = aBuf[nBuf-2];
  if( cPrev=='s' ) return;

  if( cPrev=='e' ){
    int bSses = (nBuf>4 && aBuf[nBuf-4]=='s' && aBuf[nBuf-3]=='s');
    int bIes = (nBuf>3 && aBuf[nBuf-3]=='i');
    *pnBuf = (bSses || bIes) ? nBuf-2 : nBuf-1;
  }else{
    *pnBuf = nBuf-1;
  }
}
1174 
fts5PorterCb(void * pCtx,int tflags,const char * pToken,int nToken,int iStart,int iEnd)1175 static int fts5PorterCb(
1176   void *pCtx,
1177   int tflags,
1178   const char *pToken,
1179   int nToken,
1180   int iStart,
1181   int iEnd
1182 ){
1183   PorterContext *p = (PorterContext*)pCtx;
1184 
1185   char *aBuf;
1186   int nBuf;
1187 
1188   if( nToken>FTS5_PORTER_MAX_TOKEN || nToken<3 ) goto pass_through;
1189   aBuf = p->aBuf;
1190   nBuf = nToken;
1191   memcpy(aBuf, pToken, nBuf);
1192 
1193   /* Step 1. */
1194   fts5PorterStep1A(aBuf, &nBuf);
1195   if( fts5PorterStep1B(aBuf, &nBuf) ){
1196     if( fts5PorterStep1B2(aBuf, &nBuf)==0 ){
1197       char c = aBuf[nBuf-1];
1198       if( fts5PorterIsVowel(c, 0)==0
1199        && c!='l' && c!='s' && c!='z' && c==aBuf[nBuf-2]
1200       ){
1201         nBuf--;
1202       }else if( fts5Porter_MEq1(aBuf, nBuf) && fts5Porter_Ostar(aBuf, nBuf) ){
1203         aBuf[nBuf++] = 'e';
1204       }
1205     }
1206   }
1207 
1208   /* Step 1C. */
1209   if( aBuf[nBuf-1]=='y' && fts5Porter_Vowel(aBuf, nBuf-1) ){
1210     aBuf[nBuf-1] = 'i';
1211   }
1212 
1213   /* Steps 2 through 4. */
1214   fts5PorterStep2(aBuf, &nBuf);
1215   fts5PorterStep3(aBuf, &nBuf);
1216   fts5PorterStep4(aBuf, &nBuf);
1217 
1218   /* Step 5a. */
1219   assert( nBuf>0 );
1220   if( aBuf[nBuf-1]=='e' ){
1221     if( fts5Porter_MGt1(aBuf, nBuf-1)
1222      || (fts5Porter_MEq1(aBuf, nBuf-1) && !fts5Porter_Ostar(aBuf, nBuf-1))
1223     ){
1224       nBuf--;
1225     }
1226   }
1227 
1228   /* Step 5b. */
1229   if( nBuf>1 && aBuf[nBuf-1]=='l'
1230    && aBuf[nBuf-2]=='l' && fts5Porter_MGt1(aBuf, nBuf-1)
1231   ){
1232     nBuf--;
1233   }
1234 
1235   return p->xToken(p->pCtx, tflags, aBuf, nBuf, iStart, iEnd);
1236 
1237  pass_through:
1238   return p->xToken(p->pCtx, tflags, pToken, nToken, iStart, iEnd);
1239 }
1240 
1241 /*
1242 ** Tokenize using the porter tokenizer.
1243 */
fts5PorterTokenize(Fts5Tokenizer * pTokenizer,void * pCtx,int flags,const char * pText,int nText,int (* xToken)(void *,int,const char *,int nToken,int iStart,int iEnd))1244 static int fts5PorterTokenize(
1245   Fts5Tokenizer *pTokenizer,
1246   void *pCtx,
1247   int flags,
1248   const char *pText, int nText,
1249   int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
1250 ){
1251   PorterTokenizer *p = (PorterTokenizer*)pTokenizer;
1252   PorterContext sCtx;
1253   sCtx.xToken = xToken;
1254   sCtx.pCtx = pCtx;
1255   sCtx.aBuf = p->aBuf;
1256   return p->tokenizer.xTokenize(
1257       p->pTokenizer, (void*)&sCtx, flags, pText, nText, fts5PorterCb
1258   );
1259 }
1260 
1261 /**************************************************************************
1262 ** Start of trigram implementation.
1263 */
/*
** State of a single trigram tokenizer instance. Objects of this type are
** allocated by fts5TriCreate(), handed to callers cast to (Fts5Tokenizer*),
** and released by fts5TriDelete().
**
** bFold is set to 1 by default and cleared by the "case_sensitive 1"
** option. When it is non-zero, fts5TriTokenize() passes each character
** through sqlite3Fts5UnicodeFold() before emitting it.
*/
typedef struct TrigramTokenizer TrigramTokenizer;
struct TrigramTokenizer {
  int bFold;                      /* True to fold to lower-case */
};
1268 
/*
** Free a trigram tokenizer.
**
** Argument p is passed straight to sqlite3_free(). fts5TriCreate() calls
** this function on its error path with the TrigramTokenizer it allocated.
*/
static void fts5TriDelete(Fts5Tokenizer *p){
  sqlite3_free(p);
}
1275 
1276 /*
1277 ** Allocate a trigram tokenizer.
1278 */
fts5TriCreate(void * pUnused,const char ** azArg,int nArg,Fts5Tokenizer ** ppOut)1279 static int fts5TriCreate(
1280   void *pUnused,
1281   const char **azArg,
1282   int nArg,
1283   Fts5Tokenizer **ppOut
1284 ){
1285   int rc = SQLITE_OK;
1286   TrigramTokenizer *pNew = (TrigramTokenizer*)sqlite3_malloc(sizeof(*pNew));
1287   UNUSED_PARAM(pUnused);
1288   if( pNew==0 ){
1289     rc = SQLITE_NOMEM;
1290   }else{
1291     int i;
1292     pNew->bFold = 1;
1293     for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
1294       const char *zArg = azArg[i+1];
1295       if( 0==sqlite3_stricmp(azArg[i], "case_sensitive") ){
1296         if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){
1297           rc = SQLITE_ERROR;
1298         }else{
1299           pNew->bFold = (zArg[0]=='0');
1300         }
1301       }else{
1302         rc = SQLITE_ERROR;
1303       }
1304     }
1305     if( rc!=SQLITE_OK ){
1306       fts5TriDelete((Fts5Tokenizer*)pNew);
1307       pNew = 0;
1308     }
1309   }
1310   *ppOut = (Fts5Tokenizer*)pNew;
1311   return rc;
1312 }
1313 
/*
** Trigram tokenizer tokenize routine.
**
** Invokes xToken(pCtx, 0, ...) once for each run of three consecutive
** characters in the nText byte buffer pText, advancing by one character
** between tokens. Input containing fewer than three characters produces no
** tokens. Scanning also stops at the first character that decodes to code
** point 0.
**
** If the tokenizer was created with bFold set, each character is passed
** through sqlite3Fts5UnicodeFold(iCode, 0) before being written into the
** token text.
**
** For each token, iStart is the byte offset within pText of its first
** character, and iEnd is iStart plus the size in bytes of the token text
** that was written to the local buffer.
**
** Returns SQLITE_OK, or the first value other than SQLITE_OK returned by
** xToken, at which point tokenization is abandoned. The unusedFlags
** parameter is ignored.
**
** NOTE(review): READ_UTF8/WRITE_UTF8 are defined elsewhere; presumably they
** decode/encode a single UTF-8 character and advance their pointer argument
** - confirm against the macro definitions.
*/
static int fts5TriTokenize(
  Fts5Tokenizer *pTok,
  void *pCtx,
  int unusedFlags,
  const char *pText, int nText,
  int (*xToken)(void*, int, const char*, int, int, int)
){
  TrigramTokenizer *p = (TrigramTokenizer*)pTok;
  int rc = SQLITE_OK;
  char aBuf[32];                  /* Text of the current token. NOTE(review):
                                  ** assumes 3 encoded characters fit - confirm */
  const unsigned char *zIn = (const unsigned char*)pText;
  const unsigned char *zEof = &zIn[nText];
  u32 iCode;

  UNUSED_PARAM(unusedFlags);
  while( 1 ){
    char *zOut = aBuf;            /* Write position within aBuf[] */
    int iStart = zIn - (const unsigned char*)pText;
    const unsigned char *zNext;   /* Start of the next token (2nd character) */

    /* First character of the trigram. NOTE(review): this read is not
    ** preceded by a (zIn<zEof) test, so on the first iteration with nText==0
    ** it appears to access pText[0] - confirm callers guarantee that byte
    ** is readable. */
    READ_UTF8(zIn, zEof, iCode);
    if( iCode==0 ) break;
    zNext = zIn;
    if( zIn<zEof ){
      /* Emit the first character, then read the second. */
      if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, 0);
      WRITE_UTF8(zOut, iCode);
      READ_UTF8(zIn, zEof, iCode);
      if( iCode==0 ) break;
    }else{
      break;
    }
    if( zIn<zEof ){
      /* Emit the second character, then read and emit the third. */
      if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, 0);
      WRITE_UTF8(zOut, iCode);
      READ_UTF8(zIn, zEof, iCode);
      if( iCode==0 ) break;
      if( p->bFold ) iCode = sqlite3Fts5UnicodeFold(iCode, 0);
      WRITE_UTF8(zOut, iCode);
    }else{
      break;
    }
    /* NOTE(review): the end offset is derived from the size of the text in
    ** aBuf[], not from the number of input bytes consumed. The two could
    ** differ if folding changes a character's encoded size - confirm. */
    rc = xToken(pCtx, 0, aBuf, zOut-aBuf, iStart, iStart + zOut-aBuf);
    if( rc!=SQLITE_OK ) break;
    zIn = zNext;                  /* Next trigram starts one character on */
  }

  return rc;
}
1365 
1366 /*
1367 ** Argument xCreate is a pointer to a constructor function for a tokenizer.
1368 ** pTok is a tokenizer previously created using the same method. This function
1369 ** returns one of FTS5_PATTERN_NONE, FTS5_PATTERN_LIKE or FTS5_PATTERN_GLOB
1370 ** indicating the style of pattern matching that the tokenizer can support.
1371 ** In practice, this is:
1372 **
1373 **     "trigram" tokenizer, case_sensitive=1 - FTS5_PATTERN_GLOB
1374 **     "trigram" tokenizer, case_sensitive=0 (the default) - FTS5_PATTERN_LIKE
1375 **     all other tokenizers - FTS5_PATTERN_NONE
1376 */
sqlite3Fts5TokenizerPattern(int (* xCreate)(void *,const char **,int,Fts5Tokenizer **),Fts5Tokenizer * pTok)1377 int sqlite3Fts5TokenizerPattern(
1378     int (*xCreate)(void*, const char**, int, Fts5Tokenizer**),
1379     Fts5Tokenizer *pTok
1380 ){
1381   if( xCreate==fts5TriCreate ){
1382     TrigramTokenizer *p = (TrigramTokenizer*)pTok;
1383     return p->bFold ? FTS5_PATTERN_LIKE : FTS5_PATTERN_GLOB;
1384   }
1385   return FTS5_PATTERN_NONE;
1386 }
1387 
1388 /*
1389 ** Register all built-in tokenizers with FTS5.
1390 */
sqlite3Fts5TokenizerInit(fts5_api * pApi)1391 int sqlite3Fts5TokenizerInit(fts5_api *pApi){
1392   struct BuiltinTokenizer {
1393     const char *zName;
1394     fts5_tokenizer x;
1395   } aBuiltin[] = {
1396     { "unicode61", {fts5UnicodeCreate, fts5UnicodeDelete, fts5UnicodeTokenize}},
1397     { "ascii",     {fts5AsciiCreate, fts5AsciiDelete, fts5AsciiTokenize }},
1398     { "porter",    {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }},
1399     { "trigram",   {fts5TriCreate, fts5TriDelete, fts5TriTokenize}},
1400   };
1401 
1402   int rc = SQLITE_OK;             /* Return code */
1403   int i;                          /* To iterate through builtin functions */
1404 
1405   for(i=0; rc==SQLITE_OK && i<ArraySize(aBuiltin); i++){
1406     rc = pApi->xCreateTokenizer(pApi,
1407         aBuiltin[i].zName,
1408         (void*)pApi,
1409         &aBuiltin[i].x,
1410         0
1411     );
1412   }
1413 
1414   return rc;
1415 }
1416