1 /*
2 ** 2012 April 10
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 **
13 ** This module implements the spellfix1 VIRTUAL TABLE that can be used
14 ** to search a large vocabulary for close matches.  See separate
15 ** documentation (http://www.sqlite.org/spellfix1.html) for details.
16 */
17 #include "sqlite3ext.h"
18 SQLITE_EXTENSION_INIT1
19 
20 #ifndef SQLITE_AMALGAMATION
21 # if !defined(NDEBUG) && !defined(SQLITE_DEBUG)
22 #  define NDEBUG 1
23 # endif
24 # if defined(NDEBUG) && defined(SQLITE_DEBUG)
25 #  undef NDEBUG
26 # endif
27 # include <string.h>
28 # include <stdio.h>
29 # include <stdlib.h>
30 # include <assert.h>
31 # define ALWAYS(X)  1
32 # define NEVER(X)   0
33   typedef unsigned char u8;
34   typedef unsigned short u16;
35 #endif
36 #include <ctype.h>
37 
38 #ifndef SQLITE_OMIT_VIRTUALTABLE
39 
40 /*
41 ** Character classes for ASCII characters:
42 **
43 **   0   ''        Silent letters:   H W
44 **   1   'A'       Any vowel:   A E I O U (Y)
45 **   2   'B'       A bilabeal stop or fricative:  B F P V W
46 **   3   'C'       Other fricatives or back stops:  C G J K Q S X Z
47 **   4   'D'       Alveolar stops:  D T
48 **   5   'H'       Letter H at the beginning of a word
49 **   6   'L'       Glide:  L
50 **   7   'R'       Semivowel:  R
51 **   8   'M'       Nasals:  M N
52 **   9   'Y'       Letter Y at the beginning of a word.
53 **   10  '9'       Digits: 0 1 2 3 4 5 6 7 8 9
54 **   11  ' '       White space
55 **   12  '?'       Other.
56 */
57 #define CCLASS_SILENT         0
58 #define CCLASS_VOWEL          1
59 #define CCLASS_B              2
60 #define CCLASS_C              3
61 #define CCLASS_D              4
62 #define CCLASS_H              5
63 #define CCLASS_L              6
64 #define CCLASS_R              7
65 #define CCLASS_M              8
66 #define CCLASS_Y              9
67 #define CCLASS_DIGIT         10
68 #define CCLASS_SPACE         11
69 #define CCLASS_OTHER         12
70 
71 /*
72 ** The following table gives the character class for non-initial ASCII
73 ** characters.
74 */
75 static const unsigned char midClass[] = {
76  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
77  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
78  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
79  /*   */ CCLASS_SPACE,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
80  /*   */ CCLASS_SPACE,    /*   */ CCLASS_SPACE,   /*   */ CCLASS_OTHER,
81  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
82  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
83  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
84  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
85  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
86  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_SPACE,
87  /* ! */ CCLASS_OTHER,    /* " */ CCLASS_OTHER,   /* # */ CCLASS_OTHER,
88  /* $ */ CCLASS_OTHER,    /* % */ CCLASS_OTHER,   /* & */ CCLASS_OTHER,
89  /* ' */ CCLASS_SILENT,   /* ( */ CCLASS_OTHER,   /* ) */ CCLASS_OTHER,
90  /* * */ CCLASS_OTHER,    /* + */ CCLASS_OTHER,   /* , */ CCLASS_OTHER,
91  /* - */ CCLASS_OTHER,    /* . */ CCLASS_OTHER,   /* / */ CCLASS_OTHER,
92  /* 0 */ CCLASS_DIGIT,    /* 1 */ CCLASS_DIGIT,   /* 2 */ CCLASS_DIGIT,
93  /* 3 */ CCLASS_DIGIT,    /* 4 */ CCLASS_DIGIT,   /* 5 */ CCLASS_DIGIT,
94  /* 6 */ CCLASS_DIGIT,    /* 7 */ CCLASS_DIGIT,   /* 8 */ CCLASS_DIGIT,
95  /* 9 */ CCLASS_DIGIT,    /* : */ CCLASS_OTHER,   /* ; */ CCLASS_OTHER,
96  /* < */ CCLASS_OTHER,    /* = */ CCLASS_OTHER,   /* > */ CCLASS_OTHER,
97  /* ? */ CCLASS_OTHER,    /* @ */ CCLASS_OTHER,   /* A */ CCLASS_VOWEL,
98  /* B */ CCLASS_B,        /* C */ CCLASS_C,       /* D */ CCLASS_D,
99  /* E */ CCLASS_VOWEL,    /* F */ CCLASS_B,       /* G */ CCLASS_C,
100  /* H */ CCLASS_SILENT,   /* I */ CCLASS_VOWEL,   /* J */ CCLASS_C,
101  /* K */ CCLASS_C,        /* L */ CCLASS_L,       /* M */ CCLASS_M,
102  /* N */ CCLASS_M,        /* O */ CCLASS_VOWEL,   /* P */ CCLASS_B,
103  /* Q */ CCLASS_C,        /* R */ CCLASS_R,       /* S */ CCLASS_C,
104  /* T */ CCLASS_D,        /* U */ CCLASS_VOWEL,   /* V */ CCLASS_B,
105  /* W */ CCLASS_B,        /* X */ CCLASS_C,       /* Y */ CCLASS_VOWEL,
106  /* Z */ CCLASS_C,        /* [ */ CCLASS_OTHER,   /* \ */ CCLASS_OTHER,
107  /* ] */ CCLASS_OTHER,    /* ^ */ CCLASS_OTHER,   /* _ */ CCLASS_OTHER,
108  /* ` */ CCLASS_OTHER,    /* a */ CCLASS_VOWEL,   /* b */ CCLASS_B,
109  /* c */ CCLASS_C,        /* d */ CCLASS_D,       /* e */ CCLASS_VOWEL,
110  /* f */ CCLASS_B,        /* g */ CCLASS_C,       /* h */ CCLASS_SILENT,
111  /* i */ CCLASS_VOWEL,    /* j */ CCLASS_C,       /* k */ CCLASS_C,
112  /* l */ CCLASS_L,        /* m */ CCLASS_M,       /* n */ CCLASS_M,
113  /* o */ CCLASS_VOWEL,    /* p */ CCLASS_B,       /* q */ CCLASS_C,
114  /* r */ CCLASS_R,        /* s */ CCLASS_C,       /* t */ CCLASS_D,
115  /* u */ CCLASS_VOWEL,    /* v */ CCLASS_B,       /* w */ CCLASS_B,
116  /* x */ CCLASS_C,        /* y */ CCLASS_VOWEL,   /* z */ CCLASS_C,
117  /* { */ CCLASS_OTHER,    /* | */ CCLASS_OTHER,   /* } */ CCLASS_OTHER,
118  /* ~ */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,
119 };
120 /*
121 ** This tables gives the character class for ASCII characters that form the
122 ** initial character of a word.  The only difference from midClass is with
123 ** the letters H, W, and Y.
124 */
125 static const unsigned char initClass[] = {
126  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
127  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
128  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
129  /*   */ CCLASS_SPACE,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
130  /*   */ CCLASS_SPACE,    /*   */ CCLASS_SPACE,   /*   */ CCLASS_OTHER,
131  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
132  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
133  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
134  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
135  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_OTHER,
136  /*   */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,   /*   */ CCLASS_SPACE,
137  /* ! */ CCLASS_OTHER,    /* " */ CCLASS_OTHER,   /* # */ CCLASS_OTHER,
138  /* $ */ CCLASS_OTHER,    /* % */ CCLASS_OTHER,   /* & */ CCLASS_OTHER,
139  /* ' */ CCLASS_OTHER,    /* ( */ CCLASS_OTHER,   /* ) */ CCLASS_OTHER,
140  /* * */ CCLASS_OTHER,    /* + */ CCLASS_OTHER,   /* , */ CCLASS_OTHER,
141  /* - */ CCLASS_OTHER,    /* . */ CCLASS_OTHER,   /* / */ CCLASS_OTHER,
142  /* 0 */ CCLASS_DIGIT,    /* 1 */ CCLASS_DIGIT,   /* 2 */ CCLASS_DIGIT,
143  /* 3 */ CCLASS_DIGIT,    /* 4 */ CCLASS_DIGIT,   /* 5 */ CCLASS_DIGIT,
144  /* 6 */ CCLASS_DIGIT,    /* 7 */ CCLASS_DIGIT,   /* 8 */ CCLASS_DIGIT,
145  /* 9 */ CCLASS_DIGIT,    /* : */ CCLASS_OTHER,   /* ; */ CCLASS_OTHER,
146  /* < */ CCLASS_OTHER,    /* = */ CCLASS_OTHER,   /* > */ CCLASS_OTHER,
147  /* ? */ CCLASS_OTHER,    /* @ */ CCLASS_OTHER,   /* A */ CCLASS_VOWEL,
148  /* B */ CCLASS_B,        /* C */ CCLASS_C,       /* D */ CCLASS_D,
149  /* E */ CCLASS_VOWEL,    /* F */ CCLASS_B,       /* G */ CCLASS_C,
150  /* H */ CCLASS_SILENT,   /* I */ CCLASS_VOWEL,   /* J */ CCLASS_C,
151  /* K */ CCLASS_C,        /* L */ CCLASS_L,       /* M */ CCLASS_M,
152  /* N */ CCLASS_M,        /* O */ CCLASS_VOWEL,   /* P */ CCLASS_B,
153  /* Q */ CCLASS_C,        /* R */ CCLASS_R,       /* S */ CCLASS_C,
154  /* T */ CCLASS_D,        /* U */ CCLASS_VOWEL,   /* V */ CCLASS_B,
155  /* W */ CCLASS_B,        /* X */ CCLASS_C,       /* Y */ CCLASS_Y,
156  /* Z */ CCLASS_C,        /* [ */ CCLASS_OTHER,   /* \ */ CCLASS_OTHER,
157  /* ] */ CCLASS_OTHER,    /* ^ */ CCLASS_OTHER,   /* _ */ CCLASS_OTHER,
158  /* ` */ CCLASS_OTHER,    /* a */ CCLASS_VOWEL,   /* b */ CCLASS_B,
159  /* c */ CCLASS_C,        /* d */ CCLASS_D,       /* e */ CCLASS_VOWEL,
160  /* f */ CCLASS_B,        /* g */ CCLASS_C,       /* h */ CCLASS_SILENT,
161  /* i */ CCLASS_VOWEL,    /* j */ CCLASS_C,       /* k */ CCLASS_C,
162  /* l */ CCLASS_L,        /* m */ CCLASS_M,       /* n */ CCLASS_M,
163  /* o */ CCLASS_VOWEL,    /* p */ CCLASS_B,       /* q */ CCLASS_C,
164  /* r */ CCLASS_R,        /* s */ CCLASS_C,       /* t */ CCLASS_D,
165  /* u */ CCLASS_VOWEL,    /* v */ CCLASS_B,       /* w */ CCLASS_B,
166  /* x */ CCLASS_C,        /* y */ CCLASS_Y,       /* z */ CCLASS_C,
167  /* { */ CCLASS_OTHER,    /* | */ CCLASS_OTHER,   /* } */ CCLASS_OTHER,
168  /* ~ */ CCLASS_OTHER,    /*   */ CCLASS_OTHER,
169 };
170 
171 /*
172 ** Mapping from the character class number (0-13) to a symbol for each
173 ** character class.  Note that initClass[] can be used to map the class
174 ** symbol back into the class number.
175 */
176 static const unsigned char className[] = ".ABCDHLRMY9 ?";
177 
178 /*
179 ** Generate a "phonetic hash" from a string of ASCII characters
180 ** in zIn[0..nIn-1].
181 **
182 **   * Map characters by character class as defined above.
183 **   * Omit double-letters
184 **   * Omit vowels beside R and L
185 **   * Omit T when followed by CH
186 **   * Omit W when followed by R
187 **   * Omit D when followed by J or G
188 **   * Omit K in KN or G in GN at the beginning of a word
189 **
190 ** Space to hold the result is obtained from sqlite3_malloc()
191 **
192 ** Return NULL if memory allocation fails.
193 */
phoneticHash(const unsigned char * zIn,int nIn)194 static unsigned char *phoneticHash(const unsigned char *zIn, int nIn){
195   unsigned char *zOut = sqlite3_malloc64( nIn + 1 );
196   int i;
197   int nOut = 0;
198   char cPrev = 0x77;
199   char cPrevX = 0x77;
200   const unsigned char *aClass = initClass;
201 
202   if( zOut==0 ) return 0;
203   if( nIn>2 ){
204     switch( zIn[0] ){
205       case 'g':
206       case 'k': {
207         if( zIn[1]=='n' ){ zIn++; nIn--; }
208         break;
209       }
210     }
211   }
212   for(i=0; i<nIn; i++){
213     unsigned char c = zIn[i];
214     if( i+1<nIn ){
215       if( c=='w' && zIn[i+1]=='r' ) continue;
216       if( c=='d' && (zIn[i+1]=='j' || zIn[i+1]=='g') ) continue;
217       if( i+2<nIn ){
218         if( c=='t' && zIn[i+1]=='c' && zIn[i+2]=='h' ) continue;
219       }
220     }
221     c = aClass[c&0x7f];
222     if( c==CCLASS_SPACE ) continue;
223     if( c==CCLASS_OTHER && cPrev!=CCLASS_DIGIT ) continue;
224     aClass = midClass;
225     if( c==CCLASS_VOWEL && (cPrevX==CCLASS_R || cPrevX==CCLASS_L) ){
226        continue; /* No vowels beside L or R */
227     }
228     if( (c==CCLASS_R || c==CCLASS_L) && cPrevX==CCLASS_VOWEL ){
229        nOut--;   /* No vowels beside L or R */
230     }
231     cPrev = c;
232     if( c==CCLASS_SILENT ) continue;
233     cPrevX = c;
234     c = className[c];
235     assert( nOut>=0 );
236     if( nOut==0 || c!=zOut[nOut-1] ) zOut[nOut++] = c;
237   }
238   zOut[nOut] = 0;
239   return zOut;
240 }
241 
242 /*
243 ** This is an SQL function wrapper around phoneticHash().  See
244 ** the description of phoneticHash() for additional information.
245 */
phoneticHashSqlFunc(sqlite3_context * context,int argc,sqlite3_value ** argv)246 static void phoneticHashSqlFunc(
247   sqlite3_context *context,
248   int argc,
249   sqlite3_value **argv
250 ){
251   const unsigned char *zIn;
252   unsigned char *zOut;
253 
254   zIn = sqlite3_value_text(argv[0]);
255   if( zIn==0 ) return;
256   zOut = phoneticHash(zIn, sqlite3_value_bytes(argv[0]));
257   if( zOut==0 ){
258     sqlite3_result_error_nomem(context);
259   }else{
260     sqlite3_result_text(context, (char*)zOut, -1, sqlite3_free);
261   }
262 }
263 
264 /*
265 ** Return the character class number for a character given its
266 ** context.
267 */
characterClass(char cPrev,char c)268 static char characterClass(char cPrev, char c){
269   return cPrev==0 ? initClass[c&0x7f] : midClass[c&0x7f];
270 }
271 
272 /*
273 ** Return the cost of inserting or deleting character c immediately
274 ** following character cPrev.  If cPrev==0, that means c is the first
275 ** character of the word.
276 */
insertOrDeleteCost(char cPrev,char c,char cNext)277 static int insertOrDeleteCost(char cPrev, char c, char cNext){
278   char classC = characterClass(cPrev, c);
279   char classCprev;
280 
281   if( classC==CCLASS_SILENT ){
282     /* Insert or delete "silent" characters such as H or W */
283     return 1;
284   }
285   if( cPrev==c ){
286     /* Repeated characters, or miss a repeat */
287     return 10;
288   }
289   if( classC==CCLASS_VOWEL && (cPrev=='r' || cNext=='r') ){
290     return 20;  /* Insert a vowel before or after 'r' */
291   }
292   classCprev = characterClass(cPrev, cPrev);
293   if( classC==classCprev ){
294     if( classC==CCLASS_VOWEL ){
295       /* Remove or add a new vowel to a vowel cluster */
296       return 15;
297     }else{
298       /* Remove or add a consonant not in the same class */
299       return 50;
300     }
301   }
302 
303   /* any other character insertion or deletion */
304   return 100;
305 }
306 
307 /*
308 ** Divide the insertion cost by this factor when appending to the
309 ** end of the word.
310 */
311 #define FINAL_INS_COST_DIV  4
312 
313 /*
314 ** Return the cost of substituting cTo in place of cFrom assuming
315 ** the previous character is cPrev.  If cPrev==0 then cTo is the first
316 ** character of the word.
317 */
substituteCost(char cPrev,char cFrom,char cTo)318 static int substituteCost(char cPrev, char cFrom, char cTo){
319   char classFrom, classTo;
320   if( cFrom==cTo ){
321     /* Exact match */
322     return 0;
323   }
324   if( cFrom==(cTo^0x20) && ((cTo>='A' && cTo<='Z') || (cTo>='a' && cTo<='z')) ){
325     /* differ only in case */
326     return 0;
327   }
328   classFrom = characterClass(cPrev, cFrom);
329   classTo = characterClass(cPrev, cTo);
330   if( classFrom==classTo ){
331     /* Same character class */
332     return 40;
333   }
334   if( classFrom>=CCLASS_B && classFrom<=CCLASS_Y
335       && classTo>=CCLASS_B && classTo<=CCLASS_Y ){
336     /* Convert from one consonant to another, but in a different class */
337     return 75;
338   }
339   /* Any other subsitution */
340   return 100;
341 }
342 
343 /*
344 ** Given two strings zA and zB which are pure ASCII, return the cost
345 ** of transforming zA into zB.  If zA ends with '*' assume that it is
346 ** a prefix of zB and give only minimal penalty for extra characters
347 ** on the end of zB.
348 **
349 ** Smaller numbers mean a closer match.
350 **
351 ** Negative values indicate an error:
352 **    -1  One of the inputs is NULL
353 **    -2  Non-ASCII characters on input
354 **    -3  Unable to allocate memory
355 **
356 ** If pnMatch is not NULL, then *pnMatch is set to the number of bytes
357 ** of zB that matched the pattern in zA. If zA does not end with a '*',
358 ** then this value is always the number of bytes in zB (i.e. strlen(zB)).
359 ** If zA does end in a '*', then it is the number of bytes in the prefix
360 ** of zB that was deemed to match zA.
361 */
editdist1(const char * zA,const char * zB,int * pnMatch)362 static int editdist1(const char *zA, const char *zB, int *pnMatch){
363   int nA, nB;            /* Number of characters in zA[] and zB[] */
364   int xA, xB;            /* Loop counters for zA[] and zB[] */
365   char cA = 0, cB;       /* Current character of zA and zB */
366   char cAprev, cBprev;   /* Previous character of zA and zB */
367   char cAnext, cBnext;   /* Next character in zA and zB */
368   int d;                 /* North-west cost value */
369   int dc = 0;            /* North-west character value */
370   int res;               /* Final result */
371   int *m;                /* The cost matrix */
372   char *cx;              /* Corresponding character values */
373   int *toFree = 0;       /* Malloced space */
374   int nMatch = 0;
375   int mStack[60+15];     /* Stack space to use if not too much is needed */
376 
377   /* Early out if either input is NULL */
378   if( zA==0 || zB==0 ) return -1;
379 
380   /* Skip any common prefix */
381   while( zA[0] && zA[0]==zB[0] ){ dc = zA[0]; zA++; zB++; nMatch++; }
382   if( pnMatch ) *pnMatch = nMatch;
383   if( zA[0]==0 && zB[0]==0 ) return 0;
384 
385 #if 0
386   printf("A=\"%s\" B=\"%s\" dc=%c\n", zA, zB, dc?dc:' ');
387 #endif
388 
389   /* Verify input strings and measure their lengths */
390   for(nA=0; zA[nA]; nA++){
391     if( zA[nA]&0x80 ) return -2;
392   }
393   for(nB=0; zB[nB]; nB++){
394     if( zB[nB]&0x80 ) return -2;
395   }
396 
397   /* Special processing if either string is empty */
398   if( nA==0 ){
399     cBprev = (char)dc;
400     for(xB=res=0; (cB = zB[xB])!=0; xB++){
401       res += insertOrDeleteCost(cBprev, cB, zB[xB+1])/FINAL_INS_COST_DIV;
402       cBprev = cB;
403     }
404     return res;
405   }
406   if( nB==0 ){
407     cAprev = (char)dc;
408     for(xA=res=0; (cA = zA[xA])!=0; xA++){
409       res += insertOrDeleteCost(cAprev, cA, zA[xA+1]);
410       cAprev = cA;
411     }
412     return res;
413   }
414 
415   /* A is a prefix of B */
416   if( zA[0]=='*' && zA[1]==0 ) return 0;
417 
418   /* Allocate and initialize the Wagner matrix */
419   if( nB<(sizeof(mStack)*4)/(sizeof(mStack[0])*5) ){
420     m = mStack;
421   }else{
422     m = toFree = sqlite3_malloc64( (nB+1)*5*sizeof(m[0])/4 );
423     if( m==0 ) return -3;
424   }
425   cx = (char*)&m[nB+1];
426 
427   /* Compute the Wagner edit distance */
428   m[0] = 0;
429   cx[0] = (char)dc;
430   cBprev = (char)dc;
431   for(xB=1; xB<=nB; xB++){
432     cBnext = zB[xB];
433     cB = zB[xB-1];
434     cx[xB] = cB;
435     m[xB] = m[xB-1] + insertOrDeleteCost(cBprev, cB, cBnext);
436     cBprev = cB;
437   }
438   cAprev = (char)dc;
439   for(xA=1; xA<=nA; xA++){
440     int lastA = (xA==nA);
441     cA = zA[xA-1];
442     cAnext = zA[xA];
443     if( cA=='*' && lastA ) break;
444     d = m[0];
445     dc = cx[0];
446     m[0] = d + insertOrDeleteCost(cAprev, cA, cAnext);
447     cBprev = 0;
448     for(xB=1; xB<=nB; xB++){
449       int totalCost, insCost, delCost, subCost, ncx;
450       cB = zB[xB-1];
451       cBnext = zB[xB];
452 
453       /* Cost to insert cB */
454       insCost = insertOrDeleteCost(cx[xB-1], cB, cBnext);
455       if( lastA ) insCost /= FINAL_INS_COST_DIV;
456 
457       /* Cost to delete cA */
458       delCost = insertOrDeleteCost(cx[xB], cA, cBnext);
459 
460       /* Cost to substitute cA->cB */
461       subCost = substituteCost(cx[xB-1], cA, cB);
462 
463       /* Best cost */
464       totalCost = insCost + m[xB-1];
465       ncx = cB;
466       if( (delCost + m[xB])<totalCost ){
467         totalCost = delCost + m[xB];
468         ncx = cA;
469       }
470       if( (subCost + d)<totalCost ){
471         totalCost = subCost + d;
472       }
473 
474 #if 0
475       printf("%d,%d d=%4d u=%4d r=%4d dc=%c cA=%c cB=%c"
476              " ins=%4d del=%4d sub=%4d t=%4d ncx=%c\n",
477              xA, xB, d, m[xB], m[xB-1], dc?dc:' ', cA, cB,
478              insCost, delCost, subCost, totalCost, ncx?ncx:' ');
479 #endif
480 
481       /* Update the matrix */
482       d = m[xB];
483       dc = cx[xB];
484       m[xB] = totalCost;
485       cx[xB] = (char)ncx;
486       cBprev = cB;
487     }
488     cAprev = cA;
489   }
490 
491   /* Free the wagner matrix and return the result */
492   if( cA=='*' ){
493     res = m[1];
494     for(xB=1; xB<=nB; xB++){
495       if( m[xB]<res ){
496         res = m[xB];
497         if( pnMatch ) *pnMatch = xB+nMatch;
498       }
499     }
500   }else{
501     res = m[nB];
502     /* In the current implementation, pnMatch is always NULL if zA does
503     ** not end in "*" */
504     assert( pnMatch==0 );
505   }
506   sqlite3_free(toFree);
507   return res;
508 }
509 
510 /*
511 ** Function:    editdist(A,B)
512 **
513 ** Return the cost of transforming string A into string B.  Both strings
514 ** must be pure ASCII text.  If A ends with '*' then it is assumed to be
515 ** a prefix of B and extra characters on the end of B have minimal additional
516 ** cost.
517 */
editdistSqlFunc(sqlite3_context * context,int argc,sqlite3_value ** argv)518 static void editdistSqlFunc(
519   sqlite3_context *context,
520   int argc,
521   sqlite3_value **argv
522 ){
523   int res = editdist1(
524                     (const char*)sqlite3_value_text(argv[0]),
525                     (const char*)sqlite3_value_text(argv[1]),
526                     0);
527   if( res<0 ){
528     if( res==(-3) ){
529       sqlite3_result_error_nomem(context);
530     }else if( res==(-2) ){
531       sqlite3_result_error(context, "non-ASCII input to editdist()", -1);
532     }else{
533       sqlite3_result_error(context, "NULL input to editdist()", -1);
534     }
535   }else{
536     sqlite3_result_int(context, res);
537   }
538 }
539 
540 /* End of the fixed-cost edit distance implementation
541 ******************************************************************************
542 *****************************************************************************
543 ** Begin: Configurable cost unicode edit distance routines
544 */
545 /* Forward declaration of structures */
546 typedef struct EditDist3Cost EditDist3Cost;
547 typedef struct EditDist3Config EditDist3Config;
548 typedef struct EditDist3Point EditDist3Point;
549 typedef struct EditDist3From EditDist3From;
550 typedef struct EditDist3FromString EditDist3FromString;
551 typedef struct EditDist3To EditDist3To;
552 typedef struct EditDist3ToString EditDist3ToString;
553 typedef struct EditDist3Lang EditDist3Lang;
554 
555 
556 /*
557 ** An entry in the edit cost table
558 */
559 struct EditDist3Cost {
560   EditDist3Cost *pNext;     /* Next cost element */
561   u8 nFrom;                 /* Number of bytes in aFrom */
562   u8 nTo;                   /* Number of bytes in aTo */
563   u16 iCost;                /* Cost of this transformation */
564   char a[4]    ;            /* FROM string followed by TO string */
565   /* Additional TO and FROM string bytes appended as necessary */
566 };
567 
568 /*
569 ** Edit costs for a particular language ID
570 */
571 struct EditDist3Lang {
572   int iLang;             /* Language ID */
573   int iInsCost;          /* Default insertion cost */
574   int iDelCost;          /* Default deletion cost */
575   int iSubCost;          /* Default substitution cost */
576   EditDist3Cost *pCost;  /* Costs */
577 };
578 
579 
580 /*
581 ** The default EditDist3Lang object, with default costs.
582 */
583 static const EditDist3Lang editDist3Lang = { 0, 100, 100, 150, 0 };
584 
585 /*
586 ** Complete configuration
587 */
588 struct EditDist3Config {
589   int nLang;             /* Number of language IDs.  Size of a[] */
590   EditDist3Lang *a;      /* One for each distinct language ID */
591 };
592 
593 /*
594 ** Extra information about each character in the FROM string.
595 */
596 struct EditDist3From {
597   int nSubst;              /* Number of substitution cost entries */
598   int nDel;                /* Number of deletion cost entries */
599   int nByte;               /* Number of bytes in this character */
600   EditDist3Cost **apSubst; /* Array of substitution costs for this element */
601   EditDist3Cost **apDel;   /* Array of deletion cost entries */
602 };
603 
604 /*
605 ** A precompiled FROM string.
606 *
607 ** In the common case we expect the FROM string to be reused multiple times.
608 ** In other words, the common case will be to measure the edit distance
609 ** from a single origin string to multiple target strings.
610 */
611 struct EditDist3FromString {
612   char *z;                 /* The complete text of the FROM string */
613   int n;                   /* Number of characters in the FROM string */
614   int isPrefix;            /* True if ends with '*' character */
615   EditDist3From *a;        /* Extra info about each char of the FROM string */
616 };
617 
618 /*
619 ** Extra information about each character in the TO string.
620 */
621 struct EditDist3To {
622   int nIns;                /* Number of insertion cost entries */
623   int nByte;               /* Number of bytes in this character */
624   EditDist3Cost **apIns;   /* Array of deletion cost entries */
625 };
626 
627 /*
628 ** A precompiled FROM string
629 */
630 struct EditDist3ToString {
631   char *z;                 /* The complete text of the TO string */
632   int n;                   /* Number of characters in the TO string */
633   EditDist3To *a;          /* Extra info about each char of the TO string */
634 };
635 
636 /*
637 ** Clear or delete an instance of the object that records all edit-distance
638 ** weights.
639 */
editDist3ConfigClear(EditDist3Config * p)640 static void editDist3ConfigClear(EditDist3Config *p){
641   int i;
642   if( p==0 ) return;
643   for(i=0; i<p->nLang; i++){
644     EditDist3Cost *pCost, *pNext;
645     pCost = p->a[i].pCost;
646     while( pCost ){
647       pNext = pCost->pNext;
648       sqlite3_free(pCost);
649       pCost = pNext;
650     }
651   }
652   sqlite3_free(p->a);
653   memset(p, 0, sizeof(*p));
654 }
editDist3ConfigDelete(void * pIn)655 static void editDist3ConfigDelete(void *pIn){
656   EditDist3Config *p = (EditDist3Config*)pIn;
657   editDist3ConfigClear(p);
658   sqlite3_free(p);
659 }
660 
661 /* Compare the FROM values of two EditDist3Cost objects, for sorting.
662 ** Return negative, zero, or positive if the A is less than, equal to,
663 ** or greater than B.
664 */
editDist3CostCompare(EditDist3Cost * pA,EditDist3Cost * pB)665 static int editDist3CostCompare(EditDist3Cost *pA, EditDist3Cost *pB){
666   int n = pA->nFrom;
667   int rc;
668   if( n>pB->nFrom ) n = pB->nFrom;
669   rc = strncmp(pA->a, pB->a, n);
670   if( rc==0 ) rc = pA->nFrom - pB->nFrom;
671   return rc;
672 }
673 
674 /*
675 ** Merge together two sorted lists of EditDist3Cost objects, in order
676 ** of increasing FROM.
677 */
editDist3CostMerge(EditDist3Cost * pA,EditDist3Cost * pB)678 static EditDist3Cost *editDist3CostMerge(
679   EditDist3Cost *pA,
680   EditDist3Cost *pB
681 ){
682   EditDist3Cost *pHead = 0;
683   EditDist3Cost **ppTail = &pHead;
684   EditDist3Cost *p;
685   while( pA && pB ){
686     if( editDist3CostCompare(pA,pB)<=0 ){
687       p = pA;
688       pA = pA->pNext;
689     }else{
690       p = pB;
691       pB = pB->pNext;
692     }
693     *ppTail = p;
694     ppTail =  &p->pNext;
695   }
696   if( pA ){
697     *ppTail = pA;
698   }else{
699     *ppTail = pB;
700   }
701   return pHead;
702 }
703 
704 /*
705 ** Sort a list of EditDist3Cost objects into order of increasing FROM
706 */
editDist3CostSort(EditDist3Cost * pList)707 static EditDist3Cost *editDist3CostSort(EditDist3Cost *pList){
708   EditDist3Cost *ap[60], *p;
709   int i;
710   int mx = 0;
711   ap[0] = 0;
712   ap[1] = 0;
713   while( pList ){
714     p = pList;
715     pList = p->pNext;
716     p->pNext = 0;
717     for(i=0; ap[i]; i++){
718       p = editDist3CostMerge(ap[i],p);
719       ap[i] = 0;
720     }
721     ap[i] = p;
722     if( i>mx ){
723       mx = i;
724       ap[i+1] = 0;
725     }
726   }
727   p = 0;
728   for(i=0; i<=mx; i++){
729     if( ap[i] ) p = editDist3CostMerge(p,ap[i]);
730   }
731   return p;
732 }
733 
734 /*
735 ** Load all edit-distance weights from a table.
736 */
editDist3ConfigLoad(EditDist3Config * p,sqlite3 * db,const char * zTable)737 static int editDist3ConfigLoad(
738   EditDist3Config *p,      /* The edit distance configuration to load */
739   sqlite3 *db,            /* Load from this database */
740   const char *zTable      /* Name of the table from which to load */
741 ){
742   sqlite3_stmt *pStmt;
743   int rc, rc2;
744   char *zSql;
745   int iLangPrev = -9999;
746   EditDist3Lang *pLang = 0;
747 
748   zSql = sqlite3_mprintf("SELECT iLang, cFrom, cTo, iCost"
749                          " FROM \"%w\" WHERE iLang>=0 ORDER BY iLang", zTable);
750   if( zSql==0 ) return SQLITE_NOMEM;
751   rc = sqlite3_prepare(db, zSql, -1, &pStmt, 0);
752   sqlite3_free(zSql);
753   if( rc ) return rc;
754   editDist3ConfigClear(p);
755   while( sqlite3_step(pStmt)==SQLITE_ROW ){
756     int iLang = sqlite3_column_int(pStmt, 0);
757     const char *zFrom = (const char*)sqlite3_column_text(pStmt, 1);
758     int nFrom = zFrom ? sqlite3_column_bytes(pStmt, 1) : 0;
759     const char *zTo = (const char*)sqlite3_column_text(pStmt, 2);
760     int nTo = zTo ? sqlite3_column_bytes(pStmt, 2) : 0;
761     int iCost = sqlite3_column_int(pStmt, 3);
762 
763     assert( zFrom!=0 || nFrom==0 );
764     assert( zTo!=0 || nTo==0 );
765     if( nFrom>100 || nTo>100 ) continue;
766     if( iCost<0 ) continue;
767     if( iCost>=10000 ) continue;  /* Costs above 10K are considered infinite */
768     if( pLang==0 || iLang!=iLangPrev ){
769       EditDist3Lang *pNew;
770       pNew = sqlite3_realloc64(p->a, (p->nLang+1)*sizeof(p->a[0]));
771       if( pNew==0 ){ rc = SQLITE_NOMEM; break; }
772       p->a = pNew;
773       pLang = &p->a[p->nLang];
774       p->nLang++;
775       pLang->iLang = iLang;
776       pLang->iInsCost = 100;
777       pLang->iDelCost = 100;
778       pLang->iSubCost = 150;
779       pLang->pCost = 0;
780       iLangPrev = iLang;
781     }
782     if( nFrom==1 && zFrom[0]=='?' && nTo==0 ){
783       pLang->iDelCost = iCost;
784     }else if( nFrom==0 && nTo==1 && zTo[0]=='?' ){
785       pLang->iInsCost = iCost;
786     }else if( nFrom==1 && nTo==1 && zFrom[0]=='?' && zTo[0]=='?' ){
787       pLang->iSubCost = iCost;
788     }else{
789       EditDist3Cost *pCost;
790       int nExtra = nFrom + nTo - 4;
791       if( nExtra<0 ) nExtra = 0;
792       pCost = sqlite3_malloc64( sizeof(*pCost) + nExtra );
793       if( pCost==0 ){ rc = SQLITE_NOMEM; break; }
794       pCost->nFrom = (u8)nFrom;
795       pCost->nTo = (u8)nTo;
796       pCost->iCost = (u16)iCost;
797       memcpy(pCost->a, zFrom, nFrom);
798       memcpy(pCost->a + nFrom, zTo, nTo);
799       pCost->pNext = pLang->pCost;
800       pLang->pCost = pCost;
801     }
802   }
803   rc2 = sqlite3_finalize(pStmt);
804   if( rc==SQLITE_OK ) rc = rc2;
805   if( rc==SQLITE_OK ){
806     int iLang;
807     for(iLang=0; iLang<p->nLang; iLang++){
808       p->a[iLang].pCost = editDist3CostSort(p->a[iLang].pCost);
809     }
810   }
811   return rc;
812 }
813 
814 /*
815 ** Return the length (in bytes) of a utf-8 character.  Or return a maximum
816 ** of N.
817 */
utf8Len(unsigned char c,int N)818 static int utf8Len(unsigned char c, int N){
819   int len = 1;
820   if( c>0x7f ){
821     if( (c&0xe0)==0xc0 ){
822       len = 2;
823     }else if( (c&0xf0)==0xe0 ){
824       len = 3;
825     }else{
826       len = 4;
827     }
828   }
829   if( len>N ) len = N;
830   return len;
831 }
832 
833 /*
834 ** Return TRUE (non-zero) if the To side of the given cost matches
835 ** the given string.
836 */
matchTo(EditDist3Cost * p,const char * z,int n)837 static int matchTo(EditDist3Cost *p, const char *z, int n){
838   assert( n>0 );
839   if( p->a[p->nFrom]!=z[0] ) return 0;
840   if( p->nTo>n ) return 0;
841   if( strncmp(p->a+p->nFrom, z, p->nTo)!=0 ) return 0;
842   return 1;
843 }
844 
845 /*
846 ** Return TRUE (non-zero) if the From side of the given cost matches
847 ** the given string.
848 */
matchFrom(EditDist3Cost * p,const char * z,int n)849 static int matchFrom(EditDist3Cost *p, const char *z, int n){
850   assert( p->nFrom<=n );
851   if( p->nFrom ){
852     if( p->a[0]!=z[0] ) return 0;
853     if( strncmp(p->a, z, p->nFrom)!=0 ) return 0;
854   }
855   return 1;
856 }
857 
858 /*
859 ** Return TRUE (non-zero) of the next FROM character and the next TO
860 ** character are the same.
861 */
matchFromTo(EditDist3FromString * pStr,int n1,const char * z2,int n2)862 static int matchFromTo(
863   EditDist3FromString *pStr,  /* Left hand string */
864   int n1,                     /* Index of comparison character on the left */
865   const char *z2,             /* Right-handl comparison character */
866   int n2                      /* Bytes remaining in z2[] */
867 ){
868   int b1 = pStr->a[n1].nByte;
869   if( b1>n2 ) return 0;
870   assert( b1>0 );
871   if( pStr->z[n1]!=z2[0] ) return 0;
872   if( strncmp(pStr->z+n1, z2, b1)!=0 ) return 0;
873   return 1;
874 }
875 
876 /*
877 ** Delete an EditDist3FromString objecct
878 */
editDist3FromStringDelete(EditDist3FromString * p)879 static void editDist3FromStringDelete(EditDist3FromString *p){
880   int i;
881   if( p ){
882     for(i=0; i<p->n; i++){
883       sqlite3_free(p->a[i].apDel);
884       sqlite3_free(p->a[i].apSubst);
885     }
886     sqlite3_free(p);
887   }
888 }
889 
890 /*
891 ** Create a EditDist3FromString object.
892 */
editDist3FromStringNew(const EditDist3Lang * pLang,const char * z,int n)893 static EditDist3FromString *editDist3FromStringNew(
894   const EditDist3Lang *pLang,
895   const char *z,
896   int n
897 ){
898   EditDist3FromString *pStr;
899   EditDist3Cost *p;
900   int i;
901 
902   if( z==0 ) return 0;
903   if( n<0 ) n = (int)strlen(z);
904   pStr = sqlite3_malloc64( sizeof(*pStr) + sizeof(pStr->a[0])*n + n + 1 );
905   if( pStr==0 ) return 0;
906   pStr->a = (EditDist3From*)&pStr[1];
907   memset(pStr->a, 0, sizeof(pStr->a[0])*n);
908   pStr->n = n;
909   pStr->z = (char*)&pStr->a[n];
910   memcpy(pStr->z, z, n+1);
911   if( n && z[n-1]=='*' ){
912     pStr->isPrefix = 1;
913     n--;
914     pStr->n--;
915     pStr->z[n] = 0;
916   }else{
917     pStr->isPrefix = 0;
918   }
919 
920   for(i=0; i<n; i++){
921     EditDist3From *pFrom = &pStr->a[i];
922     memset(pFrom, 0, sizeof(*pFrom));
923     pFrom->nByte = utf8Len((unsigned char)z[i], n-i);
924     for(p=pLang->pCost; p; p=p->pNext){
925       EditDist3Cost **apNew;
926       if( i+p->nFrom>n ) continue;
927       if( matchFrom(p, z+i, n-i)==0 ) continue;
928       if( p->nTo==0 ){
929         apNew = sqlite3_realloc64(pFrom->apDel,
930                                 sizeof(*apNew)*(pFrom->nDel+1));
931         if( apNew==0 ) break;
932         pFrom->apDel = apNew;
933         apNew[pFrom->nDel++] = p;
934       }else{
935         apNew = sqlite3_realloc64(pFrom->apSubst,
936                                 sizeof(*apNew)*(pFrom->nSubst+1));
937         if( apNew==0 ) break;
938         pFrom->apSubst = apNew;
939         apNew[pFrom->nSubst++] = p;
940       }
941     }
942     if( p ){
943       editDist3FromStringDelete(pStr);
944       pStr = 0;
945       break;
946     }
947   }
948   return pStr;
949 }
950 
951 /*
952 ** Update entry m[i] such that it is the minimum of its current value
953 ** and m[j]+iCost.
954 */
updateCost(unsigned int * m,int i,int j,int iCost)955 static void updateCost(
956   unsigned int *m,
957   int i,
958   int j,
959   int iCost
960 ){
961   unsigned int b;
962   assert( iCost>=0 );
963   assert( iCost<10000 );
964   b = m[j] + iCost;
965   if( b<m[i] ) m[i] = b;
966 }
967 
968 /*
969 ** How much stack space (int bytes) to use for Wagner matrix in
970 ** editDist3Core().  If more space than this is required, the entire
971 ** matrix is taken from the heap.  To reduce the load on the memory
972 ** allocator, make this value as large as practical for the
973 ** architecture in use.
974 */
975 #ifndef SQLITE_SPELLFIX_STACKALLOC_SZ
976 # define SQLITE_SPELLFIX_STACKALLOC_SZ  (1024)
977 #endif
978 
979 /* Compute the edit distance between two strings.
980 **
981 ** If an error occurs, return a negative number which is the error code.
982 **
983 ** If pnMatch is not NULL, then *pnMatch is set to the number of characters
984 ** (not bytes) in z2 that matched the search pattern in *pFrom. If pFrom does
985 ** not contain the pattern for a prefix-search, then this is always the number
986 ** of characters in z2. If pFrom does contain a prefix search pattern, then
987 ** it is the number of characters in the prefix of z2 that was deemed to
988 ** match pFrom.
989 */
editDist3Core(EditDist3FromString * pFrom,const char * z2,int n2,const EditDist3Lang * pLang,int * pnMatch)990 static int editDist3Core(
991   EditDist3FromString *pFrom,  /* The FROM string */
992   const char *z2,              /* The TO string */
993   int n2,                      /* Length of the TO string */
994   const EditDist3Lang *pLang,  /* Edit weights for a particular language ID */
995   int *pnMatch                 /* OUT: Characters in matched prefix */
996 ){
997   int k, n;
998   int i1, b1;
999   int i2, b2;
1000   EditDist3FromString f = *pFrom;
1001   EditDist3To *a2;
1002   unsigned int *m;
1003   unsigned int *pToFree;
1004   int szRow;
1005   EditDist3Cost *p;
1006   int res;
1007   sqlite3_uint64 nByte;
1008   unsigned int stackSpace[SQLITE_SPELLFIX_STACKALLOC_SZ/sizeof(unsigned int)];
1009 
1010   /* allocate the Wagner matrix and the aTo[] array for the TO string */
1011   n = (f.n+1)*(n2+1);
1012   n = (n+1)&~1;
1013   nByte = n*sizeof(m[0]) + sizeof(a2[0])*n2;
1014   if( nByte<=sizeof(stackSpace) ){
1015     m = stackSpace;
1016     pToFree = 0;
1017   }else{
1018     m = pToFree = sqlite3_malloc64( nByte );
1019     if( m==0 ) return -1;            /* Out of memory */
1020   }
1021   a2 = (EditDist3To*)&m[n];
1022   memset(a2, 0, sizeof(a2[0])*n2);
1023 
1024   /* Fill in the a1[] matrix for all characters of the TO string */
1025   for(i2=0; i2<n2; i2++){
1026     a2[i2].nByte = utf8Len((unsigned char)z2[i2], n2-i2);
1027     for(p=pLang->pCost; p; p=p->pNext){
1028       EditDist3Cost **apNew;
1029       if( p->nFrom>0 ) break;
1030       if( i2+p->nTo>n2 ) continue;
1031       if( p->a[0]>z2[i2] ) break;
1032       if( matchTo(p, z2+i2, n2-i2)==0 ) continue;
1033       a2[i2].nIns++;
1034       apNew = sqlite3_realloc64(a2[i2].apIns, sizeof(*apNew)*a2[i2].nIns);
1035       if( apNew==0 ){
1036         res = -1;  /* Out of memory */
1037         goto editDist3Abort;
1038       }
1039       a2[i2].apIns = apNew;
1040       a2[i2].apIns[a2[i2].nIns-1] = p;
1041     }
1042   }
1043 
1044   /* Prepare to compute the minimum edit distance */
1045   szRow = f.n+1;
1046   memset(m, 0x01, (n2+1)*szRow*sizeof(m[0]));
1047   m[0] = 0;
1048 
1049   /* First fill in the top-row of the matrix with FROM deletion costs */
1050   for(i1=0; i1<f.n; i1 += b1){
1051     b1 = f.a[i1].nByte;
1052     updateCost(m, i1+b1, i1, pLang->iDelCost);
1053     for(k=0; k<f.a[i1].nDel; k++){
1054       p = f.a[i1].apDel[k];
1055       updateCost(m, i1+p->nFrom, i1, p->iCost);
1056     }
1057   }
1058 
1059   /* Fill in all subsequent rows, top-to-bottom, left-to-right */
1060   for(i2=0; i2<n2; i2 += b2){
1061     int rx;      /* Starting index for current row */
1062     int rxp;     /* Starting index for previous row */
1063     b2 = a2[i2].nByte;
1064     rx = szRow*(i2+b2);
1065     rxp = szRow*i2;
1066     updateCost(m, rx, rxp, pLang->iInsCost);
1067     for(k=0; k<a2[i2].nIns; k++){
1068       p = a2[i2].apIns[k];
1069       updateCost(m, szRow*(i2+p->nTo), rxp, p->iCost);
1070     }
1071     for(i1=0; i1<f.n; i1+=b1){
1072       int cx;    /* Index of current cell */
1073       int cxp;   /* Index of cell immediately to the left */
1074       int cxd;   /* Index of cell to the left and one row above */
1075       int cxu;   /* Index of cell immediately above */
1076       b1 = f.a[i1].nByte;
1077       cxp = rx + i1;
1078       cx = cxp + b1;
1079       cxd = rxp + i1;
1080       cxu = cxd + b1;
1081       updateCost(m, cx, cxp, pLang->iDelCost);
1082       for(k=0; k<f.a[i1].nDel; k++){
1083         p = f.a[i1].apDel[k];
1084         updateCost(m, cxp+p->nFrom, cxp, p->iCost);
1085       }
1086       updateCost(m, cx, cxu, pLang->iInsCost);
1087       if( matchFromTo(&f, i1, z2+i2, n2-i2) ){
1088         updateCost(m, cx, cxd, 0);
1089       }
1090       updateCost(m, cx, cxd, pLang->iSubCost);
1091       for(k=0; k<f.a[i1].nSubst; k++){
1092         p = f.a[i1].apSubst[k];
1093         if( matchTo(p, z2+i2, n2-i2) ){
1094           updateCost(m, cxd+p->nFrom+szRow*p->nTo, cxd, p->iCost);
1095         }
1096       }
1097     }
1098   }
1099 
1100 #if 0  /* Enable for debugging */
1101   printf("         ^");
1102   for(i1=0; i1<f.n; i1++) printf(" %c-%2x", f.z[i1], f.z[i1]&0xff);
1103   printf("\n   ^:");
1104   for(i1=0; i1<szRow; i1++){
1105     int v = m[i1];
1106     if( v>9999 ) printf(" ****");
1107     else         printf(" %4d", v);
1108   }
1109   printf("\n");
1110   for(i2=0; i2<n2; i2++){
1111     printf("%c-%02x:", z2[i2], z2[i2]&0xff);
1112     for(i1=0; i1<szRow; i1++){
1113       int v = m[(i2+1)*szRow+i1];
1114       if( v>9999 ) printf(" ****");
1115       else         printf(" %4d", v);
1116     }
1117     printf("\n");
1118   }
1119 #endif
1120 
1121   /* Free memory allocations and return the result */
1122   res = (int)m[szRow*(n2+1)-1];
1123   n = n2;
1124   if( f.isPrefix ){
1125     for(i2=1; i2<=n2; i2++){
1126       int b = m[szRow*i2-1];
1127       if( b<=res ){
1128         res = b;
1129         n = i2 - 1;
1130       }
1131     }
1132   }
1133   if( pnMatch ){
1134     int nExtra = 0;
1135     for(k=0; k<n; k++){
1136       if( (z2[k] & 0xc0)==0x80 ) nExtra++;
1137     }
1138     *pnMatch = n - nExtra;
1139   }
1140 
1141 editDist3Abort:
1142   for(i2=0; i2<n2; i2++) sqlite3_free(a2[i2].apIns);
1143   sqlite3_free(pToFree);
1144   return res;
1145 }
1146 
1147 /*
1148 ** Get an appropriate EditDist3Lang object.
1149 */
editDist3FindLang(EditDist3Config * pConfig,int iLang)1150 static const EditDist3Lang *editDist3FindLang(
1151   EditDist3Config *pConfig,
1152   int iLang
1153 ){
1154   int i;
1155   for(i=0; i<pConfig->nLang; i++){
1156     if( pConfig->a[i].iLang==iLang ) return &pConfig->a[i];
1157   }
1158   return &editDist3Lang;
1159 }
1160 
1161 /*
1162 ** Function:    editdist3(A,B,iLang)
1163 **              editdist3(tablename)
1164 **
1165 ** Return the cost of transforming string A into string B using edit
1166 ** weights for iLang.
1167 **
1168 ** The second form loads edit weights into memory from a table.
1169 */
editDist3SqlFunc(sqlite3_context * context,int argc,sqlite3_value ** argv)1170 static void editDist3SqlFunc(
1171   sqlite3_context *context,
1172   int argc,
1173   sqlite3_value **argv
1174 ){
1175   EditDist3Config *pConfig = (EditDist3Config*)sqlite3_user_data(context);
1176   sqlite3 *db = sqlite3_context_db_handle(context);
1177   int rc;
1178   if( argc==1 ){
1179     const char *zTable = (const char*)sqlite3_value_text(argv[0]);
1180     rc = editDist3ConfigLoad(pConfig, db, zTable);
1181     if( rc ) sqlite3_result_error_code(context, rc);
1182   }else{
1183     const char *zA = (const char*)sqlite3_value_text(argv[0]);
1184     const char *zB = (const char*)sqlite3_value_text(argv[1]);
1185     int nA = sqlite3_value_bytes(argv[0]);
1186     int nB = sqlite3_value_bytes(argv[1]);
1187     int iLang = argc==3 ? sqlite3_value_int(argv[2]) : 0;
1188     const EditDist3Lang *pLang = editDist3FindLang(pConfig, iLang);
1189     EditDist3FromString *pFrom;
1190     int dist;
1191 
1192     pFrom = editDist3FromStringNew(pLang, zA, nA);
1193     if( pFrom==0 ){
1194       sqlite3_result_error_nomem(context);
1195       return;
1196     }
1197     dist = editDist3Core(pFrom, zB, nB, pLang, 0);
1198     editDist3FromStringDelete(pFrom);
1199     if( dist==(-1) ){
1200       sqlite3_result_error_nomem(context);
1201     }else{
1202       sqlite3_result_int(context, dist);
1203     }
1204   }
1205 }
1206 
1207 /*
1208 ** Register the editDist3 function with SQLite
1209 */
editDist3Install(sqlite3 * db)1210 static int editDist3Install(sqlite3 *db){
1211   int rc;
1212   EditDist3Config *pConfig = sqlite3_malloc64( sizeof(*pConfig) );
1213   if( pConfig==0 ) return SQLITE_NOMEM;
1214   memset(pConfig, 0, sizeof(*pConfig));
1215   rc = sqlite3_create_function_v2(db, "editdist3",
1216               2, SQLITE_UTF8|SQLITE_DETERMINISTIC, pConfig,
1217               editDist3SqlFunc, 0, 0, 0);
1218   if( rc==SQLITE_OK ){
1219     rc = sqlite3_create_function_v2(db, "editdist3",
1220                 3, SQLITE_UTF8|SQLITE_DETERMINISTIC, pConfig,
1221                 editDist3SqlFunc, 0, 0, 0);
1222   }
1223   if( rc==SQLITE_OK ){
1224     rc = sqlite3_create_function_v2(db, "editdist3",
1225                 1, SQLITE_UTF8|SQLITE_DETERMINISTIC, pConfig,
1226                 editDist3SqlFunc, 0, 0, editDist3ConfigDelete);
1227   }else{
1228     sqlite3_free(pConfig);
1229   }
1230   return rc;
1231 }
1232 /* End configurable cost unicode edit distance routines
1233 ******************************************************************************
1234 ******************************************************************************
1235 ** Begin transliterate unicode-to-ascii implementation
1236 */
1237 
1238 #if !SQLITE_AMALGAMATION
1239 /*
1240 ** This lookup table is used to help decode the first byte of
1241 ** a multi-byte UTF8 character.
1242 */
1243 static const unsigned char sqlite3Utf8Trans1[] = {
1244   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1245   0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1246   0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
1247   0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
1248   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1249   0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1250   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1251   0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
1252 };
1253 #endif
1254 
1255 /*
1256 ** Return the value of the first UTF-8 character in the string.
1257 */
utf8Read(const unsigned char * z,int n,int * pSize)1258 static int utf8Read(const unsigned char *z, int n, int *pSize){
1259   int c, i;
1260 
1261   /* All callers to this routine (in the current implementation)
1262   ** always have n>0. */
1263   if( NEVER(n==0) ){
1264     c = i = 0;
1265   }else{
1266     c = z[0];
1267     i = 1;
1268     if( c>=0xc0 ){
1269       c = sqlite3Utf8Trans1[c-0xc0];
1270       while( i<n && (z[i] & 0xc0)==0x80 ){
1271         c = (c<<6) + (0x3f & z[i++]);
1272       }
1273     }
1274   }
1275   *pSize = i;
1276   return c;
1277 }
1278 
1279 /*
1280 ** Return the number of characters in the utf-8 string in the nIn byte
1281 ** buffer pointed to by zIn.
1282 */
utf8Charlen(const char * zIn,int nIn)1283 static int utf8Charlen(const char *zIn, int nIn){
1284   int i;
1285   int nChar = 0;
1286   for(i=0; i<nIn; nChar++){
1287     int sz;
1288     utf8Read((const unsigned char *)&zIn[i], nIn-i, &sz);
1289     i += sz;
1290   }
1291   return nChar;
1292 }
1293 
1294 typedef struct Transliteration Transliteration;
1295 struct Transliteration {
1296  unsigned short int cFrom;
1297  unsigned char cTo0, cTo1, cTo2, cTo3;
1298 #ifdef SQLITE_SPELLFIX_5BYTE_MAPPINGS
1299  unsigned char cTo4;
1300 #endif
1301 };
1302 
1303 /*
1304 ** Table of translations from unicode characters into ASCII.
1305 */
1306 static const Transliteration translit[] = {
1307   { 0x00A0,  0x20, 0x00, 0x00, 0x00 },  /*   to   */
1308   { 0x00B5,  0x75, 0x00, 0x00, 0x00 },  /* µ to u */
1309   { 0x00C0,  0x41, 0x00, 0x00, 0x00 },  /* À to A */
1310   { 0x00C1,  0x41, 0x00, 0x00, 0x00 },  /* Á to A */
1311   { 0x00C2,  0x41, 0x00, 0x00, 0x00 },  /* Â to A */
1312   { 0x00C3,  0x41, 0x00, 0x00, 0x00 },  /* Ã to A */
1313   { 0x00C4,  0x41, 0x65, 0x00, 0x00 },  /* Ä to Ae */
1314   { 0x00C5,  0x41, 0x61, 0x00, 0x00 },  /* Å to Aa */
1315   { 0x00C6,  0x41, 0x45, 0x00, 0x00 },  /* Æ to AE */
1316   { 0x00C7,  0x43, 0x00, 0x00, 0x00 },  /* Ç to C */
1317   { 0x00C8,  0x45, 0x00, 0x00, 0x00 },  /* È to E */
1318   { 0x00C9,  0x45, 0x00, 0x00, 0x00 },  /* É to E */
1319   { 0x00CA,  0x45, 0x00, 0x00, 0x00 },  /* Ê to E */
1320   { 0x00CB,  0x45, 0x00, 0x00, 0x00 },  /* Ë to E */
1321   { 0x00CC,  0x49, 0x00, 0x00, 0x00 },  /* Ì to I */
1322   { 0x00CD,  0x49, 0x00, 0x00, 0x00 },  /* Í to I */
1323   { 0x00CE,  0x49, 0x00, 0x00, 0x00 },  /* Î to I */
1324   { 0x00CF,  0x49, 0x00, 0x00, 0x00 },  /* Ï to I */
1325   { 0x00D0,  0x44, 0x00, 0x00, 0x00 },  /* Ð to D */
1326   { 0x00D1,  0x4E, 0x00, 0x00, 0x00 },  /* Ñ to N */
1327   { 0x00D2,  0x4F, 0x00, 0x00, 0x00 },  /* Ò to O */
1328   { 0x00D3,  0x4F, 0x00, 0x00, 0x00 },  /* Ó to O */
1329   { 0x00D4,  0x4F, 0x00, 0x00, 0x00 },  /* Ô to O */
1330   { 0x00D5,  0x4F, 0x00, 0x00, 0x00 },  /* Õ to O */
1331   { 0x00D6,  0x4F, 0x65, 0x00, 0x00 },  /* Ö to Oe */
1332   { 0x00D7,  0x78, 0x00, 0x00, 0x00 },  /* × to x */
1333   { 0x00D8,  0x4F, 0x00, 0x00, 0x00 },  /* Ø to O */
1334   { 0x00D9,  0x55, 0x00, 0x00, 0x00 },  /* Ù to U */
1335   { 0x00DA,  0x55, 0x00, 0x00, 0x00 },  /* Ú to U */
1336   { 0x00DB,  0x55, 0x00, 0x00, 0x00 },  /* Û to U */
1337   { 0x00DC,  0x55, 0x65, 0x00, 0x00 },  /* Ü to Ue */
1338   { 0x00DD,  0x59, 0x00, 0x00, 0x00 },  /* Ý to Y */
1339   { 0x00DE,  0x54, 0x68, 0x00, 0x00 },  /* Þ to Th */
1340   { 0x00DF,  0x73, 0x73, 0x00, 0x00 },  /* ß to ss */
1341   { 0x00E0,  0x61, 0x00, 0x00, 0x00 },  /* à to a */
1342   { 0x00E1,  0x61, 0x00, 0x00, 0x00 },  /* á to a */
1343   { 0x00E2,  0x61, 0x00, 0x00, 0x00 },  /* â to a */
1344   { 0x00E3,  0x61, 0x00, 0x00, 0x00 },  /* ã to a */
1345   { 0x00E4,  0x61, 0x65, 0x00, 0x00 },  /* ä to ae */
1346   { 0x00E5,  0x61, 0x61, 0x00, 0x00 },  /* å to aa */
1347   { 0x00E6,  0x61, 0x65, 0x00, 0x00 },  /* æ to ae */
1348   { 0x00E7,  0x63, 0x00, 0x00, 0x00 },  /* ç to c */
1349   { 0x00E8,  0x65, 0x00, 0x00, 0x00 },  /* è to e */
1350   { 0x00E9,  0x65, 0x00, 0x00, 0x00 },  /* é to e */
1351   { 0x00EA,  0x65, 0x00, 0x00, 0x00 },  /* ê to e */
1352   { 0x00EB,  0x65, 0x00, 0x00, 0x00 },  /* ë to e */
1353   { 0x00EC,  0x69, 0x00, 0x00, 0x00 },  /* ì to i */
1354   { 0x00ED,  0x69, 0x00, 0x00, 0x00 },  /* í to i */
1355   { 0x00EE,  0x69, 0x00, 0x00, 0x00 },  /* î to i */
1356   { 0x00EF,  0x69, 0x00, 0x00, 0x00 },  /* ï to i */
1357   { 0x00F0,  0x64, 0x00, 0x00, 0x00 },  /* ð to d */
1358   { 0x00F1,  0x6E, 0x00, 0x00, 0x00 },  /* ñ to n */
1359   { 0x00F2,  0x6F, 0x00, 0x00, 0x00 },  /* ò to o */
1360   { 0x00F3,  0x6F, 0x00, 0x00, 0x00 },  /* ó to o */
1361   { 0x00F4,  0x6F, 0x00, 0x00, 0x00 },  /* ô to o */
1362   { 0x00F5,  0x6F, 0x00, 0x00, 0x00 },  /* õ to o */
1363   { 0x00F6,  0x6F, 0x65, 0x00, 0x00 },  /* ö to oe */
1364   { 0x00F7,  0x3A, 0x00, 0x00, 0x00 },  /* ÷ to : */
1365   { 0x00F8,  0x6F, 0x00, 0x00, 0x00 },  /* ø to o */
1366   { 0x00F9,  0x75, 0x00, 0x00, 0x00 },  /* ù to u */
1367   { 0x00FA,  0x75, 0x00, 0x00, 0x00 },  /* ú to u */
1368   { 0x00FB,  0x75, 0x00, 0x00, 0x00 },  /* û to u */
1369   { 0x00FC,  0x75, 0x65, 0x00, 0x00 },  /* ü to ue */
1370   { 0x00FD,  0x79, 0x00, 0x00, 0x00 },  /* ý to y */
1371   { 0x00FE,  0x74, 0x68, 0x00, 0x00 },  /* þ to th */
1372   { 0x00FF,  0x79, 0x00, 0x00, 0x00 },  /* ÿ to y */
1373   { 0x0100,  0x41, 0x00, 0x00, 0x00 },  /* Ā to A */
1374   { 0x0101,  0x61, 0x00, 0x00, 0x00 },  /* ā to a */
1375   { 0x0102,  0x41, 0x00, 0x00, 0x00 },  /* Ă to A */
1376   { 0x0103,  0x61, 0x00, 0x00, 0x00 },  /* ă to a */
1377   { 0x0104,  0x41, 0x00, 0x00, 0x00 },  /* Ą to A */
1378   { 0x0105,  0x61, 0x00, 0x00, 0x00 },  /* ą to a */
1379   { 0x0106,  0x43, 0x00, 0x00, 0x00 },  /* Ć to C */
1380   { 0x0107,  0x63, 0x00, 0x00, 0x00 },  /* ć to c */
1381   { 0x0108,  0x43, 0x68, 0x00, 0x00 },  /* Ĉ to Ch */
1382   { 0x0109,  0x63, 0x68, 0x00, 0x00 },  /* ĉ to ch */
1383   { 0x010A,  0x43, 0x00, 0x00, 0x00 },  /* Ċ to C */
1384   { 0x010B,  0x63, 0x00, 0x00, 0x00 },  /* ċ to c */
1385   { 0x010C,  0x43, 0x00, 0x00, 0x00 },  /* Č to C */
1386   { 0x010D,  0x63, 0x00, 0x00, 0x00 },  /* č to c */
1387   { 0x010E,  0x44, 0x00, 0x00, 0x00 },  /* Ď to D */
1388   { 0x010F,  0x64, 0x00, 0x00, 0x00 },  /* ď to d */
1389   { 0x0110,  0x44, 0x00, 0x00, 0x00 },  /* Đ to D */
1390   { 0x0111,  0x64, 0x00, 0x00, 0x00 },  /* đ to d */
1391   { 0x0112,  0x45, 0x00, 0x00, 0x00 },  /* Ē to E */
1392   { 0x0113,  0x65, 0x00, 0x00, 0x00 },  /* ē to e */
1393   { 0x0114,  0x45, 0x00, 0x00, 0x00 },  /* Ĕ to E */
1394   { 0x0115,  0x65, 0x00, 0x00, 0x00 },  /* ĕ to e */
1395   { 0x0116,  0x45, 0x00, 0x00, 0x00 },  /* Ė to E */
1396   { 0x0117,  0x65, 0x00, 0x00, 0x00 },  /* ė to e */
1397   { 0x0118,  0x45, 0x00, 0x00, 0x00 },  /* Ę to E */
1398   { 0x0119,  0x65, 0x00, 0x00, 0x00 },  /* ę to e */
1399   { 0x011A,  0x45, 0x00, 0x00, 0x00 },  /* Ě to E */
1400   { 0x011B,  0x65, 0x00, 0x00, 0x00 },  /* ě to e */
1401   { 0x011C,  0x47, 0x68, 0x00, 0x00 },  /* Ĝ to Gh */
1402   { 0x011D,  0x67, 0x68, 0x00, 0x00 },  /* ĝ to gh */
1403   { 0x011E,  0x47, 0x00, 0x00, 0x00 },  /* Ğ to G */
1404   { 0x011F,  0x67, 0x00, 0x00, 0x00 },  /* ğ to g */
1405   { 0x0120,  0x47, 0x00, 0x00, 0x00 },  /* Ġ to G */
1406   { 0x0121,  0x67, 0x00, 0x00, 0x00 },  /* ġ to g */
1407   { 0x0122,  0x47, 0x00, 0x00, 0x00 },  /* Ģ to G */
1408   { 0x0123,  0x67, 0x00, 0x00, 0x00 },  /* ģ to g */
1409   { 0x0124,  0x48, 0x68, 0x00, 0x00 },  /* Ĥ to Hh */
1410   { 0x0125,  0x68, 0x68, 0x00, 0x00 },  /* ĥ to hh */
1411   { 0x0126,  0x48, 0x00, 0x00, 0x00 },  /* Ħ to H */
1412   { 0x0127,  0x68, 0x00, 0x00, 0x00 },  /* ħ to h */
1413   { 0x0128,  0x49, 0x00, 0x00, 0x00 },  /* Ĩ to I */
1414   { 0x0129,  0x69, 0x00, 0x00, 0x00 },  /* ĩ to i */
1415   { 0x012A,  0x49, 0x00, 0x00, 0x00 },  /* Ī to I */
1416   { 0x012B,  0x69, 0x00, 0x00, 0x00 },  /* ī to i */
1417   { 0x012C,  0x49, 0x00, 0x00, 0x00 },  /* Ĭ to I */
1418   { 0x012D,  0x69, 0x00, 0x00, 0x00 },  /* ĭ to i */
1419   { 0x012E,  0x49, 0x00, 0x00, 0x00 },  /* Į to I */
1420   { 0x012F,  0x69, 0x00, 0x00, 0x00 },  /* į to i */
1421   { 0x0130,  0x49, 0x00, 0x00, 0x00 },  /* İ to I */
1422   { 0x0131,  0x69, 0x00, 0x00, 0x00 },  /* ı to i */
1423   { 0x0132,  0x49, 0x4A, 0x00, 0x00 },  /* IJ to IJ */
1424   { 0x0133,  0x69, 0x6A, 0x00, 0x00 },  /* ij to ij */
1425   { 0x0134,  0x4A, 0x68, 0x00, 0x00 },  /* Ĵ to Jh */
1426   { 0x0135,  0x6A, 0x68, 0x00, 0x00 },  /* ĵ to jh */
1427   { 0x0136,  0x4B, 0x00, 0x00, 0x00 },  /* Ķ to K */
1428   { 0x0137,  0x6B, 0x00, 0x00, 0x00 },  /* ķ to k */
1429   { 0x0138,  0x6B, 0x00, 0x00, 0x00 },  /* ĸ to k */
1430   { 0x0139,  0x4C, 0x00, 0x00, 0x00 },  /* Ĺ to L */
1431   { 0x013A,  0x6C, 0x00, 0x00, 0x00 },  /* ĺ to l */
1432   { 0x013B,  0x4C, 0x00, 0x00, 0x00 },  /* Ļ to L */
1433   { 0x013C,  0x6C, 0x00, 0x00, 0x00 },  /* ļ to l */
1434   { 0x013D,  0x4C, 0x00, 0x00, 0x00 },  /* Ľ to L */
1435   { 0x013E,  0x6C, 0x00, 0x00, 0x00 },  /* ľ to l */
1436   { 0x013F,  0x4C, 0x2E, 0x00, 0x00 },  /* Ŀ to L. */
1437   { 0x0140,  0x6C, 0x2E, 0x00, 0x00 },  /* ŀ to l. */
1438   { 0x0141,  0x4C, 0x00, 0x00, 0x00 },  /* Ł to L */
1439   { 0x0142,  0x6C, 0x00, 0x00, 0x00 },  /* ł to l */
1440   { 0x0143,  0x4E, 0x00, 0x00, 0x00 },  /* Ń to N */
1441   { 0x0144,  0x6E, 0x00, 0x00, 0x00 },  /* ń to n */
1442   { 0x0145,  0x4E, 0x00, 0x00, 0x00 },  /* Ņ to N */
1443   { 0x0146,  0x6E, 0x00, 0x00, 0x00 },  /* ņ to n */
1444   { 0x0147,  0x4E, 0x00, 0x00, 0x00 },  /* Ň to N */
1445   { 0x0148,  0x6E, 0x00, 0x00, 0x00 },  /* ň to n */
1446   { 0x0149,  0x27, 0x6E, 0x00, 0x00 },  /* ʼn to 'n */
1447   { 0x014A,  0x4E, 0x47, 0x00, 0x00 },  /* Ŋ to NG */
1448   { 0x014B,  0x6E, 0x67, 0x00, 0x00 },  /* ŋ to ng */
1449   { 0x014C,  0x4F, 0x00, 0x00, 0x00 },  /* Ō to O */
1450   { 0x014D,  0x6F, 0x00, 0x00, 0x00 },  /* ō to o */
1451   { 0x014E,  0x4F, 0x00, 0x00, 0x00 },  /* Ŏ to O */
1452   { 0x014F,  0x6F, 0x00, 0x00, 0x00 },  /* ŏ to o */
1453   { 0x0150,  0x4F, 0x00, 0x00, 0x00 },  /* Ő to O */
1454   { 0x0151,  0x6F, 0x00, 0x00, 0x00 },  /* ő to o */
1455   { 0x0152,  0x4F, 0x45, 0x00, 0x00 },  /* Œ to OE */
1456   { 0x0153,  0x6F, 0x65, 0x00, 0x00 },  /* œ to oe */
1457   { 0x0154,  0x52, 0x00, 0x00, 0x00 },  /* Ŕ to R */
1458   { 0x0155,  0x72, 0x00, 0x00, 0x00 },  /* ŕ to r */
1459   { 0x0156,  0x52, 0x00, 0x00, 0x00 },  /* Ŗ to R */
1460   { 0x0157,  0x72, 0x00, 0x00, 0x00 },  /* ŗ to r */
1461   { 0x0158,  0x52, 0x00, 0x00, 0x00 },  /* Ř to R */
1462   { 0x0159,  0x72, 0x00, 0x00, 0x00 },  /* ř to r */
1463   { 0x015A,  0x53, 0x00, 0x00, 0x00 },  /* Ś to S */
1464   { 0x015B,  0x73, 0x00, 0x00, 0x00 },  /* ś to s */
1465   { 0x015C,  0x53, 0x68, 0x00, 0x00 },  /* Ŝ to Sh */
1466   { 0x015D,  0x73, 0x68, 0x00, 0x00 },  /* ŝ to sh */
1467   { 0x015E,  0x53, 0x00, 0x00, 0x00 },  /* Ş to S */
1468   { 0x015F,  0x73, 0x00, 0x00, 0x00 },  /* ş to s */
1469   { 0x0160,  0x53, 0x00, 0x00, 0x00 },  /* Š to S */
1470   { 0x0161,  0x73, 0x00, 0x00, 0x00 },  /* š to s */
1471   { 0x0162,  0x54, 0x00, 0x00, 0x00 },  /* Ţ to T */
1472   { 0x0163,  0x74, 0x00, 0x00, 0x00 },  /* ţ to t */
1473   { 0x0164,  0x54, 0x00, 0x00, 0x00 },  /* Ť to T */
1474   { 0x0165,  0x74, 0x00, 0x00, 0x00 },  /* ť to t */
1475   { 0x0166,  0x54, 0x00, 0x00, 0x00 },  /* Ŧ to T */
1476   { 0x0167,  0x74, 0x00, 0x00, 0x00 },  /* ŧ to t */
1477   { 0x0168,  0x55, 0x00, 0x00, 0x00 },  /* Ũ to U */
1478   { 0x0169,  0x75, 0x00, 0x00, 0x00 },  /* ũ to u */
1479   { 0x016A,  0x55, 0x00, 0x00, 0x00 },  /* Ū to U */
1480   { 0x016B,  0x75, 0x00, 0x00, 0x00 },  /* ū to u */
1481   { 0x016C,  0x55, 0x00, 0x00, 0x00 },  /* Ŭ to U */
1482   { 0x016D,  0x75, 0x00, 0x00, 0x00 },  /* ŭ to u */
1483   { 0x016E,  0x55, 0x00, 0x00, 0x00 },  /* Ů to U */
1484   { 0x016F,  0x75, 0x00, 0x00, 0x00 },  /* ů to u */
1485   { 0x0170,  0x55, 0x00, 0x00, 0x00 },  /* Ű to U */
1486   { 0x0171,  0x75, 0x00, 0x00, 0x00 },  /* ű to u */
1487   { 0x0172,  0x55, 0x00, 0x00, 0x00 },  /* Ų to U */
1488   { 0x0173,  0x75, 0x00, 0x00, 0x00 },  /* ų to u */
1489   { 0x0174,  0x57, 0x00, 0x00, 0x00 },  /* Ŵ to W */
1490   { 0x0175,  0x77, 0x00, 0x00, 0x00 },  /* ŵ to w */
1491   { 0x0176,  0x59, 0x00, 0x00, 0x00 },  /* Ŷ to Y */
1492   { 0x0177,  0x79, 0x00, 0x00, 0x00 },  /* ŷ to y */
1493   { 0x0178,  0x59, 0x00, 0x00, 0x00 },  /* Ÿ to Y */
1494   { 0x0179,  0x5A, 0x00, 0x00, 0x00 },  /* Ź to Z */
1495   { 0x017A,  0x7A, 0x00, 0x00, 0x00 },  /* ź to z */
1496   { 0x017B,  0x5A, 0x00, 0x00, 0x00 },  /* Ż to Z */
1497   { 0x017C,  0x7A, 0x00, 0x00, 0x00 },  /* ż to z */
1498   { 0x017D,  0x5A, 0x00, 0x00, 0x00 },  /* Ž to Z */
1499   { 0x017E,  0x7A, 0x00, 0x00, 0x00 },  /* ž to z */
1500   { 0x017F,  0x73, 0x00, 0x00, 0x00 },  /* ſ to s */
1501   { 0x0192,  0x66, 0x00, 0x00, 0x00 },  /* ƒ to f */
1502   { 0x0218,  0x53, 0x00, 0x00, 0x00 },  /* Ș to S */
1503   { 0x0219,  0x73, 0x00, 0x00, 0x00 },  /* ș to s */
1504   { 0x021A,  0x54, 0x00, 0x00, 0x00 },  /* Ț to T */
1505   { 0x021B,  0x74, 0x00, 0x00, 0x00 },  /* ț to t */
1506   { 0x0386,  0x41, 0x00, 0x00, 0x00 },  /* Ά to A */
1507   { 0x0388,  0x45, 0x00, 0x00, 0x00 },  /* Έ to E */
1508   { 0x0389,  0x49, 0x00, 0x00, 0x00 },  /* Ή to I */
1509   { 0x038A,  0x49, 0x00, 0x00, 0x00 },  /* Ί to I */
1510   { 0x038C,  0x4f, 0x00, 0x00, 0x00 },  /* Ό to O */
1511   { 0x038E,  0x59, 0x00, 0x00, 0x00 },  /* Ύ to Y */
1512   { 0x038F,  0x4f, 0x00, 0x00, 0x00 },  /* Ώ to O */
1513   { 0x0390,  0x69, 0x00, 0x00, 0x00 },  /* ΐ to i */
1514   { 0x0391,  0x41, 0x00, 0x00, 0x00 },  /* Α to A */
1515   { 0x0392,  0x42, 0x00, 0x00, 0x00 },  /* Β to B */
1516   { 0x0393,  0x47, 0x00, 0x00, 0x00 },  /* Γ to G */
1517   { 0x0394,  0x44, 0x00, 0x00, 0x00 },  /* Δ to D */
1518   { 0x0395,  0x45, 0x00, 0x00, 0x00 },  /* Ε to E */
1519   { 0x0396,  0x5a, 0x00, 0x00, 0x00 },  /* Ζ to Z */
1520   { 0x0397,  0x49, 0x00, 0x00, 0x00 },  /* Η to I */
1521   { 0x0398,  0x54, 0x68, 0x00, 0x00 },  /* Θ to Th */
1522   { 0x0399,  0x49, 0x00, 0x00, 0x00 },  /* Ι to I */
1523   { 0x039A,  0x4b, 0x00, 0x00, 0x00 },  /* Κ to K */
1524   { 0x039B,  0x4c, 0x00, 0x00, 0x00 },  /* Λ to L */
1525   { 0x039C,  0x4d, 0x00, 0x00, 0x00 },  /* Μ to M */
1526   { 0x039D,  0x4e, 0x00, 0x00, 0x00 },  /* Ν to N */
1527   { 0x039E,  0x58, 0x00, 0x00, 0x00 },  /* Ξ to X */
1528   { 0x039F,  0x4f, 0x00, 0x00, 0x00 },  /* Ο to O */
1529   { 0x03A0,  0x50, 0x00, 0x00, 0x00 },  /* Π to P */
1530   { 0x03A1,  0x52, 0x00, 0x00, 0x00 },  /* Ρ to R */
1531   { 0x03A3,  0x53, 0x00, 0x00, 0x00 },  /* Σ to S */
1532   { 0x03A4,  0x54, 0x00, 0x00, 0x00 },  /* Τ to T */
1533   { 0x03A5,  0x59, 0x00, 0x00, 0x00 },  /* Υ to Y */
1534   { 0x03A6,  0x46, 0x00, 0x00, 0x00 },  /* Φ to F */
1535   { 0x03A7,  0x43, 0x68, 0x00, 0x00 },  /* Χ to Ch */
1536   { 0x03A8,  0x50, 0x73, 0x00, 0x00 },  /* Ψ to Ps */
1537   { 0x03A9,  0x4f, 0x00, 0x00, 0x00 },  /* Ω to O */
1538   { 0x03AA,  0x49, 0x00, 0x00, 0x00 },  /* Ϊ to I */
1539   { 0x03AB,  0x59, 0x00, 0x00, 0x00 },  /* Ϋ to Y */
1540   { 0x03AC,  0x61, 0x00, 0x00, 0x00 },  /* ά to a */
1541   { 0x03AD,  0x65, 0x00, 0x00, 0x00 },  /* έ to e */
1542   { 0x03AE,  0x69, 0x00, 0x00, 0x00 },  /* ή to i */
1543   { 0x03AF,  0x69, 0x00, 0x00, 0x00 },  /* ί to i */
1544   { 0x03B1,  0x61, 0x00, 0x00, 0x00 },  /* α to a */
1545   { 0x03B2,  0x62, 0x00, 0x00, 0x00 },  /* β to b */
1546   { 0x03B3,  0x67, 0x00, 0x00, 0x00 },  /* γ to g */
1547   { 0x03B4,  0x64, 0x00, 0x00, 0x00 },  /* δ to d */
1548   { 0x03B5,  0x65, 0x00, 0x00, 0x00 },  /* ε to e */
1549   { 0x03B6,  0x7a, 0x00, 0x00, 0x00 },  /* ζ to z */
1550   { 0x03B7,  0x69, 0x00, 0x00, 0x00 },  /* η to i */
1551   { 0x03B8,  0x74, 0x68, 0x00, 0x00 },  /* θ to th */
1552   { 0x03B9,  0x69, 0x00, 0x00, 0x00 },  /* ι to i */
1553   { 0x03BA,  0x6b, 0x00, 0x00, 0x00 },  /* κ to k */
1554   { 0x03BB,  0x6c, 0x00, 0x00, 0x00 },  /* λ to l */
1555   { 0x03BC,  0x6d, 0x00, 0x00, 0x00 },  /* μ to m */
1556   { 0x03BD,  0x6e, 0x00, 0x00, 0x00 },  /* ν to n */
1557   { 0x03BE,  0x78, 0x00, 0x00, 0x00 },  /* ξ to x */
1558   { 0x03BF,  0x6f, 0x00, 0x00, 0x00 },  /* ο to o */
1559   { 0x03C0,  0x70, 0x00, 0x00, 0x00 },  /* π to p */
1560   { 0x03C1,  0x72, 0x00, 0x00, 0x00 },  /* ρ to r */
1561   { 0x03C3,  0x73, 0x00, 0x00, 0x00 },  /* σ to s */
1562   { 0x03C4,  0x74, 0x00, 0x00, 0x00 },  /* τ to t */
1563   { 0x03C5,  0x79, 0x00, 0x00, 0x00 },  /* υ to y */
1564   { 0x03C6,  0x66, 0x00, 0x00, 0x00 },  /* φ to f */
1565   { 0x03C7,  0x63, 0x68, 0x00, 0x00 },  /* χ to ch */
1566   { 0x03C8,  0x70, 0x73, 0x00, 0x00 },  /* ψ to ps */
1567   { 0x03C9,  0x6f, 0x00, 0x00, 0x00 },  /* ω to o */
1568   { 0x03CA,  0x69, 0x00, 0x00, 0x00 },  /* ϊ to i */
1569   { 0x03CB,  0x79, 0x00, 0x00, 0x00 },  /* ϋ to y */
1570   { 0x03CC,  0x6f, 0x00, 0x00, 0x00 },  /* ό to o */
1571   { 0x03CD,  0x79, 0x00, 0x00, 0x00 },  /* ύ to y */
1572   { 0x03CE,  0x69, 0x00, 0x00, 0x00 },  /* ώ to i */
1573   { 0x0400,  0x45, 0x00, 0x00, 0x00 },  /* Ѐ to E */
1574   { 0x0401,  0x45, 0x00, 0x00, 0x00 },  /* Ё to E */
1575   { 0x0402,  0x44, 0x00, 0x00, 0x00 },  /* Ђ to D */
1576   { 0x0403,  0x47, 0x00, 0x00, 0x00 },  /* Ѓ to G */
1577   { 0x0404,  0x45, 0x00, 0x00, 0x00 },  /* Є to E */
1578   { 0x0405,  0x5a, 0x00, 0x00, 0x00 },  /* Ѕ to Z */
1579   { 0x0406,  0x49, 0x00, 0x00, 0x00 },  /* І to I */
1580   { 0x0407,  0x49, 0x00, 0x00, 0x00 },  /* Ї to I */
1581   { 0x0408,  0x4a, 0x00, 0x00, 0x00 },  /* Ј to J */
1582   { 0x0409,  0x49, 0x00, 0x00, 0x00 },  /* Љ to I */
1583   { 0x040A,  0x4e, 0x00, 0x00, 0x00 },  /* Њ to N */
1584   { 0x040B,  0x44, 0x00, 0x00, 0x00 },  /* Ћ to D */
1585   { 0x040C,  0x4b, 0x00, 0x00, 0x00 },  /* Ќ to K */
1586   { 0x040D,  0x49, 0x00, 0x00, 0x00 },  /* Ѝ to I */
1587   { 0x040E,  0x55, 0x00, 0x00, 0x00 },  /* Ў to U */
1588   { 0x040F,  0x44, 0x00, 0x00, 0x00 },  /* Џ to D */
1589   { 0x0410,  0x41, 0x00, 0x00, 0x00 },  /* А to A */
1590   { 0x0411,  0x42, 0x00, 0x00, 0x00 },  /* Б to B */
1591   { 0x0412,  0x56, 0x00, 0x00, 0x00 },  /* В to V */
1592   { 0x0413,  0x47, 0x00, 0x00, 0x00 },  /* Г to G */
1593   { 0x0414,  0x44, 0x00, 0x00, 0x00 },  /* Д to D */
1594   { 0x0415,  0x45, 0x00, 0x00, 0x00 },  /* Е to E */
1595   { 0x0416,  0x5a, 0x68, 0x00, 0x00 },  /* Ж to Zh */
1596   { 0x0417,  0x5a, 0x00, 0x00, 0x00 },  /* З to Z */
1597   { 0x0418,  0x49, 0x00, 0x00, 0x00 },  /* И to I */
1598   { 0x0419,  0x49, 0x00, 0x00, 0x00 },  /* Й to I */
1599   { 0x041A,  0x4b, 0x00, 0x00, 0x00 },  /* К to K */
1600   { 0x041B,  0x4c, 0x00, 0x00, 0x00 },  /* Л to L */
1601   { 0x041C,  0x4d, 0x00, 0x00, 0x00 },  /* М to M */
1602   { 0x041D,  0x4e, 0x00, 0x00, 0x00 },  /* Н to N */
1603   { 0x041E,  0x4f, 0x00, 0x00, 0x00 },  /* О to O */
1604   { 0x041F,  0x50, 0x00, 0x00, 0x00 },  /* П to P */
1605   { 0x0420,  0x52, 0x00, 0x00, 0x00 },  /* Р to R */
1606   { 0x0421,  0x53, 0x00, 0x00, 0x00 },  /* С to S */
1607   { 0x0422,  0x54, 0x00, 0x00, 0x00 },  /* Т to T */
1608   { 0x0423,  0x55, 0x00, 0x00, 0x00 },  /* У to U */
1609   { 0x0424,  0x46, 0x00, 0x00, 0x00 },  /* Ф to F */
1610   { 0x0425,  0x4b, 0x68, 0x00, 0x00 },  /* Х to Kh */
1611   { 0x0426,  0x54, 0x63, 0x00, 0x00 },  /* Ц to Tc */
1612   { 0x0427,  0x43, 0x68, 0x00, 0x00 },  /* Ч to Ch */
1613   { 0x0428,  0x53, 0x68, 0x00, 0x00 },  /* Ш to Sh */
1614   { 0x0429,  0x53, 0x68, 0x63, 0x68 },  /* Щ to Shch */
1615   { 0x042A,  0x61, 0x00, 0x00, 0x00 },  /*  to A */
1616   { 0x042B,  0x59, 0x00, 0x00, 0x00 },  /* Ы to Y */
1617   { 0x042C,  0x59, 0x00, 0x00, 0x00 },  /*  to Y */
1618   { 0x042D,  0x45, 0x00, 0x00, 0x00 },  /* Э to E */
1619   { 0x042E,  0x49, 0x75, 0x00, 0x00 },  /* Ю to Iu */
1620   { 0x042F,  0x49, 0x61, 0x00, 0x00 },  /* Я to Ia */
1621   { 0x0430,  0x61, 0x00, 0x00, 0x00 },  /* а to a */
1622   { 0x0431,  0x62, 0x00, 0x00, 0x00 },  /* б to b */
1623   { 0x0432,  0x76, 0x00, 0x00, 0x00 },  /* в to v */
1624   { 0x0433,  0x67, 0x00, 0x00, 0x00 },  /* г to g */
1625   { 0x0434,  0x64, 0x00, 0x00, 0x00 },  /* д to d */
1626   { 0x0435,  0x65, 0x00, 0x00, 0x00 },  /* е to e */
1627   { 0x0436,  0x7a, 0x68, 0x00, 0x00 },  /* ж to zh */
1628   { 0x0437,  0x7a, 0x00, 0x00, 0x00 },  /* з to z */
1629   { 0x0438,  0x69, 0x00, 0x00, 0x00 },  /* и to i */
1630   { 0x0439,  0x69, 0x00, 0x00, 0x00 },  /* й to i */
1631   { 0x043A,  0x6b, 0x00, 0x00, 0x00 },  /* к to k */
1632   { 0x043B,  0x6c, 0x00, 0x00, 0x00 },  /* л to l */
1633   { 0x043C,  0x6d, 0x00, 0x00, 0x00 },  /* м to m */
1634   { 0x043D,  0x6e, 0x00, 0x00, 0x00 },  /* н to n */
1635   { 0x043E,  0x6f, 0x00, 0x00, 0x00 },  /* о to o */
1636   { 0x043F,  0x70, 0x00, 0x00, 0x00 },  /* п to p */
1637   { 0x0440,  0x72, 0x00, 0x00, 0x00 },  /* р to r */
1638   { 0x0441,  0x73, 0x00, 0x00, 0x00 },  /* с to s */
1639   { 0x0442,  0x74, 0x00, 0x00, 0x00 },  /* т to t */
1640   { 0x0443,  0x75, 0x00, 0x00, 0x00 },  /* у to u */
1641   { 0x0444,  0x66, 0x00, 0x00, 0x00 },  /* ф to f */
1642   { 0x0445,  0x6b, 0x68, 0x00, 0x00 },  /* х to kh */
1643   { 0x0446,  0x74, 0x63, 0x00, 0x00 },  /* ц to tc */
1644   { 0x0447,  0x63, 0x68, 0x00, 0x00 },  /* ч to ch */
1645   { 0x0448,  0x73, 0x68, 0x00, 0x00 },  /* ш to sh */
1646   { 0x0449,  0x73, 0x68, 0x63, 0x68 },  /* щ to shch */
1647   { 0x044A,  0x61, 0x00, 0x00, 0x00 },  /*  to a */
1648   { 0x044B,  0x79, 0x00, 0x00, 0x00 },  /* ы to y */
1649   { 0x044C,  0x79, 0x00, 0x00, 0x00 },  /*  to y */
1650   { 0x044D,  0x65, 0x00, 0x00, 0x00 },  /* э to e */
1651   { 0x044E,  0x69, 0x75, 0x00, 0x00 },  /* ю to iu */
1652   { 0x044F,  0x69, 0x61, 0x00, 0x00 },  /* я to ia */
1653   { 0x0450,  0x65, 0x00, 0x00, 0x00 },  /* ѐ to e */
1654   { 0x0451,  0x65, 0x00, 0x00, 0x00 },  /* ё to e */
1655   { 0x0452,  0x64, 0x00, 0x00, 0x00 },  /* ђ to d */
1656   { 0x0453,  0x67, 0x00, 0x00, 0x00 },  /* ѓ to g */
1657   { 0x0454,  0x65, 0x00, 0x00, 0x00 },  /* є to e */
1658   { 0x0455,  0x7a, 0x00, 0x00, 0x00 },  /* ѕ to z */
1659   { 0x0456,  0x69, 0x00, 0x00, 0x00 },  /* і to i */
1660   { 0x0457,  0x69, 0x00, 0x00, 0x00 },  /* ї to i */
1661   { 0x0458,  0x6a, 0x00, 0x00, 0x00 },  /* ј to j */
1662   { 0x0459,  0x69, 0x00, 0x00, 0x00 },  /* љ to i */
1663   { 0x045A,  0x6e, 0x00, 0x00, 0x00 },  /* њ to n */
1664   { 0x045B,  0x64, 0x00, 0x00, 0x00 },  /* ћ to d */
1665   { 0x045C,  0x6b, 0x00, 0x00, 0x00 },  /* ќ to k */
1666   { 0x045D,  0x69, 0x00, 0x00, 0x00 },  /* ѝ to i */
1667   { 0x045E,  0x75, 0x00, 0x00, 0x00 },  /* ў to u */
1668   { 0x045F,  0x64, 0x00, 0x00, 0x00 },  /* џ to d */
1669   { 0x1E02,  0x42, 0x00, 0x00, 0x00 },  /* Ḃ to B */
1670   { 0x1E03,  0x62, 0x00, 0x00, 0x00 },  /* ḃ to b */
1671   { 0x1E0A,  0x44, 0x00, 0x00, 0x00 },  /* Ḋ to D */
1672   { 0x1E0B,  0x64, 0x00, 0x00, 0x00 },  /* ḋ to d */
1673   { 0x1E1E,  0x46, 0x00, 0x00, 0x00 },  /* Ḟ to F */
1674   { 0x1E1F,  0x66, 0x00, 0x00, 0x00 },  /* ḟ to f */
1675   { 0x1E40,  0x4D, 0x00, 0x00, 0x00 },  /* Ṁ to M */
1676   { 0x1E41,  0x6D, 0x00, 0x00, 0x00 },  /* ṁ to m */
1677   { 0x1E56,  0x50, 0x00, 0x00, 0x00 },  /* Ṗ to P */
1678   { 0x1E57,  0x70, 0x00, 0x00, 0x00 },  /* ṗ to p */
1679   { 0x1E60,  0x53, 0x00, 0x00, 0x00 },  /* Ṡ to S */
1680   { 0x1E61,  0x73, 0x00, 0x00, 0x00 },  /* ṡ to s */
1681   { 0x1E6A,  0x54, 0x00, 0x00, 0x00 },  /* Ṫ to T */
1682   { 0x1E6B,  0x74, 0x00, 0x00, 0x00 },  /* ṫ to t */
1683   { 0x1E80,  0x57, 0x00, 0x00, 0x00 },  /* Ẁ to W */
1684   { 0x1E81,  0x77, 0x00, 0x00, 0x00 },  /* ẁ to w */
1685   { 0x1E82,  0x57, 0x00, 0x00, 0x00 },  /* Ẃ to W */
1686   { 0x1E83,  0x77, 0x00, 0x00, 0x00 },  /* ẃ to w */
1687   { 0x1E84,  0x57, 0x00, 0x00, 0x00 },  /* Ẅ to W */
1688   { 0x1E85,  0x77, 0x00, 0x00, 0x00 },  /* ẅ to w */
1689   { 0x1EF2,  0x59, 0x00, 0x00, 0x00 },  /* Ỳ to Y */
1690   { 0x1EF3,  0x79, 0x00, 0x00, 0x00 },  /* ỳ to y */
1691   { 0xFB00,  0x66, 0x66, 0x00, 0x00 },  /* ff to ff */
1692   { 0xFB01,  0x66, 0x69, 0x00, 0x00 },  /* fi to fi */
1693   { 0xFB02,  0x66, 0x6C, 0x00, 0x00 },  /* fl to fl */
1694   { 0xFB05,  0x73, 0x74, 0x00, 0x00 },  /* ſt to st */
1695   { 0xFB06,  0x73, 0x74, 0x00, 0x00 },  /* st to st */
1696 };
1697 
spellfixFindTranslit(int c,int * pxTop)1698 static const Transliteration *spellfixFindTranslit(int c, int *pxTop){
1699   *pxTop = (sizeof(translit)/sizeof(translit[0])) - 1;
1700   return translit;
1701 }
1702 
1703 /*
1704 ** Convert the input string from UTF-8 into pure ASCII by converting
1705 ** all non-ASCII characters to some combination of characters in the
1706 ** ASCII subset.
1707 **
1708 ** The returned string might contain more characters than the input.
1709 **
1710 ** Space to hold the returned string comes from sqlite3_malloc() and
1711 ** should be freed by the caller.
1712 */
transliterate(const unsigned char * zIn,int nIn)1713 static unsigned char *transliterate(const unsigned char *zIn, int nIn){
1714 #ifdef SQLITE_SPELLFIX_5BYTE_MAPPINGS
1715   unsigned char *zOut = sqlite3_malloc64( nIn*5 + 1 );
1716 #else
1717   unsigned char *zOut = sqlite3_malloc64( nIn*4 + 1 );
1718 #endif
1719   int c, sz, nOut;
1720   if( zOut==0 ) return 0;
1721   nOut = 0;
1722   while( nIn>0 ){
1723     c = utf8Read(zIn, nIn, &sz);
1724     zIn += sz;
1725     nIn -= sz;
1726     if( c<=127 ){
1727       zOut[nOut++] = (unsigned char)c;
1728     }else{
1729       int xTop, xBtm, x;
1730       const Transliteration *tbl = spellfixFindTranslit(c, &xTop);
1731       xBtm = 0;
1732       while( xTop>=xBtm ){
1733         x = (xTop + xBtm)/2;
1734         if( tbl[x].cFrom==c ){
1735           zOut[nOut++] = tbl[x].cTo0;
1736           if( tbl[x].cTo1 ){
1737             zOut[nOut++] = tbl[x].cTo1;
1738             if( tbl[x].cTo2 ){
1739               zOut[nOut++] = tbl[x].cTo2;
1740               if( tbl[x].cTo3 ){
1741                 zOut[nOut++] = tbl[x].cTo3;
1742 #ifdef SQLITE_SPELLFIX_5BYTE_MAPPINGS
1743                 if( tbl[x].cTo4 ){
1744                   zOut[nOut++] = tbl[x].cTo4;
1745                 }
1746 #endif /* SQLITE_SPELLFIX_5BYTE_MAPPINGS */
1747               }
1748             }
1749           }
1750           c = 0;
1751           break;
1752         }else if( tbl[x].cFrom>c ){
1753           xTop = x-1;
1754         }else{
1755           xBtm = x+1;
1756         }
1757       }
1758       if( c ) zOut[nOut++] = '?';
1759     }
1760   }
1761   zOut[nOut] = 0;
1762   return zOut;
1763 }
1764 
1765 /*
1766 ** Return the number of characters in the shortest prefix of the input
1767 ** string that transliterates to an ASCII string nTrans bytes or longer.
1768 ** Or, if the transliteration of the input string is less than nTrans
1769 ** bytes in size, return the number of characters in the input string.
1770 */
translen_to_charlen(const char * zIn,int nIn,int nTrans)1771 static int translen_to_charlen(const char *zIn, int nIn, int nTrans){
1772   int i, c, sz, nOut;
1773   int nChar;
1774 
1775   i = nOut = 0;
1776   for(nChar=0; i<nIn && nOut<nTrans; nChar++){
1777     c = utf8Read((const unsigned char *)&zIn[i], nIn-i, &sz);
1778     i += sz;
1779 
1780     nOut++;
1781     if( c>=128 ){
1782       int xTop, xBtm, x;
1783       const Transliteration *tbl = spellfixFindTranslit(c, &xTop);
1784       xBtm = 0;
1785       while( xTop>=xBtm ){
1786         x = (xTop + xBtm)/2;
1787         if( tbl[x].cFrom==c ){
1788           if( tbl[x].cTo1 ){
1789             nOut++;
1790             if( tbl[x].cTo2 ){
1791               nOut++;
1792               if( tbl[x].cTo3 ){
1793                 nOut++;
1794               }
1795             }
1796           }
1797           break;
1798         }else if( tbl[x].cFrom>c ){
1799           xTop = x-1;
1800         }else{
1801           xBtm = x+1;
1802         }
1803       }
1804     }
1805   }
1806 
1807   return nChar;
1808 }
1809 
1810 
1811 /*
1812 **    spellfix1_translit(X)
1813 **
1814 ** Convert a string that contains non-ASCII Roman characters into
1815 ** pure ASCII.
1816 */
transliterateSqlFunc(sqlite3_context * context,int argc,sqlite3_value ** argv)1817 static void transliterateSqlFunc(
1818   sqlite3_context *context,
1819   int argc,
1820   sqlite3_value **argv
1821 ){
1822   const unsigned char *zIn = sqlite3_value_text(argv[0]);
1823   int nIn = sqlite3_value_bytes(argv[0]);
1824   unsigned char *zOut = transliterate(zIn, nIn);
1825   if( zOut==0 ){
1826     sqlite3_result_error_nomem(context);
1827   }else{
1828     sqlite3_result_text(context, (char*)zOut, -1, sqlite3_free);
1829   }
1830 }
1831 
1832 /*
1833 **    spellfix1_scriptcode(X)
1834 **
1835 ** Try to determine the dominant script used by the word X and return
1836 ** its ISO 15924 numeric code.
1837 **
1838 ** The current implementation only understands the following scripts:
1839 **
1840 **    215  (Latin)
1841 **    220  (Cyrillic)
1842 **    200  (Greek)
1843 **
1844 ** This routine will return 998 if the input X contains characters from
1845 ** two or more of the above scripts or 999 if X contains no characters
1846 ** from any of the above scripts.
1847 */
scriptCodeSqlFunc(sqlite3_context * context,int argc,sqlite3_value ** argv)1848 static void scriptCodeSqlFunc(
1849   sqlite3_context *context,
1850   int argc,
1851   sqlite3_value **argv
1852 ){
1853   const unsigned char *zIn = sqlite3_value_text(argv[0]);
1854   int nIn = sqlite3_value_bytes(argv[0]);
1855   int c, sz;
1856   int scriptMask = 0;
1857   int res;
1858   int seenDigit = 0;
1859 # define SCRIPT_LATIN       0x0001
1860 # define SCRIPT_CYRILLIC    0x0002
1861 # define SCRIPT_GREEK       0x0004
1862 # define SCRIPT_HEBREW      0x0008
1863 # define SCRIPT_ARABIC      0x0010
1864 
1865   while( nIn>0 ){
1866     c = utf8Read(zIn, nIn, &sz);
1867     zIn += sz;
1868     nIn -= sz;
1869     if( c<0x02af ){
1870       if( c>=0x80 || midClass[c&0x7f]<CCLASS_DIGIT ){
1871         scriptMask |= SCRIPT_LATIN;
1872       }else if( c>='0' && c<='9' ){
1873         seenDigit = 1;
1874       }
1875     }else if( c>=0x0400 && c<=0x04ff ){
1876       scriptMask |= SCRIPT_CYRILLIC;
1877     }else if( c>=0x0386 && c<=0x03ce ){
1878       scriptMask |= SCRIPT_GREEK;
1879     }else if( c>=0x0590 && c<=0x05ff ){
1880       scriptMask |= SCRIPT_HEBREW;
1881     }else if( c>=0x0600 && c<=0x06ff ){
1882       scriptMask |= SCRIPT_ARABIC;
1883     }
1884   }
1885   if( scriptMask==0 && seenDigit ) scriptMask = SCRIPT_LATIN;
1886   switch( scriptMask ){
1887     case 0:                res = 999; break;
1888     case SCRIPT_LATIN:     res = 215; break;
1889     case SCRIPT_CYRILLIC:  res = 220; break;
1890     case SCRIPT_GREEK:     res = 200; break;
1891     case SCRIPT_HEBREW:    res = 125; break;
1892     case SCRIPT_ARABIC:    res = 160; break;
1893     default:               res = 998; break;
1894   }
1895   sqlite3_result_int(context, res);
1896 }
1897 
1898 /* End transliterate
1899 ******************************************************************************
1900 ******************************************************************************
1901 ** Begin spellfix1 virtual table.
1902 */
1903 
1904 /* Maximum length of a phonehash used for querying the shadow table */
1905 #define SPELLFIX_MX_HASH  32
1906 
1907 /* Maximum number of hash strings to examine per query */
1908 #define SPELLFIX_MX_RUN   1
1909 
1910 typedef struct spellfix1_vtab spellfix1_vtab;
1911 typedef struct spellfix1_cursor spellfix1_cursor;
1912 
1913 /* Fuzzy-search virtual table object */
1914 struct spellfix1_vtab {
1915   sqlite3_vtab base;         /* Base class - must be first */
1916   sqlite3 *db;               /* Database connection */
1917   char *zDbName;             /* Name of database holding this table */
1918   char *zTableName;          /* Name of the virtual table */
1919   char *zCostTable;          /* Table holding edit-distance cost numbers */
1920   EditDist3Config *pConfig3; /* Parsed edit distance costs */
1921 };
1922 
1923 /* Fuzzy-search cursor object */
1924 struct spellfix1_cursor {
1925   sqlite3_vtab_cursor base;    /* Base class - must be first */
1926   spellfix1_vtab *pVTab;       /* The table to which this cursor belongs */
1927   char *zPattern;              /* rhs of MATCH clause */
1928   int idxNum;                  /* idxNum value passed to xFilter() */
1929   int nRow;                    /* Number of rows of content */
1930   int nAlloc;                  /* Number of allocated rows */
1931   int iRow;                    /* Current row of content */
1932   int iLang;                   /* Value of the langid= constraint */
1933   int iTop;                    /* Value of the top= constraint */
1934   int iScope;                  /* Value of the scope= constraint */
1935   int nSearch;                 /* Number of vocabulary items checked */
1936   sqlite3_stmt *pFullScan;     /* Shadow query for a full table scan */
1937   struct spellfix1_row {       /* For each row of content */
1938     sqlite3_int64 iRowid;         /* Rowid for this row */
1939     char *zWord;                  /* Text for this row */
1940     int iRank;                    /* Rank for this row */
1941     int iDistance;                /* Distance from pattern for this row */
1942     int iScore;                   /* Score for sorting */
1943     int iMatchlen;                /* Value of matchlen column (or -1) */
1944     char zHash[SPELLFIX_MX_HASH]; /* the phonehash used for this match */
1945   } *a;
1946 };
1947 
1948 /*
1949 ** Construct one or more SQL statements from the format string given
1950 ** and then evaluate those statements. The success code is written
1951 ** into *pRc.
1952 **
1953 ** If *pRc is initially non-zero then this routine is a no-op.
1954 */
spellfix1DbExec(int * pRc,sqlite3 * db,const char * zFormat,...)1955 static void spellfix1DbExec(
1956   int *pRc,              /* Success code */
1957   sqlite3 *db,           /* Database in which to run SQL */
1958   const char *zFormat,   /* Format string for SQL */
1959   ...                    /* Arguments to the format string */
1960 ){
1961   va_list ap;
1962   char *zSql;
1963   if( *pRc ) return;
1964   va_start(ap, zFormat);
1965   zSql = sqlite3_vmprintf(zFormat, ap);
1966   va_end(ap);
1967   if( zSql==0 ){
1968     *pRc = SQLITE_NOMEM;
1969   }else{
1970     *pRc = sqlite3_exec(db, zSql, 0, 0, 0);
1971     sqlite3_free(zSql);
1972   }
1973 }
1974 
1975 /*
1976 ** xDisconnect/xDestroy method for the fuzzy-search module.
1977 */
spellfix1Uninit(int isDestroy,sqlite3_vtab * pVTab)1978 static int spellfix1Uninit(int isDestroy, sqlite3_vtab *pVTab){
1979   spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
1980   int rc = SQLITE_OK;
1981   if( isDestroy ){
1982     sqlite3 *db = p->db;
1983     spellfix1DbExec(&rc, db, "DROP TABLE IF EXISTS \"%w\".\"%w_vocab\"",
1984                   p->zDbName, p->zTableName);
1985   }
1986   if( rc==SQLITE_OK ){
1987     sqlite3_free(p->zTableName);
1988     editDist3ConfigDelete(p->pConfig3);
1989     sqlite3_free(p->zCostTable);
1990     sqlite3_free(p);
1991   }
1992   return rc;
1993 }
spellfix1Disconnect(sqlite3_vtab * pVTab)1994 static int spellfix1Disconnect(sqlite3_vtab *pVTab){
1995   return spellfix1Uninit(0, pVTab);
1996 }
spellfix1Destroy(sqlite3_vtab * pVTab)1997 static int spellfix1Destroy(sqlite3_vtab *pVTab){
1998   return spellfix1Uninit(1, pVTab);
1999 }
2000 
2001 /*
2002 ** Make a copy of a string.  Remove leading and trailing whitespace
2003 ** and dequote it.
2004 */
spellfix1Dequote(const char * zIn)2005 static char *spellfix1Dequote(const char *zIn){
2006   char *zOut;
2007   int i, j;
2008   char c;
2009   while( isspace((unsigned char)zIn[0]) ) zIn++;
2010   zOut = sqlite3_mprintf("%s", zIn);
2011   if( zOut==0 ) return 0;
2012   i = (int)strlen(zOut);
2013 #if 0  /* The parser will never leave spaces at the end */
2014   while( i>0 && isspace(zOut[i-1]) ){ i--; }
2015 #endif
2016   zOut[i] = 0;
2017   c = zOut[0];
2018   if( c=='\'' || c=='"' ){
2019     for(i=1, j=0; ALWAYS(zOut[i]); i++){
2020       zOut[j++] = zOut[i];
2021       if( zOut[i]==c ){
2022         if( zOut[i+1]==c ){
2023           i++;
2024         }else{
2025           zOut[j-1] = 0;
2026           break;
2027         }
2028       }
2029     }
2030   }
2031   return zOut;
2032 }
2033 
2034 
2035 /*
2036 ** xConnect/xCreate method for the spellfix1 module. Arguments are:
2037 **
2038 **   argv[0]   -> module name  ("spellfix1")
2039 **   argv[1]   -> database name
2040 **   argv[2]   -> table name
2041 **   argv[3].. -> optional arguments (i.e. "edit_cost_table" parameter)
2042 */
spellfix1Init(int isCreate,sqlite3 * db,void * pAux,int argc,const char * const * argv,sqlite3_vtab ** ppVTab,char ** pzErr)2043 static int spellfix1Init(
2044   int isCreate,
2045   sqlite3 *db,
2046   void *pAux,
2047   int argc, const char *const*argv,
2048   sqlite3_vtab **ppVTab,
2049   char **pzErr
2050 ){
2051   spellfix1_vtab *pNew = 0;
2052   /* const char *zModule = argv[0]; // not used */
2053   const char *zDbName = argv[1];
2054   const char *zTableName = argv[2];
2055   int nDbName;
2056   int rc = SQLITE_OK;
2057   int i;
2058 
2059   nDbName = (int)strlen(zDbName);
2060   pNew = sqlite3_malloc64( sizeof(*pNew) + nDbName + 1);
2061   if( pNew==0 ){
2062     rc = SQLITE_NOMEM;
2063   }else{
2064     memset(pNew, 0, sizeof(*pNew));
2065     pNew->zDbName = (char*)&pNew[1];
2066     memcpy(pNew->zDbName, zDbName, nDbName+1);
2067     pNew->zTableName = sqlite3_mprintf("%s", zTableName);
2068     pNew->db = db;
2069     if( pNew->zTableName==0 ){
2070       rc = SQLITE_NOMEM;
2071     }else{
2072       sqlite3_vtab_config(db, SQLITE_VTAB_INNOCUOUS);
2073       rc = sqlite3_declare_vtab(db,
2074            "CREATE TABLE x(word,rank,distance,langid, "
2075            "score, matchlen, phonehash HIDDEN, "
2076            "top HIDDEN, scope HIDDEN, srchcnt HIDDEN, "
2077            "soundslike HIDDEN, command HIDDEN)"
2078       );
2079 #define SPELLFIX_COL_WORD            0
2080 #define SPELLFIX_COL_RANK            1
2081 #define SPELLFIX_COL_DISTANCE        2
2082 #define SPELLFIX_COL_LANGID          3
2083 #define SPELLFIX_COL_SCORE           4
2084 #define SPELLFIX_COL_MATCHLEN        5
2085 #define SPELLFIX_COL_PHONEHASH       6
2086 #define SPELLFIX_COL_TOP             7
2087 #define SPELLFIX_COL_SCOPE           8
2088 #define SPELLFIX_COL_SRCHCNT         9
2089 #define SPELLFIX_COL_SOUNDSLIKE     10
2090 #define SPELLFIX_COL_COMMAND        11
2091     }
2092     if( rc==SQLITE_OK && isCreate ){
2093       spellfix1DbExec(&rc, db,
2094          "CREATE TABLE IF NOT EXISTS \"%w\".\"%w_vocab\"(\n"
2095          "  id INTEGER PRIMARY KEY,\n"
2096          "  rank INT,\n"
2097          "  langid INT,\n"
2098          "  word TEXT,\n"
2099          "  k1 TEXT,\n"
2100          "  k2 TEXT\n"
2101          ");\n",
2102          zDbName, zTableName
2103       );
2104       spellfix1DbExec(&rc, db,
2105          "CREATE INDEX IF NOT EXISTS \"%w\".\"%w_vocab_index_langid_k2\" "
2106             "ON \"%w_vocab\"(langid,k2);",
2107          zDbName, zTableName, zTableName
2108       );
2109     }
2110     for(i=3; rc==SQLITE_OK && i<argc; i++){
2111       if( strncmp(argv[i],"edit_cost_table=",16)==0 && pNew->zCostTable==0 ){
2112         pNew->zCostTable = spellfix1Dequote(&argv[i][16]);
2113         if( pNew->zCostTable==0 ) rc = SQLITE_NOMEM;
2114         continue;
2115       }
2116       *pzErr = sqlite3_mprintf("bad argument to spellfix1(): \"%s\"", argv[i]);
2117       rc = SQLITE_ERROR;
2118     }
2119   }
2120 
2121   if( rc && pNew ){
2122     *ppVTab = 0;
2123     spellfix1Uninit(0, &pNew->base);
2124   }else{
2125     *ppVTab = (sqlite3_vtab *)pNew;
2126   }
2127   return rc;
2128 }
2129 
2130 /*
2131 ** The xConnect and xCreate methods
2132 */
spellfix1Connect(sqlite3 * db,void * pAux,int argc,const char * const * argv,sqlite3_vtab ** ppVTab,char ** pzErr)2133 static int spellfix1Connect(
2134   sqlite3 *db,
2135   void *pAux,
2136   int argc, const char *const*argv,
2137   sqlite3_vtab **ppVTab,
2138   char **pzErr
2139 ){
2140   return spellfix1Init(0, db, pAux, argc, argv, ppVTab, pzErr);
2141 }
spellfix1Create(sqlite3 * db,void * pAux,int argc,const char * const * argv,sqlite3_vtab ** ppVTab,char ** pzErr)2142 static int spellfix1Create(
2143   sqlite3 *db,
2144   void *pAux,
2145   int argc, const char *const*argv,
2146   sqlite3_vtab **ppVTab,
2147   char **pzErr
2148 ){
2149   return spellfix1Init(1, db, pAux, argc, argv, ppVTab, pzErr);
2150 }
2151 
2152 /*
2153 ** Clear all of the content from a cursor.
2154 */
spellfix1ResetCursor(spellfix1_cursor * pCur)2155 static void spellfix1ResetCursor(spellfix1_cursor *pCur){
2156   int i;
2157   for(i=0; i<pCur->nRow; i++){
2158     sqlite3_free(pCur->a[i].zWord);
2159   }
2160   pCur->nRow = 0;
2161   pCur->iRow = 0;
2162   pCur->nSearch = 0;
2163   if( pCur->pFullScan ){
2164     sqlite3_finalize(pCur->pFullScan);
2165     pCur->pFullScan = 0;
2166   }
2167 }
2168 
2169 /*
2170 ** Resize the cursor to hold up to N rows of content
2171 */
spellfix1ResizeCursor(spellfix1_cursor * pCur,int N)2172 static void spellfix1ResizeCursor(spellfix1_cursor *pCur, int N){
2173   struct spellfix1_row *aNew;
2174   assert( N>=pCur->nRow );
2175   aNew = sqlite3_realloc64(pCur->a, sizeof(pCur->a[0])*N);
2176   if( aNew==0 && N>0 ){
2177     spellfix1ResetCursor(pCur);
2178     sqlite3_free(pCur->a);
2179     pCur->nAlloc = 0;
2180     pCur->a = 0;
2181   }else{
2182     pCur->nAlloc = N;
2183     pCur->a = aNew;
2184   }
2185 }
2186 
2187 
2188 /*
2189 ** Close a fuzzy-search cursor.
2190 */
spellfix1Close(sqlite3_vtab_cursor * cur)2191 static int spellfix1Close(sqlite3_vtab_cursor *cur){
2192   spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2193   spellfix1ResetCursor(pCur);
2194   spellfix1ResizeCursor(pCur, 0);
2195   sqlite3_free(pCur->zPattern);
2196   sqlite3_free(pCur);
2197   return SQLITE_OK;
2198 }
2199 
2200 #define SPELLFIX_IDXNUM_MATCH  0x01         /* word MATCH $str */
2201 #define SPELLFIX_IDXNUM_LANGID 0x02         /* langid == $langid */
2202 #define SPELLFIX_IDXNUM_TOP    0x04         /* top = $top */
2203 #define SPELLFIX_IDXNUM_SCOPE  0x08         /* scope = $scope */
2204 #define SPELLFIX_IDXNUM_DISTLT 0x10         /* distance < $distance */
2205 #define SPELLFIX_IDXNUM_DISTLE 0x20         /* distance <= $distance */
2206 #define SPELLFIX_IDXNUM_ROWID  0x40         /* rowid = $rowid */
2207 #define SPELLFIX_IDXNUM_DIST   (0x10|0x20)  /* DISTLT and DISTLE */
2208 
2209 /*
2210 **
2211 ** The plan number is a bitmask of the SPELLFIX_IDXNUM_* values defined
2212 ** above.
2213 **
2214 ** filter.argv[*] values contains $str, $langid, $top, $scope and $rowid
2215 ** if specified and in that order.
2216 */
spellfix1BestIndex(sqlite3_vtab * tab,sqlite3_index_info * pIdxInfo)2217 static int spellfix1BestIndex(sqlite3_vtab *tab, sqlite3_index_info *pIdxInfo){
2218   int iPlan = 0;
2219   int iLangTerm = -1;
2220   int iTopTerm = -1;
2221   int iScopeTerm = -1;
2222   int iDistTerm = -1;
2223   int iRowidTerm = -1;
2224   int i;
2225   const struct sqlite3_index_constraint *pConstraint;
2226   pConstraint = pIdxInfo->aConstraint;
2227   for(i=0; i<pIdxInfo->nConstraint; i++, pConstraint++){
2228     if( pConstraint->usable==0 ) continue;
2229 
2230     /* Terms of the form:  word MATCH $str */
2231     if( (iPlan & SPELLFIX_IDXNUM_MATCH)==0
2232      && pConstraint->iColumn==SPELLFIX_COL_WORD
2233      && pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH
2234     ){
2235       iPlan |= SPELLFIX_IDXNUM_MATCH;
2236       pIdxInfo->aConstraintUsage[i].argvIndex = 1;
2237       pIdxInfo->aConstraintUsage[i].omit = 1;
2238     }
2239 
2240     /* Terms of the form:  langid = $langid  */
2241     if( (iPlan & SPELLFIX_IDXNUM_LANGID)==0
2242      && pConstraint->iColumn==SPELLFIX_COL_LANGID
2243      && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2244     ){
2245       iPlan |= SPELLFIX_IDXNUM_LANGID;
2246       iLangTerm = i;
2247     }
2248 
2249     /* Terms of the form:  top = $top */
2250     if( (iPlan & SPELLFIX_IDXNUM_TOP)==0
2251      && pConstraint->iColumn==SPELLFIX_COL_TOP
2252      && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2253     ){
2254       iPlan |= SPELLFIX_IDXNUM_TOP;
2255       iTopTerm = i;
2256     }
2257 
2258     /* Terms of the form:  scope = $scope */
2259     if( (iPlan & SPELLFIX_IDXNUM_SCOPE)==0
2260      && pConstraint->iColumn==SPELLFIX_COL_SCOPE
2261      && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2262     ){
2263       iPlan |= SPELLFIX_IDXNUM_SCOPE;
2264       iScopeTerm = i;
2265     }
2266 
2267     /* Terms of the form:  distance < $dist or distance <= $dist */
2268     if( (iPlan & SPELLFIX_IDXNUM_DIST)==0
2269      && pConstraint->iColumn==SPELLFIX_COL_DISTANCE
2270      && (pConstraint->op==SQLITE_INDEX_CONSTRAINT_LT
2271           || pConstraint->op==SQLITE_INDEX_CONSTRAINT_LE)
2272     ){
2273       if( pConstraint->op==SQLITE_INDEX_CONSTRAINT_LT ){
2274         iPlan |= SPELLFIX_IDXNUM_DISTLT;
2275       }else{
2276         iPlan |= SPELLFIX_IDXNUM_DISTLE;
2277       }
2278       iDistTerm = i;
2279     }
2280 
2281     /* Terms of the form:  distance < $dist or distance <= $dist */
2282     if( (iPlan & SPELLFIX_IDXNUM_ROWID)==0
2283      && pConstraint->iColumn<0
2284      && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2285     ){
2286       iPlan |= SPELLFIX_IDXNUM_ROWID;
2287       iRowidTerm = i;
2288     }
2289   }
2290   if( iPlan&SPELLFIX_IDXNUM_MATCH ){
2291     int idx = 2;
2292     pIdxInfo->idxNum = iPlan;
2293     if( pIdxInfo->nOrderBy==1
2294      && pIdxInfo->aOrderBy[0].iColumn==SPELLFIX_COL_SCORE
2295      && pIdxInfo->aOrderBy[0].desc==0
2296     ){
2297       pIdxInfo->orderByConsumed = 1;  /* Default order by iScore */
2298     }
2299     if( iPlan&SPELLFIX_IDXNUM_LANGID ){
2300       pIdxInfo->aConstraintUsage[iLangTerm].argvIndex = idx++;
2301       pIdxInfo->aConstraintUsage[iLangTerm].omit = 1;
2302     }
2303     if( iPlan&SPELLFIX_IDXNUM_TOP ){
2304       pIdxInfo->aConstraintUsage[iTopTerm].argvIndex = idx++;
2305       pIdxInfo->aConstraintUsage[iTopTerm].omit = 1;
2306     }
2307     if( iPlan&SPELLFIX_IDXNUM_SCOPE ){
2308       pIdxInfo->aConstraintUsage[iScopeTerm].argvIndex = idx++;
2309       pIdxInfo->aConstraintUsage[iScopeTerm].omit = 1;
2310     }
2311     if( iPlan&SPELLFIX_IDXNUM_DIST ){
2312       pIdxInfo->aConstraintUsage[iDistTerm].argvIndex = idx++;
2313       pIdxInfo->aConstraintUsage[iDistTerm].omit = 1;
2314     }
2315     pIdxInfo->estimatedCost = 1e5;
2316   }else if( (iPlan & SPELLFIX_IDXNUM_ROWID) ){
2317     pIdxInfo->idxNum = SPELLFIX_IDXNUM_ROWID;
2318     pIdxInfo->aConstraintUsage[iRowidTerm].argvIndex = 1;
2319     pIdxInfo->aConstraintUsage[iRowidTerm].omit = 1;
2320     pIdxInfo->estimatedCost = 5;
2321   }else{
2322     pIdxInfo->idxNum = 0;
2323     pIdxInfo->estimatedCost = 1e50;
2324   }
2325   return SQLITE_OK;
2326 }
2327 
2328 /*
2329 ** Open a new fuzzy-search cursor.
2330 */
spellfix1Open(sqlite3_vtab * pVTab,sqlite3_vtab_cursor ** ppCursor)2331 static int spellfix1Open(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
2332   spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
2333   spellfix1_cursor *pCur;
2334   pCur = sqlite3_malloc64( sizeof(*pCur) );
2335   if( pCur==0 ) return SQLITE_NOMEM;
2336   memset(pCur, 0, sizeof(*pCur));
2337   pCur->pVTab = p;
2338   *ppCursor = &pCur->base;
2339   return SQLITE_OK;
2340 }
2341 
2342 /*
2343 ** Adjust a distance measurement by the words rank in order to show
2344 ** preference to common words.
2345 */
spellfix1Score(int iDistance,int iRank)2346 static int spellfix1Score(int iDistance, int iRank){
2347   int iLog2;
2348   for(iLog2=0; iRank>0; iLog2++, iRank>>=1){}
2349   return iDistance + 32 - iLog2;
2350 }
2351 
2352 /*
2353 ** Compare two spellfix1_row objects for sorting purposes in qsort() such
2354 ** that they sort in order of increasing distance.
2355 */
spellfix1RowCompare(const void * A,const void * B)2356 static int SQLITE_CDECL spellfix1RowCompare(const void *A, const void *B){
2357   const struct spellfix1_row *a = (const struct spellfix1_row*)A;
2358   const struct spellfix1_row *b = (const struct spellfix1_row*)B;
2359   return a->iScore - b->iScore;
2360 }
2361 
2362 /*
2363 ** A structure used to pass information from spellfix1FilterForMatch()
2364 ** into spellfix1RunQuery().
2365 */
2366 typedef struct MatchQuery {
2367   spellfix1_cursor *pCur;          /* The cursor being queried */
2368   sqlite3_stmt *pStmt;             /* shadow table query statment */
2369   char zHash[SPELLFIX_MX_HASH];    /* The current phonehash for zPattern */
2370   const char *zPattern;            /* Transliterated input string */
2371   int nPattern;                    /* Length of zPattern */
2372   EditDist3FromString *pMatchStr3; /* Original unicode string */
2373   EditDist3Config *pConfig3;       /* Edit-distance cost coefficients */
2374   const EditDist3Lang *pLang;      /* The selected language coefficients */
2375   int iLang;                       /* The language id */
2376   int iScope;                      /* Default scope */
2377   int iMaxDist;                    /* Maximum allowed edit distance, or -1 */
2378   int rc;                          /* Error code */
2379   int nRun;                  /* Number of prior runs for the same zPattern */
2380   char azPrior[SPELLFIX_MX_RUN][SPELLFIX_MX_HASH];  /* Prior hashes */
2381 } MatchQuery;
2382 
2383 /*
2384 ** Run a query looking for the best matches against zPattern using
2385 ** zHash as the character class seed hash.
2386 */
spellfix1RunQuery(MatchQuery * p,const char * zQuery,int nQuery)2387 static void spellfix1RunQuery(MatchQuery *p, const char *zQuery, int nQuery){
2388   const char *zK1;
2389   const char *zWord;
2390   int iDist;
2391   int iRank;
2392   int iScore;
2393   int iWorst = 0;
2394   int idx;
2395   int idxWorst = -1;
2396   int i;
2397   int iScope = p->iScope;
2398   spellfix1_cursor *pCur = p->pCur;
2399   sqlite3_stmt *pStmt = p->pStmt;
2400   char zHash1[SPELLFIX_MX_HASH];
2401   char zHash2[SPELLFIX_MX_HASH];
2402   char *zClass;
2403   int nClass;
2404   int rc;
2405 
2406   if( pCur->a==0 || p->rc ) return;   /* Prior memory allocation failure */
2407   zClass = (char*)phoneticHash((unsigned char*)zQuery, nQuery);
2408   if( zClass==0 ){
2409     p->rc = SQLITE_NOMEM;
2410     return;
2411   }
2412   nClass = (int)strlen(zClass);
2413   if( nClass>SPELLFIX_MX_HASH-2 ){
2414     nClass = SPELLFIX_MX_HASH-2;
2415     zClass[nClass] = 0;
2416   }
2417   if( nClass<=iScope ){
2418     if( nClass>2 ){
2419       iScope = nClass-1;
2420     }else{
2421       iScope = nClass;
2422     }
2423   }
2424   memcpy(zHash1, zClass, iScope);
2425   sqlite3_free(zClass);
2426   zHash1[iScope] = 0;
2427   memcpy(zHash2, zHash1, iScope);
2428   zHash2[iScope] = 'Z';
2429   zHash2[iScope+1] = 0;
2430 #if SPELLFIX_MX_RUN>1
2431   for(i=0; i<p->nRun; i++){
2432     if( strcmp(p->azPrior[i], zHash1)==0 ) return;
2433   }
2434 #endif
2435   assert( p->nRun<SPELLFIX_MX_RUN );
2436   memcpy(p->azPrior[p->nRun++], zHash1, iScope+1);
2437   if( sqlite3_bind_text(pStmt, 1, zHash1, -1, SQLITE_STATIC)==SQLITE_NOMEM
2438    || sqlite3_bind_text(pStmt, 2, zHash2, -1, SQLITE_STATIC)==SQLITE_NOMEM
2439   ){
2440     p->rc = SQLITE_NOMEM;
2441     return;
2442   }
2443 #if SPELLFIX_MX_RUN>1
2444   for(i=0; i<pCur->nRow; i++){
2445     if( pCur->a[i].iScore>iWorst ){
2446       iWorst = pCur->a[i].iScore;
2447       idxWorst = i;
2448     }
2449   }
2450 #endif
2451   while( sqlite3_step(pStmt)==SQLITE_ROW ){
2452     int iMatchlen = -1;
2453     iRank = sqlite3_column_int(pStmt, 2);
2454     if( p->pMatchStr3 ){
2455       int nWord = sqlite3_column_bytes(pStmt, 1);
2456       zWord = (const char*)sqlite3_column_text(pStmt, 1);
2457       iDist = editDist3Core(p->pMatchStr3, zWord, nWord, p->pLang, &iMatchlen);
2458     }else{
2459       zK1 = (const char*)sqlite3_column_text(pStmt, 3);
2460       if( zK1==0 ) continue;
2461       iDist = editdist1(p->zPattern, zK1, 0);
2462     }
2463     if( iDist<0 ){
2464       p->rc = SQLITE_NOMEM;
2465       break;
2466     }
2467     pCur->nSearch++;
2468 
2469     /* If there is a "distance < $dist" or "distance <= $dist" constraint,
2470     ** check if this row meets it. If not, jump back up to the top of the
2471     ** loop to process the next row. Otherwise, if the row does match the
2472     ** distance constraint, check if the pCur->a[] array is already full.
2473     ** If it is and no explicit "top = ?" constraint was present in the
2474     ** query, grow the array to ensure there is room for the new entry. */
2475     assert( (p->iMaxDist>=0)==((pCur->idxNum & SPELLFIX_IDXNUM_DIST) ? 1 : 0) );
2476     if( p->iMaxDist>=0 ){
2477       if( iDist>p->iMaxDist ) continue;
2478       if( pCur->nRow>=pCur->nAlloc && (pCur->idxNum & SPELLFIX_IDXNUM_TOP)==0 ){
2479         spellfix1ResizeCursor(pCur, pCur->nAlloc*2 + 10);
2480         if( pCur->a==0 ) break;
2481       }
2482     }
2483 
2484     iScore = spellfix1Score(iDist,iRank);
2485     if( pCur->nRow<pCur->nAlloc ){
2486       idx = pCur->nRow;
2487     }else if( iScore<iWorst ){
2488       idx = idxWorst;
2489       sqlite3_free(pCur->a[idx].zWord);
2490     }else{
2491       continue;
2492     }
2493 
2494     pCur->a[idx].zWord = sqlite3_mprintf("%s", sqlite3_column_text(pStmt, 1));
2495     if( pCur->a[idx].zWord==0 ){
2496       p->rc = SQLITE_NOMEM;
2497       break;
2498     }
2499     pCur->a[idx].iRowid = sqlite3_column_int64(pStmt, 0);
2500     pCur->a[idx].iRank = iRank;
2501     pCur->a[idx].iDistance = iDist;
2502     pCur->a[idx].iScore = iScore;
2503     pCur->a[idx].iMatchlen = iMatchlen;
2504     memcpy(pCur->a[idx].zHash, zHash1, iScope+1);
2505     if( pCur->nRow<pCur->nAlloc ) pCur->nRow++;
2506     if( pCur->nRow==pCur->nAlloc ){
2507       iWorst = pCur->a[0].iScore;
2508       idxWorst = 0;
2509       for(i=1; i<pCur->nRow; i++){
2510         iScore = pCur->a[i].iScore;
2511         if( iWorst<iScore ){
2512           iWorst = iScore;
2513           idxWorst = i;
2514         }
2515       }
2516     }
2517   }
2518   rc = sqlite3_reset(pStmt);
2519   if( rc ) p->rc = rc;
2520 }
2521 
2522 /*
2523 ** This version of the xFilter method work if the MATCH term is present
2524 ** and we are doing a scan.
2525 */
spellfix1FilterForMatch(spellfix1_cursor * pCur,int argc,sqlite3_value ** argv)2526 static int spellfix1FilterForMatch(
2527   spellfix1_cursor *pCur,
2528   int argc,
2529   sqlite3_value **argv
2530 ){
2531   int idxNum = pCur->idxNum;
2532   const unsigned char *zMatchThis;   /* RHS of the MATCH operator */
2533   EditDist3FromString *pMatchStr3 = 0; /* zMatchThis as an editdist string */
2534   char *zPattern;                    /* Transliteration of zMatchThis */
2535   int nPattern;                      /* Length of zPattern */
2536   int iLimit = 20;                   /* Max number of rows of output */
2537   int iScope = 3;                    /* Use this many characters of zClass */
2538   int iLang = 0;                     /* Language code */
2539   char *zSql;                        /* SQL of shadow table query */
2540   sqlite3_stmt *pStmt = 0;           /* Shadow table query */
2541   int rc;                            /* Result code */
2542   int idx = 1;                       /* Next available filter parameter */
2543   spellfix1_vtab *p = pCur->pVTab;   /* The virtual table that owns pCur */
2544   MatchQuery x;                      /* For passing info to RunQuery() */
2545 
2546   /* Load the cost table if we have not already done so */
2547   if( p->zCostTable!=0 && p->pConfig3==0 ){
2548     p->pConfig3 = sqlite3_malloc64( sizeof(p->pConfig3[0]) );
2549     if( p->pConfig3==0 ) return SQLITE_NOMEM;
2550     memset(p->pConfig3, 0, sizeof(p->pConfig3[0]));
2551     rc = editDist3ConfigLoad(p->pConfig3, p->db, p->zCostTable);
2552     if( rc ) return rc;
2553   }
2554   memset(&x, 0, sizeof(x));
2555   x.iScope = 3;  /* Default scope if none specified by "WHERE scope=N" */
2556   x.iMaxDist = -1;   /* Maximum allowed edit distance */
2557 
2558   if( idxNum&2 ){
2559     iLang = sqlite3_value_int(argv[idx++]);
2560   }
2561   if( idxNum&4 ){
2562     iLimit = sqlite3_value_int(argv[idx++]);
2563     if( iLimit<1 ) iLimit = 1;
2564   }
2565   if( idxNum&8 ){
2566     x.iScope = sqlite3_value_int(argv[idx++]);
2567     if( x.iScope<1 ) x.iScope = 1;
2568     if( x.iScope>SPELLFIX_MX_HASH-2 ) x.iScope = SPELLFIX_MX_HASH-2;
2569   }
2570   if( idxNum&(16|32) ){
2571     x.iMaxDist = sqlite3_value_int(argv[idx++]);
2572     if( idxNum&16 ) x.iMaxDist--;
2573     if( x.iMaxDist<0 ) x.iMaxDist = 0;
2574   }
2575   spellfix1ResetCursor(pCur);
2576   spellfix1ResizeCursor(pCur, iLimit);
2577   zMatchThis = sqlite3_value_text(argv[0]);
2578   if( zMatchThis==0 ) return SQLITE_OK;
2579   if( p->pConfig3 ){
2580     x.pLang = editDist3FindLang(p->pConfig3, iLang);
2581     pMatchStr3 = editDist3FromStringNew(x.pLang, (const char*)zMatchThis, -1);
2582     if( pMatchStr3==0 ){
2583       x.rc = SQLITE_NOMEM;
2584       goto filter_exit;
2585     }
2586   }else{
2587     x.pLang = 0;
2588   }
2589   zPattern = (char*)transliterate(zMatchThis, sqlite3_value_bytes(argv[0]));
2590   sqlite3_free(pCur->zPattern);
2591   pCur->zPattern = zPattern;
2592   if( zPattern==0 ){
2593     x.rc = SQLITE_NOMEM;
2594     goto filter_exit;
2595   }
2596   nPattern = (int)strlen(zPattern);
2597   if( zPattern[nPattern-1]=='*' ) nPattern--;
2598   zSql = sqlite3_mprintf(
2599      "SELECT id, word, rank, coalesce(k1,word)"
2600      "  FROM \"%w\".\"%w_vocab\""
2601      " WHERE langid=%d AND k2>=?1 AND k2<?2",
2602      p->zDbName, p->zTableName, iLang
2603   );
2604   if( zSql==0 ){
2605     x.rc = SQLITE_NOMEM;
2606     pStmt = 0;
2607     goto filter_exit;
2608   }
2609   rc = sqlite3_prepare_v2(p->db, zSql, -1, &pStmt, 0);
2610   sqlite3_free(zSql);
2611   pCur->iLang = iLang;
2612   x.pCur = pCur;
2613   x.pStmt = pStmt;
2614   x.zPattern = zPattern;
2615   x.nPattern = nPattern;
2616   x.pMatchStr3 = pMatchStr3;
2617   x.iLang = iLang;
2618   x.rc = rc;
2619   x.pConfig3 = p->pConfig3;
2620   if( x.rc==SQLITE_OK ){
2621     spellfix1RunQuery(&x, zPattern, nPattern);
2622   }
2623 
2624   if( pCur->a ){
2625     qsort(pCur->a, pCur->nRow, sizeof(pCur->a[0]), spellfix1RowCompare);
2626     pCur->iTop = iLimit;
2627     pCur->iScope = iScope;
2628   }else{
2629     x.rc = SQLITE_NOMEM;
2630   }
2631 
2632 filter_exit:
2633   sqlite3_finalize(pStmt);
2634   editDist3FromStringDelete(pMatchStr3);
2635   return x.rc;
2636 }
2637 
2638 /*
2639 ** This version of xFilter handles a full-table scan case
2640 */
spellfix1FilterForFullScan(spellfix1_cursor * pCur,int argc,sqlite3_value ** argv)2641 static int spellfix1FilterForFullScan(
2642   spellfix1_cursor *pCur,
2643   int argc,
2644   sqlite3_value **argv
2645 ){
2646   int rc = SQLITE_OK;
2647   int idxNum = pCur->idxNum;
2648   char *zSql;
2649   spellfix1_vtab *pVTab = pCur->pVTab;
2650   spellfix1ResetCursor(pCur);
2651   assert( idxNum==0 || idxNum==64 );
2652   zSql = sqlite3_mprintf(
2653      "SELECT word, rank, NULL, langid, id FROM \"%w\".\"%w_vocab\"%s",
2654      pVTab->zDbName, pVTab->zTableName,
2655      ((idxNum & 64) ? " WHERE rowid=?" : "")
2656   );
2657   if( zSql==0 ) return SQLITE_NOMEM;
2658   rc = sqlite3_prepare_v2(pVTab->db, zSql, -1, &pCur->pFullScan, 0);
2659   sqlite3_free(zSql);
2660   if( rc==SQLITE_OK && (idxNum & 64) ){
2661     assert( argc==1 );
2662     rc = sqlite3_bind_value(pCur->pFullScan, 1, argv[0]);
2663   }
2664   pCur->nRow = pCur->iRow = 0;
2665   if( rc==SQLITE_OK ){
2666     rc = sqlite3_step(pCur->pFullScan);
2667     if( rc==SQLITE_ROW ){ pCur->iRow = -1; rc = SQLITE_OK; }
2668     if( rc==SQLITE_DONE ){ rc = SQLITE_OK; }
2669   }else{
2670     pCur->iRow = 0;
2671   }
2672   return rc;
2673 }
2674 
2675 
2676 /*
2677 ** Called to "rewind" a cursor back to the beginning so that
2678 ** it starts its output over again.  Always called at least once
2679 ** prior to any spellfix1Column, spellfix1Rowid, or spellfix1Eof call.
2680 */
spellfix1Filter(sqlite3_vtab_cursor * cur,int idxNum,const char * idxStr,int argc,sqlite3_value ** argv)2681 static int spellfix1Filter(
2682   sqlite3_vtab_cursor *cur,
2683   int idxNum, const char *idxStr,
2684   int argc, sqlite3_value **argv
2685 ){
2686   spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2687   int rc;
2688   pCur->idxNum = idxNum;
2689   if( idxNum & 1 ){
2690     rc = spellfix1FilterForMatch(pCur, argc, argv);
2691   }else{
2692     rc = spellfix1FilterForFullScan(pCur, argc, argv);
2693   }
2694   return rc;
2695 }
2696 
2697 
2698 /*
2699 ** Advance a cursor to its next row of output
2700 */
spellfix1Next(sqlite3_vtab_cursor * cur)2701 static int spellfix1Next(sqlite3_vtab_cursor *cur){
2702   spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2703   int rc = SQLITE_OK;
2704   if( pCur->iRow < pCur->nRow ){
2705     if( pCur->pFullScan ){
2706       rc = sqlite3_step(pCur->pFullScan);
2707       if( rc!=SQLITE_ROW ) pCur->iRow = pCur->nRow;
2708       if( rc==SQLITE_ROW || rc==SQLITE_DONE ) rc = SQLITE_OK;
2709     }else{
2710       pCur->iRow++;
2711     }
2712   }
2713   return rc;
2714 }
2715 
2716 /*
2717 ** Return TRUE if we are at the end-of-file
2718 */
spellfix1Eof(sqlite3_vtab_cursor * cur)2719 static int spellfix1Eof(sqlite3_vtab_cursor *cur){
2720   spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2721   return pCur->iRow>=pCur->nRow;
2722 }
2723 
2724 /*
2725 ** Return columns from the current row.
2726 */
spellfix1Column(sqlite3_vtab_cursor * cur,sqlite3_context * ctx,int i)2727 static int spellfix1Column(
2728   sqlite3_vtab_cursor *cur,
2729   sqlite3_context *ctx,
2730   int i
2731 ){
2732   spellfix1_cursor *pCur = (spellfix1_cursor*)cur;
2733   if( pCur->pFullScan ){
2734     if( i<=SPELLFIX_COL_LANGID ){
2735       sqlite3_result_value(ctx, sqlite3_column_value(pCur->pFullScan, i));
2736     }else{
2737       sqlite3_result_null(ctx);
2738     }
2739     return SQLITE_OK;
2740   }
2741   switch( i ){
2742     case SPELLFIX_COL_WORD: {
2743       sqlite3_result_text(ctx, pCur->a[pCur->iRow].zWord, -1, SQLITE_STATIC);
2744       break;
2745     }
2746     case SPELLFIX_COL_RANK: {
2747       sqlite3_result_int(ctx, pCur->a[pCur->iRow].iRank);
2748       break;
2749     }
2750     case SPELLFIX_COL_DISTANCE: {
2751       sqlite3_result_int(ctx, pCur->a[pCur->iRow].iDistance);
2752       break;
2753     }
2754     case SPELLFIX_COL_LANGID: {
2755       sqlite3_result_int(ctx, pCur->iLang);
2756       break;
2757     }
2758     case SPELLFIX_COL_SCORE: {
2759       sqlite3_result_int(ctx, pCur->a[pCur->iRow].iScore);
2760       break;
2761     }
2762     case SPELLFIX_COL_MATCHLEN: {
2763       int iMatchlen = pCur->a[pCur->iRow].iMatchlen;
2764       if( iMatchlen<0 ){
2765         int nPattern = (int)strlen(pCur->zPattern);
2766         char *zWord = pCur->a[pCur->iRow].zWord;
2767         int nWord = (int)strlen(zWord);
2768 
2769         if( nPattern>0 && pCur->zPattern[nPattern-1]=='*' ){
2770           char *zTranslit;
2771           int res;
2772           zTranslit = (char *)transliterate((unsigned char *)zWord, nWord);
2773           if( !zTranslit ) return SQLITE_NOMEM;
2774           res = editdist1(pCur->zPattern, zTranslit, &iMatchlen);
2775           sqlite3_free(zTranslit);
2776           if( res<0 ) return SQLITE_NOMEM;
2777           iMatchlen = translen_to_charlen(zWord, nWord, iMatchlen);
2778         }else{
2779           iMatchlen = utf8Charlen(zWord, nWord);
2780         }
2781       }
2782 
2783       sqlite3_result_int(ctx, iMatchlen);
2784       break;
2785     }
2786     case SPELLFIX_COL_PHONEHASH: {
2787       sqlite3_result_text(ctx, pCur->a[pCur->iRow].zHash, -1, SQLITE_STATIC);
2788       break;
2789     }
2790     case SPELLFIX_COL_TOP: {
2791       sqlite3_result_int(ctx, pCur->iTop);
2792       break;
2793     }
2794     case SPELLFIX_COL_SCOPE: {
2795       sqlite3_result_int(ctx, pCur->iScope);
2796       break;
2797     }
2798     case SPELLFIX_COL_SRCHCNT: {
2799       sqlite3_result_int(ctx, pCur->nSearch);
2800       break;
2801     }
2802     default: {
2803       sqlite3_result_null(ctx);
2804       break;
2805     }
2806   }
2807   return SQLITE_OK;
2808 }
2809 
2810 /*
2811 ** The rowid.
2812 */
spellfix1Rowid(sqlite3_vtab_cursor * cur,sqlite_int64 * pRowid)2813 static int spellfix1Rowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid){
2814   spellfix1_cursor *pCur = (spellfix1_cursor*)cur;
2815   if( pCur->pFullScan ){
2816     *pRowid = sqlite3_column_int64(pCur->pFullScan, 4);
2817   }else{
2818     *pRowid = pCur->a[pCur->iRow].iRowid;
2819   }
2820   return SQLITE_OK;
2821 }
2822 
2823 /*
2824 ** This function is called by the xUpdate() method. It returns a string
2825 ** containing the conflict mode that xUpdate() should use for the current
2826 ** operation. One of: "ROLLBACK", "IGNORE", "ABORT" or "REPLACE".
2827 */
spellfix1GetConflict(sqlite3 * db)2828 static const char *spellfix1GetConflict(sqlite3 *db){
2829   static const char *azConflict[] = {
2830     /* Note: Instead of "FAIL" - "ABORT". */
2831     "ROLLBACK", "IGNORE", "ABORT", "ABORT", "REPLACE"
2832   };
2833   int eConflict = sqlite3_vtab_on_conflict(db);
2834 
2835   assert( eConflict==SQLITE_ROLLBACK || eConflict==SQLITE_IGNORE
2836        || eConflict==SQLITE_FAIL || eConflict==SQLITE_ABORT
2837        || eConflict==SQLITE_REPLACE
2838   );
2839   assert( SQLITE_ROLLBACK==1 );
2840   assert( SQLITE_IGNORE==2 );
2841   assert( SQLITE_FAIL==3 );
2842   assert( SQLITE_ABORT==4 );
2843   assert( SQLITE_REPLACE==5 );
2844 
2845   return azConflict[eConflict-1];
2846 }
2847 
2848 /*
2849 ** The xUpdate() method.
2850 */
spellfix1Update(sqlite3_vtab * pVTab,int argc,sqlite3_value ** argv,sqlite_int64 * pRowid)2851 static int spellfix1Update(
2852   sqlite3_vtab *pVTab,
2853   int argc,
2854   sqlite3_value **argv,
2855   sqlite_int64 *pRowid
2856 ){
2857   int rc = SQLITE_OK;
2858   sqlite3_int64 rowid, newRowid;
2859   spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
2860   sqlite3 *db = p->db;
2861 
2862   if( argc==1 ){
2863     /* A delete operation on the rowid given by argv[0] */
2864     rowid = *pRowid = sqlite3_value_int64(argv[0]);
2865     spellfix1DbExec(&rc, db, "DELETE FROM \"%w\".\"%w_vocab\" "
2866                            " WHERE id=%lld",
2867                   p->zDbName, p->zTableName, rowid);
2868   }else{
2869     const unsigned char *zWord = sqlite3_value_text(argv[SPELLFIX_COL_WORD+2]);
2870     int nWord = sqlite3_value_bytes(argv[SPELLFIX_COL_WORD+2]);
2871     int iLang = sqlite3_value_int(argv[SPELLFIX_COL_LANGID+2]);
2872     int iRank = sqlite3_value_int(argv[SPELLFIX_COL_RANK+2]);
2873     const unsigned char *zSoundslike =
2874            sqlite3_value_text(argv[SPELLFIX_COL_SOUNDSLIKE+2]);
2875     int nSoundslike = sqlite3_value_bytes(argv[SPELLFIX_COL_SOUNDSLIKE+2]);
2876     char *zK1, *zK2;
2877     int i;
2878     char c;
2879     const char *zConflict = spellfix1GetConflict(db);
2880 
2881     if( zWord==0 ){
2882       /* Inserts of the form:  INSERT INTO table(command) VALUES('xyzzy');
2883       ** cause zWord to be NULL, so we look at the "command" column to see
2884       ** what special actions to take */
2885       const char *zCmd =
2886          (const char*)sqlite3_value_text(argv[SPELLFIX_COL_COMMAND+2]);
2887       if( zCmd==0 ){
2888         pVTab->zErrMsg = sqlite3_mprintf("NOT NULL constraint failed: %s.word",
2889                                          p->zTableName);
2890         return SQLITE_CONSTRAINT_NOTNULL;
2891       }
2892       if( strcmp(zCmd,"reset")==0 ){
2893         /* Reset the  edit cost table (if there is one). */
2894         editDist3ConfigDelete(p->pConfig3);
2895         p->pConfig3 = 0;
2896         return SQLITE_OK;
2897       }
2898       if( strncmp(zCmd,"edit_cost_table=",16)==0 ){
2899         editDist3ConfigDelete(p->pConfig3);
2900         p->pConfig3 = 0;
2901         sqlite3_free(p->zCostTable);
2902         p->zCostTable = spellfix1Dequote(zCmd+16);
2903         if( p->zCostTable==0 ) return SQLITE_NOMEM;
2904         if( p->zCostTable[0]==0 || sqlite3_stricmp(p->zCostTable,"null")==0 ){
2905           sqlite3_free(p->zCostTable);
2906           p->zCostTable = 0;
2907         }
2908         return SQLITE_OK;
2909       }
2910       pVTab->zErrMsg = sqlite3_mprintf("unknown value for %s.command: \"%w\"",
2911                                        p->zTableName, zCmd);
2912       return SQLITE_ERROR;
2913     }
2914     if( iRank<1 ) iRank = 1;
2915     if( zSoundslike ){
2916       zK1 = (char*)transliterate(zSoundslike, nSoundslike);
2917     }else{
2918       zK1 = (char*)transliterate(zWord, nWord);
2919     }
2920     if( zK1==0 ) return SQLITE_NOMEM;
2921     for(i=0; (c = zK1[i])!=0; i++){
2922        if( c>='A' && c<='Z' ) zK1[i] += 'a' - 'A';
2923     }
2924     zK2 = (char*)phoneticHash((const unsigned char*)zK1, i);
2925     if( zK2==0 ){
2926       sqlite3_free(zK1);
2927       return SQLITE_NOMEM;
2928     }
2929     if( sqlite3_value_type(argv[0])==SQLITE_NULL ){
2930       if( sqlite3_value_type(argv[1])==SQLITE_NULL ){
2931         spellfix1DbExec(&rc, db,
2932                "INSERT INTO \"%w\".\"%w_vocab\"(rank,langid,word,k1,k2) "
2933                "VALUES(%d,%d,%Q,nullif(%Q,%Q),%Q)",
2934                p->zDbName, p->zTableName,
2935                iRank, iLang, zWord, zK1, zWord, zK2
2936         );
2937       }else{
2938         newRowid = sqlite3_value_int64(argv[1]);
2939         spellfix1DbExec(&rc, db,
2940             "INSERT OR %s INTO \"%w\".\"%w_vocab\"(id,rank,langid,word,k1,k2) "
2941             "VALUES(%lld,%d,%d,%Q,nullif(%Q,%Q),%Q)",
2942             zConflict, p->zDbName, p->zTableName,
2943             newRowid, iRank, iLang, zWord, zK1, zWord, zK2
2944         );
2945       }
2946       *pRowid = sqlite3_last_insert_rowid(db);
2947     }else{
2948       rowid = sqlite3_value_int64(argv[0]);
2949       newRowid = *pRowid = sqlite3_value_int64(argv[1]);
2950       spellfix1DbExec(&rc, db,
2951              "UPDATE OR %s \"%w\".\"%w_vocab\" SET id=%lld, rank=%d, langid=%d,"
2952              " word=%Q, k1=nullif(%Q,%Q), k2=%Q WHERE id=%lld",
2953              zConflict, p->zDbName, p->zTableName, newRowid, iRank, iLang,
2954              zWord, zK1, zWord, zK2, rowid
2955       );
2956     }
2957     sqlite3_free(zK1);
2958     sqlite3_free(zK2);
2959   }
2960   return rc;
2961 }
2962 
2963 /*
2964 ** Rename the spellfix1 table.
2965 */
spellfix1Rename(sqlite3_vtab * pVTab,const char * zNew)2966 static int spellfix1Rename(sqlite3_vtab *pVTab, const char *zNew){
2967   spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
2968   sqlite3 *db = p->db;
2969   int rc = SQLITE_OK;
2970   char *zNewName = sqlite3_mprintf("%s", zNew);
2971   if( zNewName==0 ){
2972     return SQLITE_NOMEM;
2973   }
2974   spellfix1DbExec(&rc, db,
2975      "ALTER TABLE \"%w\".\"%w_vocab\" RENAME TO \"%w_vocab\"",
2976      p->zDbName, p->zTableName, zNewName
2977   );
2978   if( rc==SQLITE_OK ){
2979     sqlite3_free(p->zTableName);
2980     p->zTableName = zNewName;
2981   }else{
2982     sqlite3_free(zNewName);
2983   }
2984   return rc;
2985 }
2986 
2987 
2988 /*
2989 ** A virtual table module that provides fuzzy search.
2990 */
2991 static sqlite3_module spellfix1Module = {
2992   0,                       /* iVersion */
2993   spellfix1Create,         /* xCreate - handle CREATE VIRTUAL TABLE */
2994   spellfix1Connect,        /* xConnect - reconnected to an existing table */
2995   spellfix1BestIndex,      /* xBestIndex - figure out how to do a query */
2996   spellfix1Disconnect,     /* xDisconnect - close a connection */
2997   spellfix1Destroy,        /* xDestroy - handle DROP TABLE */
2998   spellfix1Open,           /* xOpen - open a cursor */
2999   spellfix1Close,          /* xClose - close a cursor */
3000   spellfix1Filter,         /* xFilter - configure scan constraints */
3001   spellfix1Next,           /* xNext - advance a cursor */
3002   spellfix1Eof,            /* xEof - check for end of scan */
3003   spellfix1Column,         /* xColumn - read data */
3004   spellfix1Rowid,          /* xRowid - read data */
3005   spellfix1Update,         /* xUpdate */
3006   0,                       /* xBegin */
3007   0,                       /* xSync */
3008   0,                       /* xCommit */
3009   0,                       /* xRollback */
3010   0,                       /* xFindMethod */
3011   spellfix1Rename,         /* xRename */
3012 };
3013 
3014 /*
3015 ** Register the various functions and the virtual table.
3016 */
spellfix1Register(sqlite3 * db)3017 static int spellfix1Register(sqlite3 *db){
3018   int rc = SQLITE_OK;
3019   int i;
3020   rc = sqlite3_create_function(db, "spellfix1_translit", 1,
3021                                SQLITE_UTF8|SQLITE_DETERMINISTIC, 0,
3022                                 transliterateSqlFunc, 0, 0);
3023   if( rc==SQLITE_OK ){
3024     rc = sqlite3_create_function(db, "spellfix1_editdist", 2,
3025                                  SQLITE_UTF8|SQLITE_DETERMINISTIC, 0,
3026                                   editdistSqlFunc, 0, 0);
3027   }
3028   if( rc==SQLITE_OK ){
3029     rc = sqlite3_create_function(db, "spellfix1_phonehash", 1,
3030                                  SQLITE_UTF8|SQLITE_DETERMINISTIC, 0,
3031                                   phoneticHashSqlFunc, 0, 0);
3032   }
3033   if( rc==SQLITE_OK ){
3034     rc = sqlite3_create_function(db, "spellfix1_scriptcode", 1,
3035                                   SQLITE_UTF8|SQLITE_DETERMINISTIC, 0,
3036                                   scriptCodeSqlFunc, 0, 0);
3037   }
3038   if( rc==SQLITE_OK ){
3039     rc = sqlite3_create_module(db, "spellfix1", &spellfix1Module, 0);
3040   }
3041   if( rc==SQLITE_OK ){
3042     rc = editDist3Install(db);
3043   }
3044 
3045   /* Verify sanity of the translit[] table */
3046   for(i=0; i<sizeof(translit)/sizeof(translit[0])-1; i++){
3047     assert( translit[i].cFrom<translit[i+1].cFrom );
3048   }
3049 
3050   return rc;
3051 }
3052 
3053 #endif /* SQLITE_OMIT_VIRTUALTABLE */
3054 
3055 /*
3056 ** Extension load function.
3057 */
3058 #ifdef _WIN32
3059 __declspec(dllexport)
3060 #endif
sqlite3_spellfix_init(sqlite3 * db,char ** pzErrMsg,const sqlite3_api_routines * pApi)3061 int sqlite3_spellfix_init(
3062   sqlite3 *db,
3063   char **pzErrMsg,
3064   const sqlite3_api_routines *pApi
3065 ){
3066   SQLITE_EXTENSION_INIT2(pApi);
3067 #ifndef SQLITE_OMIT_VIRTUALTABLE
3068   return spellfix1Register(db);
3069 #endif
3070   return SQLITE_OK;
3071 }
3072