1 /*
2 ** 2013 Apr 22
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 ******************************************************************************
12 **
13 ** This file contains code for the "fts5tokenize" virtual table module.
14 ** An fts5tokenize virtual table is created as follows:
15 **
16 **   CREATE VIRTUAL TABLE <tbl> USING fts5tokenize(
17 **       <tokenizer-name>, <arg-1>, ...
18 **   );
19 **
20 ** The table created has the following schema:
21 **
22 **   CREATE TABLE <tbl>(input HIDDEN, token, start, end, position)
23 **
24 ** When queried, the query must include a WHERE clause of type:
25 **
26 **   input = <string>
27 **
** The virtual table module tokenizes this <string>, using the FTS5
** tokenizer specified by the arguments to the CREATE VIRTUAL TABLE
** statement, and returns one row for each token in the result, with
** fields set as follows:
32 **
33 **   input:   Always set to a copy of <string>
34 **   token:   A token from the input.
35 **   start:   Byte offset of the token within the input <string>.
36 **   end:     Byte offset of the byte immediately following the end of the
37 **            token within the input string.
**   position: Token offset of the token within the input.
39 **
40 */
41 #if defined(SQLITE_TEST) && defined(SQLITE_ENABLE_FTS5)
42 
43 #include "fts5.h"
44 #include <string.h>
45 #include <assert.h>
46 
/*
** Forward declarations of the three structure types used by this module:
** one result row, the cursor that holds the rows, and the table object.
*/
typedef struct Fts5tokRow Fts5tokRow;
typedef struct Fts5tokCursor Fts5tokCursor;
typedef struct Fts5tokTable Fts5tokTable;
50 
51 /*
52 ** Virtual table structure.
53 */
54 struct Fts5tokTable {
55   sqlite3_vtab base;              /* Base class used by SQLite core */
56   fts5_tokenizer tok;             /* Tokenizer functions */
57   Fts5Tokenizer *pTok;            /* Tokenizer instance */
58 };
59 
/*
** A container for the values of a single row. One of these is populated
** by fts5tokCb() for each token produced by the tokenizer.
*/
struct Fts5tokRow {
  char *zToken;                   /* Nul-terminated copy of the token text */
  int iStart;                     /* Byte offset of token within the input */
  int iEnd;                       /* Byte offset of end of token in input */
  int iPos;                       /* Token position. 0 for the first token */
};
69 
70 /*
71 ** Virtual table cursor structure.
72 */
73 struct Fts5tokCursor {
74   sqlite3_vtab_cursor base;       /* Base class used by SQLite core */
75   int iRowid;                     /* Current 'rowid' value */
76   char *zInput;                   /* Input string */
77   int nRow;                       /* Number of entries in aRow[] */
78   Fts5tokRow *aRow;               /* Array of rows to return */
79 };
80 
/*
** If nul-terminated string z begins with one of the quote characters
** '[', '\'', '"' or '`', dequote it in place. Otherwise leave it untouched.
**
** For a '[' quote the closing character is ']'. For the others it is the
** same character as the opening quote. Within the quoted text, two
** consecutive close-quote characters stand for a single literal one.
** Anything following the closing quote is discarded. If there is no
** closing quote, everything after the opening quote is kept.
*/
static void fts5tokDequote(char *z){
  char cClose = z[0];             /* Character that terminates the quote */
  const char *pIn;                /* Read cursor */
  char *pOut = z;                 /* Write cursor. Never overtakes pIn */

  if( cClose!='[' && cClose!='\'' && cClose!='"' && cClose!='`' ){
    return;                       /* Not a quoted string */
  }
  if( cClose=='[' ) cClose = ']';

  pIn = &z[1];
  while( *pIn ){
    if( *pIn!=cClose ){
      /* Ordinary character. Copy it through. */
      *pOut++ = *pIn++;
    }else if( pIn[1]==cClose ){
      /* Two close-quote characters in a row form an escaped quote. Emit
      ** a single quote character and skip past both input bytes. */
      *pOut++ = cClose;
      pIn += 2;
    }else{
      /* A lone close-quote character terminates the string. */
      break;
    }
  }

  *pOut = '\0';
}
110 
111 /*
112 ** The second argument, argv[], is an array of pointers to nul-terminated
113 ** strings. This function makes a copy of the array and strings into a
114 ** single block of memory. It then dequotes any of the strings that appear
115 ** to be quoted.
116 **
117 ** If successful, output parameter *pazDequote is set to point at the
118 ** array of dequoted strings and SQLITE_OK is returned. The caller is
119 ** responsible for eventually calling sqlite3_free() to free the array
120 ** in this case. Or, if an error occurs, an SQLite error code is returned.
121 ** The final value of *pazDequote is undefined in this case.
122 */
fts5tokDequoteArray(int argc,const char * const * argv,char *** pazDequote)123 static int fts5tokDequoteArray(
124   int argc,                       /* Number of elements in argv[] */
125   const char * const *argv,       /* Input array */
126   char ***pazDequote              /* Output array */
127 ){
128   int rc = SQLITE_OK;             /* Return code */
129   if( argc==0 ){
130     *pazDequote = 0;
131   }else{
132     int i;
133     int nByte = 0;
134     char **azDequote;
135 
136     for(i=0; i<argc; i++){
137       nByte += (int)(strlen(argv[i]) + 1);
138     }
139 
140     *pazDequote = azDequote = sqlite3_malloc(sizeof(char *)*argc + nByte);
141     if( azDequote==0 ){
142       rc = SQLITE_NOMEM;
143     }else{
144       char *pSpace = (char *)&azDequote[argc];
145       for(i=0; i<argc; i++){
146         int n = (int)strlen(argv[i]);
147         azDequote[i] = pSpace;
148         memcpy(pSpace, argv[i], n+1);
149         fts5tokDequote(pSpace);
150         pSpace += (n+1);
151       }
152     }
153   }
154 
155   return rc;
156 }
157 
/*
** Schema of the tokenizer table: a hidden "input" column that carries the
** text to tokenize, followed by the four output columns.
*/
#define FTS3_TOK_SCHEMA "CREATE TABLE x(input HIDDEN, token, start, end, position)"
162 
163 /*
164 ** This function does all the work for both the xConnect and xCreate methods.
165 ** These tables have no persistent representation of their own, so xConnect
166 ** and xCreate are identical operations.
167 **
168 **   argv[0]: module name
169 **   argv[1]: database name
170 **   argv[2]: table name
171 **   argv[3]: first argument (tokenizer name)
172 */
fts5tokConnectMethod(sqlite3 * db,void * pCtx,int argc,const char * const * argv,sqlite3_vtab ** ppVtab,char ** pzErr)173 static int fts5tokConnectMethod(
174   sqlite3 *db,                    /* Database connection */
175   void *pCtx,                     /* Pointer to fts5_api object */
176   int argc,                       /* Number of elements in argv array */
177   const char * const *argv,       /* xCreate/xConnect argument array */
178   sqlite3_vtab **ppVtab,          /* OUT: New sqlite3_vtab object */
179   char **pzErr                    /* OUT: sqlite3_malloc'd error message */
180 ){
181   fts5_api *pApi = (fts5_api*)pCtx;
182   Fts5tokTable *pTab = 0;
183   int rc;
184   char **azDequote = 0;
185   int nDequote = 0;
186 
187   rc = sqlite3_declare_vtab(db,
188        "CREATE TABLE x(input HIDDEN, token, start, end, position)"
189   );
190 
191   if( rc==SQLITE_OK ){
192     nDequote = argc-3;
193     rc = fts5tokDequoteArray(nDequote, &argv[3], &azDequote);
194   }
195 
196   if( rc==SQLITE_OK ){
197     pTab = (Fts5tokTable*)sqlite3_malloc(sizeof(Fts5tokTable));
198     if( pTab==0 ){
199       rc = SQLITE_NOMEM;
200     }else{
201       memset(pTab, 0, sizeof(Fts5tokTable));
202     }
203   }
204 
205   if( rc==SQLITE_OK ){
206     void *pTokCtx = 0;
207     const char *zModule = 0;
208     if( nDequote>0 ){
209       zModule = azDequote[0];
210     }
211 
212     rc = pApi->xFindTokenizer(pApi, zModule, &pTokCtx, &pTab->tok);
213     if( rc==SQLITE_OK ){
214       const char **azArg = (const char **)&azDequote[1];
215       int nArg = nDequote>0 ? nDequote-1 : 0;
216       rc = pTab->tok.xCreate(pTokCtx, azArg, nArg, &pTab->pTok);
217     }
218   }
219 
220   if( rc!=SQLITE_OK ){
221     sqlite3_free(pTab);
222     pTab = 0;
223   }
224 
225   *ppVtab = (sqlite3_vtab*)pTab;
226   sqlite3_free(azDequote);
227   return rc;
228 }
229 
230 /*
231 ** This function does the work for both the xDisconnect and xDestroy methods.
232 ** These tables have no persistent representation of their own, so xDisconnect
233 ** and xDestroy are identical operations.
234 */
fts5tokDisconnectMethod(sqlite3_vtab * pVtab)235 static int fts5tokDisconnectMethod(sqlite3_vtab *pVtab){
236   Fts5tokTable *pTab = (Fts5tokTable *)pVtab;
237   if( pTab->pTok ){
238     pTab->tok.xDelete(pTab->pTok);
239   }
240   sqlite3_free(pTab);
241   return SQLITE_OK;
242 }
243 
244 /*
245 ** xBestIndex - Analyze a WHERE and ORDER BY clause.
246 */
fts5tokBestIndexMethod(sqlite3_vtab * pVTab,sqlite3_index_info * pInfo)247 static int fts5tokBestIndexMethod(
248   sqlite3_vtab *pVTab,
249   sqlite3_index_info *pInfo
250 ){
251   int i;
252 
253   for(i=0; i<pInfo->nConstraint; i++){
254     if( pInfo->aConstraint[i].usable
255      && pInfo->aConstraint[i].iColumn==0
256      && pInfo->aConstraint[i].op==SQLITE_INDEX_CONSTRAINT_EQ
257     ){
258       pInfo->idxNum = 1;
259       pInfo->aConstraintUsage[i].argvIndex = 1;
260       pInfo->aConstraintUsage[i].omit = 1;
261       pInfo->estimatedCost = 1;
262       return SQLITE_OK;
263     }
264   }
265 
266   pInfo->idxNum = 0;
267   assert( pInfo->estimatedCost>1000000.0 );
268 
269   return SQLITE_OK;
270 }
271 
272 /*
273 ** xOpen - Open a cursor.
274 */
fts5tokOpenMethod(sqlite3_vtab * pVTab,sqlite3_vtab_cursor ** ppCsr)275 static int fts5tokOpenMethod(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCsr){
276   Fts5tokCursor *pCsr;
277 
278   pCsr = (Fts5tokCursor *)sqlite3_malloc(sizeof(Fts5tokCursor));
279   if( pCsr==0 ){
280     return SQLITE_NOMEM;
281   }
282   memset(pCsr, 0, sizeof(Fts5tokCursor));
283 
284   *ppCsr = (sqlite3_vtab_cursor *)pCsr;
285   return SQLITE_OK;
286 }
287 
288 /*
289 ** Reset the tokenizer cursor passed as the only argument. As if it had
290 ** just been returned by fts5tokOpenMethod().
291 */
fts5tokResetCursor(Fts5tokCursor * pCsr)292 static void fts5tokResetCursor(Fts5tokCursor *pCsr){
293   int i;
294   for(i=0; i<pCsr->nRow; i++){
295     sqlite3_free(pCsr->aRow[i].zToken);
296   }
297   sqlite3_free(pCsr->zInput);
298   sqlite3_free(pCsr->aRow);
299   pCsr->zInput = 0;
300   pCsr->aRow = 0;
301   pCsr->nRow = 0;
302   pCsr->iRowid = 0;
303 }
304 
305 /*
306 ** xClose - Close a cursor.
307 */
fts5tokCloseMethod(sqlite3_vtab_cursor * pCursor)308 static int fts5tokCloseMethod(sqlite3_vtab_cursor *pCursor){
309   Fts5tokCursor *pCsr = (Fts5tokCursor *)pCursor;
310   fts5tokResetCursor(pCsr);
311   sqlite3_free(pCsr);
312   return SQLITE_OK;
313 }
314 
315 /*
316 ** xNext - Advance the cursor to the next row, if any.
317 */
fts5tokNextMethod(sqlite3_vtab_cursor * pCursor)318 static int fts5tokNextMethod(sqlite3_vtab_cursor *pCursor){
319   Fts5tokCursor *pCsr = (Fts5tokCursor *)pCursor;
320   pCsr->iRowid++;
321   return SQLITE_OK;
322 }
323 
fts5tokCb(void * pCtx,int tflags,const char * pToken,int nToken,int iStart,int iEnd)324 static int fts5tokCb(
325   void *pCtx,         /* Pointer to Fts5tokCursor */
326   int tflags,         /* Mask of FTS5_TOKEN_* flags */
327   const char *pToken, /* Pointer to buffer containing token */
328   int nToken,         /* Size of token in bytes */
329   int iStart,         /* Byte offset of token within input text */
330   int iEnd            /* Byte offset of end of token within input text */
331 ){
332   Fts5tokCursor *pCsr = (Fts5tokCursor*)pCtx;
333   Fts5tokRow *pRow;
334 
335   if( (pCsr->nRow & (pCsr->nRow-1))==0 ){
336     int nNew = pCsr->nRow ? pCsr->nRow*2 : 32;
337     Fts5tokRow *aNew;
338     aNew = (Fts5tokRow*)sqlite3_realloc(pCsr->aRow, nNew*sizeof(Fts5tokRow));
339     if( aNew==0 ) return SQLITE_NOMEM;
340     memset(&aNew[pCsr->nRow], 0, sizeof(Fts5tokRow)*(nNew-pCsr->nRow));
341     pCsr->aRow = aNew;
342   }
343 
344   pRow = &pCsr->aRow[pCsr->nRow];
345   pRow->iStart = iStart;
346   pRow->iEnd = iEnd;
347   if( pCsr->nRow ){
348     pRow->iPos = pRow[-1].iPos + ((tflags & FTS5_TOKEN_COLOCATED) ? 0 : 1);
349   }
350   pRow->zToken = sqlite3_malloc(nToken+1);
351   if( pRow->zToken==0 ) return SQLITE_NOMEM;
352   memcpy(pRow->zToken, pToken, nToken);
353   pRow->zToken[nToken] = 0;
354   pCsr->nRow++;
355 
356   return SQLITE_OK;
357 }
358 
359 /*
360 ** xFilter - Initialize a cursor to point at the start of its data.
361 */
fts5tokFilterMethod(sqlite3_vtab_cursor * pCursor,int idxNum,const char * idxStr,int nVal,sqlite3_value ** apVal)362 static int fts5tokFilterMethod(
363   sqlite3_vtab_cursor *pCursor,   /* The cursor used for this query */
364   int idxNum,                     /* Strategy index */
365   const char *idxStr,             /* Unused */
366   int nVal,                       /* Number of elements in apVal */
367   sqlite3_value **apVal           /* Arguments for the indexing scheme */
368 ){
369   int rc = SQLITE_ERROR;
370   Fts5tokCursor *pCsr = (Fts5tokCursor *)pCursor;
371   Fts5tokTable *pTab = (Fts5tokTable *)(pCursor->pVtab);
372 
373   fts5tokResetCursor(pCsr);
374   if( idxNum==1 ){
375     const char *zByte = (const char *)sqlite3_value_text(apVal[0]);
376     int nByte = sqlite3_value_bytes(apVal[0]);
377     pCsr->zInput = sqlite3_malloc(nByte+1);
378     if( pCsr->zInput==0 ){
379       rc = SQLITE_NOMEM;
380     }else{
381       memcpy(pCsr->zInput, zByte, nByte);
382       pCsr->zInput[nByte] = 0;
383       rc = pTab->tok.xTokenize(
384           pTab->pTok, (void*)pCsr, 0, zByte, nByte, fts5tokCb
385       );
386     }
387   }
388 
389   if( rc!=SQLITE_OK ) return rc;
390   return fts5tokNextMethod(pCursor);
391 }
392 
393 /*
394 ** xEof - Return true if the cursor is at EOF, or false otherwise.
395 */
fts5tokEofMethod(sqlite3_vtab_cursor * pCursor)396 static int fts5tokEofMethod(sqlite3_vtab_cursor *pCursor){
397   Fts5tokCursor *pCsr = (Fts5tokCursor *)pCursor;
398   return (pCsr->iRowid>pCsr->nRow);
399 }
400 
401 /*
402 ** xColumn - Return a column value.
403 */
fts5tokColumnMethod(sqlite3_vtab_cursor * pCursor,sqlite3_context * pCtx,int iCol)404 static int fts5tokColumnMethod(
405   sqlite3_vtab_cursor *pCursor,   /* Cursor to retrieve value from */
406   sqlite3_context *pCtx,          /* Context for sqlite3_result_xxx() calls */
407   int iCol                        /* Index of column to read value from */
408 ){
409   Fts5tokCursor *pCsr = (Fts5tokCursor *)pCursor;
410   Fts5tokRow *pRow = &pCsr->aRow[pCsr->iRowid-1];
411 
412   /* CREATE TABLE x(input, token, start, end, position) */
413   switch( iCol ){
414     case 0:
415       sqlite3_result_text(pCtx, pCsr->zInput, -1, SQLITE_TRANSIENT);
416       break;
417     case 1:
418       sqlite3_result_text(pCtx, pRow->zToken, -1, SQLITE_TRANSIENT);
419       break;
420     case 2:
421       sqlite3_result_int(pCtx, pRow->iStart);
422       break;
423     case 3:
424       sqlite3_result_int(pCtx, pRow->iEnd);
425       break;
426     default:
427       assert( iCol==4 );
428       sqlite3_result_int(pCtx, pRow->iPos);
429       break;
430   }
431   return SQLITE_OK;
432 }
433 
434 /*
435 ** xRowid - Return the current rowid for the cursor.
436 */
fts5tokRowidMethod(sqlite3_vtab_cursor * pCursor,sqlite_int64 * pRowid)437 static int fts5tokRowidMethod(
438   sqlite3_vtab_cursor *pCursor,   /* Cursor to retrieve value from */
439   sqlite_int64 *pRowid            /* OUT: Rowid value */
440 ){
441   Fts5tokCursor *pCsr = (Fts5tokCursor *)pCursor;
442   *pRowid = (sqlite3_int64)pCsr->iRowid;
443   return SQLITE_OK;
444 }
445 
446 /*
447 ** Register the fts5tok module with database connection db. Return SQLITE_OK
448 ** if successful or an error code if sqlite3_create_module() fails.
449 */
sqlite3Fts5TestRegisterTok(sqlite3 * db,fts5_api * pApi)450 int sqlite3Fts5TestRegisterTok(sqlite3 *db, fts5_api *pApi){
451   static const sqlite3_module fts5tok_module = {
452      0,                           /* iVersion      */
453      fts5tokConnectMethod,        /* xCreate       */
454      fts5tokConnectMethod,        /* xConnect      */
455      fts5tokBestIndexMethod,      /* xBestIndex    */
456      fts5tokDisconnectMethod,     /* xDisconnect   */
457      fts5tokDisconnectMethod,     /* xDestroy      */
458      fts5tokOpenMethod,           /* xOpen         */
459      fts5tokCloseMethod,          /* xClose        */
460      fts5tokFilterMethod,         /* xFilter       */
461      fts5tokNextMethod,           /* xNext         */
462      fts5tokEofMethod,            /* xEof          */
463      fts5tokColumnMethod,         /* xColumn       */
464      fts5tokRowidMethod,          /* xRowid        */
465      0,                           /* xUpdate       */
466      0,                           /* xBegin        */
467      0,                           /* xSync         */
468      0,                           /* xCommit       */
469      0,                           /* xRollback     */
470      0,                           /* xFindFunction */
471      0,                           /* xRename       */
472      0,                           /* xSavepoint    */
473      0,                           /* xRelease      */
474      0                            /* xRollbackTo   */
475   };
476   int rc;                         /* Return code */
477 
478   rc = sqlite3_create_module(db, "fts5tokenize", &fts5tok_module, (void*)pApi);
479   return rc;
480 }
481 
482 #endif /* defined(SQLITE_TEST) && defined(SQLITE_ENABLE_FTS5) */
483