1 /*
2 ** 2014 May 31
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 ******************************************************************************
12 */
13 
14 
15 #include "fts5Int.h"
16 #include <math.h>                 /* amalgamator: keep */
17 
18 /*
19 ** Object used to iterate through all "coalesced phrase instances" in
20 ** a single column of the current row. If the phrase instances in the
21 ** column being considered do not overlap, this object simply iterates
22 ** through them. Or, if they do overlap (share one or more tokens in
23 ** common), each set of overlapping instances is treated as a single
24 ** match. See documentation for the highlight() auxiliary function for
25 ** details.
26 **
27 ** Usage is:
28 **
29 **   for(rc = fts5CInstIterNext(pApi, pFts, iCol, &iter);
30 **      (rc==SQLITE_OK && 0==fts5CInstIterEof(&iter);
31 **      rc = fts5CInstIterNext(&iter)
32 **   ){
33 **     printf("instance starts at %d, ends at %d\n", iter.iStart, iter.iEnd);
34 **   }
35 **
36 */
37 typedef struct CInstIter CInstIter;
38 struct CInstIter {
39   const Fts5ExtensionApi *pApi;   /* API offered by current FTS version */
40   Fts5Context *pFts;              /* First arg to pass to pApi functions */
41   int iCol;                       /* Column to search */
42   int iInst;                      /* Next phrase instance index */
43   int nInst;                      /* Total number of phrase instances */
44 
45   /* Output variables */
46   int iStart;                     /* First token in coalesced phrase instance */
47   int iEnd;                       /* Last token in coalesced phrase instance */
48 };
49 
50 /*
51 ** Advance the iterator to the next coalesced phrase instance. Return
52 ** an SQLite error code if an error occurs, or SQLITE_OK otherwise.
53 */
fts5CInstIterNext(CInstIter * pIter)54 static int fts5CInstIterNext(CInstIter *pIter){
55   int rc = SQLITE_OK;
56   pIter->iStart = -1;
57   pIter->iEnd = -1;
58 
59   while( rc==SQLITE_OK && pIter->iInst<pIter->nInst ){
60     int ip; int ic; int io;
61     rc = pIter->pApi->xInst(pIter->pFts, pIter->iInst, &ip, &ic, &io);
62     if( rc==SQLITE_OK ){
63       if( ic==pIter->iCol ){
64         int iEnd = io - 1 + pIter->pApi->xPhraseSize(pIter->pFts, ip);
65         if( pIter->iStart<0 ){
66           pIter->iStart = io;
67           pIter->iEnd = iEnd;
68         }else if( io<=pIter->iEnd ){
69           if( iEnd>pIter->iEnd ) pIter->iEnd = iEnd;
70         }else{
71           break;
72         }
73       }
74       pIter->iInst++;
75     }
76   }
77 
78   return rc;
79 }
80 
81 /*
82 ** Initialize the iterator object indicated by the final parameter to
83 ** iterate through coalesced phrase instances in column iCol.
84 */
fts5CInstIterInit(const Fts5ExtensionApi * pApi,Fts5Context * pFts,int iCol,CInstIter * pIter)85 static int fts5CInstIterInit(
86   const Fts5ExtensionApi *pApi,
87   Fts5Context *pFts,
88   int iCol,
89   CInstIter *pIter
90 ){
91   int rc;
92 
93   memset(pIter, 0, sizeof(CInstIter));
94   pIter->pApi = pApi;
95   pIter->pFts = pFts;
96   pIter->iCol = iCol;
97   rc = pApi->xInstCount(pFts, &pIter->nInst);
98 
99   if( rc==SQLITE_OK ){
100     rc = fts5CInstIterNext(pIter);
101   }
102 
103   return rc;
104 }
105 
106 
107 
108 /*************************************************************************
109 ** Start of highlight() implementation.
110 */
111 typedef struct HighlightContext HighlightContext;
112 struct HighlightContext {
113   CInstIter iter;                 /* Coalesced Instance Iterator */
114   int iPos;                       /* Current token offset in zIn[] */
115   int iRangeStart;                /* First token to include */
116   int iRangeEnd;                  /* If non-zero, last token to include */
117   const char *zOpen;              /* Opening highlight */
118   const char *zClose;             /* Closing highlight */
119   const char *zIn;                /* Input text */
120   int nIn;                        /* Size of input text in bytes */
121   int iOff;                       /* Current offset within zIn[] */
122   char *zOut;                     /* Output value */
123 };
124 
125 /*
126 ** Append text to the HighlightContext output string - p->zOut. Argument
127 ** z points to a buffer containing n bytes of text to append. If n is
128 ** negative, everything up until the first '\0' is appended to the output.
129 **
130 ** If *pRc is set to any value other than SQLITE_OK when this function is
131 ** called, it is a no-op. If an error (i.e. an OOM condition) is encountered,
132 ** *pRc is set to an error code before returning.
133 */
fts5HighlightAppend(int * pRc,HighlightContext * p,const char * z,int n)134 static void fts5HighlightAppend(
135   int *pRc,
136   HighlightContext *p,
137   const char *z, int n
138 ){
139   if( *pRc==SQLITE_OK && z ){
140     if( n<0 ) n = (int)strlen(z);
141     p->zOut = sqlite3_mprintf("%z%.*s", p->zOut, n, z);
142     if( p->zOut==0 ) *pRc = SQLITE_NOMEM;
143   }
144 }
145 
146 /*
147 ** Tokenizer callback used by implementation of highlight() function.
148 */
fts5HighlightCb(void * pContext,int tflags,const char * pToken,int nToken,int iStartOff,int iEndOff)149 static int fts5HighlightCb(
150   void *pContext,                 /* Pointer to HighlightContext object */
151   int tflags,                     /* Mask of FTS5_TOKEN_* flags */
152   const char *pToken,             /* Buffer containing token */
153   int nToken,                     /* Size of token in bytes */
154   int iStartOff,                  /* Start offset of token */
155   int iEndOff                     /* End offset of token */
156 ){
157   HighlightContext *p = (HighlightContext*)pContext;
158   int rc = SQLITE_OK;
159   int iPos;
160 
161   UNUSED_PARAM2(pToken, nToken);
162 
163   if( tflags & FTS5_TOKEN_COLOCATED ) return SQLITE_OK;
164   iPos = p->iPos++;
165 
166   if( p->iRangeEnd>0 ){
167     if( iPos<p->iRangeStart || iPos>p->iRangeEnd ) return SQLITE_OK;
168     if( p->iRangeStart && iPos==p->iRangeStart ) p->iOff = iStartOff;
169   }
170 
171   if( iPos==p->iter.iStart ){
172     fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iStartOff - p->iOff);
173     fts5HighlightAppend(&rc, p, p->zOpen, -1);
174     p->iOff = iStartOff;
175   }
176 
177   if( iPos==p->iter.iEnd ){
178     if( p->iRangeEnd && p->iter.iStart<p->iRangeStart ){
179       fts5HighlightAppend(&rc, p, p->zOpen, -1);
180     }
181     fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff);
182     fts5HighlightAppend(&rc, p, p->zClose, -1);
183     p->iOff = iEndOff;
184     if( rc==SQLITE_OK ){
185       rc = fts5CInstIterNext(&p->iter);
186     }
187   }
188 
189   if( p->iRangeEnd>0 && iPos==p->iRangeEnd ){
190     fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff);
191     p->iOff = iEndOff;
192     if( iPos>=p->iter.iStart && iPos<p->iter.iEnd ){
193       fts5HighlightAppend(&rc, p, p->zClose, -1);
194     }
195   }
196 
197   return rc;
198 }
199 
200 /*
201 ** Implementation of highlight() function.
202 */
fts5HighlightFunction(const Fts5ExtensionApi * pApi,Fts5Context * pFts,sqlite3_context * pCtx,int nVal,sqlite3_value ** apVal)203 static void fts5HighlightFunction(
204   const Fts5ExtensionApi *pApi,   /* API offered by current FTS version */
205   Fts5Context *pFts,              /* First arg to pass to pApi functions */
206   sqlite3_context *pCtx,          /* Context for returning result/error */
207   int nVal,                       /* Number of values in apVal[] array */
208   sqlite3_value **apVal           /* Array of trailing arguments */
209 ){
210   HighlightContext ctx;
211   int rc;
212   int iCol;
213 
214   if( nVal!=3 ){
215     const char *zErr = "wrong number of arguments to function highlight()";
216     sqlite3_result_error(pCtx, zErr, -1);
217     return;
218   }
219 
220   iCol = sqlite3_value_int(apVal[0]);
221   memset(&ctx, 0, sizeof(HighlightContext));
222   ctx.zOpen = (const char*)sqlite3_value_text(apVal[1]);
223   ctx.zClose = (const char*)sqlite3_value_text(apVal[2]);
224   rc = pApi->xColumnText(pFts, iCol, &ctx.zIn, &ctx.nIn);
225 
226   if( ctx.zIn ){
227     if( rc==SQLITE_OK ){
228       rc = fts5CInstIterInit(pApi, pFts, iCol, &ctx.iter);
229     }
230 
231     if( rc==SQLITE_OK ){
232       rc = pApi->xTokenize(pFts, ctx.zIn, ctx.nIn, (void*)&ctx,fts5HighlightCb);
233     }
234     fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff);
235 
236     if( rc==SQLITE_OK ){
237       sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT);
238     }
239     sqlite3_free(ctx.zOut);
240   }
241   if( rc!=SQLITE_OK ){
242     sqlite3_result_error_code(pCtx, rc);
243   }
244 }
245 /*
246 ** End of highlight() implementation.
247 **************************************************************************/
248 
249 /*
250 ** Context object passed to the fts5SentenceFinderCb() function.
251 */
252 typedef struct Fts5SFinder Fts5SFinder;
253 struct Fts5SFinder {
254   int iPos;                       /* Current token position */
255   int nFirstAlloc;                /* Allocated size of aFirst[] */
256   int nFirst;                     /* Number of entries in aFirst[] */
257   int *aFirst;                    /* Array of first token in each sentence */
258   const char *zDoc;               /* Document being tokenized */
259 };
260 
261 /*
262 ** Add an entry to the Fts5SFinder.aFirst[] array. Grow the array if
263 ** necessary. Return SQLITE_OK if successful, or SQLITE_NOMEM if an
264 ** error occurs.
265 */
fts5SentenceFinderAdd(Fts5SFinder * p,int iAdd)266 static int fts5SentenceFinderAdd(Fts5SFinder *p, int iAdd){
267   if( p->nFirstAlloc==p->nFirst ){
268     int nNew = p->nFirstAlloc ? p->nFirstAlloc*2 : 64;
269     int *aNew;
270 
271     aNew = (int*)sqlite3_realloc64(p->aFirst, nNew*sizeof(int));
272     if( aNew==0 ) return SQLITE_NOMEM;
273     p->aFirst = aNew;
274     p->nFirstAlloc = nNew;
275   }
276   p->aFirst[p->nFirst++] = iAdd;
277   return SQLITE_OK;
278 }
279 
280 /*
281 ** This function is an xTokenize() callback used by the auxiliary snippet()
282 ** function. Its job is to identify tokens that are the first in a sentence.
283 ** For each such token, an entry is added to the SFinder.aFirst[] array.
284 */
fts5SentenceFinderCb(void * pContext,int tflags,const char * pToken,int nToken,int iStartOff,int iEndOff)285 static int fts5SentenceFinderCb(
286   void *pContext,                 /* Pointer to HighlightContext object */
287   int tflags,                     /* Mask of FTS5_TOKEN_* flags */
288   const char *pToken,             /* Buffer containing token */
289   int nToken,                     /* Size of token in bytes */
290   int iStartOff,                  /* Start offset of token */
291   int iEndOff                     /* End offset of token */
292 ){
293   int rc = SQLITE_OK;
294 
295   UNUSED_PARAM2(pToken, nToken);
296   UNUSED_PARAM(iEndOff);
297 
298   if( (tflags & FTS5_TOKEN_COLOCATED)==0 ){
299     Fts5SFinder *p = (Fts5SFinder*)pContext;
300     if( p->iPos>0 ){
301       int i;
302       char c = 0;
303       for(i=iStartOff-1; i>=0; i--){
304         c = p->zDoc[i];
305         if( c!=' ' && c!='\t' && c!='\n' && c!='\r' ) break;
306       }
307       if( i!=iStartOff-1 && (c=='.' || c==':') ){
308         rc = fts5SentenceFinderAdd(p, p->iPos);
309       }
310     }else{
311       rc = fts5SentenceFinderAdd(p, 0);
312     }
313     p->iPos++;
314   }
315   return rc;
316 }
317 
fts5SnippetScore(const Fts5ExtensionApi * pApi,Fts5Context * pFts,int nDocsize,unsigned char * aSeen,int iCol,int iPos,int nToken,int * pnScore,int * piPos)318 static int fts5SnippetScore(
319   const Fts5ExtensionApi *pApi,   /* API offered by current FTS version */
320   Fts5Context *pFts,              /* First arg to pass to pApi functions */
321   int nDocsize,                   /* Size of column in tokens */
322   unsigned char *aSeen,           /* Array with one element per query phrase */
323   int iCol,                       /* Column to score */
324   int iPos,                       /* Starting offset to score */
325   int nToken,                     /* Max tokens per snippet */
326   int *pnScore,                   /* OUT: Score */
327   int *piPos                      /* OUT: Adjusted offset */
328 ){
329   int rc;
330   int i;
331   int ip = 0;
332   int ic = 0;
333   int iOff = 0;
334   int iFirst = -1;
335   int nInst;
336   int nScore = 0;
337   int iLast = 0;
338   sqlite3_int64 iEnd = (sqlite3_int64)iPos + nToken;
339 
340   rc = pApi->xInstCount(pFts, &nInst);
341   for(i=0; i<nInst && rc==SQLITE_OK; i++){
342     rc = pApi->xInst(pFts, i, &ip, &ic, &iOff);
343     if( rc==SQLITE_OK && ic==iCol && iOff>=iPos && iOff<iEnd ){
344       nScore += (aSeen[ip] ? 1 : 1000);
345       aSeen[ip] = 1;
346       if( iFirst<0 ) iFirst = iOff;
347       iLast = iOff + pApi->xPhraseSize(pFts, ip);
348     }
349   }
350 
351   *pnScore = nScore;
352   if( piPos ){
353     sqlite3_int64 iAdj = iFirst - (nToken - (iLast-iFirst)) / 2;
354     if( (iAdj+nToken)>nDocsize ) iAdj = nDocsize - nToken;
355     if( iAdj<0 ) iAdj = 0;
356     *piPos = (int)iAdj;
357   }
358 
359   return rc;
360 }
361 
362 /*
363 ** Return the value in pVal interpreted as utf-8 text. Except, if pVal
364 ** contains a NULL value, return a pointer to a static string zero
365 ** bytes in length instead of a NULL pointer.
366 */
fts5ValueToText(sqlite3_value * pVal)367 static const char *fts5ValueToText(sqlite3_value *pVal){
368   const char *zRet = (const char*)sqlite3_value_text(pVal);
369   return zRet ? zRet : "";
370 }
371 
372 /*
373 ** Implementation of snippet() function.
374 */
fts5SnippetFunction(const Fts5ExtensionApi * pApi,Fts5Context * pFts,sqlite3_context * pCtx,int nVal,sqlite3_value ** apVal)375 static void fts5SnippetFunction(
376   const Fts5ExtensionApi *pApi,   /* API offered by current FTS version */
377   Fts5Context *pFts,              /* First arg to pass to pApi functions */
378   sqlite3_context *pCtx,          /* Context for returning result/error */
379   int nVal,                       /* Number of values in apVal[] array */
380   sqlite3_value **apVal           /* Array of trailing arguments */
381 ){
382   HighlightContext ctx;
383   int rc = SQLITE_OK;             /* Return code */
384   int iCol;                       /* 1st argument to snippet() */
385   const char *zEllips;            /* 4th argument to snippet() */
386   int nToken;                     /* 5th argument to snippet() */
387   int nInst = 0;                  /* Number of instance matches this row */
388   int i;                          /* Used to iterate through instances */
389   int nPhrase;                    /* Number of phrases in query */
390   unsigned char *aSeen;           /* Array of "seen instance" flags */
391   int iBestCol;                   /* Column containing best snippet */
392   int iBestStart = 0;             /* First token of best snippet */
393   int nBestScore = 0;             /* Score of best snippet */
394   int nColSize = 0;               /* Total size of iBestCol in tokens */
395   Fts5SFinder sFinder;            /* Used to find the beginnings of sentences */
396   int nCol;
397 
398   if( nVal!=5 ){
399     const char *zErr = "wrong number of arguments to function snippet()";
400     sqlite3_result_error(pCtx, zErr, -1);
401     return;
402   }
403 
404   nCol = pApi->xColumnCount(pFts);
405   memset(&ctx, 0, sizeof(HighlightContext));
406   iCol = sqlite3_value_int(apVal[0]);
407   ctx.zOpen = fts5ValueToText(apVal[1]);
408   ctx.zClose = fts5ValueToText(apVal[2]);
409   zEllips = fts5ValueToText(apVal[3]);
410   nToken = sqlite3_value_int(apVal[4]);
411 
412   iBestCol = (iCol>=0 ? iCol : 0);
413   nPhrase = pApi->xPhraseCount(pFts);
414   aSeen = sqlite3_malloc(nPhrase);
415   if( aSeen==0 ){
416     rc = SQLITE_NOMEM;
417   }
418   if( rc==SQLITE_OK ){
419     rc = pApi->xInstCount(pFts, &nInst);
420   }
421 
422   memset(&sFinder, 0, sizeof(Fts5SFinder));
423   for(i=0; i<nCol; i++){
424     if( iCol<0 || iCol==i ){
425       int nDoc;
426       int nDocsize;
427       int ii;
428       sFinder.iPos = 0;
429       sFinder.nFirst = 0;
430       rc = pApi->xColumnText(pFts, i, &sFinder.zDoc, &nDoc);
431       if( rc!=SQLITE_OK ) break;
432       rc = pApi->xTokenize(pFts,
433           sFinder.zDoc, nDoc, (void*)&sFinder,fts5SentenceFinderCb
434       );
435       if( rc!=SQLITE_OK ) break;
436       rc = pApi->xColumnSize(pFts, i, &nDocsize);
437       if( rc!=SQLITE_OK ) break;
438 
439       for(ii=0; rc==SQLITE_OK && ii<nInst; ii++){
440         int ip, ic, io;
441         int iAdj;
442         int nScore;
443         int jj;
444 
445         rc = pApi->xInst(pFts, ii, &ip, &ic, &io);
446         if( ic!=i ) continue;
447         if( io>nDocsize ) rc = FTS5_CORRUPT;
448         if( rc!=SQLITE_OK ) continue;
449         memset(aSeen, 0, nPhrase);
450         rc = fts5SnippetScore(pApi, pFts, nDocsize, aSeen, i,
451             io, nToken, &nScore, &iAdj
452         );
453         if( rc==SQLITE_OK && nScore>nBestScore ){
454           nBestScore = nScore;
455           iBestCol = i;
456           iBestStart = iAdj;
457           nColSize = nDocsize;
458         }
459 
460         if( rc==SQLITE_OK && sFinder.nFirst && nDocsize>nToken ){
461           for(jj=0; jj<(sFinder.nFirst-1); jj++){
462             if( sFinder.aFirst[jj+1]>io ) break;
463           }
464 
465           if( sFinder.aFirst[jj]<io ){
466             memset(aSeen, 0, nPhrase);
467             rc = fts5SnippetScore(pApi, pFts, nDocsize, aSeen, i,
468               sFinder.aFirst[jj], nToken, &nScore, 0
469             );
470 
471             nScore += (sFinder.aFirst[jj]==0 ? 120 : 100);
472             if( rc==SQLITE_OK && nScore>nBestScore ){
473               nBestScore = nScore;
474               iBestCol = i;
475               iBestStart = sFinder.aFirst[jj];
476               nColSize = nDocsize;
477             }
478           }
479         }
480       }
481     }
482   }
483 
484   if( rc==SQLITE_OK ){
485     rc = pApi->xColumnText(pFts, iBestCol, &ctx.zIn, &ctx.nIn);
486   }
487   if( rc==SQLITE_OK && nColSize==0 ){
488     rc = pApi->xColumnSize(pFts, iBestCol, &nColSize);
489   }
490   if( ctx.zIn ){
491     if( rc==SQLITE_OK ){
492       rc = fts5CInstIterInit(pApi, pFts, iBestCol, &ctx.iter);
493     }
494 
495     ctx.iRangeStart = iBestStart;
496     ctx.iRangeEnd = iBestStart + nToken - 1;
497 
498     if( iBestStart>0 ){
499       fts5HighlightAppend(&rc, &ctx, zEllips, -1);
500     }
501 
502     /* Advance iterator ctx.iter so that it points to the first coalesced
503     ** phrase instance at or following position iBestStart. */
504     while( ctx.iter.iStart>=0 && ctx.iter.iStart<iBestStart && rc==SQLITE_OK ){
505       rc = fts5CInstIterNext(&ctx.iter);
506     }
507 
508     if( rc==SQLITE_OK ){
509       rc = pApi->xTokenize(pFts, ctx.zIn, ctx.nIn, (void*)&ctx,fts5HighlightCb);
510     }
511     if( ctx.iRangeEnd>=(nColSize-1) ){
512       fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff);
513     }else{
514       fts5HighlightAppend(&rc, &ctx, zEllips, -1);
515     }
516   }
517   if( rc==SQLITE_OK ){
518     sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT);
519   }else{
520     sqlite3_result_error_code(pCtx, rc);
521   }
522   sqlite3_free(ctx.zOut);
523   sqlite3_free(aSeen);
524   sqlite3_free(sFinder.aFirst);
525 }
526 
527 /************************************************************************/
528 
529 /*
530 ** The first time the bm25() function is called for a query, an instance
531 ** of the following structure is allocated and populated.
532 */
533 typedef struct Fts5Bm25Data Fts5Bm25Data;
534 struct Fts5Bm25Data {
535   int nPhrase;                    /* Number of phrases in query */
536   double avgdl;                   /* Average number of tokens in each row */
537   double *aIDF;                   /* IDF for each phrase */
538   double *aFreq;                  /* Array used to calculate phrase freq. */
539 };
540 
541 /*
542 ** Callback used by fts5Bm25GetData() to count the number of rows in the
543 ** table matched by each individual phrase within the query.
544 */
fts5CountCb(const Fts5ExtensionApi * pApi,Fts5Context * pFts,void * pUserData)545 static int fts5CountCb(
546   const Fts5ExtensionApi *pApi,
547   Fts5Context *pFts,
548   void *pUserData                 /* Pointer to sqlite3_int64 variable */
549 ){
550   sqlite3_int64 *pn = (sqlite3_int64*)pUserData;
551   UNUSED_PARAM2(pApi, pFts);
552   (*pn)++;
553   return SQLITE_OK;
554 }
555 
556 /*
557 ** Set *ppData to point to the Fts5Bm25Data object for the current query.
558 ** If the object has not already been allocated, allocate and populate it
559 ** now.
560 */
fts5Bm25GetData(const Fts5ExtensionApi * pApi,Fts5Context * pFts,Fts5Bm25Data ** ppData)561 static int fts5Bm25GetData(
562   const Fts5ExtensionApi *pApi,
563   Fts5Context *pFts,
564   Fts5Bm25Data **ppData           /* OUT: bm25-data object for this query */
565 ){
566   int rc = SQLITE_OK;             /* Return code */
567   Fts5Bm25Data *p;                /* Object to return */
568 
569   p = pApi->xGetAuxdata(pFts, 0);
570   if( p==0 ){
571     int nPhrase;                  /* Number of phrases in query */
572     sqlite3_int64 nRow = 0;       /* Number of rows in table */
573     sqlite3_int64 nToken = 0;     /* Number of tokens in table */
574     sqlite3_int64 nByte;          /* Bytes of space to allocate */
575     int i;
576 
577     /* Allocate the Fts5Bm25Data object */
578     nPhrase = pApi->xPhraseCount(pFts);
579     nByte = sizeof(Fts5Bm25Data) + nPhrase*2*sizeof(double);
580     p = (Fts5Bm25Data*)sqlite3_malloc64(nByte);
581     if( p==0 ){
582       rc = SQLITE_NOMEM;
583     }else{
584       memset(p, 0, (size_t)nByte);
585       p->nPhrase = nPhrase;
586       p->aIDF = (double*)&p[1];
587       p->aFreq = &p->aIDF[nPhrase];
588     }
589 
590     /* Calculate the average document length for this FTS5 table */
591     if( rc==SQLITE_OK ) rc = pApi->xRowCount(pFts, &nRow);
592     assert( rc!=SQLITE_OK || nRow>0 );
593     if( rc==SQLITE_OK ) rc = pApi->xColumnTotalSize(pFts, -1, &nToken);
594     if( rc==SQLITE_OK ) p->avgdl = (double)nToken  / (double)nRow;
595 
596     /* Calculate an IDF for each phrase in the query */
597     for(i=0; rc==SQLITE_OK && i<nPhrase; i++){
598       sqlite3_int64 nHit = 0;
599       rc = pApi->xQueryPhrase(pFts, i, (void*)&nHit, fts5CountCb);
600       if( rc==SQLITE_OK ){
601         /* Calculate the IDF (Inverse Document Frequency) for phrase i.
602         ** This is done using the standard BM25 formula as found on wikipedia:
603         **
604         **   IDF = log( (N - nHit + 0.5) / (nHit + 0.5) )
605         **
606         ** where "N" is the total number of documents in the set and nHit
607         ** is the number that contain at least one instance of the phrase
608         ** under consideration.
609         **
610         ** The problem with this is that if (N < 2*nHit), the IDF is
611         ** negative. Which is undesirable. So the mimimum allowable IDF is
612         ** (1e-6) - roughly the same as a term that appears in just over
613         ** half of set of 5,000,000 documents.  */
614         double idf = log( (nRow - nHit + 0.5) / (nHit + 0.5) );
615         if( idf<=0.0 ) idf = 1e-6;
616         p->aIDF[i] = idf;
617       }
618     }
619 
620     if( rc!=SQLITE_OK ){
621       sqlite3_free(p);
622     }else{
623       rc = pApi->xSetAuxdata(pFts, p, sqlite3_free);
624     }
625     if( rc!=SQLITE_OK ) p = 0;
626   }
627   *ppData = p;
628   return rc;
629 }
630 
631 /*
632 ** Implementation of bm25() function.
633 */
fts5Bm25Function(const Fts5ExtensionApi * pApi,Fts5Context * pFts,sqlite3_context * pCtx,int nVal,sqlite3_value ** apVal)634 static void fts5Bm25Function(
635   const Fts5ExtensionApi *pApi,   /* API offered by current FTS version */
636   Fts5Context *pFts,              /* First arg to pass to pApi functions */
637   sqlite3_context *pCtx,          /* Context for returning result/error */
638   int nVal,                       /* Number of values in apVal[] array */
639   sqlite3_value **apVal           /* Array of trailing arguments */
640 ){
641   const double k1 = 1.2;          /* Constant "k1" from BM25 formula */
642   const double b = 0.75;          /* Constant "b" from BM25 formula */
643   int rc = SQLITE_OK;             /* Error code */
644   double score = 0.0;             /* SQL function return value */
645   Fts5Bm25Data *pData;            /* Values allocated/calculated once only */
646   int i;                          /* Iterator variable */
647   int nInst = 0;                  /* Value returned by xInstCount() */
648   double D = 0.0;                 /* Total number of tokens in row */
649   double *aFreq = 0;              /* Array of phrase freq. for current row */
650 
651   /* Calculate the phrase frequency (symbol "f(qi,D)" in the documentation)
652   ** for each phrase in the query for the current row. */
653   rc = fts5Bm25GetData(pApi, pFts, &pData);
654   if( rc==SQLITE_OK ){
655     aFreq = pData->aFreq;
656     memset(aFreq, 0, sizeof(double) * pData->nPhrase);
657     rc = pApi->xInstCount(pFts, &nInst);
658   }
659   for(i=0; rc==SQLITE_OK && i<nInst; i++){
660     int ip; int ic; int io;
661     rc = pApi->xInst(pFts, i, &ip, &ic, &io);
662     if( rc==SQLITE_OK ){
663       double w = (nVal > ic) ? sqlite3_value_double(apVal[ic]) : 1.0;
664       aFreq[ip] += w;
665     }
666   }
667 
668   /* Figure out the total size of the current row in tokens. */
669   if( rc==SQLITE_OK ){
670     int nTok;
671     rc = pApi->xColumnSize(pFts, -1, &nTok);
672     D = (double)nTok;
673   }
674 
675   /* Determine the BM25 score for the current row. */
676   for(i=0; rc==SQLITE_OK && i<pData->nPhrase; i++){
677     score += pData->aIDF[i] * (
678       ( aFreq[i] * (k1 + 1.0) ) /
679       ( aFreq[i] + k1 * (1 - b + b * D / pData->avgdl) )
680     );
681   }
682 
683   /* If no error has occurred, return the calculated score. Otherwise,
684   ** throw an SQL exception.  */
685   if( rc==SQLITE_OK ){
686     sqlite3_result_double(pCtx, -1.0 * score);
687   }else{
688     sqlite3_result_error_code(pCtx, rc);
689   }
690 }
691 
sqlite3Fts5AuxInit(fts5_api * pApi)692 int sqlite3Fts5AuxInit(fts5_api *pApi){
693   struct Builtin {
694     const char *zFunc;            /* Function name (nul-terminated) */
695     void *pUserData;              /* User-data pointer */
696     fts5_extension_function xFunc;/* Callback function */
697     void (*xDestroy)(void*);      /* Destructor function */
698   } aBuiltin [] = {
699     { "snippet",   0, fts5SnippetFunction, 0 },
700     { "highlight", 0, fts5HighlightFunction, 0 },
701     { "bm25",      0, fts5Bm25Function,    0 },
702   };
703   int rc = SQLITE_OK;             /* Return code */
704   int i;                          /* To iterate through builtin functions */
705 
706   for(i=0; rc==SQLITE_OK && i<ArraySize(aBuiltin); i++){
707     rc = pApi->xCreateFunction(pApi,
708         aBuiltin[i].zFunc,
709         aBuiltin[i].pUserData,
710         aBuiltin[i].xFunc,
711         aBuiltin[i].xDestroy
712     );
713   }
714 
715   return rc;
716 }
717