1 /*
2 ** 2014 May 31
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
10 **
11 ******************************************************************************
12 */
13
14
15 #include "fts5Int.h"
16 #include <math.h> /* amalgamator: keep */
17
18 /*
19 ** Object used to iterate through all "coalesced phrase instances" in
20 ** a single column of the current row. If the phrase instances in the
21 ** column being considered do not overlap, this object simply iterates
22 ** through them. Or, if they do overlap (share one or more tokens in
23 ** common), each set of overlapping instances is treated as a single
24 ** match. See documentation for the highlight() auxiliary function for
25 ** details.
26 **
27 ** Usage is:
28 **
29 ** for(rc = fts5CInstIterNext(pApi, pFts, iCol, &iter);
30 ** (rc==SQLITE_OK && 0==fts5CInstIterEof(&iter);
31 ** rc = fts5CInstIterNext(&iter)
32 ** ){
33 ** printf("instance starts at %d, ends at %d\n", iter.iStart, iter.iEnd);
34 ** }
35 **
36 */
37 typedef struct CInstIter CInstIter;
38 struct CInstIter {
39 const Fts5ExtensionApi *pApi; /* API offered by current FTS version */
40 Fts5Context *pFts; /* First arg to pass to pApi functions */
41 int iCol; /* Column to search */
42 int iInst; /* Next phrase instance index */
43 int nInst; /* Total number of phrase instances */
44
45 /* Output variables */
46 int iStart; /* First token in coalesced phrase instance */
47 int iEnd; /* Last token in coalesced phrase instance */
48 };
49
50 /*
51 ** Advance the iterator to the next coalesced phrase instance. Return
52 ** an SQLite error code if an error occurs, or SQLITE_OK otherwise.
53 */
fts5CInstIterNext(CInstIter * pIter)54 static int fts5CInstIterNext(CInstIter *pIter){
55 int rc = SQLITE_OK;
56 pIter->iStart = -1;
57 pIter->iEnd = -1;
58
59 while( rc==SQLITE_OK && pIter->iInst<pIter->nInst ){
60 int ip; int ic; int io;
61 rc = pIter->pApi->xInst(pIter->pFts, pIter->iInst, &ip, &ic, &io);
62 if( rc==SQLITE_OK ){
63 if( ic==pIter->iCol ){
64 int iEnd = io - 1 + pIter->pApi->xPhraseSize(pIter->pFts, ip);
65 if( pIter->iStart<0 ){
66 pIter->iStart = io;
67 pIter->iEnd = iEnd;
68 }else if( io<=pIter->iEnd ){
69 if( iEnd>pIter->iEnd ) pIter->iEnd = iEnd;
70 }else{
71 break;
72 }
73 }
74 pIter->iInst++;
75 }
76 }
77
78 return rc;
79 }
80
81 /*
82 ** Initialize the iterator object indicated by the final parameter to
83 ** iterate through coalesced phrase instances in column iCol.
84 */
fts5CInstIterInit(const Fts5ExtensionApi * pApi,Fts5Context * pFts,int iCol,CInstIter * pIter)85 static int fts5CInstIterInit(
86 const Fts5ExtensionApi *pApi,
87 Fts5Context *pFts,
88 int iCol,
89 CInstIter *pIter
90 ){
91 int rc;
92
93 memset(pIter, 0, sizeof(CInstIter));
94 pIter->pApi = pApi;
95 pIter->pFts = pFts;
96 pIter->iCol = iCol;
97 rc = pApi->xInstCount(pFts, &pIter->nInst);
98
99 if( rc==SQLITE_OK ){
100 rc = fts5CInstIterNext(pIter);
101 }
102
103 return rc;
104 }
105
106
107
108 /*************************************************************************
109 ** Start of highlight() implementation.
110 */
111 typedef struct HighlightContext HighlightContext;
112 struct HighlightContext {
113 CInstIter iter; /* Coalesced Instance Iterator */
114 int iPos; /* Current token offset in zIn[] */
115 int iRangeStart; /* First token to include */
116 int iRangeEnd; /* If non-zero, last token to include */
117 const char *zOpen; /* Opening highlight */
118 const char *zClose; /* Closing highlight */
119 const char *zIn; /* Input text */
120 int nIn; /* Size of input text in bytes */
121 int iOff; /* Current offset within zIn[] */
122 char *zOut; /* Output value */
123 };
124
125 /*
126 ** Append text to the HighlightContext output string - p->zOut. Argument
127 ** z points to a buffer containing n bytes of text to append. If n is
128 ** negative, everything up until the first '\0' is appended to the output.
129 **
130 ** If *pRc is set to any value other than SQLITE_OK when this function is
131 ** called, it is a no-op. If an error (i.e. an OOM condition) is encountered,
132 ** *pRc is set to an error code before returning.
133 */
fts5HighlightAppend(int * pRc,HighlightContext * p,const char * z,int n)134 static void fts5HighlightAppend(
135 int *pRc,
136 HighlightContext *p,
137 const char *z, int n
138 ){
139 if( *pRc==SQLITE_OK && z ){
140 if( n<0 ) n = (int)strlen(z);
141 p->zOut = sqlite3_mprintf("%z%.*s", p->zOut, n, z);
142 if( p->zOut==0 ) *pRc = SQLITE_NOMEM;
143 }
144 }
145
146 /*
147 ** Tokenizer callback used by implementation of highlight() function.
148 */
fts5HighlightCb(void * pContext,int tflags,const char * pToken,int nToken,int iStartOff,int iEndOff)149 static int fts5HighlightCb(
150 void *pContext, /* Pointer to HighlightContext object */
151 int tflags, /* Mask of FTS5_TOKEN_* flags */
152 const char *pToken, /* Buffer containing token */
153 int nToken, /* Size of token in bytes */
154 int iStartOff, /* Start offset of token */
155 int iEndOff /* End offset of token */
156 ){
157 HighlightContext *p = (HighlightContext*)pContext;
158 int rc = SQLITE_OK;
159 int iPos;
160
161 UNUSED_PARAM2(pToken, nToken);
162
163 if( tflags & FTS5_TOKEN_COLOCATED ) return SQLITE_OK;
164 iPos = p->iPos++;
165
166 if( p->iRangeEnd>0 ){
167 if( iPos<p->iRangeStart || iPos>p->iRangeEnd ) return SQLITE_OK;
168 if( p->iRangeStart && iPos==p->iRangeStart ) p->iOff = iStartOff;
169 }
170
171 if( iPos==p->iter.iStart ){
172 fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iStartOff - p->iOff);
173 fts5HighlightAppend(&rc, p, p->zOpen, -1);
174 p->iOff = iStartOff;
175 }
176
177 if( iPos==p->iter.iEnd ){
178 if( p->iRangeEnd && p->iter.iStart<p->iRangeStart ){
179 fts5HighlightAppend(&rc, p, p->zOpen, -1);
180 }
181 fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff);
182 fts5HighlightAppend(&rc, p, p->zClose, -1);
183 p->iOff = iEndOff;
184 if( rc==SQLITE_OK ){
185 rc = fts5CInstIterNext(&p->iter);
186 }
187 }
188
189 if( p->iRangeEnd>0 && iPos==p->iRangeEnd ){
190 fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff);
191 p->iOff = iEndOff;
192 if( iPos>=p->iter.iStart && iPos<p->iter.iEnd ){
193 fts5HighlightAppend(&rc, p, p->zClose, -1);
194 }
195 }
196
197 return rc;
198 }
199
200 /*
201 ** Implementation of highlight() function.
202 */
fts5HighlightFunction(const Fts5ExtensionApi * pApi,Fts5Context * pFts,sqlite3_context * pCtx,int nVal,sqlite3_value ** apVal)203 static void fts5HighlightFunction(
204 const Fts5ExtensionApi *pApi, /* API offered by current FTS version */
205 Fts5Context *pFts, /* First arg to pass to pApi functions */
206 sqlite3_context *pCtx, /* Context for returning result/error */
207 int nVal, /* Number of values in apVal[] array */
208 sqlite3_value **apVal /* Array of trailing arguments */
209 ){
210 HighlightContext ctx;
211 int rc;
212 int iCol;
213
214 if( nVal!=3 ){
215 const char *zErr = "wrong number of arguments to function highlight()";
216 sqlite3_result_error(pCtx, zErr, -1);
217 return;
218 }
219
220 iCol = sqlite3_value_int(apVal[0]);
221 memset(&ctx, 0, sizeof(HighlightContext));
222 ctx.zOpen = (const char*)sqlite3_value_text(apVal[1]);
223 ctx.zClose = (const char*)sqlite3_value_text(apVal[2]);
224 rc = pApi->xColumnText(pFts, iCol, &ctx.zIn, &ctx.nIn);
225
226 if( ctx.zIn ){
227 if( rc==SQLITE_OK ){
228 rc = fts5CInstIterInit(pApi, pFts, iCol, &ctx.iter);
229 }
230
231 if( rc==SQLITE_OK ){
232 rc = pApi->xTokenize(pFts, ctx.zIn, ctx.nIn, (void*)&ctx,fts5HighlightCb);
233 }
234 fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff);
235
236 if( rc==SQLITE_OK ){
237 sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT);
238 }
239 sqlite3_free(ctx.zOut);
240 }
241 if( rc!=SQLITE_OK ){
242 sqlite3_result_error_code(pCtx, rc);
243 }
244 }
245 /*
246 ** End of highlight() implementation.
247 **************************************************************************/
248
249 /*
250 ** Context object passed to the fts5SentenceFinderCb() function.
251 */
252 typedef struct Fts5SFinder Fts5SFinder;
253 struct Fts5SFinder {
254 int iPos; /* Current token position */
255 int nFirstAlloc; /* Allocated size of aFirst[] */
256 int nFirst; /* Number of entries in aFirst[] */
257 int *aFirst; /* Array of first token in each sentence */
258 const char *zDoc; /* Document being tokenized */
259 };
260
261 /*
262 ** Add an entry to the Fts5SFinder.aFirst[] array. Grow the array if
263 ** necessary. Return SQLITE_OK if successful, or SQLITE_NOMEM if an
264 ** error occurs.
265 */
fts5SentenceFinderAdd(Fts5SFinder * p,int iAdd)266 static int fts5SentenceFinderAdd(Fts5SFinder *p, int iAdd){
267 if( p->nFirstAlloc==p->nFirst ){
268 int nNew = p->nFirstAlloc ? p->nFirstAlloc*2 : 64;
269 int *aNew;
270
271 aNew = (int*)sqlite3_realloc64(p->aFirst, nNew*sizeof(int));
272 if( aNew==0 ) return SQLITE_NOMEM;
273 p->aFirst = aNew;
274 p->nFirstAlloc = nNew;
275 }
276 p->aFirst[p->nFirst++] = iAdd;
277 return SQLITE_OK;
278 }
279
280 /*
281 ** This function is an xTokenize() callback used by the auxiliary snippet()
282 ** function. Its job is to identify tokens that are the first in a sentence.
283 ** For each such token, an entry is added to the SFinder.aFirst[] array.
284 */
fts5SentenceFinderCb(void * pContext,int tflags,const char * pToken,int nToken,int iStartOff,int iEndOff)285 static int fts5SentenceFinderCb(
286 void *pContext, /* Pointer to HighlightContext object */
287 int tflags, /* Mask of FTS5_TOKEN_* flags */
288 const char *pToken, /* Buffer containing token */
289 int nToken, /* Size of token in bytes */
290 int iStartOff, /* Start offset of token */
291 int iEndOff /* End offset of token */
292 ){
293 int rc = SQLITE_OK;
294
295 UNUSED_PARAM2(pToken, nToken);
296 UNUSED_PARAM(iEndOff);
297
298 if( (tflags & FTS5_TOKEN_COLOCATED)==0 ){
299 Fts5SFinder *p = (Fts5SFinder*)pContext;
300 if( p->iPos>0 ){
301 int i;
302 char c = 0;
303 for(i=iStartOff-1; i>=0; i--){
304 c = p->zDoc[i];
305 if( c!=' ' && c!='\t' && c!='\n' && c!='\r' ) break;
306 }
307 if( i!=iStartOff-1 && (c=='.' || c==':') ){
308 rc = fts5SentenceFinderAdd(p, p->iPos);
309 }
310 }else{
311 rc = fts5SentenceFinderAdd(p, 0);
312 }
313 p->iPos++;
314 }
315 return rc;
316 }
317
fts5SnippetScore(const Fts5ExtensionApi * pApi,Fts5Context * pFts,int nDocsize,unsigned char * aSeen,int iCol,int iPos,int nToken,int * pnScore,int * piPos)318 static int fts5SnippetScore(
319 const Fts5ExtensionApi *pApi, /* API offered by current FTS version */
320 Fts5Context *pFts, /* First arg to pass to pApi functions */
321 int nDocsize, /* Size of column in tokens */
322 unsigned char *aSeen, /* Array with one element per query phrase */
323 int iCol, /* Column to score */
324 int iPos, /* Starting offset to score */
325 int nToken, /* Max tokens per snippet */
326 int *pnScore, /* OUT: Score */
327 int *piPos /* OUT: Adjusted offset */
328 ){
329 int rc;
330 int i;
331 int ip = 0;
332 int ic = 0;
333 int iOff = 0;
334 int iFirst = -1;
335 int nInst;
336 int nScore = 0;
337 int iLast = 0;
338 sqlite3_int64 iEnd = (sqlite3_int64)iPos + nToken;
339
340 rc = pApi->xInstCount(pFts, &nInst);
341 for(i=0; i<nInst && rc==SQLITE_OK; i++){
342 rc = pApi->xInst(pFts, i, &ip, &ic, &iOff);
343 if( rc==SQLITE_OK && ic==iCol && iOff>=iPos && iOff<iEnd ){
344 nScore += (aSeen[ip] ? 1 : 1000);
345 aSeen[ip] = 1;
346 if( iFirst<0 ) iFirst = iOff;
347 iLast = iOff + pApi->xPhraseSize(pFts, ip);
348 }
349 }
350
351 *pnScore = nScore;
352 if( piPos ){
353 sqlite3_int64 iAdj = iFirst - (nToken - (iLast-iFirst)) / 2;
354 if( (iAdj+nToken)>nDocsize ) iAdj = nDocsize - nToken;
355 if( iAdj<0 ) iAdj = 0;
356 *piPos = (int)iAdj;
357 }
358
359 return rc;
360 }
361
362 /*
363 ** Return the value in pVal interpreted as utf-8 text. Except, if pVal
364 ** contains a NULL value, return a pointer to a static string zero
365 ** bytes in length instead of a NULL pointer.
366 */
fts5ValueToText(sqlite3_value * pVal)367 static const char *fts5ValueToText(sqlite3_value *pVal){
368 const char *zRet = (const char*)sqlite3_value_text(pVal);
369 return zRet ? zRet : "";
370 }
371
372 /*
373 ** Implementation of snippet() function.
374 */
fts5SnippetFunction(const Fts5ExtensionApi * pApi,Fts5Context * pFts,sqlite3_context * pCtx,int nVal,sqlite3_value ** apVal)375 static void fts5SnippetFunction(
376 const Fts5ExtensionApi *pApi, /* API offered by current FTS version */
377 Fts5Context *pFts, /* First arg to pass to pApi functions */
378 sqlite3_context *pCtx, /* Context for returning result/error */
379 int nVal, /* Number of values in apVal[] array */
380 sqlite3_value **apVal /* Array of trailing arguments */
381 ){
382 HighlightContext ctx;
383 int rc = SQLITE_OK; /* Return code */
384 int iCol; /* 1st argument to snippet() */
385 const char *zEllips; /* 4th argument to snippet() */
386 int nToken; /* 5th argument to snippet() */
387 int nInst = 0; /* Number of instance matches this row */
388 int i; /* Used to iterate through instances */
389 int nPhrase; /* Number of phrases in query */
390 unsigned char *aSeen; /* Array of "seen instance" flags */
391 int iBestCol; /* Column containing best snippet */
392 int iBestStart = 0; /* First token of best snippet */
393 int nBestScore = 0; /* Score of best snippet */
394 int nColSize = 0; /* Total size of iBestCol in tokens */
395 Fts5SFinder sFinder; /* Used to find the beginnings of sentences */
396 int nCol;
397
398 if( nVal!=5 ){
399 const char *zErr = "wrong number of arguments to function snippet()";
400 sqlite3_result_error(pCtx, zErr, -1);
401 return;
402 }
403
404 nCol = pApi->xColumnCount(pFts);
405 memset(&ctx, 0, sizeof(HighlightContext));
406 iCol = sqlite3_value_int(apVal[0]);
407 ctx.zOpen = fts5ValueToText(apVal[1]);
408 ctx.zClose = fts5ValueToText(apVal[2]);
409 zEllips = fts5ValueToText(apVal[3]);
410 nToken = sqlite3_value_int(apVal[4]);
411
412 iBestCol = (iCol>=0 ? iCol : 0);
413 nPhrase = pApi->xPhraseCount(pFts);
414 aSeen = sqlite3_malloc(nPhrase);
415 if( aSeen==0 ){
416 rc = SQLITE_NOMEM;
417 }
418 if( rc==SQLITE_OK ){
419 rc = pApi->xInstCount(pFts, &nInst);
420 }
421
422 memset(&sFinder, 0, sizeof(Fts5SFinder));
423 for(i=0; i<nCol; i++){
424 if( iCol<0 || iCol==i ){
425 int nDoc;
426 int nDocsize;
427 int ii;
428 sFinder.iPos = 0;
429 sFinder.nFirst = 0;
430 rc = pApi->xColumnText(pFts, i, &sFinder.zDoc, &nDoc);
431 if( rc!=SQLITE_OK ) break;
432 rc = pApi->xTokenize(pFts,
433 sFinder.zDoc, nDoc, (void*)&sFinder,fts5SentenceFinderCb
434 );
435 if( rc!=SQLITE_OK ) break;
436 rc = pApi->xColumnSize(pFts, i, &nDocsize);
437 if( rc!=SQLITE_OK ) break;
438
439 for(ii=0; rc==SQLITE_OK && ii<nInst; ii++){
440 int ip, ic, io;
441 int iAdj;
442 int nScore;
443 int jj;
444
445 rc = pApi->xInst(pFts, ii, &ip, &ic, &io);
446 if( ic!=i ) continue;
447 if( io>nDocsize ) rc = FTS5_CORRUPT;
448 if( rc!=SQLITE_OK ) continue;
449 memset(aSeen, 0, nPhrase);
450 rc = fts5SnippetScore(pApi, pFts, nDocsize, aSeen, i,
451 io, nToken, &nScore, &iAdj
452 );
453 if( rc==SQLITE_OK && nScore>nBestScore ){
454 nBestScore = nScore;
455 iBestCol = i;
456 iBestStart = iAdj;
457 nColSize = nDocsize;
458 }
459
460 if( rc==SQLITE_OK && sFinder.nFirst && nDocsize>nToken ){
461 for(jj=0; jj<(sFinder.nFirst-1); jj++){
462 if( sFinder.aFirst[jj+1]>io ) break;
463 }
464
465 if( sFinder.aFirst[jj]<io ){
466 memset(aSeen, 0, nPhrase);
467 rc = fts5SnippetScore(pApi, pFts, nDocsize, aSeen, i,
468 sFinder.aFirst[jj], nToken, &nScore, 0
469 );
470
471 nScore += (sFinder.aFirst[jj]==0 ? 120 : 100);
472 if( rc==SQLITE_OK && nScore>nBestScore ){
473 nBestScore = nScore;
474 iBestCol = i;
475 iBestStart = sFinder.aFirst[jj];
476 nColSize = nDocsize;
477 }
478 }
479 }
480 }
481 }
482 }
483
484 if( rc==SQLITE_OK ){
485 rc = pApi->xColumnText(pFts, iBestCol, &ctx.zIn, &ctx.nIn);
486 }
487 if( rc==SQLITE_OK && nColSize==0 ){
488 rc = pApi->xColumnSize(pFts, iBestCol, &nColSize);
489 }
490 if( ctx.zIn ){
491 if( rc==SQLITE_OK ){
492 rc = fts5CInstIterInit(pApi, pFts, iBestCol, &ctx.iter);
493 }
494
495 ctx.iRangeStart = iBestStart;
496 ctx.iRangeEnd = iBestStart + nToken - 1;
497
498 if( iBestStart>0 ){
499 fts5HighlightAppend(&rc, &ctx, zEllips, -1);
500 }
501
502 /* Advance iterator ctx.iter so that it points to the first coalesced
503 ** phrase instance at or following position iBestStart. */
504 while( ctx.iter.iStart>=0 && ctx.iter.iStart<iBestStart && rc==SQLITE_OK ){
505 rc = fts5CInstIterNext(&ctx.iter);
506 }
507
508 if( rc==SQLITE_OK ){
509 rc = pApi->xTokenize(pFts, ctx.zIn, ctx.nIn, (void*)&ctx,fts5HighlightCb);
510 }
511 if( ctx.iRangeEnd>=(nColSize-1) ){
512 fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff);
513 }else{
514 fts5HighlightAppend(&rc, &ctx, zEllips, -1);
515 }
516 }
517 if( rc==SQLITE_OK ){
518 sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT);
519 }else{
520 sqlite3_result_error_code(pCtx, rc);
521 }
522 sqlite3_free(ctx.zOut);
523 sqlite3_free(aSeen);
524 sqlite3_free(sFinder.aFirst);
525 }
526
527 /************************************************************************/
528
529 /*
530 ** The first time the bm25() function is called for a query, an instance
531 ** of the following structure is allocated and populated.
532 */
533 typedef struct Fts5Bm25Data Fts5Bm25Data;
534 struct Fts5Bm25Data {
535 int nPhrase; /* Number of phrases in query */
536 double avgdl; /* Average number of tokens in each row */
537 double *aIDF; /* IDF for each phrase */
538 double *aFreq; /* Array used to calculate phrase freq. */
539 };
540
541 /*
542 ** Callback used by fts5Bm25GetData() to count the number of rows in the
543 ** table matched by each individual phrase within the query.
544 */
fts5CountCb(const Fts5ExtensionApi * pApi,Fts5Context * pFts,void * pUserData)545 static int fts5CountCb(
546 const Fts5ExtensionApi *pApi,
547 Fts5Context *pFts,
548 void *pUserData /* Pointer to sqlite3_int64 variable */
549 ){
550 sqlite3_int64 *pn = (sqlite3_int64*)pUserData;
551 UNUSED_PARAM2(pApi, pFts);
552 (*pn)++;
553 return SQLITE_OK;
554 }
555
556 /*
557 ** Set *ppData to point to the Fts5Bm25Data object for the current query.
558 ** If the object has not already been allocated, allocate and populate it
559 ** now.
560 */
fts5Bm25GetData(const Fts5ExtensionApi * pApi,Fts5Context * pFts,Fts5Bm25Data ** ppData)561 static int fts5Bm25GetData(
562 const Fts5ExtensionApi *pApi,
563 Fts5Context *pFts,
564 Fts5Bm25Data **ppData /* OUT: bm25-data object for this query */
565 ){
566 int rc = SQLITE_OK; /* Return code */
567 Fts5Bm25Data *p; /* Object to return */
568
569 p = pApi->xGetAuxdata(pFts, 0);
570 if( p==0 ){
571 int nPhrase; /* Number of phrases in query */
572 sqlite3_int64 nRow = 0; /* Number of rows in table */
573 sqlite3_int64 nToken = 0; /* Number of tokens in table */
574 sqlite3_int64 nByte; /* Bytes of space to allocate */
575 int i;
576
577 /* Allocate the Fts5Bm25Data object */
578 nPhrase = pApi->xPhraseCount(pFts);
579 nByte = sizeof(Fts5Bm25Data) + nPhrase*2*sizeof(double);
580 p = (Fts5Bm25Data*)sqlite3_malloc64(nByte);
581 if( p==0 ){
582 rc = SQLITE_NOMEM;
583 }else{
584 memset(p, 0, (size_t)nByte);
585 p->nPhrase = nPhrase;
586 p->aIDF = (double*)&p[1];
587 p->aFreq = &p->aIDF[nPhrase];
588 }
589
590 /* Calculate the average document length for this FTS5 table */
591 if( rc==SQLITE_OK ) rc = pApi->xRowCount(pFts, &nRow);
592 assert( rc!=SQLITE_OK || nRow>0 );
593 if( rc==SQLITE_OK ) rc = pApi->xColumnTotalSize(pFts, -1, &nToken);
594 if( rc==SQLITE_OK ) p->avgdl = (double)nToken / (double)nRow;
595
596 /* Calculate an IDF for each phrase in the query */
597 for(i=0; rc==SQLITE_OK && i<nPhrase; i++){
598 sqlite3_int64 nHit = 0;
599 rc = pApi->xQueryPhrase(pFts, i, (void*)&nHit, fts5CountCb);
600 if( rc==SQLITE_OK ){
601 /* Calculate the IDF (Inverse Document Frequency) for phrase i.
602 ** This is done using the standard BM25 formula as found on wikipedia:
603 **
604 ** IDF = log( (N - nHit + 0.5) / (nHit + 0.5) )
605 **
606 ** where "N" is the total number of documents in the set and nHit
607 ** is the number that contain at least one instance of the phrase
608 ** under consideration.
609 **
610 ** The problem with this is that if (N < 2*nHit), the IDF is
611 ** negative. Which is undesirable. So the mimimum allowable IDF is
612 ** (1e-6) - roughly the same as a term that appears in just over
613 ** half of set of 5,000,000 documents. */
614 double idf = log( (nRow - nHit + 0.5) / (nHit + 0.5) );
615 if( idf<=0.0 ) idf = 1e-6;
616 p->aIDF[i] = idf;
617 }
618 }
619
620 if( rc!=SQLITE_OK ){
621 sqlite3_free(p);
622 }else{
623 rc = pApi->xSetAuxdata(pFts, p, sqlite3_free);
624 }
625 if( rc!=SQLITE_OK ) p = 0;
626 }
627 *ppData = p;
628 return rc;
629 }
630
631 /*
632 ** Implementation of bm25() function.
633 */
fts5Bm25Function(const Fts5ExtensionApi * pApi,Fts5Context * pFts,sqlite3_context * pCtx,int nVal,sqlite3_value ** apVal)634 static void fts5Bm25Function(
635 const Fts5ExtensionApi *pApi, /* API offered by current FTS version */
636 Fts5Context *pFts, /* First arg to pass to pApi functions */
637 sqlite3_context *pCtx, /* Context for returning result/error */
638 int nVal, /* Number of values in apVal[] array */
639 sqlite3_value **apVal /* Array of trailing arguments */
640 ){
641 const double k1 = 1.2; /* Constant "k1" from BM25 formula */
642 const double b = 0.75; /* Constant "b" from BM25 formula */
643 int rc = SQLITE_OK; /* Error code */
644 double score = 0.0; /* SQL function return value */
645 Fts5Bm25Data *pData; /* Values allocated/calculated once only */
646 int i; /* Iterator variable */
647 int nInst = 0; /* Value returned by xInstCount() */
648 double D = 0.0; /* Total number of tokens in row */
649 double *aFreq = 0; /* Array of phrase freq. for current row */
650
651 /* Calculate the phrase frequency (symbol "f(qi,D)" in the documentation)
652 ** for each phrase in the query for the current row. */
653 rc = fts5Bm25GetData(pApi, pFts, &pData);
654 if( rc==SQLITE_OK ){
655 aFreq = pData->aFreq;
656 memset(aFreq, 0, sizeof(double) * pData->nPhrase);
657 rc = pApi->xInstCount(pFts, &nInst);
658 }
659 for(i=0; rc==SQLITE_OK && i<nInst; i++){
660 int ip; int ic; int io;
661 rc = pApi->xInst(pFts, i, &ip, &ic, &io);
662 if( rc==SQLITE_OK ){
663 double w = (nVal > ic) ? sqlite3_value_double(apVal[ic]) : 1.0;
664 aFreq[ip] += w;
665 }
666 }
667
668 /* Figure out the total size of the current row in tokens. */
669 if( rc==SQLITE_OK ){
670 int nTok;
671 rc = pApi->xColumnSize(pFts, -1, &nTok);
672 D = (double)nTok;
673 }
674
675 /* Determine the BM25 score for the current row. */
676 for(i=0; rc==SQLITE_OK && i<pData->nPhrase; i++){
677 score += pData->aIDF[i] * (
678 ( aFreq[i] * (k1 + 1.0) ) /
679 ( aFreq[i] + k1 * (1 - b + b * D / pData->avgdl) )
680 );
681 }
682
683 /* If no error has occurred, return the calculated score. Otherwise,
684 ** throw an SQL exception. */
685 if( rc==SQLITE_OK ){
686 sqlite3_result_double(pCtx, -1.0 * score);
687 }else{
688 sqlite3_result_error_code(pCtx, rc);
689 }
690 }
691
sqlite3Fts5AuxInit(fts5_api * pApi)692 int sqlite3Fts5AuxInit(fts5_api *pApi){
693 struct Builtin {
694 const char *zFunc; /* Function name (nul-terminated) */
695 void *pUserData; /* User-data pointer */
696 fts5_extension_function xFunc;/* Callback function */
697 void (*xDestroy)(void*); /* Destructor function */
698 } aBuiltin [] = {
699 { "snippet", 0, fts5SnippetFunction, 0 },
700 { "highlight", 0, fts5HighlightFunction, 0 },
701 { "bm25", 0, fts5Bm25Function, 0 },
702 };
703 int rc = SQLITE_OK; /* Return code */
704 int i; /* To iterate through builtin functions */
705
706 for(i=0; rc==SQLITE_OK && i<ArraySize(aBuiltin); i++){
707 rc = pApi->xCreateFunction(pApi,
708 aBuiltin[i].zFunc,
709 aBuiltin[i].pUserData,
710 aBuiltin[i].xFunc,
711 aBuiltin[i].xDestroy
712 );
713 }
714
715 return rc;
716 }
717