1 #include "fragmenter.h"
2 #include "toksep.h"
3 #include "tokenize.h"
4 #include "util/minmax.h"
5 #include <ctype.h>
6 #include <float.h>
7 #include <sys/uio.h>
8 #include <assert.h>
9 
10 // Estimated characters per token
11 #define EST_CHARS_PER_TOK 6
12 
FragmentList_LastFragment(FragmentList * fragList)13 static Fragment *FragmentList_LastFragment(FragmentList *fragList) {
14   if (!fragList->frags.len) {
15     return NULL;
16   }
17   return (Fragment *)(fragList->frags.data + (fragList->frags.len - sizeof(Fragment)));
18 }
19 
FragmentList_AddFragment(FragmentList * fragList)20 static Fragment *FragmentList_AddFragment(FragmentList *fragList) {
21   Fragment *frag = Array_Add(&fragList->frags, sizeof(Fragment));
22   memset(frag, 0, sizeof(*frag));
23   frag->fragPos = fragList->numFrags++;
24   Array_Init(&frag->termLocs);
25   return frag;
26 }
27 
Fragment_GetNumTerms(const Fragment * frag)28 static size_t Fragment_GetNumTerms(const Fragment *frag) {
29   return ARRAY_GETSIZE_AS(&frag->termLocs, TermLoc);
30 }
31 
Fragment_HasTerm(const Fragment * frag,uint32_t termId)32 static int Fragment_HasTerm(const Fragment *frag, uint32_t termId) {
33   TermLoc *locs = ARRAY_GETARRAY_AS(&frag->termLocs, TermLoc *);
34 
35   int firstOcurrence = 1;
36   // If this is the first time the term appears in the fragment, increment the
37   // fragment's score by the term's score. Otherwise, increment it by half
38   // the fragment's score. This allows for better 'blended' results.
39   for (size_t ii = 0; ii < Fragment_GetNumTerms(frag); ii++) {
40     if (locs[ii].termId == termId) {
41       return 1;
42     }
43   }
44   return 0;
45 }
46 
FragmentList_AddMatchingTerm(FragmentList * fragList,uint32_t termId,uint32_t tokPos,const char * tokBuf,size_t tokLen,float baseScore)47 static Fragment *FragmentList_AddMatchingTerm(FragmentList *fragList, uint32_t termId,
48                                               uint32_t tokPos, const char *tokBuf, size_t tokLen,
49                                               float baseScore) {
50 
51   Fragment *curFrag = FragmentList_LastFragment(fragList);
52   if (curFrag && tokPos - curFrag->lastMatchPos > fragList->maxDistance) {
53     // There is too much distance between tokens for it to still be relevant.
54     curFrag = NULL;
55   }
56 
57   if (curFrag == NULL) {
58     curFrag = FragmentList_AddFragment(fragList);
59     fragList->numToksSinceLastMatch = 0;
60     curFrag->buf = tokBuf;
61   }
62 
63   if (!Fragment_HasTerm(curFrag, termId)) {
64     curFrag->score += baseScore;
65   }
66 
67   curFrag->len = (tokBuf - curFrag->buf) + tokLen;
68   curFrag->lastMatchPos = tokPos;
69   curFrag->numMatches++;
70   curFrag->totalTokens += fragList->numToksSinceLastMatch + 1;
71   fragList->numToksSinceLastMatch = 0;
72 
73   TermLoc *newLoc = Array_Add(&curFrag->termLocs, sizeof(TermLoc));
74   newLoc->termId = termId;
75   newLoc->offset = tokBuf - curFrag->buf;
76   newLoc->len = tokLen;
77 
78   return curFrag;
79 }
80 
extractToken(FragmentList * fragList,const Token * tokInfo,const FragmentSearchTerm * terms,size_t numTerms)81 static void extractToken(FragmentList *fragList, const Token *tokInfo,
82                          const FragmentSearchTerm *terms, size_t numTerms) {
83   const FragmentSearchTerm *term = NULL;
84   uint32_t termId;
85 
86   // See if this token matches any of our terms.
87   for (termId = 0; termId < numTerms; ++termId) {
88     const FragmentSearchTerm *cur = terms + termId;
89     if (tokInfo->tokLen == cur->len && strncmp(tokInfo->tok, cur->tok, cur->len) == 0) {
90     } else if (tokInfo->stem && tokInfo->stemLen == cur->len &&
91                strncmp(tokInfo->stem, cur->tok, cur->len) == 0) {
92     } else {
93       continue;
94     }
95     term = cur;
96     break;
97   }
98 
99   // Don't care about this token
100   if (!term) {
101     fragList->numToksSinceLastMatch++;
102     return;
103   }
104 
105   FragmentList_AddMatchingTerm(fragList, termId, tokInfo->pos, tokInfo->raw, tokInfo->rawLen,
106                                term->score);
107 }
108 
FragmentList_FragmentizeBuffer(FragmentList * fragList,const char * doc,Stemmer * stemmer,StopWordList * stopwords,const FragmentSearchTerm * terms,size_t numTerms)109 void FragmentList_FragmentizeBuffer(FragmentList *fragList, const char *doc, Stemmer *stemmer,
110                                     StopWordList *stopwords, const FragmentSearchTerm *terms,
111                                     size_t numTerms) {
112   fragList->doc = doc;
113   fragList->docLen = strlen(doc);
114   RSTokenizer *tokenizer = NewSimpleTokenizer(stemmer, stopwords, TOKENIZE_NOMODIFY);
115   tokenizer->Start(tokenizer, (char *)fragList->doc, fragList->docLen, 0);
116   Token tokInfo;
117   while (tokenizer->Next(tokenizer, &tokInfo)) {
118     extractToken(fragList, &tokInfo, terms, numTerms);
119   }
120   tokenizer->Free(tokenizer);
121 }
122 
addToIov(const char * s,size_t n,Array * b)123 static void addToIov(const char *s, size_t n, Array *b) {
124   if (n == 0 || s == NULL) {
125     return;
126   }
127   struct iovec *iov = Array_Add(b, sizeof(*iov));
128   assert(iov);
129   iov->iov_base = (void *)s;
130   iov->iov_len = n;
131 }
132 
133 /**
134  * Writes a complete fragment as a series of IOVs.
135  * - fragment is the fragment to write
136  * - tags is the tags to use
137  * - contextLen is any amount of context used to surround the fragment with
138  * - iovs is the target buffer in which the iovs should be written
139  *
140  * - preamble is any prior text which may need to be written alongside the fragment.
141  *    In output, it contains the first byte after the fragment+context. This may be
142  *    used as the 'preamble' value for a subsequent call to this function, if the next
143  *    fragment being written is after the current one.
144  */
Fragment_WriteIovs(const Fragment * curFrag,const char * openTag,size_t openLen,const char * closeTag,size_t closeLen,Array * iovs,const char ** preamble)145 static void Fragment_WriteIovs(const Fragment *curFrag, const char *openTag, size_t openLen,
146                                const char *closeTag, size_t closeLen, Array *iovs,
147                                const char **preamble) {
148 
149   const TermLoc *locs = ARRAY_GETARRAY_AS(&curFrag->termLocs, const TermLoc *);
150   size_t nlocs = ARRAY_GETSIZE_AS(&curFrag->termLocs, TermLoc);
151   const char *preamble_s = NULL;
152 
153   if (!preamble) {
154     preamble = &preamble_s;
155   }
156   if (!*preamble) {
157     *preamble = curFrag->buf;
158   }
159 
160   for (size_t ii = 0; ii < nlocs; ++ii) {
161     const TermLoc *curLoc = locs + ii;
162 
163     size_t preambleLen = (curFrag->buf + curLoc->offset) - *preamble;
164 
165     // Add any prior text
166     if (preambleLen) {
167       addToIov(*preamble, preambleLen, iovs);
168     }
169 
170     if (openLen) {
171       addToIov(openTag, openLen, iovs);
172     }
173 
174     // Add the token itself
175     addToIov(curFrag->buf + curLoc->offset, curLoc->len, iovs);
176 
177     // Add close tag
178     if (closeLen) {
179       addToIov(closeTag, closeLen, iovs);
180     }
181 
182     *preamble = curFrag->buf + curLoc->offset + curLoc->len;
183   }
184 }
185 
FragmentList_HighlightWholeDocV(const FragmentList * fragList,const HighlightTags * tags,Array * iovs)186 void FragmentList_HighlightWholeDocV(const FragmentList *fragList, const HighlightTags *tags,
187                                      Array *iovs) {
188   const Fragment *frags = FragmentList_GetFragments(fragList);
189 
190   if (!fragList->numFrags) {
191     // Whole doc, but no matches found
192     addToIov(fragList->doc, fragList->docLen, iovs);
193     return;
194   }
195 
196   const char *preamble = fragList->doc;
197   size_t openLen = strlen(tags->openTag);
198   size_t closeLen = strlen(tags->closeTag);
199 
200   for (size_t ii = 0; ii < fragList->numFrags; ++ii) {
201     const Fragment *curFrag = frags + ii;
202     Fragment_WriteIovs(curFrag, tags->openTag, openLen, tags->closeTag, closeLen, iovs, &preamble);
203   }
204 
205   // Write the last preamble
206   size_t preambleLen = (fragList->doc + fragList->docLen) - preamble;
207   // size_t preambleLen = strlen(preamble);
208   if (preambleLen) {
209     addToIov(preamble, preambleLen, iovs);
210   }
211 }
212 
FragmentList_HighlightWholeDocS(const FragmentList * fragList,const HighlightTags * tags)213 char *FragmentList_HighlightWholeDocS(const FragmentList *fragList, const HighlightTags *tags) {
214   Array iovsArr;
215   Array_Init(&iovsArr);
216   FragmentList_HighlightWholeDocV(fragList, tags, &iovsArr);
217 
218   // Calculate the length
219   struct iovec *iovs = ARRAY_GETARRAY_AS(&iovsArr, struct iovec *);
220   size_t niovs = ARRAY_GETSIZE_AS(&iovsArr, struct iovec);
221   size_t docLen = 0;
222   for (size_t ii = 0; ii < niovs; ++ii) {
223     docLen += iovs[ii].iov_len;
224   }
225 
226   char *docBuf = rm_malloc(docLen + 1);
227   docBuf[docLen] = '\0';
228 
229   assert(docBuf);
230   size_t offset = 0;
231   for (size_t ii = 0; ii < niovs; ++ii) {
232     memcpy(docBuf + offset, iovs[ii].iov_base, iovs[ii].iov_len);
233     offset += iovs[ii].iov_len;
234   }
235 
236   Array_Free(&iovsArr);
237   return docBuf;
238 }
239 
fragSortCmp(const void * pa,const void * pb)240 static int fragSortCmp(const void *pa, const void *pb) {
241   const Fragment *a = *(const Fragment **)pa, *b = *(const Fragment **)pb;
242   if (a->score == b->score) {
243     return a - b;
244   }
245   return a->score > b->score ? -1 : 1;
246 }
247 
FragmentList_Sort(FragmentList * fragList)248 static void FragmentList_Sort(FragmentList *fragList) {
249   if (fragList->sortedFrags) {
250     return;
251   }
252 
253   const Fragment *origFrags = FragmentList_GetFragments(fragList);
254   fragList->sortedFrags = rm_malloc(sizeof(*fragList->sortedFrags) * fragList->numFrags);
255 
256   for (size_t ii = 0; ii < fragList->numFrags; ++ii) {
257     fragList->sortedFrags[ii] = origFrags + ii;
258   }
259 
260   qsort(fragList->sortedFrags, fragList->numFrags, sizeof(fragList->sortedFrags[0]), fragSortCmp);
261   for (size_t ii = 0; ii < fragList->numFrags; ++ii) {
262     ((Fragment *)fragList->sortedFrags[ii])->scoreRank = ii;
263   }
264 }
265 
sortByOrder(const void * pa,const void * pb)266 static int sortByOrder(const void *pa, const void *pb) {
267   const Fragment *a = *(const Fragment **)pa, *b = *(const Fragment **)pb;
268   return (int)a->fragPos - (int)b->fragPos;
269 }
270 
271 /**
272  * Add context before and after the fragment.
273  * - frag is the fragment to contextualize
274  * - limitBefore, limitAfter are boundaries, such that the following will be
275  *   true:
276  *   - limitBefore <= before <= frag->buf
277  *   - limitAfter > after >= frag->buf + frag->len
278  *   If limitBefore is not specified, it defaults to the beginning of the fragList's doc
279  *   If limitAfter is not specified, then the limit ends after the first NUL terminator.
280  */
FragmentList_FindContext(const FragmentList * fragList,const Fragment * frag,const char * limitBefore,const char * limitAfter,size_t contextSize,struct iovec * before,struct iovec * after)281 static void FragmentList_FindContext(const FragmentList *fragList, const Fragment *frag,
282                                      const char *limitBefore, const char *limitAfter,
283                                      size_t contextSize, struct iovec *before,
284                                      struct iovec *after) {
285 
286   if (limitBefore == NULL) {
287     limitBefore = fragList->doc;
288   }
289   if (limitAfter == NULL) {
290     limitAfter = fragList->doc + fragList->docLen - 1;
291   }
292 
293   // Subtract the number of context (i.e. non-match) words
294   // already inside the
295   // snippet.
296   if (contextSize <= frag->totalTokens - frag->numMatches) {
297     before->iov_base = after->iov_base = NULL;
298     before->iov_len = after->iov_len = 0;
299     return;
300   }
301 
302   contextSize -= (frag->totalTokens - frag->numMatches);
303 
304   // i.e. how much context before and after
305   contextSize /= 2;
306 
307   // At some point we need to make a cutoff in terms of *bytes*
308   contextSize *= fragList->estAvgWordSize;
309 
310   // TODO: If this context flows directly into a neighboring context, signal
311   // some way to *merge* them.
312 
313   limitBefore = Max(frag->buf - contextSize, limitBefore);
314   limitAfter = Min(frag->buf + frag->len + contextSize, limitAfter);
315 
316   before->iov_base = (void *)frag->buf;
317   before->iov_len = 0;
318 
319   // Find the context immediately prior to our fragment, this means to advance
320   // the cursor as much as possible until a separator is reached, and then
321   // seek past that separator (if there are separators)
322   for (; limitBefore < frag->buf && !istoksep(*limitBefore); limitBefore++) {
323     // Found a separator.
324   }
325   for (; limitBefore < frag->buf && istoksep(*limitBefore); limitBefore++) {
326     // Strip away future separators
327   }
328   before->iov_base = (void *)limitBefore;
329   before->iov_len = frag->buf - limitBefore;
330 
331   // Do the same for the 'after' context.
332   for (; limitAfter > frag->buf + frag->len && !istoksep(*limitAfter); limitAfter--) {
333     // Found a separator
334   }
335 
336   for (; limitAfter > frag->buf + frag->len && istoksep(*limitAfter); limitAfter--) {
337     // Seek to the end of the last non-separator word
338   }
339 
340   after->iov_base = (void *)frag->buf + frag->len;
341   after->iov_len = limitAfter - (frag->buf + frag->len) + 1;
342 }
343 
FragmentList_HighlightFragments(FragmentList * fragList,const HighlightTags * tags,size_t contextSize,Array * iovArrList,size_t niovs,int order)344 void FragmentList_HighlightFragments(FragmentList *fragList, const HighlightTags *tags,
345                                      size_t contextSize, Array *iovArrList, size_t niovs,
346                                      int order) {
347 
348   const Fragment *frags = FragmentList_GetFragments(fragList);
349   niovs = Min(niovs, fragList->numFrags);
350 
351   if (!fragList->scratchFrags) {
352     fragList->scratchFrags = rm_malloc(sizeof(*fragList->scratchFrags) * fragList->numFrags);
353   }
354   const Fragment **indexes = fragList->scratchFrags;
355 
356   if (order == HIGHLIGHT_ORDER_POS) {
357     for (size_t ii = 0; ii < niovs; ++ii) {
358       indexes[ii] = frags + ii;
359     }
360   } else if (order & HIGHLIGHT_ORDER_SCORE) {
361     FragmentList_Sort(fragList);
362     for (size_t ii = 0; ii < niovs; ++ii) {
363       indexes[ii] = fragList->sortedFrags[ii];
364     }
365     if (order & HIGHLIGHT_ORDER_POS) {
366       qsort(indexes, niovs, sizeof indexes[0], sortByOrder);
367     }
368   }
369 
370   size_t openLen = tags->openTag ? strlen(tags->openTag) : 0;
371   size_t closeLen = tags->closeTag ? strlen(tags->closeTag) : 0;
372 
373   for (size_t ii = 0; ii < niovs; ++ii) {
374     Array *curArr = iovArrList + ii;
375 
376     const char *beforeLimit = NULL, *afterLimit = NULL;
377     const Fragment *curFrag = indexes[ii];
378 
379     if (order & HIGHLIGHT_ORDER_POS) {
380       if (ii > 0) {
381         beforeLimit = indexes[ii - 1]->buf + indexes[ii - 1]->len;
382       }
383       if (ii + 1 < niovs) {
384         afterLimit = indexes[ii + 1]->buf;
385       }
386     }
387 
388     struct iovec before, after;
389     FragmentList_FindContext(fragList, curFrag, beforeLimit, afterLimit, contextSize, &before,
390                              &after);
391     addToIov(before.iov_base, before.iov_len, curArr);
392     Fragment_WriteIovs(curFrag, tags->openTag, openLen, tags->closeTag, closeLen, curArr, NULL);
393     addToIov(after.iov_base, after.iov_len, curArr);
394   }
395 }
396 
FragmentList_Free(FragmentList * fragList)397 void FragmentList_Free(FragmentList *fragList) {
398   Fragment *frags = (Fragment *)FragmentList_GetFragments(fragList);
399   for (size_t ii = 0; ii < fragList->numFrags; ii++) {
400     Array_Free(&frags[ii].termLocs);
401   }
402   Array_Free(&fragList->frags);
403   rm_free(fragList->sortedFrags);
404   rm_free(fragList->scratchFrags);
405 }
406 
407 /**
408  * Tokenization:
409  * If we have term offsets and document terms, we can skip the tokenization process.
410  *
411  * 1) Gather all matching terms for the documents, and get their offsets (in position)
412  * 2) Sort all terms, by position
413  * 3) Start reading the byte offset list, until we reach the first term of the match
414  *    list, then, consume the matches until the maximum distance has been reached,
415  *    noting the terms for each.
416  */
FragmentList_FragmentizeIter(FragmentList * fragList,const char * doc,size_t docLen,FragmentTermIterator * iter,int options)417 void FragmentList_FragmentizeIter(FragmentList *fragList, const char *doc, size_t docLen,
418                                   FragmentTermIterator *iter, int options) {
419   fragList->docLen = docLen;
420   fragList->doc = doc;
421   FragmentTerm *curTerm;
422   size_t lastTokPos = -1;
423   size_t lastByteEnd = 0;
424 
425   while (FragmentTermIterator_Next(iter, &curTerm)) {
426     if (curTerm == NULL) {
427       fragList->numToksSinceLastMatch++;
428       continue;
429     }
430 
431     if (curTerm->tokPos == lastTokPos) {
432       continue;
433     }
434 
435     if (curTerm->bytePos < lastByteEnd) {
436       // If our length estimations are off, don't use already-swallowed matches
437       continue;
438     }
439 
440     // Get the length of the current token. This is used to highlight the term
441     // (if requested), and just terminates at the first non-separator character
442     size_t len;
443     if (options & FRAGMENTIZE_TOKLEN_EXACT) {
444       len = curTerm->len;
445     } else {
446       len = 0;
447       for (size_t ii = curTerm->bytePos; ii < fragList->docLen && !istoksep(doc[ii]); ++ii, ++len) {
448       }
449     }
450 
451     FragmentList_AddMatchingTerm(fragList, curTerm->termId, curTerm->tokPos, doc + curTerm->bytePos,
452                                  len, curTerm->score);
453     lastTokPos = curTerm->tokPos;
454     lastByteEnd = curTerm->bytePos + len;
455   }
456 }
457 
FragmentTermIterator_InitOffsets(FragmentTermIterator * iter,RSByteOffsetIterator * byteOffsets,RSOffsetIterator * offIter)458 void FragmentTermIterator_InitOffsets(FragmentTermIterator *iter, RSByteOffsetIterator *byteOffsets,
459                                       RSOffsetIterator *offIter) {
460   iter->offsetIter = offIter;
461   iter->byteIter = byteOffsets;
462   iter->curByteOffset = RSByteOffsetIterator_Next(iter->byteIter);
463 
464   // Advance the offset iterator to the first offset we care about (i.e. that
465   // correlates with the first byte offset)
466   do {
467     iter->curTokPos = iter->offsetIter->Next(iter->offsetIter->ctx, &iter->curMatchRec);
468   } while (iter->byteIter->curPos > iter->curTokPos);
469 }
470 
FragmentTermIterator_Next(FragmentTermIterator * iter,FragmentTerm ** termInfo)471 int FragmentTermIterator_Next(FragmentTermIterator *iter, FragmentTerm **termInfo) {
472   if (iter->curMatchRec == NULL || iter->curByteOffset == RSBYTEOFFSET_EOF ||
473       iter->curTokPos == RS_OFFSETVECTOR_EOF) {
474     return 0;
475   }
476 
477   if (iter->byteIter->curPos < iter->curTokPos) {
478     iter->curByteOffset = RSByteOffsetIterator_Next(iter->byteIter);
479     // No matching term at this position.
480     // printf("IterPos=%lu. LastMatchPos=%u\n", iter->byteIter->curPos, iter->curTokPos);
481     *termInfo = NULL;
482     return 1;
483   }
484 
485   // printf("ByteOffset=%lu. LastMatchPos=%u\n", iter->curByteOffset, iter->curTokPos);
486 
487   RSQueryTerm *term = iter->curMatchRec;
488 
489   // printf("Term Pointer: %p\n", term);
490   iter->tmpTerm.score = term->idf;
491   iter->tmpTerm.termId = term->id;
492   iter->tmpTerm.len = term->len;
493   iter->tmpTerm.tokPos = iter->curTokPos;
494   iter->tmpTerm.bytePos = iter->curByteOffset;
495   *termInfo = &iter->tmpTerm;
496 
497   uint32_t nextPos = iter->offsetIter->Next(iter->offsetIter->ctx, &iter->curMatchRec);
498   if (nextPos != iter->curTokPos) {
499     iter->curByteOffset = RSByteOffsetIterator_Next(iter->byteIter);
500   }
501   iter->curTokPos = nextPos;
502   return 1;
503 }
504 
505 // LCOV_EXCL_START debug
FragmentList_Dump(const FragmentList * fragList)506 void FragmentList_Dump(const FragmentList *fragList) {
507   printf("NumFrags: %u\n", fragList->numFrags);
508   for (size_t ii = 0; ii < fragList->numFrags; ++ii) {
509     const Fragment *frag = ARRAY_GETITEM_AS(&fragList->frags, ii, Fragment *);
510     printf("Frag[%lu]: Buf=%p, (pos=%lu), Len=%u\n", ii, frag->buf, frag->buf - fragList->doc,
511            frag->len);
512     printf("  Score: %f, Rank=%u. Total Tokens=%u\n", frag->score, frag->scoreRank,
513            frag->totalTokens);
514     printf("  BEGIN:\n");
515     printf("     %.*s\n", (int)frag->len, frag->buf);
516     printf("  END\n");
517     printf("\n");
518   }
519 }
520 // LCOV_EXCL_STOP