1 #include "fragmenter.h"
2 #include "toksep.h"
3 #include "tokenize.h"
4 #include "util/minmax.h"
5 #include <ctype.h>
6 #include <float.h>
7 #include <sys/uio.h>
8 #include <assert.h>
9
10 // Estimated characters per token
11 #define EST_CHARS_PER_TOK 6
12
FragmentList_LastFragment(FragmentList * fragList)13 static Fragment *FragmentList_LastFragment(FragmentList *fragList) {
14 if (!fragList->frags.len) {
15 return NULL;
16 }
17 return (Fragment *)(fragList->frags.data + (fragList->frags.len - sizeof(Fragment)));
18 }
19
FragmentList_AddFragment(FragmentList * fragList)20 static Fragment *FragmentList_AddFragment(FragmentList *fragList) {
21 Fragment *frag = Array_Add(&fragList->frags, sizeof(Fragment));
22 memset(frag, 0, sizeof(*frag));
23 frag->fragPos = fragList->numFrags++;
24 Array_Init(&frag->termLocs);
25 return frag;
26 }
27
Fragment_GetNumTerms(const Fragment * frag)28 static size_t Fragment_GetNumTerms(const Fragment *frag) {
29 return ARRAY_GETSIZE_AS(&frag->termLocs, TermLoc);
30 }
31
Fragment_HasTerm(const Fragment * frag,uint32_t termId)32 static int Fragment_HasTerm(const Fragment *frag, uint32_t termId) {
33 TermLoc *locs = ARRAY_GETARRAY_AS(&frag->termLocs, TermLoc *);
34
35 int firstOcurrence = 1;
36 // If this is the first time the term appears in the fragment, increment the
37 // fragment's score by the term's score. Otherwise, increment it by half
38 // the fragment's score. This allows for better 'blended' results.
39 for (size_t ii = 0; ii < Fragment_GetNumTerms(frag); ii++) {
40 if (locs[ii].termId == termId) {
41 return 1;
42 }
43 }
44 return 0;
45 }
46
FragmentList_AddMatchingTerm(FragmentList * fragList,uint32_t termId,uint32_t tokPos,const char * tokBuf,size_t tokLen,float baseScore)47 static Fragment *FragmentList_AddMatchingTerm(FragmentList *fragList, uint32_t termId,
48 uint32_t tokPos, const char *tokBuf, size_t tokLen,
49 float baseScore) {
50
51 Fragment *curFrag = FragmentList_LastFragment(fragList);
52 if (curFrag && tokPos - curFrag->lastMatchPos > fragList->maxDistance) {
53 // There is too much distance between tokens for it to still be relevant.
54 curFrag = NULL;
55 }
56
57 if (curFrag == NULL) {
58 curFrag = FragmentList_AddFragment(fragList);
59 fragList->numToksSinceLastMatch = 0;
60 curFrag->buf = tokBuf;
61 }
62
63 if (!Fragment_HasTerm(curFrag, termId)) {
64 curFrag->score += baseScore;
65 }
66
67 curFrag->len = (tokBuf - curFrag->buf) + tokLen;
68 curFrag->lastMatchPos = tokPos;
69 curFrag->numMatches++;
70 curFrag->totalTokens += fragList->numToksSinceLastMatch + 1;
71 fragList->numToksSinceLastMatch = 0;
72
73 TermLoc *newLoc = Array_Add(&curFrag->termLocs, sizeof(TermLoc));
74 newLoc->termId = termId;
75 newLoc->offset = tokBuf - curFrag->buf;
76 newLoc->len = tokLen;
77
78 return curFrag;
79 }
80
extractToken(FragmentList * fragList,const Token * tokInfo,const FragmentSearchTerm * terms,size_t numTerms)81 static void extractToken(FragmentList *fragList, const Token *tokInfo,
82 const FragmentSearchTerm *terms, size_t numTerms) {
83 const FragmentSearchTerm *term = NULL;
84 uint32_t termId;
85
86 // See if this token matches any of our terms.
87 for (termId = 0; termId < numTerms; ++termId) {
88 const FragmentSearchTerm *cur = terms + termId;
89 if (tokInfo->tokLen == cur->len && strncmp(tokInfo->tok, cur->tok, cur->len) == 0) {
90 } else if (tokInfo->stem && tokInfo->stemLen == cur->len &&
91 strncmp(tokInfo->stem, cur->tok, cur->len) == 0) {
92 } else {
93 continue;
94 }
95 term = cur;
96 break;
97 }
98
99 // Don't care about this token
100 if (!term) {
101 fragList->numToksSinceLastMatch++;
102 return;
103 }
104
105 FragmentList_AddMatchingTerm(fragList, termId, tokInfo->pos, tokInfo->raw, tokInfo->rawLen,
106 term->score);
107 }
108
FragmentList_FragmentizeBuffer(FragmentList * fragList,const char * doc,Stemmer * stemmer,StopWordList * stopwords,const FragmentSearchTerm * terms,size_t numTerms)109 void FragmentList_FragmentizeBuffer(FragmentList *fragList, const char *doc, Stemmer *stemmer,
110 StopWordList *stopwords, const FragmentSearchTerm *terms,
111 size_t numTerms) {
112 fragList->doc = doc;
113 fragList->docLen = strlen(doc);
114 RSTokenizer *tokenizer = NewSimpleTokenizer(stemmer, stopwords, TOKENIZE_NOMODIFY);
115 tokenizer->Start(tokenizer, (char *)fragList->doc, fragList->docLen, 0);
116 Token tokInfo;
117 while (tokenizer->Next(tokenizer, &tokInfo)) {
118 extractToken(fragList, &tokInfo, terms, numTerms);
119 }
120 tokenizer->Free(tokenizer);
121 }
122
addToIov(const char * s,size_t n,Array * b)123 static void addToIov(const char *s, size_t n, Array *b) {
124 if (n == 0 || s == NULL) {
125 return;
126 }
127 struct iovec *iov = Array_Add(b, sizeof(*iov));
128 assert(iov);
129 iov->iov_base = (void *)s;
130 iov->iov_len = n;
131 }
132
133 /**
134 * Writes a complete fragment as a series of IOVs.
135 * - fragment is the fragment to write
136 * - tags is the tags to use
137 * - contextLen is any amount of context used to surround the fragment with
138 * - iovs is the target buffer in which the iovs should be written
139 *
140 * - preamble is any prior text which may need to be written alongside the fragment.
141 * In output, it contains the first byte after the fragment+context. This may be
142 * used as the 'preamble' value for a subsequent call to this function, if the next
143 * fragment being written is after the current one.
144 */
Fragment_WriteIovs(const Fragment * curFrag,const char * openTag,size_t openLen,const char * closeTag,size_t closeLen,Array * iovs,const char ** preamble)145 static void Fragment_WriteIovs(const Fragment *curFrag, const char *openTag, size_t openLen,
146 const char *closeTag, size_t closeLen, Array *iovs,
147 const char **preamble) {
148
149 const TermLoc *locs = ARRAY_GETARRAY_AS(&curFrag->termLocs, const TermLoc *);
150 size_t nlocs = ARRAY_GETSIZE_AS(&curFrag->termLocs, TermLoc);
151 const char *preamble_s = NULL;
152
153 if (!preamble) {
154 preamble = &preamble_s;
155 }
156 if (!*preamble) {
157 *preamble = curFrag->buf;
158 }
159
160 for (size_t ii = 0; ii < nlocs; ++ii) {
161 const TermLoc *curLoc = locs + ii;
162
163 size_t preambleLen = (curFrag->buf + curLoc->offset) - *preamble;
164
165 // Add any prior text
166 if (preambleLen) {
167 addToIov(*preamble, preambleLen, iovs);
168 }
169
170 if (openLen) {
171 addToIov(openTag, openLen, iovs);
172 }
173
174 // Add the token itself
175 addToIov(curFrag->buf + curLoc->offset, curLoc->len, iovs);
176
177 // Add close tag
178 if (closeLen) {
179 addToIov(closeTag, closeLen, iovs);
180 }
181
182 *preamble = curFrag->buf + curLoc->offset + curLoc->len;
183 }
184 }
185
FragmentList_HighlightWholeDocV(const FragmentList * fragList,const HighlightTags * tags,Array * iovs)186 void FragmentList_HighlightWholeDocV(const FragmentList *fragList, const HighlightTags *tags,
187 Array *iovs) {
188 const Fragment *frags = FragmentList_GetFragments(fragList);
189
190 if (!fragList->numFrags) {
191 // Whole doc, but no matches found
192 addToIov(fragList->doc, fragList->docLen, iovs);
193 return;
194 }
195
196 const char *preamble = fragList->doc;
197 size_t openLen = strlen(tags->openTag);
198 size_t closeLen = strlen(tags->closeTag);
199
200 for (size_t ii = 0; ii < fragList->numFrags; ++ii) {
201 const Fragment *curFrag = frags + ii;
202 Fragment_WriteIovs(curFrag, tags->openTag, openLen, tags->closeTag, closeLen, iovs, &preamble);
203 }
204
205 // Write the last preamble
206 size_t preambleLen = (fragList->doc + fragList->docLen) - preamble;
207 // size_t preambleLen = strlen(preamble);
208 if (preambleLen) {
209 addToIov(preamble, preambleLen, iovs);
210 }
211 }
212
FragmentList_HighlightWholeDocS(const FragmentList * fragList,const HighlightTags * tags)213 char *FragmentList_HighlightWholeDocS(const FragmentList *fragList, const HighlightTags *tags) {
214 Array iovsArr;
215 Array_Init(&iovsArr);
216 FragmentList_HighlightWholeDocV(fragList, tags, &iovsArr);
217
218 // Calculate the length
219 struct iovec *iovs = ARRAY_GETARRAY_AS(&iovsArr, struct iovec *);
220 size_t niovs = ARRAY_GETSIZE_AS(&iovsArr, struct iovec);
221 size_t docLen = 0;
222 for (size_t ii = 0; ii < niovs; ++ii) {
223 docLen += iovs[ii].iov_len;
224 }
225
226 char *docBuf = rm_malloc(docLen + 1);
227 docBuf[docLen] = '\0';
228
229 assert(docBuf);
230 size_t offset = 0;
231 for (size_t ii = 0; ii < niovs; ++ii) {
232 memcpy(docBuf + offset, iovs[ii].iov_base, iovs[ii].iov_len);
233 offset += iovs[ii].iov_len;
234 }
235
236 Array_Free(&iovsArr);
237 return docBuf;
238 }
239
fragSortCmp(const void * pa,const void * pb)240 static int fragSortCmp(const void *pa, const void *pb) {
241 const Fragment *a = *(const Fragment **)pa, *b = *(const Fragment **)pb;
242 if (a->score == b->score) {
243 return a - b;
244 }
245 return a->score > b->score ? -1 : 1;
246 }
247
FragmentList_Sort(FragmentList * fragList)248 static void FragmentList_Sort(FragmentList *fragList) {
249 if (fragList->sortedFrags) {
250 return;
251 }
252
253 const Fragment *origFrags = FragmentList_GetFragments(fragList);
254 fragList->sortedFrags = rm_malloc(sizeof(*fragList->sortedFrags) * fragList->numFrags);
255
256 for (size_t ii = 0; ii < fragList->numFrags; ++ii) {
257 fragList->sortedFrags[ii] = origFrags + ii;
258 }
259
260 qsort(fragList->sortedFrags, fragList->numFrags, sizeof(fragList->sortedFrags[0]), fragSortCmp);
261 for (size_t ii = 0; ii < fragList->numFrags; ++ii) {
262 ((Fragment *)fragList->sortedFrags[ii])->scoreRank = ii;
263 }
264 }
265
sortByOrder(const void * pa,const void * pb)266 static int sortByOrder(const void *pa, const void *pb) {
267 const Fragment *a = *(const Fragment **)pa, *b = *(const Fragment **)pb;
268 return (int)a->fragPos - (int)b->fragPos;
269 }
270
271 /**
272 * Add context before and after the fragment.
273 * - frag is the fragment to contextualize
274 * - limitBefore, limitAfter are boundaries, such that the following will be
275 * true:
276 * - limitBefore <= before <= frag->buf
277 * - limitAfter > after >= frag->buf + frag->len
278 * If limitBefore is not specified, it defaults to the beginning of the fragList's doc
279 * If limitAfter is not specified, then the limit ends after the first NUL terminator.
280 */
FragmentList_FindContext(const FragmentList * fragList,const Fragment * frag,const char * limitBefore,const char * limitAfter,size_t contextSize,struct iovec * before,struct iovec * after)281 static void FragmentList_FindContext(const FragmentList *fragList, const Fragment *frag,
282 const char *limitBefore, const char *limitAfter,
283 size_t contextSize, struct iovec *before,
284 struct iovec *after) {
285
286 if (limitBefore == NULL) {
287 limitBefore = fragList->doc;
288 }
289 if (limitAfter == NULL) {
290 limitAfter = fragList->doc + fragList->docLen - 1;
291 }
292
293 // Subtract the number of context (i.e. non-match) words
294 // already inside the
295 // snippet.
296 if (contextSize <= frag->totalTokens - frag->numMatches) {
297 before->iov_base = after->iov_base = NULL;
298 before->iov_len = after->iov_len = 0;
299 return;
300 }
301
302 contextSize -= (frag->totalTokens - frag->numMatches);
303
304 // i.e. how much context before and after
305 contextSize /= 2;
306
307 // At some point we need to make a cutoff in terms of *bytes*
308 contextSize *= fragList->estAvgWordSize;
309
310 // TODO: If this context flows directly into a neighboring context, signal
311 // some way to *merge* them.
312
313 limitBefore = Max(frag->buf - contextSize, limitBefore);
314 limitAfter = Min(frag->buf + frag->len + contextSize, limitAfter);
315
316 before->iov_base = (void *)frag->buf;
317 before->iov_len = 0;
318
319 // Find the context immediately prior to our fragment, this means to advance
320 // the cursor as much as possible until a separator is reached, and then
321 // seek past that separator (if there are separators)
322 for (; limitBefore < frag->buf && !istoksep(*limitBefore); limitBefore++) {
323 // Found a separator.
324 }
325 for (; limitBefore < frag->buf && istoksep(*limitBefore); limitBefore++) {
326 // Strip away future separators
327 }
328 before->iov_base = (void *)limitBefore;
329 before->iov_len = frag->buf - limitBefore;
330
331 // Do the same for the 'after' context.
332 for (; limitAfter > frag->buf + frag->len && !istoksep(*limitAfter); limitAfter--) {
333 // Found a separator
334 }
335
336 for (; limitAfter > frag->buf + frag->len && istoksep(*limitAfter); limitAfter--) {
337 // Seek to the end of the last non-separator word
338 }
339
340 after->iov_base = (void *)frag->buf + frag->len;
341 after->iov_len = limitAfter - (frag->buf + frag->len) + 1;
342 }
343
FragmentList_HighlightFragments(FragmentList * fragList,const HighlightTags * tags,size_t contextSize,Array * iovArrList,size_t niovs,int order)344 void FragmentList_HighlightFragments(FragmentList *fragList, const HighlightTags *tags,
345 size_t contextSize, Array *iovArrList, size_t niovs,
346 int order) {
347
348 const Fragment *frags = FragmentList_GetFragments(fragList);
349 niovs = Min(niovs, fragList->numFrags);
350
351 if (!fragList->scratchFrags) {
352 fragList->scratchFrags = rm_malloc(sizeof(*fragList->scratchFrags) * fragList->numFrags);
353 }
354 const Fragment **indexes = fragList->scratchFrags;
355
356 if (order == HIGHLIGHT_ORDER_POS) {
357 for (size_t ii = 0; ii < niovs; ++ii) {
358 indexes[ii] = frags + ii;
359 }
360 } else if (order & HIGHLIGHT_ORDER_SCORE) {
361 FragmentList_Sort(fragList);
362 for (size_t ii = 0; ii < niovs; ++ii) {
363 indexes[ii] = fragList->sortedFrags[ii];
364 }
365 if (order & HIGHLIGHT_ORDER_POS) {
366 qsort(indexes, niovs, sizeof indexes[0], sortByOrder);
367 }
368 }
369
370 size_t openLen = tags->openTag ? strlen(tags->openTag) : 0;
371 size_t closeLen = tags->closeTag ? strlen(tags->closeTag) : 0;
372
373 for (size_t ii = 0; ii < niovs; ++ii) {
374 Array *curArr = iovArrList + ii;
375
376 const char *beforeLimit = NULL, *afterLimit = NULL;
377 const Fragment *curFrag = indexes[ii];
378
379 if (order & HIGHLIGHT_ORDER_POS) {
380 if (ii > 0) {
381 beforeLimit = indexes[ii - 1]->buf + indexes[ii - 1]->len;
382 }
383 if (ii + 1 < niovs) {
384 afterLimit = indexes[ii + 1]->buf;
385 }
386 }
387
388 struct iovec before, after;
389 FragmentList_FindContext(fragList, curFrag, beforeLimit, afterLimit, contextSize, &before,
390 &after);
391 addToIov(before.iov_base, before.iov_len, curArr);
392 Fragment_WriteIovs(curFrag, tags->openTag, openLen, tags->closeTag, closeLen, curArr, NULL);
393 addToIov(after.iov_base, after.iov_len, curArr);
394 }
395 }
396
FragmentList_Free(FragmentList * fragList)397 void FragmentList_Free(FragmentList *fragList) {
398 Fragment *frags = (Fragment *)FragmentList_GetFragments(fragList);
399 for (size_t ii = 0; ii < fragList->numFrags; ii++) {
400 Array_Free(&frags[ii].termLocs);
401 }
402 Array_Free(&fragList->frags);
403 rm_free(fragList->sortedFrags);
404 rm_free(fragList->scratchFrags);
405 }
406
407 /**
408 * Tokenization:
409 * If we have term offsets and document terms, we can skip the tokenization process.
410 *
411 * 1) Gather all matching terms for the documents, and get their offsets (in position)
412 * 2) Sort all terms, by position
413 * 3) Start reading the byte offset list, until we reach the first term of the match
414 * list, then, consume the matches until the maximum distance has been reached,
415 * noting the terms for each.
416 */
FragmentList_FragmentizeIter(FragmentList * fragList,const char * doc,size_t docLen,FragmentTermIterator * iter,int options)417 void FragmentList_FragmentizeIter(FragmentList *fragList, const char *doc, size_t docLen,
418 FragmentTermIterator *iter, int options) {
419 fragList->docLen = docLen;
420 fragList->doc = doc;
421 FragmentTerm *curTerm;
422 size_t lastTokPos = -1;
423 size_t lastByteEnd = 0;
424
425 while (FragmentTermIterator_Next(iter, &curTerm)) {
426 if (curTerm == NULL) {
427 fragList->numToksSinceLastMatch++;
428 continue;
429 }
430
431 if (curTerm->tokPos == lastTokPos) {
432 continue;
433 }
434
435 if (curTerm->bytePos < lastByteEnd) {
436 // If our length estimations are off, don't use already-swallowed matches
437 continue;
438 }
439
440 // Get the length of the current token. This is used to highlight the term
441 // (if requested), and just terminates at the first non-separator character
442 size_t len;
443 if (options & FRAGMENTIZE_TOKLEN_EXACT) {
444 len = curTerm->len;
445 } else {
446 len = 0;
447 for (size_t ii = curTerm->bytePos; ii < fragList->docLen && !istoksep(doc[ii]); ++ii, ++len) {
448 }
449 }
450
451 FragmentList_AddMatchingTerm(fragList, curTerm->termId, curTerm->tokPos, doc + curTerm->bytePos,
452 len, curTerm->score);
453 lastTokPos = curTerm->tokPos;
454 lastByteEnd = curTerm->bytePos + len;
455 }
456 }
457
FragmentTermIterator_InitOffsets(FragmentTermIterator * iter,RSByteOffsetIterator * byteOffsets,RSOffsetIterator * offIter)458 void FragmentTermIterator_InitOffsets(FragmentTermIterator *iter, RSByteOffsetIterator *byteOffsets,
459 RSOffsetIterator *offIter) {
460 iter->offsetIter = offIter;
461 iter->byteIter = byteOffsets;
462 iter->curByteOffset = RSByteOffsetIterator_Next(iter->byteIter);
463
464 // Advance the offset iterator to the first offset we care about (i.e. that
465 // correlates with the first byte offset)
466 do {
467 iter->curTokPos = iter->offsetIter->Next(iter->offsetIter->ctx, &iter->curMatchRec);
468 } while (iter->byteIter->curPos > iter->curTokPos);
469 }
470
FragmentTermIterator_Next(FragmentTermIterator * iter,FragmentTerm ** termInfo)471 int FragmentTermIterator_Next(FragmentTermIterator *iter, FragmentTerm **termInfo) {
472 if (iter->curMatchRec == NULL || iter->curByteOffset == RSBYTEOFFSET_EOF ||
473 iter->curTokPos == RS_OFFSETVECTOR_EOF) {
474 return 0;
475 }
476
477 if (iter->byteIter->curPos < iter->curTokPos) {
478 iter->curByteOffset = RSByteOffsetIterator_Next(iter->byteIter);
479 // No matching term at this position.
480 // printf("IterPos=%lu. LastMatchPos=%u\n", iter->byteIter->curPos, iter->curTokPos);
481 *termInfo = NULL;
482 return 1;
483 }
484
485 // printf("ByteOffset=%lu. LastMatchPos=%u\n", iter->curByteOffset, iter->curTokPos);
486
487 RSQueryTerm *term = iter->curMatchRec;
488
489 // printf("Term Pointer: %p\n", term);
490 iter->tmpTerm.score = term->idf;
491 iter->tmpTerm.termId = term->id;
492 iter->tmpTerm.len = term->len;
493 iter->tmpTerm.tokPos = iter->curTokPos;
494 iter->tmpTerm.bytePos = iter->curByteOffset;
495 *termInfo = &iter->tmpTerm;
496
497 uint32_t nextPos = iter->offsetIter->Next(iter->offsetIter->ctx, &iter->curMatchRec);
498 if (nextPos != iter->curTokPos) {
499 iter->curByteOffset = RSByteOffsetIterator_Next(iter->byteIter);
500 }
501 iter->curTokPos = nextPos;
502 return 1;
503 }
504
505 // LCOV_EXCL_START debug
FragmentList_Dump(const FragmentList * fragList)506 void FragmentList_Dump(const FragmentList *fragList) {
507 printf("NumFrags: %u\n", fragList->numFrags);
508 for (size_t ii = 0; ii < fragList->numFrags; ++ii) {
509 const Fragment *frag = ARRAY_GETITEM_AS(&fragList->frags, ii, Fragment *);
510 printf("Frag[%lu]: Buf=%p, (pos=%lu), Len=%u\n", ii, frag->buf, frag->buf - fragList->doc,
511 frag->len);
512 printf(" Score: %f, Rank=%u. Total Tokens=%u\n", frag->score, frag->scoreRank,
513 frag->totalTokens);
514 printf(" BEGIN:\n");
515 printf(" %.*s\n", (int)frag->len, frag->buf);
516 printf(" END\n");
517 printf("\n");
518 }
519 }
520 // LCOV_EXCL_STOP