1 #include "result_processor.h"
2 #include "fragmenter.h"
3 #include "value.h"
4 #include "util/minmax.h"
5 #include "toksep.h"
6 #include <ctype.h>
7 
8 typedef struct {
9   ResultProcessor base;
10   int fragmentizeOptions;
11   const FieldList *fields;
12   const RLookup *lookup;
13 } HlpProcessor;
14 
15 /**
16  * Common parameters passed around for highlighting one or more fields within
17  * a document. This structure exists to avoid passing these four parameters
18  * discreetly (as we did in previous versiosn)
19  */
20 typedef struct {
21   // Byte offsets, byte-wise
22   const RSByteOffsets *byteOffsets;
23 
24   // Index result, which contains the term offsets (word-wise)
25   const RSIndexResult *indexResult;
26 
27   // Array used for in/out when writing fields. Optimization cache
28   Array *iovsArr;
29 
30   RLookupRow *row;
31 
32 } hlpDocContext;
33 
34 /**
35  * Attempts to fragmentize a single field from its offset entries. This takes
36  * the field name, gets the matching field ID, retrieves the offset iterator
37  * for the field ID, and fragments the text based on the offsets. The fragmenter
38  * itself is in fragmenter.{c,h}
39  *
40  * Returns true if the fragmentation succeeded, false otherwise.
41  */
fragmentizeOffsets(IndexSpec * spec,const char * fieldName,const char * fieldText,size_t fieldLen,const RSIndexResult * indexResult,const RSByteOffsets * byteOffsets,FragmentList * fragList,int options)42 static int fragmentizeOffsets(IndexSpec *spec, const char *fieldName, const char *fieldText,
43                               size_t fieldLen, const RSIndexResult *indexResult,
44                               const RSByteOffsets *byteOffsets, FragmentList *fragList,
45                               int options) {
46   const FieldSpec *fs = IndexSpec_GetField(spec, fieldName, strlen(fieldName));
47   if (!fs || !FIELD_IS(fs, INDEXFLD_T_FULLTEXT)) {
48     return 0;
49   }
50 
51   int rc = 0;
52   RSOffsetIterator offsIter = RSIndexResult_IterateOffsets(indexResult);
53   FragmentTermIterator fragIter = {NULL};
54   RSByteOffsetIterator bytesIter;
55   if (RSByteOffset_Iterate(byteOffsets, fs->ftId, &bytesIter) != REDISMODULE_OK) {
56     goto done;
57   }
58 
59   FragmentTermIterator_InitOffsets(&fragIter, &bytesIter, &offsIter);
60   FragmentList_FragmentizeIter(fragList, fieldText, fieldLen, &fragIter, options);
61   if (fragList->numFrags == 0) {
62     goto done;
63   }
64   rc = 1;
65 
66 done:
67   offsIter.Free(offsIter.ctx);
68   return rc;
69 }
70 
// Strip spaces from a buffer in place. Every run of one or more whitespace
// characters (as classified by isspace()) in s[0..n) is replaced by a single
// ' '. Returns the new length of the text; bytes past that length are left
// untouched and no NUL terminator is written.
static size_t stripDuplicateSpaces(char *s, size_t n) {
  int isLastSpace = 0;
  size_t oix = 0;
  for (size_t ii = 0; ii < n; ++ii) {
    // <ctype.h> classifiers need a value representable as unsigned char;
    // passing a negative plain char (e.g. a UTF-8 byte) is undefined.
    const int curIsSpace = isspace((unsigned char)s[ii]) != 0;
    if (!curIsSpace) {
      s[oix++] = s[ii];
    } else if (!isLastSpace) {
      // First whitespace of a run: keep exactly one plain space
      s[oix++] = ' ';
    }
    isLastSpace = curIsSpace;
  }
  return oix;
}
92 
/**
 * Returns the length of the buffer without trailing spaces.
 *
 * `input` is the current length of `s`. The buffer itself is not modified;
 * the result is the length of the longest prefix that does not end in a
 * whitespace character (as classified by isspace()).
 */
static size_t trimTrailingSpaces(const char *s, size_t input) {
  // Cast to unsigned char: <ctype.h> classifiers are undefined for negative
  // plain-char values such as non-ASCII bytes.
  while (input > 0 && isspace((unsigned char)s[input - 1])) {
    --input;
  }
  return input;
}
102 
/**
 * Merges the settings of a single returned field with the global defaults.
 *
 * srcField - per-field settings, or NULL when only the defaults apply
 * defaults - the default (global) field settings
 * out      - receives the merged settings. Unless srcField is NULL, `out` must
 *            already be initialized (callers zero it): its mode is OR-ed
 *            into, and its highlight settings are left as they are when
 *            neither side requests highlighting.
 *
 * For each of highlighting and summarization, the defaults are used when they
 * enable the feature and the field itself does not; otherwise the field's own
 * settings are taken.
 */
static void normalizeSettings(const ReturnedField *srcField, const ReturnedField *defaults,
                              ReturnedField *out) {
  if (srcField == NULL) {
    // Global setting
    *out = *defaults;
    return;
  }

  // Otherwise it gets more complex
  if ((defaults->mode & SummarizeMode_Highlight) &&
      (srcField->mode & SummarizeMode_Highlight) == 0) {
    out->highlightSettings = defaults->highlightSettings;
  } else if (srcField->mode & SummarizeMode_Highlight) {
    // Flag test must be bitwise; a logical && here would be true for any
    // non-zero mode, including summarize-only fields.
    out->highlightSettings = srcField->highlightSettings;
  }

  if ((defaults->mode & SummarizeMode_Synopsis) && (srcField->mode & SummarizeMode_Synopsis) == 0) {
    out->summarizeSettings = defaults->summarizeSettings;
  } else {
    out->summarizeSettings = srcField->summarizeSettings;
  }

  out->mode |= defaults->mode | srcField->mode;
  out->name = srcField->name;
  out->lookupKey = srcField->lookupKey;
}
129 
130 // Called when we cannot fragmentize based on byte offsets.
131 // docLen is an in/out parameter. On input it should contain the length of the
132 // field, and on output it contains the length of the trimmed summary.
133 // Returns a string which should be freed using free()
trimField(const ReturnedField * fieldInfo,const char * docStr,size_t * docLen,size_t estWordSize)134 static char *trimField(const ReturnedField *fieldInfo, const char *docStr, size_t *docLen,
135                        size_t estWordSize) {
136 
137   // Number of desired fragments times the number of context words in each fragments,
138   // in characters (estWordSize)
139   size_t headLen =
140       estWordSize * fieldInfo->summarizeSettings.contextLen * fieldInfo->summarizeSettings.numFrags;
141   headLen += estWordSize;  // Because we trim off a word when finding the toksep
142   headLen = Min(headLen, *docLen);
143 
144   Array bufTmp;
145   Array_InitEx(&bufTmp, ArrayAlloc_RM);
146 
147   Array_Write(&bufTmp, docStr, headLen);
148   headLen = stripDuplicateSpaces(bufTmp.data, headLen);
149   Array_Resize(&bufTmp, headLen);
150 
151   while (bufTmp.len > 1) {
152     if (istoksep(bufTmp.data[bufTmp.len - 1])) {
153       break;
154     }
155     bufTmp.len--;
156   }
157 
158   bufTmp.len = trimTrailingSpaces(bufTmp.data, bufTmp.len);
159   char *ret = Array_Steal(&bufTmp, docLen);
160   return ret;
161 }
162 
/**
 * Produces the summarized and/or highlighted replacement value for one field.
 *
 * spec          - index spec used to resolve the field
 * fieldInfo     - merged settings (mode, tags, fragment counts, separator)
 * fieldName     - name of the field being processed
 * returnedField - current string value of the field
 * docParams     - per-document context (offsets, index result, iovec cache)
 * options       - fragmenter options
 *
 * Returns a new string value, or NULL when the field should be left as is
 * (fragmentation failed and the mode is not exactly SummarizeMode_Synopsis).
 * The fragment list is released on every exit path.
 */
static RSValue *summarizeField(IndexSpec *spec, const ReturnedField *fieldInfo,
                               const char *fieldName, const RSValue *returnedField,
                               hlpDocContext *docParams, int options) {

  FragmentList frags;
  FragmentList_Init(&frags, 8, 6);

  // Start gathering the terms
  HighlightTags tags = {.openTag = fieldInfo->highlightSettings.openTag,
                        .closeTag = fieldInfo->highlightSettings.closeTag};

  // First actually generate the fragments
  size_t docLen;
  const char *docStr = RSValue_StringPtrLen(returnedField, &docLen);
  if (docParams->byteOffsets == NULL ||
      !fragmentizeOffsets(spec, fieldName, docStr, docLen, docParams->indexResult,
                          docParams->byteOffsets, &frags, options)) {
    RSValue *fallback = NULL;
    if (fieldInfo->mode == SummarizeMode_Synopsis) {
      // If summarizing is requested then trim the field so that the user isn't
      // spammed with a large blob of text
      char *summarized = trimField(fieldInfo, docStr, &docLen, frags.estAvgWordSize);
      fallback = RS_StringVal(summarized, docLen);
    }
    // Otherwise the whole field is kept, without highlighting (NULL result).
    // Either way the fragment list must be released before leaving.
    FragmentList_Free(&frags);
    return fallback;
  }

  // Highlight only
  if (fieldInfo->mode == SummarizeMode_Highlight) {
    // No need to return snippets; just return the entire doc with relevant tags
    // highlighted.
    char *hlDoc = FragmentList_HighlightWholeDocS(&frags, &tags);
    FragmentList_Free(&frags);
    return RS_StringValC(hlDoc);
  }

  // Never use more iovec arrays than fragments requested or fragments found
  size_t numIovArr = Min(fieldInfo->summarizeSettings.numFrags, FragmentList_GetNumFrags(&frags));
  for (size_t ii = 0; ii < numIovArr; ++ii) {
    Array_Resize(&docParams->iovsArr[ii], 0);
  }

  FragmentList_HighlightFragments(&frags, &tags, fieldInfo->summarizeSettings.contextLen,
                                  docParams->iovsArr, numIovArr, HIGHLIGHT_ORDER_SCOREPOS);

  // Buffer to store concatenated fragments
  Array bufTmp;
  Array_InitEx(&bufTmp, ArrayAlloc_RM);

  for (size_t ii = 0; ii < numIovArr; ++ii) {
    Array *curIovs = docParams->iovsArr + ii;
    struct iovec *iovs = ARRAY_GETARRAY_AS(curIovs, struct iovec *);
    size_t numIovs = ARRAY_GETSIZE_AS(curIovs, struct iovec);
    size_t lastSize = bufTmp.len;

    for (size_t jj = 0; jj < numIovs; ++jj) {
      Array_Write(&bufTmp, iovs[jj].iov_base, iovs[jj].iov_len);
    }

    // Duplicate spaces for the current snippet are eliminated here. We shouldn't
    // move it to the end because the delimiter itself may contain a special kind
    // of whitespace.
    size_t newSize = stripDuplicateSpaces(bufTmp.data + lastSize, bufTmp.len - lastSize);
    Array_Resize(&bufTmp, lastSize + newSize);
    // The separator is appended after every snippet, including the last one
    Array_Write(&bufTmp, fieldInfo->summarizeSettings.separator,
                strlen(fieldInfo->summarizeSettings.separator));
  }

  // Hand the concatenated buffer over to the resulting string value
  size_t hlLen;
  char *hlText = Array_Steal(&bufTmp, &hlLen);
  Array_Free(&bufTmp);
  FragmentList_Free(&frags);
  return RS_StringVal(hlText, hlLen);
}
240 
/**
 * Prepares the cached iovec arrays for the next field.
 *
 * iovsArrp - pointer to the cache (may point to NULL when *curSize is 0)
 * curSize  - in/out: number of initialized arrays in the cache
 * newSize  - number of arrays the next field needs
 *
 * All existing arrays are emptied. The cache only ever grows: when newSize is
 * smaller than *curSize the extra arrays stay initialized and counted, so the
 * owner's cleanup loop over *curSize entries still releases them. Shrinking
 * the count would leak the buffers of the arrays past newSize.
 */
static void resetIovsArr(Array **iovsArrp, size_t *curSize, size_t newSize) {
  // Empty everything we already own so it can be reused
  for (size_t ii = 0; ii < *curSize; ++ii) {
    Array_Resize((*iovsArrp) + ii, 0);
  }

  if (newSize <= *curSize) {
    return;
  }

  // Grow the cache and initialize only the newly added slots
  *iovsArrp = rm_realloc(*iovsArrp, sizeof(**iovsArrp) * newSize);
  for (size_t ii = *curSize; ii < newSize; ++ii) {
    Array_Init((*iovsArrp) + ii);
  }
  *curSize = newSize;
}
253 
/**
 * Summarizes/highlights a single field of the current row.
 *
 * The field value is looked up through spec->lookupKey. Missing or non-string
 * values are skipped. When summarizeField() yields a new value it is written
 * back to the row under the same key; a NULL result leaves the row untouched.
 */
static void processField(HlpProcessor *hlpCtx, hlpDocContext *docParams, ReturnedField *spec) {
  const RSValue *current = RLookup_GetItem(spec->lookupKey, docParams->row);
  if (current == NULL) {
    return;
  }
  if (!RSValue_IsString(current)) {
    return;
  }

  RSValue *replacement = summarizeField(RP_SPEC(&hlpCtx->base), spec, spec->name, current,
                                        docParams, hlpCtx->fragmentizeOptions);
  if (replacement != NULL) {
    RLookup_WriteOwnKey(spec->lookupKey, docParams->row, replacement);
  }
}
267 
getIndexResult(ResultProcessor * rp,t_docId docId)268 static const RSIndexResult *getIndexResult(ResultProcessor *rp, t_docId docId) {
269   IndexIterator *it = QITR_GetRootFilter(rp->parent);
270   RSIndexResult *ir = NULL;
271   if (!it) {
272     return NULL;
273   }
274   it->Rewind(it->ctx);
275   if (INDEXREAD_OK != it->SkipTo(it->ctx, docId, &ir)) {
276     return NULL;
277   }
278   return ir;
279 }
280 
/**
 * Next() callback of the highlighter: pulls one result from upstream and
 * rewrites the requested fields of its row with their summarized/highlighted
 * form.
 *
 * Any upstream return code other than RS_RESULT_OK is passed through as is.
 * A result without an index result or without document metadata is passed on
 * unmodified. Returns RS_RESULT_OK in all other cases.
 */
static int hlpNext(ResultProcessor *rbase, SearchResult *r) {
  const int upstreamRc = rbase->upstream->Next(rbase->upstream, r);
  if (upstreamRc != RS_RESULT_OK) {
    return upstreamRc;
  }

  HlpProcessor *self = (HlpProcessor *)rbase;

  // Prefer the index result carried by the search result; otherwise fetch it
  // for the current document from the root iterator.
  const RSIndexResult *ir = r->indexResult;
  if (!ir) {
    ir = getIndexResult(rbase, r->docId);
  }

  // Nothing can be highlighted without the index result; pass the result on
  if (!ir) {
    return RS_RESULT_OK;
  }

  RSDocumentMetadata *dmd = r->dmd;
  if (!dmd) {
    return RS_RESULT_OK;
  }

  const FieldList *fields = self->fields;
  size_t iovsCount = 0;
  hlpDocContext docParams = {.byteOffsets = dmd->byteOffsets,
                             .indexResult = ir,
                             .iovsArr = NULL,
                             .row = &r->rowdata};

  if (fields->numFields) {
    // Explicit field list: merge each field's settings with the defaults
    const ReturnedField *cur = fields->fields;
    const ReturnedField *end = cur + fields->numFields;
    for (; cur != end; ++cur) {
      if (cur->mode == SummarizeMode_None && fields->defaultField.mode == SummarizeMode_None) {
        // Ignore - this is a field for `RETURN`, not `SUMMARIZE`
        continue;
      }
      ReturnedField merged = {0};
      normalizeSettings(cur, &fields->defaultField, &merged);
      resetIovsArr(&docParams.iovsArr, &iovsCount, merged.summarizeSettings.numFrags);
      processField(self, &docParams, &merged);
    }
  } else if (fields->defaultField.mode != SummarizeMode_None) {
    // No explicit fields: apply the defaults to every visible lookup key
    const RLookupKey *key = self->lookup->head;
    while (key) {
      if (!(key->flags & RLOOKUP_F_HIDDEN)) {
        ReturnedField keySpec = {0};
        normalizeSettings(NULL, &fields->defaultField, &keySpec);
        keySpec.lookupKey = key;
        keySpec.name = key->name;
        resetIovsArr(&docParams.iovsArr, &iovsCount, keySpec.summarizeSettings.numFrags);
        processField(self, &docParams, &keySpec);
      }
      key = key->next;
    }
  }

  // Release the per-document iovec cache
  for (size_t ii = 0; ii < iovsCount; ++ii) {
    Array_Free(&docParams.iovsArr[ii]);
  }
  rm_free(docParams.iovsArr);
  return RS_RESULT_OK;
}
341 
/**
 * Free() callback of the highlighter. Only the processor object itself is
 * released; the field list and lookup it points to are not freed here.
 */
static void hlpFree(ResultProcessor *p) {
  HlpProcessor *self = (HlpProcessor *)p;
  rm_free(self);
}
345 
RPHighlighter_New(const RSSearchOptions * searchopts,const FieldList * fields,const RLookup * lookup)346 ResultProcessor *RPHighlighter_New(const RSSearchOptions *searchopts, const FieldList *fields,
347                                    const RLookup *lookup) {
348   HlpProcessor *hlp = rm_calloc(1, sizeof(*hlp));
349   if (searchopts->language == RS_LANG_CHINESE) {
350     hlp->fragmentizeOptions = FRAGMENTIZE_TOKLEN_EXACT;
351   }
352   hlp->base.Next = hlpNext;
353   hlp->base.Free = hlpFree;
354   hlp->fields = fields;
355   hlp->lookup = lookup;
356   return &hlp->base;
357 }
358