1 #include "result_processor.h"
2 #include "fragmenter.h"
3 #include "value.h"
4 #include "util/minmax.h"
5 #include "toksep.h"
6 #include <ctype.h>
7
8 typedef struct {
9 ResultProcessor base;
10 int fragmentizeOptions;
11 const FieldList *fields;
12 const RLookup *lookup;
13 } HlpProcessor;
14
15 /**
16 * Common parameters passed around for highlighting one or more fields within
17 * a document. This structure exists to avoid passing these four parameters
18 * discreetly (as we did in previous versiosn)
19 */
20 typedef struct {
21 // Byte offsets, byte-wise
22 const RSByteOffsets *byteOffsets;
23
24 // Index result, which contains the term offsets (word-wise)
25 const RSIndexResult *indexResult;
26
27 // Array used for in/out when writing fields. Optimization cache
28 Array *iovsArr;
29
30 RLookupRow *row;
31
32 } hlpDocContext;
33
34 /**
35 * Attempts to fragmentize a single field from its offset entries. This takes
36 * the field name, gets the matching field ID, retrieves the offset iterator
37 * for the field ID, and fragments the text based on the offsets. The fragmenter
38 * itself is in fragmenter.{c,h}
39 *
40 * Returns true if the fragmentation succeeded, false otherwise.
41 */
fragmentizeOffsets(IndexSpec * spec,const char * fieldName,const char * fieldText,size_t fieldLen,const RSIndexResult * indexResult,const RSByteOffsets * byteOffsets,FragmentList * fragList,int options)42 static int fragmentizeOffsets(IndexSpec *spec, const char *fieldName, const char *fieldText,
43 size_t fieldLen, const RSIndexResult *indexResult,
44 const RSByteOffsets *byteOffsets, FragmentList *fragList,
45 int options) {
46 const FieldSpec *fs = IndexSpec_GetField(spec, fieldName, strlen(fieldName));
47 if (!fs || !FIELD_IS(fs, INDEXFLD_T_FULLTEXT)) {
48 return 0;
49 }
50
51 int rc = 0;
52 RSOffsetIterator offsIter = RSIndexResult_IterateOffsets(indexResult);
53 FragmentTermIterator fragIter = {NULL};
54 RSByteOffsetIterator bytesIter;
55 if (RSByteOffset_Iterate(byteOffsets, fs->ftId, &bytesIter) != REDISMODULE_OK) {
56 goto done;
57 }
58
59 FragmentTermIterator_InitOffsets(&fragIter, &bytesIter, &offsIter);
60 FragmentList_FragmentizeIter(fragList, fieldText, fieldLen, &fragIter, options);
61 if (fragList->numFrags == 0) {
62 goto done;
63 }
64 rc = 1;
65
66 done:
67 offsIter.Free(offsIter.ctx);
68 return rc;
69 }
70
// Collapses whitespace in a buffer, in place: every run of one or more
// whitespace characters (as classified by isspace()) becomes a single ' '.
//
// s - buffer to rewrite; it is not required to be NUL-terminated and no
//     terminator is written
// n - number of bytes of `s` to process
//
// Returns the new length of the text, which is always <= n.
static size_t stripDuplicateSpaces(char *s, size_t n) {
  int isLastSpace = 0;
  size_t oix = 0;
  for (size_t ii = 0; ii < n; ++ii) {
    // <ctype.h> classifiers need a value representable as unsigned char;
    // passing a negative plain char (e.g. a UTF-8 byte) is undefined.
    if (isspace((unsigned char)s[ii])) {
      if (!isLastSpace) {
        isLastSpace = 1;
        s[oix++] = ' ';
      }
    } else {
      isLastSpace = 0;
      s[oix++] = s[ii];
    }
  }
  return oix;
}
92
/**
 * Returns the length of the buffer without trailing whitespace.
 *
 * @param s      buffer to inspect; it is not modified
 * @param input  number of bytes of `s` to consider
 * @return the largest length <= input whose last byte is not whitespace, or 0
 *         when the buffer is empty or consists only of whitespace
 */
static size_t trimTrailingSpaces(const char *s, size_t input) {
  // The cast keeps isspace() defined for bytes with the high bit set.
  while (input > 0 && isspace((unsigned char)s[input - 1])) {
    --input;
  }
  return input;
}
102
/**
 * Computes the effective settings for one field by combining the field's own
 * settings with the global defaults.
 *
 * @param srcField  per-field settings, or NULL when only the defaults apply
 * @param defaults  global default settings
 * @param out       receives the combined settings. When srcField is non-NULL
 *                  the mode is OR-ed into `out` and the highlight settings may
 *                  be left untouched, so `out` is expected to be
 *                  zero-initialized by the caller (as hlpNext() does).
 */
static void normalizeSettings(const ReturnedField *srcField, const ReturnedField *defaults,
                              ReturnedField *out) {
  if (srcField == NULL) {
    // Global setting
    *out = *defaults;
    return;
  }

  // Highlight settings: the defaults apply only when the field did not request
  // highlighting itself; otherwise the field's own settings win. The second
  // test is a bitwise flag check (a logical && would be true for any mode).
  if ((defaults->mode & SummarizeMode_Highlight) &&
      (srcField->mode & SummarizeMode_Highlight) == 0) {
    out->highlightSettings = defaults->highlightSettings;
  } else if (srcField->mode & SummarizeMode_Highlight) {
    out->highlightSettings = srcField->highlightSettings;
  }

  // Summarize settings: same precedence, but the field's settings are copied
  // whenever the defaults do not apply.
  if ((defaults->mode & SummarizeMode_Synopsis) && (srcField->mode & SummarizeMode_Synopsis) == 0) {
    out->summarizeSettings = defaults->summarizeSettings;
  } else {
    out->summarizeSettings = srcField->summarizeSettings;
  }

  out->mode |= defaults->mode | srcField->mode;
  out->name = srcField->name;
  out->lookupKey = srcField->lookupKey;
}
129
130 // Called when we cannot fragmentize based on byte offsets.
131 // docLen is an in/out parameter. On input it should contain the length of the
132 // field, and on output it contains the length of the trimmed summary.
133 // Returns a string which should be freed using free()
trimField(const ReturnedField * fieldInfo,const char * docStr,size_t * docLen,size_t estWordSize)134 static char *trimField(const ReturnedField *fieldInfo, const char *docStr, size_t *docLen,
135 size_t estWordSize) {
136
137 // Number of desired fragments times the number of context words in each fragments,
138 // in characters (estWordSize)
139 size_t headLen =
140 estWordSize * fieldInfo->summarizeSettings.contextLen * fieldInfo->summarizeSettings.numFrags;
141 headLen += estWordSize; // Because we trim off a word when finding the toksep
142 headLen = Min(headLen, *docLen);
143
144 Array bufTmp;
145 Array_InitEx(&bufTmp, ArrayAlloc_RM);
146
147 Array_Write(&bufTmp, docStr, headLen);
148 headLen = stripDuplicateSpaces(bufTmp.data, headLen);
149 Array_Resize(&bufTmp, headLen);
150
151 while (bufTmp.len > 1) {
152 if (istoksep(bufTmp.data[bufTmp.len - 1])) {
153 break;
154 }
155 bufTmp.len--;
156 }
157
158 bufTmp.len = trimTrailingSpaces(bufTmp.data, bufTmp.len);
159 char *ret = Array_Steal(&bufTmp, docLen);
160 return ret;
161 }
162
/**
 * Builds the summarized and/or highlighted replacement for one field value.
 *
 * @param spec           index spec used to resolve `fieldName`
 * @param fieldInfo      effective settings for the field (mode, tags,
 *                       summarize settings)
 * @param fieldName      name of the field
 * @param returnedField  current value of the field; read as a string
 * @param docParams      per-document context. `iovsArr` must hold at least
 *                       `fieldInfo->summarizeSettings.numFrags` arrays.
 * @param options        flags forwarded to the fragmenter
 *
 * @return a new string value, or NULL when the field should be left as is
 *         (the text could not be fragmentized and the mode is not exactly
 *         SummarizeMode_Synopsis).
 */
static RSValue *summarizeField(IndexSpec *spec, const ReturnedField *fieldInfo,
                               const char *fieldName, const RSValue *returnedField,
                               hlpDocContext *docParams, int options) {

  FragmentList frags;
  FragmentList_Init(&frags, 8, 6);

  // Start gathering the terms
  HighlightTags tags = {.openTag = fieldInfo->highlightSettings.openTag,
                        .closeTag = fieldInfo->highlightSettings.closeTag};

  // First actually generate the fragments
  size_t docLen;
  const char *docStr = RSValue_StringPtrLen(returnedField, &docLen);
  if (docParams->byteOffsets == NULL ||
      !fragmentizeOffsets(spec, fieldName, docStr, docLen, docParams->indexResult,
                          docParams->byteOffsets, &frags, options)) {
    RSValue *fallback = NULL;
    if (fieldInfo->mode == SummarizeMode_Synopsis) {
      // If summarizing is requested then trim the field so that the user isn't
      // spammed with a large blob of text
      char *summarized = trimField(fieldInfo, docStr, &docLen, frags.estAvgWordSize);
      fallback = RS_StringVal(summarized, docLen);
    }
    // Otherwise the whole field is kept, without highlighting (NULL result).
    // The fragment list is released on both fallback outcomes.
    FragmentList_Free(&frags);
    return fallback;
  }

  // Highlight only
  if (fieldInfo->mode == SummarizeMode_Highlight) {
    // No need to return snippets; just return the entire doc with relevant tags
    // highlighted.
    char *hlDoc = FragmentList_HighlightWholeDocS(&frags, &tags);
    FragmentList_Free(&frags);
    return RS_StringValC(hlDoc);
  }

  size_t numIovArr = Min(fieldInfo->summarizeSettings.numFrags, FragmentList_GetNumFrags(&frags));
  for (size_t ii = 0; ii < numIovArr; ++ii) {
    Array_Resize(&docParams->iovsArr[ii], 0);
  }

  FragmentList_HighlightFragments(&frags, &tags, fieldInfo->summarizeSettings.contextLen,
                                  docParams->iovsArr, numIovArr, HIGHLIGHT_ORDER_SCOREPOS);

  // Buffer to store concatenated fragments
  Array bufTmp;
  Array_InitEx(&bufTmp, ArrayAlloc_RM);

  for (size_t ii = 0; ii < numIovArr; ++ii) {
    Array *curIovs = docParams->iovsArr + ii;
    struct iovec *iovs = ARRAY_GETARRAY_AS(curIovs, struct iovec *);
    size_t numIovs = ARRAY_GETSIZE_AS(curIovs, struct iovec);
    size_t lastSize = bufTmp.len;

    for (size_t jj = 0; jj < numIovs; ++jj) {
      Array_Write(&bufTmp, iovs[jj].iov_base, iovs[jj].iov_len);
    }

    // Duplicate spaces for the current snippet are eliminated here. We shouldn't
    // move it to the end because the delimiter itself may contain a special kind
    // of whitespace.
    size_t newSize = stripDuplicateSpaces(bufTmp.data + lastSize, bufTmp.len - lastSize);
    Array_Resize(&bufTmp, lastSize + newSize);
    // The separator follows every snippet, including the last one.
    Array_Write(&bufTmp, fieldInfo->summarizeSettings.separator,
                strlen(fieldInfo->summarizeSettings.separator));
  }

  // Hand the concatenated buffer over to the string value instead of copying it.
  size_t hlLen;
  char *hlText = Array_Steal(&bufTmp, &hlLen);
  Array_Free(&bufTmp);
  FragmentList_Free(&frags);
  return RS_StringVal(hlText, hlLen);
}
240
/**
 * Prepares the scratch iovec arrays for the next field.
 *
 * Ensures that `*iovsArrp` holds at least `newSize` initialized arrays and
 * resets every already-initialized array to length 0.
 *
 * @param iovsArrp  in/out pointer to the array of Arrays; may be reallocated
 * @param curSize   in/out number of initialized entries in `*iovsArrp`. It
 *                  only ever grows: shrinking it would hide entries from the
 *                  owner's cleanup loop (leaking them) and cause them to be
 *                  initialized a second time on a later, larger request.
 * @param newSize   number of entries required for the next field
 */
static void resetIovsArr(Array **iovsArrp, size_t *curSize, size_t newSize) {
  const size_t initialized = *curSize;

  if (initialized < newSize) {
    *iovsArrp = rm_realloc(*iovsArrp, sizeof(**iovsArrp) * newSize);
  }

  // Entries that already exist are reused, emptied.
  for (size_t ii = 0; ii < initialized; ++ii) {
    Array_Resize((*iovsArrp) + ii, 0);
  }

  // Entries added by the reallocation are initialized for the first time.
  for (size_t ii = initialized; ii < newSize; ++ii) {
    Array_Init((*iovsArrp) + ii);
  }

  if (newSize > initialized) {
    *curSize = newSize;
  }
}
253
/**
 * Summarizes/highlights a single field of the current row.
 *
 * The field's value is looked up in the row through `spec->lookupKey`. Values
 * that are missing or are not strings are left alone. When summarizeField()
 * produces a replacement it is written back to the row under the same key.
 */
static void processField(HlpProcessor *hlpCtx, hlpDocContext *docParams, ReturnedField *spec) {
  const RSValue *current = RLookup_GetItem(spec->lookupKey, docParams->row);
  if (current == NULL || !RSValue_IsString(current)) {
    return;
  }

  RSValue *replacement = summarizeField(RP_SPEC(&hlpCtx->base), spec, spec->name, current,
                                        docParams, hlpCtx->fragmentizeOptions);
  if (replacement != NULL) {
    RLookup_WriteOwnKey(spec->lookupKey, docParams->row, replacement);
  }
}
267
getIndexResult(ResultProcessor * rp,t_docId docId)268 static const RSIndexResult *getIndexResult(ResultProcessor *rp, t_docId docId) {
269 IndexIterator *it = QITR_GetRootFilter(rp->parent);
270 RSIndexResult *ir = NULL;
271 if (!it) {
272 return NULL;
273 }
274 it->Rewind(it->ctx);
275 if (INDEXREAD_OK != it->SkipTo(it->ctx, docId, &ir)) {
276 return NULL;
277 }
278 return ir;
279 }
280
/**
 * `Next` callback of the highlighter: pulls one result from upstream and
 * summarizes/highlights its fields in place.
 *
 * Any upstream status other than RS_RESULT_OK is returned unchanged. If no
 * index result or no document metadata is available for the result, it is
 * passed through untouched with RS_RESULT_OK.
 */
static int hlpNext(ResultProcessor *rbase, SearchResult *r) {
  const int upstreamRc = rbase->upstream->Next(rbase->upstream, r);
  if (upstreamRc != RS_RESULT_OK) {
    return upstreamRc;
  }

  HlpProcessor *self = (HlpProcessor *)rbase;

  // Prefer the index result attached to the search result; otherwise fetch it
  // for this document from the root iterator.
  const RSIndexResult *ir = r->indexResult;
  if (ir == NULL) {
    ir = getIndexResult(rbase, r->docId);
  }

  // Without an index result there is nothing to highlight; pass the result on.
  if (ir == NULL) {
    return RS_RESULT_OK;
  }

  RSDocumentMetadata *dmd = r->dmd;
  if (dmd == NULL) {
    return RS_RESULT_OK;
  }

  const FieldList *fields = self->fields;
  size_t iovsCount = 0;
  hlpDocContext ctx = {.byteOffsets = dmd->byteOffsets,
                       .indexResult = ir,
                       .iovsArr = NULL,
                       .row = &r->rowdata};

  if (fields->numFields) {
    // Explicit field list
    for (size_t idx = 0; idx < fields->numFields; ++idx) {
      const ReturnedField *requested = &fields->fields[idx];
      // A field with no mode of its own and no default mode is a plain
      // `RETURN` field, not a `SUMMARIZE` one - nothing to do for it.
      if (requested->mode != SummarizeMode_None ||
          fields->defaultField.mode != SummarizeMode_None) {
        ReturnedField effective = {0};
        normalizeSettings(requested, &fields->defaultField, &effective);
        resetIovsArr(&ctx.iovsArr, &iovsCount, effective.summarizeSettings.numFrags);
        processField(self, &ctx, &effective);
      }
    }
  } else if (fields->defaultField.mode != SummarizeMode_None) {
    // No explicit fields: apply the defaults to every non-hidden lookup key
    const RLookupKey *key = self->lookup->head;
    while (key != NULL) {
      if (!(key->flags & RLOOKUP_F_HIDDEN)) {
        ReturnedField effective = {0};
        normalizeSettings(NULL, &fields->defaultField, &effective);
        effective.lookupKey = key;
        effective.name = key->name;
        resetIovsArr(&ctx.iovsArr, &iovsCount, effective.summarizeSettings.numFrags);
        processField(self, &ctx, &effective);
      }
      key = key->next;
    }
  }

  // Release the scratch arrays used while processing this document
  for (size_t idx = 0; idx < iovsCount; ++idx) {
    Array_Free(&ctx.iovsArr[idx]);
  }
  rm_free(ctx.iovsArr);
  return RS_RESULT_OK;
}
341
/**
 * `Free` callback of the highlighter. The processor is a single allocation
 * whose first member is the ResultProcessor, so releasing that pointer
 * releases the whole object. The field list and lookup are not freed here.
 */
static void hlpFree(ResultProcessor *p) {
  HlpProcessor *self = (HlpProcessor *)p;
  rm_free(self);
}
345
RPHighlighter_New(const RSSearchOptions * searchopts,const FieldList * fields,const RLookup * lookup)346 ResultProcessor *RPHighlighter_New(const RSSearchOptions *searchopts, const FieldList *fields,
347 const RLookup *lookup) {
348 HlpProcessor *hlp = rm_calloc(1, sizeof(*hlp));
349 if (searchopts->language == RS_LANG_CHINESE) {
350 hlp->fragmentizeOptions = FRAGMENTIZE_TOKLEN_EXACT;
351 }
352 hlp->base.Next = hlpNext;
353 hlp->base.Free = hlpFree;
354 hlp->fields = fields;
355 hlp->lookup = lookup;
356 return &hlp->base;
357 }
358