1 #include "index_result.h"
2 #include "varint.h"
3 #include "rmalloc.h"
4 #include <math.h>
5 #include <sys/param.h>
6
7 /* Allocate a new aggregate result of a given type with a given capacity*/
__newAggregateResult(size_t cap,RSResultType t,double weight)8 RSIndexResult *__newAggregateResult(size_t cap, RSResultType t, double weight) {
9 RSIndexResult *res = rm_new(RSIndexResult);
10
11 *res = (RSIndexResult){
12 .type = t,
13 .docId = 0,
14 .freq = 0,
15 .fieldMask = 0,
16 .isCopy = 0,
17 .weight = weight,
18 .agg = (RSAggregateResult){.numChildren = 0,
19 .childrenCap = cap,
20 .typeMask = 0x0000,
21 .children = rm_calloc(cap, sizeof(RSIndexResult *))}};
22 return res;
23 }
24
25 /* Allocate a new intersection result with a given capacity*/
NewIntersectResult(size_t cap,double weight)26 RSIndexResult *NewIntersectResult(size_t cap, double weight) {
27 return __newAggregateResult(cap, RSResultType_Intersection, weight);
28 }
29
30 /* Allocate a new union result with a given capacity*/
NewUnionResult(size_t cap,double weight)31 RSIndexResult *NewUnionResult(size_t cap, double weight) {
32 return __newAggregateResult(cap, RSResultType_Union, weight);
33 }
34
35 /* Allocate a new token record result for a given term */
NewTokenRecord(RSQueryTerm * term,double weight)36 RSIndexResult *NewTokenRecord(RSQueryTerm *term, double weight) {
37 RSIndexResult *res = rm_new(RSIndexResult);
38
39 *res = (RSIndexResult){.type = RSResultType_Term,
40 .docId = 0,
41 .fieldMask = 0,
42 .isCopy = 0,
43 .freq = 0,
44 .weight = weight,
45 .term = (RSTermRecord){
46 .term = term,
47 .offsets = (RSOffsetVector){},
48 }};
49 return res;
50 }
51
NewNumericResult()52 RSIndexResult *NewNumericResult() {
53 RSIndexResult *res = rm_new(RSIndexResult);
54
55 *res = (RSIndexResult){.type = RSResultType_Numeric,
56 .docId = 0,
57 .isCopy = 0,
58 .fieldMask = RS_FIELDMASK_ALL,
59 .freq = 1,
60 .weight = 1,
61
62 .num = (RSNumericRecord){.value = 0}};
63 return res;
64 }
65
NewVirtualResult(double weight)66 RSIndexResult *NewVirtualResult(double weight) {
67 RSIndexResult *res = rm_new(RSIndexResult);
68
69 *res = (RSIndexResult){
70 .type = RSResultType_Virtual,
71 .docId = 0,
72 .fieldMask = 0,
73 .freq = 0,
74 .weight = weight,
75
76 .isCopy = 0,
77 };
78 return res;
79 }
80
IndexResult_DeepCopy(const RSIndexResult * src)81 RSIndexResult *IndexResult_DeepCopy(const RSIndexResult *src) {
82 RSIndexResult *ret = rm_new(RSIndexResult);
83 *ret = *src;
84 ret->isCopy = 1;
85
86 switch (src->type) {
87 // copy aggregate types
88 case RSResultType_Intersection:
89 case RSResultType_Union:
90 // allocate a new child pointer array
91 ret->agg.children = rm_malloc(src->agg.numChildren * sizeof(RSIndexResult *));
92 ret->agg.childrenCap = src->agg.numChildren;
93 // deep copy recursively all children
94 for (int i = 0; i < src->agg.numChildren; i++) {
95 ret->agg.children[i] = IndexResult_DeepCopy(src->agg.children[i]);
96 }
97 break;
98
99 // copy term results
100 case RSResultType_Term:
101 // copy the offset vectors
102 if (src->term.offsets.data) {
103 ret->term.offsets.data = rm_malloc(ret->term.offsets.len);
104 memcpy(ret->term.offsets.data, src->term.offsets.data, ret->term.offsets.len);
105 }
106 break;
107
108 // the rest have no dynamic stuff, we can just copy the base result
109 default:
110 break;
111 }
112 return ret;
113 }
114
IndexResult_Print(RSIndexResult * r,int depth)115 void IndexResult_Print(RSIndexResult *r, int depth) {
116 for (int i = 0; i < depth; i++) printf(" ");
117
118 if (r->type == RSResultType_Term) {
119 printf("Term{%llu: %s},\n", (unsigned long long)r->docId,
120 r->term.term ? r->term.term->str : "nil");
121 return;
122 }
123 if (r->type == RSResultType_Virtual) {
124 printf("Virtual{%llu},\n", (unsigned long long)r->docId);
125 return;
126 }
127 if (r->type == RSResultType_Numeric) {
128 printf("Numeric{%llu:%f},\n", (unsigned long long)r->docId, r->num.value);
129 return;
130 }
131 printf("%s => %llu{ \n", r->type == RSResultType_Intersection ? "Inter" : "Union",
132 (unsigned long long)r->docId);
133
134 for (int i = 0; i < r->agg.numChildren; i++) {
135
136 IndexResult_Print(r->agg.children[i], depth + 1);
137 }
138 for (int i = 0; i < depth; i++) printf(" ");
139
140 printf("},\n");
141
142 // printf("docId: %d, finalScore: %f, flags %x. Terms:\n", r->docId, r->finalScore, r->fieldMask);
143
144 // for (int i = 0; i < r->numRecords; i++) {
145 // printf("\t%s, %d tf %d, flags %x\n", r->records[i].term->str, r->records[i].docId,
146 // r->records[i].freq, r->records[i].fieldMask);
147 // }
148 // printf("----------\n");
149 }
150
NewQueryTerm(RSToken * tok,int id)151 RSQueryTerm *NewQueryTerm(RSToken *tok, int id) {
152 RSQueryTerm *ret = rm_malloc(sizeof(RSQueryTerm));
153 ret->idf = 1;
154 ret->str = tok->str ? rm_strndup(tok->str, tok->len) : NULL;
155 ret->len = tok->len;
156 ret->flags = tok->flags;
157 ret->id = id;
158 return ret;
159 }
160
Term_Free(RSQueryTerm * t)161 void Term_Free(RSQueryTerm *t) {
162 if (t) {
163 if (t->str) rm_free(t->str);
164 rm_free(t);
165 }
166 }
167
IndexResult_Init(RSIndexResult * h)168 void IndexResult_Init(RSIndexResult *h) {
169
170 h->docId = 0;
171 h->fieldMask = 0;
172 h->freq = 0;
173
174 if (h->type == RSResultType_Intersection || h->type == RSResultType_Union) {
175 h->agg.numChildren = 0;
176 }
177 }
178
RSIndexResult_HasOffsets(const RSIndexResult * res)179 int RSIndexResult_HasOffsets(const RSIndexResult *res) {
180 switch (res->type) {
181 case RSResultType_Term:
182 return res->term.offsets.len > 0;
183 case RSResultType_Intersection:
184 case RSResultType_Union:
185 // the intersection and union aggregates can have offsets if they are not purely made of
186 // virtual results
187 return res->agg.typeMask != RSResultType_Virtual && res->agg.typeMask != RSResultType_Numeric;
188
189 // a virtual result doesn't have offsets!
190 case RSResultType_Virtual:
191 case RSResultType_Numeric:
192 default:
193 return 0;
194 }
195 }
196
IndexResult_Free(RSIndexResult * r)197 void IndexResult_Free(RSIndexResult *r) {
198 if (!r) return;
199 if (r->type == RSResultType_Intersection || r->type == RSResultType_Union) {
200 // for deep-copy results we also free the children
201 if (r->isCopy && r->agg.children) {
202 for (int i = 0; i < r->agg.numChildren; i++) {
203 IndexResult_Free(r->agg.children[i]);
204 }
205 }
206 rm_free(r->agg.children);
207 r->agg.children = NULL;
208 } else if (r->type == RSResultType_Term) {
209 if (r->isCopy) {
210 rm_free(r->term.offsets.data);
211
212 } else { // non copy result...
213
214 // we only free up terms for non copy results
215 if (r->term.term != NULL) {
216 Term_Free(r->term.term);
217 }
218 }
219 }
220
221 rm_free(r);
222 }
223
RSIndexResult_IsAggregate(const RSIndexResult * r)224 inline int RSIndexResult_IsAggregate(const RSIndexResult *r) {
225 return (r->type & RS_RESULT_AGGREGATE) != 0;
226 }
227 #define __absdelta(x, y) (x > y ? x - y : y - x)
228 /**
229 Find the minimal distance between members of the vectos.
230 e.g. if V1 is {2,4,8} and V2 is {0,5,12}, the distance is 1 - abs(4-5)
231 @param vs a list of vector pointers
232 @param num the size of the list
233 */
IndexResult_MinOffsetDelta(const RSIndexResult * r)234 int IndexResult_MinOffsetDelta(const RSIndexResult *r) {
235 if (!RSIndexResult_IsAggregate(r) || r->agg.numChildren <= 1) {
236 return 1;
237 }
238
239 const RSAggregateResult *agg = &r->agg;
240 int dist = 0;
241 int num = agg->numChildren;
242
243 RSOffsetIterator v1, v2;
244 int i = 0;
245 while (i < num) {
246 // if either
247 while (i < num && !RSIndexResult_HasOffsets(agg->children[i])) {
248 i++;
249 continue;
250 }
251 if (i == num) break;
252 v1 = RSIndexResult_IterateOffsets(agg->children[i]);
253 i++;
254
255 while (i < num && !RSIndexResult_HasOffsets(agg->children[i])) {
256 i++;
257 continue;
258 }
259 if (i == num) {
260 v1.Free(v1.ctx);
261 break;
262 }
263 v2 = RSIndexResult_IterateOffsets(agg->children[i]);
264
265 uint32_t p1 = v1.Next(v1.ctx, NULL);
266 uint32_t p2 = v2.Next(v2.ctx, NULL);
267 int cd = __absdelta(p2, p1);
268 while (cd > 1 && p1 != RS_OFFSETVECTOR_EOF && p2 != RS_OFFSETVECTOR_EOF) {
269 cd = MIN(__absdelta(p2, p1), cd);
270 if (p2 > p1) {
271 p1 = v1.Next(v1.ctx, NULL);
272 } else {
273 p2 = v2.Next(v2.ctx, NULL);
274 }
275 }
276
277 v1.Free(v1.ctx);
278 v2.Free(v2.ctx);
279
280 dist += cd * cd;
281 }
282
283 // we return 1 if ditance could not be calculate, to avoid division by zero
284 return dist ? sqrt(dist) : agg->numChildren - 1;
285 }
286
result_GetMatchedTerms(RSIndexResult * r,RSQueryTerm * arr[],size_t cap,size_t * len)287 void result_GetMatchedTerms(RSIndexResult *r, RSQueryTerm *arr[], size_t cap, size_t *len) {
288 if (*len == cap) return;
289
290 switch (r->type) {
291 case RSResultType_Intersection:
292 case RSResultType_Union:
293
294 for (int i = 0; i < r->agg.numChildren; i++) {
295 result_GetMatchedTerms(r->agg.children[i], arr, cap, len);
296 }
297 break;
298 case RSResultType_Term:
299 if (r->term.term) {
300 const char *s = r->term.term->str;
301 // make sure we have a term string and it's not an expansion
302 if (s) {
303 arr[(*len)++] = r->term.term;
304 }
305
306 // fprintf(stderr, "Term! %zd\n", *len);
307 }
308 default:
309 return;
310 }
311 }
312
IndexResult_GetMatchedTerms(RSIndexResult * r,RSQueryTerm ** arr,size_t cap)313 size_t IndexResult_GetMatchedTerms(RSIndexResult *r, RSQueryTerm **arr, size_t cap) {
314 size_t arrlen = 0;
315 result_GetMatchedTerms(r, arr, cap, &arrlen);
316 return arrlen;
317 }
318
__indexResult_withinRangeInOrder(RSOffsetIterator * iters,uint32_t * positions,int num,int maxSlop)319 int __indexResult_withinRangeInOrder(RSOffsetIterator *iters, uint32_t *positions, int num,
320 int maxSlop) {
321 while (1) {
322
323 // we start from the beginning, and a span of 0
324 int span = 0;
325 for (int i = 0; i < num; i++) {
326 // take the current position and the position of the previous iterator.
327 // For the first iterator we always advance once
328 uint32_t pos = i ? positions[i] : iters[i].Next(iters[i].ctx, NULL);
329 uint32_t lastPos = i ? positions[i - 1] : 0;
330 // printf("Before: i=%d, pos=%d, lastPos %d\n", i, pos, lastPos);
331
332 // read while we are not in order
333 while (pos != RS_OFFSETVECTOR_EOF && pos < lastPos) {
334 pos = iters[i].Next(iters[i].ctx, NULL);
335 // printf("Reading: i=%d, pos=%d, lastPos %d\n", i, pos, lastPos);
336 }
337 // printf("i=%d, pos=%d, lastPos %d\n", i, pos, lastPos);
338
339 // we've read through the entire list and it's not in order relative to the last pos
340 if (pos == RS_OFFSETVECTOR_EOF) {
341 return 0;
342 }
343 positions[i] = pos;
344
345 // add the diff from the last pos to the total span
346 if (i > 0) {
347 span += ((int)pos - (int)lastPos - 1);
348 // if we are already out of slop - just quit
349 if (span > maxSlop) {
350 break;
351 }
352 }
353 }
354
355 if (span <= maxSlop) {
356 return 1;
357 }
358 }
359
360 return 0;
361 }
362
_arrayMin(uint32_t * arr,int len,uint32_t * pos)363 static inline uint32_t _arrayMin(uint32_t *arr, int len, uint32_t *pos) {
364 int m = arr[0];
365 *pos = 0;
366 for (int i = 1; i < len; i++) {
367 if (arr[i] < m) {
368 m = arr[i];
369 *pos = i;
370 }
371 }
372 return m;
373 }
374
_arrayMax(uint32_t * arr,int len,uint32_t * pos)375 static inline uint32_t _arrayMax(uint32_t *arr, int len, uint32_t *pos) {
376 int m = arr[0];
377 *pos = 0;
378 for (int i = 1; i < len; i++) {
379 if (arr[i] >= m) {
380 m = arr[i];
381 *pos = i;
382 }
383 }
384 return m;
385 }
386
387 /* Check the index result for maximal slop, in an unordered fashion.
388 * The algorithm is simple - we find the first offsets min and max such that max-min<=maxSlop */
__indexResult_withinRangeUnordered(RSOffsetIterator * iters,uint32_t * positions,int num,int maxSlop)389 int __indexResult_withinRangeUnordered(RSOffsetIterator *iters, uint32_t *positions, int num,
390 int maxSlop) {
391 for (int i = 0; i < num; i++) {
392 positions[i] = iters[i].Next(iters[i].ctx, NULL);
393 }
394 uint32_t minPos, maxPos, min, max;
395 // find the max member
396 max = _arrayMax(positions, num, &maxPos);
397
398 while (1) {
399
400 // we start from the beginning, and a span of 0
401 min = _arrayMin(positions, num, &minPos);
402 if (min != max) {
403 // calculate max - min
404 int span = (int)max - (int)min - (num - 1);
405 // printf("maxslop %d min %d, max %d, minPos %d, maxPos %d, span %d\n", maxSlop, min, max,
406 // minPos, maxPos, span);
407 // if it matches the condition - just return success
408 if (span <= maxSlop) {
409 return 1;
410 }
411 }
412
413 // if we are not meeting the conditions - advance the minimal iterator
414 positions[minPos] = iters[minPos].Next(iters[minPos].ctx, NULL);
415 // If the minimal iterator is larger than the max iterator, the minimal iterator is the new
416 // maximal iterator.
417 if (positions[minPos] != RS_OFFSETVECTOR_EOF && positions[minPos] > max) {
418 maxPos = minPos;
419 max = positions[maxPos];
420
421 } else if (positions[minPos] == RS_OFFSETVECTOR_EOF) {
422 // this means we've reached the end
423 break;
424 }
425 }
426
427 return 0;
428 }
429
430 /** Test the result offset vectors to see if they fall within a max "slop" or distance between the
431 * terms. That is the total number of non matched offsets between the terms is no bigger than
432 * maxSlop.
433 * e.g. for an exact match, the slop allowed is 0.
434 */
IndexResult_IsWithinRange(RSIndexResult * ir,int maxSlop,int inOrder)435 int IndexResult_IsWithinRange(RSIndexResult *ir, int maxSlop, int inOrder) {
436
437 // check if calculation is even relevant here...
438 if ((ir->type & (RSResultType_Term | RSResultType_Virtual | RSResultType_Numeric)) ||
439 ir->agg.numChildren <= 1) {
440 return 1;
441 }
442 RSAggregateResult *r = &ir->agg;
443 int num = r->numChildren;
444
445 // Fill a list of iterators and the last read positions
446 RSOffsetIterator iters[num];
447 uint32_t positions[num];
448 int n = 0;
449 for (int i = 0; i < num; i++) {
450 // collect only iterators for nodes that can have offsets
451 if (RSIndexResult_HasOffsets(r->children[i])) {
452 iters[n] = RSIndexResult_IterateOffsets(r->children[i]);
453 positions[n] = 0;
454 n++;
455 }
456 }
457
458 // No applicable offset children - just return 1
459 if (n == 0) {
460 return 1;
461 }
462
463 int rc;
464 // cal the relevant algorithm based on ordered/unordered condition
465 if (inOrder)
466 rc = __indexResult_withinRangeInOrder(iters, positions, n, maxSlop);
467 else
468 rc = __indexResult_withinRangeUnordered(iters, positions, n, maxSlop);
469 // printf("slop result for %d: %d\n", ir->docId, rc);
470 for (int i = 0; i < n; i++) {
471 iters[i].Free(iters[i].ctx);
472 }
473 return rc;
474 }
475