1 #include "index_result.h"
2 #include "varint.h"
3 #include "rmalloc.h"
4 #include <math.h>
5 #include <sys/param.h>
6 
7 /* Allocate a new aggregate result of a given type with a given capacity*/
__newAggregateResult(size_t cap,RSResultType t,double weight)8 RSIndexResult *__newAggregateResult(size_t cap, RSResultType t, double weight) {
9   RSIndexResult *res = rm_new(RSIndexResult);
10 
11   *res = (RSIndexResult){
12       .type = t,
13       .docId = 0,
14       .freq = 0,
15       .fieldMask = 0,
16       .isCopy = 0,
17       .weight = weight,
18       .agg = (RSAggregateResult){.numChildren = 0,
19                                  .childrenCap = cap,
20                                  .typeMask = 0x0000,
21                                  .children = rm_calloc(cap, sizeof(RSIndexResult *))}};
22   return res;
23 }
24 
25 /* Allocate a new intersection result with a given capacity*/
NewIntersectResult(size_t cap,double weight)26 RSIndexResult *NewIntersectResult(size_t cap, double weight) {
27   return __newAggregateResult(cap, RSResultType_Intersection, weight);
28 }
29 
30 /* Allocate a new union result with a given capacity*/
NewUnionResult(size_t cap,double weight)31 RSIndexResult *NewUnionResult(size_t cap, double weight) {
32   return __newAggregateResult(cap, RSResultType_Union, weight);
33 }
34 
35 /* Allocate a new token record result for a given term */
NewTokenRecord(RSQueryTerm * term,double weight)36 RSIndexResult *NewTokenRecord(RSQueryTerm *term, double weight) {
37   RSIndexResult *res = rm_new(RSIndexResult);
38 
39   *res = (RSIndexResult){.type = RSResultType_Term,
40                          .docId = 0,
41                          .fieldMask = 0,
42                          .isCopy = 0,
43                          .freq = 0,
44                          .weight = weight,
45                          .term = (RSTermRecord){
46                              .term = term,
47                              .offsets = (RSOffsetVector){},
48                          }};
49   return res;
50 }
51 
NewNumericResult()52 RSIndexResult *NewNumericResult() {
53   RSIndexResult *res = rm_new(RSIndexResult);
54 
55   *res = (RSIndexResult){.type = RSResultType_Numeric,
56                          .docId = 0,
57                          .isCopy = 0,
58                          .fieldMask = RS_FIELDMASK_ALL,
59                          .freq = 1,
60                          .weight = 1,
61 
62                          .num = (RSNumericRecord){.value = 0}};
63   return res;
64 }
65 
NewVirtualResult(double weight)66 RSIndexResult *NewVirtualResult(double weight) {
67   RSIndexResult *res = rm_new(RSIndexResult);
68 
69   *res = (RSIndexResult){
70       .type = RSResultType_Virtual,
71       .docId = 0,
72       .fieldMask = 0,
73       .freq = 0,
74       .weight = weight,
75 
76       .isCopy = 0,
77   };
78   return res;
79 }
80 
IndexResult_DeepCopy(const RSIndexResult * src)81 RSIndexResult *IndexResult_DeepCopy(const RSIndexResult *src) {
82   RSIndexResult *ret = rm_new(RSIndexResult);
83   *ret = *src;
84   ret->isCopy = 1;
85 
86   switch (src->type) {
87     // copy aggregate types
88     case RSResultType_Intersection:
89     case RSResultType_Union:
90       // allocate a new child pointer array
91       ret->agg.children = rm_malloc(src->agg.numChildren * sizeof(RSIndexResult *));
92       ret->agg.childrenCap = src->agg.numChildren;
93       // deep copy recursively all children
94       for (int i = 0; i < src->agg.numChildren; i++) {
95         ret->agg.children[i] = IndexResult_DeepCopy(src->agg.children[i]);
96       }
97       break;
98 
99     // copy term results
100     case RSResultType_Term:
101       // copy the offset vectors
102       if (src->term.offsets.data) {
103         ret->term.offsets.data = rm_malloc(ret->term.offsets.len);
104         memcpy(ret->term.offsets.data, src->term.offsets.data, ret->term.offsets.len);
105       }
106       break;
107 
108     // the rest have no dynamic stuff, we can just copy the base result
109     default:
110       break;
111   }
112   return ret;
113 }
114 
IndexResult_Print(RSIndexResult * r,int depth)115 void IndexResult_Print(RSIndexResult *r, int depth) {
116   for (int i = 0; i < depth; i++) printf("  ");
117 
118   if (r->type == RSResultType_Term) {
119     printf("Term{%llu: %s},\n", (unsigned long long)r->docId,
120            r->term.term ? r->term.term->str : "nil");
121     return;
122   }
123   if (r->type == RSResultType_Virtual) {
124     printf("Virtual{%llu},\n", (unsigned long long)r->docId);
125     return;
126   }
127   if (r->type == RSResultType_Numeric) {
128     printf("Numeric{%llu:%f},\n", (unsigned long long)r->docId, r->num.value);
129     return;
130   }
131   printf("%s => %llu{ \n", r->type == RSResultType_Intersection ? "Inter" : "Union",
132          (unsigned long long)r->docId);
133 
134   for (int i = 0; i < r->agg.numChildren; i++) {
135 
136     IndexResult_Print(r->agg.children[i], depth + 1);
137   }
138   for (int i = 0; i < depth; i++) printf("  ");
139 
140   printf("},\n");
141 
142   // printf("docId: %d, finalScore: %f, flags %x. Terms:\n", r->docId, r->finalScore, r->fieldMask);
143 
144   // for (int i = 0; i < r->numRecords; i++) {
145   //   printf("\t%s, %d tf %d, flags %x\n", r->records[i].term->str, r->records[i].docId,
146   //          r->records[i].freq, r->records[i].fieldMask);
147   // }
148   // printf("----------\n");
149 }
150 
NewQueryTerm(RSToken * tok,int id)151 RSQueryTerm *NewQueryTerm(RSToken *tok, int id) {
152   RSQueryTerm *ret = rm_malloc(sizeof(RSQueryTerm));
153   ret->idf = 1;
154   ret->str = tok->str ? rm_strndup(tok->str, tok->len) : NULL;
155   ret->len = tok->len;
156   ret->flags = tok->flags;
157   ret->id = id;
158   return ret;
159 }
160 
Term_Free(RSQueryTerm * t)161 void Term_Free(RSQueryTerm *t) {
162   if (t) {
163     if (t->str) rm_free(t->str);
164     rm_free(t);
165   }
166 }
167 
IndexResult_Init(RSIndexResult * h)168 void IndexResult_Init(RSIndexResult *h) {
169 
170   h->docId = 0;
171   h->fieldMask = 0;
172   h->freq = 0;
173 
174   if (h->type == RSResultType_Intersection || h->type == RSResultType_Union) {
175     h->agg.numChildren = 0;
176   }
177 }
178 
RSIndexResult_HasOffsets(const RSIndexResult * res)179 int RSIndexResult_HasOffsets(const RSIndexResult *res) {
180   switch (res->type) {
181     case RSResultType_Term:
182       return res->term.offsets.len > 0;
183     case RSResultType_Intersection:
184     case RSResultType_Union:
185       // the intersection and union aggregates can have offsets if they are not purely made of
186       // virtual results
187       return res->agg.typeMask != RSResultType_Virtual && res->agg.typeMask != RSResultType_Numeric;
188 
189     // a virtual result doesn't have offsets!
190     case RSResultType_Virtual:
191     case RSResultType_Numeric:
192     default:
193       return 0;
194   }
195 }
196 
IndexResult_Free(RSIndexResult * r)197 void IndexResult_Free(RSIndexResult *r) {
198   if (!r) return;
199   if (r->type == RSResultType_Intersection || r->type == RSResultType_Union) {
200     // for deep-copy results we also free the children
201     if (r->isCopy && r->agg.children) {
202       for (int i = 0; i < r->agg.numChildren; i++) {
203         IndexResult_Free(r->agg.children[i]);
204       }
205     }
206     rm_free(r->agg.children);
207     r->agg.children = NULL;
208   } else if (r->type == RSResultType_Term) {
209     if (r->isCopy) {
210       rm_free(r->term.offsets.data);
211 
212     } else {  // non copy result...
213 
214       // we only free up terms for non copy results
215       if (r->term.term != NULL) {
216         Term_Free(r->term.term);
217       }
218     }
219   }
220 
221   rm_free(r);
222 }
223 
RSIndexResult_IsAggregate(const RSIndexResult * r)224 inline int RSIndexResult_IsAggregate(const RSIndexResult *r) {
225   return (r->type & RS_RESULT_AGGREGATE) != 0;
226 }
227 #define __absdelta(x, y) (x > y ? x - y : y - x)
228 /**
229 Find the minimal distance between members of the vectos.
230 e.g. if V1 is {2,4,8} and V2 is {0,5,12}, the distance is 1 - abs(4-5)
231 @param vs a list of vector pointers
232 @param num the size of the list
233 */
IndexResult_MinOffsetDelta(const RSIndexResult * r)234 int IndexResult_MinOffsetDelta(const RSIndexResult *r) {
235   if (!RSIndexResult_IsAggregate(r) || r->agg.numChildren <= 1) {
236     return 1;
237   }
238 
239   const RSAggregateResult *agg = &r->agg;
240   int dist = 0;
241   int num = agg->numChildren;
242 
243   RSOffsetIterator v1, v2;
244   int i = 0;
245   while (i < num) {
246     // if either
247     while (i < num && !RSIndexResult_HasOffsets(agg->children[i])) {
248       i++;
249       continue;
250     }
251     if (i == num) break;
252     v1 = RSIndexResult_IterateOffsets(agg->children[i]);
253     i++;
254 
255     while (i < num && !RSIndexResult_HasOffsets(agg->children[i])) {
256       i++;
257       continue;
258     }
259     if (i == num) {
260       v1.Free(v1.ctx);
261       break;
262     }
263     v2 = RSIndexResult_IterateOffsets(agg->children[i]);
264 
265     uint32_t p1 = v1.Next(v1.ctx, NULL);
266     uint32_t p2 = v2.Next(v2.ctx, NULL);
267     int cd = __absdelta(p2, p1);
268     while (cd > 1 && p1 != RS_OFFSETVECTOR_EOF && p2 != RS_OFFSETVECTOR_EOF) {
269       cd = MIN(__absdelta(p2, p1), cd);
270       if (p2 > p1) {
271         p1 = v1.Next(v1.ctx, NULL);
272       } else {
273         p2 = v2.Next(v2.ctx, NULL);
274       }
275     }
276 
277     v1.Free(v1.ctx);
278     v2.Free(v2.ctx);
279 
280     dist += cd * cd;
281   }
282 
283   // we return 1 if ditance could not be calculate, to avoid division by zero
284   return dist ? sqrt(dist) : agg->numChildren - 1;
285 }
286 
result_GetMatchedTerms(RSIndexResult * r,RSQueryTerm * arr[],size_t cap,size_t * len)287 void result_GetMatchedTerms(RSIndexResult *r, RSQueryTerm *arr[], size_t cap, size_t *len) {
288   if (*len == cap) return;
289 
290   switch (r->type) {
291     case RSResultType_Intersection:
292     case RSResultType_Union:
293 
294       for (int i = 0; i < r->agg.numChildren; i++) {
295         result_GetMatchedTerms(r->agg.children[i], arr, cap, len);
296       }
297       break;
298     case RSResultType_Term:
299       if (r->term.term) {
300         const char *s = r->term.term->str;
301         // make sure we have a term string and it's not an expansion
302         if (s) {
303           arr[(*len)++] = r->term.term;
304         }
305 
306         // fprintf(stderr, "Term! %zd\n", *len);
307       }
308     default:
309       return;
310   }
311 }
312 
IndexResult_GetMatchedTerms(RSIndexResult * r,RSQueryTerm ** arr,size_t cap)313 size_t IndexResult_GetMatchedTerms(RSIndexResult *r, RSQueryTerm **arr, size_t cap) {
314   size_t arrlen = 0;
315   result_GetMatchedTerms(r, arr, cap, &arrlen);
316   return arrlen;
317 }
318 
__indexResult_withinRangeInOrder(RSOffsetIterator * iters,uint32_t * positions,int num,int maxSlop)319 int __indexResult_withinRangeInOrder(RSOffsetIterator *iters, uint32_t *positions, int num,
320                                      int maxSlop) {
321   while (1) {
322 
323     // we start from the beginning, and a span of 0
324     int span = 0;
325     for (int i = 0; i < num; i++) {
326       // take the current position and the position of the previous iterator.
327       // For the first iterator we always advance once
328       uint32_t pos = i ? positions[i] : iters[i].Next(iters[i].ctx, NULL);
329       uint32_t lastPos = i ? positions[i - 1] : 0;
330       // printf("Before: i=%d, pos=%d, lastPos %d\n", i, pos, lastPos);
331 
332       // read while we are not in order
333       while (pos != RS_OFFSETVECTOR_EOF && pos < lastPos) {
334         pos = iters[i].Next(iters[i].ctx, NULL);
335         // printf("Reading: i=%d, pos=%d, lastPos %d\n", i, pos, lastPos);
336       }
337       // printf("i=%d, pos=%d, lastPos %d\n", i, pos, lastPos);
338 
339       // we've read through the entire list and it's not in order relative to the last pos
340       if (pos == RS_OFFSETVECTOR_EOF) {
341         return 0;
342       }
343       positions[i] = pos;
344 
345       // add the diff from the last pos to the total span
346       if (i > 0) {
347         span += ((int)pos - (int)lastPos - 1);
348         // if we are already out of slop - just quit
349         if (span > maxSlop) {
350           break;
351         }
352       }
353     }
354 
355     if (span <= maxSlop) {
356       return 1;
357     }
358   }
359 
360   return 0;
361 }
362 
_arrayMin(uint32_t * arr,int len,uint32_t * pos)363 static inline uint32_t _arrayMin(uint32_t *arr, int len, uint32_t *pos) {
364   int m = arr[0];
365   *pos = 0;
366   for (int i = 1; i < len; i++) {
367     if (arr[i] < m) {
368       m = arr[i];
369       *pos = i;
370     }
371   }
372   return m;
373 }
374 
_arrayMax(uint32_t * arr,int len,uint32_t * pos)375 static inline uint32_t _arrayMax(uint32_t *arr, int len, uint32_t *pos) {
376   int m = arr[0];
377   *pos = 0;
378   for (int i = 1; i < len; i++) {
379     if (arr[i] >= m) {
380       m = arr[i];
381       *pos = i;
382     }
383   }
384   return m;
385 }
386 
387 /* Check the index result for maximal slop, in an unordered fashion.
388  * The algorithm is simple - we find the first offsets min and max such that max-min<=maxSlop */
__indexResult_withinRangeUnordered(RSOffsetIterator * iters,uint32_t * positions,int num,int maxSlop)389 int __indexResult_withinRangeUnordered(RSOffsetIterator *iters, uint32_t *positions, int num,
390                                        int maxSlop) {
391   for (int i = 0; i < num; i++) {
392     positions[i] = iters[i].Next(iters[i].ctx, NULL);
393   }
394   uint32_t minPos, maxPos, min, max;
395   // find the max member
396   max = _arrayMax(positions, num, &maxPos);
397 
398   while (1) {
399 
400     // we start from the beginning, and a span of 0
401     min = _arrayMin(positions, num, &minPos);
402     if (min != max) {
403       // calculate max - min
404       int span = (int)max - (int)min - (num - 1);
405       // printf("maxslop %d min %d, max %d, minPos %d, maxPos %d, span %d\n", maxSlop, min, max,
406       //        minPos, maxPos, span);
407       // if it matches the condition - just return success
408       if (span <= maxSlop) {
409         return 1;
410       }
411     }
412 
413     // if we are not meeting the conditions - advance the minimal iterator
414     positions[minPos] = iters[minPos].Next(iters[minPos].ctx, NULL);
415     // If the minimal iterator is larger than the max iterator, the minimal iterator is the new
416     // maximal iterator.
417     if (positions[minPos] != RS_OFFSETVECTOR_EOF && positions[minPos] > max) {
418       maxPos = minPos;
419       max = positions[maxPos];
420 
421     } else if (positions[minPos] == RS_OFFSETVECTOR_EOF) {
422       // this means we've reached the end
423       break;
424     }
425   }
426 
427   return 0;
428 }
429 
430 /** Test the result offset vectors to see if they fall within a max "slop" or distance between the
431  * terms. That is the total number of non matched offsets between the terms is no bigger than
432  * maxSlop.
433  * e.g. for an exact match, the slop allowed is 0.
434  */
IndexResult_IsWithinRange(RSIndexResult * ir,int maxSlop,int inOrder)435 int IndexResult_IsWithinRange(RSIndexResult *ir, int maxSlop, int inOrder) {
436 
437   // check if calculation is even relevant here...
438   if ((ir->type & (RSResultType_Term | RSResultType_Virtual | RSResultType_Numeric)) ||
439       ir->agg.numChildren <= 1) {
440     return 1;
441   }
442   RSAggregateResult *r = &ir->agg;
443   int num = r->numChildren;
444 
445   // Fill a list of iterators and the last read positions
446   RSOffsetIterator iters[num];
447   uint32_t positions[num];
448   int n = 0;
449   for (int i = 0; i < num; i++) {
450     // collect only iterators for nodes that can have offsets
451     if (RSIndexResult_HasOffsets(r->children[i])) {
452       iters[n] = RSIndexResult_IterateOffsets(r->children[i]);
453       positions[n] = 0;
454       n++;
455     }
456   }
457 
458   // No applicable offset children - just return 1
459   if (n == 0) {
460     return 1;
461   }
462 
463   int rc;
464   // cal the relevant algorithm based on ordered/unordered condition
465   if (inOrder)
466     rc = __indexResult_withinRangeInOrder(iters, positions, n, maxSlop);
467   else
468     rc = __indexResult_withinRangeUnordered(iters, positions, n, maxSlop);
469   // printf("slop result for %d: %d\n", ir->docId, rc);
470   for (int i = 0; i < n; i++) {
471     iters[i].Free(iters[i].ctx);
472   }
473   return rc;
474 }
475