1 /*-------------------------------------------------------------------------
2 *
3 * tsrank.c
4 * rank tsvector by tsquery
5 *
6 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/utils/adt/tsrank.c
11 *
12 *-------------------------------------------------------------------------
13 */
14 #include "postgres.h"
15
16 #include <limits.h>
17 #include <math.h>
18
19 #include "tsearch/ts_utils.h"
20 #include "utils/array.h"
21 #include "miscadmin.h"
22
23
24 static const float weights[] = {0.1f, 0.2f, 0.4f, 1.0f};
25
26 #define wpos(wep) ( w[ WEP_GETWEIGHT(wep) ] )
27
28 #define RANK_NO_NORM 0x00
29 #define RANK_NORM_LOGLENGTH 0x01
30 #define RANK_NORM_LENGTH 0x02
31 #define RANK_NORM_EXTDIST 0x04
32 #define RANK_NORM_UNIQ 0x08
33 #define RANK_NORM_LOGUNIQ 0x10
34 #define RANK_NORM_RDIVRPLUS1 0x20
35 #define DEF_NORM_METHOD RANK_NO_NORM
36
37 static float calc_rank_or(const float *w, TSVector t, TSQuery q);
38 static float calc_rank_and(const float *w, TSVector t, TSQuery q);
39
40 /*
41 * Returns a weight of a word collocation
42 */
43 static float4
word_distance(int32 w)44 word_distance(int32 w)
45 {
46 if (w > 100)
47 return 1e-30f;
48
49 return 1.0 / (1.005 + 0.05 * exp(((float4) w) / 1.5 - 2));
50 }
51
52 static int
cnt_length(TSVector t)53 cnt_length(TSVector t)
54 {
55 WordEntry *ptr = ARRPTR(t),
56 *end = (WordEntry *) STRPTR(t);
57 int len = 0;
58
59 while (ptr < end)
60 {
61 int clen = POSDATALEN(t, ptr);
62
63 if (clen == 0)
64 len += 1;
65 else
66 len += clen;
67
68 ptr++;
69 }
70
71 return len;
72 }
73
74
75 #define WordECompareQueryItem(e,q,p,i,m) \
76 tsCompareString((q) + (i)->distance, (i)->length, \
77 (e) + (p)->pos, (p)->len, (m))
78
79
80 /*
81 * Returns a pointer to a WordEntry's array corresponding to 'item' from
82 * tsvector 't'. 'q' is the TSQuery containing 'item'.
83 * Returns NULL if not found.
84 */
85 static WordEntry *
find_wordentry(TSVector t,TSQuery q,QueryOperand * item,int32 * nitem)86 find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem)
87 {
88 WordEntry *StopLow = ARRPTR(t);
89 WordEntry *StopHigh = (WordEntry *) STRPTR(t);
90 WordEntry *StopMiddle = StopHigh;
91 int difference;
92
93 *nitem = 0;
94
95 /* Loop invariant: StopLow <= item < StopHigh */
96 while (StopLow < StopHigh)
97 {
98 StopMiddle = StopLow + (StopHigh - StopLow) / 2;
99 difference = WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, false);
100 if (difference == 0)
101 {
102 StopHigh = StopMiddle;
103 *nitem = 1;
104 break;
105 }
106 else if (difference > 0)
107 StopLow = StopMiddle + 1;
108 else
109 StopHigh = StopMiddle;
110 }
111
112 if (item->prefix)
113 {
114 if (StopLow >= StopHigh)
115 StopMiddle = StopHigh;
116
117 *nitem = 0;
118
119 while (StopMiddle < (WordEntry *) STRPTR(t) &&
120 WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, true) == 0)
121 {
122 (*nitem)++;
123 StopMiddle++;
124 }
125 }
126
127 return (*nitem > 0) ? StopHigh : NULL;
128 }
129
130
131 /*
132 * sort QueryOperands by (length, word)
133 */
134 static int
compareQueryOperand(const void * a,const void * b,void * arg)135 compareQueryOperand(const void *a, const void *b, void *arg)
136 {
137 char *operand = (char *) arg;
138 QueryOperand *qa = (*(QueryOperand *const *) a);
139 QueryOperand *qb = (*(QueryOperand *const *) b);
140
141 return tsCompareString(operand + qa->distance, qa->length,
142 operand + qb->distance, qb->length,
143 false);
144 }
145
146 /*
147 * Returns a sorted, de-duplicated array of QueryOperands in a query.
148 * The returned QueryOperands are pointers to the original QueryOperands
149 * in the query.
150 *
151 * Length of the returned array is stored in *size
152 */
153 static QueryOperand **
SortAndUniqItems(TSQuery q,int * size)154 SortAndUniqItems(TSQuery q, int *size)
155 {
156 char *operand = GETOPERAND(q);
157 QueryItem *item = GETQUERY(q);
158 QueryOperand **res,
159 **ptr,
160 **prevptr;
161
162 ptr = res = (QueryOperand **) palloc(sizeof(QueryOperand *) * *size);
163
164 /* Collect all operands from the tree to res */
165 while ((*size)--)
166 {
167 if (item->type == QI_VAL)
168 {
169 *ptr = (QueryOperand *) item;
170 ptr++;
171 }
172 item++;
173 }
174
175 *size = ptr - res;
176 if (*size < 2)
177 return res;
178
179 qsort_arg(res, *size, sizeof(QueryOperand *), compareQueryOperand, (void *) operand);
180
181 ptr = res + 1;
182 prevptr = res;
183
184 /* remove duplicates */
185 while (ptr - res < *size)
186 {
187 if (compareQueryOperand((void *) ptr, (void *) prevptr, (void *) operand) != 0)
188 {
189 prevptr++;
190 *prevptr = *ptr;
191 }
192 ptr++;
193 }
194
195 *size = prevptr + 1 - res;
196 return res;
197 }
198
199 static float
calc_rank_and(const float * w,TSVector t,TSQuery q)200 calc_rank_and(const float *w, TSVector t, TSQuery q)
201 {
202 WordEntryPosVector **pos;
203 WordEntryPosVector1 posnull;
204 WordEntryPosVector *POSNULL;
205 int i,
206 k,
207 l,
208 p;
209 WordEntry *entry,
210 *firstentry;
211 WordEntryPos *post,
212 *ct;
213 int32 dimt,
214 lenct,
215 dist,
216 nitem;
217 float res = -1.0;
218 QueryOperand **item;
219 int size = q->size;
220
221 item = SortAndUniqItems(q, &size);
222 if (size < 2)
223 {
224 pfree(item);
225 return calc_rank_or(w, t, q);
226 }
227 pos = (WordEntryPosVector **) palloc0(sizeof(WordEntryPosVector *) * q->size);
228
229 /* A dummy WordEntryPos array to use when haspos is false */
230 posnull.npos = 1;
231 posnull.pos[0] = 0;
232 WEP_SETPOS(posnull.pos[0], MAXENTRYPOS - 1);
233 POSNULL = (WordEntryPosVector *) &posnull;
234
235 for (i = 0; i < size; i++)
236 {
237 firstentry = entry = find_wordentry(t, q, item[i], &nitem);
238 if (!entry)
239 continue;
240
241 while (entry - firstentry < nitem)
242 {
243 if (entry->haspos)
244 pos[i] = _POSVECPTR(t, entry);
245 else
246 pos[i] = POSNULL;
247
248 dimt = pos[i]->npos;
249 post = pos[i]->pos;
250 for (k = 0; k < i; k++)
251 {
252 if (!pos[k])
253 continue;
254 lenct = pos[k]->npos;
255 ct = pos[k]->pos;
256 for (l = 0; l < dimt; l++)
257 {
258 for (p = 0; p < lenct; p++)
259 {
260 dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p]));
261 if (dist || (dist == 0 && (pos[i] == POSNULL || pos[k] == POSNULL)))
262 {
263 float curw;
264
265 if (!dist)
266 dist = MAXENTRYPOS;
267 curw = sqrt(wpos(post[l]) * wpos(ct[p]) * word_distance(dist));
268 res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw);
269 }
270 }
271 }
272 }
273
274 entry++;
275 }
276 }
277 pfree(pos);
278 pfree(item);
279 return res;
280 }
281
282 static float
calc_rank_or(const float * w,TSVector t,TSQuery q)283 calc_rank_or(const float *w, TSVector t, TSQuery q)
284 {
285 WordEntry *entry,
286 *firstentry;
287 WordEntryPosVector1 posnull;
288 WordEntryPos *post;
289 int32 dimt,
290 j,
291 i,
292 nitem;
293 float res = 0.0;
294 QueryOperand **item;
295 int size = q->size;
296
297 /* A dummy WordEntryPos array to use when haspos is false */
298 posnull.npos = 1;
299 posnull.pos[0] = 0;
300
301 item = SortAndUniqItems(q, &size);
302
303 for (i = 0; i < size; i++)
304 {
305 float resj,
306 wjm;
307 int32 jm;
308
309 firstentry = entry = find_wordentry(t, q, item[i], &nitem);
310 if (!entry)
311 continue;
312
313 while (entry - firstentry < nitem)
314 {
315 if (entry->haspos)
316 {
317 dimt = POSDATALEN(t, entry);
318 post = POSDATAPTR(t, entry);
319 }
320 else
321 {
322 dimt = posnull.npos;
323 post = posnull.pos;
324 }
325
326 resj = 0.0;
327 wjm = -1.0;
328 jm = 0;
329 for (j = 0; j < dimt; j++)
330 {
331 resj = resj + wpos(post[j]) / ((j + 1) * (j + 1));
332 if (wpos(post[j]) > wjm)
333 {
334 wjm = wpos(post[j]);
335 jm = j;
336 }
337 }
338 /*
339 limit (sum(i/i^2),i->inf) = pi^2/6
340 resj = sum(wi/i^2),i=1,noccurence,
341 wi - should be sorted desc,
342 don't sort for now, just choose maximum weight. This should be corrected
343 Oleg Bartunov
344 */
345 res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685;
346
347 entry++;
348 }
349 }
350 if (size > 0)
351 res = res / size;
352 pfree(item);
353 return res;
354 }
355
356 static float
calc_rank(const float * w,TSVector t,TSQuery q,int32 method)357 calc_rank(const float *w, TSVector t, TSQuery q, int32 method)
358 {
359 QueryItem *item = GETQUERY(q);
360 float res = 0.0;
361 int len;
362
363 if (!t->size || !q->size)
364 return 0.0;
365
366 /* XXX: What about NOT? */
367 res = (item->type == QI_OPR && (item->qoperator.oper == OP_AND ||
368 item->qoperator.oper == OP_PHRASE)) ?
369 calc_rank_and(w, t, q) :
370 calc_rank_or(w, t, q);
371
372 if (res < 0)
373 res = 1e-20f;
374
375 if ((method & RANK_NORM_LOGLENGTH) && t->size > 0)
376 res /= log((double) (cnt_length(t) + 1)) / log(2.0);
377
378 if (method & RANK_NORM_LENGTH)
379 {
380 len = cnt_length(t);
381 if (len > 0)
382 res /= (float) len;
383 }
384
385 /* RANK_NORM_EXTDIST not applicable */
386
387 if ((method & RANK_NORM_UNIQ) && t->size > 0)
388 res /= (float) (t->size);
389
390 if ((method & RANK_NORM_LOGUNIQ) && t->size > 0)
391 res /= log((double) (t->size + 1)) / log(2.0);
392
393 if (method & RANK_NORM_RDIVRPLUS1)
394 res /= (res + 1);
395
396 return res;
397 }
398
399 static const float *
getWeights(ArrayType * win)400 getWeights(ArrayType *win)
401 {
402 static float ws[lengthof(weights)];
403 int i;
404 float4 *arrdata;
405
406 if (win == NULL)
407 return weights;
408
409 if (ARR_NDIM(win) != 1)
410 ereport(ERROR,
411 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
412 errmsg("array of weight must be one-dimensional")));
413
414 if (ArrayGetNItems(ARR_NDIM(win), ARR_DIMS(win)) < lengthof(weights))
415 ereport(ERROR,
416 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
417 errmsg("array of weight is too short")));
418
419 if (array_contains_nulls(win))
420 ereport(ERROR,
421 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
422 errmsg("array of weight must not contain nulls")));
423
424 arrdata = (float4 *) ARR_DATA_PTR(win);
425 for (i = 0; i < lengthof(weights); i++)
426 {
427 ws[i] = (arrdata[i] >= 0) ? arrdata[i] : weights[i];
428 if (ws[i] > 1.0)
429 ereport(ERROR,
430 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
431 errmsg("weight out of range")));
432 }
433
434 return ws;
435 }
436
437 Datum
ts_rank_wttf(PG_FUNCTION_ARGS)438 ts_rank_wttf(PG_FUNCTION_ARGS)
439 {
440 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
441 TSVector txt = PG_GETARG_TSVECTOR(1);
442 TSQuery query = PG_GETARG_TSQUERY(2);
443 int method = PG_GETARG_INT32(3);
444 float res;
445
446 res = calc_rank(getWeights(win), txt, query, method);
447
448 PG_FREE_IF_COPY(win, 0);
449 PG_FREE_IF_COPY(txt, 1);
450 PG_FREE_IF_COPY(query, 2);
451 PG_RETURN_FLOAT4(res);
452 }
453
454 Datum
ts_rank_wtt(PG_FUNCTION_ARGS)455 ts_rank_wtt(PG_FUNCTION_ARGS)
456 {
457 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
458 TSVector txt = PG_GETARG_TSVECTOR(1);
459 TSQuery query = PG_GETARG_TSQUERY(2);
460 float res;
461
462 res = calc_rank(getWeights(win), txt, query, DEF_NORM_METHOD);
463
464 PG_FREE_IF_COPY(win, 0);
465 PG_FREE_IF_COPY(txt, 1);
466 PG_FREE_IF_COPY(query, 2);
467 PG_RETURN_FLOAT4(res);
468 }
469
470 Datum
ts_rank_ttf(PG_FUNCTION_ARGS)471 ts_rank_ttf(PG_FUNCTION_ARGS)
472 {
473 TSVector txt = PG_GETARG_TSVECTOR(0);
474 TSQuery query = PG_GETARG_TSQUERY(1);
475 int method = PG_GETARG_INT32(2);
476 float res;
477
478 res = calc_rank(getWeights(NULL), txt, query, method);
479
480 PG_FREE_IF_COPY(txt, 0);
481 PG_FREE_IF_COPY(query, 1);
482 PG_RETURN_FLOAT4(res);
483 }
484
485 Datum
ts_rank_tt(PG_FUNCTION_ARGS)486 ts_rank_tt(PG_FUNCTION_ARGS)
487 {
488 TSVector txt = PG_GETARG_TSVECTOR(0);
489 TSQuery query = PG_GETARG_TSQUERY(1);
490 float res;
491
492 res = calc_rank(getWeights(NULL), txt, query, DEF_NORM_METHOD);
493
494 PG_FREE_IF_COPY(txt, 0);
495 PG_FREE_IF_COPY(query, 1);
496 PG_RETURN_FLOAT4(res);
497 }
498
499 typedef struct
500 {
501 union
502 {
503 struct
504 { /* compiled doc representation */
505 QueryItem **items;
506 int16 nitem;
507 } query;
508 struct
509 { /* struct is used for preparing doc
510 * representation */
511 QueryItem *item;
512 WordEntry *entry;
513 } map;
514 } data;
515 WordEntryPos pos;
516 } DocRepresentation;
517
518 static int
compareDocR(const void * va,const void * vb)519 compareDocR(const void *va, const void *vb)
520 {
521 const DocRepresentation *a = (const DocRepresentation *) va;
522 const DocRepresentation *b = (const DocRepresentation *) vb;
523
524 if (WEP_GETPOS(a->pos) == WEP_GETPOS(b->pos))
525 {
526 if (WEP_GETWEIGHT(a->pos) == WEP_GETWEIGHT(b->pos))
527 {
528 if (a->data.map.entry == b->data.map.entry)
529 return 0;
530
531 return (a->data.map.entry > b->data.map.entry) ? 1 : -1;
532 }
533
534 return (WEP_GETWEIGHT(a->pos) > WEP_GETWEIGHT(b->pos)) ? 1 : -1;
535 }
536
537 return (WEP_GETPOS(a->pos) > WEP_GETPOS(b->pos)) ? 1 : -1;
538 }
539
540 #define MAXQROPOS MAXENTRYPOS
541 typedef struct
542 {
543 bool operandexists;
544 bool reverseinsert; /* indicates insert order, true means
545 * descending order */
546 uint32 npos;
547 WordEntryPos pos[MAXQROPOS];
548 } QueryRepresentationOperand;
549
550 typedef struct
551 {
552 TSQuery query;
553 QueryRepresentationOperand *operandData;
554 } QueryRepresentation;
555
556 #define QR_GET_OPERAND_DATA(q, v) \
557 ( (q)->operandData + (((QueryItem*)(v)) - GETQUERY((q)->query)) )
558
559 static bool
checkcondition_QueryOperand(void * checkval,QueryOperand * val,ExecPhraseData * data)560 checkcondition_QueryOperand(void *checkval, QueryOperand *val, ExecPhraseData *data)
561 {
562 QueryRepresentation *qr = (QueryRepresentation *) checkval;
563 QueryRepresentationOperand *opData = QR_GET_OPERAND_DATA(qr, val);
564
565 if (!opData->operandexists)
566 return false;
567
568 if (data)
569 {
570 data->npos = opData->npos;
571 data->pos = opData->pos;
572 if (opData->reverseinsert)
573 data->pos += MAXQROPOS - opData->npos;
574 }
575
576 return true;
577 }
578
579 typedef struct
580 {
581 int pos;
582 int p;
583 int q;
584 DocRepresentation *begin;
585 DocRepresentation *end;
586 } CoverExt;
587
588 static void
resetQueryRepresentation(QueryRepresentation * qr,bool reverseinsert)589 resetQueryRepresentation(QueryRepresentation *qr, bool reverseinsert)
590 {
591 int i;
592
593 for (i = 0; i < qr->query->size; i++)
594 {
595 qr->operandData[i].operandexists = false;
596 qr->operandData[i].reverseinsert = reverseinsert;
597 qr->operandData[i].npos = 0;
598 }
599 }
600
601 static void
fillQueryRepresentationData(QueryRepresentation * qr,DocRepresentation * entry)602 fillQueryRepresentationData(QueryRepresentation *qr, DocRepresentation *entry)
603 {
604 int i;
605 int lastPos;
606 QueryRepresentationOperand *opData;
607
608 for (i = 0; i < entry->data.query.nitem; i++)
609 {
610 if (entry->data.query.items[i]->type != QI_VAL)
611 continue;
612
613 opData = QR_GET_OPERAND_DATA(qr, entry->data.query.items[i]);
614
615 opData->operandexists = true;
616
617 if (opData->npos == 0)
618 {
619 lastPos = (opData->reverseinsert) ? (MAXQROPOS - 1) : 0;
620 opData->pos[lastPos] = entry->pos;
621 opData->npos++;
622 continue;
623 }
624
625 lastPos = opData->reverseinsert ?
626 (MAXQROPOS - opData->npos) :
627 (opData->npos - 1);
628
629 if (WEP_GETPOS(opData->pos[lastPos]) != WEP_GETPOS(entry->pos))
630 {
631 lastPos = opData->reverseinsert ?
632 (MAXQROPOS - 1 - opData->npos) :
633 (opData->npos);
634
635 opData->pos[lastPos] = entry->pos;
636 opData->npos++;
637 }
638 }
639 }
640
641 static bool
Cover(DocRepresentation * doc,int len,QueryRepresentation * qr,CoverExt * ext)642 Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext)
643 {
644 DocRepresentation *ptr;
645 int lastpos = ext->pos;
646 bool found = false;
647
648 /*
649 * since this function recurses, it could be driven to stack overflow.
650 * (though any decent compiler will optimize away the tail-recursion.
651 */
652 check_stack_depth();
653
654 resetQueryRepresentation(qr, false);
655
656 ext->p = INT_MAX;
657 ext->q = 0;
658 ptr = doc + ext->pos;
659
660 /* find upper bound of cover from current position, move up */
661 while (ptr - doc < len)
662 {
663 fillQueryRepresentationData(qr, ptr);
664
665 if (TS_execute(GETQUERY(qr->query), (void *) qr,
666 TS_EXEC_EMPTY, checkcondition_QueryOperand))
667 {
668 if (WEP_GETPOS(ptr->pos) > ext->q)
669 {
670 ext->q = WEP_GETPOS(ptr->pos);
671 ext->end = ptr;
672 lastpos = ptr - doc;
673 found = true;
674 }
675 break;
676 }
677 ptr++;
678 }
679
680 if (!found)
681 return false;
682
683 resetQueryRepresentation(qr, true);
684
685 ptr = doc + lastpos;
686
687 /* find lower bound of cover from found upper bound, move down */
688 while (ptr >= doc + ext->pos)
689 {
690 /*
691 * we scan doc from right to left, so pos info in reverse order!
692 */
693 fillQueryRepresentationData(qr, ptr);
694
695 if (TS_execute(GETQUERY(qr->query), (void *) qr,
696 TS_EXEC_CALC_NOT, checkcondition_QueryOperand))
697 {
698 if (WEP_GETPOS(ptr->pos) < ext->p)
699 {
700 ext->begin = ptr;
701 ext->p = WEP_GETPOS(ptr->pos);
702 }
703 break;
704 }
705 ptr--;
706 }
707
708 if (ext->p <= ext->q)
709 {
710 /*
711 * set position for next try to next lexeme after beginning of found
712 * cover
713 */
714 ext->pos = (ptr - doc) + 1;
715 return true;
716 }
717
718 ext->pos++;
719 return Cover(doc, len, qr, ext);
720 }
721
722 static DocRepresentation *
get_docrep(TSVector txt,QueryRepresentation * qr,int * doclen)723 get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
724 {
725 QueryItem *item = GETQUERY(qr->query);
726 WordEntry *entry,
727 *firstentry;
728 WordEntryPos *post;
729 int32 dimt, /* number of 'post' items */
730 j,
731 i,
732 nitem;
733 int len = qr->query->size * 4,
734 cur = 0;
735 DocRepresentation *doc;
736
737 doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len);
738
739 /*
740 * Iterate through query to make DocRepresentaion for words and it's
741 * entries satisfied by query
742 */
743 for (i = 0; i < qr->query->size; i++)
744 {
745 QueryOperand *curoperand;
746
747 if (item[i].type != QI_VAL)
748 continue;
749
750 curoperand = &item[i].qoperand;
751
752 firstentry = entry = find_wordentry(txt, qr->query, curoperand, &nitem);
753 if (!entry)
754 continue;
755
756 /* iterations over entries in tsvector */
757 while (entry - firstentry < nitem)
758 {
759 if (entry->haspos)
760 {
761 dimt = POSDATALEN(txt, entry);
762 post = POSDATAPTR(txt, entry);
763 }
764 else
765 {
766 /* ignore words without positions */
767 entry++;
768 continue;
769 }
770
771 while (cur + dimt >= len)
772 {
773 len *= 2;
774 doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len);
775 }
776
777 /* iterations over entry's positions */
778 for (j = 0; j < dimt; j++)
779 {
780 if (curoperand->weight == 0 ||
781 curoperand->weight & (1 << WEP_GETWEIGHT(post[j])))
782 {
783 doc[cur].pos = post[j];
784 doc[cur].data.map.entry = entry;
785 doc[cur].data.map.item = (QueryItem *) curoperand;
786 cur++;
787 }
788 }
789
790 entry++;
791 }
792 }
793
794 if (cur > 0)
795 {
796 DocRepresentation *rptr = doc + 1,
797 *wptr = doc,
798 storage;
799
800 /*
801 * Sort representation in ascending order by pos and entry
802 */
803 qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR);
804
805 /*
806 * Join QueryItem per WordEntry and it's position
807 */
808 storage.pos = doc->pos;
809 storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size);
810 storage.data.query.items[0] = doc->data.map.item;
811 storage.data.query.nitem = 1;
812
813 while (rptr - doc < cur)
814 {
815 if (rptr->pos == (rptr - 1)->pos &&
816 rptr->data.map.entry == (rptr - 1)->data.map.entry)
817 {
818 storage.data.query.items[storage.data.query.nitem] = rptr->data.map.item;
819 storage.data.query.nitem++;
820 }
821 else
822 {
823 *wptr = storage;
824 wptr++;
825 storage.pos = rptr->pos;
826 storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size);
827 storage.data.query.items[0] = rptr->data.map.item;
828 storage.data.query.nitem = 1;
829 }
830
831 rptr++;
832 }
833
834 *wptr = storage;
835 wptr++;
836
837 *doclen = wptr - doc;
838 return doc;
839 }
840
841 pfree(doc);
842 return NULL;
843 }
844
845 static float4
calc_rank_cd(const float4 * arrdata,TSVector txt,TSQuery query,int method)846 calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method)
847 {
848 DocRepresentation *doc;
849 int len,
850 i,
851 doclen = 0;
852 CoverExt ext;
853 double Wdoc = 0.0;
854 double invws[lengthof(weights)];
855 double SumDist = 0.0,
856 PrevExtPos = 0.0,
857 CurExtPos = 0.0;
858 int NExtent = 0;
859 QueryRepresentation qr;
860
861
862 for (i = 0; i < lengthof(weights); i++)
863 {
864 invws[i] = ((double) ((arrdata[i] >= 0) ? arrdata[i] : weights[i]));
865 if (invws[i] > 1.0)
866 ereport(ERROR,
867 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
868 errmsg("weight out of range")));
869 invws[i] = 1.0 / invws[i];
870 }
871
872 qr.query = query;
873 qr.operandData = (QueryRepresentationOperand *)
874 palloc0(sizeof(QueryRepresentationOperand) * query->size);
875
876 doc = get_docrep(txt, &qr, &doclen);
877 if (!doc)
878 {
879 pfree(qr.operandData);
880 return 0.0;
881 }
882
883 MemSet(&ext, 0, sizeof(CoverExt));
884 while (Cover(doc, doclen, &qr, &ext))
885 {
886 double Cpos = 0.0;
887 double InvSum = 0.0;
888 int nNoise;
889 DocRepresentation *ptr = ext.begin;
890
891 while (ptr <= ext.end)
892 {
893 InvSum += invws[WEP_GETWEIGHT(ptr->pos)];
894 ptr++;
895 }
896
897 Cpos = ((double) (ext.end - ext.begin + 1)) / InvSum;
898
899 /*
900 * if doc are big enough then ext.q may be equal to ext.p due to limit
901 * of positional information. In this case we approximate number of
902 * noise word as half cover's length
903 */
904 nNoise = (ext.q - ext.p) - (ext.end - ext.begin);
905 if (nNoise < 0)
906 nNoise = (ext.end - ext.begin) / 2;
907 Wdoc += Cpos / ((double) (1 + nNoise));
908
909 CurExtPos = ((double) (ext.q + ext.p)) / 2.0;
910 if (NExtent > 0 && CurExtPos > PrevExtPos /* prevent division by
911 * zero in a case of
912 multiple lexize */ )
913 SumDist += 1.0 / (CurExtPos - PrevExtPos);
914
915 PrevExtPos = CurExtPos;
916 NExtent++;
917 }
918
919 if ((method & RANK_NORM_LOGLENGTH) && txt->size > 0)
920 Wdoc /= log((double) (cnt_length(txt) + 1));
921
922 if (method & RANK_NORM_LENGTH)
923 {
924 len = cnt_length(txt);
925 if (len > 0)
926 Wdoc /= (double) len;
927 }
928
929 if ((method & RANK_NORM_EXTDIST) && NExtent > 0 && SumDist > 0)
930 Wdoc /= ((double) NExtent) / SumDist;
931
932 if ((method & RANK_NORM_UNIQ) && txt->size > 0)
933 Wdoc /= (double) (txt->size);
934
935 if ((method & RANK_NORM_LOGUNIQ) && txt->size > 0)
936 Wdoc /= log((double) (txt->size + 1)) / log(2.0);
937
938 if (method & RANK_NORM_RDIVRPLUS1)
939 Wdoc /= (Wdoc + 1);
940
941 pfree(doc);
942
943 pfree(qr.operandData);
944
945 return (float4) Wdoc;
946 }
947
948 Datum
ts_rankcd_wttf(PG_FUNCTION_ARGS)949 ts_rankcd_wttf(PG_FUNCTION_ARGS)
950 {
951 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
952 TSVector txt = PG_GETARG_TSVECTOR(1);
953 TSQuery query = PG_GETARG_TSQUERY(2);
954 int method = PG_GETARG_INT32(3);
955 float res;
956
957 res = calc_rank_cd(getWeights(win), txt, query, method);
958
959 PG_FREE_IF_COPY(win, 0);
960 PG_FREE_IF_COPY(txt, 1);
961 PG_FREE_IF_COPY(query, 2);
962 PG_RETURN_FLOAT4(res);
963 }
964
965 Datum
ts_rankcd_wtt(PG_FUNCTION_ARGS)966 ts_rankcd_wtt(PG_FUNCTION_ARGS)
967 {
968 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
969 TSVector txt = PG_GETARG_TSVECTOR(1);
970 TSQuery query = PG_GETARG_TSQUERY(2);
971 float res;
972
973 res = calc_rank_cd(getWeights(win), txt, query, DEF_NORM_METHOD);
974
975 PG_FREE_IF_COPY(win, 0);
976 PG_FREE_IF_COPY(txt, 1);
977 PG_FREE_IF_COPY(query, 2);
978 PG_RETURN_FLOAT4(res);
979 }
980
981 Datum
ts_rankcd_ttf(PG_FUNCTION_ARGS)982 ts_rankcd_ttf(PG_FUNCTION_ARGS)
983 {
984 TSVector txt = PG_GETARG_TSVECTOR(0);
985 TSQuery query = PG_GETARG_TSQUERY(1);
986 int method = PG_GETARG_INT32(2);
987 float res;
988
989 res = calc_rank_cd(getWeights(NULL), txt, query, method);
990
991 PG_FREE_IF_COPY(txt, 0);
992 PG_FREE_IF_COPY(query, 1);
993 PG_RETURN_FLOAT4(res);
994 }
995
996 Datum
ts_rankcd_tt(PG_FUNCTION_ARGS)997 ts_rankcd_tt(PG_FUNCTION_ARGS)
998 {
999 TSVector txt = PG_GETARG_TSVECTOR(0);
1000 TSQuery query = PG_GETARG_TSQUERY(1);
1001 float res;
1002
1003 res = calc_rank_cd(getWeights(NULL), txt, query, DEF_NORM_METHOD);
1004
1005 PG_FREE_IF_COPY(txt, 0);
1006 PG_FREE_IF_COPY(query, 1);
1007 PG_RETURN_FLOAT4(res);
1008 }
1009