1 /*-------------------------------------------------------------------------
2 *
3 * tsrank.c
4 * rank tsvector by tsquery
5 *
6 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/utils/adt/tsrank.c
11 *
12 *-------------------------------------------------------------------------
13 */
14 #include "postgres.h"
15
16 #include <limits.h>
17 #include <math.h>
18
19 #include "tsearch/ts_utils.h"
20 #include "utils/array.h"
21 #include "utils/builtins.h"
22 #include "miscadmin.h"
23
24
25 static const float weights[] = {0.1f, 0.2f, 0.4f, 1.0f};
26
27 #define wpos(wep) ( w[ WEP_GETWEIGHT(wep) ] )
28
29 #define RANK_NO_NORM 0x00
30 #define RANK_NORM_LOGLENGTH 0x01
31 #define RANK_NORM_LENGTH 0x02
32 #define RANK_NORM_EXTDIST 0x04
33 #define RANK_NORM_UNIQ 0x08
34 #define RANK_NORM_LOGUNIQ 0x10
35 #define RANK_NORM_RDIVRPLUS1 0x20
36 #define DEF_NORM_METHOD RANK_NO_NORM
37
38 static float calc_rank_or(const float *w, TSVector t, TSQuery q);
39 static float calc_rank_and(const float *w, TSVector t, TSQuery q);
40
41 /*
42 * Returns a weight of a word collocation
43 */
44 static float4
word_distance(int32 w)45 word_distance(int32 w)
46 {
47 if (w > 100)
48 return 1e-30f;
49
50 return 1.0 / (1.005 + 0.05 * exp(((float4) w) / 1.5 - 2));
51 }
52
53 static int
cnt_length(TSVector t)54 cnt_length(TSVector t)
55 {
56 WordEntry *ptr = ARRPTR(t),
57 *end = (WordEntry *) STRPTR(t);
58 int len = 0;
59
60 while (ptr < end)
61 {
62 int clen = POSDATALEN(t, ptr);
63
64 if (clen == 0)
65 len += 1;
66 else
67 len += clen;
68
69 ptr++;
70 }
71
72 return len;
73 }
74
75
76 #define WordECompareQueryItem(e,q,p,i,m) \
77 tsCompareString((q) + (i)->distance, (i)->length, \
78 (e) + (p)->pos, (p)->len, (m))
79
80
81 /*
82 * Returns a pointer to a WordEntry's array corresponding to 'item' from
83 * tsvector 't'. 'q' is the TSQuery containing 'item'.
84 * Returns NULL if not found.
85 */
86 static WordEntry *
find_wordentry(TSVector t,TSQuery q,QueryOperand * item,int32 * nitem)87 find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem)
88 {
89 WordEntry *StopLow = ARRPTR(t);
90 WordEntry *StopHigh = (WordEntry *) STRPTR(t);
91 WordEntry *StopMiddle = StopHigh;
92 int difference;
93
94 *nitem = 0;
95
96 /* Loop invariant: StopLow <= item < StopHigh */
97 while (StopLow < StopHigh)
98 {
99 StopMiddle = StopLow + (StopHigh - StopLow) / 2;
100 difference = WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, false);
101 if (difference == 0)
102 {
103 StopHigh = StopMiddle;
104 *nitem = 1;
105 break;
106 }
107 else if (difference > 0)
108 StopLow = StopMiddle + 1;
109 else
110 StopHigh = StopMiddle;
111 }
112
113 if (item->prefix)
114 {
115 if (StopLow >= StopHigh)
116 StopMiddle = StopHigh;
117
118 *nitem = 0;
119
120 while (StopMiddle < (WordEntry *) STRPTR(t) &&
121 WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, true) == 0)
122 {
123 (*nitem)++;
124 StopMiddle++;
125 }
126 }
127
128 return (*nitem > 0) ? StopHigh : NULL;
129 }
130
131
132 /*
133 * sort QueryOperands by (length, word)
134 */
135 static int
compareQueryOperand(const void * a,const void * b,void * arg)136 compareQueryOperand(const void *a, const void *b, void *arg)
137 {
138 char *operand = (char *) arg;
139 QueryOperand *qa = (*(QueryOperand *const *) a);
140 QueryOperand *qb = (*(QueryOperand *const *) b);
141
142 return tsCompareString(operand + qa->distance, qa->length,
143 operand + qb->distance, qb->length,
144 false);
145 }
146
147 /*
148 * Returns a sorted, de-duplicated array of QueryOperands in a query.
149 * The returned QueryOperands are pointers to the original QueryOperands
150 * in the query.
151 *
152 * Length of the returned array is stored in *size
153 */
154 static QueryOperand **
SortAndUniqItems(TSQuery q,int * size)155 SortAndUniqItems(TSQuery q, int *size)
156 {
157 char *operand = GETOPERAND(q);
158 QueryItem *item = GETQUERY(q);
159 QueryOperand **res,
160 **ptr,
161 **prevptr;
162
163 ptr = res = (QueryOperand **) palloc(sizeof(QueryOperand *) * *size);
164
165 /* Collect all operands from the tree to res */
166 while ((*size)--)
167 {
168 if (item->type == QI_VAL)
169 {
170 *ptr = (QueryOperand *) item;
171 ptr++;
172 }
173 item++;
174 }
175
176 *size = ptr - res;
177 if (*size < 2)
178 return res;
179
180 qsort_arg(res, *size, sizeof(QueryOperand *), compareQueryOperand, (void *) operand);
181
182 ptr = res + 1;
183 prevptr = res;
184
185 /* remove duplicates */
186 while (ptr - res < *size)
187 {
188 if (compareQueryOperand((void *) ptr, (void *) prevptr, (void *) operand) != 0)
189 {
190 prevptr++;
191 *prevptr = *ptr;
192 }
193 ptr++;
194 }
195
196 *size = prevptr + 1 - res;
197 return res;
198 }
199
200 static float
calc_rank_and(const float * w,TSVector t,TSQuery q)201 calc_rank_and(const float *w, TSVector t, TSQuery q)
202 {
203 WordEntryPosVector **pos;
204 WordEntryPosVector1 posnull;
205 WordEntryPosVector *POSNULL;
206 int i,
207 k,
208 l,
209 p;
210 WordEntry *entry,
211 *firstentry;
212 WordEntryPos *post,
213 *ct;
214 int32 dimt,
215 lenct,
216 dist,
217 nitem;
218 float res = -1.0;
219 QueryOperand **item;
220 int size = q->size;
221
222 item = SortAndUniqItems(q, &size);
223 if (size < 2)
224 {
225 pfree(item);
226 return calc_rank_or(w, t, q);
227 }
228 pos = (WordEntryPosVector **) palloc0(sizeof(WordEntryPosVector *) * q->size);
229
230 /* A dummy WordEntryPos array to use when haspos is false */
231 posnull.npos = 1;
232 posnull.pos[0] = 0;
233 WEP_SETPOS(posnull.pos[0], MAXENTRYPOS - 1);
234 POSNULL = (WordEntryPosVector *) &posnull;
235
236 for (i = 0; i < size; i++)
237 {
238 firstentry = entry = find_wordentry(t, q, item[i], &nitem);
239 if (!entry)
240 continue;
241
242 while (entry - firstentry < nitem)
243 {
244 if (entry->haspos)
245 pos[i] = _POSVECPTR(t, entry);
246 else
247 pos[i] = POSNULL;
248
249 dimt = pos[i]->npos;
250 post = pos[i]->pos;
251 for (k = 0; k < i; k++)
252 {
253 if (!pos[k])
254 continue;
255 lenct = pos[k]->npos;
256 ct = pos[k]->pos;
257 for (l = 0; l < dimt; l++)
258 {
259 for (p = 0; p < lenct; p++)
260 {
261 dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p]));
262 if (dist || (dist == 0 && (pos[i] == POSNULL || pos[k] == POSNULL)))
263 {
264 float curw;
265
266 if (!dist)
267 dist = MAXENTRYPOS;
268 curw = sqrt(wpos(post[l]) * wpos(ct[p]) * word_distance(dist));
269 res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw);
270 }
271 }
272 }
273 }
274
275 entry++;
276 }
277 }
278 pfree(pos);
279 pfree(item);
280 return res;
281 }
282
283 static float
calc_rank_or(const float * w,TSVector t,TSQuery q)284 calc_rank_or(const float *w, TSVector t, TSQuery q)
285 {
286 WordEntry *entry,
287 *firstentry;
288 WordEntryPosVector1 posnull;
289 WordEntryPos *post;
290 int32 dimt,
291 j,
292 i,
293 nitem;
294 float res = 0.0;
295 QueryOperand **item;
296 int size = q->size;
297
298 /* A dummy WordEntryPos array to use when haspos is false */
299 posnull.npos = 1;
300 posnull.pos[0] = 0;
301
302 item = SortAndUniqItems(q, &size);
303
304 for (i = 0; i < size; i++)
305 {
306 float resj,
307 wjm;
308 int32 jm;
309
310 firstentry = entry = find_wordentry(t, q, item[i], &nitem);
311 if (!entry)
312 continue;
313
314 while (entry - firstentry < nitem)
315 {
316 if (entry->haspos)
317 {
318 dimt = POSDATALEN(t, entry);
319 post = POSDATAPTR(t, entry);
320 }
321 else
322 {
323 dimt = posnull.npos;
324 post = posnull.pos;
325 }
326
327 resj = 0.0;
328 wjm = -1.0;
329 jm = 0;
330 for (j = 0; j < dimt; j++)
331 {
332 resj = resj + wpos(post[j]) / ((j + 1) * (j + 1));
333 if (wpos(post[j]) > wjm)
334 {
335 wjm = wpos(post[j]);
336 jm = j;
337 }
338 }
339 /*
340 limit (sum(i/i^2),i->inf) = pi^2/6
341 resj = sum(wi/i^2),i=1,noccurence,
342 wi - should be sorted desc,
343 don't sort for now, just choose maximum weight. This should be corrected
344 Oleg Bartunov
345 */
346 res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685;
347
348 entry++;
349 }
350 }
351 if (size > 0)
352 res = res / size;
353 pfree(item);
354 return res;
355 }
356
357 static float
calc_rank(const float * w,TSVector t,TSQuery q,int32 method)358 calc_rank(const float *w, TSVector t, TSQuery q, int32 method)
359 {
360 QueryItem *item = GETQUERY(q);
361 float res = 0.0;
362 int len;
363
364 if (!t->size || !q->size)
365 return 0.0;
366
367 /* XXX: What about NOT? */
368 res = (item->type == QI_OPR && (item->qoperator.oper == OP_AND ||
369 item->qoperator.oper == OP_PHRASE)) ?
370 calc_rank_and(w, t, q) :
371 calc_rank_or(w, t, q);
372
373 if (res < 0)
374 res = 1e-20f;
375
376 if ((method & RANK_NORM_LOGLENGTH) && t->size > 0)
377 res /= log((double) (cnt_length(t) + 1)) / log(2.0);
378
379 if (method & RANK_NORM_LENGTH)
380 {
381 len = cnt_length(t);
382 if (len > 0)
383 res /= (float) len;
384 }
385
386 /* RANK_NORM_EXTDIST not applicable */
387
388 if ((method & RANK_NORM_UNIQ) && t->size > 0)
389 res /= (float) (t->size);
390
391 if ((method & RANK_NORM_LOGUNIQ) && t->size > 0)
392 res /= log((double) (t->size + 1)) / log(2.0);
393
394 if (method & RANK_NORM_RDIVRPLUS1)
395 res /= (res + 1);
396
397 return res;
398 }
399
400 static const float *
getWeights(ArrayType * win)401 getWeights(ArrayType *win)
402 {
403 static float ws[lengthof(weights)];
404 int i;
405 float4 *arrdata;
406
407 if (win == NULL)
408 return weights;
409
410 if (ARR_NDIM(win) != 1)
411 ereport(ERROR,
412 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
413 errmsg("array of weight must be one-dimensional")));
414
415 if (ArrayGetNItems(ARR_NDIM(win), ARR_DIMS(win)) < lengthof(weights))
416 ereport(ERROR,
417 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
418 errmsg("array of weight is too short")));
419
420 if (array_contains_nulls(win))
421 ereport(ERROR,
422 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
423 errmsg("array of weight must not contain nulls")));
424
425 arrdata = (float4 *) ARR_DATA_PTR(win);
426 for (i = 0; i < lengthof(weights); i++)
427 {
428 ws[i] = (arrdata[i] >= 0) ? arrdata[i] : weights[i];
429 if (ws[i] > 1.0)
430 ereport(ERROR,
431 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
432 errmsg("weight out of range")));
433 }
434
435 return ws;
436 }
437
438 Datum
ts_rank_wttf(PG_FUNCTION_ARGS)439 ts_rank_wttf(PG_FUNCTION_ARGS)
440 {
441 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
442 TSVector txt = PG_GETARG_TSVECTOR(1);
443 TSQuery query = PG_GETARG_TSQUERY(2);
444 int method = PG_GETARG_INT32(3);
445 float res;
446
447 res = calc_rank(getWeights(win), txt, query, method);
448
449 PG_FREE_IF_COPY(win, 0);
450 PG_FREE_IF_COPY(txt, 1);
451 PG_FREE_IF_COPY(query, 2);
452 PG_RETURN_FLOAT4(res);
453 }
454
455 Datum
ts_rank_wtt(PG_FUNCTION_ARGS)456 ts_rank_wtt(PG_FUNCTION_ARGS)
457 {
458 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
459 TSVector txt = PG_GETARG_TSVECTOR(1);
460 TSQuery query = PG_GETARG_TSQUERY(2);
461 float res;
462
463 res = calc_rank(getWeights(win), txt, query, DEF_NORM_METHOD);
464
465 PG_FREE_IF_COPY(win, 0);
466 PG_FREE_IF_COPY(txt, 1);
467 PG_FREE_IF_COPY(query, 2);
468 PG_RETURN_FLOAT4(res);
469 }
470
471 Datum
ts_rank_ttf(PG_FUNCTION_ARGS)472 ts_rank_ttf(PG_FUNCTION_ARGS)
473 {
474 TSVector txt = PG_GETARG_TSVECTOR(0);
475 TSQuery query = PG_GETARG_TSQUERY(1);
476 int method = PG_GETARG_INT32(2);
477 float res;
478
479 res = calc_rank(getWeights(NULL), txt, query, method);
480
481 PG_FREE_IF_COPY(txt, 0);
482 PG_FREE_IF_COPY(query, 1);
483 PG_RETURN_FLOAT4(res);
484 }
485
486 Datum
ts_rank_tt(PG_FUNCTION_ARGS)487 ts_rank_tt(PG_FUNCTION_ARGS)
488 {
489 TSVector txt = PG_GETARG_TSVECTOR(0);
490 TSQuery query = PG_GETARG_TSQUERY(1);
491 float res;
492
493 res = calc_rank(getWeights(NULL), txt, query, DEF_NORM_METHOD);
494
495 PG_FREE_IF_COPY(txt, 0);
496 PG_FREE_IF_COPY(query, 1);
497 PG_RETURN_FLOAT4(res);
498 }
499
500 typedef struct
501 {
502 union
503 {
504 struct
505 { /* compiled doc representation */
506 QueryItem **items;
507 int16 nitem;
508 } query;
509 struct
510 { /* struct is used for preparing doc
511 * representation */
512 QueryItem *item;
513 WordEntry *entry;
514 } map;
515 } data;
516 WordEntryPos pos;
517 } DocRepresentation;
518
519 static int
compareDocR(const void * va,const void * vb)520 compareDocR(const void *va, const void *vb)
521 {
522 const DocRepresentation *a = (const DocRepresentation *) va;
523 const DocRepresentation *b = (const DocRepresentation *) vb;
524
525 if (WEP_GETPOS(a->pos) == WEP_GETPOS(b->pos))
526 {
527 if (WEP_GETWEIGHT(a->pos) == WEP_GETWEIGHT(b->pos))
528 {
529 if (a->data.map.entry == b->data.map.entry)
530 return 0;
531
532 return (a->data.map.entry > b->data.map.entry) ? 1 : -1;
533 }
534
535 return (WEP_GETWEIGHT(a->pos) > WEP_GETWEIGHT(b->pos)) ? 1 : -1;
536 }
537
538 return (WEP_GETPOS(a->pos) > WEP_GETPOS(b->pos)) ? 1 : -1;
539 }
540
541 #define MAXQROPOS MAXENTRYPOS
542 typedef struct
543 {
544 bool operandexists;
545 bool reverseinsert; /* indicates insert order, true means
546 * descending order */
547 uint32 npos;
548 WordEntryPos pos[MAXQROPOS];
549 } QueryRepresentationOperand;
550
551 typedef struct
552 {
553 TSQuery query;
554 QueryRepresentationOperand *operandData;
555 } QueryRepresentation;
556
557 #define QR_GET_OPERAND_DATA(q, v) \
558 ( (q)->operandData + (((QueryItem*)(v)) - GETQUERY((q)->query)) )
559
560 static bool
checkcondition_QueryOperand(void * checkval,QueryOperand * val,ExecPhraseData * data)561 checkcondition_QueryOperand(void *checkval, QueryOperand *val, ExecPhraseData *data)
562 {
563 QueryRepresentation *qr = (QueryRepresentation *) checkval;
564 QueryRepresentationOperand *opData = QR_GET_OPERAND_DATA(qr, val);
565
566 if (!opData->operandexists)
567 return false;
568
569 if (data)
570 {
571 data->npos = opData->npos;
572 data->pos = opData->pos;
573 if (opData->reverseinsert)
574 data->pos += MAXQROPOS - opData->npos;
575 }
576
577 return true;
578 }
579
580 typedef struct
581 {
582 int pos;
583 int p;
584 int q;
585 DocRepresentation *begin;
586 DocRepresentation *end;
587 } CoverExt;
588
589 static void
resetQueryRepresentation(QueryRepresentation * qr,bool reverseinsert)590 resetQueryRepresentation(QueryRepresentation *qr, bool reverseinsert)
591 {
592 int i;
593
594 for (i = 0; i < qr->query->size; i++)
595 {
596 qr->operandData[i].operandexists = false;
597 qr->operandData[i].reverseinsert = reverseinsert;
598 qr->operandData[i].npos = 0;
599 }
600 }
601
602 static void
fillQueryRepresentationData(QueryRepresentation * qr,DocRepresentation * entry)603 fillQueryRepresentationData(QueryRepresentation *qr, DocRepresentation *entry)
604 {
605 int i;
606 int lastPos;
607 QueryRepresentationOperand *opData;
608
609 for (i = 0; i < entry->data.query.nitem; i++)
610 {
611 if (entry->data.query.items[i]->type != QI_VAL)
612 continue;
613
614 opData = QR_GET_OPERAND_DATA(qr, entry->data.query.items[i]);
615
616 opData->operandexists = true;
617
618 if (opData->npos == 0)
619 {
620 lastPos = (opData->reverseinsert) ? (MAXQROPOS - 1) : 0;
621 opData->pos[lastPos] = entry->pos;
622 opData->npos++;
623 continue;
624 }
625
626 lastPos = opData->reverseinsert ?
627 (MAXQROPOS - opData->npos) :
628 (opData->npos - 1);
629
630 if (WEP_GETPOS(opData->pos[lastPos]) != WEP_GETPOS(entry->pos))
631 {
632 lastPos = opData->reverseinsert ?
633 (MAXQROPOS - 1 - opData->npos) :
634 (opData->npos);
635
636 opData->pos[lastPos] = entry->pos;
637 opData->npos++;
638 }
639 }
640 }
641
642 static bool
Cover(DocRepresentation * doc,int len,QueryRepresentation * qr,CoverExt * ext)643 Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext)
644 {
645 DocRepresentation *ptr;
646 int lastpos = ext->pos;
647 bool found = false;
648
649 /*
650 * since this function recurses, it could be driven to stack overflow.
651 * (though any decent compiler will optimize away the tail-recursion.
652 */
653 check_stack_depth();
654
655 resetQueryRepresentation(qr, false);
656
657 ext->p = INT_MAX;
658 ext->q = 0;
659 ptr = doc + ext->pos;
660
661 /* find upper bound of cover from current position, move up */
662 while (ptr - doc < len)
663 {
664 fillQueryRepresentationData(qr, ptr);
665
666 if (TS_execute(GETQUERY(qr->query), (void *) qr,
667 TS_EXEC_EMPTY, checkcondition_QueryOperand))
668 {
669 if (WEP_GETPOS(ptr->pos) > ext->q)
670 {
671 ext->q = WEP_GETPOS(ptr->pos);
672 ext->end = ptr;
673 lastpos = ptr - doc;
674 found = true;
675 }
676 break;
677 }
678 ptr++;
679 }
680
681 if (!found)
682 return false;
683
684 resetQueryRepresentation(qr, true);
685
686 ptr = doc + lastpos;
687
688 /* find lower bound of cover from found upper bound, move down */
689 while (ptr >= doc + ext->pos)
690 {
691 /*
692 * we scan doc from right to left, so pos info in reverse order!
693 */
694 fillQueryRepresentationData(qr, ptr);
695
696 if (TS_execute(GETQUERY(qr->query), (void *) qr,
697 TS_EXEC_CALC_NOT, checkcondition_QueryOperand))
698 {
699 if (WEP_GETPOS(ptr->pos) < ext->p)
700 {
701 ext->begin = ptr;
702 ext->p = WEP_GETPOS(ptr->pos);
703 }
704 break;
705 }
706 ptr--;
707 }
708
709 if (ext->p <= ext->q)
710 {
711 /*
712 * set position for next try to next lexeme after beginning of found
713 * cover
714 */
715 ext->pos = (ptr - doc) + 1;
716 return true;
717 }
718
719 ext->pos++;
720 return Cover(doc, len, qr, ext);
721 }
722
723 static DocRepresentation *
get_docrep(TSVector txt,QueryRepresentation * qr,int * doclen)724 get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
725 {
726 QueryItem *item = GETQUERY(qr->query);
727 WordEntry *entry,
728 *firstentry;
729 WordEntryPos *post;
730 int32 dimt, /* number of 'post' items */
731 j,
732 i,
733 nitem;
734 int len = qr->query->size * 4,
735 cur = 0;
736 DocRepresentation *doc;
737
738 doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len);
739
740 /*
741 * Iterate through query to make DocRepresentaion for words and it's
742 * entries satisfied by query
743 */
744 for (i = 0; i < qr->query->size; i++)
745 {
746 QueryOperand *curoperand;
747
748 if (item[i].type != QI_VAL)
749 continue;
750
751 curoperand = &item[i].qoperand;
752
753 firstentry = entry = find_wordentry(txt, qr->query, curoperand, &nitem);
754 if (!entry)
755 continue;
756
757 /* iterations over entries in tsvector */
758 while (entry - firstentry < nitem)
759 {
760 if (entry->haspos)
761 {
762 dimt = POSDATALEN(txt, entry);
763 post = POSDATAPTR(txt, entry);
764 }
765 else
766 {
767 /* ignore words without positions */
768 entry++;
769 continue;
770 }
771
772 while (cur + dimt >= len)
773 {
774 len *= 2;
775 doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len);
776 }
777
778 /* iterations over entry's positions */
779 for (j = 0; j < dimt; j++)
780 {
781 if (curoperand->weight == 0 ||
782 curoperand->weight & (1 << WEP_GETWEIGHT(post[j])))
783 {
784 doc[cur].pos = post[j];
785 doc[cur].data.map.entry = entry;
786 doc[cur].data.map.item = (QueryItem *) curoperand;
787 cur++;
788 }
789 }
790
791 entry++;
792 }
793 }
794
795 if (cur > 0)
796 {
797 DocRepresentation *rptr = doc + 1,
798 *wptr = doc,
799 storage;
800
801 /*
802 * Sort representation in ascending order by pos and entry
803 */
804 qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR);
805
806 /*
807 * Join QueryItem per WordEntry and it's position
808 */
809 storage.pos = doc->pos;
810 storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size);
811 storage.data.query.items[0] = doc->data.map.item;
812 storage.data.query.nitem = 1;
813
814 while (rptr - doc < cur)
815 {
816 if (rptr->pos == (rptr - 1)->pos &&
817 rptr->data.map.entry == (rptr - 1)->data.map.entry)
818 {
819 storage.data.query.items[storage.data.query.nitem] = rptr->data.map.item;
820 storage.data.query.nitem++;
821 }
822 else
823 {
824 *wptr = storage;
825 wptr++;
826 storage.pos = rptr->pos;
827 storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size);
828 storage.data.query.items[0] = rptr->data.map.item;
829 storage.data.query.nitem = 1;
830 }
831
832 rptr++;
833 }
834
835 *wptr = storage;
836 wptr++;
837
838 *doclen = wptr - doc;
839 return doc;
840 }
841
842 pfree(doc);
843 return NULL;
844 }
845
846 static float4
calc_rank_cd(const float4 * arrdata,TSVector txt,TSQuery query,int method)847 calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method)
848 {
849 DocRepresentation *doc;
850 int len,
851 i,
852 doclen = 0;
853 CoverExt ext;
854 double Wdoc = 0.0;
855 double invws[lengthof(weights)];
856 double SumDist = 0.0,
857 PrevExtPos = 0.0,
858 CurExtPos = 0.0;
859 int NExtent = 0;
860 QueryRepresentation qr;
861
862
863 for (i = 0; i < lengthof(weights); i++)
864 {
865 invws[i] = ((double) ((arrdata[i] >= 0) ? arrdata[i] : weights[i]));
866 if (invws[i] > 1.0)
867 ereport(ERROR,
868 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
869 errmsg("weight out of range")));
870 invws[i] = 1.0 / invws[i];
871 }
872
873 qr.query = query;
874 qr.operandData = (QueryRepresentationOperand *)
875 palloc0(sizeof(QueryRepresentationOperand) * query->size);
876
877 doc = get_docrep(txt, &qr, &doclen);
878 if (!doc)
879 {
880 pfree(qr.operandData);
881 return 0.0;
882 }
883
884 MemSet(&ext, 0, sizeof(CoverExt));
885 while (Cover(doc, doclen, &qr, &ext))
886 {
887 double Cpos = 0.0;
888 double InvSum = 0.0;
889 int nNoise;
890 DocRepresentation *ptr = ext.begin;
891
892 while (ptr <= ext.end)
893 {
894 InvSum += invws[WEP_GETWEIGHT(ptr->pos)];
895 ptr++;
896 }
897
898 Cpos = ((double) (ext.end - ext.begin + 1)) / InvSum;
899
900 /*
901 * if doc are big enough then ext.q may be equal to ext.p due to limit
902 * of positional information. In this case we approximate number of
903 * noise word as half cover's length
904 */
905 nNoise = (ext.q - ext.p) - (ext.end - ext.begin);
906 if (nNoise < 0)
907 nNoise = (ext.end - ext.begin) / 2;
908 Wdoc += Cpos / ((double) (1 + nNoise));
909
910 CurExtPos = ((double) (ext.q + ext.p)) / 2.0;
911 if (NExtent > 0 && CurExtPos > PrevExtPos /* prevent division by
912 * zero in a case of
913 * multiple lexize */ )
914 SumDist += 1.0 / (CurExtPos - PrevExtPos);
915
916 PrevExtPos = CurExtPos;
917 NExtent++;
918 }
919
920 if ((method & RANK_NORM_LOGLENGTH) && txt->size > 0)
921 Wdoc /= log((double) (cnt_length(txt) + 1));
922
923 if (method & RANK_NORM_LENGTH)
924 {
925 len = cnt_length(txt);
926 if (len > 0)
927 Wdoc /= (double) len;
928 }
929
930 if ((method & RANK_NORM_EXTDIST) && NExtent > 0 && SumDist > 0)
931 Wdoc /= ((double) NExtent) / SumDist;
932
933 if ((method & RANK_NORM_UNIQ) && txt->size > 0)
934 Wdoc /= (double) (txt->size);
935
936 if ((method & RANK_NORM_LOGUNIQ) && txt->size > 0)
937 Wdoc /= log((double) (txt->size + 1)) / log(2.0);
938
939 if (method & RANK_NORM_RDIVRPLUS1)
940 Wdoc /= (Wdoc + 1);
941
942 pfree(doc);
943
944 pfree(qr.operandData);
945
946 return (float4) Wdoc;
947 }
948
949 Datum
ts_rankcd_wttf(PG_FUNCTION_ARGS)950 ts_rankcd_wttf(PG_FUNCTION_ARGS)
951 {
952 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
953 TSVector txt = PG_GETARG_TSVECTOR(1);
954 TSQuery query = PG_GETARG_TSQUERY(2);
955 int method = PG_GETARG_INT32(3);
956 float res;
957
958 res = calc_rank_cd(getWeights(win), txt, query, method);
959
960 PG_FREE_IF_COPY(win, 0);
961 PG_FREE_IF_COPY(txt, 1);
962 PG_FREE_IF_COPY(query, 2);
963 PG_RETURN_FLOAT4(res);
964 }
965
966 Datum
ts_rankcd_wtt(PG_FUNCTION_ARGS)967 ts_rankcd_wtt(PG_FUNCTION_ARGS)
968 {
969 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
970 TSVector txt = PG_GETARG_TSVECTOR(1);
971 TSQuery query = PG_GETARG_TSQUERY(2);
972 float res;
973
974 res = calc_rank_cd(getWeights(win), txt, query, DEF_NORM_METHOD);
975
976 PG_FREE_IF_COPY(win, 0);
977 PG_FREE_IF_COPY(txt, 1);
978 PG_FREE_IF_COPY(query, 2);
979 PG_RETURN_FLOAT4(res);
980 }
981
982 Datum
ts_rankcd_ttf(PG_FUNCTION_ARGS)983 ts_rankcd_ttf(PG_FUNCTION_ARGS)
984 {
985 TSVector txt = PG_GETARG_TSVECTOR(0);
986 TSQuery query = PG_GETARG_TSQUERY(1);
987 int method = PG_GETARG_INT32(2);
988 float res;
989
990 res = calc_rank_cd(getWeights(NULL), txt, query, method);
991
992 PG_FREE_IF_COPY(txt, 0);
993 PG_FREE_IF_COPY(query, 1);
994 PG_RETURN_FLOAT4(res);
995 }
996
997 Datum
ts_rankcd_tt(PG_FUNCTION_ARGS)998 ts_rankcd_tt(PG_FUNCTION_ARGS)
999 {
1000 TSVector txt = PG_GETARG_TSVECTOR(0);
1001 TSQuery query = PG_GETARG_TSQUERY(1);
1002 float res;
1003
1004 res = calc_rank_cd(getWeights(NULL), txt, query, DEF_NORM_METHOD);
1005
1006 PG_FREE_IF_COPY(txt, 0);
1007 PG_FREE_IF_COPY(query, 1);
1008 PG_RETURN_FLOAT4(res);
1009 }
1010