1 /*-------------------------------------------------------------------------
2  *
3  * tsrank.c
4  *		rank tsvector by tsquery
5  *
6  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *	  src/backend/utils/adt/tsrank.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15 
16 #include <limits.h>
17 #include <math.h>
18 
19 #include "miscadmin.h"
20 #include "tsearch/ts_utils.h"
21 #include "utils/array.h"
22 #include "utils/builtins.h"
23 
24 static const float weights[] = {0.1f, 0.2f, 0.4f, 1.0f};
25 
26 #define wpos(wep)	( w[ WEP_GETWEIGHT(wep) ] )
27 
28 #define RANK_NO_NORM			0x00
29 #define RANK_NORM_LOGLENGTH		0x01
30 #define RANK_NORM_LENGTH		0x02
31 #define RANK_NORM_EXTDIST		0x04
32 #define RANK_NORM_UNIQ			0x08
33 #define RANK_NORM_LOGUNIQ		0x10
34 #define RANK_NORM_RDIVRPLUS1	0x20
35 #define DEF_NORM_METHOD			RANK_NO_NORM
36 
37 static float calc_rank_or(const float *w, TSVector t, TSQuery q);
38 static float calc_rank_and(const float *w, TSVector t, TSQuery q);
39 
40 /*
41  * Returns a weight of a word collocation
42  */
43 static float4
word_distance(int32 w)44 word_distance(int32 w)
45 {
46 	if (w > 100)
47 		return 1e-30f;
48 
49 	return 1.0 / (1.005 + 0.05 * exp(((float4) w) / 1.5 - 2));
50 }
51 
52 static int
cnt_length(TSVector t)53 cnt_length(TSVector t)
54 {
55 	WordEntry  *ptr = ARRPTR(t),
56 			   *end = (WordEntry *) STRPTR(t);
57 	int			len = 0;
58 
59 	while (ptr < end)
60 	{
61 		int			clen = POSDATALEN(t, ptr);
62 
63 		if (clen == 0)
64 			len += 1;
65 		else
66 			len += clen;
67 
68 		ptr++;
69 	}
70 
71 	return len;
72 }
73 
74 
75 #define WordECompareQueryItem(e,q,p,i,m) \
76 	tsCompareString((q) + (i)->distance, (i)->length,	\
77 					(e) + (p)->pos, (p)->len, (m))
78 
79 
80 /*
81  * Returns a pointer to a WordEntry's array corresponding to 'item' from
82  * tsvector 't'. 'q' is the TSQuery containing 'item'.
83  * Returns NULL if not found.
84  */
85 static WordEntry *
find_wordentry(TSVector t,TSQuery q,QueryOperand * item,int32 * nitem)86 find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem)
87 {
88 	WordEntry  *StopLow = ARRPTR(t);
89 	WordEntry  *StopHigh = (WordEntry *) STRPTR(t);
90 	WordEntry  *StopMiddle = StopHigh;
91 	int			difference;
92 
93 	*nitem = 0;
94 
95 	/* Loop invariant: StopLow <= item < StopHigh */
96 	while (StopLow < StopHigh)
97 	{
98 		StopMiddle = StopLow + (StopHigh - StopLow) / 2;
99 		difference = WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, false);
100 		if (difference == 0)
101 		{
102 			StopHigh = StopMiddle;
103 			*nitem = 1;
104 			break;
105 		}
106 		else if (difference > 0)
107 			StopLow = StopMiddle + 1;
108 		else
109 			StopHigh = StopMiddle;
110 	}
111 
112 	if (item->prefix)
113 	{
114 		if (StopLow >= StopHigh)
115 			StopMiddle = StopHigh;
116 
117 		*nitem = 0;
118 
119 		while (StopMiddle < (WordEntry *) STRPTR(t) &&
120 			   WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, true) == 0)
121 		{
122 			(*nitem)++;
123 			StopMiddle++;
124 		}
125 	}
126 
127 	return (*nitem > 0) ? StopHigh : NULL;
128 }
129 
130 
131 /*
132  * sort QueryOperands by (length, word)
133  */
134 static int
compareQueryOperand(const void * a,const void * b,void * arg)135 compareQueryOperand(const void *a, const void *b, void *arg)
136 {
137 	char	   *operand = (char *) arg;
138 	QueryOperand *qa = (*(QueryOperand *const *) a);
139 	QueryOperand *qb = (*(QueryOperand *const *) b);
140 
141 	return tsCompareString(operand + qa->distance, qa->length,
142 						   operand + qb->distance, qb->length,
143 						   false);
144 }
145 
146 /*
147  * Returns a sorted, de-duplicated array of QueryOperands in a query.
148  * The returned QueryOperands are pointers to the original QueryOperands
149  * in the query.
150  *
151  * Length of the returned array is stored in *size
152  */
153 static QueryOperand **
SortAndUniqItems(TSQuery q,int * size)154 SortAndUniqItems(TSQuery q, int *size)
155 {
156 	char	   *operand = GETOPERAND(q);
157 	QueryItem  *item = GETQUERY(q);
158 	QueryOperand **res,
159 			  **ptr,
160 			  **prevptr;
161 
162 	ptr = res = (QueryOperand **) palloc(sizeof(QueryOperand *) * *size);
163 
164 	/* Collect all operands from the tree to res */
165 	while ((*size)--)
166 	{
167 		if (item->type == QI_VAL)
168 		{
169 			*ptr = (QueryOperand *) item;
170 			ptr++;
171 		}
172 		item++;
173 	}
174 
175 	*size = ptr - res;
176 	if (*size < 2)
177 		return res;
178 
179 	qsort_arg(res, *size, sizeof(QueryOperand *), compareQueryOperand, (void *) operand);
180 
181 	ptr = res + 1;
182 	prevptr = res;
183 
184 	/* remove duplicates */
185 	while (ptr - res < *size)
186 	{
187 		if (compareQueryOperand((void *) ptr, (void *) prevptr, (void *) operand) != 0)
188 		{
189 			prevptr++;
190 			*prevptr = *ptr;
191 		}
192 		ptr++;
193 	}
194 
195 	*size = prevptr + 1 - res;
196 	return res;
197 }
198 
199 static float
calc_rank_and(const float * w,TSVector t,TSQuery q)200 calc_rank_and(const float *w, TSVector t, TSQuery q)
201 {
202 	WordEntryPosVector **pos;
203 	WordEntryPosVector1 posnull;
204 	WordEntryPosVector *POSNULL;
205 	int			i,
206 				k,
207 				l,
208 				p;
209 	WordEntry  *entry,
210 			   *firstentry;
211 	WordEntryPos *post,
212 			   *ct;
213 	int32		dimt,
214 				lenct,
215 				dist,
216 				nitem;
217 	float		res = -1.0;
218 	QueryOperand **item;
219 	int			size = q->size;
220 
221 	item = SortAndUniqItems(q, &size);
222 	if (size < 2)
223 	{
224 		pfree(item);
225 		return calc_rank_or(w, t, q);
226 	}
227 	pos = (WordEntryPosVector **) palloc0(sizeof(WordEntryPosVector *) * q->size);
228 
229 	/* A dummy WordEntryPos array to use when haspos is false */
230 	posnull.npos = 1;
231 	posnull.pos[0] = 0;
232 	WEP_SETPOS(posnull.pos[0], MAXENTRYPOS - 1);
233 	POSNULL = (WordEntryPosVector *) &posnull;
234 
235 	for (i = 0; i < size; i++)
236 	{
237 		firstentry = entry = find_wordentry(t, q, item[i], &nitem);
238 		if (!entry)
239 			continue;
240 
241 		while (entry - firstentry < nitem)
242 		{
243 			if (entry->haspos)
244 				pos[i] = _POSVECPTR(t, entry);
245 			else
246 				pos[i] = POSNULL;
247 
248 			dimt = pos[i]->npos;
249 			post = pos[i]->pos;
250 			for (k = 0; k < i; k++)
251 			{
252 				if (!pos[k])
253 					continue;
254 				lenct = pos[k]->npos;
255 				ct = pos[k]->pos;
256 				for (l = 0; l < dimt; l++)
257 				{
258 					for (p = 0; p < lenct; p++)
259 					{
260 						dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p]));
261 						if (dist || (dist == 0 && (pos[i] == POSNULL || pos[k] == POSNULL)))
262 						{
263 							float		curw;
264 
265 							if (!dist)
266 								dist = MAXENTRYPOS;
267 							curw = sqrt(wpos(post[l]) * wpos(ct[p]) * word_distance(dist));
268 							res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw);
269 						}
270 					}
271 				}
272 			}
273 
274 			entry++;
275 		}
276 	}
277 	pfree(pos);
278 	pfree(item);
279 	return res;
280 }
281 
282 static float
calc_rank_or(const float * w,TSVector t,TSQuery q)283 calc_rank_or(const float *w, TSVector t, TSQuery q)
284 {
285 	WordEntry  *entry,
286 			   *firstentry;
287 	WordEntryPosVector1 posnull;
288 	WordEntryPos *post;
289 	int32		dimt,
290 				j,
291 				i,
292 				nitem;
293 	float		res = 0.0;
294 	QueryOperand **item;
295 	int			size = q->size;
296 
297 	/* A dummy WordEntryPos array to use when haspos is false */
298 	posnull.npos = 1;
299 	posnull.pos[0] = 0;
300 
301 	item = SortAndUniqItems(q, &size);
302 
303 	for (i = 0; i < size; i++)
304 	{
305 		float		resj,
306 					wjm;
307 		int32		jm;
308 
309 		firstentry = entry = find_wordentry(t, q, item[i], &nitem);
310 		if (!entry)
311 			continue;
312 
313 		while (entry - firstentry < nitem)
314 		{
315 			if (entry->haspos)
316 			{
317 				dimt = POSDATALEN(t, entry);
318 				post = POSDATAPTR(t, entry);
319 			}
320 			else
321 			{
322 				dimt = posnull.npos;
323 				post = posnull.pos;
324 			}
325 
326 			resj = 0.0;
327 			wjm = -1.0;
328 			jm = 0;
329 			for (j = 0; j < dimt; j++)
330 			{
331 				resj = resj + wpos(post[j]) / ((j + 1) * (j + 1));
332 				if (wpos(post[j]) > wjm)
333 				{
334 					wjm = wpos(post[j]);
335 					jm = j;
336 				}
337 			}
338 /*
339 			limit (sum(1/i^2),i=1,inf) = pi^2/6
340 			resj = sum(wi/i^2),i=1,noccurence,
341 			wi - should be sorted desc,
342 			don't sort for now, just choose maximum weight. This should be corrected
343 			Oleg Bartunov
344 */
345 			res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685;
346 
347 			entry++;
348 		}
349 	}
350 	if (size > 0)
351 		res = res / size;
352 	pfree(item);
353 	return res;
354 }
355 
356 static float
calc_rank(const float * w,TSVector t,TSQuery q,int32 method)357 calc_rank(const float *w, TSVector t, TSQuery q, int32 method)
358 {
359 	QueryItem  *item = GETQUERY(q);
360 	float		res = 0.0;
361 	int			len;
362 
363 	if (!t->size || !q->size)
364 		return 0.0;
365 
366 	/* XXX: What about NOT? */
367 	res = (item->type == QI_OPR && (item->qoperator.oper == OP_AND ||
368 									item->qoperator.oper == OP_PHRASE)) ?
369 		calc_rank_and(w, t, q) :
370 		calc_rank_or(w, t, q);
371 
372 	if (res < 0)
373 		res = 1e-20f;
374 
375 	if ((method & RANK_NORM_LOGLENGTH) && t->size > 0)
376 		res /= log((double) (cnt_length(t) + 1)) / log(2.0);
377 
378 	if (method & RANK_NORM_LENGTH)
379 	{
380 		len = cnt_length(t);
381 		if (len > 0)
382 			res /= (float) len;
383 	}
384 
385 	/* RANK_NORM_EXTDIST not applicable */
386 
387 	if ((method & RANK_NORM_UNIQ) && t->size > 0)
388 		res /= (float) (t->size);
389 
390 	if ((method & RANK_NORM_LOGUNIQ) && t->size > 0)
391 		res /= log((double) (t->size + 1)) / log(2.0);
392 
393 	if (method & RANK_NORM_RDIVRPLUS1)
394 		res /= (res + 1);
395 
396 	return res;
397 }
398 
399 static const float *
getWeights(ArrayType * win)400 getWeights(ArrayType *win)
401 {
402 	static float ws[lengthof(weights)];
403 	int			i;
404 	float4	   *arrdata;
405 
406 	if (win == NULL)
407 		return weights;
408 
409 	if (ARR_NDIM(win) != 1)
410 		ereport(ERROR,
411 				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
412 				 errmsg("array of weight must be one-dimensional")));
413 
414 	if (ArrayGetNItems(ARR_NDIM(win), ARR_DIMS(win)) < lengthof(weights))
415 		ereport(ERROR,
416 				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
417 				 errmsg("array of weight is too short")));
418 
419 	if (array_contains_nulls(win))
420 		ereport(ERROR,
421 				(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
422 				 errmsg("array of weight must not contain nulls")));
423 
424 	arrdata = (float4 *) ARR_DATA_PTR(win);
425 	for (i = 0; i < lengthof(weights); i++)
426 	{
427 		ws[i] = (arrdata[i] >= 0) ? arrdata[i] : weights[i];
428 		if (ws[i] > 1.0)
429 			ereport(ERROR,
430 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
431 					 errmsg("weight out of range")));
432 	}
433 
434 	return ws;
435 }
436 
437 Datum
ts_rank_wttf(PG_FUNCTION_ARGS)438 ts_rank_wttf(PG_FUNCTION_ARGS)
439 {
440 	ArrayType  *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
441 	TSVector	txt = PG_GETARG_TSVECTOR(1);
442 	TSQuery		query = PG_GETARG_TSQUERY(2);
443 	int			method = PG_GETARG_INT32(3);
444 	float		res;
445 
446 	res = calc_rank(getWeights(win), txt, query, method);
447 
448 	PG_FREE_IF_COPY(win, 0);
449 	PG_FREE_IF_COPY(txt, 1);
450 	PG_FREE_IF_COPY(query, 2);
451 	PG_RETURN_FLOAT4(res);
452 }
453 
454 Datum
ts_rank_wtt(PG_FUNCTION_ARGS)455 ts_rank_wtt(PG_FUNCTION_ARGS)
456 {
457 	ArrayType  *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
458 	TSVector	txt = PG_GETARG_TSVECTOR(1);
459 	TSQuery		query = PG_GETARG_TSQUERY(2);
460 	float		res;
461 
462 	res = calc_rank(getWeights(win), txt, query, DEF_NORM_METHOD);
463 
464 	PG_FREE_IF_COPY(win, 0);
465 	PG_FREE_IF_COPY(txt, 1);
466 	PG_FREE_IF_COPY(query, 2);
467 	PG_RETURN_FLOAT4(res);
468 }
469 
470 Datum
ts_rank_ttf(PG_FUNCTION_ARGS)471 ts_rank_ttf(PG_FUNCTION_ARGS)
472 {
473 	TSVector	txt = PG_GETARG_TSVECTOR(0);
474 	TSQuery		query = PG_GETARG_TSQUERY(1);
475 	int			method = PG_GETARG_INT32(2);
476 	float		res;
477 
478 	res = calc_rank(getWeights(NULL), txt, query, method);
479 
480 	PG_FREE_IF_COPY(txt, 0);
481 	PG_FREE_IF_COPY(query, 1);
482 	PG_RETURN_FLOAT4(res);
483 }
484 
485 Datum
ts_rank_tt(PG_FUNCTION_ARGS)486 ts_rank_tt(PG_FUNCTION_ARGS)
487 {
488 	TSVector	txt = PG_GETARG_TSVECTOR(0);
489 	TSQuery		query = PG_GETARG_TSQUERY(1);
490 	float		res;
491 
492 	res = calc_rank(getWeights(NULL), txt, query, DEF_NORM_METHOD);
493 
494 	PG_FREE_IF_COPY(txt, 0);
495 	PG_FREE_IF_COPY(query, 1);
496 	PG_RETURN_FLOAT4(res);
497 }
498 
499 typedef struct
500 {
501 	union
502 	{
503 		struct
504 		{						/* compiled doc representation */
505 			QueryItem **items;
506 			int16		nitem;
507 		}			query;
508 		struct
509 		{						/* struct is used for preparing doc
510 								 * representation */
511 			QueryItem  *item;
512 			WordEntry  *entry;
513 		}			map;
514 	}			data;
515 	WordEntryPos pos;
516 } DocRepresentation;
517 
518 static int
compareDocR(const void * va,const void * vb)519 compareDocR(const void *va, const void *vb)
520 {
521 	const DocRepresentation *a = (const DocRepresentation *) va;
522 	const DocRepresentation *b = (const DocRepresentation *) vb;
523 
524 	if (WEP_GETPOS(a->pos) == WEP_GETPOS(b->pos))
525 	{
526 		if (WEP_GETWEIGHT(a->pos) == WEP_GETWEIGHT(b->pos))
527 		{
528 			if (a->data.map.entry == b->data.map.entry)
529 				return 0;
530 
531 			return (a->data.map.entry > b->data.map.entry) ? 1 : -1;
532 		}
533 
534 		return (WEP_GETWEIGHT(a->pos) > WEP_GETWEIGHT(b->pos)) ? 1 : -1;
535 	}
536 
537 	return (WEP_GETPOS(a->pos) > WEP_GETPOS(b->pos)) ? 1 : -1;
538 }
539 
540 #define MAXQROPOS	MAXENTRYPOS
541 typedef struct
542 {
543 	bool		operandexists;
544 	bool		reverseinsert;	/* indicates insert order, true means
545 								 * descending order */
546 	uint32		npos;
547 	WordEntryPos pos[MAXQROPOS];
548 } QueryRepresentationOperand;
549 
550 typedef struct
551 {
552 	TSQuery		query;
553 	QueryRepresentationOperand *operandData;
554 } QueryRepresentation;
555 
556 #define QR_GET_OPERAND_DATA(q, v) \
557 	( (q)->operandData + (((QueryItem*)(v)) - GETQUERY((q)->query)) )
558 
559 /*
560  * TS_execute callback for matching a tsquery operand to QueryRepresentation
561  */
562 static TSTernaryValue
checkcondition_QueryOperand(void * checkval,QueryOperand * val,ExecPhraseData * data)563 checkcondition_QueryOperand(void *checkval, QueryOperand *val,
564 							ExecPhraseData *data)
565 {
566 	QueryRepresentation *qr = (QueryRepresentation *) checkval;
567 	QueryRepresentationOperand *opData = QR_GET_OPERAND_DATA(qr, val);
568 
569 	if (!opData->operandexists)
570 		return TS_NO;
571 
572 	if (data)
573 	{
574 		data->npos = opData->npos;
575 		data->pos = opData->pos;
576 		if (opData->reverseinsert)
577 			data->pos += MAXQROPOS - opData->npos;
578 	}
579 
580 	return TS_YES;
581 }
582 
583 typedef struct
584 {
585 	int			pos;
586 	int			p;
587 	int			q;
588 	DocRepresentation *begin;
589 	DocRepresentation *end;
590 } CoverExt;
591 
592 static void
resetQueryRepresentation(QueryRepresentation * qr,bool reverseinsert)593 resetQueryRepresentation(QueryRepresentation *qr, bool reverseinsert)
594 {
595 	int			i;
596 
597 	for (i = 0; i < qr->query->size; i++)
598 	{
599 		qr->operandData[i].operandexists = false;
600 		qr->operandData[i].reverseinsert = reverseinsert;
601 		qr->operandData[i].npos = 0;
602 	}
603 }
604 
605 static void
fillQueryRepresentationData(QueryRepresentation * qr,DocRepresentation * entry)606 fillQueryRepresentationData(QueryRepresentation *qr, DocRepresentation *entry)
607 {
608 	int			i;
609 	int			lastPos;
610 	QueryRepresentationOperand *opData;
611 
612 	for (i = 0; i < entry->data.query.nitem; i++)
613 	{
614 		if (entry->data.query.items[i]->type != QI_VAL)
615 			continue;
616 
617 		opData = QR_GET_OPERAND_DATA(qr, entry->data.query.items[i]);
618 
619 		opData->operandexists = true;
620 
621 		if (opData->npos == 0)
622 		{
623 			lastPos = (opData->reverseinsert) ? (MAXQROPOS - 1) : 0;
624 			opData->pos[lastPos] = entry->pos;
625 			opData->npos++;
626 			continue;
627 		}
628 
629 		lastPos = opData->reverseinsert ?
630 			(MAXQROPOS - opData->npos) :
631 			(opData->npos - 1);
632 
633 		if (WEP_GETPOS(opData->pos[lastPos]) != WEP_GETPOS(entry->pos))
634 		{
635 			lastPos = opData->reverseinsert ?
636 				(MAXQROPOS - 1 - opData->npos) :
637 				(opData->npos);
638 
639 			opData->pos[lastPos] = entry->pos;
640 			opData->npos++;
641 		}
642 	}
643 }
644 
645 static bool
Cover(DocRepresentation * doc,int len,QueryRepresentation * qr,CoverExt * ext)646 Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext)
647 {
648 	DocRepresentation *ptr;
649 	int			lastpos = ext->pos;
650 	bool		found = false;
651 
652 	/*
653 	 * since this function recurses, it could be driven to stack overflow.
654 	 * (though any decent compiler will optimize away the tail-recursion.
655 	 */
656 	check_stack_depth();
657 
658 	resetQueryRepresentation(qr, false);
659 
660 	ext->p = INT_MAX;
661 	ext->q = 0;
662 	ptr = doc + ext->pos;
663 
664 	/* find upper bound of cover from current position, move up */
665 	while (ptr - doc < len)
666 	{
667 		fillQueryRepresentationData(qr, ptr);
668 
669 		if (TS_execute(GETQUERY(qr->query), (void *) qr,
670 					   TS_EXEC_EMPTY, checkcondition_QueryOperand))
671 		{
672 			if (WEP_GETPOS(ptr->pos) > ext->q)
673 			{
674 				ext->q = WEP_GETPOS(ptr->pos);
675 				ext->end = ptr;
676 				lastpos = ptr - doc;
677 				found = true;
678 			}
679 			break;
680 		}
681 		ptr++;
682 	}
683 
684 	if (!found)
685 		return false;
686 
687 	resetQueryRepresentation(qr, true);
688 
689 	ptr = doc + lastpos;
690 
691 	/* find lower bound of cover from found upper bound, move down */
692 	while (ptr >= doc + ext->pos)
693 	{
694 		/*
695 		 * we scan doc from right to left, so pos info in reverse order!
696 		 */
697 		fillQueryRepresentationData(qr, ptr);
698 
699 		if (TS_execute(GETQUERY(qr->query), (void *) qr,
700 					   TS_EXEC_EMPTY, checkcondition_QueryOperand))
701 		{
702 			if (WEP_GETPOS(ptr->pos) < ext->p)
703 			{
704 				ext->begin = ptr;
705 				ext->p = WEP_GETPOS(ptr->pos);
706 			}
707 			break;
708 		}
709 		ptr--;
710 	}
711 
712 	if (ext->p <= ext->q)
713 	{
714 		/*
715 		 * set position for next try to next lexeme after beginning of found
716 		 * cover
717 		 */
718 		ext->pos = (ptr - doc) + 1;
719 		return true;
720 	}
721 
722 	ext->pos++;
723 	return Cover(doc, len, qr, ext);
724 }
725 
726 static DocRepresentation *
get_docrep(TSVector txt,QueryRepresentation * qr,int * doclen)727 get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
728 {
729 	QueryItem  *item = GETQUERY(qr->query);
730 	WordEntry  *entry,
731 			   *firstentry;
732 	WordEntryPos *post;
733 	int32		dimt,			/* number of 'post' items */
734 				j,
735 				i,
736 				nitem;
737 	int			len = qr->query->size * 4,
738 				cur = 0;
739 	DocRepresentation *doc;
740 
741 	doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len);
742 
743 	/*
744 	 * Iterate through query to make DocRepresentation for words and it's
745 	 * entries satisfied by query
746 	 */
747 	for (i = 0; i < qr->query->size; i++)
748 	{
749 		QueryOperand *curoperand;
750 
751 		if (item[i].type != QI_VAL)
752 			continue;
753 
754 		curoperand = &item[i].qoperand;
755 
756 		firstentry = entry = find_wordentry(txt, qr->query, curoperand, &nitem);
757 		if (!entry)
758 			continue;
759 
760 		/* iterations over entries in tsvector */
761 		while (entry - firstentry < nitem)
762 		{
763 			if (entry->haspos)
764 			{
765 				dimt = POSDATALEN(txt, entry);
766 				post = POSDATAPTR(txt, entry);
767 			}
768 			else
769 			{
770 				/* ignore words without positions */
771 				entry++;
772 				continue;
773 			}
774 
775 			while (cur + dimt >= len)
776 			{
777 				len *= 2;
778 				doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len);
779 			}
780 
781 			/* iterations over entry's positions */
782 			for (j = 0; j < dimt; j++)
783 			{
784 				if (curoperand->weight == 0 ||
785 					curoperand->weight & (1 << WEP_GETWEIGHT(post[j])))
786 				{
787 					doc[cur].pos = post[j];
788 					doc[cur].data.map.entry = entry;
789 					doc[cur].data.map.item = (QueryItem *) curoperand;
790 					cur++;
791 				}
792 			}
793 
794 			entry++;
795 		}
796 	}
797 
798 	if (cur > 0)
799 	{
800 		DocRepresentation *rptr = doc + 1,
801 				   *wptr = doc,
802 					storage;
803 
804 		/*
805 		 * Sort representation in ascending order by pos and entry
806 		 */
807 		qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR);
808 
809 		/*
810 		 * Join QueryItem per WordEntry and it's position
811 		 */
812 		storage.pos = doc->pos;
813 		storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size);
814 		storage.data.query.items[0] = doc->data.map.item;
815 		storage.data.query.nitem = 1;
816 
817 		while (rptr - doc < cur)
818 		{
819 			if (rptr->pos == (rptr - 1)->pos &&
820 				rptr->data.map.entry == (rptr - 1)->data.map.entry)
821 			{
822 				storage.data.query.items[storage.data.query.nitem] = rptr->data.map.item;
823 				storage.data.query.nitem++;
824 			}
825 			else
826 			{
827 				*wptr = storage;
828 				wptr++;
829 				storage.pos = rptr->pos;
830 				storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size);
831 				storage.data.query.items[0] = rptr->data.map.item;
832 				storage.data.query.nitem = 1;
833 			}
834 
835 			rptr++;
836 		}
837 
838 		*wptr = storage;
839 		wptr++;
840 
841 		*doclen = wptr - doc;
842 		return doc;
843 	}
844 
845 	pfree(doc);
846 	return NULL;
847 }
848 
849 static float4
calc_rank_cd(const float4 * arrdata,TSVector txt,TSQuery query,int method)850 calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method)
851 {
852 	DocRepresentation *doc;
853 	int			len,
854 				i,
855 				doclen = 0;
856 	CoverExt	ext;
857 	double		Wdoc = 0.0;
858 	double		invws[lengthof(weights)];
859 	double		SumDist = 0.0,
860 				PrevExtPos = 0.0;
861 	int			NExtent = 0;
862 	QueryRepresentation qr;
863 
864 
865 	for (i = 0; i < lengthof(weights); i++)
866 	{
867 		invws[i] = ((double) ((arrdata[i] >= 0) ? arrdata[i] : weights[i]));
868 		if (invws[i] > 1.0)
869 			ereport(ERROR,
870 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
871 					 errmsg("weight out of range")));
872 		invws[i] = 1.0 / invws[i];
873 	}
874 
875 	qr.query = query;
876 	qr.operandData = (QueryRepresentationOperand *)
877 		palloc0(sizeof(QueryRepresentationOperand) * query->size);
878 
879 	doc = get_docrep(txt, &qr, &doclen);
880 	if (!doc)
881 	{
882 		pfree(qr.operandData);
883 		return 0.0;
884 	}
885 
886 	MemSet(&ext, 0, sizeof(CoverExt));
887 	while (Cover(doc, doclen, &qr, &ext))
888 	{
889 		double		Cpos = 0.0;
890 		double		InvSum = 0.0;
891 		double		CurExtPos;
892 		int			nNoise;
893 		DocRepresentation *ptr = ext.begin;
894 
895 		while (ptr <= ext.end)
896 		{
897 			InvSum += invws[WEP_GETWEIGHT(ptr->pos)];
898 			ptr++;
899 		}
900 
901 		Cpos = ((double) (ext.end - ext.begin + 1)) / InvSum;
902 
903 		/*
904 		 * if doc are big enough then ext.q may be equal to ext.p due to limit
905 		 * of positional information. In this case we approximate number of
906 		 * noise word as half cover's length
907 		 */
908 		nNoise = (ext.q - ext.p) - (ext.end - ext.begin);
909 		if (nNoise < 0)
910 			nNoise = (ext.end - ext.begin) / 2;
911 		Wdoc += Cpos / ((double) (1 + nNoise));
912 
913 		CurExtPos = ((double) (ext.q + ext.p)) / 2.0;
914 		if (NExtent > 0 && CurExtPos > PrevExtPos	/* prevent division by
915 													 * zero in a case of
916 			  * multiple lexize */ )
917 			SumDist += 1.0 / (CurExtPos - PrevExtPos);
918 
919 		PrevExtPos = CurExtPos;
920 		NExtent++;
921 	}
922 
923 	if ((method & RANK_NORM_LOGLENGTH) && txt->size > 0)
924 		Wdoc /= log((double) (cnt_length(txt) + 1));
925 
926 	if (method & RANK_NORM_LENGTH)
927 	{
928 		len = cnt_length(txt);
929 		if (len > 0)
930 			Wdoc /= (double) len;
931 	}
932 
933 	if ((method & RANK_NORM_EXTDIST) && NExtent > 0 && SumDist > 0)
934 		Wdoc /= ((double) NExtent) / SumDist;
935 
936 	if ((method & RANK_NORM_UNIQ) && txt->size > 0)
937 		Wdoc /= (double) (txt->size);
938 
939 	if ((method & RANK_NORM_LOGUNIQ) && txt->size > 0)
940 		Wdoc /= log((double) (txt->size + 1)) / log(2.0);
941 
942 	if (method & RANK_NORM_RDIVRPLUS1)
943 		Wdoc /= (Wdoc + 1);
944 
945 	pfree(doc);
946 
947 	pfree(qr.operandData);
948 
949 	return (float4) Wdoc;
950 }
951 
952 Datum
ts_rankcd_wttf(PG_FUNCTION_ARGS)953 ts_rankcd_wttf(PG_FUNCTION_ARGS)
954 {
955 	ArrayType  *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
956 	TSVector	txt = PG_GETARG_TSVECTOR(1);
957 	TSQuery		query = PG_GETARG_TSQUERY(2);
958 	int			method = PG_GETARG_INT32(3);
959 	float		res;
960 
961 	res = calc_rank_cd(getWeights(win), txt, query, method);
962 
963 	PG_FREE_IF_COPY(win, 0);
964 	PG_FREE_IF_COPY(txt, 1);
965 	PG_FREE_IF_COPY(query, 2);
966 	PG_RETURN_FLOAT4(res);
967 }
968 
969 Datum
ts_rankcd_wtt(PG_FUNCTION_ARGS)970 ts_rankcd_wtt(PG_FUNCTION_ARGS)
971 {
972 	ArrayType  *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
973 	TSVector	txt = PG_GETARG_TSVECTOR(1);
974 	TSQuery		query = PG_GETARG_TSQUERY(2);
975 	float		res;
976 
977 	res = calc_rank_cd(getWeights(win), txt, query, DEF_NORM_METHOD);
978 
979 	PG_FREE_IF_COPY(win, 0);
980 	PG_FREE_IF_COPY(txt, 1);
981 	PG_FREE_IF_COPY(query, 2);
982 	PG_RETURN_FLOAT4(res);
983 }
984 
985 Datum
ts_rankcd_ttf(PG_FUNCTION_ARGS)986 ts_rankcd_ttf(PG_FUNCTION_ARGS)
987 {
988 	TSVector	txt = PG_GETARG_TSVECTOR(0);
989 	TSQuery		query = PG_GETARG_TSQUERY(1);
990 	int			method = PG_GETARG_INT32(2);
991 	float		res;
992 
993 	res = calc_rank_cd(getWeights(NULL), txt, query, method);
994 
995 	PG_FREE_IF_COPY(txt, 0);
996 	PG_FREE_IF_COPY(query, 1);
997 	PG_RETURN_FLOAT4(res);
998 }
999 
1000 Datum
ts_rankcd_tt(PG_FUNCTION_ARGS)1001 ts_rankcd_tt(PG_FUNCTION_ARGS)
1002 {
1003 	TSVector	txt = PG_GETARG_TSVECTOR(0);
1004 	TSQuery		query = PG_GETARG_TSQUERY(1);
1005 	float		res;
1006 
1007 	res = calc_rank_cd(getWeights(NULL), txt, query, DEF_NORM_METHOD);
1008 
1009 	PG_FREE_IF_COPY(txt, 0);
1010 	PG_FREE_IF_COPY(query, 1);
1011 	PG_RETURN_FLOAT4(res);
1012 }
1013