1 /*-------------------------------------------------------------------------
2  *
3  * tsrank.c
4  *		rank tsvector by tsquery
5  *
6  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *	  src/backend/utils/adt/tsrank.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15 
16 #include <limits.h>
17 #include <math.h>
18 
19 #include "tsearch/ts_utils.h"
20 #include "utils/array.h"
21 #include "miscadmin.h"
22 
23 
24 static const float weights[] = {0.1f, 0.2f, 0.4f, 1.0f};
25 
26 #define wpos(wep)	( w[ WEP_GETWEIGHT(wep) ] )
27 
28 #define RANK_NO_NORM			0x00
29 #define RANK_NORM_LOGLENGTH		0x01
30 #define RANK_NORM_LENGTH		0x02
31 #define RANK_NORM_EXTDIST		0x04
32 #define RANK_NORM_UNIQ			0x08
33 #define RANK_NORM_LOGUNIQ		0x10
34 #define RANK_NORM_RDIVRPLUS1	0x20
35 #define DEF_NORM_METHOD			RANK_NO_NORM
36 
37 static float calc_rank_or(const float *w, TSVector t, TSQuery q);
38 static float calc_rank_and(const float *w, TSVector t, TSQuery q);
39 
40 /*
41  * Returns a weight of a word collocation
42  */
43 static float4
word_distance(int32 w)44 word_distance(int32 w)
45 {
46 	if (w > 100)
47 		return 1e-30f;
48 
49 	return 1.0 / (1.005 + 0.05 * exp(((float4) w) / 1.5 - 2));
50 }
51 
52 static int
cnt_length(TSVector t)53 cnt_length(TSVector t)
54 {
55 	WordEntry  *ptr = ARRPTR(t),
56 			   *end = (WordEntry *) STRPTR(t);
57 	int			len = 0;
58 
59 	while (ptr < end)
60 	{
61 		int			clen = POSDATALEN(t, ptr);
62 
63 		if (clen == 0)
64 			len += 1;
65 		else
66 			len += clen;
67 
68 		ptr++;
69 	}
70 
71 	return len;
72 }
73 
74 
75 #define WordECompareQueryItem(e,q,p,i,m) \
76 	tsCompareString((q) + (i)->distance, (i)->length,	\
77 					(e) + (p)->pos, (p)->len, (m))
78 
79 
80 /*
81  * Returns a pointer to a WordEntry's array corresponding to 'item' from
82  * tsvector 't'. 'q' is the TSQuery containing 'item'.
83  * Returns NULL if not found.
84  */
85 static WordEntry *
find_wordentry(TSVector t,TSQuery q,QueryOperand * item,int32 * nitem)86 find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem)
87 {
88 	WordEntry  *StopLow = ARRPTR(t);
89 	WordEntry  *StopHigh = (WordEntry *) STRPTR(t);
90 	WordEntry  *StopMiddle = StopHigh;
91 	int			difference;
92 
93 	*nitem = 0;
94 
95 	/* Loop invariant: StopLow <= item < StopHigh */
96 	while (StopLow < StopHigh)
97 	{
98 		StopMiddle = StopLow + (StopHigh - StopLow) / 2;
99 		difference = WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, false);
100 		if (difference == 0)
101 		{
102 			StopHigh = StopMiddle;
103 			*nitem = 1;
104 			break;
105 		}
106 		else if (difference > 0)
107 			StopLow = StopMiddle + 1;
108 		else
109 			StopHigh = StopMiddle;
110 	}
111 
112 	if (item->prefix)
113 	{
114 		if (StopLow >= StopHigh)
115 			StopMiddle = StopHigh;
116 
117 		*nitem = 0;
118 
119 		while (StopMiddle < (WordEntry *) STRPTR(t) &&
120 			   WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, true) == 0)
121 		{
122 			(*nitem)++;
123 			StopMiddle++;
124 		}
125 	}
126 
127 	return (*nitem > 0) ? StopHigh : NULL;
128 }
129 
130 
131 /*
132  * sort QueryOperands by (length, word)
133  */
134 static int
compareQueryOperand(const void * a,const void * b,void * arg)135 compareQueryOperand(const void *a, const void *b, void *arg)
136 {
137 	char	   *operand = (char *) arg;
138 	QueryOperand *qa = (*(QueryOperand *const *) a);
139 	QueryOperand *qb = (*(QueryOperand *const *) b);
140 
141 	return tsCompareString(operand + qa->distance, qa->length,
142 						   operand + qb->distance, qb->length,
143 						   false);
144 }
145 
146 /*
147  * Returns a sorted, de-duplicated array of QueryOperands in a query.
148  * The returned QueryOperands are pointers to the original QueryOperands
149  * in the query.
150  *
151  * Length of the returned array is stored in *size
152  */
153 static QueryOperand **
SortAndUniqItems(TSQuery q,int * size)154 SortAndUniqItems(TSQuery q, int *size)
155 {
156 	char	   *operand = GETOPERAND(q);
157 	QueryItem  *item = GETQUERY(q);
158 	QueryOperand **res,
159 			  **ptr,
160 			  **prevptr;
161 
162 	ptr = res = (QueryOperand **) palloc(sizeof(QueryOperand *) * *size);
163 
164 	/* Collect all operands from the tree to res */
165 	while ((*size)--)
166 	{
167 		if (item->type == QI_VAL)
168 		{
169 			*ptr = (QueryOperand *) item;
170 			ptr++;
171 		}
172 		item++;
173 	}
174 
175 	*size = ptr - res;
176 	if (*size < 2)
177 		return res;
178 
179 	qsort_arg(res, *size, sizeof(QueryOperand *), compareQueryOperand, (void *) operand);
180 
181 	ptr = res + 1;
182 	prevptr = res;
183 
184 	/* remove duplicates */
185 	while (ptr - res < *size)
186 	{
187 		if (compareQueryOperand((void *) ptr, (void *) prevptr, (void *) operand) != 0)
188 		{
189 			prevptr++;
190 			*prevptr = *ptr;
191 		}
192 		ptr++;
193 	}
194 
195 	*size = prevptr + 1 - res;
196 	return res;
197 }
198 
199 static float
calc_rank_and(const float * w,TSVector t,TSQuery q)200 calc_rank_and(const float *w, TSVector t, TSQuery q)
201 {
202 	WordEntryPosVector **pos;
203 	WordEntryPosVector1 posnull;
204 	WordEntryPosVector *POSNULL;
205 	int			i,
206 				k,
207 				l,
208 				p;
209 	WordEntry  *entry,
210 			   *firstentry;
211 	WordEntryPos *post,
212 			   *ct;
213 	int32		dimt,
214 				lenct,
215 				dist,
216 				nitem;
217 	float		res = -1.0;
218 	QueryOperand **item;
219 	int			size = q->size;
220 
221 	item = SortAndUniqItems(q, &size);
222 	if (size < 2)
223 	{
224 		pfree(item);
225 		return calc_rank_or(w, t, q);
226 	}
227 	pos = (WordEntryPosVector **) palloc0(sizeof(WordEntryPosVector *) * q->size);
228 
229 	/* A dummy WordEntryPos array to use when haspos is false */
230 	posnull.npos = 1;
231 	posnull.pos[0] = 0;
232 	WEP_SETPOS(posnull.pos[0], MAXENTRYPOS - 1);
233 	POSNULL = (WordEntryPosVector *) &posnull;
234 
235 	for (i = 0; i < size; i++)
236 	{
237 		firstentry = entry = find_wordentry(t, q, item[i], &nitem);
238 		if (!entry)
239 			continue;
240 
241 		while (entry - firstentry < nitem)
242 		{
243 			if (entry->haspos)
244 				pos[i] = _POSVECPTR(t, entry);
245 			else
246 				pos[i] = POSNULL;
247 
248 			dimt = pos[i]->npos;
249 			post = pos[i]->pos;
250 			for (k = 0; k < i; k++)
251 			{
252 				if (!pos[k])
253 					continue;
254 				lenct = pos[k]->npos;
255 				ct = pos[k]->pos;
256 				for (l = 0; l < dimt; l++)
257 				{
258 					for (p = 0; p < lenct; p++)
259 					{
260 						dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p]));
261 						if (dist || (dist == 0 && (pos[i] == POSNULL || pos[k] == POSNULL)))
262 						{
263 							float		curw;
264 
265 							if (!dist)
266 								dist = MAXENTRYPOS;
267 							curw = sqrt(wpos(post[l]) * wpos(ct[p]) * word_distance(dist));
268 							res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw);
269 						}
270 					}
271 				}
272 			}
273 
274 			entry++;
275 		}
276 	}
277 	pfree(pos);
278 	pfree(item);
279 	return res;
280 }
281 
282 static float
calc_rank_or(const float * w,TSVector t,TSQuery q)283 calc_rank_or(const float *w, TSVector t, TSQuery q)
284 {
285 	WordEntry  *entry,
286 			   *firstentry;
287 	WordEntryPosVector1 posnull;
288 	WordEntryPos *post;
289 	int32		dimt,
290 				j,
291 				i,
292 				nitem;
293 	float		res = 0.0;
294 	QueryOperand **item;
295 	int			size = q->size;
296 
297 	/* A dummy WordEntryPos array to use when haspos is false */
298 	posnull.npos = 1;
299 	posnull.pos[0] = 0;
300 
301 	item = SortAndUniqItems(q, &size);
302 
303 	for (i = 0; i < size; i++)
304 	{
305 		float		resj,
306 					wjm;
307 		int32		jm;
308 
309 		firstentry = entry = find_wordentry(t, q, item[i], &nitem);
310 		if (!entry)
311 			continue;
312 
313 		while (entry - firstentry < nitem)
314 		{
315 			if (entry->haspos)
316 			{
317 				dimt = POSDATALEN(t, entry);
318 				post = POSDATAPTR(t, entry);
319 			}
320 			else
321 			{
322 				dimt = posnull.npos;
323 				post = posnull.pos;
324 			}
325 
326 			resj = 0.0;
327 			wjm = -1.0;
328 			jm = 0;
329 			for (j = 0; j < dimt; j++)
330 			{
331 				resj = resj + wpos(post[j]) / ((j + 1) * (j + 1));
332 				if (wpos(post[j]) > wjm)
333 				{
334 					wjm = wpos(post[j]);
335 					jm = j;
336 				}
337 			}
338 /*
339 			limit (sum(i/i^2),i->inf) = pi^2/6
340 			resj = sum(wi/i^2),i=1,noccurence,
341 			wi - should be sorted desc,
342 			don't sort for now, just choose maximum weight. This should be corrected
343 			Oleg Bartunov
344 */
345 			res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685;
346 
347 			entry++;
348 		}
349 	}
350 	if (size > 0)
351 		res = res / size;
352 	pfree(item);
353 	return res;
354 }
355 
356 static float
calc_rank(const float * w,TSVector t,TSQuery q,int32 method)357 calc_rank(const float *w, TSVector t, TSQuery q, int32 method)
358 {
359 	QueryItem  *item = GETQUERY(q);
360 	float		res = 0.0;
361 	int			len;
362 
363 	if (!t->size || !q->size)
364 		return 0.0;
365 
366 	/* XXX: What about NOT? */
367 	res = (item->type == QI_OPR && (item->qoperator.oper == OP_AND ||
368 									item->qoperator.oper == OP_PHRASE)) ?
369 		calc_rank_and(w, t, q) :
370 		calc_rank_or(w, t, q);
371 
372 	if (res < 0)
373 		res = 1e-20f;
374 
375 	if ((method & RANK_NORM_LOGLENGTH) && t->size > 0)
376 		res /= log((double) (cnt_length(t) + 1)) / log(2.0);
377 
378 	if (method & RANK_NORM_LENGTH)
379 	{
380 		len = cnt_length(t);
381 		if (len > 0)
382 			res /= (float) len;
383 	}
384 
385 	/* RANK_NORM_EXTDIST not applicable */
386 
387 	if ((method & RANK_NORM_UNIQ) && t->size > 0)
388 		res /= (float) (t->size);
389 
390 	if ((method & RANK_NORM_LOGUNIQ) && t->size > 0)
391 		res /= log((double) (t->size + 1)) / log(2.0);
392 
393 	if (method & RANK_NORM_RDIVRPLUS1)
394 		res /= (res + 1);
395 
396 	return res;
397 }
398 
399 static const float *
getWeights(ArrayType * win)400 getWeights(ArrayType *win)
401 {
402 	static float ws[lengthof(weights)];
403 	int			i;
404 	float4	   *arrdata;
405 
406 	if (win == NULL)
407 		return weights;
408 
409 	if (ARR_NDIM(win) != 1)
410 		ereport(ERROR,
411 				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
412 				 errmsg("array of weight must be one-dimensional")));
413 
414 	if (ArrayGetNItems(ARR_NDIM(win), ARR_DIMS(win)) < lengthof(weights))
415 		ereport(ERROR,
416 				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
417 				 errmsg("array of weight is too short")));
418 
419 	if (array_contains_nulls(win))
420 		ereport(ERROR,
421 				(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
422 				 errmsg("array of weight must not contain nulls")));
423 
424 	arrdata = (float4 *) ARR_DATA_PTR(win);
425 	for (i = 0; i < lengthof(weights); i++)
426 	{
427 		ws[i] = (arrdata[i] >= 0) ? arrdata[i] : weights[i];
428 		if (ws[i] > 1.0)
429 			ereport(ERROR,
430 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
431 					 errmsg("weight out of range")));
432 	}
433 
434 	return ws;
435 }
436 
437 Datum
ts_rank_wttf(PG_FUNCTION_ARGS)438 ts_rank_wttf(PG_FUNCTION_ARGS)
439 {
440 	ArrayType  *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
441 	TSVector	txt = PG_GETARG_TSVECTOR(1);
442 	TSQuery		query = PG_GETARG_TSQUERY(2);
443 	int			method = PG_GETARG_INT32(3);
444 	float		res;
445 
446 	res = calc_rank(getWeights(win), txt, query, method);
447 
448 	PG_FREE_IF_COPY(win, 0);
449 	PG_FREE_IF_COPY(txt, 1);
450 	PG_FREE_IF_COPY(query, 2);
451 	PG_RETURN_FLOAT4(res);
452 }
453 
454 Datum
ts_rank_wtt(PG_FUNCTION_ARGS)455 ts_rank_wtt(PG_FUNCTION_ARGS)
456 {
457 	ArrayType  *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
458 	TSVector	txt = PG_GETARG_TSVECTOR(1);
459 	TSQuery		query = PG_GETARG_TSQUERY(2);
460 	float		res;
461 
462 	res = calc_rank(getWeights(win), txt, query, DEF_NORM_METHOD);
463 
464 	PG_FREE_IF_COPY(win, 0);
465 	PG_FREE_IF_COPY(txt, 1);
466 	PG_FREE_IF_COPY(query, 2);
467 	PG_RETURN_FLOAT4(res);
468 }
469 
470 Datum
ts_rank_ttf(PG_FUNCTION_ARGS)471 ts_rank_ttf(PG_FUNCTION_ARGS)
472 {
473 	TSVector	txt = PG_GETARG_TSVECTOR(0);
474 	TSQuery		query = PG_GETARG_TSQUERY(1);
475 	int			method = PG_GETARG_INT32(2);
476 	float		res;
477 
478 	res = calc_rank(getWeights(NULL), txt, query, method);
479 
480 	PG_FREE_IF_COPY(txt, 0);
481 	PG_FREE_IF_COPY(query, 1);
482 	PG_RETURN_FLOAT4(res);
483 }
484 
485 Datum
ts_rank_tt(PG_FUNCTION_ARGS)486 ts_rank_tt(PG_FUNCTION_ARGS)
487 {
488 	TSVector	txt = PG_GETARG_TSVECTOR(0);
489 	TSQuery		query = PG_GETARG_TSQUERY(1);
490 	float		res;
491 
492 	res = calc_rank(getWeights(NULL), txt, query, DEF_NORM_METHOD);
493 
494 	PG_FREE_IF_COPY(txt, 0);
495 	PG_FREE_IF_COPY(query, 1);
496 	PG_RETURN_FLOAT4(res);
497 }
498 
499 typedef struct
500 {
501 	union
502 	{
503 		struct
504 		{						/* compiled doc representation */
505 			QueryItem **items;
506 			int16		nitem;
507 		}			query;
508 		struct
509 		{						/* struct is used for preparing doc
510 								 * representation */
511 			QueryItem  *item;
512 			WordEntry  *entry;
513 		}			map;
514 	}			data;
515 	WordEntryPos pos;
516 } DocRepresentation;
517 
518 static int
compareDocR(const void * va,const void * vb)519 compareDocR(const void *va, const void *vb)
520 {
521 	const DocRepresentation *a = (const DocRepresentation *) va;
522 	const DocRepresentation *b = (const DocRepresentation *) vb;
523 
524 	if (WEP_GETPOS(a->pos) == WEP_GETPOS(b->pos))
525 	{
526 		if (WEP_GETWEIGHT(a->pos) == WEP_GETWEIGHT(b->pos))
527 		{
528 			if (a->data.map.entry == b->data.map.entry)
529 				return 0;
530 
531 			return (a->data.map.entry > b->data.map.entry) ? 1 : -1;
532 		}
533 
534 		return (WEP_GETWEIGHT(a->pos) > WEP_GETWEIGHT(b->pos)) ? 1 : -1;
535 	}
536 
537 	return (WEP_GETPOS(a->pos) > WEP_GETPOS(b->pos)) ? 1 : -1;
538 }
539 
540 #define MAXQROPOS	MAXENTRYPOS
541 typedef struct
542 {
543 	bool		operandexists;
544 	bool		reverseinsert;	/* indicates insert order, true means
545 								 * descending order */
546 	uint32		npos;
547 	WordEntryPos pos[MAXQROPOS];
548 } QueryRepresentationOperand;
549 
550 typedef struct
551 {
552 	TSQuery		query;
553 	QueryRepresentationOperand *operandData;
554 } QueryRepresentation;
555 
556 #define QR_GET_OPERAND_DATA(q, v) \
557 	( (q)->operandData + (((QueryItem*)(v)) - GETQUERY((q)->query)) )
558 
559 static bool
checkcondition_QueryOperand(void * checkval,QueryOperand * val,ExecPhraseData * data)560 checkcondition_QueryOperand(void *checkval, QueryOperand *val, ExecPhraseData *data)
561 {
562 	QueryRepresentation *qr = (QueryRepresentation *) checkval;
563 	QueryRepresentationOperand *opData = QR_GET_OPERAND_DATA(qr, val);
564 
565 	if (!opData->operandexists)
566 		return false;
567 
568 	if (data)
569 	{
570 		data->npos = opData->npos;
571 		data->pos = opData->pos;
572 		if (opData->reverseinsert)
573 			data->pos += MAXQROPOS - opData->npos;
574 	}
575 
576 	return true;
577 }
578 
579 typedef struct
580 {
581 	int			pos;
582 	int			p;
583 	int			q;
584 	DocRepresentation *begin;
585 	DocRepresentation *end;
586 } CoverExt;
587 
588 static void
resetQueryRepresentation(QueryRepresentation * qr,bool reverseinsert)589 resetQueryRepresentation(QueryRepresentation *qr, bool reverseinsert)
590 {
591 	int			i;
592 
593 	for (i = 0; i < qr->query->size; i++)
594 	{
595 		qr->operandData[i].operandexists = false;
596 		qr->operandData[i].reverseinsert = reverseinsert;
597 		qr->operandData[i].npos = 0;
598 	}
599 }
600 
601 static void
fillQueryRepresentationData(QueryRepresentation * qr,DocRepresentation * entry)602 fillQueryRepresentationData(QueryRepresentation *qr, DocRepresentation *entry)
603 {
604 	int			i;
605 	int			lastPos;
606 	QueryRepresentationOperand *opData;
607 
608 	for (i = 0; i < entry->data.query.nitem; i++)
609 	{
610 		if (entry->data.query.items[i]->type != QI_VAL)
611 			continue;
612 
613 		opData = QR_GET_OPERAND_DATA(qr, entry->data.query.items[i]);
614 
615 		opData->operandexists = true;
616 
617 		if (opData->npos == 0)
618 		{
619 			lastPos = (opData->reverseinsert) ? (MAXQROPOS - 1) : 0;
620 			opData->pos[lastPos] = entry->pos;
621 			opData->npos++;
622 			continue;
623 		}
624 
625 		lastPos = opData->reverseinsert ?
626 			(MAXQROPOS - opData->npos) :
627 			(opData->npos - 1);
628 
629 		if (WEP_GETPOS(opData->pos[lastPos]) != WEP_GETPOS(entry->pos))
630 		{
631 			lastPos = opData->reverseinsert ?
632 				(MAXQROPOS - 1 - opData->npos) :
633 				(opData->npos);
634 
635 			opData->pos[lastPos] = entry->pos;
636 			opData->npos++;
637 		}
638 	}
639 }
640 
641 static bool
Cover(DocRepresentation * doc,int len,QueryRepresentation * qr,CoverExt * ext)642 Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext)
643 {
644 	DocRepresentation *ptr;
645 	int			lastpos = ext->pos;
646 	bool		found = false;
647 
648 	/*
649 	 * since this function recurses, it could be driven to stack overflow.
650 	 * (though any decent compiler will optimize away the tail-recursion.
651 	 */
652 	check_stack_depth();
653 
654 	resetQueryRepresentation(qr, false);
655 
656 	ext->p = INT_MAX;
657 	ext->q = 0;
658 	ptr = doc + ext->pos;
659 
660 	/* find upper bound of cover from current position, move up */
661 	while (ptr - doc < len)
662 	{
663 		fillQueryRepresentationData(qr, ptr);
664 
665 		if (TS_execute(GETQUERY(qr->query), (void *) qr,
666 					   TS_EXEC_EMPTY, checkcondition_QueryOperand))
667 		{
668 			if (WEP_GETPOS(ptr->pos) > ext->q)
669 			{
670 				ext->q = WEP_GETPOS(ptr->pos);
671 				ext->end = ptr;
672 				lastpos = ptr - doc;
673 				found = true;
674 			}
675 			break;
676 		}
677 		ptr++;
678 	}
679 
680 	if (!found)
681 		return false;
682 
683 	resetQueryRepresentation(qr, true);
684 
685 	ptr = doc + lastpos;
686 
687 	/* find lower bound of cover from found upper bound, move down */
688 	while (ptr >= doc + ext->pos)
689 	{
690 		/*
691 		 * we scan doc from right to left, so pos info in reverse order!
692 		 */
693 		fillQueryRepresentationData(qr, ptr);
694 
695 		if (TS_execute(GETQUERY(qr->query), (void *) qr,
696 					   TS_EXEC_CALC_NOT, checkcondition_QueryOperand))
697 		{
698 			if (WEP_GETPOS(ptr->pos) < ext->p)
699 			{
700 				ext->begin = ptr;
701 				ext->p = WEP_GETPOS(ptr->pos);
702 			}
703 			break;
704 		}
705 		ptr--;
706 	}
707 
708 	if (ext->p <= ext->q)
709 	{
710 		/*
711 		 * set position for next try to next lexeme after beginning of found
712 		 * cover
713 		 */
714 		ext->pos = (ptr - doc) + 1;
715 		return true;
716 	}
717 
718 	ext->pos++;
719 	return Cover(doc, len, qr, ext);
720 }
721 
722 static DocRepresentation *
get_docrep(TSVector txt,QueryRepresentation * qr,int * doclen)723 get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
724 {
725 	QueryItem  *item = GETQUERY(qr->query);
726 	WordEntry  *entry,
727 			   *firstentry;
728 	WordEntryPos *post;
729 	int32		dimt,			/* number of 'post' items */
730 				j,
731 				i,
732 				nitem;
733 	int			len = qr->query->size * 4,
734 				cur = 0;
735 	DocRepresentation *doc;
736 
737 	doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len);
738 
739 	/*
740 	 * Iterate through query to make DocRepresentaion for words and it's
741 	 * entries satisfied by query
742 	 */
743 	for (i = 0; i < qr->query->size; i++)
744 	{
745 		QueryOperand *curoperand;
746 
747 		if (item[i].type != QI_VAL)
748 			continue;
749 
750 		curoperand = &item[i].qoperand;
751 
752 		firstentry = entry = find_wordentry(txt, qr->query, curoperand, &nitem);
753 		if (!entry)
754 			continue;
755 
756 		/* iterations over entries in tsvector */
757 		while (entry - firstentry < nitem)
758 		{
759 			if (entry->haspos)
760 			{
761 				dimt = POSDATALEN(txt, entry);
762 				post = POSDATAPTR(txt, entry);
763 			}
764 			else
765 			{
766 				/* ignore words without positions */
767 				entry++;
768 				continue;
769 			}
770 
771 			while (cur + dimt >= len)
772 			{
773 				len *= 2;
774 				doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len);
775 			}
776 
777 			/* iterations over entry's positions */
778 			for (j = 0; j < dimt; j++)
779 			{
780 				if (curoperand->weight == 0 ||
781 					curoperand->weight & (1 << WEP_GETWEIGHT(post[j])))
782 				{
783 					doc[cur].pos = post[j];
784 					doc[cur].data.map.entry = entry;
785 					doc[cur].data.map.item = (QueryItem *) curoperand;
786 					cur++;
787 				}
788 			}
789 
790 			entry++;
791 		}
792 	}
793 
794 	if (cur > 0)
795 	{
796 		DocRepresentation *rptr = doc + 1,
797 				   *wptr = doc,
798 					storage;
799 
800 		/*
801 		 * Sort representation in ascending order by pos and entry
802 		 */
803 		qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR);
804 
805 		/*
806 		 * Join QueryItem per WordEntry and it's position
807 		 */
808 		storage.pos = doc->pos;
809 		storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size);
810 		storage.data.query.items[0] = doc->data.map.item;
811 		storage.data.query.nitem = 1;
812 
813 		while (rptr - doc < cur)
814 		{
815 			if (rptr->pos == (rptr - 1)->pos &&
816 				rptr->data.map.entry == (rptr - 1)->data.map.entry)
817 			{
818 				storage.data.query.items[storage.data.query.nitem] = rptr->data.map.item;
819 				storage.data.query.nitem++;
820 			}
821 			else
822 			{
823 				*wptr = storage;
824 				wptr++;
825 				storage.pos = rptr->pos;
826 				storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size);
827 				storage.data.query.items[0] = rptr->data.map.item;
828 				storage.data.query.nitem = 1;
829 			}
830 
831 			rptr++;
832 		}
833 
834 		*wptr = storage;
835 		wptr++;
836 
837 		*doclen = wptr - doc;
838 		return doc;
839 	}
840 
841 	pfree(doc);
842 	return NULL;
843 }
844 
845 static float4
calc_rank_cd(const float4 * arrdata,TSVector txt,TSQuery query,int method)846 calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method)
847 {
848 	DocRepresentation *doc;
849 	int			len,
850 				i,
851 				doclen = 0;
852 	CoverExt	ext;
853 	double		Wdoc = 0.0;
854 	double		invws[lengthof(weights)];
855 	double		SumDist = 0.0,
856 				PrevExtPos = 0.0,
857 				CurExtPos = 0.0;
858 	int			NExtent = 0;
859 	QueryRepresentation qr;
860 
861 
862 	for (i = 0; i < lengthof(weights); i++)
863 	{
864 		invws[i] = ((double) ((arrdata[i] >= 0) ? arrdata[i] : weights[i]));
865 		if (invws[i] > 1.0)
866 			ereport(ERROR,
867 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
868 					 errmsg("weight out of range")));
869 		invws[i] = 1.0 / invws[i];
870 	}
871 
872 	qr.query = query;
873 	qr.operandData = (QueryRepresentationOperand *)
874 		palloc0(sizeof(QueryRepresentationOperand) * query->size);
875 
876 	doc = get_docrep(txt, &qr, &doclen);
877 	if (!doc)
878 	{
879 		pfree(qr.operandData);
880 		return 0.0;
881 	}
882 
883 	MemSet(&ext, 0, sizeof(CoverExt));
884 	while (Cover(doc, doclen, &qr, &ext))
885 	{
886 		double		Cpos = 0.0;
887 		double		InvSum = 0.0;
888 		int			nNoise;
889 		DocRepresentation *ptr = ext.begin;
890 
891 		while (ptr <= ext.end)
892 		{
893 			InvSum += invws[WEP_GETWEIGHT(ptr->pos)];
894 			ptr++;
895 		}
896 
897 		Cpos = ((double) (ext.end - ext.begin + 1)) / InvSum;
898 
899 		/*
900 		 * if doc are big enough then ext.q may be equal to ext.p due to limit
901 		 * of positional information. In this case we approximate number of
902 		 * noise word as half cover's length
903 		 */
904 		nNoise = (ext.q - ext.p) - (ext.end - ext.begin);
905 		if (nNoise < 0)
906 			nNoise = (ext.end - ext.begin) / 2;
907 		Wdoc += Cpos / ((double) (1 + nNoise));
908 
909 		CurExtPos = ((double) (ext.q + ext.p)) / 2.0;
910 		if (NExtent > 0 && CurExtPos > PrevExtPos		/* prevent division by
911 														 * zero in a case of
912 				multiple lexize */ )
913 			SumDist += 1.0 / (CurExtPos - PrevExtPos);
914 
915 		PrevExtPos = CurExtPos;
916 		NExtent++;
917 	}
918 
919 	if ((method & RANK_NORM_LOGLENGTH) && txt->size > 0)
920 		Wdoc /= log((double) (cnt_length(txt) + 1));
921 
922 	if (method & RANK_NORM_LENGTH)
923 	{
924 		len = cnt_length(txt);
925 		if (len > 0)
926 			Wdoc /= (double) len;
927 	}
928 
929 	if ((method & RANK_NORM_EXTDIST) && NExtent > 0 && SumDist > 0)
930 		Wdoc /= ((double) NExtent) / SumDist;
931 
932 	if ((method & RANK_NORM_UNIQ) && txt->size > 0)
933 		Wdoc /= (double) (txt->size);
934 
935 	if ((method & RANK_NORM_LOGUNIQ) && txt->size > 0)
936 		Wdoc /= log((double) (txt->size + 1)) / log(2.0);
937 
938 	if (method & RANK_NORM_RDIVRPLUS1)
939 		Wdoc /= (Wdoc + 1);
940 
941 	pfree(doc);
942 
943 	pfree(qr.operandData);
944 
945 	return (float4) Wdoc;
946 }
947 
948 Datum
ts_rankcd_wttf(PG_FUNCTION_ARGS)949 ts_rankcd_wttf(PG_FUNCTION_ARGS)
950 {
951 	ArrayType  *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
952 	TSVector	txt = PG_GETARG_TSVECTOR(1);
953 	TSQuery		query = PG_GETARG_TSQUERY(2);
954 	int			method = PG_GETARG_INT32(3);
955 	float		res;
956 
957 	res = calc_rank_cd(getWeights(win), txt, query, method);
958 
959 	PG_FREE_IF_COPY(win, 0);
960 	PG_FREE_IF_COPY(txt, 1);
961 	PG_FREE_IF_COPY(query, 2);
962 	PG_RETURN_FLOAT4(res);
963 }
964 
965 Datum
ts_rankcd_wtt(PG_FUNCTION_ARGS)966 ts_rankcd_wtt(PG_FUNCTION_ARGS)
967 {
968 	ArrayType  *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
969 	TSVector	txt = PG_GETARG_TSVECTOR(1);
970 	TSQuery		query = PG_GETARG_TSQUERY(2);
971 	float		res;
972 
973 	res = calc_rank_cd(getWeights(win), txt, query, DEF_NORM_METHOD);
974 
975 	PG_FREE_IF_COPY(win, 0);
976 	PG_FREE_IF_COPY(txt, 1);
977 	PG_FREE_IF_COPY(query, 2);
978 	PG_RETURN_FLOAT4(res);
979 }
980 
981 Datum
ts_rankcd_ttf(PG_FUNCTION_ARGS)982 ts_rankcd_ttf(PG_FUNCTION_ARGS)
983 {
984 	TSVector	txt = PG_GETARG_TSVECTOR(0);
985 	TSQuery		query = PG_GETARG_TSQUERY(1);
986 	int			method = PG_GETARG_INT32(2);
987 	float		res;
988 
989 	res = calc_rank_cd(getWeights(NULL), txt, query, method);
990 
991 	PG_FREE_IF_COPY(txt, 0);
992 	PG_FREE_IF_COPY(query, 1);
993 	PG_RETURN_FLOAT4(res);
994 }
995 
996 Datum
ts_rankcd_tt(PG_FUNCTION_ARGS)997 ts_rankcd_tt(PG_FUNCTION_ARGS)
998 {
999 	TSVector	txt = PG_GETARG_TSVECTOR(0);
1000 	TSQuery		query = PG_GETARG_TSQUERY(1);
1001 	float		res;
1002 
1003 	res = calc_rank_cd(getWeights(NULL), txt, query, DEF_NORM_METHOD);
1004 
1005 	PG_FREE_IF_COPY(txt, 0);
1006 	PG_FREE_IF_COPY(query, 1);
1007 	PG_RETURN_FLOAT4(res);
1008 }
1009