1 /*------------------------------------------------------------------------- 2 * 3 * tsrank.c 4 * rank tsvector by tsquery 5 * 6 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group 7 * 8 * 9 * IDENTIFICATION 10 * src/backend/utils/adt/tsrank.c 11 * 12 *------------------------------------------------------------------------- 13 */ 14 #include "postgres.h" 15 16 #include <limits.h> 17 #include <math.h> 18 19 #include "tsearch/ts_utils.h" 20 #include "utils/array.h" 21 #include "utils/builtins.h" 22 #include "miscadmin.h" 23 24 25 static const float weights[] = {0.1f, 0.2f, 0.4f, 1.0f}; 26 27 #define wpos(wep) ( w[ WEP_GETWEIGHT(wep) ] ) 28 29 #define RANK_NO_NORM 0x00 30 #define RANK_NORM_LOGLENGTH 0x01 31 #define RANK_NORM_LENGTH 0x02 32 #define RANK_NORM_EXTDIST 0x04 33 #define RANK_NORM_UNIQ 0x08 34 #define RANK_NORM_LOGUNIQ 0x10 35 #define RANK_NORM_RDIVRPLUS1 0x20 36 #define DEF_NORM_METHOD RANK_NO_NORM 37 38 static float calc_rank_or(const float *w, TSVector t, TSQuery q); 39 static float calc_rank_and(const float *w, TSVector t, TSQuery q); 40 41 /* 42 * Returns a weight of a word collocation 43 */ 44 static float4 45 word_distance(int32 w) 46 { 47 if (w > 100) 48 return 1e-30f; 49 50 return 1.0 / (1.005 + 0.05 * exp(((float4) w) / 1.5 - 2)); 51 } 52 53 static int 54 cnt_length(TSVector t) 55 { 56 WordEntry *ptr = ARRPTR(t), 57 *end = (WordEntry *) STRPTR(t); 58 int len = 0; 59 60 while (ptr < end) 61 { 62 int clen = POSDATALEN(t, ptr); 63 64 if (clen == 0) 65 len += 1; 66 else 67 len += clen; 68 69 ptr++; 70 } 71 72 return len; 73 } 74 75 76 #define WordECompareQueryItem(e,q,p,i,m) \ 77 tsCompareString((q) + (i)->distance, (i)->length, \ 78 (e) + (p)->pos, (p)->len, (m)) 79 80 81 /* 82 * Returns a pointer to a WordEntry's array corresponding to 'item' from 83 * tsvector 't'. 'q' is the TSQuery containing 'item'. 84 * Returns NULL if not found. 85 */ 86 static WordEntry * 87 find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem) 88 { 89 WordEntry *StopLow = ARRPTR(t); 90 WordEntry *StopHigh = (WordEntry *) STRPTR(t); 91 WordEntry *StopMiddle = StopHigh; 92 int difference; 93 94 *nitem = 0; 95 96 /* Loop invariant: StopLow <= item < StopHigh */ 97 while (StopLow < StopHigh) 98 { 99 StopMiddle = StopLow + (StopHigh - StopLow) / 2; 100 difference = WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, false); 101 if (difference == 0) 102 { 103 StopHigh = StopMiddle; 104 *nitem = 1; 105 break; 106 } 107 else if (difference > 0) 108 StopLow = StopMiddle + 1; 109 else 110 StopHigh = StopMiddle; 111 } 112 113 if (item->prefix) 114 { 115 if (StopLow >= StopHigh) 116 StopMiddle = StopHigh; 117 118 *nitem = 0; 119 120 while (StopMiddle < (WordEntry *) STRPTR(t) && 121 WordECompareQueryItem(STRPTR(t), GETOPERAND(q), StopMiddle, item, true) == 0) 122 { 123 (*nitem)++; 124 StopMiddle++; 125 } 126 } 127 128 return (*nitem > 0) ? StopHigh : NULL; 129 } 130 131 132 /* 133 * sort QueryOperands by (length, word) 134 */ 135 static int 136 compareQueryOperand(const void *a, const void *b, void *arg) 137 { 138 char *operand = (char *) arg; 139 QueryOperand *qa = (*(QueryOperand *const *) a); 140 QueryOperand *qb = (*(QueryOperand *const *) b); 141 142 return tsCompareString(operand + qa->distance, qa->length, 143 operand + qb->distance, qb->length, 144 false); 145 } 146 147 /* 148 * Returns a sorted, de-duplicated array of QueryOperands in a query. 149 * The returned QueryOperands are pointers to the original QueryOperands 150 * in the query. 151 * 152 * Length of the returned array is stored in *size 153 */ 154 static QueryOperand ** 155 SortAndUniqItems(TSQuery q, int *size) 156 { 157 char *operand = GETOPERAND(q); 158 QueryItem *item = GETQUERY(q); 159 QueryOperand **res, 160 **ptr, 161 **prevptr; 162 163 ptr = res = (QueryOperand **) palloc(sizeof(QueryOperand *) * *size); 164 165 /* Collect all operands from the tree to res */ 166 while ((*size)--) 167 { 168 if (item->type == QI_VAL) 169 { 170 *ptr = (QueryOperand *) item; 171 ptr++; 172 } 173 item++; 174 } 175 176 *size = ptr - res; 177 if (*size < 2) 178 return res; 179 180 qsort_arg(res, *size, sizeof(QueryOperand *), compareQueryOperand, (void *) operand); 181 182 ptr = res + 1; 183 prevptr = res; 184 185 /* remove duplicates */ 186 while (ptr - res < *size) 187 { 188 if (compareQueryOperand((void *) ptr, (void *) prevptr, (void *) operand) != 0) 189 { 190 prevptr++; 191 *prevptr = *ptr; 192 } 193 ptr++; 194 } 195 196 *size = prevptr + 1 - res; 197 return res; 198 } 199 200 static float 201 calc_rank_and(const float *w, TSVector t, TSQuery q) 202 { 203 WordEntryPosVector **pos; 204 WordEntryPosVector1 posnull; 205 WordEntryPosVector *POSNULL; 206 int i, 207 k, 208 l, 209 p; 210 WordEntry *entry, 211 *firstentry; 212 WordEntryPos *post, 213 *ct; 214 int32 dimt, 215 lenct, 216 dist, 217 nitem; 218 float res = -1.0; 219 QueryOperand **item; 220 int size = q->size; 221 222 item = SortAndUniqItems(q, &size); 223 if (size < 2) 224 { 225 pfree(item); 226 return calc_rank_or(w, t, q); 227 } 228 pos = (WordEntryPosVector **) palloc0(sizeof(WordEntryPosVector *) * q->size); 229 230 /* A dummy WordEntryPos array to use when haspos is false */ 231 posnull.npos = 1; 232 posnull.pos[0] = 0; 233 WEP_SETPOS(posnull.pos[0], MAXENTRYPOS - 1); 234 POSNULL = (WordEntryPosVector *) &posnull; 235 236 for (i = 0; i < size; i++) 237 { 238 firstentry = entry = find_wordentry(t, q, item[i], &nitem); 239 if (!entry) 240 continue; 241 242 while (entry - firstentry < nitem) 243 { 244 if (entry->haspos) 245 pos[i] = _POSVECPTR(t, entry); 246 else 247 pos[i] = POSNULL; 248 249 dimt = pos[i]->npos; 250 post = pos[i]->pos; 251 for (k = 0; k < i; k++) 252 { 253 if (!pos[k]) 254 continue; 255 lenct = pos[k]->npos; 256 ct = pos[k]->pos; 257 for (l = 0; l < dimt; l++) 258 { 259 for (p = 0; p < lenct; p++) 260 { 261 dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p])); 262 if (dist || (dist == 0 && (pos[i] == POSNULL || pos[k] == POSNULL))) 263 { 264 float curw; 265 266 if (!dist) 267 dist = MAXENTRYPOS; 268 curw = sqrt(wpos(post[l]) * wpos(ct[p]) * word_distance(dist)); 269 res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw); 270 } 271 } 272 } 273 } 274 275 entry++; 276 } 277 } 278 pfree(pos); 279 pfree(item); 280 return res; 281 } 282 283 static float 284 calc_rank_or(const float *w, TSVector t, TSQuery q) 285 { 286 WordEntry *entry, 287 *firstentry; 288 WordEntryPosVector1 posnull; 289 WordEntryPos *post; 290 int32 dimt, 291 j, 292 i, 293 nitem; 294 float res = 0.0; 295 QueryOperand **item; 296 int size = q->size; 297 298 /* A dummy WordEntryPos array to use when haspos is false */ 299 posnull.npos = 1; 300 posnull.pos[0] = 0; 301 302 item = SortAndUniqItems(q, &size); 303 304 for (i = 0; i < size; i++) 305 { 306 float resj, 307 wjm; 308 int32 jm; 309 310 firstentry = entry = find_wordentry(t, q, item[i], &nitem); 311 if (!entry) 312 continue; 313 314 while (entry - firstentry < nitem) 315 { 316 if (entry->haspos) 317 { 318 dimt = POSDATALEN(t, entry); 319 post = POSDATAPTR(t, entry); 320 } 321 else 322 { 323 dimt = posnull.npos; 324 post = posnull.pos; 325 } 326 327 resj = 0.0; 328 wjm = -1.0; 329 jm = 0; 330 for (j = 0; j < dimt; j++) 331 { 332 resj = resj + wpos(post[j]) / ((j + 1) * (j + 1)); 333 if (wpos(post[j]) > wjm) 334 { 335 wjm = wpos(post[j]); 336 jm = j; 337 } 338 } 339 /* 340 limit (sum(i/i^2),i->inf) = pi^2/6 341 resj = sum(wi/i^2),i=1,noccurence, 342 wi - should be sorted desc, 343 don't sort for now, just choose maximum weight. This should be corrected 344 Oleg Bartunov 345 */ 346 res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685; 347 348 entry++; 349 } 350 } 351 if (size > 0) 352 res = res / size; 353 pfree(item); 354 return res; 355 } 356 357 static float 358 calc_rank(const float *w, TSVector t, TSQuery q, int32 method) 359 { 360 QueryItem *item = GETQUERY(q); 361 float res = 0.0; 362 int len; 363 364 if (!t->size || !q->size) 365 return 0.0; 366 367 /* XXX: What about NOT? */ 368 res = (item->type == QI_OPR && (item->qoperator.oper == OP_AND || 369 item->qoperator.oper == OP_PHRASE)) ? 370 calc_rank_and(w, t, q) : 371 calc_rank_or(w, t, q); 372 373 if (res < 0) 374 res = 1e-20f; 375 376 if ((method & RANK_NORM_LOGLENGTH) && t->size > 0) 377 res /= log((double) (cnt_length(t) + 1)) / log(2.0); 378 379 if (method & RANK_NORM_LENGTH) 380 { 381 len = cnt_length(t); 382 if (len > 0) 383 res /= (float) len; 384 } 385 386 /* RANK_NORM_EXTDIST not applicable */ 387 388 if ((method & RANK_NORM_UNIQ) && t->size > 0) 389 res /= (float) (t->size); 390 391 if ((method & RANK_NORM_LOGUNIQ) && t->size > 0) 392 res /= log((double) (t->size + 1)) / log(2.0); 393 394 if (method & RANK_NORM_RDIVRPLUS1) 395 res /= (res + 1); 396 397 return res; 398 } 399 400 static const float * 401 getWeights(ArrayType *win) 402 { 403 static float ws[lengthof(weights)]; 404 int i; 405 float4 *arrdata; 406 407 if (win == NULL) 408 return weights; 409 410 if (ARR_NDIM(win) != 1) 411 ereport(ERROR, 412 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), 413 errmsg("array of weight must be one-dimensional"))); 414 415 if (ArrayGetNItems(ARR_NDIM(win), ARR_DIMS(win)) < lengthof(weights)) 416 ereport(ERROR, 417 (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR), 418 errmsg("array of weight is too short"))); 419 420 if (array_contains_nulls(win)) 421 ereport(ERROR, 422 (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), 423 errmsg("array of weight must not contain nulls"))); 424 425 arrdata = (float4 *) ARR_DATA_PTR(win); 426 for (i = 0; i < lengthof(weights); i++) 427 { 428 ws[i] = (arrdata[i] >= 0) ? arrdata[i] : weights[i]; 429 if (ws[i] > 1.0) 430 ereport(ERROR, 431 (errcode(ERRCODE_INVALID_PARAMETER_VALUE), 432 errmsg("weight out of range"))); 433 } 434 435 return ws; 436 } 437 438 Datum 439 ts_rank_wttf(PG_FUNCTION_ARGS) 440 { 441 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); 442 TSVector txt = PG_GETARG_TSVECTOR(1); 443 TSQuery query = PG_GETARG_TSQUERY(2); 444 int method = PG_GETARG_INT32(3); 445 float res; 446 447 res = calc_rank(getWeights(win), txt, query, method); 448 449 PG_FREE_IF_COPY(win, 0); 450 PG_FREE_IF_COPY(txt, 1); 451 PG_FREE_IF_COPY(query, 2); 452 PG_RETURN_FLOAT4(res); 453 } 454 455 Datum 456 ts_rank_wtt(PG_FUNCTION_ARGS) 457 { 458 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); 459 TSVector txt = PG_GETARG_TSVECTOR(1); 460 TSQuery query = PG_GETARG_TSQUERY(2); 461 float res; 462 463 res = calc_rank(getWeights(win), txt, query, DEF_NORM_METHOD); 464 465 PG_FREE_IF_COPY(win, 0); 466 PG_FREE_IF_COPY(txt, 1); 467 PG_FREE_IF_COPY(query, 2); 468 PG_RETURN_FLOAT4(res); 469 } 470 471 Datum 472 ts_rank_ttf(PG_FUNCTION_ARGS) 473 { 474 TSVector txt = PG_GETARG_TSVECTOR(0); 475 TSQuery query = PG_GETARG_TSQUERY(1); 476 int method = PG_GETARG_INT32(2); 477 float res; 478 479 res = calc_rank(getWeights(NULL), txt, query, method); 480 481 PG_FREE_IF_COPY(txt, 0); 482 PG_FREE_IF_COPY(query, 1); 483 PG_RETURN_FLOAT4(res); 484 } 485 486 Datum 487 ts_rank_tt(PG_FUNCTION_ARGS) 488 { 489 TSVector txt = PG_GETARG_TSVECTOR(0); 490 TSQuery query = PG_GETARG_TSQUERY(1); 491 float res; 492 493 res = calc_rank(getWeights(NULL), txt, query, DEF_NORM_METHOD); 494 495 PG_FREE_IF_COPY(txt, 0); 496 PG_FREE_IF_COPY(query, 1); 497 PG_RETURN_FLOAT4(res); 498 } 499 500 typedef struct 501 { 502 union 503 { 504 struct 505 { /* compiled doc representation */ 506 QueryItem **items; 507 int16 nitem; 508 } query; 509 struct 510 { /* struct is used for preparing doc 511 * representation */ 512 QueryItem *item; 513 WordEntry *entry; 514 } map; 515 } data; 516 WordEntryPos pos; 517 } DocRepresentation; 518 519 static int 520 compareDocR(const void *va, const void *vb) 521 { 522 const DocRepresentation *a = (const DocRepresentation *) va; 523 const DocRepresentation *b = (const DocRepresentation *) vb; 524 525 if (WEP_GETPOS(a->pos) == WEP_GETPOS(b->pos)) 526 { 527 if (WEP_GETWEIGHT(a->pos) == WEP_GETWEIGHT(b->pos)) 528 { 529 if (a->data.map.entry == b->data.map.entry) 530 return 0; 531 532 return (a->data.map.entry > b->data.map.entry) ? 1 : -1; 533 } 534 535 return (WEP_GETWEIGHT(a->pos) > WEP_GETWEIGHT(b->pos)) ? 1 : -1; 536 } 537 538 return (WEP_GETPOS(a->pos) > WEP_GETPOS(b->pos)) ? 1 : -1; 539 } 540 541 #define MAXQROPOS MAXENTRYPOS 542 typedef struct 543 { 544 bool operandexists; 545 bool reverseinsert; /* indicates insert order, true means 546 * descending order */ 547 uint32 npos; 548 WordEntryPos pos[MAXQROPOS]; 549 } QueryRepresentationOperand; 550 551 typedef struct 552 { 553 TSQuery query; 554 QueryRepresentationOperand *operandData; 555 } QueryRepresentation; 556 557 #define QR_GET_OPERAND_DATA(q, v) \ 558 ( (q)->operandData + (((QueryItem*)(v)) - GETQUERY((q)->query)) ) 559 560 static bool 561 checkcondition_QueryOperand(void *checkval, QueryOperand *val, ExecPhraseData *data) 562 { 563 QueryRepresentation *qr = (QueryRepresentation *) checkval; 564 QueryRepresentationOperand *opData = QR_GET_OPERAND_DATA(qr, val); 565 566 if (!opData->operandexists) 567 return false; 568 569 if (data) 570 { 571 data->npos = opData->npos; 572 data->pos = opData->pos; 573 if (opData->reverseinsert) 574 data->pos += MAXQROPOS - opData->npos; 575 } 576 577 return true; 578 } 579 580 typedef struct 581 { 582 int pos; 583 int p; 584 int q; 585 DocRepresentation *begin; 586 DocRepresentation *end; 587 } CoverExt; 588 589 static void 590 resetQueryRepresentation(QueryRepresentation *qr, bool reverseinsert) 591 { 592 int i; 593 594 for (i = 0; i < qr->query->size; i++) 595 { 596 qr->operandData[i].operandexists = false; 597 qr->operandData[i].reverseinsert = reverseinsert; 598 qr->operandData[i].npos = 0; 599 } 600 } 601 602 static void 603 fillQueryRepresentationData(QueryRepresentation *qr, DocRepresentation *entry) 604 { 605 int i; 606 int lastPos; 607 QueryRepresentationOperand *opData; 608 609 for (i = 0; i < entry->data.query.nitem; i++) 610 { 611 if (entry->data.query.items[i]->type != QI_VAL) 612 continue; 613 614 opData = QR_GET_OPERAND_DATA(qr, entry->data.query.items[i]); 615 616 opData->operandexists = true; 617 618 if (opData->npos == 0) 619 { 620 lastPos = (opData->reverseinsert) ? (MAXQROPOS - 1) : 0; 621 opData->pos[lastPos] = entry->pos; 622 opData->npos++; 623 continue; 624 } 625 626 lastPos = opData->reverseinsert ? 627 (MAXQROPOS - opData->npos) : 628 (opData->npos - 1); 629 630 if (WEP_GETPOS(opData->pos[lastPos]) != WEP_GETPOS(entry->pos)) 631 { 632 lastPos = opData->reverseinsert ? 633 (MAXQROPOS - 1 - opData->npos) : 634 (opData->npos); 635 636 opData->pos[lastPos] = entry->pos; 637 opData->npos++; 638 } 639 } 640 } 641 642 static bool 643 Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, CoverExt *ext) 644 { 645 DocRepresentation *ptr; 646 int lastpos = ext->pos; 647 bool found = false; 648 649 /* 650 * since this function recurses, it could be driven to stack overflow. 651 * (though any decent compiler will optimize away the tail-recursion. 652 */ 653 check_stack_depth(); 654 655 resetQueryRepresentation(qr, false); 656 657 ext->p = INT_MAX; 658 ext->q = 0; 659 ptr = doc + ext->pos; 660 661 /* find upper bound of cover from current position, move up */ 662 while (ptr - doc < len) 663 { 664 fillQueryRepresentationData(qr, ptr); 665 666 if (TS_execute(GETQUERY(qr->query), (void *) qr, 667 TS_EXEC_EMPTY, checkcondition_QueryOperand)) 668 { 669 if (WEP_GETPOS(ptr->pos) > ext->q) 670 { 671 ext->q = WEP_GETPOS(ptr->pos); 672 ext->end = ptr; 673 lastpos = ptr - doc; 674 found = true; 675 } 676 break; 677 } 678 ptr++; 679 } 680 681 if (!found) 682 return false; 683 684 resetQueryRepresentation(qr, true); 685 686 ptr = doc + lastpos; 687 688 /* find lower bound of cover from found upper bound, move down */ 689 while (ptr >= doc + ext->pos) 690 { 691 /* 692 * we scan doc from right to left, so pos info in reverse order! 693 */ 694 fillQueryRepresentationData(qr, ptr); 695 696 if (TS_execute(GETQUERY(qr->query), (void *) qr, 697 TS_EXEC_CALC_NOT, checkcondition_QueryOperand)) 698 { 699 if (WEP_GETPOS(ptr->pos) < ext->p) 700 { 701 ext->begin = ptr; 702 ext->p = WEP_GETPOS(ptr->pos); 703 } 704 break; 705 } 706 ptr--; 707 } 708 709 if (ext->p <= ext->q) 710 { 711 /* 712 * set position for next try to next lexeme after beginning of found 713 * cover 714 */ 715 ext->pos = (ptr - doc) + 1; 716 return true; 717 } 718 719 ext->pos++; 720 return Cover(doc, len, qr, ext); 721 } 722 723 static DocRepresentation * 724 get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen) 725 { 726 QueryItem *item = GETQUERY(qr->query); 727 WordEntry *entry, 728 *firstentry; 729 WordEntryPos *post; 730 int32 dimt, /* number of 'post' items */ 731 j, 732 i, 733 nitem; 734 int len = qr->query->size * 4, 735 cur = 0; 736 DocRepresentation *doc; 737 738 doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len); 739 740 /* 741 * Iterate through query to make DocRepresentaion for words and it's 742 * entries satisfied by query 743 */ 744 for (i = 0; i < qr->query->size; i++) 745 { 746 QueryOperand *curoperand; 747 748 if (item[i].type != QI_VAL) 749 continue; 750 751 curoperand = &item[i].qoperand; 752 753 firstentry = entry = find_wordentry(txt, qr->query, curoperand, &nitem); 754 if (!entry) 755 continue; 756 757 /* iterations over entries in tsvector */ 758 while (entry - firstentry < nitem) 759 { 760 if (entry->haspos) 761 { 762 dimt = POSDATALEN(txt, entry); 763 post = POSDATAPTR(txt, entry); 764 } 765 else 766 { 767 /* ignore words without positions */ 768 entry++; 769 continue; 770 } 771 772 while (cur + dimt >= len) 773 { 774 len *= 2; 775 doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len); 776 } 777 778 /* iterations over entry's positions */ 779 for (j = 0; j < dimt; j++) 780 { 781 if (curoperand->weight == 0 || 782 curoperand->weight & (1 << WEP_GETWEIGHT(post[j]))) 783 { 784 doc[cur].pos = post[j]; 785 doc[cur].data.map.entry = entry; 786 doc[cur].data.map.item = (QueryItem *) curoperand; 787 cur++; 788 } 789 } 790 791 entry++; 792 } 793 } 794 795 if (cur > 0) 796 { 797 DocRepresentation *rptr = doc + 1, 798 *wptr = doc, 799 storage; 800 801 /* 802 * Sort representation in ascending order by pos and entry 803 */ 804 qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR); 805 806 /* 807 * Join QueryItem per WordEntry and it's position 808 */ 809 storage.pos = doc->pos; 810 storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size); 811 storage.data.query.items[0] = doc->data.map.item; 812 storage.data.query.nitem = 1; 813 814 while (rptr - doc < cur) 815 { 816 if (rptr->pos == (rptr - 1)->pos && 817 rptr->data.map.entry == (rptr - 1)->data.map.entry) 818 { 819 storage.data.query.items[storage.data.query.nitem] = rptr->data.map.item; 820 storage.data.query.nitem++; 821 } 822 else 823 { 824 *wptr = storage; 825 wptr++; 826 storage.pos = rptr->pos; 827 storage.data.query.items = palloc(sizeof(QueryItem *) * qr->query->size); 828 storage.data.query.items[0] = rptr->data.map.item; 829 storage.data.query.nitem = 1; 830 } 831 832 rptr++; 833 } 834 835 *wptr = storage; 836 wptr++; 837 838 *doclen = wptr - doc; 839 return doc; 840 } 841 842 pfree(doc); 843 return NULL; 844 } 845 846 static float4 847 calc_rank_cd(const float4 *arrdata, TSVector txt, TSQuery query, int method) 848 { 849 DocRepresentation *doc; 850 int len, 851 i, 852 doclen = 0; 853 CoverExt ext; 854 double Wdoc = 0.0; 855 double invws[lengthof(weights)]; 856 double SumDist = 0.0, 857 PrevExtPos = 0.0, 858 CurExtPos = 0.0; 859 int NExtent = 0; 860 QueryRepresentation qr; 861 862 863 for (i = 0; i < lengthof(weights); i++) 864 { 865 invws[i] = ((double) ((arrdata[i] >= 0) ? arrdata[i] : weights[i])); 866 if (invws[i] > 1.0) 867 ereport(ERROR, 868 (errcode(ERRCODE_INVALID_PARAMETER_VALUE), 869 errmsg("weight out of range"))); 870 invws[i] = 1.0 / invws[i]; 871 } 872 873 qr.query = query; 874 qr.operandData = (QueryRepresentationOperand *) 875 palloc0(sizeof(QueryRepresentationOperand) * query->size); 876 877 doc = get_docrep(txt, &qr, &doclen); 878 if (!doc) 879 { 880 pfree(qr.operandData); 881 return 0.0; 882 } 883 884 MemSet(&ext, 0, sizeof(CoverExt)); 885 while (Cover(doc, doclen, &qr, &ext)) 886 { 887 double Cpos = 0.0; 888 double InvSum = 0.0; 889 int nNoise; 890 DocRepresentation *ptr = ext.begin; 891 892 while (ptr <= ext.end) 893 { 894 InvSum += invws[WEP_GETWEIGHT(ptr->pos)]; 895 ptr++; 896 } 897 898 Cpos = ((double) (ext.end - ext.begin + 1)) / InvSum; 899 900 /* 901 * if doc are big enough then ext.q may be equal to ext.p due to limit 902 * of positional information. In this case we approximate number of 903 * noise word as half cover's length 904 */ 905 nNoise = (ext.q - ext.p) - (ext.end - ext.begin); 906 if (nNoise < 0) 907 nNoise = (ext.end - ext.begin) / 2; 908 Wdoc += Cpos / ((double) (1 + nNoise)); 909 910 CurExtPos = ((double) (ext.q + ext.p)) / 2.0; 911 if (NExtent > 0 && CurExtPos > PrevExtPos /* prevent division by 912 * zero in a case of 913 * multiple lexize */ ) 914 SumDist += 1.0 / (CurExtPos - PrevExtPos); 915 916 PrevExtPos = CurExtPos; 917 NExtent++; 918 } 919 920 if ((method & RANK_NORM_LOGLENGTH) && txt->size > 0) 921 Wdoc /= log((double) (cnt_length(txt) + 1)); 922 923 if (method & RANK_NORM_LENGTH) 924 { 925 len = cnt_length(txt); 926 if (len > 0) 927 Wdoc /= (double) len; 928 } 929 930 if ((method & RANK_NORM_EXTDIST) && NExtent > 0 && SumDist > 0) 931 Wdoc /= ((double) NExtent) / SumDist; 932 933 if ((method & RANK_NORM_UNIQ) && txt->size > 0) 934 Wdoc /= (double) (txt->size); 935 936 if ((method & RANK_NORM_LOGUNIQ) && txt->size > 0) 937 Wdoc /= log((double) (txt->size + 1)) / log(2.0); 938 939 if (method & RANK_NORM_RDIVRPLUS1) 940 Wdoc /= (Wdoc + 1); 941 942 pfree(doc); 943 944 pfree(qr.operandData); 945 946 return (float4) Wdoc; 947 } 948 949 Datum 950 ts_rankcd_wttf(PG_FUNCTION_ARGS) 951 { 952 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); 953 TSVector txt = PG_GETARG_TSVECTOR(1); 954 TSQuery query = PG_GETARG_TSQUERY(2); 955 int method = PG_GETARG_INT32(3); 956 float res; 957 958 res = calc_rank_cd(getWeights(win), txt, query, method); 959 960 PG_FREE_IF_COPY(win, 0); 961 PG_FREE_IF_COPY(txt, 1); 962 PG_FREE_IF_COPY(query, 2); 963 PG_RETURN_FLOAT4(res); 964 } 965 966 Datum 967 ts_rankcd_wtt(PG_FUNCTION_ARGS) 968 { 969 ArrayType *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0)); 970 TSVector txt = PG_GETARG_TSVECTOR(1); 971 TSQuery query = PG_GETARG_TSQUERY(2); 972 float res; 973 974 res = calc_rank_cd(getWeights(win), txt, query, DEF_NORM_METHOD); 975 976 PG_FREE_IF_COPY(win, 0); 977 PG_FREE_IF_COPY(txt, 1); 978 PG_FREE_IF_COPY(query, 2); 979 PG_RETURN_FLOAT4(res); 980 } 981 982 Datum 983 ts_rankcd_ttf(PG_FUNCTION_ARGS) 984 { 985 TSVector txt = PG_GETARG_TSVECTOR(0); 986 TSQuery query = PG_GETARG_TSQUERY(1); 987 int method = PG_GETARG_INT32(2); 988 float res; 989 990 res = calc_rank_cd(getWeights(NULL), txt, query, method); 991 992 PG_FREE_IF_COPY(txt, 0); 993 PG_FREE_IF_COPY(query, 1); 994 PG_RETURN_FLOAT4(res); 995 } 996 997 Datum 998 ts_rankcd_tt(PG_FUNCTION_ARGS) 999 { 1000 TSVector txt = PG_GETARG_TSVECTOR(0); 1001 TSQuery query = PG_GETARG_TSQUERY(1); 1002 float res; 1003 1004 res = calc_rank_cd(getWeights(NULL), txt, query, DEF_NORM_METHOD); 1005 1006 PG_FREE_IF_COPY(txt, 0); 1007 PG_FREE_IF_COPY(query, 1); 1008 PG_RETURN_FLOAT4(res); 1009 } 1010