1 /*------------------------------------------------------------------------- 2 * 3 * to_tsany.c 4 * to_ts* function definitions 5 * 6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group 7 * 8 * 9 * IDENTIFICATION 10 * src/backend/tsearch/to_tsany.c 11 * 12 *------------------------------------------------------------------------- 13 */ 14 #include "postgres.h" 15 16 #include "tsearch/ts_cache.h" 17 #include "tsearch/ts_utils.h" 18 #include "utils/builtins.h" 19 #include "utils/jsonapi.h" 20 21 22 typedef struct MorphOpaque 23 { 24 Oid cfg_id; 25 int qoperator; /* query operator */ 26 } MorphOpaque; 27 28 typedef struct TSVectorBuildState 29 { 30 ParsedText *prs; 31 Oid cfgId; 32 } TSVectorBuildState; 33 34 static void add_to_tsvector(void *_state, char *elem_value, int elem_len); 35 36 37 Datum 38 get_current_ts_config(PG_FUNCTION_ARGS) 39 { 40 PG_RETURN_OID(getTSCurrentConfig(true)); 41 } 42 43 /* 44 * to_tsvector 45 */ 46 static int 47 compareWORD(const void *a, const void *b) 48 { 49 int res; 50 51 res = tsCompareString( 52 ((const ParsedWord *) a)->word, ((const ParsedWord *) a)->len, 53 ((const ParsedWord *) b)->word, ((const ParsedWord *) b)->len, 54 false); 55 56 if (res == 0) 57 { 58 if (((const ParsedWord *) a)->pos.pos == ((const ParsedWord *) b)->pos.pos) 59 return 0; 60 61 res = (((const ParsedWord *) a)->pos.pos > ((const ParsedWord *) b)->pos.pos) ? 1 : -1; 62 } 63 64 return res; 65 } 66 67 static int 68 uniqueWORD(ParsedWord *a, int32 l) 69 { 70 ParsedWord *ptr, 71 *res; 72 int tmppos; 73 74 if (l == 1) 75 { 76 tmppos = LIMITPOS(a->pos.pos); 77 a->alen = 2; 78 a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen); 79 a->pos.apos[0] = 1; 80 a->pos.apos[1] = tmppos; 81 return l; 82 } 83 84 res = a; 85 ptr = a + 1; 86 87 /* 88 * Sort words with its positions 89 */ 90 qsort((void *) a, l, sizeof(ParsedWord), compareWORD); 91 92 /* 93 * Initialize first word and its first position 94 */ 95 tmppos = LIMITPOS(a->pos.pos); 96 a->alen = 2; 97 a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen); 98 a->pos.apos[0] = 1; 99 a->pos.apos[1] = tmppos; 100 101 /* 102 * Summarize position information for each word 103 */ 104 while (ptr - a < l) 105 { 106 if (!(ptr->len == res->len && 107 strncmp(ptr->word, res->word, res->len) == 0)) 108 { 109 /* 110 * Got a new word, so put it in result 111 */ 112 res++; 113 res->len = ptr->len; 114 res->word = ptr->word; 115 tmppos = LIMITPOS(ptr->pos.pos); 116 res->alen = 2; 117 res->pos.apos = (uint16 *) palloc(sizeof(uint16) * res->alen); 118 res->pos.apos[0] = 1; 119 res->pos.apos[1] = tmppos; 120 } 121 else 122 { 123 /* 124 * The word already exists, so adjust position information. But 125 * before we should check size of position's array, max allowed 126 * value for position and uniqueness of position 127 */ 128 pfree(ptr->word); 129 if (res->pos.apos[0] < MAXNUMPOS - 1 && res->pos.apos[res->pos.apos[0]] != MAXENTRYPOS - 1 && 130 res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos)) 131 { 132 if (res->pos.apos[0] + 1 >= res->alen) 133 { 134 res->alen *= 2; 135 res->pos.apos = (uint16 *) repalloc(res->pos.apos, sizeof(uint16) * res->alen); 136 } 137 if (res->pos.apos[0] == 0 || res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos)) 138 { 139 res->pos.apos[res->pos.apos[0] + 1] = LIMITPOS(ptr->pos.pos); 140 res->pos.apos[0]++; 141 } 142 } 143 } 144 ptr++; 145 } 146 147 return res + 1 - a; 148 } 149 150 /* 151 * make value of tsvector, given parsed text 152 * 153 * Note: frees prs->words and subsidiary data. 154 */ 155 TSVector 156 make_tsvector(ParsedText *prs) 157 { 158 int i, 159 j, 160 lenstr = 0, 161 totallen; 162 TSVector in; 163 WordEntry *ptr; 164 char *str; 165 int stroff; 166 167 /* Merge duplicate words */ 168 if (prs->curwords > 0) 169 prs->curwords = uniqueWORD(prs->words, prs->curwords); 170 171 /* Determine space needed */ 172 for (i = 0; i < prs->curwords; i++) 173 { 174 lenstr += prs->words[i].len; 175 if (prs->words[i].alen) 176 { 177 lenstr = SHORTALIGN(lenstr); 178 lenstr += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos); 179 } 180 } 181 182 if (lenstr > MAXSTRPOS) 183 ereport(ERROR, 184 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), 185 errmsg("string is too long for tsvector (%d bytes, max %d bytes)", lenstr, MAXSTRPOS))); 186 187 totallen = CALCDATASIZE(prs->curwords, lenstr); 188 in = (TSVector) palloc0(totallen); 189 SET_VARSIZE(in, totallen); 190 in->size = prs->curwords; 191 192 ptr = ARRPTR(in); 193 str = STRPTR(in); 194 stroff = 0; 195 for (i = 0; i < prs->curwords; i++) 196 { 197 ptr->len = prs->words[i].len; 198 ptr->pos = stroff; 199 memcpy(str + stroff, prs->words[i].word, prs->words[i].len); 200 stroff += prs->words[i].len; 201 pfree(prs->words[i].word); 202 if (prs->words[i].alen) 203 { 204 int k = prs->words[i].pos.apos[0]; 205 WordEntryPos *wptr; 206 207 if (k > 0xFFFF) 208 elog(ERROR, "positions array too long"); 209 210 ptr->haspos = 1; 211 stroff = SHORTALIGN(stroff); 212 *(uint16 *) (str + stroff) = (uint16) k; 213 wptr = POSDATAPTR(in, ptr); 214 for (j = 0; j < k; j++) 215 { 216 WEP_SETWEIGHT(wptr[j], 0); 217 WEP_SETPOS(wptr[j], prs->words[i].pos.apos[j + 1]); 218 } 219 stroff += sizeof(uint16) + k * sizeof(WordEntryPos); 220 pfree(prs->words[i].pos.apos); 221 } 222 else 223 ptr->haspos = 0; 224 ptr++; 225 } 226 227 if (prs->words) 228 pfree(prs->words); 229 230 return in; 231 } 232 233 Datum 234 to_tsvector_byid(PG_FUNCTION_ARGS) 235 { 236 Oid cfgId = PG_GETARG_OID(0); 237 text *in = PG_GETARG_TEXT_PP(1); 238 ParsedText prs; 239 TSVector out; 240 241 prs.lenwords = VARSIZE_ANY_EXHDR(in) / 6; /* just estimation of word's 242 * number */ 243 if (prs.lenwords < 2) 244 prs.lenwords = 2; 245 prs.curwords = 0; 246 prs.pos = 0; 247 prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords); 248 249 parsetext(cfgId, &prs, VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in)); 250 251 PG_FREE_IF_COPY(in, 1); 252 253 out = make_tsvector(&prs); 254 255 PG_RETURN_TSVECTOR(out); 256 } 257 258 Datum 259 to_tsvector(PG_FUNCTION_ARGS) 260 { 261 text *in = PG_GETARG_TEXT_PP(0); 262 Oid cfgId; 263 264 cfgId = getTSCurrentConfig(true); 265 PG_RETURN_DATUM(DirectFunctionCall2(to_tsvector_byid, 266 ObjectIdGetDatum(cfgId), 267 PointerGetDatum(in))); 268 } 269 270 /* 271 * Worker function for jsonb(_string)_to_tsvector(_byid) 272 */ 273 static TSVector 274 jsonb_to_tsvector_worker(Oid cfgId, Jsonb *jb, uint32 flags) 275 { 276 TSVectorBuildState state; 277 ParsedText prs; 278 279 prs.words = NULL; 280 prs.curwords = 0; 281 state.prs = &prs; 282 state.cfgId = cfgId; 283 284 iterate_jsonb_values(jb, flags, &state, add_to_tsvector); 285 286 return make_tsvector(&prs); 287 } 288 289 Datum 290 jsonb_string_to_tsvector_byid(PG_FUNCTION_ARGS) 291 { 292 Oid cfgId = PG_GETARG_OID(0); 293 Jsonb *jb = PG_GETARG_JSONB_P(1); 294 TSVector result; 295 296 result = jsonb_to_tsvector_worker(cfgId, jb, jtiString); 297 PG_FREE_IF_COPY(jb, 1); 298 299 PG_RETURN_TSVECTOR(result); 300 } 301 302 Datum 303 jsonb_string_to_tsvector(PG_FUNCTION_ARGS) 304 { 305 Jsonb *jb = PG_GETARG_JSONB_P(0); 306 Oid cfgId; 307 TSVector result; 308 309 cfgId = getTSCurrentConfig(true); 310 result = jsonb_to_tsvector_worker(cfgId, jb, jtiString); 311 PG_FREE_IF_COPY(jb, 0); 312 313 PG_RETURN_TSVECTOR(result); 314 } 315 316 Datum 317 jsonb_to_tsvector_byid(PG_FUNCTION_ARGS) 318 { 319 Oid cfgId = PG_GETARG_OID(0); 320 Jsonb *jb = PG_GETARG_JSONB_P(1); 321 Jsonb *jbFlags = PG_GETARG_JSONB_P(2); 322 TSVector result; 323 uint32 flags = parse_jsonb_index_flags(jbFlags); 324 325 result = jsonb_to_tsvector_worker(cfgId, jb, flags); 326 PG_FREE_IF_COPY(jb, 1); 327 PG_FREE_IF_COPY(jbFlags, 2); 328 329 PG_RETURN_TSVECTOR(result); 330 } 331 332 Datum 333 jsonb_to_tsvector(PG_FUNCTION_ARGS) 334 { 335 Jsonb *jb = PG_GETARG_JSONB_P(0); 336 Jsonb *jbFlags = PG_GETARG_JSONB_P(1); 337 Oid cfgId; 338 TSVector result; 339 uint32 flags = parse_jsonb_index_flags(jbFlags); 340 341 cfgId = getTSCurrentConfig(true); 342 result = jsonb_to_tsvector_worker(cfgId, jb, flags); 343 PG_FREE_IF_COPY(jb, 0); 344 PG_FREE_IF_COPY(jbFlags, 1); 345 346 PG_RETURN_TSVECTOR(result); 347 } 348 349 /* 350 * Worker function for json(_string)_to_tsvector(_byid) 351 */ 352 static TSVector 353 json_to_tsvector_worker(Oid cfgId, text *json, uint32 flags) 354 { 355 TSVectorBuildState state; 356 ParsedText prs; 357 358 prs.words = NULL; 359 prs.curwords = 0; 360 state.prs = &prs; 361 state.cfgId = cfgId; 362 363 iterate_json_values(json, flags, &state, add_to_tsvector); 364 365 return make_tsvector(&prs); 366 } 367 368 Datum 369 json_string_to_tsvector_byid(PG_FUNCTION_ARGS) 370 { 371 Oid cfgId = PG_GETARG_OID(0); 372 text *json = PG_GETARG_TEXT_P(1); 373 TSVector result; 374 375 result = json_to_tsvector_worker(cfgId, json, jtiString); 376 PG_FREE_IF_COPY(json, 1); 377 378 PG_RETURN_TSVECTOR(result); 379 } 380 381 Datum 382 json_string_to_tsvector(PG_FUNCTION_ARGS) 383 { 384 text *json = PG_GETARG_TEXT_P(0); 385 Oid cfgId; 386 TSVector result; 387 388 cfgId = getTSCurrentConfig(true); 389 result = json_to_tsvector_worker(cfgId, json, jtiString); 390 PG_FREE_IF_COPY(json, 0); 391 392 PG_RETURN_TSVECTOR(result); 393 } 394 395 Datum 396 json_to_tsvector_byid(PG_FUNCTION_ARGS) 397 { 398 Oid cfgId = PG_GETARG_OID(0); 399 text *json = PG_GETARG_TEXT_P(1); 400 Jsonb *jbFlags = PG_GETARG_JSONB_P(2); 401 TSVector result; 402 uint32 flags = parse_jsonb_index_flags(jbFlags); 403 404 result = json_to_tsvector_worker(cfgId, json, flags); 405 PG_FREE_IF_COPY(json, 1); 406 PG_FREE_IF_COPY(jbFlags, 2); 407 408 PG_RETURN_TSVECTOR(result); 409 } 410 411 Datum 412 json_to_tsvector(PG_FUNCTION_ARGS) 413 { 414 text *json = PG_GETARG_TEXT_P(0); 415 Jsonb *jbFlags = PG_GETARG_JSONB_P(1); 416 Oid cfgId; 417 TSVector result; 418 uint32 flags = parse_jsonb_index_flags(jbFlags); 419 420 cfgId = getTSCurrentConfig(true); 421 result = json_to_tsvector_worker(cfgId, json, flags); 422 PG_FREE_IF_COPY(json, 0); 423 PG_FREE_IF_COPY(jbFlags, 1); 424 425 PG_RETURN_TSVECTOR(result); 426 } 427 428 /* 429 * Parse lexemes in an element of a json(b) value, add to TSVectorBuildState. 430 */ 431 static void 432 add_to_tsvector(void *_state, char *elem_value, int elem_len) 433 { 434 TSVectorBuildState *state = (TSVectorBuildState *) _state; 435 ParsedText *prs = state->prs; 436 int32 prevwords; 437 438 if (prs->words == NULL) 439 { 440 /* 441 * First time through: initialize words array to a reasonable size. 442 * (parsetext() will realloc it bigger as needed.) 443 */ 444 prs->lenwords = 16; 445 prs->words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs->lenwords); 446 prs->curwords = 0; 447 prs->pos = 0; 448 } 449 450 prevwords = prs->curwords; 451 452 parsetext(state->cfgId, prs, elem_value, elem_len); 453 454 /* 455 * If we extracted any words from this JSON element, advance pos to create 456 * an artificial break between elements. This is because we don't want 457 * phrase searches to think that the last word in this element is adjacent 458 * to the first word in the next one. 459 */ 460 if (prs->curwords > prevwords) 461 prs->pos += 1; 462 } 463 464 465 /* 466 * to_tsquery 467 */ 468 469 470 /* 471 * This function is used for morph parsing. 472 * 473 * The value is passed to parsetext which will call the right dictionary to 474 * lexize the word. If it turns out to be a stopword, we push a QI_VALSTOP 475 * to the stack. 476 * 477 * All words belonging to the same variant are pushed as an ANDed list, 478 * and different variants are ORed together. 479 */ 480 static void 481 pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval, int16 weight, bool prefix) 482 { 483 int32 count = 0; 484 ParsedText prs; 485 uint32 variant, 486 pos = 0, 487 cntvar = 0, 488 cntpos = 0, 489 cnt = 0; 490 MorphOpaque *data = (MorphOpaque *) DatumGetPointer(opaque); 491 492 prs.lenwords = 4; 493 prs.curwords = 0; 494 prs.pos = 0; 495 prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords); 496 497 parsetext(data->cfg_id, &prs, strval, lenval); 498 499 if (prs.curwords > 0) 500 { 501 while (count < prs.curwords) 502 { 503 /* 504 * Were any stop words removed? If so, fill empty positions with 505 * placeholders linked by an appropriate operator. 506 */ 507 if (pos > 0 && pos + 1 < prs.words[count].pos.pos) 508 { 509 while (pos + 1 < prs.words[count].pos.pos) 510 { 511 /* put placeholders for each missing stop word */ 512 pushStop(state); 513 if (cntpos) 514 pushOperator(state, data->qoperator, 1); 515 cntpos++; 516 pos++; 517 } 518 } 519 520 /* save current word's position */ 521 pos = prs.words[count].pos.pos; 522 523 /* Go through all variants obtained from this token */ 524 cntvar = 0; 525 while (count < prs.curwords && pos == prs.words[count].pos.pos) 526 { 527 variant = prs.words[count].nvariant; 528 529 /* Push all words belonging to the same variant */ 530 cnt = 0; 531 while (count < prs.curwords && 532 pos == prs.words[count].pos.pos && 533 variant == prs.words[count].nvariant) 534 { 535 pushValue(state, 536 prs.words[count].word, 537 prs.words[count].len, 538 weight, 539 ((prs.words[count].flags & TSL_PREFIX) || prefix)); 540 pfree(prs.words[count].word); 541 if (cnt) 542 pushOperator(state, OP_AND, 0); 543 cnt++; 544 count++; 545 } 546 547 if (cntvar) 548 pushOperator(state, OP_OR, 0); 549 cntvar++; 550 } 551 552 if (cntpos) 553 { 554 /* distance may be useful */ 555 pushOperator(state, data->qoperator, 1); 556 } 557 558 cntpos++; 559 } 560 561 pfree(prs.words); 562 563 } 564 else 565 pushStop(state); 566 } 567 568 Datum 569 to_tsquery_byid(PG_FUNCTION_ARGS) 570 { 571 text *in = PG_GETARG_TEXT_PP(1); 572 TSQuery query; 573 MorphOpaque data; 574 575 data.cfg_id = PG_GETARG_OID(0); 576 data.qoperator = OP_AND; 577 578 query = parse_tsquery(text_to_cstring(in), 579 pushval_morph, 580 PointerGetDatum(&data), 581 0); 582 583 PG_RETURN_TSQUERY(query); 584 } 585 586 Datum 587 to_tsquery(PG_FUNCTION_ARGS) 588 { 589 text *in = PG_GETARG_TEXT_PP(0); 590 Oid cfgId; 591 592 cfgId = getTSCurrentConfig(true); 593 PG_RETURN_DATUM(DirectFunctionCall2(to_tsquery_byid, 594 ObjectIdGetDatum(cfgId), 595 PointerGetDatum(in))); 596 } 597 598 Datum 599 plainto_tsquery_byid(PG_FUNCTION_ARGS) 600 { 601 text *in = PG_GETARG_TEXT_PP(1); 602 TSQuery query; 603 MorphOpaque data; 604 605 data.cfg_id = PG_GETARG_OID(0); 606 data.qoperator = OP_AND; 607 608 query = parse_tsquery(text_to_cstring(in), 609 pushval_morph, 610 PointerGetDatum(&data), 611 P_TSQ_PLAIN); 612 613 PG_RETURN_POINTER(query); 614 } 615 616 Datum 617 plainto_tsquery(PG_FUNCTION_ARGS) 618 { 619 text *in = PG_GETARG_TEXT_PP(0); 620 Oid cfgId; 621 622 cfgId = getTSCurrentConfig(true); 623 PG_RETURN_DATUM(DirectFunctionCall2(plainto_tsquery_byid, 624 ObjectIdGetDatum(cfgId), 625 PointerGetDatum(in))); 626 } 627 628 629 Datum 630 phraseto_tsquery_byid(PG_FUNCTION_ARGS) 631 { 632 text *in = PG_GETARG_TEXT_PP(1); 633 TSQuery query; 634 MorphOpaque data; 635 636 data.cfg_id = PG_GETARG_OID(0); 637 data.qoperator = OP_PHRASE; 638 639 query = parse_tsquery(text_to_cstring(in), 640 pushval_morph, 641 PointerGetDatum(&data), 642 P_TSQ_PLAIN); 643 644 PG_RETURN_TSQUERY(query); 645 } 646 647 Datum 648 phraseto_tsquery(PG_FUNCTION_ARGS) 649 { 650 text *in = PG_GETARG_TEXT_PP(0); 651 Oid cfgId; 652 653 cfgId = getTSCurrentConfig(true); 654 PG_RETURN_DATUM(DirectFunctionCall2(phraseto_tsquery_byid, 655 ObjectIdGetDatum(cfgId), 656 PointerGetDatum(in))); 657 } 658 659 Datum 660 websearch_to_tsquery_byid(PG_FUNCTION_ARGS) 661 { 662 text *in = PG_GETARG_TEXT_PP(1); 663 MorphOpaque data; 664 TSQuery query = NULL; 665 666 data.cfg_id = PG_GETARG_OID(0); 667 668 data.qoperator = OP_AND; 669 670 query = parse_tsquery(text_to_cstring(in), 671 pushval_morph, 672 PointerGetDatum(&data), 673 P_TSQ_WEB); 674 675 PG_RETURN_TSQUERY(query); 676 } 677 678 Datum 679 websearch_to_tsquery(PG_FUNCTION_ARGS) 680 { 681 text *in = PG_GETARG_TEXT_PP(0); 682 Oid cfgId; 683 684 cfgId = getTSCurrentConfig(true); 685 PG_RETURN_DATUM(DirectFunctionCall2(websearch_to_tsquery_byid, 686 ObjectIdGetDatum(cfgId), 687 PointerGetDatum(in))); 688 689 } 690