1 /*------------------------------------------------------------------------- 2 * 3 * tsvector.c 4 * I/O functions for tsvector 5 * 6 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group 7 * 8 * 9 * IDENTIFICATION 10 * src/backend/utils/adt/tsvector.c 11 * 12 *------------------------------------------------------------------------- 13 */ 14 15 #include "postgres.h" 16 17 #include "libpq/pqformat.h" 18 #include "tsearch/ts_locale.h" 19 #include "tsearch/ts_utils.h" 20 #include "utils/builtins.h" 21 #include "utils/memutils.h" 22 23 typedef struct 24 { 25 WordEntry entry; /* must be first! */ 26 WordEntryPos *pos; 27 int poslen; /* number of elements in pos */ 28 } WordEntryIN; 29 30 31 /* Compare two WordEntryPos values for qsort */ 32 int 33 compareWordEntryPos(const void *a, const void *b) 34 { 35 int apos = WEP_GETPOS(*(const WordEntryPos *) a); 36 int bpos = WEP_GETPOS(*(const WordEntryPos *) b); 37 38 if (apos == bpos) 39 return 0; 40 return (apos > bpos) ? 1 : -1; 41 } 42 43 /* 44 * Removes duplicate pos entries. If there's two entries with same pos but 45 * different weight, the higher weight is retained, so we can't use 46 * qunique here. 47 * 48 * Returns new length. 49 */ 50 static int 51 uniquePos(WordEntryPos *a, int l) 52 { 53 WordEntryPos *ptr, 54 *res; 55 56 if (l <= 1) 57 return l; 58 59 qsort((void *) a, l, sizeof(WordEntryPos), compareWordEntryPos); 60 AudioMidiSyncHelper(float ** const o,uint32_t f,const MidiEvent * m,uint32_t mc)61 res = a; 62 ptr = a + 1; 63 while (ptr - a < l) 64 { 65 if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res)) 66 { 67 res++; 68 *res = *ptr; 69 if (res - a >= MAXNUMPOS - 1 || 70 WEP_GETPOS(*res) == MAXENTRYPOS - 1) 71 break; 72 } 73 else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res)) nextEvent()74 WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr)); 75 ptr++; 76 } 77 78 return res + 1 - a; 79 } 80 81 /* Compare two WordEntryIN values for qsort */ 82 static int 83 compareentry(const void *va, const void *vb, void *arg) 84 { 85 const WordEntryIN *a = (const WordEntryIN *) va; 86 const WordEntryIN *b = (const WordEntryIN *) vb; 87 char *BufferStr = (char *) arg; 88 89 return tsCompareString(&BufferStr[a->entry.pos], a->entry.len, 90 &BufferStr[b->entry.pos], b->entry.len, 91 false); 92 } 93 94 /* 95 * Sort an array of WordEntryIN, remove duplicates. 96 * *outbuflen receives the amount of space needed for strings and positions. 97 */ 98 static int 99 uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen) 100 { 101 int buflen; 102 WordEntryIN *ptr, 103 *res; 104 105 Assert(l >= 1); 106 107 if (l > 1) 108 qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry, 109 (void *) buf); 110 111 buflen = 0; 112 res = a; 113 ptr = a + 1; 114 while (ptr - a < l) 115 { 116 if (!(ptr->entry.len == res->entry.len && 117 strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos], 118 res->entry.len) == 0)) 119 { 120 /* done accumulating data into *res, count space needed */ 121 buflen += res->entry.len; 122 if (res->entry.haspos) 123 { 124 res->poslen = uniquePos(res->pos, res->poslen); 125 buflen = SHORTALIGN(buflen); 126 buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16); 127 } 128 res++; 129 if (res != ptr) 130 memcpy(res, ptr, sizeof(WordEntryIN)); 131 } 132 else if (ptr->entry.haspos) 133 { 134 if (res->entry.haspos) 135 { 136 /* append ptr's positions to res's positions */ 137 int newlen = ptr->poslen + res->poslen; 138 139 res->pos = (WordEntryPos *) 140 repalloc(res->pos, newlen * sizeof(WordEntryPos)); 141 memcpy(&res->pos[res->poslen], ptr->pos, 142 ptr->poslen * sizeof(WordEntryPos)); 143 res->poslen = newlen; 144 pfree(ptr->pos); 145 } 146 else 147 { 148 /* just give ptr's positions to pos */ 149 res->entry.haspos = 1; 150 res->pos = ptr->pos; 151 res->poslen = ptr->poslen; 152 } 153 } 154 ptr++; 155 } 156 157 /* count space needed for last item */ 158 buflen += res->entry.len; 159 if (res->entry.haspos) 160 { 161 res->poslen = uniquePos(res->pos, res->poslen); 162 buflen = SHORTALIGN(buflen); 163 buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16); 164 } 165 166 *outbuflen = buflen; 167 return res + 1 - a; 168 } 169 170 static int 171 WordEntryCMP(WordEntry *a, WordEntry *b, char *buf) 172 { 173 return compareentry(a, b, buf); 174 } 175 176 177 Datum 178 tsvectorin(PG_FUNCTION_ARGS) 179 { 180 char *buf = PG_GETARG_CSTRING(0); 181 TSVectorParseState state; 182 WordEntryIN *arr; 183 int totallen; 184 int arrlen; /* allocated size of arr */ 185 WordEntry *inarr; 186 int len = 0; 187 TSVector in; 188 int i; 189 char *token; 190 int toklen; 191 WordEntryPos *pos; 192 int poslen; 193 char *strbuf; 194 int stroff; 195 196 /* 197 * Tokens are appended to tmpbuf, cur is a pointer to the end of used 198 * space in tmpbuf. 199 */ 200 char *tmpbuf; 201 char *cur; 202 int buflen = 256; /* allocated size of tmpbuf */ 203 204 state = init_tsvector_parser(buf, 0); 205 206 arrlen = 64; 207 arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen); 208 cur = tmpbuf = (char *) palloc(buflen); 209 210 while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL)) 211 { 212 if (toklen >= MAXSTRLEN) 213 ereport(ERROR, 214 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), 215 errmsg("word is too long (%ld bytes, max %ld bytes)", 216 (long) toklen, 217 (long) (MAXSTRLEN - 1)))); 218 219 if (cur - tmpbuf > MAXSTRPOS) 220 ereport(ERROR, 221 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), 222 errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)", 223 (long) (cur - tmpbuf), (long) MAXSTRPOS))); 224 225 /* 226 * Enlarge buffers if needed 227 */ 228 if (len >= arrlen) 229 { 230 arrlen *= 2; 231 arr = (WordEntryIN *) 232 repalloc((void *) arr, sizeof(WordEntryIN) * arrlen); 233 } 234 while ((cur - tmpbuf) + toklen >= buflen) 235 { 236 int dist = cur - tmpbuf; 237 238 buflen *= 2; 239 tmpbuf = (char *) repalloc((void *) tmpbuf, buflen); 240 cur = tmpbuf + dist; 241 } 242 arr[len].entry.len = toklen; 243 arr[len].entry.pos = cur - tmpbuf; 244 memcpy((void *) cur, (void *) token, toklen); 245 cur += toklen; 246 247 if (poslen != 0) 248 { 249 arr[len].entry.haspos = 1; 250 arr[len].pos = pos; 251 arr[len].poslen = poslen; 252 } 253 else 254 { 255 arr[len].entry.haspos = 0; 256 arr[len].pos = NULL; 257 arr[len].poslen = 0; 258 } 259 len++; 260 } 261 262 close_tsvector_parser(state); 263 264 if (len > 0) 265 len = uniqueentry(arr, len, tmpbuf, &buflen); 266 else 267 buflen = 0; 268 269 if (buflen > MAXSTRPOS) 270 ereport(ERROR, 271 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), 272 errmsg("string is too long for tsvector (%d bytes, max %d bytes)", buflen, MAXSTRPOS))); 273 274 totallen = CALCDATASIZE(len, buflen); 275 in = (TSVector) palloc0(totallen); 276 SET_VARSIZE(in, totallen); 277 in->size = len; 278 inarr = ARRPTR(in); 279 strbuf = STRPTR(in); 280 stroff = 0; 281 for (i = 0; i < len; i++) 282 { 283 memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len); 284 arr[i].entry.pos = stroff; 285 stroff += arr[i].entry.len; 286 if (arr[i].entry.haspos) 287 { 288 if (arr[i].poslen > 0xFFFF) 289 elog(ERROR, "positions array too long"); 290 291 /* Copy number of positions */ 292 stroff = SHORTALIGN(stroff); 293 *(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen; 294 stroff += sizeof(uint16); 295 296 /* Copy positions */ 297 memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos)); 298 stroff += arr[i].poslen * sizeof(WordEntryPos); 299 300 pfree(arr[i].pos); 301 } 302 inarr[i] = arr[i].entry; 303 } 304 305 Assert((strbuf + stroff - (char *) in) == totallen); 306 307 PG_RETURN_TSVECTOR(in); 308 } 309 310 Datum 311 tsvectorout(PG_FUNCTION_ARGS) 312 { 313 TSVector out = PG_GETARG_TSVECTOR(0); 314 char *outbuf; 315 int32 i, 316 lenbuf = 0, 317 pp; 318 WordEntry *ptr = ARRPTR(out); 319 char *curbegin, 320 *curin, 321 *curout; 322 323 lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ; 324 for (i = 0; i < out->size; i++) 325 { 326 lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ; 327 if (ptr[i].haspos) 328 lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i])); 329 } 330 331 curout = outbuf = (char *) palloc(lenbuf); 332 for (i = 0; i < out->size; i++) 333 { 334 curbegin = curin = STRPTR(out) + ptr->pos; 335 if (i != 0) 336 *curout++ = ' '; 337 *curout++ = '\''; 338 while (curin - curbegin < ptr->len) 339 { 340 int len = pg_mblen(curin); 341 342 if (t_iseq(curin, '\'')) 343 *curout++ = '\''; 344 else if (t_iseq(curin, '\\')) 345 *curout++ = '\\'; 346 347 while (len--) 348 *curout++ = *curin++; 349 } 350 351 *curout++ = '\''; 352 if ((pp = POSDATALEN(out, ptr)) != 0) 353 { 354 WordEntryPos *wptr; 355 356 *curout++ = ':'; 357 wptr = POSDATAPTR(out, ptr); 358 while (pp) 359 { 360 curout += sprintf(curout, "%d", WEP_GETPOS(*wptr)); 361 switch (WEP_GETWEIGHT(*wptr)) 362 { 363 case 3: 364 *curout++ = 'A'; 365 break; 366 case 2: 367 *curout++ = 'B'; 368 break; 369 case 1: 370 *curout++ = 'C'; 371 break; 372 case 0: 373 default: 374 break; 375 } 376 377 if (pp > 1) 378 *curout++ = ','; 379 pp--; 380 wptr++; 381 } 382 } 383 ptr++; 384 } 385 386 *curout = '\0'; 387 PG_FREE_IF_COPY(out, 0); 388 PG_RETURN_CSTRING(outbuf); 389 } 390 391 /* 392 * Binary Input / Output functions. The binary format is as follows: 393 * 394 * uint32 number of lexemes 395 * 396 * for each lexeme: 397 * lexeme text in client encoding, null-terminated 398 * uint16 number of positions 399 * for each position: 400 * uint16 WordEntryPos 401 */ 402 403 Datum 404 tsvectorsend(PG_FUNCTION_ARGS) 405 { 406 TSVector vec = PG_GETARG_TSVECTOR(0); 407 StringInfoData buf; 408 int i, 409 j; 410 WordEntry *weptr = ARRPTR(vec); 411 412 pq_begintypsend(&buf); 413 414 pq_sendint32(&buf, vec->size); 415 for (i = 0; i < vec->size; i++) 416 { 417 uint16 npos; 418 419 /* 420 * the strings in the TSVector array are not null-terminated, so we 421 * have to send the null-terminator separately 422 */ 423 pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len); 424 pq_sendbyte(&buf, '\0'); 425 426 npos = POSDATALEN(vec, weptr); 427 pq_sendint16(&buf, npos); 428 429 if (npos > 0) 430 { 431 WordEntryPos *wepptr = POSDATAPTR(vec, weptr); 432 433 for (j = 0; j < npos; j++) 434 pq_sendint16(&buf, wepptr[j]); 435 } 436 weptr++; 437 } 438 439 PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); 440 } 441 442 Datum 443 tsvectorrecv(PG_FUNCTION_ARGS) 444 { 445 StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); 446 TSVector vec; 447 int i; 448 int32 nentries; 449 int datalen; /* number of bytes used in the variable size 450 * area after fixed size TSVector header and 451 * WordEntries */ 452 Size hdrlen; 453 Size len; /* allocated size of vec */ 454 bool needSort = false; 455 456 nentries = pq_getmsgint(buf, sizeof(int32)); 457 if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry))) 458 elog(ERROR, "invalid size of tsvector"); 459 460 hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries; 461 462 len = hdrlen * 2; /* times two to make room for lexemes */ 463 vec = (TSVector) palloc0(len); 464 vec->size = nentries; 465 466 datalen = 0; 467 for (i = 0; i < nentries; i++) 468 { 469 const char *lexeme; 470 uint16 npos; 471 size_t lex_len; 472 473 lexeme = pq_getmsgstring(buf); 474 npos = (uint16) pq_getmsgint(buf, sizeof(uint16)); 475 476 /* sanity checks */ 477 478 lex_len = strlen(lexeme); 479 if (lex_len > MAXSTRLEN) 480 elog(ERROR, "invalid tsvector: lexeme too long"); 481 482 if (datalen > MAXSTRPOS) 483 elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded"); 484 485 if (npos > MAXNUMPOS) 486 elog(ERROR, "unexpected number of tsvector positions"); 487 488 /* 489 * Looks valid. Fill the WordEntry struct, and copy lexeme. 490 * 491 * But make sure the buffer is large enough first. 492 */ 493 while (hdrlen + SHORTALIGN(datalen + lex_len) + 494 (npos + 1) * sizeof(WordEntryPos) >= len) 495 { 496 len *= 2; 497 vec = (TSVector) repalloc(vec, len); 498 } 499 500 vec->entries[i].haspos = (npos > 0) ? 1 : 0; 501 vec->entries[i].len = lex_len; 502 vec->entries[i].pos = datalen; 503 504 memcpy(STRPTR(vec) + datalen, lexeme, lex_len); 505 506 datalen += lex_len; 507 508 if (i > 0 && WordEntryCMP(&vec->entries[i], 509 &vec->entries[i - 1], 510 STRPTR(vec)) <= 0) 511 needSort = true; 512 513 /* Receive positions */ 514 if (npos > 0) 515 { 516 uint16 j; 517 WordEntryPos *wepptr; 518 519 /* 520 * Pad to 2-byte alignment if necessary. Though we used palloc0 521 * for the initial allocation, subsequent repalloc'd memory areas 522 * are not initialized to zero. 523 */ 524 if (datalen != SHORTALIGN(datalen)) 525 { 526 *(STRPTR(vec) + datalen) = '\0'; 527 datalen = SHORTALIGN(datalen); 528 } 529 530 memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16)); 531 532 wepptr = POSDATAPTR(vec, &vec->entries[i]); 533 for (j = 0; j < npos; j++) 534 { 535 wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos)); 536 if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1])) 537 elog(ERROR, "position information is misordered"); 538 } 539 540 datalen += (npos + 1) * sizeof(WordEntry); 541 } 542 } 543 544 SET_VARSIZE(vec, hdrlen + datalen); 545 546 if (needSort) 547 qsort_arg((void *) ARRPTR(vec), vec->size, sizeof(WordEntry), 548 compareentry, (void *) STRPTR(vec)); 549 550 PG_RETURN_TSVECTOR(vec); 551 } 552