1 /*-------------------------------------------------------------------------
2 *
3 * tsvector.c
4 * I/O functions for tsvector
5 *
6 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/utils/adt/tsvector.c
11 *
12 *-------------------------------------------------------------------------
13 */
14
15 #include "postgres.h"
16
17 #include "libpq/pqformat.h"
18 #include "tsearch/ts_locale.h"
19 #include "tsearch/ts_utils.h"
20 #include "utils/builtins.h"
21 #include "utils/memutils.h"
22
23 typedef struct
24 {
25 WordEntry entry; /* must be first! */
26 WordEntryPos *pos;
27 int poslen; /* number of elements in pos */
28 } WordEntryIN;
29
30
31 /* Compare two WordEntryPos values for qsort */
32 int
compareWordEntryPos(const void * a,const void * b)33 compareWordEntryPos(const void *a, const void *b)
34 {
35 int apos = WEP_GETPOS(*(const WordEntryPos *) a);
36 int bpos = WEP_GETPOS(*(const WordEntryPos *) b);
37
38 if (apos == bpos)
39 return 0;
40 return (apos > bpos) ? 1 : -1;
41 }
42
43 /*
44 * Removes duplicate pos entries. If there's two entries with same pos but
45 * different weight, the higher weight is retained, so we can't use
46 * qunique here.
47 *
48 * Returns new length.
49 */
50 static int
uniquePos(WordEntryPos * a,int l)51 uniquePos(WordEntryPos *a, int l)
52 {
53 WordEntryPos *ptr,
54 *res;
55
56 if (l <= 1)
57 return l;
58
59 qsort((void *) a, l, sizeof(WordEntryPos), compareWordEntryPos);
60
61 res = a;
62 ptr = a + 1;
63 while (ptr - a < l)
64 {
65 if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))
66 {
67 res++;
68 *res = *ptr;
69 if (res - a >= MAXNUMPOS - 1 ||
70 WEP_GETPOS(*res) == MAXENTRYPOS - 1)
71 break;
72 }
73 else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
74 WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr));
75 ptr++;
76 }
77
78 return res + 1 - a;
79 }
80
81 /* Compare two WordEntryIN values for qsort */
82 static int
compareentry(const void * va,const void * vb,void * arg)83 compareentry(const void *va, const void *vb, void *arg)
84 {
85 const WordEntryIN *a = (const WordEntryIN *) va;
86 const WordEntryIN *b = (const WordEntryIN *) vb;
87 char *BufferStr = (char *) arg;
88
89 return tsCompareString(&BufferStr[a->entry.pos], a->entry.len,
90 &BufferStr[b->entry.pos], b->entry.len,
91 false);
92 }
93
94 /*
95 * Sort an array of WordEntryIN, remove duplicates.
96 * *outbuflen receives the amount of space needed for strings and positions.
97 */
98 static int
uniqueentry(WordEntryIN * a,int l,char * buf,int * outbuflen)99 uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
100 {
101 int buflen;
102 WordEntryIN *ptr,
103 *res;
104
105 Assert(l >= 1);
106
107 if (l > 1)
108 qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry,
109 (void *) buf);
110
111 buflen = 0;
112 res = a;
113 ptr = a + 1;
114 while (ptr - a < l)
115 {
116 if (!(ptr->entry.len == res->entry.len &&
117 strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
118 res->entry.len) == 0))
119 {
120 /* done accumulating data into *res, count space needed */
121 buflen += res->entry.len;
122 if (res->entry.haspos)
123 {
124 res->poslen = uniquePos(res->pos, res->poslen);
125 buflen = SHORTALIGN(buflen);
126 buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
127 }
128 res++;
129 if (res != ptr)
130 memcpy(res, ptr, sizeof(WordEntryIN));
131 }
132 else if (ptr->entry.haspos)
133 {
134 if (res->entry.haspos)
135 {
136 /* append ptr's positions to res's positions */
137 int newlen = ptr->poslen + res->poslen;
138
139 res->pos = (WordEntryPos *)
140 repalloc(res->pos, newlen * sizeof(WordEntryPos));
141 memcpy(&res->pos[res->poslen], ptr->pos,
142 ptr->poslen * sizeof(WordEntryPos));
143 res->poslen = newlen;
144 pfree(ptr->pos);
145 }
146 else
147 {
148 /* just give ptr's positions to pos */
149 res->entry.haspos = 1;
150 res->pos = ptr->pos;
151 res->poslen = ptr->poslen;
152 }
153 }
154 ptr++;
155 }
156
157 /* count space needed for last item */
158 buflen += res->entry.len;
159 if (res->entry.haspos)
160 {
161 res->poslen = uniquePos(res->pos, res->poslen);
162 buflen = SHORTALIGN(buflen);
163 buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
164 }
165
166 *outbuflen = buflen;
167 return res + 1 - a;
168 }
169
170 static int
WordEntryCMP(WordEntry * a,WordEntry * b,char * buf)171 WordEntryCMP(WordEntry *a, WordEntry *b, char *buf)
172 {
173 return compareentry(a, b, buf);
174 }
175
176
177 Datum
tsvectorin(PG_FUNCTION_ARGS)178 tsvectorin(PG_FUNCTION_ARGS)
179 {
180 char *buf = PG_GETARG_CSTRING(0);
181 TSVectorParseState state;
182 WordEntryIN *arr;
183 int totallen;
184 int arrlen; /* allocated size of arr */
185 WordEntry *inarr;
186 int len = 0;
187 TSVector in;
188 int i;
189 char *token;
190 int toklen;
191 WordEntryPos *pos;
192 int poslen;
193 char *strbuf;
194 int stroff;
195
196 /*
197 * Tokens are appended to tmpbuf, cur is a pointer to the end of used
198 * space in tmpbuf.
199 */
200 char *tmpbuf;
201 char *cur;
202 int buflen = 256; /* allocated size of tmpbuf */
203
204 state = init_tsvector_parser(buf, 0);
205
206 arrlen = 64;
207 arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);
208 cur = tmpbuf = (char *) palloc(buflen);
209
210 while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
211 {
212 if (toklen >= MAXSTRLEN)
213 ereport(ERROR,
214 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
215 errmsg("word is too long (%ld bytes, max %ld bytes)",
216 (long) toklen,
217 (long) (MAXSTRLEN - 1))));
218
219 if (cur - tmpbuf > MAXSTRPOS)
220 ereport(ERROR,
221 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
222 errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)",
223 (long) (cur - tmpbuf), (long) MAXSTRPOS)));
224
225 /*
226 * Enlarge buffers if needed
227 */
228 if (len >= arrlen)
229 {
230 arrlen *= 2;
231 arr = (WordEntryIN *)
232 repalloc((void *) arr, sizeof(WordEntryIN) * arrlen);
233 }
234 while ((cur - tmpbuf) + toklen >= buflen)
235 {
236 int dist = cur - tmpbuf;
237
238 buflen *= 2;
239 tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
240 cur = tmpbuf + dist;
241 }
242 arr[len].entry.len = toklen;
243 arr[len].entry.pos = cur - tmpbuf;
244 memcpy((void *) cur, (void *) token, toklen);
245 cur += toklen;
246
247 if (poslen != 0)
248 {
249 arr[len].entry.haspos = 1;
250 arr[len].pos = pos;
251 arr[len].poslen = poslen;
252 }
253 else
254 {
255 arr[len].entry.haspos = 0;
256 arr[len].pos = NULL;
257 arr[len].poslen = 0;
258 }
259 len++;
260 }
261
262 close_tsvector_parser(state);
263
264 if (len > 0)
265 len = uniqueentry(arr, len, tmpbuf, &buflen);
266 else
267 buflen = 0;
268
269 if (buflen > MAXSTRPOS)
270 ereport(ERROR,
271 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
272 errmsg("string is too long for tsvector (%d bytes, max %d bytes)", buflen, MAXSTRPOS)));
273
274 totallen = CALCDATASIZE(len, buflen);
275 in = (TSVector) palloc0(totallen);
276 SET_VARSIZE(in, totallen);
277 in->size = len;
278 inarr = ARRPTR(in);
279 strbuf = STRPTR(in);
280 stroff = 0;
281 for (i = 0; i < len; i++)
282 {
283 memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
284 arr[i].entry.pos = stroff;
285 stroff += arr[i].entry.len;
286 if (arr[i].entry.haspos)
287 {
288 if (arr[i].poslen > 0xFFFF)
289 elog(ERROR, "positions array too long");
290
291 /* Copy number of positions */
292 stroff = SHORTALIGN(stroff);
293 *(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;
294 stroff += sizeof(uint16);
295
296 /* Copy positions */
297 memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
298 stroff += arr[i].poslen * sizeof(WordEntryPos);
299
300 pfree(arr[i].pos);
301 }
302 inarr[i] = arr[i].entry;
303 }
304
305 Assert((strbuf + stroff - (char *) in) == totallen);
306
307 PG_RETURN_TSVECTOR(in);
308 }
309
310 Datum
tsvectorout(PG_FUNCTION_ARGS)311 tsvectorout(PG_FUNCTION_ARGS)
312 {
313 TSVector out = PG_GETARG_TSVECTOR(0);
314 char *outbuf;
315 int32 i,
316 lenbuf = 0,
317 pp;
318 WordEntry *ptr = ARRPTR(out);
319 char *curbegin,
320 *curin,
321 *curout;
322
323 lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
324 for (i = 0; i < out->size; i++)
325 {
326 lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
327 if (ptr[i].haspos)
328 lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));
329 }
330
331 curout = outbuf = (char *) palloc(lenbuf);
332 for (i = 0; i < out->size; i++)
333 {
334 curbegin = curin = STRPTR(out) + ptr->pos;
335 if (i != 0)
336 *curout++ = ' ';
337 *curout++ = '\'';
338 while (curin - curbegin < ptr->len)
339 {
340 int len = pg_mblen(curin);
341
342 if (t_iseq(curin, '\''))
343 *curout++ = '\'';
344 else if (t_iseq(curin, '\\'))
345 *curout++ = '\\';
346
347 while (len--)
348 *curout++ = *curin++;
349 }
350
351 *curout++ = '\'';
352 if ((pp = POSDATALEN(out, ptr)) != 0)
353 {
354 WordEntryPos *wptr;
355
356 *curout++ = ':';
357 wptr = POSDATAPTR(out, ptr);
358 while (pp)
359 {
360 curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
361 switch (WEP_GETWEIGHT(*wptr))
362 {
363 case 3:
364 *curout++ = 'A';
365 break;
366 case 2:
367 *curout++ = 'B';
368 break;
369 case 1:
370 *curout++ = 'C';
371 break;
372 case 0:
373 default:
374 break;
375 }
376
377 if (pp > 1)
378 *curout++ = ',';
379 pp--;
380 wptr++;
381 }
382 }
383 ptr++;
384 }
385
386 *curout = '\0';
387 PG_FREE_IF_COPY(out, 0);
388 PG_RETURN_CSTRING(outbuf);
389 }
390
391 /*
392 * Binary Input / Output functions. The binary format is as follows:
393 *
394 * uint32 number of lexemes
395 *
396 * for each lexeme:
397 * lexeme text in client encoding, null-terminated
398 * uint16 number of positions
399 * for each position:
400 * uint16 WordEntryPos
401 */
402
403 Datum
tsvectorsend(PG_FUNCTION_ARGS)404 tsvectorsend(PG_FUNCTION_ARGS)
405 {
406 TSVector vec = PG_GETARG_TSVECTOR(0);
407 StringInfoData buf;
408 int i,
409 j;
410 WordEntry *weptr = ARRPTR(vec);
411
412 pq_begintypsend(&buf);
413
414 pq_sendint32(&buf, vec->size);
415 for (i = 0; i < vec->size; i++)
416 {
417 uint16 npos;
418
419 /*
420 * the strings in the TSVector array are not null-terminated, so we
421 * have to send the null-terminator separately
422 */
423 pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
424 pq_sendbyte(&buf, '\0');
425
426 npos = POSDATALEN(vec, weptr);
427 pq_sendint16(&buf, npos);
428
429 if (npos > 0)
430 {
431 WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
432
433 for (j = 0; j < npos; j++)
434 pq_sendint16(&buf, wepptr[j]);
435 }
436 weptr++;
437 }
438
439 PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
440 }
441
442 Datum
tsvectorrecv(PG_FUNCTION_ARGS)443 tsvectorrecv(PG_FUNCTION_ARGS)
444 {
445 StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
446 TSVector vec;
447 int i;
448 int32 nentries;
449 int datalen; /* number of bytes used in the variable size
450 * area after fixed size TSVector header and
451 * WordEntries */
452 Size hdrlen;
453 Size len; /* allocated size of vec */
454 bool needSort = false;
455
456 nentries = pq_getmsgint(buf, sizeof(int32));
457 if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
458 elog(ERROR, "invalid size of tsvector");
459
460 hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;
461
462 len = hdrlen * 2; /* times two to make room for lexemes */
463 vec = (TSVector) palloc0(len);
464 vec->size = nentries;
465
466 datalen = 0;
467 for (i = 0; i < nentries; i++)
468 {
469 const char *lexeme;
470 uint16 npos;
471 size_t lex_len;
472
473 lexeme = pq_getmsgstring(buf);
474 npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
475
476 /* sanity checks */
477
478 lex_len = strlen(lexeme);
479 if (lex_len > MAXSTRLEN)
480 elog(ERROR, "invalid tsvector: lexeme too long");
481
482 if (datalen > MAXSTRPOS)
483 elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded");
484
485 if (npos > MAXNUMPOS)
486 elog(ERROR, "unexpected number of tsvector positions");
487
488 /*
489 * Looks valid. Fill the WordEntry struct, and copy lexeme.
490 *
491 * But make sure the buffer is large enough first.
492 */
493 while (hdrlen + SHORTALIGN(datalen + lex_len) +
494 (npos + 1) * sizeof(WordEntryPos) >= len)
495 {
496 len *= 2;
497 vec = (TSVector) repalloc(vec, len);
498 }
499
500 vec->entries[i].haspos = (npos > 0) ? 1 : 0;
501 vec->entries[i].len = lex_len;
502 vec->entries[i].pos = datalen;
503
504 memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
505
506 datalen += lex_len;
507
508 if (i > 0 && WordEntryCMP(&vec->entries[i],
509 &vec->entries[i - 1],
510 STRPTR(vec)) <= 0)
511 needSort = true;
512
513 /* Receive positions */
514 if (npos > 0)
515 {
516 uint16 j;
517 WordEntryPos *wepptr;
518
519 /*
520 * Pad to 2-byte alignment if necessary. Though we used palloc0
521 * for the initial allocation, subsequent repalloc'd memory areas
522 * are not initialized to zero.
523 */
524 if (datalen != SHORTALIGN(datalen))
525 {
526 *(STRPTR(vec) + datalen) = '\0';
527 datalen = SHORTALIGN(datalen);
528 }
529
530 memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
531
532 wepptr = POSDATAPTR(vec, &vec->entries[i]);
533 for (j = 0; j < npos; j++)
534 {
535 wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
536 if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
537 elog(ERROR, "position information is misordered");
538 }
539
540 datalen += (npos + 1) * sizeof(WordEntry);
541 }
542 }
543
544 SET_VARSIZE(vec, hdrlen + datalen);
545
546 if (needSort)
547 qsort_arg((void *) ARRPTR(vec), vec->size, sizeof(WordEntry),
548 compareentry, (void *) STRPTR(vec));
549
550 PG_RETURN_TSVECTOR(vec);
551 }
552