1 /*-------------------------------------------------------------------------
2 *
3 * tsvector.c
4 * I/O functions for tsvector
5 *
6 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/utils/adt/tsvector.c
11 *
12 *-------------------------------------------------------------------------
13 */
14
15 #include "postgres.h"
16
17 #include "libpq/pqformat.h"
18 #include "tsearch/ts_locale.h"
19 #include "tsearch/ts_utils.h"
20 #include "utils/memutils.h"
21
22 typedef struct
23 {
24 WordEntry entry; /* must be first! */
25 WordEntryPos *pos;
26 int poslen; /* number of elements in pos */
27 } WordEntryIN;
28
29
30 /* Compare two WordEntryPos values for qsort */
31 int
compareWordEntryPos(const void * a,const void * b)32 compareWordEntryPos(const void *a, const void *b)
33 {
34 int apos = WEP_GETPOS(*(const WordEntryPos *) a);
35 int bpos = WEP_GETPOS(*(const WordEntryPos *) b);
36
37 if (apos == bpos)
38 return 0;
39 return (apos > bpos) ? 1 : -1;
40 }
41
42 /*
43 * Removes duplicate pos entries. If there's two entries with same pos
44 * but different weight, the higher weight is retained.
45 *
46 * Returns new length.
47 */
48 static int
uniquePos(WordEntryPos * a,int l)49 uniquePos(WordEntryPos *a, int l)
50 {
51 WordEntryPos *ptr,
52 *res;
53
54 if (l <= 1)
55 return l;
56
57 qsort((void *) a, l, sizeof(WordEntryPos), compareWordEntryPos);
58
59 res = a;
60 ptr = a + 1;
61 while (ptr - a < l)
62 {
63 if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))
64 {
65 res++;
66 *res = *ptr;
67 if (res - a >= MAXNUMPOS - 1 ||
68 WEP_GETPOS(*res) == MAXENTRYPOS - 1)
69 break;
70 }
71 else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
72 WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr));
73 ptr++;
74 }
75
76 return res + 1 - a;
77 }
78
79 /* Compare two WordEntryIN values for qsort */
80 static int
compareentry(const void * va,const void * vb,void * arg)81 compareentry(const void *va, const void *vb, void *arg)
82 {
83 const WordEntryIN *a = (const WordEntryIN *) va;
84 const WordEntryIN *b = (const WordEntryIN *) vb;
85 char *BufferStr = (char *) arg;
86
87 return tsCompareString(&BufferStr[a->entry.pos], a->entry.len,
88 &BufferStr[b->entry.pos], b->entry.len,
89 false);
90 }
91
92 /*
93 * Sort an array of WordEntryIN, remove duplicates.
94 * *outbuflen receives the amount of space needed for strings and positions.
95 */
96 static int
uniqueentry(WordEntryIN * a,int l,char * buf,int * outbuflen)97 uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
98 {
99 int buflen;
100 WordEntryIN *ptr,
101 *res;
102
103 Assert(l >= 1);
104
105 if (l > 1)
106 qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry,
107 (void *) buf);
108
109 buflen = 0;
110 res = a;
111 ptr = a + 1;
112 while (ptr - a < l)
113 {
114 if (!(ptr->entry.len == res->entry.len &&
115 strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
116 res->entry.len) == 0))
117 {
118 /* done accumulating data into *res, count space needed */
119 buflen += res->entry.len;
120 if (res->entry.haspos)
121 {
122 res->poslen = uniquePos(res->pos, res->poslen);
123 buflen = SHORTALIGN(buflen);
124 buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
125 }
126 res++;
127 if (res != ptr)
128 memcpy(res, ptr, sizeof(WordEntryIN));
129 }
130 else if (ptr->entry.haspos)
131 {
132 if (res->entry.haspos)
133 {
134 /* append ptr's positions to res's positions */
135 int newlen = ptr->poslen + res->poslen;
136
137 res->pos = (WordEntryPos *)
138 repalloc(res->pos, newlen * sizeof(WordEntryPos));
139 memcpy(&res->pos[res->poslen], ptr->pos,
140 ptr->poslen * sizeof(WordEntryPos));
141 res->poslen = newlen;
142 pfree(ptr->pos);
143 }
144 else
145 {
146 /* just give ptr's positions to pos */
147 res->entry.haspos = 1;
148 res->pos = ptr->pos;
149 res->poslen = ptr->poslen;
150 }
151 }
152 ptr++;
153 }
154
155 /* count space needed for last item */
156 buflen += res->entry.len;
157 if (res->entry.haspos)
158 {
159 res->poslen = uniquePos(res->pos, res->poslen);
160 buflen = SHORTALIGN(buflen);
161 buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
162 }
163
164 *outbuflen = buflen;
165 return res + 1 - a;
166 }
167
168 static int
WordEntryCMP(WordEntry * a,WordEntry * b,char * buf)169 WordEntryCMP(WordEntry *a, WordEntry *b, char *buf)
170 {
171 return compareentry(a, b, buf);
172 }
173
174
175 Datum
tsvectorin(PG_FUNCTION_ARGS)176 tsvectorin(PG_FUNCTION_ARGS)
177 {
178 char *buf = PG_GETARG_CSTRING(0);
179 TSVectorParseState state;
180 WordEntryIN *arr;
181 int totallen;
182 int arrlen; /* allocated size of arr */
183 WordEntry *inarr;
184 int len = 0;
185 TSVector in;
186 int i;
187 char *token;
188 int toklen;
189 WordEntryPos *pos;
190 int poslen;
191 char *strbuf;
192 int stroff;
193
194 /*
195 * Tokens are appended to tmpbuf, cur is a pointer to the end of used
196 * space in tmpbuf.
197 */
198 char *tmpbuf;
199 char *cur;
200 int buflen = 256; /* allocated size of tmpbuf */
201
202 state = init_tsvector_parser(buf, false, false);
203
204 arrlen = 64;
205 arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);
206 cur = tmpbuf = (char *) palloc(buflen);
207
208 while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
209 {
210 if (toklen >= MAXSTRLEN)
211 ereport(ERROR,
212 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
213 errmsg("word is too long (%ld bytes, max %ld bytes)",
214 (long) toklen,
215 (long) (MAXSTRLEN - 1))));
216
217 if (cur - tmpbuf > MAXSTRPOS)
218 ereport(ERROR,
219 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
220 errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)",
221 (long) (cur - tmpbuf), (long) MAXSTRPOS)));
222
223 /*
224 * Enlarge buffers if needed
225 */
226 if (len >= arrlen)
227 {
228 arrlen *= 2;
229 arr = (WordEntryIN *)
230 repalloc((void *) arr, sizeof(WordEntryIN) * arrlen);
231 }
232 while ((cur - tmpbuf) + toklen >= buflen)
233 {
234 int dist = cur - tmpbuf;
235
236 buflen *= 2;
237 tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
238 cur = tmpbuf + dist;
239 }
240 arr[len].entry.len = toklen;
241 arr[len].entry.pos = cur - tmpbuf;
242 memcpy((void *) cur, (void *) token, toklen);
243 cur += toklen;
244
245 if (poslen != 0)
246 {
247 arr[len].entry.haspos = 1;
248 arr[len].pos = pos;
249 arr[len].poslen = poslen;
250 }
251 else
252 {
253 arr[len].entry.haspos = 0;
254 arr[len].pos = NULL;
255 arr[len].poslen = 0;
256 }
257 len++;
258 }
259
260 close_tsvector_parser(state);
261
262 if (len > 0)
263 len = uniqueentry(arr, len, tmpbuf, &buflen);
264 else
265 buflen = 0;
266
267 if (buflen > MAXSTRPOS)
268 ereport(ERROR,
269 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
270 errmsg("string is too long for tsvector (%d bytes, max %d bytes)", buflen, MAXSTRPOS)));
271
272 totallen = CALCDATASIZE(len, buflen);
273 in = (TSVector) palloc0(totallen);
274 SET_VARSIZE(in, totallen);
275 in->size = len;
276 inarr = ARRPTR(in);
277 strbuf = STRPTR(in);
278 stroff = 0;
279 for (i = 0; i < len; i++)
280 {
281 memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
282 arr[i].entry.pos = stroff;
283 stroff += arr[i].entry.len;
284 if (arr[i].entry.haspos)
285 {
286 if (arr[i].poslen > 0xFFFF)
287 elog(ERROR, "positions array too long");
288
289 /* Copy number of positions */
290 stroff = SHORTALIGN(stroff);
291 *(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;
292 stroff += sizeof(uint16);
293
294 /* Copy positions */
295 memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
296 stroff += arr[i].poslen * sizeof(WordEntryPos);
297
298 pfree(arr[i].pos);
299 }
300 inarr[i] = arr[i].entry;
301 }
302
303 Assert((strbuf + stroff - (char *) in) == totallen);
304
305 PG_RETURN_TSVECTOR(in);
306 }
307
308 Datum
tsvectorout(PG_FUNCTION_ARGS)309 tsvectorout(PG_FUNCTION_ARGS)
310 {
311 TSVector out = PG_GETARG_TSVECTOR(0);
312 char *outbuf;
313 int32 i,
314 lenbuf = 0,
315 pp;
316 WordEntry *ptr = ARRPTR(out);
317 char *curbegin,
318 *curin,
319 *curout;
320
321 lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
322 for (i = 0; i < out->size; i++)
323 {
324 lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
325 if (ptr[i].haspos)
326 lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));
327 }
328
329 curout = outbuf = (char *) palloc(lenbuf);
330 for (i = 0; i < out->size; i++)
331 {
332 curbegin = curin = STRPTR(out) + ptr->pos;
333 if (i != 0)
334 *curout++ = ' ';
335 *curout++ = '\'';
336 while (curin - curbegin < ptr->len)
337 {
338 int len = pg_mblen(curin);
339
340 if (t_iseq(curin, '\''))
341 *curout++ = '\'';
342 else if (t_iseq(curin, '\\'))
343 *curout++ = '\\';
344
345 while (len--)
346 *curout++ = *curin++;
347 }
348
349 *curout++ = '\'';
350 if ((pp = POSDATALEN(out, ptr)) != 0)
351 {
352 WordEntryPos *wptr;
353
354 *curout++ = ':';
355 wptr = POSDATAPTR(out, ptr);
356 while (pp)
357 {
358 curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
359 switch (WEP_GETWEIGHT(*wptr))
360 {
361 case 3:
362 *curout++ = 'A';
363 break;
364 case 2:
365 *curout++ = 'B';
366 break;
367 case 1:
368 *curout++ = 'C';
369 break;
370 case 0:
371 default:
372 break;
373 }
374
375 if (pp > 1)
376 *curout++ = ',';
377 pp--;
378 wptr++;
379 }
380 }
381 ptr++;
382 }
383
384 *curout = '\0';
385 PG_FREE_IF_COPY(out, 0);
386 PG_RETURN_CSTRING(outbuf);
387 }
388
389 /*
390 * Binary Input / Output functions. The binary format is as follows:
391 *
392 * uint32 number of lexemes
393 *
394 * for each lexeme:
395 * lexeme text in client encoding, null-terminated
396 * uint16 number of positions
397 * for each position:
398 * uint16 WordEntryPos
399 */
400
401 Datum
tsvectorsend(PG_FUNCTION_ARGS)402 tsvectorsend(PG_FUNCTION_ARGS)
403 {
404 TSVector vec = PG_GETARG_TSVECTOR(0);
405 StringInfoData buf;
406 int i,
407 j;
408 WordEntry *weptr = ARRPTR(vec);
409
410 pq_begintypsend(&buf);
411
412 pq_sendint(&buf, vec->size, sizeof(int32));
413 for (i = 0; i < vec->size; i++)
414 {
415 uint16 npos;
416
417 /*
418 * the strings in the TSVector array are not null-terminated, so we
419 * have to send the null-terminator separately
420 */
421 pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
422 pq_sendbyte(&buf, '\0');
423
424 npos = POSDATALEN(vec, weptr);
425 pq_sendint(&buf, npos, sizeof(uint16));
426
427 if (npos > 0)
428 {
429 WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
430
431 for (j = 0; j < npos; j++)
432 pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos));
433 }
434 weptr++;
435 }
436
437 PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
438 }
439
440 Datum
tsvectorrecv(PG_FUNCTION_ARGS)441 tsvectorrecv(PG_FUNCTION_ARGS)
442 {
443 StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
444 TSVector vec;
445 int i;
446 int32 nentries;
447 int datalen; /* number of bytes used in the variable size
448 * area after fixed size TSVector header and
449 * WordEntries */
450 Size hdrlen;
451 Size len; /* allocated size of vec */
452 bool needSort = false;
453
454 nentries = pq_getmsgint(buf, sizeof(int32));
455 if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
456 elog(ERROR, "invalid size of tsvector");
457
458 hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;
459
460 len = hdrlen * 2; /* times two to make room for lexemes */
461 vec = (TSVector) palloc0(len);
462 vec->size = nentries;
463
464 datalen = 0;
465 for (i = 0; i < nentries; i++)
466 {
467 const char *lexeme;
468 uint16 npos;
469 size_t lex_len;
470
471 lexeme = pq_getmsgstring(buf);
472 npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
473
474 /* sanity checks */
475
476 lex_len = strlen(lexeme);
477 if (lex_len > MAXSTRLEN)
478 elog(ERROR, "invalid tsvector: lexeme too long");
479
480 if (datalen > MAXSTRPOS)
481 elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded");
482
483 if (npos > MAXNUMPOS)
484 elog(ERROR, "unexpected number of tsvector positions");
485
486 /*
487 * Looks valid. Fill the WordEntry struct, and copy lexeme.
488 *
489 * But make sure the buffer is large enough first.
490 */
491 while (hdrlen + SHORTALIGN(datalen + lex_len) +
492 (npos + 1) * sizeof(WordEntryPos) >= len)
493 {
494 len *= 2;
495 vec = (TSVector) repalloc(vec, len);
496 }
497
498 vec->entries[i].haspos = (npos > 0) ? 1 : 0;
499 vec->entries[i].len = lex_len;
500 vec->entries[i].pos = datalen;
501
502 memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
503
504 datalen += lex_len;
505
506 if (i > 0 && WordEntryCMP(&vec->entries[i],
507 &vec->entries[i - 1],
508 STRPTR(vec)) <= 0)
509 needSort = true;
510
511 /* Receive positions */
512 if (npos > 0)
513 {
514 uint16 j;
515 WordEntryPos *wepptr;
516
517 /*
518 * Pad to 2-byte alignment if necessary. Though we used palloc0
519 * for the initial allocation, subsequent repalloc'd memory areas
520 * are not initialized to zero.
521 */
522 if (datalen != SHORTALIGN(datalen))
523 {
524 *(STRPTR(vec) + datalen) = '\0';
525 datalen = SHORTALIGN(datalen);
526 }
527
528 memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
529
530 wepptr = POSDATAPTR(vec, &vec->entries[i]);
531 for (j = 0; j < npos; j++)
532 {
533 wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
534 if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
535 elog(ERROR, "position information is misordered");
536 }
537
538 datalen += (npos + 1) * sizeof(WordEntry);
539 }
540 }
541
542 SET_VARSIZE(vec, hdrlen + datalen);
543
544 if (needSort)
545 qsort_arg((void *) ARRPTR(vec), vec->size, sizeof(WordEntry),
546 compareentry, (void *) STRPTR(vec));
547
548 PG_RETURN_TSVECTOR(vec);
549 }
550