1 /*-------------------------------------------------------------------------
2  *
3  * tsvector.c
4  *	  I/O functions for tsvector
5  *
6  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *	  src/backend/utils/adt/tsvector.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include "libpq/pqformat.h"
18 #include "tsearch/ts_locale.h"
19 #include "tsearch/ts_utils.h"
20 #include "utils/builtins.h"
21 #include "utils/memutils.h"
22 
23 typedef struct
24 {
25 	WordEntry	entry;			/* must be first! */
26 	WordEntryPos *pos;
27 	int			poslen;			/* number of elements in pos */
28 } WordEntryIN;
29 
30 
31 /* Compare two WordEntryPos values for qsort */
32 int
compareWordEntryPos(const void * a,const void * b)33 compareWordEntryPos(const void *a, const void *b)
34 {
35 	int			apos = WEP_GETPOS(*(const WordEntryPos *) a);
36 	int			bpos = WEP_GETPOS(*(const WordEntryPos *) b);
37 
38 	if (apos == bpos)
39 		return 0;
40 	return (apos > bpos) ? 1 : -1;
41 }
42 
43 /*
44  * Removes duplicate pos entries. If there's two entries with same pos but
45  * different weight, the higher weight is retained, so we can't use
46  * qunique here.
47  *
48  * Returns new length.
49  */
50 static int
uniquePos(WordEntryPos * a,int l)51 uniquePos(WordEntryPos *a, int l)
52 {
53 	WordEntryPos *ptr,
54 			   *res;
55 
56 	if (l <= 1)
57 		return l;
58 
59 	qsort((void *) a, l, sizeof(WordEntryPos), compareWordEntryPos);
60 
61 	res = a;
62 	ptr = a + 1;
63 	while (ptr - a < l)
64 	{
65 		if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))
66 		{
67 			res++;
68 			*res = *ptr;
69 			if (res - a >= MAXNUMPOS - 1 ||
70 				WEP_GETPOS(*res) == MAXENTRYPOS - 1)
71 				break;
72 		}
73 		else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
74 			WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr));
75 		ptr++;
76 	}
77 
78 	return res + 1 - a;
79 }
80 
81 /* Compare two WordEntryIN values for qsort */
82 static int
compareentry(const void * va,const void * vb,void * arg)83 compareentry(const void *va, const void *vb, void *arg)
84 {
85 	const WordEntryIN *a = (const WordEntryIN *) va;
86 	const WordEntryIN *b = (const WordEntryIN *) vb;
87 	char	   *BufferStr = (char *) arg;
88 
89 	return tsCompareString(&BufferStr[a->entry.pos], a->entry.len,
90 						   &BufferStr[b->entry.pos], b->entry.len,
91 						   false);
92 }
93 
94 /*
95  * Sort an array of WordEntryIN, remove duplicates.
96  * *outbuflen receives the amount of space needed for strings and positions.
97  */
98 static int
uniqueentry(WordEntryIN * a,int l,char * buf,int * outbuflen)99 uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
100 {
101 	int			buflen;
102 	WordEntryIN *ptr,
103 			   *res;
104 
105 	Assert(l >= 1);
106 
107 	if (l > 1)
108 		qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry,
109 				  (void *) buf);
110 
111 	buflen = 0;
112 	res = a;
113 	ptr = a + 1;
114 	while (ptr - a < l)
115 	{
116 		if (!(ptr->entry.len == res->entry.len &&
117 			  strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
118 					  res->entry.len) == 0))
119 		{
120 			/* done accumulating data into *res, count space needed */
121 			buflen += res->entry.len;
122 			if (res->entry.haspos)
123 			{
124 				res->poslen = uniquePos(res->pos, res->poslen);
125 				buflen = SHORTALIGN(buflen);
126 				buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
127 			}
128 			res++;
129 			if (res != ptr)
130 				memcpy(res, ptr, sizeof(WordEntryIN));
131 		}
132 		else if (ptr->entry.haspos)
133 		{
134 			if (res->entry.haspos)
135 			{
136 				/* append ptr's positions to res's positions */
137 				int			newlen = ptr->poslen + res->poslen;
138 
139 				res->pos = (WordEntryPos *)
140 					repalloc(res->pos, newlen * sizeof(WordEntryPos));
141 				memcpy(&res->pos[res->poslen], ptr->pos,
142 					   ptr->poslen * sizeof(WordEntryPos));
143 				res->poslen = newlen;
144 				pfree(ptr->pos);
145 			}
146 			else
147 			{
148 				/* just give ptr's positions to pos */
149 				res->entry.haspos = 1;
150 				res->pos = ptr->pos;
151 				res->poslen = ptr->poslen;
152 			}
153 		}
154 		ptr++;
155 	}
156 
157 	/* count space needed for last item */
158 	buflen += res->entry.len;
159 	if (res->entry.haspos)
160 	{
161 		res->poslen = uniquePos(res->pos, res->poslen);
162 		buflen = SHORTALIGN(buflen);
163 		buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
164 	}
165 
166 	*outbuflen = buflen;
167 	return res + 1 - a;
168 }
169 
170 static int
WordEntryCMP(WordEntry * a,WordEntry * b,char * buf)171 WordEntryCMP(WordEntry *a, WordEntry *b, char *buf)
172 {
173 	return compareentry(a, b, buf);
174 }
175 
176 
177 Datum
tsvectorin(PG_FUNCTION_ARGS)178 tsvectorin(PG_FUNCTION_ARGS)
179 {
180 	char	   *buf = PG_GETARG_CSTRING(0);
181 	TSVectorParseState state;
182 	WordEntryIN *arr;
183 	int			totallen;
184 	int			arrlen;			/* allocated size of arr */
185 	WordEntry  *inarr;
186 	int			len = 0;
187 	TSVector	in;
188 	int			i;
189 	char	   *token;
190 	int			toklen;
191 	WordEntryPos *pos;
192 	int			poslen;
193 	char	   *strbuf;
194 	int			stroff;
195 
196 	/*
197 	 * Tokens are appended to tmpbuf, cur is a pointer to the end of used
198 	 * space in tmpbuf.
199 	 */
200 	char	   *tmpbuf;
201 	char	   *cur;
202 	int			buflen = 256;	/* allocated size of tmpbuf */
203 
204 	state = init_tsvector_parser(buf, 0);
205 
206 	arrlen = 64;
207 	arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);
208 	cur = tmpbuf = (char *) palloc(buflen);
209 
210 	while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
211 	{
212 		if (toklen >= MAXSTRLEN)
213 			ereport(ERROR,
214 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
215 					 errmsg("word is too long (%ld bytes, max %ld bytes)",
216 							(long) toklen,
217 							(long) (MAXSTRLEN - 1))));
218 
219 		if (cur - tmpbuf > MAXSTRPOS)
220 			ereport(ERROR,
221 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
222 					 errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)",
223 							(long) (cur - tmpbuf), (long) MAXSTRPOS)));
224 
225 		/*
226 		 * Enlarge buffers if needed
227 		 */
228 		if (len >= arrlen)
229 		{
230 			arrlen *= 2;
231 			arr = (WordEntryIN *)
232 				repalloc((void *) arr, sizeof(WordEntryIN) * arrlen);
233 		}
234 		while ((cur - tmpbuf) + toklen >= buflen)
235 		{
236 			int			dist = cur - tmpbuf;
237 
238 			buflen *= 2;
239 			tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
240 			cur = tmpbuf + dist;
241 		}
242 		arr[len].entry.len = toklen;
243 		arr[len].entry.pos = cur - tmpbuf;
244 		memcpy((void *) cur, (void *) token, toklen);
245 		cur += toklen;
246 
247 		if (poslen != 0)
248 		{
249 			arr[len].entry.haspos = 1;
250 			arr[len].pos = pos;
251 			arr[len].poslen = poslen;
252 		}
253 		else
254 		{
255 			arr[len].entry.haspos = 0;
256 			arr[len].pos = NULL;
257 			arr[len].poslen = 0;
258 		}
259 		len++;
260 	}
261 
262 	close_tsvector_parser(state);
263 
264 	if (len > 0)
265 		len = uniqueentry(arr, len, tmpbuf, &buflen);
266 	else
267 		buflen = 0;
268 
269 	if (buflen > MAXSTRPOS)
270 		ereport(ERROR,
271 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
272 				 errmsg("string is too long for tsvector (%d bytes, max %d bytes)", buflen, MAXSTRPOS)));
273 
274 	totallen = CALCDATASIZE(len, buflen);
275 	in = (TSVector) palloc0(totallen);
276 	SET_VARSIZE(in, totallen);
277 	in->size = len;
278 	inarr = ARRPTR(in);
279 	strbuf = STRPTR(in);
280 	stroff = 0;
281 	for (i = 0; i < len; i++)
282 	{
283 		memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
284 		arr[i].entry.pos = stroff;
285 		stroff += arr[i].entry.len;
286 		if (arr[i].entry.haspos)
287 		{
288 			if (arr[i].poslen > 0xFFFF)
289 				elog(ERROR, "positions array too long");
290 
291 			/* Copy number of positions */
292 			stroff = SHORTALIGN(stroff);
293 			*(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;
294 			stroff += sizeof(uint16);
295 
296 			/* Copy positions */
297 			memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
298 			stroff += arr[i].poslen * sizeof(WordEntryPos);
299 
300 			pfree(arr[i].pos);
301 		}
302 		inarr[i] = arr[i].entry;
303 	}
304 
305 	Assert((strbuf + stroff - (char *) in) == totallen);
306 
307 	PG_RETURN_TSVECTOR(in);
308 }
309 
310 Datum
tsvectorout(PG_FUNCTION_ARGS)311 tsvectorout(PG_FUNCTION_ARGS)
312 {
313 	TSVector	out = PG_GETARG_TSVECTOR(0);
314 	char	   *outbuf;
315 	int32		i,
316 				lenbuf = 0,
317 				pp;
318 	WordEntry  *ptr = ARRPTR(out);
319 	char	   *curbegin,
320 			   *curin,
321 			   *curout;
322 
323 	lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
324 	for (i = 0; i < out->size; i++)
325 	{
326 		lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
327 		if (ptr[i].haspos)
328 			lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));
329 	}
330 
331 	curout = outbuf = (char *) palloc(lenbuf);
332 	for (i = 0; i < out->size; i++)
333 	{
334 		curbegin = curin = STRPTR(out) + ptr->pos;
335 		if (i != 0)
336 			*curout++ = ' ';
337 		*curout++ = '\'';
338 		while (curin - curbegin < ptr->len)
339 		{
340 			int			len = pg_mblen(curin);
341 
342 			if (t_iseq(curin, '\''))
343 				*curout++ = '\'';
344 			else if (t_iseq(curin, '\\'))
345 				*curout++ = '\\';
346 
347 			while (len--)
348 				*curout++ = *curin++;
349 		}
350 
351 		*curout++ = '\'';
352 		if ((pp = POSDATALEN(out, ptr)) != 0)
353 		{
354 			WordEntryPos *wptr;
355 
356 			*curout++ = ':';
357 			wptr = POSDATAPTR(out, ptr);
358 			while (pp)
359 			{
360 				curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
361 				switch (WEP_GETWEIGHT(*wptr))
362 				{
363 					case 3:
364 						*curout++ = 'A';
365 						break;
366 					case 2:
367 						*curout++ = 'B';
368 						break;
369 					case 1:
370 						*curout++ = 'C';
371 						break;
372 					case 0:
373 					default:
374 						break;
375 				}
376 
377 				if (pp > 1)
378 					*curout++ = ',';
379 				pp--;
380 				wptr++;
381 			}
382 		}
383 		ptr++;
384 	}
385 
386 	*curout = '\0';
387 	PG_FREE_IF_COPY(out, 0);
388 	PG_RETURN_CSTRING(outbuf);
389 }
390 
391 /*
392  * Binary Input / Output functions. The binary format is as follows:
393  *
394  * uint32	number of lexemes
395  *
396  * for each lexeme:
397  *		lexeme text in client encoding, null-terminated
398  *		uint16	number of positions
399  *		for each position:
400  *			uint16 WordEntryPos
401  */
402 
403 Datum
tsvectorsend(PG_FUNCTION_ARGS)404 tsvectorsend(PG_FUNCTION_ARGS)
405 {
406 	TSVector	vec = PG_GETARG_TSVECTOR(0);
407 	StringInfoData buf;
408 	int			i,
409 				j;
410 	WordEntry  *weptr = ARRPTR(vec);
411 
412 	pq_begintypsend(&buf);
413 
414 	pq_sendint32(&buf, vec->size);
415 	for (i = 0; i < vec->size; i++)
416 	{
417 		uint16		npos;
418 
419 		/*
420 		 * the strings in the TSVector array are not null-terminated, so we
421 		 * have to send the null-terminator separately
422 		 */
423 		pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
424 		pq_sendbyte(&buf, '\0');
425 
426 		npos = POSDATALEN(vec, weptr);
427 		pq_sendint16(&buf, npos);
428 
429 		if (npos > 0)
430 		{
431 			WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
432 
433 			for (j = 0; j < npos; j++)
434 				pq_sendint16(&buf, wepptr[j]);
435 		}
436 		weptr++;
437 	}
438 
439 	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
440 }
441 
442 Datum
tsvectorrecv(PG_FUNCTION_ARGS)443 tsvectorrecv(PG_FUNCTION_ARGS)
444 {
445 	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
446 	TSVector	vec;
447 	int			i;
448 	int32		nentries;
449 	int			datalen;		/* number of bytes used in the variable size
450 								 * area after fixed size TSVector header and
451 								 * WordEntries */
452 	Size		hdrlen;
453 	Size		len;			/* allocated size of vec */
454 	bool		needSort = false;
455 
456 	nentries = pq_getmsgint(buf, sizeof(int32));
457 	if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
458 		elog(ERROR, "invalid size of tsvector");
459 
460 	hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;
461 
462 	len = hdrlen * 2;			/* times two to make room for lexemes */
463 	vec = (TSVector) palloc0(len);
464 	vec->size = nentries;
465 
466 	datalen = 0;
467 	for (i = 0; i < nentries; i++)
468 	{
469 		const char *lexeme;
470 		uint16		npos;
471 		size_t		lex_len;
472 
473 		lexeme = pq_getmsgstring(buf);
474 		npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
475 
476 		/* sanity checks */
477 
478 		lex_len = strlen(lexeme);
479 		if (lex_len > MAXSTRLEN)
480 			elog(ERROR, "invalid tsvector: lexeme too long");
481 
482 		if (datalen > MAXSTRPOS)
483 			elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded");
484 
485 		if (npos > MAXNUMPOS)
486 			elog(ERROR, "unexpected number of tsvector positions");
487 
488 		/*
489 		 * Looks valid. Fill the WordEntry struct, and copy lexeme.
490 		 *
491 		 * But make sure the buffer is large enough first.
492 		 */
493 		while (hdrlen + SHORTALIGN(datalen + lex_len) +
494 			   (npos + 1) * sizeof(WordEntryPos) >= len)
495 		{
496 			len *= 2;
497 			vec = (TSVector) repalloc(vec, len);
498 		}
499 
500 		vec->entries[i].haspos = (npos > 0) ? 1 : 0;
501 		vec->entries[i].len = lex_len;
502 		vec->entries[i].pos = datalen;
503 
504 		memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
505 
506 		datalen += lex_len;
507 
508 		if (i > 0 && WordEntryCMP(&vec->entries[i],
509 								  &vec->entries[i - 1],
510 								  STRPTR(vec)) <= 0)
511 			needSort = true;
512 
513 		/* Receive positions */
514 		if (npos > 0)
515 		{
516 			uint16		j;
517 			WordEntryPos *wepptr;
518 
519 			/*
520 			 * Pad to 2-byte alignment if necessary. Though we used palloc0
521 			 * for the initial allocation, subsequent repalloc'd memory areas
522 			 * are not initialized to zero.
523 			 */
524 			if (datalen != SHORTALIGN(datalen))
525 			{
526 				*(STRPTR(vec) + datalen) = '\0';
527 				datalen = SHORTALIGN(datalen);
528 			}
529 
530 			memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
531 
532 			wepptr = POSDATAPTR(vec, &vec->entries[i]);
533 			for (j = 0; j < npos; j++)
534 			{
535 				wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
536 				if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
537 					elog(ERROR, "position information is misordered");
538 			}
539 
540 			datalen += (npos + 1) * sizeof(WordEntry);
541 		}
542 	}
543 
544 	SET_VARSIZE(vec, hdrlen + datalen);
545 
546 	if (needSort)
547 		qsort_arg((void *) ARRPTR(vec), vec->size, sizeof(WordEntry),
548 				  compareentry, (void *) STRPTR(vec));
549 
550 	PG_RETURN_TSVECTOR(vec);
551 }
552