1 /*-------------------------------------------------------------------------
2  *
3  * tsvector.c
4  *	  I/O functions for tsvector
5  *
6  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *	  src/backend/utils/adt/tsvector.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include "libpq/pqformat.h"
18 #include "tsearch/ts_locale.h"
19 #include "tsearch/ts_utils.h"
20 #include "utils/builtins.h"
21 #include "utils/memutils.h"
22 
23 typedef struct
24 {
25 	WordEntry	entry;			/* must be first! */
26 	WordEntryPos *pos;
27 	int			poslen;			/* number of elements in pos */
28 } WordEntryIN;
29 
30 
31 /* Compare two WordEntryPos values for qsort */
32 int
compareWordEntryPos(const void * a,const void * b)33 compareWordEntryPos(const void *a, const void *b)
34 {
35 	int			apos = WEP_GETPOS(*(const WordEntryPos *) a);
36 	int			bpos = WEP_GETPOS(*(const WordEntryPos *) b);
37 
38 	if (apos == bpos)
39 		return 0;
40 	return (apos > bpos) ? 1 : -1;
41 }
42 
43 /*
44  * Removes duplicate pos entries. If there's two entries with same pos
45  * but different weight, the higher weight is retained.
46  *
47  * Returns new length.
48  */
49 static int
uniquePos(WordEntryPos * a,int l)50 uniquePos(WordEntryPos *a, int l)
51 {
52 	WordEntryPos *ptr,
53 			   *res;
54 
55 	if (l <= 1)
56 		return l;
57 
58 	qsort((void *) a, l, sizeof(WordEntryPos), compareWordEntryPos);
59 
60 	res = a;
61 	ptr = a + 1;
62 	while (ptr - a < l)
63 	{
64 		if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))
65 		{
66 			res++;
67 			*res = *ptr;
68 			if (res - a >= MAXNUMPOS - 1 ||
69 				WEP_GETPOS(*res) == MAXENTRYPOS - 1)
70 				break;
71 		}
72 		else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
73 			WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr));
74 		ptr++;
75 	}
76 
77 	return res + 1 - a;
78 }
79 
80 /* Compare two WordEntryIN values for qsort */
81 static int
compareentry(const void * va,const void * vb,void * arg)82 compareentry(const void *va, const void *vb, void *arg)
83 {
84 	const WordEntryIN *a = (const WordEntryIN *) va;
85 	const WordEntryIN *b = (const WordEntryIN *) vb;
86 	char	   *BufferStr = (char *) arg;
87 
88 	return tsCompareString(&BufferStr[a->entry.pos], a->entry.len,
89 						   &BufferStr[b->entry.pos], b->entry.len,
90 						   false);
91 }
92 
93 /*
94  * Sort an array of WordEntryIN, remove duplicates.
95  * *outbuflen receives the amount of space needed for strings and positions.
96  */
97 static int
uniqueentry(WordEntryIN * a,int l,char * buf,int * outbuflen)98 uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
99 {
100 	int			buflen;
101 	WordEntryIN *ptr,
102 			   *res;
103 
104 	Assert(l >= 1);
105 
106 	if (l > 1)
107 		qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry,
108 				  (void *) buf);
109 
110 	buflen = 0;
111 	res = a;
112 	ptr = a + 1;
113 	while (ptr - a < l)
114 	{
115 		if (!(ptr->entry.len == res->entry.len &&
116 			  strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
117 					  res->entry.len) == 0))
118 		{
119 			/* done accumulating data into *res, count space needed */
120 			buflen += res->entry.len;
121 			if (res->entry.haspos)
122 			{
123 				res->poslen = uniquePos(res->pos, res->poslen);
124 				buflen = SHORTALIGN(buflen);
125 				buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
126 			}
127 			res++;
128 			if (res != ptr)
129 				memcpy(res, ptr, sizeof(WordEntryIN));
130 		}
131 		else if (ptr->entry.haspos)
132 		{
133 			if (res->entry.haspos)
134 			{
135 				/* append ptr's positions to res's positions */
136 				int			newlen = ptr->poslen + res->poslen;
137 
138 				res->pos = (WordEntryPos *)
139 					repalloc(res->pos, newlen * sizeof(WordEntryPos));
140 				memcpy(&res->pos[res->poslen], ptr->pos,
141 					   ptr->poslen * sizeof(WordEntryPos));
142 				res->poslen = newlen;
143 				pfree(ptr->pos);
144 			}
145 			else
146 			{
147 				/* just give ptr's positions to pos */
148 				res->entry.haspos = 1;
149 				res->pos = ptr->pos;
150 				res->poslen = ptr->poslen;
151 			}
152 		}
153 		ptr++;
154 	}
155 
156 	/* count space needed for last item */
157 	buflen += res->entry.len;
158 	if (res->entry.haspos)
159 	{
160 		res->poslen = uniquePos(res->pos, res->poslen);
161 		buflen = SHORTALIGN(buflen);
162 		buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
163 	}
164 
165 	*outbuflen = buflen;
166 	return res + 1 - a;
167 }
168 
169 static int
WordEntryCMP(WordEntry * a,WordEntry * b,char * buf)170 WordEntryCMP(WordEntry *a, WordEntry *b, char *buf)
171 {
172 	return compareentry(a, b, buf);
173 }
174 
175 
176 Datum
tsvectorin(PG_FUNCTION_ARGS)177 tsvectorin(PG_FUNCTION_ARGS)
178 {
179 	char	   *buf = PG_GETARG_CSTRING(0);
180 	TSVectorParseState state;
181 	WordEntryIN *arr;
182 	int			totallen;
183 	int			arrlen;			/* allocated size of arr */
184 	WordEntry  *inarr;
185 	int			len = 0;
186 	TSVector	in;
187 	int			i;
188 	char	   *token;
189 	int			toklen;
190 	WordEntryPos *pos;
191 	int			poslen;
192 	char	   *strbuf;
193 	int			stroff;
194 
195 	/*
196 	 * Tokens are appended to tmpbuf, cur is a pointer to the end of used
197 	 * space in tmpbuf.
198 	 */
199 	char	   *tmpbuf;
200 	char	   *cur;
201 	int			buflen = 256;	/* allocated size of tmpbuf */
202 
203 	state = init_tsvector_parser(buf, 0);
204 
205 	arrlen = 64;
206 	arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);
207 	cur = tmpbuf = (char *) palloc(buflen);
208 
209 	while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
210 	{
211 		if (toklen >= MAXSTRLEN)
212 			ereport(ERROR,
213 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
214 					 errmsg("word is too long (%ld bytes, max %ld bytes)",
215 							(long) toklen,
216 							(long) (MAXSTRLEN - 1))));
217 
218 		if (cur - tmpbuf > MAXSTRPOS)
219 			ereport(ERROR,
220 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
221 					 errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)",
222 							(long) (cur - tmpbuf), (long) MAXSTRPOS)));
223 
224 		/*
225 		 * Enlarge buffers if needed
226 		 */
227 		if (len >= arrlen)
228 		{
229 			arrlen *= 2;
230 			arr = (WordEntryIN *)
231 				repalloc((void *) arr, sizeof(WordEntryIN) * arrlen);
232 		}
233 		while ((cur - tmpbuf) + toklen >= buflen)
234 		{
235 			int			dist = cur - tmpbuf;
236 
237 			buflen *= 2;
238 			tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
239 			cur = tmpbuf + dist;
240 		}
241 		arr[len].entry.len = toklen;
242 		arr[len].entry.pos = cur - tmpbuf;
243 		memcpy((void *) cur, (void *) token, toklen);
244 		cur += toklen;
245 
246 		if (poslen != 0)
247 		{
248 			arr[len].entry.haspos = 1;
249 			arr[len].pos = pos;
250 			arr[len].poslen = poslen;
251 		}
252 		else
253 		{
254 			arr[len].entry.haspos = 0;
255 			arr[len].pos = NULL;
256 			arr[len].poslen = 0;
257 		}
258 		len++;
259 	}
260 
261 	close_tsvector_parser(state);
262 
263 	if (len > 0)
264 		len = uniqueentry(arr, len, tmpbuf, &buflen);
265 	else
266 		buflen = 0;
267 
268 	if (buflen > MAXSTRPOS)
269 		ereport(ERROR,
270 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
271 				 errmsg("string is too long for tsvector (%d bytes, max %d bytes)", buflen, MAXSTRPOS)));
272 
273 	totallen = CALCDATASIZE(len, buflen);
274 	in = (TSVector) palloc0(totallen);
275 	SET_VARSIZE(in, totallen);
276 	in->size = len;
277 	inarr = ARRPTR(in);
278 	strbuf = STRPTR(in);
279 	stroff = 0;
280 	for (i = 0; i < len; i++)
281 	{
282 		memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
283 		arr[i].entry.pos = stroff;
284 		stroff += arr[i].entry.len;
285 		if (arr[i].entry.haspos)
286 		{
287 			if (arr[i].poslen > 0xFFFF)
288 				elog(ERROR, "positions array too long");
289 
290 			/* Copy number of positions */
291 			stroff = SHORTALIGN(stroff);
292 			*(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;
293 			stroff += sizeof(uint16);
294 
295 			/* Copy positions */
296 			memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
297 			stroff += arr[i].poslen * sizeof(WordEntryPos);
298 
299 			pfree(arr[i].pos);
300 		}
301 		inarr[i] = arr[i].entry;
302 	}
303 
304 	Assert((strbuf + stroff - (char *) in) == totallen);
305 
306 	PG_RETURN_TSVECTOR(in);
307 }
308 
309 Datum
tsvectorout(PG_FUNCTION_ARGS)310 tsvectorout(PG_FUNCTION_ARGS)
311 {
312 	TSVector	out = PG_GETARG_TSVECTOR(0);
313 	char	   *outbuf;
314 	int32		i,
315 				lenbuf = 0,
316 				pp;
317 	WordEntry  *ptr = ARRPTR(out);
318 	char	   *curbegin,
319 			   *curin,
320 			   *curout;
321 
322 	lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
323 	for (i = 0; i < out->size; i++)
324 	{
325 		lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
326 		if (ptr[i].haspos)
327 			lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));
328 	}
329 
330 	curout = outbuf = (char *) palloc(lenbuf);
331 	for (i = 0; i < out->size; i++)
332 	{
333 		curbegin = curin = STRPTR(out) + ptr->pos;
334 		if (i != 0)
335 			*curout++ = ' ';
336 		*curout++ = '\'';
337 		while (curin - curbegin < ptr->len)
338 		{
339 			int			len = pg_mblen(curin);
340 
341 			if (t_iseq(curin, '\''))
342 				*curout++ = '\'';
343 			else if (t_iseq(curin, '\\'))
344 				*curout++ = '\\';
345 
346 			while (len--)
347 				*curout++ = *curin++;
348 		}
349 
350 		*curout++ = '\'';
351 		if ((pp = POSDATALEN(out, ptr)) != 0)
352 		{
353 			WordEntryPos *wptr;
354 
355 			*curout++ = ':';
356 			wptr = POSDATAPTR(out, ptr);
357 			while (pp)
358 			{
359 				curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
360 				switch (WEP_GETWEIGHT(*wptr))
361 				{
362 					case 3:
363 						*curout++ = 'A';
364 						break;
365 					case 2:
366 						*curout++ = 'B';
367 						break;
368 					case 1:
369 						*curout++ = 'C';
370 						break;
371 					case 0:
372 					default:
373 						break;
374 				}
375 
376 				if (pp > 1)
377 					*curout++ = ',';
378 				pp--;
379 				wptr++;
380 			}
381 		}
382 		ptr++;
383 	}
384 
385 	*curout = '\0';
386 	PG_FREE_IF_COPY(out, 0);
387 	PG_RETURN_CSTRING(outbuf);
388 }
389 
390 /*
391  * Binary Input / Output functions. The binary format is as follows:
392  *
393  * uint32	number of lexemes
394  *
395  * for each lexeme:
396  *		lexeme text in client encoding, null-terminated
397  *		uint16	number of positions
398  *		for each position:
399  *			uint16 WordEntryPos
400  */
401 
402 Datum
tsvectorsend(PG_FUNCTION_ARGS)403 tsvectorsend(PG_FUNCTION_ARGS)
404 {
405 	TSVector	vec = PG_GETARG_TSVECTOR(0);
406 	StringInfoData buf;
407 	int			i,
408 				j;
409 	WordEntry  *weptr = ARRPTR(vec);
410 
411 	pq_begintypsend(&buf);
412 
413 	pq_sendint32(&buf, vec->size);
414 	for (i = 0; i < vec->size; i++)
415 	{
416 		uint16		npos;
417 
418 		/*
419 		 * the strings in the TSVector array are not null-terminated, so we
420 		 * have to send the null-terminator separately
421 		 */
422 		pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
423 		pq_sendbyte(&buf, '\0');
424 
425 		npos = POSDATALEN(vec, weptr);
426 		pq_sendint16(&buf, npos);
427 
428 		if (npos > 0)
429 		{
430 			WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
431 
432 			for (j = 0; j < npos; j++)
433 				pq_sendint16(&buf, wepptr[j]);
434 		}
435 		weptr++;
436 	}
437 
438 	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
439 }
440 
441 Datum
tsvectorrecv(PG_FUNCTION_ARGS)442 tsvectorrecv(PG_FUNCTION_ARGS)
443 {
444 	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
445 	TSVector	vec;
446 	int			i;
447 	int32		nentries;
448 	int			datalen;		/* number of bytes used in the variable size
449 								 * area after fixed size TSVector header and
450 								 * WordEntries */
451 	Size		hdrlen;
452 	Size		len;			/* allocated size of vec */
453 	bool		needSort = false;
454 
455 	nentries = pq_getmsgint(buf, sizeof(int32));
456 	if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
457 		elog(ERROR, "invalid size of tsvector");
458 
459 	hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;
460 
461 	len = hdrlen * 2;			/* times two to make room for lexemes */
462 	vec = (TSVector) palloc0(len);
463 	vec->size = nentries;
464 
465 	datalen = 0;
466 	for (i = 0; i < nentries; i++)
467 	{
468 		const char *lexeme;
469 		uint16		npos;
470 		size_t		lex_len;
471 
472 		lexeme = pq_getmsgstring(buf);
473 		npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
474 
475 		/* sanity checks */
476 
477 		lex_len = strlen(lexeme);
478 		if (lex_len > MAXSTRLEN)
479 			elog(ERROR, "invalid tsvector: lexeme too long");
480 
481 		if (datalen > MAXSTRPOS)
482 			elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded");
483 
484 		if (npos > MAXNUMPOS)
485 			elog(ERROR, "unexpected number of tsvector positions");
486 
487 		/*
488 		 * Looks valid. Fill the WordEntry struct, and copy lexeme.
489 		 *
490 		 * But make sure the buffer is large enough first.
491 		 */
492 		while (hdrlen + SHORTALIGN(datalen + lex_len) +
493 			   (npos + 1) * sizeof(WordEntryPos) >= len)
494 		{
495 			len *= 2;
496 			vec = (TSVector) repalloc(vec, len);
497 		}
498 
499 		vec->entries[i].haspos = (npos > 0) ? 1 : 0;
500 		vec->entries[i].len = lex_len;
501 		vec->entries[i].pos = datalen;
502 
503 		memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
504 
505 		datalen += lex_len;
506 
507 		if (i > 0 && WordEntryCMP(&vec->entries[i],
508 								  &vec->entries[i - 1],
509 								  STRPTR(vec)) <= 0)
510 			needSort = true;
511 
512 		/* Receive positions */
513 		if (npos > 0)
514 		{
515 			uint16		j;
516 			WordEntryPos *wepptr;
517 
518 			/*
519 			 * Pad to 2-byte alignment if necessary. Though we used palloc0
520 			 * for the initial allocation, subsequent repalloc'd memory areas
521 			 * are not initialized to zero.
522 			 */
523 			if (datalen != SHORTALIGN(datalen))
524 			{
525 				*(STRPTR(vec) + datalen) = '\0';
526 				datalen = SHORTALIGN(datalen);
527 			}
528 
529 			memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
530 
531 			wepptr = POSDATAPTR(vec, &vec->entries[i]);
532 			for (j = 0; j < npos; j++)
533 			{
534 				wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
535 				if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
536 					elog(ERROR, "position information is misordered");
537 			}
538 
539 			datalen += (npos + 1) * sizeof(WordEntry);
540 		}
541 	}
542 
543 	SET_VARSIZE(vec, hdrlen + datalen);
544 
545 	if (needSort)
546 		qsort_arg((void *) ARRPTR(vec), vec->size, sizeof(WordEntry),
547 				  compareentry, (void *) STRPTR(vec));
548 
549 	PG_RETURN_TSVECTOR(vec);
550 }
551