1 /*-------------------------------------------------------------------------
2  *
3  * tsvector.c
4  *	  I/O functions for tsvector
5  *
6  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *	  src/backend/utils/adt/tsvector.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include "libpq/pqformat.h"
18 #include "tsearch/ts_locale.h"
19 #include "tsearch/ts_utils.h"
20 #include "utils/memutils.h"
21 
22 typedef struct
23 {
24 	WordEntry	entry;			/* must be first! */
25 	WordEntryPos *pos;
26 	int			poslen;			/* number of elements in pos */
27 } WordEntryIN;
28 
29 
30 /* Compare two WordEntryPos values for qsort */
31 int
compareWordEntryPos(const void * a,const void * b)32 compareWordEntryPos(const void *a, const void *b)
33 {
34 	int			apos = WEP_GETPOS(*(const WordEntryPos *) a);
35 	int			bpos = WEP_GETPOS(*(const WordEntryPos *) b);
36 
37 	if (apos == bpos)
38 		return 0;
39 	return (apos > bpos) ? 1 : -1;
40 }
41 
42 /*
43  * Removes duplicate pos entries. If there's two entries with same pos
44  * but different weight, the higher weight is retained.
45  *
46  * Returns new length.
47  */
48 static int
uniquePos(WordEntryPos * a,int l)49 uniquePos(WordEntryPos *a, int l)
50 {
51 	WordEntryPos *ptr,
52 			   *res;
53 
54 	if (l <= 1)
55 		return l;
56 
57 	qsort((void *) a, l, sizeof(WordEntryPos), compareWordEntryPos);
58 
59 	res = a;
60 	ptr = a + 1;
61 	while (ptr - a < l)
62 	{
63 		if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))
64 		{
65 			res++;
66 			*res = *ptr;
67 			if (res - a >= MAXNUMPOS - 1 ||
68 				WEP_GETPOS(*res) == MAXENTRYPOS - 1)
69 				break;
70 		}
71 		else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
72 			WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr));
73 		ptr++;
74 	}
75 
76 	return res + 1 - a;
77 }
78 
79 /* Compare two WordEntryIN values for qsort */
80 static int
compareentry(const void * va,const void * vb,void * arg)81 compareentry(const void *va, const void *vb, void *arg)
82 {
83 	const WordEntryIN *a = (const WordEntryIN *) va;
84 	const WordEntryIN *b = (const WordEntryIN *) vb;
85 	char	   *BufferStr = (char *) arg;
86 
87 	return tsCompareString(&BufferStr[a->entry.pos], a->entry.len,
88 						   &BufferStr[b->entry.pos], b->entry.len,
89 						   false);
90 }
91 
92 /*
93  * Sort an array of WordEntryIN, remove duplicates.
94  * *outbuflen receives the amount of space needed for strings and positions.
95  */
96 static int
uniqueentry(WordEntryIN * a,int l,char * buf,int * outbuflen)97 uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
98 {
99 	int			buflen;
100 	WordEntryIN *ptr,
101 			   *res;
102 
103 	Assert(l >= 1);
104 
105 	if (l > 1)
106 		qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry,
107 				  (void *) buf);
108 
109 	buflen = 0;
110 	res = a;
111 	ptr = a + 1;
112 	while (ptr - a < l)
113 	{
114 		if (!(ptr->entry.len == res->entry.len &&
115 			  strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos],
116 					  res->entry.len) == 0))
117 		{
118 			/* done accumulating data into *res, count space needed */
119 			buflen += res->entry.len;
120 			if (res->entry.haspos)
121 			{
122 				res->poslen = uniquePos(res->pos, res->poslen);
123 				buflen = SHORTALIGN(buflen);
124 				buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
125 			}
126 			res++;
127 			if (res != ptr)
128 				memcpy(res, ptr, sizeof(WordEntryIN));
129 		}
130 		else if (ptr->entry.haspos)
131 		{
132 			if (res->entry.haspos)
133 			{
134 				/* append ptr's positions to res's positions */
135 				int			newlen = ptr->poslen + res->poslen;
136 
137 				res->pos = (WordEntryPos *)
138 					repalloc(res->pos, newlen * sizeof(WordEntryPos));
139 				memcpy(&res->pos[res->poslen], ptr->pos,
140 					   ptr->poslen * sizeof(WordEntryPos));
141 				res->poslen = newlen;
142 				pfree(ptr->pos);
143 			}
144 			else
145 			{
146 				/* just give ptr's positions to pos */
147 				res->entry.haspos = 1;
148 				res->pos = ptr->pos;
149 				res->poslen = ptr->poslen;
150 			}
151 		}
152 		ptr++;
153 	}
154 
155 	/* count space needed for last item */
156 	buflen += res->entry.len;
157 	if (res->entry.haspos)
158 	{
159 		res->poslen = uniquePos(res->pos, res->poslen);
160 		buflen = SHORTALIGN(buflen);
161 		buflen += res->poslen * sizeof(WordEntryPos) + sizeof(uint16);
162 	}
163 
164 	*outbuflen = buflen;
165 	return res + 1 - a;
166 }
167 
168 static int
WordEntryCMP(WordEntry * a,WordEntry * b,char * buf)169 WordEntryCMP(WordEntry *a, WordEntry *b, char *buf)
170 {
171 	return compareentry(a, b, buf);
172 }
173 
174 
175 Datum
tsvectorin(PG_FUNCTION_ARGS)176 tsvectorin(PG_FUNCTION_ARGS)
177 {
178 	char	   *buf = PG_GETARG_CSTRING(0);
179 	TSVectorParseState state;
180 	WordEntryIN *arr;
181 	int			totallen;
182 	int			arrlen;			/* allocated size of arr */
183 	WordEntry  *inarr;
184 	int			len = 0;
185 	TSVector	in;
186 	int			i;
187 	char	   *token;
188 	int			toklen;
189 	WordEntryPos *pos;
190 	int			poslen;
191 	char	   *strbuf;
192 	int			stroff;
193 
194 	/*
195 	 * Tokens are appended to tmpbuf, cur is a pointer to the end of used
196 	 * space in tmpbuf.
197 	 */
198 	char	   *tmpbuf;
199 	char	   *cur;
200 	int			buflen = 256;	/* allocated size of tmpbuf */
201 
202 	state = init_tsvector_parser(buf, false, false);
203 
204 	arrlen = 64;
205 	arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);
206 	cur = tmpbuf = (char *) palloc(buflen);
207 
208 	while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
209 	{
210 		if (toklen >= MAXSTRLEN)
211 			ereport(ERROR,
212 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
213 					 errmsg("word is too long (%ld bytes, max %ld bytes)",
214 							(long) toklen,
215 							(long) (MAXSTRLEN - 1))));
216 
217 		if (cur - tmpbuf > MAXSTRPOS)
218 			ereport(ERROR,
219 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
220 					 errmsg("string is too long for tsvector (%ld bytes, max %ld bytes)",
221 							(long) (cur - tmpbuf), (long) MAXSTRPOS)));
222 
223 		/*
224 		 * Enlarge buffers if needed
225 		 */
226 		if (len >= arrlen)
227 		{
228 			arrlen *= 2;
229 			arr = (WordEntryIN *)
230 				repalloc((void *) arr, sizeof(WordEntryIN) * arrlen);
231 		}
232 		while ((cur - tmpbuf) + toklen >= buflen)
233 		{
234 			int			dist = cur - tmpbuf;
235 
236 			buflen *= 2;
237 			tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
238 			cur = tmpbuf + dist;
239 		}
240 		arr[len].entry.len = toklen;
241 		arr[len].entry.pos = cur - tmpbuf;
242 		memcpy((void *) cur, (void *) token, toklen);
243 		cur += toklen;
244 
245 		if (poslen != 0)
246 		{
247 			arr[len].entry.haspos = 1;
248 			arr[len].pos = pos;
249 			arr[len].poslen = poslen;
250 		}
251 		else
252 		{
253 			arr[len].entry.haspos = 0;
254 			arr[len].pos = NULL;
255 			arr[len].poslen = 0;
256 		}
257 		len++;
258 	}
259 
260 	close_tsvector_parser(state);
261 
262 	if (len > 0)
263 		len = uniqueentry(arr, len, tmpbuf, &buflen);
264 	else
265 		buflen = 0;
266 
267 	if (buflen > MAXSTRPOS)
268 		ereport(ERROR,
269 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
270 				 errmsg("string is too long for tsvector (%d bytes, max %d bytes)", buflen, MAXSTRPOS)));
271 
272 	totallen = CALCDATASIZE(len, buflen);
273 	in = (TSVector) palloc0(totallen);
274 	SET_VARSIZE(in, totallen);
275 	in->size = len;
276 	inarr = ARRPTR(in);
277 	strbuf = STRPTR(in);
278 	stroff = 0;
279 	for (i = 0; i < len; i++)
280 	{
281 		memcpy(strbuf + stroff, &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
282 		arr[i].entry.pos = stroff;
283 		stroff += arr[i].entry.len;
284 		if (arr[i].entry.haspos)
285 		{
286 			if (arr[i].poslen > 0xFFFF)
287 				elog(ERROR, "positions array too long");
288 
289 			/* Copy number of positions */
290 			stroff = SHORTALIGN(stroff);
291 			*(uint16 *) (strbuf + stroff) = (uint16) arr[i].poslen;
292 			stroff += sizeof(uint16);
293 
294 			/* Copy positions */
295 			memcpy(strbuf + stroff, arr[i].pos, arr[i].poslen * sizeof(WordEntryPos));
296 			stroff += arr[i].poslen * sizeof(WordEntryPos);
297 
298 			pfree(arr[i].pos);
299 		}
300 		inarr[i] = arr[i].entry;
301 	}
302 
303 	Assert((strbuf + stroff - (char *) in) == totallen);
304 
305 	PG_RETURN_TSVECTOR(in);
306 }
307 
308 Datum
tsvectorout(PG_FUNCTION_ARGS)309 tsvectorout(PG_FUNCTION_ARGS)
310 {
311 	TSVector	out = PG_GETARG_TSVECTOR(0);
312 	char	   *outbuf;
313 	int32		i,
314 				lenbuf = 0,
315 				pp;
316 	WordEntry  *ptr = ARRPTR(out);
317 	char	   *curbegin,
318 			   *curin,
319 			   *curout;
320 
321 	lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
322 	for (i = 0; i < out->size; i++)
323 	{
324 		lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
325 		if (ptr[i].haspos)
326 			lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * POSDATALEN(out, &(ptr[i]));
327 	}
328 
329 	curout = outbuf = (char *) palloc(lenbuf);
330 	for (i = 0; i < out->size; i++)
331 	{
332 		curbegin = curin = STRPTR(out) + ptr->pos;
333 		if (i != 0)
334 			*curout++ = ' ';
335 		*curout++ = '\'';
336 		while (curin - curbegin < ptr->len)
337 		{
338 			int			len = pg_mblen(curin);
339 
340 			if (t_iseq(curin, '\''))
341 				*curout++ = '\'';
342 			else if (t_iseq(curin, '\\'))
343 				*curout++ = '\\';
344 
345 			while (len--)
346 				*curout++ = *curin++;
347 		}
348 
349 		*curout++ = '\'';
350 		if ((pp = POSDATALEN(out, ptr)) != 0)
351 		{
352 			WordEntryPos *wptr;
353 
354 			*curout++ = ':';
355 			wptr = POSDATAPTR(out, ptr);
356 			while (pp)
357 			{
358 				curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
359 				switch (WEP_GETWEIGHT(*wptr))
360 				{
361 					case 3:
362 						*curout++ = 'A';
363 						break;
364 					case 2:
365 						*curout++ = 'B';
366 						break;
367 					case 1:
368 						*curout++ = 'C';
369 						break;
370 					case 0:
371 					default:
372 						break;
373 				}
374 
375 				if (pp > 1)
376 					*curout++ = ',';
377 				pp--;
378 				wptr++;
379 			}
380 		}
381 		ptr++;
382 	}
383 
384 	*curout = '\0';
385 	PG_FREE_IF_COPY(out, 0);
386 	PG_RETURN_CSTRING(outbuf);
387 }
388 
389 /*
390  * Binary Input / Output functions. The binary format is as follows:
391  *
392  * uint32	number of lexemes
393  *
394  * for each lexeme:
395  *		lexeme text in client encoding, null-terminated
396  *		uint16	number of positions
397  *		for each position:
398  *			uint16 WordEntryPos
399  */
400 
401 Datum
tsvectorsend(PG_FUNCTION_ARGS)402 tsvectorsend(PG_FUNCTION_ARGS)
403 {
404 	TSVector	vec = PG_GETARG_TSVECTOR(0);
405 	StringInfoData buf;
406 	int			i,
407 				j;
408 	WordEntry  *weptr = ARRPTR(vec);
409 
410 	pq_begintypsend(&buf);
411 
412 	pq_sendint(&buf, vec->size, sizeof(int32));
413 	for (i = 0; i < vec->size; i++)
414 	{
415 		uint16		npos;
416 
417 		/*
418 		 * the strings in the TSVector array are not null-terminated, so we
419 		 * have to send the null-terminator separately
420 		 */
421 		pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
422 		pq_sendbyte(&buf, '\0');
423 
424 		npos = POSDATALEN(vec, weptr);
425 		pq_sendint(&buf, npos, sizeof(uint16));
426 
427 		if (npos > 0)
428 		{
429 			WordEntryPos *wepptr = POSDATAPTR(vec, weptr);
430 
431 			for (j = 0; j < npos; j++)
432 				pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos));
433 		}
434 		weptr++;
435 	}
436 
437 	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
438 }
439 
440 Datum
tsvectorrecv(PG_FUNCTION_ARGS)441 tsvectorrecv(PG_FUNCTION_ARGS)
442 {
443 	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
444 	TSVector	vec;
445 	int			i;
446 	int32		nentries;
447 	int			datalen;		/* number of bytes used in the variable size
448 								 * area after fixed size TSVector header and
449 								 * WordEntries */
450 	Size		hdrlen;
451 	Size		len;			/* allocated size of vec */
452 	bool		needSort = false;
453 
454 	nentries = pq_getmsgint(buf, sizeof(int32));
455 	if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
456 		elog(ERROR, "invalid size of tsvector");
457 
458 	hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;
459 
460 	len = hdrlen * 2;			/* times two to make room for lexemes */
461 	vec = (TSVector) palloc0(len);
462 	vec->size = nentries;
463 
464 	datalen = 0;
465 	for (i = 0; i < nentries; i++)
466 	{
467 		const char *lexeme;
468 		uint16		npos;
469 		size_t		lex_len;
470 
471 		lexeme = pq_getmsgstring(buf);
472 		npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
473 
474 		/* sanity checks */
475 
476 		lex_len = strlen(lexeme);
477 		if (lex_len > MAXSTRLEN)
478 			elog(ERROR, "invalid tsvector: lexeme too long");
479 
480 		if (datalen > MAXSTRPOS)
481 			elog(ERROR, "invalid tsvector: maximum total lexeme length exceeded");
482 
483 		if (npos > MAXNUMPOS)
484 			elog(ERROR, "unexpected number of tsvector positions");
485 
486 		/*
487 		 * Looks valid. Fill the WordEntry struct, and copy lexeme.
488 		 *
489 		 * But make sure the buffer is large enough first.
490 		 */
491 		while (hdrlen + SHORTALIGN(datalen + lex_len) +
492 			   (npos + 1) * sizeof(WordEntryPos) >= len)
493 		{
494 			len *= 2;
495 			vec = (TSVector) repalloc(vec, len);
496 		}
497 
498 		vec->entries[i].haspos = (npos > 0) ? 1 : 0;
499 		vec->entries[i].len = lex_len;
500 		vec->entries[i].pos = datalen;
501 
502 		memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
503 
504 		datalen += lex_len;
505 
506 		if (i > 0 && WordEntryCMP(&vec->entries[i],
507 								  &vec->entries[i - 1],
508 								  STRPTR(vec)) <= 0)
509 			needSort = true;
510 
511 		/* Receive positions */
512 		if (npos > 0)
513 		{
514 			uint16		j;
515 			WordEntryPos *wepptr;
516 
517 			/*
518 			 * Pad to 2-byte alignment if necessary. Though we used palloc0
519 			 * for the initial allocation, subsequent repalloc'd memory areas
520 			 * are not initialized to zero.
521 			 */
522 			if (datalen != SHORTALIGN(datalen))
523 			{
524 				*(STRPTR(vec) + datalen) = '\0';
525 				datalen = SHORTALIGN(datalen);
526 			}
527 
528 			memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
529 
530 			wepptr = POSDATAPTR(vec, &vec->entries[i]);
531 			for (j = 0; j < npos; j++)
532 			{
533 				wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
534 				if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
535 					elog(ERROR, "position information is misordered");
536 			}
537 
538 			datalen += (npos + 1) * sizeof(WordEntry);
539 		}
540 	}
541 
542 	SET_VARSIZE(vec, hdrlen + datalen);
543 
544 	if (needSort)
545 		qsort_arg((void *) ARRPTR(vec), vec->size, sizeof(WordEntry),
546 				  compareentry, (void *) STRPTR(vec));
547 
548 	PG_RETURN_TSVECTOR(vec);
549 }
550