1 /*-------------------------------------------------------------------------
2  *
3  * to_tsany.c
4  *		to_ts* function definitions
5  *
6  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *	  src/backend/tsearch/to_tsany.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 #include "postgres.h"
15 
16 #include "tsearch/ts_cache.h"
17 #include "tsearch/ts_utils.h"
18 #include "utils/builtins.h"
19 
20 
/*
 * State handed to pushval_morph() through its opaque Datum argument.
 */
typedef struct MorphOpaque
{
	Oid			cfg_id;			/* configuration passed on to parsetext() */
	int			qoperator;		/* query operator pushed between successive
								 * word positions */
} MorphOpaque;
26 
27 
28 Datum
get_current_ts_config(PG_FUNCTION_ARGS)29 get_current_ts_config(PG_FUNCTION_ARGS)
30 {
31 	PG_RETURN_OID(getTSCurrentConfig(true));
32 }
33 
/*
 * to_tsvector: word sorting/deduplication helpers, make_tsvector() and the
 * SQL-callable entry points
 */
37 static int
compareWORD(const void * a,const void * b)38 compareWORD(const void *a, const void *b)
39 {
40 	int			res;
41 
42 	res = tsCompareString(
43 			   ((const ParsedWord *) a)->word, ((const ParsedWord *) a)->len,
44 			   ((const ParsedWord *) b)->word, ((const ParsedWord *) b)->len,
45 						  false);
46 
47 	if (res == 0)
48 	{
49 		if (((const ParsedWord *) a)->pos.pos == ((const ParsedWord *) b)->pos.pos)
50 			return 0;
51 
52 		res = (((const ParsedWord *) a)->pos.pos > ((const ParsedWord *) b)->pos.pos) ? 1 : -1;
53 	}
54 
55 	return res;
56 }
57 
58 static int
uniqueWORD(ParsedWord * a,int32 l)59 uniqueWORD(ParsedWord *a, int32 l)
60 {
61 	ParsedWord *ptr,
62 			   *res;
63 	int			tmppos;
64 
65 	if (l == 1)
66 	{
67 		tmppos = LIMITPOS(a->pos.pos);
68 		a->alen = 2;
69 		a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen);
70 		a->pos.apos[0] = 1;
71 		a->pos.apos[1] = tmppos;
72 		return l;
73 	}
74 
75 	res = a;
76 	ptr = a + 1;
77 
78 	/*
79 	 * Sort words with its positions
80 	 */
81 	qsort((void *) a, l, sizeof(ParsedWord), compareWORD);
82 
83 	/*
84 	 * Initialize first word and its first position
85 	 */
86 	tmppos = LIMITPOS(a->pos.pos);
87 	a->alen = 2;
88 	a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen);
89 	a->pos.apos[0] = 1;
90 	a->pos.apos[1] = tmppos;
91 
92 	/*
93 	 * Summarize position information for each word
94 	 */
95 	while (ptr - a < l)
96 	{
97 		if (!(ptr->len == res->len &&
98 			  strncmp(ptr->word, res->word, res->len) == 0))
99 		{
100 			/*
101 			 * Got a new word, so put it in result
102 			 */
103 			res++;
104 			res->len = ptr->len;
105 			res->word = ptr->word;
106 			tmppos = LIMITPOS(ptr->pos.pos);
107 			res->alen = 2;
108 			res->pos.apos = (uint16 *) palloc(sizeof(uint16) * res->alen);
109 			res->pos.apos[0] = 1;
110 			res->pos.apos[1] = tmppos;
111 		}
112 		else
113 		{
114 			/*
115 			 * The word already exists, so adjust position information. But
116 			 * before we should check size of position's array, max allowed
117 			 * value for position and uniqueness of position
118 			 */
119 			pfree(ptr->word);
120 			if (res->pos.apos[0] < MAXNUMPOS - 1 && res->pos.apos[res->pos.apos[0]] != MAXENTRYPOS - 1 &&
121 				res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos))
122 			{
123 				if (res->pos.apos[0] + 1 >= res->alen)
124 				{
125 					res->alen *= 2;
126 					res->pos.apos = (uint16 *) repalloc(res->pos.apos, sizeof(uint16) * res->alen);
127 				}
128 				if (res->pos.apos[0] == 0 || res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos))
129 				{
130 					res->pos.apos[res->pos.apos[0] + 1] = LIMITPOS(ptr->pos.pos);
131 					res->pos.apos[0]++;
132 				}
133 			}
134 		}
135 		ptr++;
136 	}
137 
138 	return res + 1 - a;
139 }
140 
141 /*
142  * make value of tsvector, given parsed text
143  */
144 TSVector
make_tsvector(ParsedText * prs)145 make_tsvector(ParsedText *prs)
146 {
147 	int			i,
148 				j,
149 				lenstr = 0,
150 				totallen;
151 	TSVector	in;
152 	WordEntry  *ptr;
153 	char	   *str;
154 	int			stroff;
155 
156 	prs->curwords = uniqueWORD(prs->words, prs->curwords);
157 	for (i = 0; i < prs->curwords; i++)
158 	{
159 		lenstr += prs->words[i].len;
160 		if (prs->words[i].alen)
161 		{
162 			lenstr = SHORTALIGN(lenstr);
163 			lenstr += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos);
164 		}
165 	}
166 
167 	if (lenstr > MAXSTRPOS)
168 		ereport(ERROR,
169 				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
170 				 errmsg("string is too long for tsvector (%d bytes, max %d bytes)", lenstr, MAXSTRPOS)));
171 
172 	totallen = CALCDATASIZE(prs->curwords, lenstr);
173 	in = (TSVector) palloc0(totallen);
174 	SET_VARSIZE(in, totallen);
175 	in->size = prs->curwords;
176 
177 	ptr = ARRPTR(in);
178 	str = STRPTR(in);
179 	stroff = 0;
180 	for (i = 0; i < prs->curwords; i++)
181 	{
182 		ptr->len = prs->words[i].len;
183 		ptr->pos = stroff;
184 		memcpy(str + stroff, prs->words[i].word, prs->words[i].len);
185 		stroff += prs->words[i].len;
186 		pfree(prs->words[i].word);
187 		if (prs->words[i].alen)
188 		{
189 			int			k = prs->words[i].pos.apos[0];
190 			WordEntryPos *wptr;
191 
192 			if (k > 0xFFFF)
193 				elog(ERROR, "positions array too long");
194 
195 			ptr->haspos = 1;
196 			stroff = SHORTALIGN(stroff);
197 			*(uint16 *) (str + stroff) = (uint16) k;
198 			wptr = POSDATAPTR(in, ptr);
199 			for (j = 0; j < k; j++)
200 			{
201 				WEP_SETWEIGHT(wptr[j], 0);
202 				WEP_SETPOS(wptr[j], prs->words[i].pos.apos[j + 1]);
203 			}
204 			stroff += sizeof(uint16) + k * sizeof(WordEntryPos);
205 			pfree(prs->words[i].pos.apos);
206 		}
207 		else
208 			ptr->haspos = 0;
209 		ptr++;
210 	}
211 	pfree(prs->words);
212 	return in;
213 }
214 
215 Datum
to_tsvector_byid(PG_FUNCTION_ARGS)216 to_tsvector_byid(PG_FUNCTION_ARGS)
217 {
218 	Oid			cfgId = PG_GETARG_OID(0);
219 	text	   *in = PG_GETARG_TEXT_P(1);
220 	ParsedText	prs;
221 	TSVector	out;
222 
223 	prs.lenwords = (VARSIZE(in) - VARHDRSZ) / 6;		/* just estimation of
224 														 * word's number */
225 	if (prs.lenwords == 0)
226 		prs.lenwords = 2;
227 	prs.curwords = 0;
228 	prs.pos = 0;
229 	prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
230 
231 	parsetext(cfgId, &prs, VARDATA(in), VARSIZE(in) - VARHDRSZ);
232 	PG_FREE_IF_COPY(in, 1);
233 
234 	if (prs.curwords)
235 		out = make_tsvector(&prs);
236 	else
237 	{
238 		pfree(prs.words);
239 		out = palloc(CALCDATASIZE(0, 0));
240 		SET_VARSIZE(out, CALCDATASIZE(0, 0));
241 		out->size = 0;
242 	}
243 
244 	PG_RETURN_POINTER(out);
245 }
246 
247 Datum
to_tsvector(PG_FUNCTION_ARGS)248 to_tsvector(PG_FUNCTION_ARGS)
249 {
250 	text	   *in = PG_GETARG_TEXT_P(0);
251 	Oid			cfgId;
252 
253 	cfgId = getTSCurrentConfig(true);
254 	PG_RETURN_DATUM(DirectFunctionCall2(to_tsvector_byid,
255 										ObjectIdGetDatum(cfgId),
256 										PointerGetDatum(in)));
257 }
258 
/*
 * to_tsquery, plainto_tsquery and phraseto_tsquery
 */
262 
263 
264 /*
265  * This function is used for morph parsing.
266  *
267  * The value is passed to parsetext which will call the right dictionary to
268  * lexize the word. If it turns out to be a stopword, we push a QI_VALSTOP
269  * to the stack.
270  *
271  * All words belonging to the same variant are pushed as an ANDed list,
272  * and different variants are ORed together.
273  */
274 static void
pushval_morph(Datum opaque,TSQueryParserState state,char * strval,int lenval,int16 weight,bool prefix)275 pushval_morph(Datum opaque, TSQueryParserState state, char *strval, int lenval, int16 weight, bool prefix)
276 {
277 	int32		count = 0;
278 	ParsedText	prs;
279 	uint32		variant,
280 				pos = 0,
281 				cntvar = 0,
282 				cntpos = 0,
283 				cnt = 0;
284 	MorphOpaque *data = (MorphOpaque *) DatumGetPointer(opaque);
285 
286 	prs.lenwords = 4;
287 	prs.curwords = 0;
288 	prs.pos = 0;
289 	prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
290 
291 	parsetext(data->cfg_id, &prs, strval, lenval);
292 
293 	if (prs.curwords > 0)
294 	{
295 		while (count < prs.curwords)
296 		{
297 			/*
298 			 * Were any stop words removed? If so, fill empty positions with
299 			 * placeholders linked by an appropriate operator.
300 			 */
301 			if (pos > 0 && pos + 1 < prs.words[count].pos.pos)
302 			{
303 				while (pos + 1 < prs.words[count].pos.pos)
304 				{
305 					/* put placeholders for each missing stop word */
306 					pushStop(state);
307 					if (cntpos)
308 						pushOperator(state, data->qoperator, 1);
309 					cntpos++;
310 					pos++;
311 				}
312 			}
313 
314 			/* save current word's position */
315 			pos = prs.words[count].pos.pos;
316 
317 			/* Go through all variants obtained from this token */
318 			cntvar = 0;
319 			while (count < prs.curwords && pos == prs.words[count].pos.pos)
320 			{
321 				variant = prs.words[count].nvariant;
322 
323 				/* Push all words belonging to the same variant */
324 				cnt = 0;
325 				while (count < prs.curwords &&
326 					   pos == prs.words[count].pos.pos &&
327 					   variant == prs.words[count].nvariant)
328 				{
329 					pushValue(state,
330 							  prs.words[count].word,
331 							  prs.words[count].len,
332 							  weight,
333 						  ((prs.words[count].flags & TSL_PREFIX) || prefix));
334 					pfree(prs.words[count].word);
335 					if (cnt)
336 						pushOperator(state, OP_AND, 0);
337 					cnt++;
338 					count++;
339 				}
340 
341 				if (cntvar)
342 					pushOperator(state, OP_OR, 0);
343 				cntvar++;
344 			}
345 
346 			if (cntpos)
347 			{
348 				/* distance may be useful */
349 				pushOperator(state, data->qoperator, 1);
350 			}
351 
352 			cntpos++;
353 		}
354 
355 		pfree(prs.words);
356 
357 	}
358 	else
359 		pushStop(state);
360 }
361 
362 Datum
to_tsquery_byid(PG_FUNCTION_ARGS)363 to_tsquery_byid(PG_FUNCTION_ARGS)
364 {
365 	text	   *in = PG_GETARG_TEXT_P(1);
366 	TSQuery		query;
367 	MorphOpaque data;
368 
369 	data.cfg_id = PG_GETARG_OID(0);
370 	data.qoperator = OP_AND;
371 
372 	query = parse_tsquery(text_to_cstring(in),
373 						  pushval_morph,
374 						  PointerGetDatum(&data),
375 						  false);
376 
377 	PG_RETURN_TSQUERY(query);
378 }
379 
380 Datum
to_tsquery(PG_FUNCTION_ARGS)381 to_tsquery(PG_FUNCTION_ARGS)
382 {
383 	text	   *in = PG_GETARG_TEXT_P(0);
384 	Oid			cfgId;
385 
386 	cfgId = getTSCurrentConfig(true);
387 	PG_RETURN_DATUM(DirectFunctionCall2(to_tsquery_byid,
388 										ObjectIdGetDatum(cfgId),
389 										PointerGetDatum(in)));
390 }
391 
392 Datum
plainto_tsquery_byid(PG_FUNCTION_ARGS)393 plainto_tsquery_byid(PG_FUNCTION_ARGS)
394 {
395 	text	   *in = PG_GETARG_TEXT_P(1);
396 	TSQuery		query;
397 	MorphOpaque data;
398 
399 	data.cfg_id = PG_GETARG_OID(0);
400 	data.qoperator = OP_AND;
401 
402 	query = parse_tsquery(text_to_cstring(in),
403 						  pushval_morph,
404 						  PointerGetDatum(&data),
405 						  true);
406 
407 	PG_RETURN_POINTER(query);
408 }
409 
410 Datum
plainto_tsquery(PG_FUNCTION_ARGS)411 plainto_tsquery(PG_FUNCTION_ARGS)
412 {
413 	text	   *in = PG_GETARG_TEXT_P(0);
414 	Oid			cfgId;
415 
416 	cfgId = getTSCurrentConfig(true);
417 	PG_RETURN_DATUM(DirectFunctionCall2(plainto_tsquery_byid,
418 										ObjectIdGetDatum(cfgId),
419 										PointerGetDatum(in)));
420 }
421 
422 
423 Datum
phraseto_tsquery_byid(PG_FUNCTION_ARGS)424 phraseto_tsquery_byid(PG_FUNCTION_ARGS)
425 {
426 	text	   *in = PG_GETARG_TEXT_P(1);
427 	TSQuery		query;
428 	MorphOpaque data;
429 
430 	data.cfg_id = PG_GETARG_OID(0);
431 	data.qoperator = OP_PHRASE;
432 
433 	query = parse_tsquery(text_to_cstring(in),
434 						  pushval_morph,
435 						  PointerGetDatum(&data),
436 						  true);
437 
438 	PG_RETURN_TSQUERY(query);
439 }
440 
441 Datum
phraseto_tsquery(PG_FUNCTION_ARGS)442 phraseto_tsquery(PG_FUNCTION_ARGS)
443 {
444 	text	   *in = PG_GETARG_TEXT_P(0);
445 	Oid			cfgId;
446 
447 	cfgId = getTSCurrentConfig(true);
448 	PG_RETURN_DATUM(DirectFunctionCall2(phraseto_tsquery_byid,
449 										ObjectIdGetDatum(cfgId),
450 										PointerGetDatum(in)));
451 }
452