1 /*-------------------------------------------------------------------------
2  *
3  * ts_parse.c
4  *		main parse functions for tsearch
5  *
6  * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *	  src/backend/tsearch/ts_parse.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include "tsearch/ts_cache.h"
18 #include "tsearch/ts_utils.h"
19 
20 #define IGNORE_LONGLEXEME	1
21 
22 /*
23  * Lexize subsystem
24  */
25 
/*
 * One token handed to the lexize machinery, kept as a node of a singly
 * linked list.
 */
typedef struct ParsedLex
{
	int			type;			/* token type as returned by the parser's
								 * token function */
	char	   *lemm;			/* token text; the pointer is stored as given,
								 * the text is not copied here */
	int			lenlemm;		/* length reported together with lemm */
	struct ParsedLex *next;		/* next node, or NULL at the end of a list */
} ParsedLex;
33 
/*
 * Singly linked list of ParsedLex nodes with O(1) append: head and tail are
 * both NULL when the list is empty.
 */
typedef struct ListParsedLex
{
	ParsedLex  *head;			/* first node, removed by LPLRemoveHead() */
	ParsedLex  *tail;			/* last node, extended by LPLAddTail() */
} ListParsedLex;
39 
/*
 * Working state of the lexize loop (see LexizeExec()).
 */
typedef struct
{
	TSConfigCacheEntry *cfg;	/* configuration whose token-type map is used */
	Oid			curDictId;		/* dictionary that asked for more lexemes, or
								 * InvalidOid when in one-word mode */
	int			posDict;		/* index in the current map's dictionary list
								 * at which one-word mode (re)starts */
	DictSubState dictState;		/* state structure passed to the dictionary's
								 * lexize function */
	ParsedLex  *curSub;			/* next lexeme to offer to curDictId */
	ListParsedLex towork;		/* lexemes still waiting to be lexized */
	ListParsedLex waste;		/* lexemes that have already been lexized */

	/*
	 * Fields to store the last variant to lexize (basically for a thesaurus
	 * or a similar dictionary, which wants several lexemes).
	 */

	ParsedLex  *lastRes;		/* lexeme for which tmpRes was produced */
	TSLexeme   *tmpRes;			/* latest result saved while the dictionary
								 * was still asking for the next lexeme */
} LexizeData;
58 
59 static void
LexizeInit(LexizeData * ld,TSConfigCacheEntry * cfg)60 LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg)
61 {
62 	ld->cfg = cfg;
63 	ld->curDictId = InvalidOid;
64 	ld->posDict = 0;
65 	ld->towork.head = ld->towork.tail = ld->curSub = NULL;
66 	ld->waste.head = ld->waste.tail = NULL;
67 	ld->lastRes = NULL;
68 	ld->tmpRes = NULL;
69 }
70 
71 static void
LPLAddTail(ListParsedLex * list,ParsedLex * newpl)72 LPLAddTail(ListParsedLex *list, ParsedLex *newpl)
73 {
74 	if (list->tail)
75 	{
76 		list->tail->next = newpl;
77 		list->tail = newpl;
78 	}
79 	else
80 		list->head = list->tail = newpl;
81 	newpl->next = NULL;
82 }
83 
84 static ParsedLex *
LPLRemoveHead(ListParsedLex * list)85 LPLRemoveHead(ListParsedLex *list)
86 {
87 	ParsedLex  *res = list->head;
88 
89 	if (list->head)
90 		list->head = list->head->next;
91 
92 	if (list->head == NULL)
93 		list->tail = NULL;
94 
95 	return res;
96 }
97 
98 static void
LexizeAddLemm(LexizeData * ld,int type,char * lemm,int lenlemm)99 LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm)
100 {
101 	ParsedLex  *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
102 
103 	newpl->type = type;
104 	newpl->lemm = lemm;
105 	newpl->lenlemm = lenlemm;
106 	LPLAddTail(&ld->towork, newpl);
107 	ld->curSub = ld->towork.tail;
108 }
109 
110 static void
RemoveHead(LexizeData * ld)111 RemoveHead(LexizeData *ld)
112 {
113 	LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
114 
115 	ld->posDict = 0;
116 }
117 
118 static void
setCorrLex(LexizeData * ld,ParsedLex ** correspondLexem)119 setCorrLex(LexizeData *ld, ParsedLex **correspondLexem)
120 {
121 	if (correspondLexem)
122 	{
123 		*correspondLexem = ld->waste.head;
124 	}
125 	else
126 	{
127 		ParsedLex  *tmp,
128 				   *ptr = ld->waste.head;
129 
130 		while (ptr)
131 		{
132 			tmp = ptr->next;
133 			pfree(ptr);
134 			ptr = tmp;
135 		}
136 	}
137 	ld->waste.head = ld->waste.tail = NULL;
138 }
139 
140 static void
moveToWaste(LexizeData * ld,ParsedLex * stop)141 moveToWaste(LexizeData *ld, ParsedLex *stop)
142 {
143 	bool		go = true;
144 
145 	while (ld->towork.head && go)
146 	{
147 		if (ld->towork.head == stop)
148 		{
149 			ld->curSub = stop->next;
150 			go = false;
151 		}
152 		RemoveHead(ld);
153 	}
154 }
155 
156 static void
setNewTmpRes(LexizeData * ld,ParsedLex * lex,TSLexeme * res)157 setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res)
158 {
159 	if (ld->tmpRes)
160 	{
161 		TSLexeme   *ptr;
162 
163 		for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
164 			pfree(ptr->lexeme);
165 		pfree(ld->tmpRes);
166 	}
167 	ld->tmpRes = res;
168 	ld->lastRes = lex;
169 }
170 
/*
 * Try to produce the next dictionary result from the lexemes queued in ld.
 *
 * Works in one of two modes, selected by ld->curDictId:
 *	- InvalidOid: one-word mode; each lexeme at the head of ld->towork is
 *	  offered to the dictionaries mapped to its token type, in order.
 *	- otherwise: multiword mode; the dictionary ld->curDictId asked for more
 *	  input and is fed the lexemes starting at ld->curSub.
 *
 * Returns the TSLexeme array produced by a dictionary, or NULL when nothing
 * (more) can be produced from the lexemes currently queued.  Before every
 * non-recursive return setCorrLex() is called, so if correspondLexem is
 * non-NULL it receives the list of lexemes consumed so far, and otherwise
 * those lexemes are freed.
 */
static TSLexeme *
LexizeExec(LexizeData *ld, ParsedLex **correspondLexem)
{
	int			i;
	ListDictionary *map;
	TSDictionaryCacheEntry *dict;
	TSLexeme   *res;

	if (ld->curDictId == InvalidOid)
	{
		/*
		 * usual mode: dictionary wants only one word, but we should keep in
		 * mind that we should go through all stack
		 */

		while (ld->towork.head)
		{
			ParsedLex  *curVal = ld->towork.head;
			char	   *curValLemm = curVal->lemm;
			int			curValLenLemm = curVal->lenlemm;

			/* map is only dereferenced after the range check just below */
			map = ld->cfg->map + curVal->type;

			if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
			{
				/* skip this type of lexeme */
				RemoveHead(ld);
				continue;
			}

			/* posDict is nonzero only when resuming after multiword mode */
			for (i = ld->posDict; i < map->len; i++)
			{
				dict = lookup_ts_dictionary_cache(map->dictIds[i]);

				/* every dictionary starts from a clean sub-state here */
				ld->dictState.isend = ld->dictState.getnext = false;
				ld->dictState.private_state = NULL;
				res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
																 PointerGetDatum(dict->dictData),
																 PointerGetDatum(curValLemm),
																 Int32GetDatum(curValLenLemm),
																 PointerGetDatum(&ld->dictState)));

				if (ld->dictState.getnext)
				{
					/*
					 * dictionary wants next word, so setup and store current
					 * position and go to multiword mode
					 */

					ld->curDictId = DatumGetObjectId(map->dictIds[i]);
					/* where one-word mode resumes if multiword mode fails */
					ld->posDict = i + 1;
					ld->curSub = curVal->next;
					/* keep a result produced so far as a fallback */
					if (res)
						setNewTmpRes(ld, curVal, res);
					return LexizeExec(ld, correspondLexem);
				}

				if (!res)		/* dictionary doesn't know this lexeme */
					continue;

				if (res->flags & TSL_FILTER)
				{
					/*
					 * filtering dictionary: the following dictionaries get
					 * its output instead of the original text
					 */
					curValLemm = res->lexeme;
					curValLenLemm = strlen(res->lexeme);
					continue;
				}

				/* lexeme recognized: consume it and report the result */
				RemoveHead(ld);
				setCorrLex(ld, correspondLexem);
				return res;
			}

			/* no dictionary produced a result for this lexeme */
			RemoveHead(ld);
		}
	}
	else
	{							/* curDictId is valid */
		dict = lookup_ts_dictionary_cache(ld->curDictId);

		/*
		 * Dictionary ld->curDictId asks us about following words
		 */

		while (ld->curSub)
		{
			ParsedLex  *curVal = ld->curSub;

			/* map is only dereferenced after the range check below */
			map = ld->cfg->map + curVal->type;

			if (curVal->type != 0)
			{
				bool		dictExists = false;

				if (curVal->type >= ld->cfg->lenmap || map->len == 0)
				{
					/* skip this type of lexeme */
					ld->curSub = curVal->next;
					continue;
				}

				/*
				 * We should be sure that the current type of lexeme is
				 * recognized by our dictionary: we just check whether it
				 * exists in the list of dictionaries for this type.
				 */
				for (i = 0; i < map->len && !dictExists; i++)
					if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
						dictExists = true;

				if (!dictExists)
				{
					/*
					 * Dictionary can't work with current type of lexeme,
					 * return to basic mode and redo all stored lexemes
					 */
					ld->curDictId = InvalidOid;
					return LexizeExec(ld, correspondLexem);
				}
			}

			/* a lexeme of type 0 is presented to the dictionary as the end */
			ld->dictState.isend = (curVal->type == 0) ? true : false;
			ld->dictState.getnext = false;

			res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
															 PointerGetDatum(dict->dictData),
															 PointerGetDatum(curVal->lemm),
															 Int32GetDatum(curVal->lenlemm),
															 PointerGetDatum(&ld->dictState)));

			if (ld->dictState.getnext)
			{
				/* Dictionary wants one more */
				ld->curSub = curVal->next;
				if (res)
					setNewTmpRes(ld, curVal, res);
				continue;
			}

			if (res || ld->tmpRes)
			{
				/*
				 * Dictionary normalizes lexemes, so we remove from stack all
				 * used lexemes, return to basic mode and redo end of stack
				 * (if it exists)
				 */
				if (res)
				{
					/* fresh result: everything up to curSub was consumed */
					moveToWaste(ld, ld->curSub);
				}
				else
				{
					/* fall back to the saved result and its last lexeme */
					res = ld->tmpRes;
					moveToWaste(ld, ld->lastRes);
				}

				/* reset to initial state */
				ld->curDictId = InvalidOid;
				ld->posDict = 0;
				ld->lastRes = NULL;
				ld->tmpRes = NULL;
				setCorrLex(ld, correspondLexem);
				return res;
			}

			/*
			 * Dictionary doesn't want the next lexeme and didn't recognize
			 * anything, redo from ld->towork.head
			 */
			ld->curDictId = InvalidOid;
			return LexizeExec(ld, correspondLexem);
		}
	}

	/* nothing to return for now; still hand over or free consumed lexemes */
	setCorrLex(ld, correspondLexem);
	return NULL;
}
347 
348 /*
349  * Parse string and lexize words.
350  *
351  * prs will be filled in.
352  */
353 void
parsetext(Oid cfgId,ParsedText * prs,char * buf,int buflen)354 parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen)
355 {
356 	int			type,
357 				lenlemm;
358 	char	   *lemm = NULL;
359 	LexizeData	ldata;
360 	TSLexeme   *norms;
361 	TSConfigCacheEntry *cfg;
362 	TSParserCacheEntry *prsobj;
363 	void	   *prsdata;
364 
365 	cfg = lookup_ts_config_cache(cfgId);
366 	prsobj = lookup_ts_parser_cache(cfg->prsId);
367 
368 	prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart,
369 													 PointerGetDatum(buf),
370 													 Int32GetDatum(buflen)));
371 
372 	LexizeInit(&ldata, cfg);
373 
374 	do
375 	{
376 		type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
377 										   PointerGetDatum(prsdata),
378 										   PointerGetDatum(&lemm),
379 										   PointerGetDatum(&lenlemm)));
380 
381 		if (type > 0 && lenlemm >= MAXSTRLEN)
382 		{
383 #ifdef IGNORE_LONGLEXEME
384 			ereport(NOTICE,
385 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
386 					 errmsg("word is too long to be indexed"),
387 					 errdetail("Words longer than %d characters are ignored.",
388 							   MAXSTRLEN)));
389 			continue;
390 #else
391 			ereport(ERROR,
392 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
393 					 errmsg("word is too long to be indexed"),
394 					 errdetail("Words longer than %d characters are ignored.",
395 							   MAXSTRLEN)));
396 #endif
397 		}
398 
399 		LexizeAddLemm(&ldata, type, lemm, lenlemm);
400 
401 		while ((norms = LexizeExec(&ldata, NULL)) != NULL)
402 		{
403 			TSLexeme   *ptr = norms;
404 
405 			prs->pos++;			/* set pos */
406 
407 			while (ptr->lexeme)
408 			{
409 				if (prs->curwords == prs->lenwords)
410 				{
411 					prs->lenwords *= 2;
412 					prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord));
413 				}
414 
415 				if (ptr->flags & TSL_ADDPOS)
416 					prs->pos++;
417 				prs->words[prs->curwords].len = strlen(ptr->lexeme);
418 				prs->words[prs->curwords].word = ptr->lexeme;
419 				prs->words[prs->curwords].nvariant = ptr->nvariant;
420 				prs->words[prs->curwords].flags = ptr->flags & TSL_PREFIX;
421 				prs->words[prs->curwords].alen = 0;
422 				prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
423 				ptr++;
424 				prs->curwords++;
425 			}
426 			pfree(norms);
427 		}
428 	} while (type > 0);
429 
430 	FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
431 }
432 
433 /*
434  * Headline framework
435  */
436 static void
hladdword(HeadlineParsedText * prs,char * buf,int buflen,int type)437 hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
438 {
439 	while (prs->curwords >= prs->lenwords)
440 	{
441 		prs->lenwords *= 2;
442 		prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
443 	}
444 	memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry));
445 	prs->words[prs->curwords].type = (uint8) type;
446 	prs->words[prs->curwords].len = buflen;
447 	prs->words[prs->curwords].word = palloc(buflen);
448 	memcpy(prs->words[prs->curwords].word, buf, buflen);
449 	prs->curwords++;
450 }
451 
452 static void
hlfinditem(HeadlineParsedText * prs,TSQuery query,int32 pos,char * buf,int buflen)453 hlfinditem(HeadlineParsedText *prs, TSQuery query, int32 pos, char *buf, int buflen)
454 {
455 	int			i;
456 	QueryItem  *item = GETQUERY(query);
457 	HeadlineWordEntry *word;
458 
459 	while (prs->curwords + query->size >= prs->lenwords)
460 	{
461 		prs->lenwords *= 2;
462 		prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
463 	}
464 
465 	word = &(prs->words[prs->curwords - 1]);
466 	word->pos = LIMITPOS(pos);
467 	for (i = 0; i < query->size; i++)
468 	{
469 		if (item->type == QI_VAL &&
470 			tsCompareString(GETOPERAND(query) + item->qoperand.distance, item->qoperand.length,
471 							buf, buflen, item->qoperand.prefix) == 0)
472 		{
473 			if (word->item)
474 			{
475 				memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry));
476 				prs->words[prs->curwords].item = &item->qoperand;
477 				prs->words[prs->curwords].repeated = 1;
478 				prs->curwords++;
479 			}
480 			else
481 				word->item = &item->qoperand;
482 		}
483 		item++;
484 	}
485 }
486 
487 static void
addHLParsedLex(HeadlineParsedText * prs,TSQuery query,ParsedLex * lexs,TSLexeme * norms)488 addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme *norms)
489 {
490 	ParsedLex  *tmplexs;
491 	TSLexeme   *ptr;
492 	int32		savedpos;
493 
494 	while (lexs)
495 	{
496 		if (lexs->type > 0)
497 			hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
498 
499 		ptr = norms;
500 		savedpos = prs->vectorpos;
501 		while (ptr && ptr->lexeme)
502 		{
503 			if (ptr->flags & TSL_ADDPOS)
504 				savedpos++;
505 			hlfinditem(prs, query, savedpos, ptr->lexeme, strlen(ptr->lexeme));
506 			ptr++;
507 		}
508 
509 		tmplexs = lexs->next;
510 		pfree(lexs);
511 		lexs = tmplexs;
512 	}
513 
514 	if (norms)
515 	{
516 		ptr = norms;
517 		while (ptr->lexeme)
518 		{
519 			if (ptr->flags & TSL_ADDPOS)
520 				prs->vectorpos++;
521 			pfree(ptr->lexeme);
522 			ptr++;
523 		}
524 		pfree(norms);
525 	}
526 }
527 
528 void
hlparsetext(Oid cfgId,HeadlineParsedText * prs,TSQuery query,char * buf,int buflen)529 hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
530 {
531 	int			type,
532 				lenlemm;
533 	char	   *lemm = NULL;
534 	LexizeData	ldata;
535 	TSLexeme   *norms;
536 	ParsedLex  *lexs;
537 	TSConfigCacheEntry *cfg;
538 	TSParserCacheEntry *prsobj;
539 	void	   *prsdata;
540 
541 	cfg = lookup_ts_config_cache(cfgId);
542 	prsobj = lookup_ts_parser_cache(cfg->prsId);
543 
544 	prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
545 													 PointerGetDatum(buf),
546 													 Int32GetDatum(buflen)));
547 
548 	LexizeInit(&ldata, cfg);
549 
550 	do
551 	{
552 		type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
553 										   PointerGetDatum(prsdata),
554 										   PointerGetDatum(&lemm),
555 										   PointerGetDatum(&lenlemm)));
556 
557 		if (type > 0 && lenlemm >= MAXSTRLEN)
558 		{
559 #ifdef IGNORE_LONGLEXEME
560 			ereport(NOTICE,
561 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
562 					 errmsg("word is too long to be indexed"),
563 					 errdetail("Words longer than %d characters are ignored.",
564 							   MAXSTRLEN)));
565 			continue;
566 #else
567 			ereport(ERROR,
568 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
569 					 errmsg("word is too long to be indexed"),
570 					 errdetail("Words longer than %d characters are ignored.",
571 							   MAXSTRLEN)));
572 #endif
573 		}
574 
575 		LexizeAddLemm(&ldata, type, lemm, lenlemm);
576 
577 		do
578 		{
579 			if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
580 			{
581 				prs->vectorpos++;
582 				addHLParsedLex(prs, query, lexs, norms);
583 			}
584 			else
585 				addHLParsedLex(prs, query, lexs, NULL);
586 		} while (norms);
587 
588 	} while (type > 0);
589 
590 	FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
591 }
592 
593 text *
generateHeadline(HeadlineParsedText * prs)594 generateHeadline(HeadlineParsedText *prs)
595 {
596 	text	   *out;
597 	char	   *ptr;
598 	int			len = 128;
599 	int			numfragments = 0;
600 	int16		infrag = 0;
601 
602 	HeadlineWordEntry *wrd = prs->words;
603 
604 	out = (text *) palloc(len);
605 	ptr = ((char *) out) + VARHDRSZ;
606 
607 	while (wrd - prs->words < prs->curwords)
608 	{
609 		while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len)
610 		{
611 			int			dist = ptr - ((char *) out);
612 
613 			len *= 2;
614 			out = (text *) repalloc(out, len);
615 			ptr = ((char *) out) + dist;
616 		}
617 
618 		if (wrd->in && !wrd->repeated)
619 		{
620 			if (!infrag)
621 			{
622 
623 				/* start of a new fragment */
624 				infrag = 1;
625 				numfragments++;
626 				/* add a fragment delimiter if this is after the first one */
627 				if (numfragments > 1)
628 				{
629 					memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
630 					ptr += prs->fragdelimlen;
631 				}
632 
633 			}
634 			if (wrd->replace)
635 			{
636 				*ptr = ' ';
637 				ptr++;
638 			}
639 			else if (!wrd->skip)
640 			{
641 				if (wrd->selected)
642 				{
643 					memcpy(ptr, prs->startsel, prs->startsellen);
644 					ptr += prs->startsellen;
645 				}
646 				memcpy(ptr, wrd->word, wrd->len);
647 				ptr += wrd->len;
648 				if (wrd->selected)
649 				{
650 					memcpy(ptr, prs->stopsel, prs->stopsellen);
651 					ptr += prs->stopsellen;
652 				}
653 			}
654 		}
655 		else if (!wrd->repeated)
656 		{
657 			if (infrag)
658 				infrag = 0;
659 			pfree(wrd->word);
660 		}
661 
662 		wrd++;
663 	}
664 
665 	SET_VARSIZE(out, ptr - ((char *) out));
666 	return out;
667 }
668