1 /*-------------------------------------------------------------------------
2  *
3  * ts_parse.c
4  *		main parse functions for tsearch
5  *
6  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *	  src/backend/tsearch/ts_parse.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include "tsearch/ts_cache.h"
18 #include "tsearch/ts_utils.h"
19 
/*
 * When this symbol is defined, parsetext() and hlparsetext() report a token
 * whose length is MAXSTRLEN or more with a NOTICE and skip it; when it is
 * not defined they raise an ERROR instead.
 */
#define IGNORE_LONGLEXEME	1
21 
22 /*
23  * Lexize subsystem
24  */
25 
/*
 * One token queued for lexizing, kept as a node of a singly linked list.
 */
typedef struct ParsedLex
{
	int			type;			/* token type; used as index into cfg->map */
	char	   *lemm;			/* token text; the pointer is stored as given,
								 * it is not copied here */
	int			lenlemm;		/* length handed to the dictionaries with lemm */
	struct ParsedLex *next;		/* next node in the list, NULL at the end */
} ParsedLex;
33 
/*
 * Singly linked list of ParsedLex nodes, tracked by its first and last node.
 */
typedef struct ListParsedLex
{
	ParsedLex  *head;			/* first node, NULL when the list is empty */
	ParsedLex  *tail;			/* last node, NULL when the list is empty */
} ListParsedLex;
39 
/*
 * Working state of the lexize machinery for one parse run.
 */
typedef struct
{
	TSConfigCacheEntry *cfg;	/* configuration whose token-type map is used */
	Oid			curDictId;		/* InvalidOid in single-word mode, else the
								 * dictionary that asked for more words */
	int			posDict;		/* index in the map's dictionary list at which
								 * the scan for the head token (re)starts */
	DictSubState dictState;		/* state struct passed to each lexize call */
	ParsedLex  *curSub;			/* next token to feed to curDictId */
	ListParsedLex towork;		/* current list to work */
	ListParsedLex waste;		/* list of lexemes that already lexized */

	/*
	 * Fields to store the last variant to lexize (basically for a thesaurus
	 * or a similar dictionary, which wants several lexemes).
	 */

	ParsedLex  *lastRes;		/* token at which tmpRes was produced */
	TSLexeme   *tmpRes;			/* last non-NULL result saved while the
								 * dictionary kept asking for more words */
} LexizeData;
58 
59 static void
LexizeInit(LexizeData * ld,TSConfigCacheEntry * cfg)60 LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg)
61 {
62 	ld->cfg = cfg;
63 	ld->curDictId = InvalidOid;
64 	ld->posDict = 0;
65 	ld->towork.head = ld->towork.tail = ld->curSub = NULL;
66 	ld->waste.head = ld->waste.tail = NULL;
67 	ld->lastRes = NULL;
68 	ld->tmpRes = NULL;
69 }
70 
71 static void
LPLAddTail(ListParsedLex * list,ParsedLex * newpl)72 LPLAddTail(ListParsedLex *list, ParsedLex *newpl)
73 {
74 	if (list->tail)
75 	{
76 		list->tail->next = newpl;
77 		list->tail = newpl;
78 	}
79 	else
80 		list->head = list->tail = newpl;
81 	newpl->next = NULL;
82 }
83 
84 static ParsedLex *
LPLRemoveHead(ListParsedLex * list)85 LPLRemoveHead(ListParsedLex *list)
86 {
87 	ParsedLex  *res = list->head;
88 
89 	if (list->head)
90 		list->head = list->head->next;
91 
92 	if (list->head == NULL)
93 		list->tail = NULL;
94 
95 	return res;
96 }
97 
98 static void
LexizeAddLemm(LexizeData * ld,int type,char * lemm,int lenlemm)99 LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm)
100 {
101 	ParsedLex  *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
102 
103 	newpl->type = type;
104 	newpl->lemm = lemm;
105 	newpl->lenlemm = lenlemm;
106 	LPLAddTail(&ld->towork, newpl);
107 	ld->curSub = ld->towork.tail;
108 }
109 
110 static void
RemoveHead(LexizeData * ld)111 RemoveHead(LexizeData *ld)
112 {
113 	LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
114 
115 	ld->posDict = 0;
116 }
117 
118 static void
setCorrLex(LexizeData * ld,ParsedLex ** correspondLexem)119 setCorrLex(LexizeData *ld, ParsedLex **correspondLexem)
120 {
121 	if (correspondLexem)
122 	{
123 		*correspondLexem = ld->waste.head;
124 	}
125 	else
126 	{
127 		ParsedLex  *tmp,
128 				   *ptr = ld->waste.head;
129 
130 		while (ptr)
131 		{
132 			tmp = ptr->next;
133 			pfree(ptr);
134 			ptr = tmp;
135 		}
136 	}
137 	ld->waste.head = ld->waste.tail = NULL;
138 }
139 
140 static void
moveToWaste(LexizeData * ld,ParsedLex * stop)141 moveToWaste(LexizeData *ld, ParsedLex *stop)
142 {
143 	bool		go = true;
144 
145 	while (ld->towork.head && go)
146 	{
147 		if (ld->towork.head == stop)
148 		{
149 			ld->curSub = stop->next;
150 			go = false;
151 		}
152 		RemoveHead(ld);
153 	}
154 }
155 
156 static void
setNewTmpRes(LexizeData * ld,ParsedLex * lex,TSLexeme * res)157 setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res)
158 {
159 	if (ld->tmpRes)
160 	{
161 		TSLexeme   *ptr;
162 
163 		for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
164 			pfree(ptr->lexeme);
165 		pfree(ld->tmpRes);
166 	}
167 	ld->tmpRes = res;
168 	ld->lastRes = lex;
169 }
170 
/*
 * Run the dictionaries over the queued tokens and return the next result.
 *
 * The function works in one of two modes, selected by ld->curDictId:
 *
 *	- InvalidOid ("usual" mode): the head of ld->towork is offered, one
 *	  dictionary at a time, to the dictionaries mapped to its token type,
 *	  starting at index ld->posDict.
 *
 *	- a valid OID (multiword mode): that dictionary set dictState.getnext on
 *	  an earlier call and is now fed the tokens starting at ld->curSub.
 *
 * Returns the TSLexeme array produced by a dictionary, or NULL when no
 * result can be produced from the tokens queued so far.  Before each such
 * return, setCorrLex() either stores the head of ld->waste in
 * *correspondLexem or, when correspondLexem is NULL, frees those nodes.
 */
static TSLexeme *
LexizeExec(LexizeData *ld, ParsedLex **correspondLexem)
{
	int			i;
	ListDictionary *map;
	TSDictionaryCacheEntry *dict;
	TSLexeme   *res;

	if (ld->curDictId == InvalidOid)
	{
		/*
		 * usual mode: dictionary wants only one word, but we should keep in
		 * mind that we should go through all stack
		 */

		while (ld->towork.head)
		{
			ParsedLex  *curVal = ld->towork.head;
			char	   *curValLemm = curVal->lemm;
			int			curValLenLemm = curVal->lenlemm;

			/*
			 * map is computed before the range test below; map->len is only
			 * evaluated after the first two conditions have been found false.
			 */
			map = ld->cfg->map + curVal->type;

			if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
			{
				/* skip this type of lexeme */
				RemoveHead(ld);
				continue;
			}

			/* try the mapped dictionaries in order, resuming at posDict */
			for (i = ld->posDict; i < map->len; i++)
			{
				dict = lookup_ts_dictionary_cache(map->dictIds[i]);

				ld->dictState.isend = ld->dictState.getnext = false;
				ld->dictState.private_state = NULL;
				res = (TSLexeme *) DatumGetPointer(FunctionCall4(
																 &(dict->lexize),
																 PointerGetDatum(dict->dictData),
																 PointerGetDatum(curValLemm),
																 Int32GetDatum(curValLenLemm),
																 PointerGetDatum(&ld->dictState)
																 ));

				if (ld->dictState.getnext)
				{
					/*
					 * dictionary wants next word, so setup and store current
					 * position and go to multiword mode
					 */

					ld->curDictId = DatumGetObjectId(map->dictIds[i]);
					/* if multiword mode fails, resume with the next dictionary */
					ld->posDict = i + 1;
					ld->curSub = curVal->next;
					if (res)
						setNewTmpRes(ld, curVal, res);
					return LexizeExec(ld, correspondLexem);
				}

				if (!res)		/* dictionary doesn't know this lexeme */
					continue;

				if (res->flags & TSL_FILTER)
				{
					/*
					 * the remaining dictionaries are given this result's
					 * first lexeme instead of the original token text
					 */
					curValLemm = res->lexeme;
					curValLenLemm = strlen(res->lexeme);
					continue;
				}

				/* got a final result: the head token is consumed */
				RemoveHead(ld);
				setCorrLex(ld, correspondLexem);
				return res;
			}

			/* no dictionary returned a result for this token; drop it */
			RemoveHead(ld);
		}
	}
	else
	{							/* curDictId is valid */
		dict = lookup_ts_dictionary_cache(ld->curDictId);

		/*
		 * Dictionary ld->curDictId asks us about following words
		 */

		while (ld->curSub)
		{
			ParsedLex  *curVal = ld->curSub;

			map = ld->cfg->map + curVal->type;

			/*
			 * A token of type 0 skips the checks below and is passed to the
			 * dictionary with dictState.isend set.
			 */
			if (curVal->type != 0)
			{
				bool		dictExists = false;

				if (curVal->type >= ld->cfg->lenmap || map->len == 0)
				{
					/* skip this type of lexeme */
					ld->curSub = curVal->next;
					continue;
				}

				/*
				 * We should be sure that the current type of lexeme is
				 * recognized by our dictionary: we just check whether it
				 * exists in the list of dictionaries for this type.
				 */
				for (i = 0; i < map->len && !dictExists; i++)
					if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
						dictExists = true;

				if (!dictExists)
				{
					/*
					 * Dictionary can't work with current type of lexeme,
					 * return to basic mode and redo all stored lexemes
					 */
					ld->curDictId = InvalidOid;
					return LexizeExec(ld, correspondLexem);
				}
			}

			ld->dictState.isend = (curVal->type == 0) ? true : false;
			ld->dictState.getnext = false;

			res = (TSLexeme *) DatumGetPointer(FunctionCall4(
															 &(dict->lexize),
															 PointerGetDatum(dict->dictData),
															 PointerGetDatum(curVal->lemm),
															 Int32GetDatum(curVal->lenlemm),
															 PointerGetDatum(&ld->dictState)
															 ));

			if (ld->dictState.getnext)
			{
				/* Dictionary wants one more */
				ld->curSub = curVal->next;
				if (res)
					setNewTmpRes(ld, curVal, res);
				continue;
			}

			if (res || ld->tmpRes)
			{
				/*
				 * Dictionary normalizes lexemes, so we remove from stack all
				 * used lexemes, return to basic mode and redo end of stack
				 * (if it exists)
				 */
				if (res)
				{
					/* fresh result: everything up to the current token is used */
					moveToWaste(ld, ld->curSub);
				}
				else
				{
					/* fall back to the saved result and the token it ended at */
					res = ld->tmpRes;
					moveToWaste(ld, ld->lastRes);
				}

				/* reset to initial state */
				ld->curDictId = InvalidOid;
				ld->posDict = 0;
				ld->lastRes = NULL;
				/* tmpRes is only forgotten here, not freed: it may be res */
				ld->tmpRes = NULL;
				setCorrLex(ld, correspondLexem);
				return res;
			}

			/*
			 * Dict doesn't want the next lexeme and didn't recognize
			 * anything, redo from ld->towork.head
			 */
			ld->curDictId = InvalidOid;
			return LexizeExec(ld, correspondLexem);
		}
	}

	/* nothing more can be produced from the tokens queued so far */
	setCorrLex(ld, correspondLexem);
	return NULL;
}
351 
352 /*
353  * Parse string and lexize words.
354  *
355  * prs will be filled in.
356  */
357 void
parsetext(Oid cfgId,ParsedText * prs,char * buf,int buflen)358 parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen)
359 {
360 	int			type,
361 				lenlemm;
362 	char	   *lemm = NULL;
363 	LexizeData	ldata;
364 	TSLexeme   *norms;
365 	TSConfigCacheEntry *cfg;
366 	TSParserCacheEntry *prsobj;
367 	void	   *prsdata;
368 
369 	cfg = lookup_ts_config_cache(cfgId);
370 	prsobj = lookup_ts_parser_cache(cfg->prsId);
371 
372 	prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart,
373 													 PointerGetDatum(buf),
374 													 Int32GetDatum(buflen)));
375 
376 	LexizeInit(&ldata, cfg);
377 
378 	do
379 	{
380 		type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
381 										   PointerGetDatum(prsdata),
382 										   PointerGetDatum(&lemm),
383 										   PointerGetDatum(&lenlemm)));
384 
385 		if (type > 0 && lenlemm >= MAXSTRLEN)
386 		{
387 #ifdef IGNORE_LONGLEXEME
388 			ereport(NOTICE,
389 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
390 					 errmsg("word is too long to be indexed"),
391 					 errdetail("Words longer than %d characters are ignored.",
392 							   MAXSTRLEN)));
393 			continue;
394 #else
395 			ereport(ERROR,
396 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
397 					 errmsg("word is too long to be indexed"),
398 					 errdetail("Words longer than %d characters are ignored.",
399 							   MAXSTRLEN)));
400 #endif
401 		}
402 
403 		LexizeAddLemm(&ldata, type, lemm, lenlemm);
404 
405 		while ((norms = LexizeExec(&ldata, NULL)) != NULL)
406 		{
407 			TSLexeme   *ptr = norms;
408 
409 			prs->pos++;			/* set pos */
410 
411 			while (ptr->lexeme)
412 			{
413 				if (prs->curwords == prs->lenwords)
414 				{
415 					prs->lenwords *= 2;
416 					prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord));
417 				}
418 
419 				if (ptr->flags & TSL_ADDPOS)
420 					prs->pos++;
421 				prs->words[prs->curwords].len = strlen(ptr->lexeme);
422 				prs->words[prs->curwords].word = ptr->lexeme;
423 				prs->words[prs->curwords].nvariant = ptr->nvariant;
424 				prs->words[prs->curwords].flags = ptr->flags & TSL_PREFIX;
425 				prs->words[prs->curwords].alen = 0;
426 				prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
427 				ptr++;
428 				prs->curwords++;
429 			}
430 			pfree(norms);
431 		}
432 	} while (type > 0);
433 
434 	FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
435 }
436 
437 /*
438  * Headline framework
439  */
440 static void
hladdword(HeadlineParsedText * prs,char * buf,int buflen,int type)441 hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
442 {
443 	while (prs->curwords >= prs->lenwords)
444 	{
445 		prs->lenwords *= 2;
446 		prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
447 	}
448 	memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry));
449 	prs->words[prs->curwords].type = (uint8) type;
450 	prs->words[prs->curwords].len = buflen;
451 	prs->words[prs->curwords].word = palloc(buflen);
452 	memcpy(prs->words[prs->curwords].word, buf, buflen);
453 	prs->curwords++;
454 }
455 
456 static void
hlfinditem(HeadlineParsedText * prs,TSQuery query,int32 pos,char * buf,int buflen)457 hlfinditem(HeadlineParsedText *prs, TSQuery query, int32 pos, char *buf, int buflen)
458 {
459 	int			i;
460 	QueryItem  *item = GETQUERY(query);
461 	HeadlineWordEntry *word;
462 
463 	while (prs->curwords + query->size >= prs->lenwords)
464 	{
465 		prs->lenwords *= 2;
466 		prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
467 	}
468 
469 	word = &(prs->words[prs->curwords - 1]);
470 	word->pos = LIMITPOS(pos);
471 	for (i = 0; i < query->size; i++)
472 	{
473 		if (item->type == QI_VAL &&
474 			tsCompareString(GETOPERAND(query) + item->qoperand.distance, item->qoperand.length,
475 							buf, buflen, item->qoperand.prefix) == 0)
476 		{
477 			if (word->item)
478 			{
479 				memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry));
480 				prs->words[prs->curwords].item = &item->qoperand;
481 				prs->words[prs->curwords].repeated = 1;
482 				prs->curwords++;
483 			}
484 			else
485 				word->item = &item->qoperand;
486 		}
487 		item++;
488 	}
489 }
490 
491 static void
addHLParsedLex(HeadlineParsedText * prs,TSQuery query,ParsedLex * lexs,TSLexeme * norms)492 addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme *norms)
493 {
494 	ParsedLex  *tmplexs;
495 	TSLexeme   *ptr;
496 	int32		savedpos;
497 
498 	while (lexs)
499 	{
500 		if (lexs->type > 0)
501 			hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
502 
503 		ptr = norms;
504 		savedpos = prs->vectorpos;
505 		while (ptr && ptr->lexeme)
506 		{
507 			if (ptr->flags & TSL_ADDPOS)
508 				savedpos++;
509 			hlfinditem(prs, query, savedpos, ptr->lexeme, strlen(ptr->lexeme));
510 			ptr++;
511 		}
512 
513 		tmplexs = lexs->next;
514 		pfree(lexs);
515 		lexs = tmplexs;
516 	}
517 
518 	if (norms)
519 	{
520 		ptr = norms;
521 		while (ptr->lexeme)
522 		{
523 			if (ptr->flags & TSL_ADDPOS)
524 				prs->vectorpos++;
525 			pfree(ptr->lexeme);
526 			ptr++;
527 		}
528 		pfree(norms);
529 	}
530 }
531 
532 void
hlparsetext(Oid cfgId,HeadlineParsedText * prs,TSQuery query,char * buf,int buflen)533 hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
534 {
535 	int			type,
536 				lenlemm;
537 	char	   *lemm = NULL;
538 	LexizeData	ldata;
539 	TSLexeme   *norms;
540 	ParsedLex  *lexs;
541 	TSConfigCacheEntry *cfg;
542 	TSParserCacheEntry *prsobj;
543 	void	   *prsdata;
544 
545 	cfg = lookup_ts_config_cache(cfgId);
546 	prsobj = lookup_ts_parser_cache(cfg->prsId);
547 
548 	prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
549 													 PointerGetDatum(buf),
550 													 Int32GetDatum(buflen)));
551 
552 	LexizeInit(&ldata, cfg);
553 
554 	do
555 	{
556 		type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
557 										   PointerGetDatum(prsdata),
558 										   PointerGetDatum(&lemm),
559 										   PointerGetDatum(&lenlemm)));
560 
561 		if (type > 0 && lenlemm >= MAXSTRLEN)
562 		{
563 #ifdef IGNORE_LONGLEXEME
564 			ereport(NOTICE,
565 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
566 					 errmsg("word is too long to be indexed"),
567 					 errdetail("Words longer than %d characters are ignored.",
568 							   MAXSTRLEN)));
569 			continue;
570 #else
571 			ereport(ERROR,
572 					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
573 					 errmsg("word is too long to be indexed"),
574 					 errdetail("Words longer than %d characters are ignored.",
575 							   MAXSTRLEN)));
576 #endif
577 		}
578 
579 		LexizeAddLemm(&ldata, type, lemm, lenlemm);
580 
581 		do
582 		{
583 			if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
584 			{
585 				prs->vectorpos++;
586 				addHLParsedLex(prs, query, lexs, norms);
587 			}
588 			else
589 				addHLParsedLex(prs, query, lexs, NULL);
590 		} while (norms);
591 
592 	} while (type > 0);
593 
594 	FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
595 }
596 
597 text *
generateHeadline(HeadlineParsedText * prs)598 generateHeadline(HeadlineParsedText *prs)
599 {
600 	text	   *out;
601 	char	   *ptr;
602 	int			len = 128;
603 	int			numfragments = 0;
604 	int16		infrag = 0;
605 
606 	HeadlineWordEntry *wrd = prs->words;
607 
608 	out = (text *) palloc(len);
609 	ptr = ((char *) out) + VARHDRSZ;
610 
611 	while (wrd - prs->words < prs->curwords)
612 	{
613 		while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len)
614 		{
615 			int			dist = ptr - ((char *) out);
616 
617 			len *= 2;
618 			out = (text *) repalloc(out, len);
619 			ptr = ((char *) out) + dist;
620 		}
621 
622 		if (wrd->in && !wrd->repeated)
623 		{
624 			if (!infrag)
625 			{
626 
627 				/* start of a new fragment */
628 				infrag = 1;
629 				numfragments++;
630 				/* add a fragment delimiter if this is after the first one */
631 				if (numfragments > 1)
632 				{
633 					memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
634 					ptr += prs->fragdelimlen;
635 				}
636 
637 			}
638 			if (wrd->replace)
639 			{
640 				*ptr = ' ';
641 				ptr++;
642 			}
643 			else if (!wrd->skip)
644 			{
645 				if (wrd->selected)
646 				{
647 					memcpy(ptr, prs->startsel, prs->startsellen);
648 					ptr += prs->startsellen;
649 				}
650 				memcpy(ptr, wrd->word, wrd->len);
651 				ptr += wrd->len;
652 				if (wrd->selected)
653 				{
654 					memcpy(ptr, prs->stopsel, prs->stopsellen);
655 					ptr += prs->stopsellen;
656 				}
657 			}
658 		}
659 		else if (!wrd->repeated)
660 		{
661 			if (infrag)
662 				infrag = 0;
663 			pfree(wrd->word);
664 		}
665 
666 		wrd++;
667 	}
668 
669 	SET_VARSIZE(out, ptr - ((char *) out));
670 	return out;
671 }
672