1 /*-------------------------------------------------------------------------
2 *
3 * ts_parse.c
4 * main parse functions for tsearch
5 *
6 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/tsearch/ts_parse.c
11 *
12 *-------------------------------------------------------------------------
13 */
14
15 #include "postgres.h"
16
17 #include "tsearch/ts_cache.h"
18 #include "tsearch/ts_utils.h"
19
/*
 * When this macro is defined, parsetext() and hlparsetext() report a token
 * whose length reaches MAXSTRLEN with a NOTICE and skip it; when it is not
 * defined they raise an ERROR instead.
 */
#define IGNORE_LONGLEXEME 1
21
22 /*
23 * Lexize subsystem
24 */
25
/*
 * One token handed over by the parser, kept as a node of a singly linked
 * list.
 */
typedef struct ParsedLex ParsedLex;

struct ParsedLex
{
	int			type;			/* token type number returned by the parser */
	char	   *lemm;			/* token text; its length is in lenlemm */
	int			lenlemm;		/* length of lemm */
	ParsedLex  *next;			/* following node, NULL at the end of list */
};
33
34 typedef struct ListParsedLex
35 {
36 ParsedLex *head;
37 ParsedLex *tail;
38 } ListParsedLex;
39
40 typedef struct
41 {
42 TSConfigCacheEntry *cfg;
43 Oid curDictId;
44 int posDict;
45 DictSubState dictState;
46 ParsedLex *curSub;
47 ListParsedLex towork; /* current list to work */
48 ListParsedLex waste; /* list of lexemes that already lexized */
49
50 /*
51 * fields to store last variant to lexize (basically, thesaurus or similar
52 * to, which wants several lexemes
53 */
54
55 ParsedLex *lastRes;
56 TSLexeme *tmpRes;
57 } LexizeData;
58
59 static void
LexizeInit(LexizeData * ld,TSConfigCacheEntry * cfg)60 LexizeInit(LexizeData *ld, TSConfigCacheEntry *cfg)
61 {
62 ld->cfg = cfg;
63 ld->curDictId = InvalidOid;
64 ld->posDict = 0;
65 ld->towork.head = ld->towork.tail = ld->curSub = NULL;
66 ld->waste.head = ld->waste.tail = NULL;
67 ld->lastRes = NULL;
68 ld->tmpRes = NULL;
69 }
70
71 static void
LPLAddTail(ListParsedLex * list,ParsedLex * newpl)72 LPLAddTail(ListParsedLex *list, ParsedLex *newpl)
73 {
74 if (list->tail)
75 {
76 list->tail->next = newpl;
77 list->tail = newpl;
78 }
79 else
80 list->head = list->tail = newpl;
81 newpl->next = NULL;
82 }
83
84 static ParsedLex *
LPLRemoveHead(ListParsedLex * list)85 LPLRemoveHead(ListParsedLex *list)
86 {
87 ParsedLex *res = list->head;
88
89 if (list->head)
90 list->head = list->head->next;
91
92 if (list->head == NULL)
93 list->tail = NULL;
94
95 return res;
96 }
97
98 static void
LexizeAddLemm(LexizeData * ld,int type,char * lemm,int lenlemm)99 LexizeAddLemm(LexizeData *ld, int type, char *lemm, int lenlemm)
100 {
101 ParsedLex *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
102
103 newpl->type = type;
104 newpl->lemm = lemm;
105 newpl->lenlemm = lenlemm;
106 LPLAddTail(&ld->towork, newpl);
107 ld->curSub = ld->towork.tail;
108 }
109
110 static void
RemoveHead(LexizeData * ld)111 RemoveHead(LexizeData *ld)
112 {
113 LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
114
115 ld->posDict = 0;
116 }
117
118 static void
setCorrLex(LexizeData * ld,ParsedLex ** correspondLexem)119 setCorrLex(LexizeData *ld, ParsedLex **correspondLexem)
120 {
121 if (correspondLexem)
122 {
123 *correspondLexem = ld->waste.head;
124 }
125 else
126 {
127 ParsedLex *tmp,
128 *ptr = ld->waste.head;
129
130 while (ptr)
131 {
132 tmp = ptr->next;
133 pfree(ptr);
134 ptr = tmp;
135 }
136 }
137 ld->waste.head = ld->waste.tail = NULL;
138 }
139
140 static void
moveToWaste(LexizeData * ld,ParsedLex * stop)141 moveToWaste(LexizeData *ld, ParsedLex *stop)
142 {
143 bool go = true;
144
145 while (ld->towork.head && go)
146 {
147 if (ld->towork.head == stop)
148 {
149 ld->curSub = stop->next;
150 go = false;
151 }
152 RemoveHead(ld);
153 }
154 }
155
156 static void
setNewTmpRes(LexizeData * ld,ParsedLex * lex,TSLexeme * res)157 setNewTmpRes(LexizeData *ld, ParsedLex *lex, TSLexeme *res)
158 {
159 if (ld->tmpRes)
160 {
161 TSLexeme *ptr;
162
163 for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
164 pfree(ptr->lexeme);
165 pfree(ld->tmpRes);
166 }
167 ld->tmpRes = res;
168 ld->lastRes = lex;
169 }
170
171 static TSLexeme *
LexizeExec(LexizeData * ld,ParsedLex ** correspondLexem)172 LexizeExec(LexizeData *ld, ParsedLex **correspondLexem)
173 {
174 int i;
175 ListDictionary *map;
176 TSDictionaryCacheEntry *dict;
177 TSLexeme *res;
178
179 if (ld->curDictId == InvalidOid)
180 {
181 /*
182 * usual mode: dictionary wants only one word, but we should keep in
183 * mind that we should go through all stack
184 */
185
186 while (ld->towork.head)
187 {
188 ParsedLex *curVal = ld->towork.head;
189 char *curValLemm = curVal->lemm;
190 int curValLenLemm = curVal->lenlemm;
191
192 map = ld->cfg->map + curVal->type;
193
194 if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
195 {
196 /* skip this type of lexeme */
197 RemoveHead(ld);
198 continue;
199 }
200
201 for (i = ld->posDict; i < map->len; i++)
202 {
203 dict = lookup_ts_dictionary_cache(map->dictIds[i]);
204
205 ld->dictState.isend = ld->dictState.getnext = false;
206 ld->dictState.private_state = NULL;
207 res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
208 PointerGetDatum(dict->dictData),
209 PointerGetDatum(curValLemm),
210 Int32GetDatum(curValLenLemm),
211 PointerGetDatum(&ld->dictState)));
212
213 if (ld->dictState.getnext)
214 {
215 /*
216 * dictionary wants next word, so setup and store current
217 * position and go to multiword mode
218 */
219
220 ld->curDictId = DatumGetObjectId(map->dictIds[i]);
221 ld->posDict = i + 1;
222 ld->curSub = curVal->next;
223 if (res)
224 setNewTmpRes(ld, curVal, res);
225 return LexizeExec(ld, correspondLexem);
226 }
227
228 if (!res) /* dictionary doesn't know this lexeme */
229 continue;
230
231 if (res->flags & TSL_FILTER)
232 {
233 curValLemm = res->lexeme;
234 curValLenLemm = strlen(res->lexeme);
235 continue;
236 }
237
238 RemoveHead(ld);
239 setCorrLex(ld, correspondLexem);
240 return res;
241 }
242
243 RemoveHead(ld);
244 }
245 }
246 else
247 { /* curDictId is valid */
248 dict = lookup_ts_dictionary_cache(ld->curDictId);
249
250 /*
251 * Dictionary ld->curDictId asks us about following words
252 */
253
254 while (ld->curSub)
255 {
256 ParsedLex *curVal = ld->curSub;
257
258 map = ld->cfg->map + curVal->type;
259
260 if (curVal->type != 0)
261 {
262 bool dictExists = false;
263
264 if (curVal->type >= ld->cfg->lenmap || map->len == 0)
265 {
266 /* skip this type of lexeme */
267 ld->curSub = curVal->next;
268 continue;
269 }
270
271 /*
272 * We should be sure that current type of lexeme is recognized
273 * by our dictionary: we just check is it exist in list of
274 * dictionaries ?
275 */
276 for (i = 0; i < map->len && !dictExists; i++)
277 if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
278 dictExists = true;
279
280 if (!dictExists)
281 {
282 /*
283 * Dictionary can't work with current type of lexeme,
284 * return to basic mode and redo all stored lexemes
285 */
286 ld->curDictId = InvalidOid;
287 return LexizeExec(ld, correspondLexem);
288 }
289 }
290
291 ld->dictState.isend = (curVal->type == 0) ? true : false;
292 ld->dictState.getnext = false;
293
294 res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
295 PointerGetDatum(dict->dictData),
296 PointerGetDatum(curVal->lemm),
297 Int32GetDatum(curVal->lenlemm),
298 PointerGetDatum(&ld->dictState)));
299
300 if (ld->dictState.getnext)
301 {
302 /* Dictionary wants one more */
303 ld->curSub = curVal->next;
304 if (res)
305 setNewTmpRes(ld, curVal, res);
306 continue;
307 }
308
309 if (res || ld->tmpRes)
310 {
311 /*
312 * Dictionary normalizes lexemes, so we remove from stack all
313 * used lexemes, return to basic mode and redo end of stack
314 * (if it exists)
315 */
316 if (res)
317 {
318 moveToWaste(ld, ld->curSub);
319 }
320 else
321 {
322 res = ld->tmpRes;
323 moveToWaste(ld, ld->lastRes);
324 }
325
326 /* reset to initial state */
327 ld->curDictId = InvalidOid;
328 ld->posDict = 0;
329 ld->lastRes = NULL;
330 ld->tmpRes = NULL;
331 setCorrLex(ld, correspondLexem);
332 return res;
333 }
334
335 /*
336 * Dict don't want next lexem and didn't recognize anything, redo
337 * from ld->towork.head
338 */
339 ld->curDictId = InvalidOid;
340 return LexizeExec(ld, correspondLexem);
341 }
342 }
343
344 setCorrLex(ld, correspondLexem);
345 return NULL;
346 }
347
348 /*
349 * Parse string and lexize words.
350 *
351 * prs will be filled in.
352 */
353 void
parsetext(Oid cfgId,ParsedText * prs,char * buf,int buflen)354 parsetext(Oid cfgId, ParsedText *prs, char *buf, int buflen)
355 {
356 int type,
357 lenlemm;
358 char *lemm = NULL;
359 LexizeData ldata;
360 TSLexeme *norms;
361 TSConfigCacheEntry *cfg;
362 TSParserCacheEntry *prsobj;
363 void *prsdata;
364
365 cfg = lookup_ts_config_cache(cfgId);
366 prsobj = lookup_ts_parser_cache(cfg->prsId);
367
368 prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart,
369 PointerGetDatum(buf),
370 Int32GetDatum(buflen)));
371
372 LexizeInit(&ldata, cfg);
373
374 do
375 {
376 type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
377 PointerGetDatum(prsdata),
378 PointerGetDatum(&lemm),
379 PointerGetDatum(&lenlemm)));
380
381 if (type > 0 && lenlemm >= MAXSTRLEN)
382 {
383 #ifdef IGNORE_LONGLEXEME
384 ereport(NOTICE,
385 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
386 errmsg("word is too long to be indexed"),
387 errdetail("Words longer than %d characters are ignored.",
388 MAXSTRLEN)));
389 continue;
390 #else
391 ereport(ERROR,
392 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
393 errmsg("word is too long to be indexed"),
394 errdetail("Words longer than %d characters are ignored.",
395 MAXSTRLEN)));
396 #endif
397 }
398
399 LexizeAddLemm(&ldata, type, lemm, lenlemm);
400
401 while ((norms = LexizeExec(&ldata, NULL)) != NULL)
402 {
403 TSLexeme *ptr = norms;
404
405 prs->pos++; /* set pos */
406
407 while (ptr->lexeme)
408 {
409 if (prs->curwords == prs->lenwords)
410 {
411 prs->lenwords *= 2;
412 prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord));
413 }
414
415 if (ptr->flags & TSL_ADDPOS)
416 prs->pos++;
417 prs->words[prs->curwords].len = strlen(ptr->lexeme);
418 prs->words[prs->curwords].word = ptr->lexeme;
419 prs->words[prs->curwords].nvariant = ptr->nvariant;
420 prs->words[prs->curwords].flags = ptr->flags & TSL_PREFIX;
421 prs->words[prs->curwords].alen = 0;
422 prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
423 ptr++;
424 prs->curwords++;
425 }
426 pfree(norms);
427 }
428 } while (type > 0);
429
430 FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
431 }
432
433 /*
434 * Headline framework
435 */
436 static void
hladdword(HeadlineParsedText * prs,char * buf,int buflen,int type)437 hladdword(HeadlineParsedText *prs, char *buf, int buflen, int type)
438 {
439 while (prs->curwords >= prs->lenwords)
440 {
441 prs->lenwords *= 2;
442 prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
443 }
444 memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWordEntry));
445 prs->words[prs->curwords].type = (uint8) type;
446 prs->words[prs->curwords].len = buflen;
447 prs->words[prs->curwords].word = palloc(buflen);
448 memcpy(prs->words[prs->curwords].word, buf, buflen);
449 prs->curwords++;
450 }
451
452 static void
hlfinditem(HeadlineParsedText * prs,TSQuery query,int32 pos,char * buf,int buflen)453 hlfinditem(HeadlineParsedText *prs, TSQuery query, int32 pos, char *buf, int buflen)
454 {
455 int i;
456 QueryItem *item = GETQUERY(query);
457 HeadlineWordEntry *word;
458
459 while (prs->curwords + query->size >= prs->lenwords)
460 {
461 prs->lenwords *= 2;
462 prs->words = (HeadlineWordEntry *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWordEntry));
463 }
464
465 word = &(prs->words[prs->curwords - 1]);
466 word->pos = LIMITPOS(pos);
467 for (i = 0; i < query->size; i++)
468 {
469 if (item->type == QI_VAL &&
470 tsCompareString(GETOPERAND(query) + item->qoperand.distance, item->qoperand.length,
471 buf, buflen, item->qoperand.prefix) == 0)
472 {
473 if (word->item)
474 {
475 memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWordEntry));
476 prs->words[prs->curwords].item = &item->qoperand;
477 prs->words[prs->curwords].repeated = 1;
478 prs->curwords++;
479 }
480 else
481 word->item = &item->qoperand;
482 }
483 item++;
484 }
485 }
486
487 static void
addHLParsedLex(HeadlineParsedText * prs,TSQuery query,ParsedLex * lexs,TSLexeme * norms)488 addHLParsedLex(HeadlineParsedText *prs, TSQuery query, ParsedLex *lexs, TSLexeme *norms)
489 {
490 ParsedLex *tmplexs;
491 TSLexeme *ptr;
492 int32 savedpos;
493
494 while (lexs)
495 {
496 if (lexs->type > 0)
497 hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
498
499 ptr = norms;
500 savedpos = prs->vectorpos;
501 while (ptr && ptr->lexeme)
502 {
503 if (ptr->flags & TSL_ADDPOS)
504 savedpos++;
505 hlfinditem(prs, query, savedpos, ptr->lexeme, strlen(ptr->lexeme));
506 ptr++;
507 }
508
509 tmplexs = lexs->next;
510 pfree(lexs);
511 lexs = tmplexs;
512 }
513
514 if (norms)
515 {
516 ptr = norms;
517 while (ptr->lexeme)
518 {
519 if (ptr->flags & TSL_ADDPOS)
520 prs->vectorpos++;
521 pfree(ptr->lexeme);
522 ptr++;
523 }
524 pfree(norms);
525 }
526 }
527
528 void
hlparsetext(Oid cfgId,HeadlineParsedText * prs,TSQuery query,char * buf,int buflen)529 hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int buflen)
530 {
531 int type,
532 lenlemm;
533 char *lemm = NULL;
534 LexizeData ldata;
535 TSLexeme *norms;
536 ParsedLex *lexs;
537 TSConfigCacheEntry *cfg;
538 TSParserCacheEntry *prsobj;
539 void *prsdata;
540
541 cfg = lookup_ts_config_cache(cfgId);
542 prsobj = lookup_ts_parser_cache(cfg->prsId);
543
544 prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
545 PointerGetDatum(buf),
546 Int32GetDatum(buflen)));
547
548 LexizeInit(&ldata, cfg);
549
550 do
551 {
552 type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
553 PointerGetDatum(prsdata),
554 PointerGetDatum(&lemm),
555 PointerGetDatum(&lenlemm)));
556
557 if (type > 0 && lenlemm >= MAXSTRLEN)
558 {
559 #ifdef IGNORE_LONGLEXEME
560 ereport(NOTICE,
561 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
562 errmsg("word is too long to be indexed"),
563 errdetail("Words longer than %d characters are ignored.",
564 MAXSTRLEN)));
565 continue;
566 #else
567 ereport(ERROR,
568 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
569 errmsg("word is too long to be indexed"),
570 errdetail("Words longer than %d characters are ignored.",
571 MAXSTRLEN)));
572 #endif
573 }
574
575 LexizeAddLemm(&ldata, type, lemm, lenlemm);
576
577 do
578 {
579 if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
580 {
581 prs->vectorpos++;
582 addHLParsedLex(prs, query, lexs, norms);
583 }
584 else
585 addHLParsedLex(prs, query, lexs, NULL);
586 } while (norms);
587
588 } while (type > 0);
589
590 FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
591 }
592
593 text *
generateHeadline(HeadlineParsedText * prs)594 generateHeadline(HeadlineParsedText *prs)
595 {
596 text *out;
597 char *ptr;
598 int len = 128;
599 int numfragments = 0;
600 int16 infrag = 0;
601
602 HeadlineWordEntry *wrd = prs->words;
603
604 out = (text *) palloc(len);
605 ptr = ((char *) out) + VARHDRSZ;
606
607 while (wrd - prs->words < prs->curwords)
608 {
609 while (wrd->len + prs->stopsellen + prs->startsellen + prs->fragdelimlen + (ptr - ((char *) out)) >= len)
610 {
611 int dist = ptr - ((char *) out);
612
613 len *= 2;
614 out = (text *) repalloc(out, len);
615 ptr = ((char *) out) + dist;
616 }
617
618 if (wrd->in && !wrd->repeated)
619 {
620 if (!infrag)
621 {
622
623 /* start of a new fragment */
624 infrag = 1;
625 numfragments++;
626 /* add a fragment delimiter if this is after the first one */
627 if (numfragments > 1)
628 {
629 memcpy(ptr, prs->fragdelim, prs->fragdelimlen);
630 ptr += prs->fragdelimlen;
631 }
632
633 }
634 if (wrd->replace)
635 {
636 *ptr = ' ';
637 ptr++;
638 }
639 else if (!wrd->skip)
640 {
641 if (wrd->selected)
642 {
643 memcpy(ptr, prs->startsel, prs->startsellen);
644 ptr += prs->startsellen;
645 }
646 memcpy(ptr, wrd->word, wrd->len);
647 ptr += wrd->len;
648 if (wrd->selected)
649 {
650 memcpy(ptr, prs->stopsel, prs->stopsellen);
651 ptr += prs->stopsellen;
652 }
653 }
654 }
655 else if (!wrd->repeated)
656 {
657 if (infrag)
658 infrag = 0;
659 pfree(wrd->word);
660 }
661
662 wrd++;
663 }
664
665 SET_VARSIZE(out, ptr - ((char *) out));
666 return out;
667 }
668