1 /*-------------------------------------------------------------------------
2 *
3 * wparser_def.c
4 * Default text search parser
5 *
6 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/tsearch/wparser_def.c
11 *
12 *-------------------------------------------------------------------------
13 */
14
15 #include "postgres.h"
16
17 #include <limits.h>
18
19 #include "catalog/pg_collation.h"
20 #include "commands/defrem.h"
21 #include "tsearch/ts_locale.h"
22 #include "tsearch/ts_public.h"
23 #include "tsearch/ts_type.h"
24 #include "tsearch/ts_utils.h"
25 #include "utils/builtins.h"
26
27
28 /* Define me to enable tracing of parser behavior */
29 /* #define WPARSER_TRACE */
30
31
/*
 * Output token categories.
 *
 * Each macro is the numeric code of one token type.  The codes double as
 * indexes into tok_alias[] and lex_descr[] below; index 0 is an unused
 * placeholder and LASTNUM is the highest valid code.
 */

#define ASCIIWORD		1
#define WORD_T			2
#define NUMWORD			3
#define EMAIL			4
#define URL_T			5
#define HOST			6
#define SCIENTIFIC		7
#define VERSIONNUMBER	8
#define NUMPARTHWORD	9
#define PARTHWORD		10
#define ASCIIPARTHWORD	11
#define SPACE			12
#define TAG_T			13
#define PROTOCOL		14
#define NUMHWORD		15
#define ASCIIHWORD		16
#define HWORD			17
#define URLPATH			18
#define FILEPATH		19
#define DECIMAL_T		20
#define SIGNEDINT		21
#define UNSIGNEDINT		22
#define XMLENTITY		23

#define LASTNUM			23

/*
 * Short alias of every token type, indexed by token code.
 *
 * Designated initializers tie each string to its code explicitly, so the
 * table cannot silently drift out of step with the macros above.
 */
static const char *const tok_alias[] = {
	[0] = "",
	[ASCIIWORD] = "asciiword",
	[WORD_T] = "word",
	[NUMWORD] = "numword",
	[EMAIL] = "email",
	[URL_T] = "url",
	[HOST] = "host",
	[SCIENTIFIC] = "sfloat",
	[VERSIONNUMBER] = "version",
	[NUMPARTHWORD] = "hword_numpart",
	[PARTHWORD] = "hword_part",
	[ASCIIPARTHWORD] = "hword_asciipart",
	[SPACE] = "blank",
	[TAG_T] = "tag",
	[PROTOCOL] = "protocol",
	[NUMHWORD] = "numhword",
	[ASCIIHWORD] = "asciihword",
	[HWORD] = "hword",
	[URLPATH] = "url_path",
	[FILEPATH] = "file",
	[DECIMAL_T] = "float",
	[SIGNEDINT] = "int",
	[UNSIGNEDINT] = "uint",
	[XMLENTITY] = "entity"
};

/*
 * Human-readable description of every token type, indexed by token code
 * exactly like tok_alias[].
 */
static const char *const lex_descr[] = {
	[0] = "",
	[ASCIIWORD] = "Word, all ASCII",
	[WORD_T] = "Word, all letters",
	[NUMWORD] = "Word, letters and digits",
	[EMAIL] = "Email address",
	[URL_T] = "URL",
	[HOST] = "Host",
	[SCIENTIFIC] = "Scientific notation",
	[VERSIONNUMBER] = "Version number",
	[NUMPARTHWORD] = "Hyphenated word part, letters and digits",
	[PARTHWORD] = "Hyphenated word part, all letters",
	[ASCIIPARTHWORD] = "Hyphenated word part, all ASCII",
	[SPACE] = "Space symbols",
	[TAG_T] = "XML tag",
	[PROTOCOL] = "Protocol head",
	[NUMHWORD] = "Hyphenated word, letters and digits",
	[ASCIIHWORD] = "Hyphenated word, all ASCII",
	[HWORD] = "Hyphenated word, all letters",
	[URLPATH] = "URL path",
	[FILEPATH] = "File or path name",
	[DECIMAL_T] = "Decimal notation",
	[SIGNEDINT] = "Signed integer",
	[UNSIGNEDINT] = "Unsigned integer",
	[XMLENTITY] = "XML entity"
};
113
114
115 /* Parser states */
116
/*
 * States of the parser's state machine.
 *
 * TPS_Base is explicitly 0 and is the state every new parser starts in
 * (see TParserInit / TParserCopyInit).  TPS_Null is a sentinel rather than
 * a real state; the action tables also use it as a target state.
 * NOTE(review): TPS_Null as a target presumably means "stay in the current
 * state", and the enumerator order presumably has to match a per-state
 * lookup table -- both live outside this chunk, confirm before editing.
 */
117 typedef enum
118 {
119 TPS_Base = 0,
120 TPS_InNumWord,
121 TPS_InAsciiWord,
122 TPS_InWord,
123 TPS_InUnsignedInt,
124 TPS_InSignedIntFirst,
125 TPS_InSignedInt,
126 TPS_InSpace,
127 TPS_InUDecimalFirst,
128 TPS_InUDecimal,
129 TPS_InDecimalFirst,
130 TPS_InDecimal,
131 TPS_InVerVersion,
132 TPS_InSVerVersion,
133 TPS_InVersionFirst,
134 TPS_InVersion,
135 TPS_InMantissaFirst,
136 TPS_InMantissaSign,
137 TPS_InMantissa,
138 TPS_InXMLEntityFirst,
139 TPS_InXMLEntity,
140 TPS_InXMLEntityNumFirst,
141 TPS_InXMLEntityNum,
142 TPS_InXMLEntityHexNumFirst,
143 TPS_InXMLEntityHexNum,
144 TPS_InXMLEntityEnd,
145 TPS_InTagFirst,
146 TPS_InXMLBegin,
147 TPS_InTagCloseFirst,
148 TPS_InTagName,
149 TPS_InTagBeginEnd,
150 TPS_InTag,
151 TPS_InTagEscapeK,
152 TPS_InTagEscapeKK,
153 TPS_InTagBackSleshed,
154 TPS_InTagEnd,
155 TPS_InCommentFirst,
156 TPS_InCommentLast,
157 TPS_InComment,
158 TPS_InCloseCommentFirst,
159 TPS_InCloseCommentLast,
160 TPS_InCommentEnd,
161 TPS_InHostFirstDomain,
162 TPS_InHostDomainSecond,
163 TPS_InHostDomain,
164 TPS_InPortFirst,
165 TPS_InPort,
166 TPS_InHostFirstAN,
167 TPS_InHost,
168 TPS_InEmail,
169 TPS_InFileFirst,
170 TPS_InFileTwiddle,
171 TPS_InPathFirst,
172 TPS_InPathFirstFirst,
173 TPS_InPathSecond,
174 TPS_InFile,
175 TPS_InFileNext,
176 TPS_InURLPathFirst,
177 TPS_InURLPathStart,
178 TPS_InURLPath,
179 TPS_InFURL,
180 TPS_InProtocolFirst,
181 TPS_InProtocolSecond,
182 TPS_InProtocolEnd,
183 TPS_InHyphenAsciiWordFirst,
184 TPS_InHyphenAsciiWord,
185 TPS_InHyphenWordFirst,
186 TPS_InHyphenWord,
187 TPS_InHyphenNumWordFirst,
188 TPS_InHyphenNumWord,
189 TPS_InHyphenDigitLookahead,
190 TPS_InParseHyphen,
191 TPS_InParseHyphenHyphen,
192 TPS_InHyphenWordPart,
193 TPS_InHyphenAsciiWordPart,
194 TPS_InHyphenNumWordPart,
195 TPS_InHyphenUnsignedInt,
196 TPS_Null /* last state (fake value) */
197 } TParserState;
198
199 /* forward declaration */
200 struct TParser;
201
202 typedef int (*TParserCharTest) (struct TParser *); /* any p_is* functions
203 * except p_iseq */
204 typedef void (*TParserSpecial) (struct TParser *); /* special handler for
205 * special cases... */
206
/*
 * One row of a state's action table: a character test plus what to do when
 * it applies.  The actionTPS_* arrays are built from these rows, and each of
 * them ends with a row whose isclass is NULL.
 */
207 typedef struct
208 {
209 TParserCharTest isclass; /* character test (a p_is* function), or NULL */
210 char c; /* character paired with p_iseqC rows; 0 otherwise */
211 uint16 flags; /* OR of A_* flag bits */
212 TParserState tostate; /* target state (TPS_Null is also used here) */
213 int type; /* token type code for A_BINGO rows, else 0 */
214 TParserSpecial special; /* optional special handler, or NULL */
215 } TParserStateActionItem;
216
217 /* Flag bits in TParserStateActionItem.flags */
/*
 * A_NEXT is zero, i.e. "no flag bit set"; every other flag is a distinct
 * single bit, so flags can be OR-ed together (e.g. A_NEXT | A_CLEAR).
 * NOTE(review): the effect of each bit is implemented in TParserGet, which
 * is outside this chunk -- consult it before relying on the names alone.
 */
218 #define A_NEXT 0x0000
219 #define A_BINGO 0x0001
220 #define A_POP 0x0002
221 #define A_PUSH 0x0004
222 #define A_RERUN 0x0008
223 #define A_CLEAR 0x0010
224 #define A_MERGE 0x0020
225 #define A_CLRALL 0x0040
226
/*
 * A snapshot of where the parser is in the input and which state it is in.
 * Positions form a singly linked chain through "prev"; newTParserPosition()
 * creates a position as a copy of its predecessor, and TParserClose() /
 * TParserCopyClose() free the whole chain.
 */
227 typedef struct TParserPosition
228 {
229 int posbyte; /* position of parser in bytes */
230 int poschar; /* position of parser in characters */
231 int charlen; /* length of current char */
232 int lenbytetoken; /* length of token-so-far in bytes */
233 int lenchartoken; /* and in chars */
234 TParserState state; /* state machine state at this position */
235 struct TParserPosition *prev; /* position this one was created from, or NULL */
236 const TParserStateActionItem *pushedAtAction; /* reset to NULL on creation; set elsewhere */
237 } TParserPosition;
238
/*
 * Complete parser state: the input string in its byte and (optionally)
 * wide-character forms, the current position chain, a few mode flags, and
 * the most recently produced token.
 */
239 typedef struct TParser
240 {
241 /* string and position information */
242 char *str; /* multibyte string */
243 int lenstr; /* length of mbstring */
244 wchar_t *wstr; /* wide character string */
245 pg_wchar *pgwstr; /* wide character string for C-locale */
246 bool usewide; /* true when the encoding's max char length is > 1 */
247
248 /* State of parse */
249 int charmaxlen; /* pg_database_encoding_max_length() at init time */
250 TParserPosition *state; /* current position; head of the position chain */
251 bool ignore; /* toggled by SpecialTags() for <script>/<style>; read by p_isignore() */
252 bool wanthost; /* one-shot flag consumed (and cleared) by p_isstophost() */
253
254 /* silly char */
255 char c; /* character that p_iseqC()/p_isneC() compare against */
256
257 /* out */
258 char *token; /* start of the current token (read by SpecialTags()) */
259 int lenbytetoken; /* token length in bytes */
260 int lenchartoken; /* token length in characters */
261 int type; /* token type code (ASCIIWORD ... XMLENTITY) */
262 } TParser;
263
264
265 /* forward decls here */
266 static bool TParserGet(TParser *prs);
267
268
269 static TParserPosition *
newTParserPosition(TParserPosition * prev)270 newTParserPosition(TParserPosition *prev)
271 {
272 TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition));
273
274 if (prev)
275 memcpy(res, prev, sizeof(TParserPosition));
276 else
277 memset(res, 0, sizeof(TParserPosition));
278
279 res->prev = prev;
280
281 res->pushedAtAction = NULL;
282
283 return res;
284 }
285
286 static TParser *
TParserInit(char * str,int len)287 TParserInit(char *str, int len)
288 {
289 TParser *prs = (TParser *) palloc0(sizeof(TParser));
290
291 prs->charmaxlen = pg_database_encoding_max_length();
292 prs->str = str;
293 prs->lenstr = len;
294
295 /*
296 * Use wide char code only when max encoding length > 1.
297 */
298 if (prs->charmaxlen > 1)
299 {
300 Oid collation = DEFAULT_COLLATION_OID; /* TODO */
301 pg_locale_t mylocale = 0; /* TODO */
302
303 prs->usewide = true;
304 if (lc_ctype_is_c(collation))
305 {
306 /*
307 * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
308 * be different from sizeof(wchar_t)
309 */
310 prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
311 pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
312 }
313 else
314 {
315 prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
316 char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
317 mylocale);
318 }
319 }
320 else
321 prs->usewide = false;
322
323 prs->state = newTParserPosition(NULL);
324 prs->state->state = TPS_Base;
325
326 #ifdef WPARSER_TRACE
327
328 /*
329 * Use of %.*s here is a bit risky since it can misbehave if the data is
330 * not in what libc thinks is the prevailing encoding. However, since
331 * this is just a debugging aid, we choose to live with that.
332 */
333 fprintf(stderr, "parsing \"%.*s\"\n", len, str);
334 #endif
335
336 return prs;
337 }
338
339 /*
340 * As an alternative to a full TParserInit one can create a
341 * TParserCopy which basically is a regular TParser without a private
342 * copy of the string - instead it uses the one from another TParser.
343 * This is useful because at some places TParsers are created
344 * recursively and the repeated copying around of the strings can
345 * cause major inefficiency if the source string is long.
346 * The new parser starts parsing at the original's current position.
347 *
348 * Obviously one must not close the original TParser before the copy.
349 */
350 static TParser *
TParserCopyInit(const TParser * orig)351 TParserCopyInit(const TParser *orig)
352 {
353 TParser *prs = (TParser *) palloc0(sizeof(TParser));
354
355 prs->charmaxlen = orig->charmaxlen;
356 prs->str = orig->str + orig->state->posbyte;
357 prs->lenstr = orig->lenstr - orig->state->posbyte;
358 prs->usewide = orig->usewide;
359
360 if (orig->pgwstr)
361 prs->pgwstr = orig->pgwstr + orig->state->poschar;
362 if (orig->wstr)
363 prs->wstr = orig->wstr + orig->state->poschar;
364
365 prs->state = newTParserPosition(NULL);
366 prs->state->state = TPS_Base;
367
368 #ifdef WPARSER_TRACE
369 /* See note above about %.*s */
370 fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str);
371 #endif
372
373 return prs;
374 }
375
376
377 static void
TParserClose(TParser * prs)378 TParserClose(TParser *prs)
379 {
380 while (prs->state)
381 {
382 TParserPosition *ptr = prs->state->prev;
383
384 pfree(prs->state);
385 prs->state = ptr;
386 }
387
388 if (prs->wstr)
389 pfree(prs->wstr);
390 if (prs->pgwstr)
391 pfree(prs->pgwstr);
392
393 #ifdef WPARSER_TRACE
394 fprintf(stderr, "closing parser\n");
395 #endif
396 pfree(prs);
397 }
398
399 /*
400 * Close a parser created with TParserCopyInit
401 */
402 static void
TParserCopyClose(TParser * prs)403 TParserCopyClose(TParser *prs)
404 {
405 while (prs->state)
406 {
407 TParserPosition *ptr = prs->state->prev;
408
409 pfree(prs->state);
410 prs->state = ptr;
411 }
412
413 #ifdef WPARSER_TRACE
414 fprintf(stderr, "closing parser copy\n");
415 #endif
416 pfree(prs);
417 }
418
419
420 /*
421 * Character-type support functions, equivalent to is* macros, but
422 * working with any possible encodings and locales. Notes:
423 * - with multibyte encoding and C-locale isw* function may fail
424 * or give wrong result.
425 * - multibyte encoding and C-locale often are used for
426 * Asian languages.
427 * - if locale is C then we use pgwstr instead of wstr.
428 */
429
430 #define p_iswhat(type, nonascii) \
431 \
432 static int \
433 p_is##type(TParser *prs) \
434 { \
435 Assert(prs->state); \
436 if (prs->usewide) \
437 { \
438 if (prs->pgwstr) \
439 { \
440 unsigned int c = *(prs->pgwstr + prs->state->poschar); \
441 if (c > 0x7f) \
442 return nonascii; \
443 return is##type(c); \
444 } \
445 return isw##type(*(prs->wstr + prs->state->poschar)); \
446 } \
447 return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \
448 } \
449 \
450 static int \
451 p_isnot##type(TParser *prs) \
452 { \
453 return !p_is##type(prs); \
454 }
455
456 /*
457 * In C locale with a multibyte encoding, any non-ASCII symbol is considered
458 * an alpha character, but not a member of other char classes.
459 */
460 p_iswhat(alnum, 1)
461 p_iswhat(alpha, 1)
462 p_iswhat(digit, 0)
463 p_iswhat(lower, 0)
464 p_iswhat(print, 0)
465 p_iswhat(punct, 0)
466 p_iswhat(space, 0)
467 p_iswhat(upper, 0)
468 p_iswhat(xdigit, 0)
469
470 /* p_iseq should be used only for ascii symbols */
471
472 static int
p_iseq(TParser * prs,char c)473 p_iseq(TParser *prs, char c)
474 {
475 Assert(prs->state);
476 return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
477 }
478
479 static int
p_isEOF(TParser * prs)480 p_isEOF(TParser *prs)
481 {
482 Assert(prs->state);
483 return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
484 }
485
486 static int
p_iseqC(TParser * prs)487 p_iseqC(TParser *prs)
488 {
489 return p_iseq(prs, prs->c);
490 }
491
492 static int
p_isneC(TParser * prs)493 p_isneC(TParser *prs)
494 {
495 return !p_iseq(prs, prs->c);
496 }
497
498 static int
p_isascii(TParser * prs)499 p_isascii(TParser *prs)
500 {
501 return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
502 }
503
504 static int
p_isasclet(TParser * prs)505 p_isasclet(TParser *prs)
506 {
507 return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
508 }
509
510 static int
p_isurlchar(TParser * prs)511 p_isurlchar(TParser *prs)
512 {
513 char ch;
514
515 /* no non-ASCII need apply */
516 if (prs->state->charlen != 1)
517 return 0;
518 ch = *(prs->str + prs->state->posbyte);
519 /* no spaces or control characters */
520 if (ch <= 0x20 || ch >= 0x7F)
521 return 0;
522 /* reject characters disallowed by RFC 3986 */
523 switch (ch)
524 {
525 case '"':
526 case '<':
527 case '>':
528 case '\\':
529 case '^':
530 case '`':
531 case '{':
532 case '|':
533 case '}':
534 return 0;
535 }
536 return 1;
537 }
538
539
/*
 * Deliberately reference every character-test function defined above so the
 * compiler does not complain about unused static functions.  The calls pass
 * NULL, so this function exists only to be compiled, not to be run.
 */
void		_make_compiler_happy(void);
void
_make_compiler_happy(void)
{
	(void) p_isalnum(NULL);
	(void) p_isnotalnum(NULL);
	(void) p_isalpha(NULL);
	(void) p_isnotalpha(NULL);
	(void) p_isdigit(NULL);
	(void) p_isnotdigit(NULL);
	(void) p_islower(NULL);
	(void) p_isnotlower(NULL);
	(void) p_isprint(NULL);
	(void) p_isnotprint(NULL);
	(void) p_ispunct(NULL);
	(void) p_isnotpunct(NULL);
	(void) p_isspace(NULL);
	(void) p_isnotspace(NULL);
	(void) p_isupper(NULL);
	(void) p_isnotupper(NULL);
	(void) p_isxdigit(NULL);
	(void) p_isnotxdigit(NULL);
	(void) p_isEOF(NULL);
	(void) p_iseqC(NULL);
	(void) p_isneC(NULL);
}
567
568
569 static void
SpecialTags(TParser * prs)570 SpecialTags(TParser *prs)
571 {
572 switch (prs->state->lenchartoken)
573 {
574 case 8: /* </script */
575 if (pg_strncasecmp(prs->token, "</script", 8) == 0)
576 prs->ignore = false;
577 break;
578 case 7: /* <script || </style */
579 if (pg_strncasecmp(prs->token, "</style", 7) == 0)
580 prs->ignore = false;
581 else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
582 prs->ignore = true;
583 break;
584 case 6: /* <style */
585 if (pg_strncasecmp(prs->token, "<style", 6) == 0)
586 prs->ignore = true;
587 break;
588 default:
589 break;
590 }
591 }
592
593 static void
SpecialFURL(TParser * prs)594 SpecialFURL(TParser *prs)
595 {
596 prs->wanthost = true;
597 prs->state->posbyte -= prs->state->lenbytetoken;
598 prs->state->poschar -= prs->state->lenchartoken;
599 }
600
601 static void
SpecialHyphen(TParser * prs)602 SpecialHyphen(TParser *prs)
603 {
604 prs->state->posbyte -= prs->state->lenbytetoken;
605 prs->state->poschar -= prs->state->lenchartoken;
606 }
607
608 static void
SpecialVerVersion(TParser * prs)609 SpecialVerVersion(TParser *prs)
610 {
611 prs->state->posbyte -= prs->state->lenbytetoken;
612 prs->state->poschar -= prs->state->lenchartoken;
613 prs->state->lenbytetoken = 0;
614 prs->state->lenchartoken = 0;
615 }
616
617 static int
p_isstophost(TParser * prs)618 p_isstophost(TParser *prs)
619 {
620 if (prs->wanthost)
621 {
622 prs->wanthost = false;
623 return 1;
624 }
625 return 0;
626 }
627
628 static int
p_isignore(TParser * prs)629 p_isignore(TParser *prs)
630 {
631 return (prs->ignore) ? 1 : 0;
632 }
633
634 static int
p_ishost(TParser * prs)635 p_ishost(TParser *prs)
636 {
637 TParser *tmpprs = TParserCopyInit(prs);
638 int res = 0;
639
640 tmpprs->wanthost = true;
641
642 if (TParserGet(tmpprs) && tmpprs->type == HOST)
643 {
644 prs->state->posbyte += tmpprs->lenbytetoken;
645 prs->state->poschar += tmpprs->lenchartoken;
646 prs->state->lenbytetoken += tmpprs->lenbytetoken;
647 prs->state->lenchartoken += tmpprs->lenchartoken;
648 prs->state->charlen = tmpprs->state->charlen;
649 res = 1;
650 }
651 TParserCopyClose(tmpprs);
652
653 return res;
654 }
655
656 static int
p_isURLPath(TParser * prs)657 p_isURLPath(TParser *prs)
658 {
659 TParser *tmpprs = TParserCopyInit(prs);
660 int res = 0;
661
662 tmpprs->state = newTParserPosition(tmpprs->state);
663 tmpprs->state->state = TPS_InURLPathFirst;
664
665 if (TParserGet(tmpprs) && tmpprs->type == URLPATH)
666 {
667 prs->state->posbyte += tmpprs->lenbytetoken;
668 prs->state->poschar += tmpprs->lenchartoken;
669 prs->state->lenbytetoken += tmpprs->lenbytetoken;
670 prs->state->lenchartoken += tmpprs->lenchartoken;
671 prs->state->charlen = tmpprs->state->charlen;
672 res = 1;
673 }
674 TParserCopyClose(tmpprs);
675
676 return res;
677 }
678
679 /*
680 * returns true if current character has zero display length or
681 * it's a special sign in several languages. Such characters
682 * aren't a word-breaker although they aren't an isalpha.
683 * In beginning of word they aren't a part of it.
684 */
685 static int
p_isspecial(TParser * prs)686 p_isspecial(TParser *prs)
687 {
688 /*
689 * pg_dsplen could return -1 which means error or control character
690 */
691 if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
692 return 1;
693
694 /*
695 * Unicode Characters in the 'Mark, Spacing Combining' Category That
696 * characters are not alpha although they are not breakers of word too.
697 * Check that only in utf encoding, because other encodings aren't
698 * supported by postgres or even exists.
699 */
700 if (GetDatabaseEncoding() == PG_UTF8 && prs->usewide)
701 {
702 static const pg_wchar strange_letter[] = {
703 /*
704 * use binary search, so elements should be ordered
705 */
706 0x0903, /* DEVANAGARI SIGN VISARGA */
707 0x093E, /* DEVANAGARI VOWEL SIGN AA */
708 0x093F, /* DEVANAGARI VOWEL SIGN I */
709 0x0940, /* DEVANAGARI VOWEL SIGN II */
710 0x0949, /* DEVANAGARI VOWEL SIGN CANDRA O */
711 0x094A, /* DEVANAGARI VOWEL SIGN SHORT O */
712 0x094B, /* DEVANAGARI VOWEL SIGN O */
713 0x094C, /* DEVANAGARI VOWEL SIGN AU */
714 0x0982, /* BENGALI SIGN ANUSVARA */
715 0x0983, /* BENGALI SIGN VISARGA */
716 0x09BE, /* BENGALI VOWEL SIGN AA */
717 0x09BF, /* BENGALI VOWEL SIGN I */
718 0x09C0, /* BENGALI VOWEL SIGN II */
719 0x09C7, /* BENGALI VOWEL SIGN E */
720 0x09C8, /* BENGALI VOWEL SIGN AI */
721 0x09CB, /* BENGALI VOWEL SIGN O */
722 0x09CC, /* BENGALI VOWEL SIGN AU */
723 0x09D7, /* BENGALI AU LENGTH MARK */
724 0x0A03, /* GURMUKHI SIGN VISARGA */
725 0x0A3E, /* GURMUKHI VOWEL SIGN AA */
726 0x0A3F, /* GURMUKHI VOWEL SIGN I */
727 0x0A40, /* GURMUKHI VOWEL SIGN II */
728 0x0A83, /* GUJARATI SIGN VISARGA */
729 0x0ABE, /* GUJARATI VOWEL SIGN AA */
730 0x0ABF, /* GUJARATI VOWEL SIGN I */
731 0x0AC0, /* GUJARATI VOWEL SIGN II */
732 0x0AC9, /* GUJARATI VOWEL SIGN CANDRA O */
733 0x0ACB, /* GUJARATI VOWEL SIGN O */
734 0x0ACC, /* GUJARATI VOWEL SIGN AU */
735 0x0B02, /* ORIYA SIGN ANUSVARA */
736 0x0B03, /* ORIYA SIGN VISARGA */
737 0x0B3E, /* ORIYA VOWEL SIGN AA */
738 0x0B40, /* ORIYA VOWEL SIGN II */
739 0x0B47, /* ORIYA VOWEL SIGN E */
740 0x0B48, /* ORIYA VOWEL SIGN AI */
741 0x0B4B, /* ORIYA VOWEL SIGN O */
742 0x0B4C, /* ORIYA VOWEL SIGN AU */
743 0x0B57, /* ORIYA AU LENGTH MARK */
744 0x0BBE, /* TAMIL VOWEL SIGN AA */
745 0x0BBF, /* TAMIL VOWEL SIGN I */
746 0x0BC1, /* TAMIL VOWEL SIGN U */
747 0x0BC2, /* TAMIL VOWEL SIGN UU */
748 0x0BC6, /* TAMIL VOWEL SIGN E */
749 0x0BC7, /* TAMIL VOWEL SIGN EE */
750 0x0BC8, /* TAMIL VOWEL SIGN AI */
751 0x0BCA, /* TAMIL VOWEL SIGN O */
752 0x0BCB, /* TAMIL VOWEL SIGN OO */
753 0x0BCC, /* TAMIL VOWEL SIGN AU */
754 0x0BD7, /* TAMIL AU LENGTH MARK */
755 0x0C01, /* TELUGU SIGN CANDRABINDU */
756 0x0C02, /* TELUGU SIGN ANUSVARA */
757 0x0C03, /* TELUGU SIGN VISARGA */
758 0x0C41, /* TELUGU VOWEL SIGN U */
759 0x0C42, /* TELUGU VOWEL SIGN UU */
760 0x0C43, /* TELUGU VOWEL SIGN VOCALIC R */
761 0x0C44, /* TELUGU VOWEL SIGN VOCALIC RR */
762 0x0C82, /* KANNADA SIGN ANUSVARA */
763 0x0C83, /* KANNADA SIGN VISARGA */
764 0x0CBE, /* KANNADA VOWEL SIGN AA */
765 0x0CC0, /* KANNADA VOWEL SIGN II */
766 0x0CC1, /* KANNADA VOWEL SIGN U */
767 0x0CC2, /* KANNADA VOWEL SIGN UU */
768 0x0CC3, /* KANNADA VOWEL SIGN VOCALIC R */
769 0x0CC4, /* KANNADA VOWEL SIGN VOCALIC RR */
770 0x0CC7, /* KANNADA VOWEL SIGN EE */
771 0x0CC8, /* KANNADA VOWEL SIGN AI */
772 0x0CCA, /* KANNADA VOWEL SIGN O */
773 0x0CCB, /* KANNADA VOWEL SIGN OO */
774 0x0CD5, /* KANNADA LENGTH MARK */
775 0x0CD6, /* KANNADA AI LENGTH MARK */
776 0x0D02, /* MALAYALAM SIGN ANUSVARA */
777 0x0D03, /* MALAYALAM SIGN VISARGA */
778 0x0D3E, /* MALAYALAM VOWEL SIGN AA */
779 0x0D3F, /* MALAYALAM VOWEL SIGN I */
780 0x0D40, /* MALAYALAM VOWEL SIGN II */
781 0x0D46, /* MALAYALAM VOWEL SIGN E */
782 0x0D47, /* MALAYALAM VOWEL SIGN EE */
783 0x0D48, /* MALAYALAM VOWEL SIGN AI */
784 0x0D4A, /* MALAYALAM VOWEL SIGN O */
785 0x0D4B, /* MALAYALAM VOWEL SIGN OO */
786 0x0D4C, /* MALAYALAM VOWEL SIGN AU */
787 0x0D57, /* MALAYALAM AU LENGTH MARK */
788 0x0D82, /* SINHALA SIGN ANUSVARAYA */
789 0x0D83, /* SINHALA SIGN VISARGAYA */
790 0x0DCF, /* SINHALA VOWEL SIGN AELA-PILLA */
791 0x0DD0, /* SINHALA VOWEL SIGN KETTI AEDA-PILLA */
792 0x0DD1, /* SINHALA VOWEL SIGN DIGA AEDA-PILLA */
793 0x0DD8, /* SINHALA VOWEL SIGN GAETTA-PILLA */
794 0x0DD9, /* SINHALA VOWEL SIGN KOMBUVA */
795 0x0DDA, /* SINHALA VOWEL SIGN DIGA KOMBUVA */
796 0x0DDB, /* SINHALA VOWEL SIGN KOMBU DEKA */
797 0x0DDC, /* SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA */
798 0x0DDD, /* SINHALA VOWEL SIGN KOMBUVA HAA DIGA
799 * AELA-PILLA */
800 0x0DDE, /* SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA */
801 0x0DDF, /* SINHALA VOWEL SIGN GAYANUKITTA */
802 0x0DF2, /* SINHALA VOWEL SIGN DIGA GAETTA-PILLA */
803 0x0DF3, /* SINHALA VOWEL SIGN DIGA GAYANUKITTA */
804 0x0F3E, /* TIBETAN SIGN YAR TSHES */
805 0x0F3F, /* TIBETAN SIGN MAR TSHES */
806 0x0F7F, /* TIBETAN SIGN RNAM BCAD */
807 0x102B, /* MYANMAR VOWEL SIGN TALL AA */
808 0x102C, /* MYANMAR VOWEL SIGN AA */
809 0x1031, /* MYANMAR VOWEL SIGN E */
810 0x1038, /* MYANMAR SIGN VISARGA */
811 0x103B, /* MYANMAR CONSONANT SIGN MEDIAL YA */
812 0x103C, /* MYANMAR CONSONANT SIGN MEDIAL RA */
813 0x1056, /* MYANMAR VOWEL SIGN VOCALIC R */
814 0x1057, /* MYANMAR VOWEL SIGN VOCALIC RR */
815 0x1062, /* MYANMAR VOWEL SIGN SGAW KAREN EU */
816 0x1063, /* MYANMAR TONE MARK SGAW KAREN HATHI */
817 0x1064, /* MYANMAR TONE MARK SGAW KAREN KE PHO */
818 0x1067, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN EU */
819 0x1068, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN UE */
820 0x1069, /* MYANMAR SIGN WESTERN PWO KAREN TONE-1 */
821 0x106A, /* MYANMAR SIGN WESTERN PWO KAREN TONE-2 */
822 0x106B, /* MYANMAR SIGN WESTERN PWO KAREN TONE-3 */
823 0x106C, /* MYANMAR SIGN WESTERN PWO KAREN TONE-4 */
824 0x106D, /* MYANMAR SIGN WESTERN PWO KAREN TONE-5 */
825 0x1083, /* MYANMAR VOWEL SIGN SHAN AA */
826 0x1084, /* MYANMAR VOWEL SIGN SHAN E */
827 0x1087, /* MYANMAR SIGN SHAN TONE-2 */
828 0x1088, /* MYANMAR SIGN SHAN TONE-3 */
829 0x1089, /* MYANMAR SIGN SHAN TONE-5 */
830 0x108A, /* MYANMAR SIGN SHAN TONE-6 */
831 0x108B, /* MYANMAR SIGN SHAN COUNCIL TONE-2 */
832 0x108C, /* MYANMAR SIGN SHAN COUNCIL TONE-3 */
833 0x108F, /* MYANMAR SIGN RUMAI PALAUNG TONE-5 */
834 0x17B6, /* KHMER VOWEL SIGN AA */
835 0x17BE, /* KHMER VOWEL SIGN OE */
836 0x17BF, /* KHMER VOWEL SIGN YA */
837 0x17C0, /* KHMER VOWEL SIGN IE */
838 0x17C1, /* KHMER VOWEL SIGN E */
839 0x17C2, /* KHMER VOWEL SIGN AE */
840 0x17C3, /* KHMER VOWEL SIGN AI */
841 0x17C4, /* KHMER VOWEL SIGN OO */
842 0x17C5, /* KHMER VOWEL SIGN AU */
843 0x17C7, /* KHMER SIGN REAHMUK */
844 0x17C8, /* KHMER SIGN YUUKALEAPINTU */
845 0x1923, /* LIMBU VOWEL SIGN EE */
846 0x1924, /* LIMBU VOWEL SIGN AI */
847 0x1925, /* LIMBU VOWEL SIGN OO */
848 0x1926, /* LIMBU VOWEL SIGN AU */
849 0x1929, /* LIMBU SUBJOINED LETTER YA */
850 0x192A, /* LIMBU SUBJOINED LETTER RA */
851 0x192B, /* LIMBU SUBJOINED LETTER WA */
852 0x1930, /* LIMBU SMALL LETTER KA */
853 0x1931, /* LIMBU SMALL LETTER NGA */
854 0x1933, /* LIMBU SMALL LETTER TA */
855 0x1934, /* LIMBU SMALL LETTER NA */
856 0x1935, /* LIMBU SMALL LETTER PA */
857 0x1936, /* LIMBU SMALL LETTER MA */
858 0x1937, /* LIMBU SMALL LETTER RA */
859 0x1938, /* LIMBU SMALL LETTER LA */
860 0x19B0, /* NEW TAI LUE VOWEL SIGN VOWEL SHORTENER */
861 0x19B1, /* NEW TAI LUE VOWEL SIGN AA */
862 0x19B2, /* NEW TAI LUE VOWEL SIGN II */
863 0x19B3, /* NEW TAI LUE VOWEL SIGN U */
864 0x19B4, /* NEW TAI LUE VOWEL SIGN UU */
865 0x19B5, /* NEW TAI LUE VOWEL SIGN E */
866 0x19B6, /* NEW TAI LUE VOWEL SIGN AE */
867 0x19B7, /* NEW TAI LUE VOWEL SIGN O */
868 0x19B8, /* NEW TAI LUE VOWEL SIGN OA */
869 0x19B9, /* NEW TAI LUE VOWEL SIGN UE */
870 0x19BA, /* NEW TAI LUE VOWEL SIGN AY */
871 0x19BB, /* NEW TAI LUE VOWEL SIGN AAY */
872 0x19BC, /* NEW TAI LUE VOWEL SIGN UY */
873 0x19BD, /* NEW TAI LUE VOWEL SIGN OY */
874 0x19BE, /* NEW TAI LUE VOWEL SIGN OAY */
875 0x19BF, /* NEW TAI LUE VOWEL SIGN UEY */
876 0x19C0, /* NEW TAI LUE VOWEL SIGN IY */
877 0x19C8, /* NEW TAI LUE TONE MARK-1 */
878 0x19C9, /* NEW TAI LUE TONE MARK-2 */
879 0x1A19, /* BUGINESE VOWEL SIGN E */
880 0x1A1A, /* BUGINESE VOWEL SIGN O */
881 0x1A1B, /* BUGINESE VOWEL SIGN AE */
882 0x1B04, /* BALINESE SIGN BISAH */
883 0x1B35, /* BALINESE VOWEL SIGN TEDUNG */
884 0x1B3B, /* BALINESE VOWEL SIGN RA REPA TEDUNG */
885 0x1B3D, /* BALINESE VOWEL SIGN LA LENGA TEDUNG */
886 0x1B3E, /* BALINESE VOWEL SIGN TALING */
887 0x1B3F, /* BALINESE VOWEL SIGN TALING REPA */
888 0x1B40, /* BALINESE VOWEL SIGN TALING TEDUNG */
889 0x1B41, /* BALINESE VOWEL SIGN TALING REPA TEDUNG */
890 0x1B43, /* BALINESE VOWEL SIGN PEPET TEDUNG */
891 0x1B44, /* BALINESE ADEG ADEG */
892 0x1B82, /* SUNDANESE SIGN PANGWISAD */
893 0x1BA1, /* SUNDANESE CONSONANT SIGN PAMINGKAL */
894 0x1BA6, /* SUNDANESE VOWEL SIGN PANAELAENG */
895 0x1BA7, /* SUNDANESE VOWEL SIGN PANOLONG */
896 0x1BAA, /* SUNDANESE SIGN PAMAAEH */
897 0x1C24, /* LEPCHA SUBJOINED LETTER YA */
898 0x1C25, /* LEPCHA SUBJOINED LETTER RA */
899 0x1C26, /* LEPCHA VOWEL SIGN AA */
900 0x1C27, /* LEPCHA VOWEL SIGN I */
901 0x1C28, /* LEPCHA VOWEL SIGN O */
902 0x1C29, /* LEPCHA VOWEL SIGN OO */
903 0x1C2A, /* LEPCHA VOWEL SIGN U */
904 0x1C2B, /* LEPCHA VOWEL SIGN UU */
905 0x1C34, /* LEPCHA CONSONANT SIGN NYIN-DO */
906 0x1C35, /* LEPCHA CONSONANT SIGN KANG */
907 0xA823, /* SYLOTI NAGRI VOWEL SIGN A */
908 0xA824, /* SYLOTI NAGRI VOWEL SIGN I */
909 0xA827, /* SYLOTI NAGRI VOWEL SIGN OO */
910 0xA880, /* SAURASHTRA SIGN ANUSVARA */
911 0xA881, /* SAURASHTRA SIGN VISARGA */
912 0xA8B4, /* SAURASHTRA CONSONANT SIGN HAARU */
913 0xA8B5, /* SAURASHTRA VOWEL SIGN AA */
914 0xA8B6, /* SAURASHTRA VOWEL SIGN I */
915 0xA8B7, /* SAURASHTRA VOWEL SIGN II */
916 0xA8B8, /* SAURASHTRA VOWEL SIGN U */
917 0xA8B9, /* SAURASHTRA VOWEL SIGN UU */
918 0xA8BA, /* SAURASHTRA VOWEL SIGN VOCALIC R */
919 0xA8BB, /* SAURASHTRA VOWEL SIGN VOCALIC RR */
920 0xA8BC, /* SAURASHTRA VOWEL SIGN VOCALIC L */
921 0xA8BD, /* SAURASHTRA VOWEL SIGN VOCALIC LL */
922 0xA8BE, /* SAURASHTRA VOWEL SIGN E */
923 0xA8BF, /* SAURASHTRA VOWEL SIGN EE */
924 0xA8C0, /* SAURASHTRA VOWEL SIGN AI */
925 0xA8C1, /* SAURASHTRA VOWEL SIGN O */
926 0xA8C2, /* SAURASHTRA VOWEL SIGN OO */
927 0xA8C3, /* SAURASHTRA VOWEL SIGN AU */
928 0xA952, /* REJANG CONSONANT SIGN H */
929 0xA953, /* REJANG VIRAMA */
930 0xAA2F, /* CHAM VOWEL SIGN O */
931 0xAA30, /* CHAM VOWEL SIGN AI */
932 0xAA33, /* CHAM CONSONANT SIGN YA */
933 0xAA34, /* CHAM CONSONANT SIGN RA */
934 0xAA4D /* CHAM CONSONANT SIGN FINAL H */
935 };
936 const pg_wchar *StopLow = strange_letter,
937 *StopHigh = strange_letter + lengthof(strange_letter),
938 *StopMiddle;
939 pg_wchar c;
940
941 if (prs->pgwstr)
942 c = *(prs->pgwstr + prs->state->poschar);
943 else
944 c = (pg_wchar) *(prs->wstr + prs->state->poschar);
945
946 while (StopLow < StopHigh)
947 {
948 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
949 if (*StopMiddle == c)
950 return 1;
951 else if (*StopMiddle < c)
952 StopLow = StopMiddle + 1;
953 else
954 StopHigh = StopMiddle;
955 }
956 }
957
958 return 0;
959 }
960
961 /*
962 * Table of state/action of parser
963 */
964
/*
 * Action rows for the base state.
 *
 * Row layout follows TParserStateActionItem: {character test, character for
 * p_iseqC rows, A_* flags, target state, token type, special handler}.
 * Letters and digits move to a word/number state with A_NEXT; the lead
 * characters '<', '-', '+', '&', '~', '/' and '.' target a more specific
 * state with A_PUSH.  The last row, whose test is NULL, targets TPS_InSpace.
 * NOTE(review): rows are presumably tried top to bottom by TParserGet (not
 * visible in this chunk) -- confirm before reordering anything.
 */
965 static const TParserStateActionItem actionTPS_Base[] = {
966 {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
967 {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
968 {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
969 {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
970 {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
971 {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
972 {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
973 {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
974 {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
975 {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
976 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
977 {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
978 {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
979 };
980
981
/*
 * Action rows for TPS_InNumWord.  The EOF row and the NULL-test row both
 * carry A_BINGO with token type NUMWORD.  Alphanumerics and p_isspecial
 * characters keep TPS_InNumWord; '@', '/', '.' and '-' push the email, file
 * and hyphenated-word states.
 */
982 static const TParserStateActionItem actionTPS_InNumWord[] = {
983 {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
984 {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
985 {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
986 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
987 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
988 {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
989 {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
990 {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
991 };
992
/*
 * Action rows for TPS_InAsciiWord.  The EOF row and the NULL-test row both
 * carry A_BINGO with token type ASCIIWORD.  Non-ASCII letters and
 * p_isspecial characters switch to TPS_InWord.
 *
 * '.', '-' and a digit each appear twice with different targets (a host
 * state first, then a file / hyphenated-word / numword state).
 * NOTE(review): presumably the second row is the alternative tried after
 * the pushed attempt is popped -- the mechanism is in TParserGet, which is
 * not visible here; do not reorder these rows without checking it.
 */
993 static const TParserStateActionItem actionTPS_InAsciiWord[] = {
994 {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
995 {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
996 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
997 {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
998 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
999 {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1000 {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1001 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1002 {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
1003 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1004 {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1005 {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1006 {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
1007 {p_isspecial, 0, A_NEXT, TPS_InWord, 0, NULL},
1008 {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
1009 };
1010
/*
 * Action rows for TPS_InWord.  The EOF row and the NULL-test row both carry
 * A_BINGO with token type WORD_T.  A digit switches to TPS_InNumWord and
 * '-' pushes TPS_InHyphenWordFirst.
 */
1011 static const TParserStateActionItem actionTPS_InWord[] = {
1012 {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
1013 {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
1014 {p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL},
1015 {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1016 {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1017 {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
1018 };
1019
/*
 * Action rows for TPS_InUnsignedInt.  The EOF row and the NULL-test row
 * both carry A_BINGO with token type UNSIGNEDINT.  '.' appears twice (host
 * domain first, then unsigned decimal); 'e'/'E' push the mantissa state;
 * letters and p_isspecial characters switch to TPS_InNumWord.
 * NOTE(review): row order presumably matters to TParserGet (not visible
 * here) -- confirm before reordering.
 */
1020 static const TParserStateActionItem actionTPS_InUnsignedInt[] = {
1021 {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
1022 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1023 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1024 {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
1025 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1026 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1027 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1028 {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1029 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1030 {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
1031 {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1032 {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1033 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1034 {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
1035 };
1036
/*
 * Action rows for TPS_InSignedIntFirst, the state TPS_Base pushes on '-' or
 * '+'.  A digit moves to TPS_InSignedInt with A_CLEAR; EOF and anything
 * else carry A_POP.
 */
1037 static const TParserStateActionItem actionTPS_InSignedIntFirst[] = {
1038 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1039 {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
1040 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1041 };
1042
/*
 * Action rows for TPS_InSignedInt.  The EOF row and the NULL-test row both
 * carry A_BINGO with token type SIGNEDINT.  '.' pushes TPS_InDecimalFirst
 * and 'e'/'E' push TPS_InMantissaFirst.
 */
1043 static const TParserStateActionItem actionTPS_InSignedInt[] = {
1044 {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
1045 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1046 {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
1047 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1048 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1049 {NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
1050 };
1051
/*
 * Action rows for TPS_InSpace.  The rows for EOF, '<', '-', '+', '&' and
 * '/', plus the NULL-test row, all carry A_BINGO with token type SPACE and
 * target TPS_Base.  The p_isignore row sits between the '<' row and the
 * other punctuation rows; non-alphanumeric characters keep TPS_InSpace.
 * NOTE(review): row order presumably matters to TParserGet (not visible
 * here) -- confirm before reordering.
 */
1052 static const TParserStateActionItem actionTPS_InSpace[] = {
1053 {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
1054 {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
1055 {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
1056 {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
1057 {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
1058 {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
1059 {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
1060 {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
1061 {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
1062 };
1063
/*
 * Rules for TPS_InUDecimalFirst, pushed from TPS_InUnsignedInt on '.'.
 * A digit commits to the decimal: A_CLEAR discards the saved integer
 * position and scanning continues in TPS_InUDecimal.  Otherwise pop.
 */
1064 static const TParserStateActionItem actionTPS_InUDecimalFirst[] = {
1065 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1066 {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
1067 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1068 };
1069
/*
 * Rules for TPS_InUDecimal (fraction digits of an unsigned decimal).
 * Digits are consumed; a further '.' pushes a version-number attempt
 * (TPS_InVersionFirst); 'e' or 'E' pushes TPS_InMantissaFirst.  EOF or any
 * other character emits DECIMAL_T.
 */
1070 static const TParserStateActionItem actionTPS_InUDecimal[] = {
1071 {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1072 {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
1073 {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1074 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1075 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1076 {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1077 };
1078
/*
 * Rules for TPS_InDecimalFirst, pushed from TPS_InSignedInt on '.'.
 * A digit commits (A_CLEAR drops the saved position) and continues in
 * TPS_InDecimal.  Otherwise pop.
 */
1079 static const TParserStateActionItem actionTPS_InDecimalFirst[] = {
1080 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1081 {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
1082 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1083 };
1084
/*
 * Rules for TPS_InDecimal (fraction digits of a signed decimal).  Digits are
 * consumed; a further '.' pushes TPS_InVerVersion; 'e' or 'E' pushes
 * TPS_InMantissaFirst.  EOF or any other character emits DECIMAL_T.
 */
1085 static const TParserStateActionItem actionTPS_InDecimal[] = {
1086 {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1087 {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
1088 {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
1089 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1090 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1091 {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1092 };
1093
/*
 * Rules for TPS_InVerVersion, pushed from TPS_InDecimal on a second '.'.
 * On a digit, the SpecialVerVersion handler (defined elsewhere) is called
 * and the state changes to TPS_InSVerVersion; A_RERUN re-evaluates the rules
 * without advancing.  EOF or anything else pops.
 */
1094 static const TParserStateActionItem actionTPS_InVerVersion[] = {
1095 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1096 {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
1097 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1098 };
1099
/*
 * Rules for TPS_InSVerVersion, entered from TPS_InVerVersion.  On a digit
 * this emits a SPACE token, discards every saved position (A_CLRALL) and
 * leaves the parser in TPS_InUnsignedInt for the next call.  Any other
 * character is consumed and the state is kept; EOF pops.
 * NOTE(review): the intent depends on how SpecialVerVersion repositioned the
 * parser before this state was entered -- confirm against that handler.
 */
1100 static const TParserStateActionItem actionTPS_InSVerVersion[] = {
1101 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1102 {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
1103 {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1104 };
1105
1106
/*
 * Rules for TPS_InVersionFirst, pushed on '.' from TPS_InUDecimal and
 * TPS_InVersion.  A digit commits (A_CLEAR) and continues in TPS_InVersion.
 * Otherwise pop, so a trailing '.' is not part of the number.
 */
1107 static const TParserStateActionItem actionTPS_InVersionFirst[] = {
1108 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1109 {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
1110 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1111 };
1112
/*
 * Rules for TPS_InVersion.  Digits are consumed; each further '.' pushes
 * TPS_InVersionFirst to check for another numeric component.  EOF or any
 * other character emits VERSIONNUMBER.
 */
1113 static const TParserStateActionItem actionTPS_InVersion[] = {
1114 {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
1115 {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
1116 {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1117 {NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
1118 };
1119
/*
 * Rules for TPS_InMantissaFirst, pushed on 'e'/'E' from the numeric states.
 * A digit commits (A_CLEAR) and continues in TPS_InMantissa; '+' or '-' is
 * consumed and moves to TPS_InMantissaSign without committing yet.
 * Otherwise pop.
 */
1120 static const TParserStateActionItem actionTPS_InMantissaFirst[] = {
1121 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1122 {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1123 {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
1124 {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
1125 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1126 };
1127
/*
 * Rules for TPS_InMantissaSign (after the exponent sign).  A digit commits
 * (A_CLEAR) and continues in TPS_InMantissa.  Otherwise pop.
 */
1128 static const TParserStateActionItem actionTPS_InMantissaSign[] = {
1129 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1130 {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1131 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1132 };
1133
/*
 * Rules for TPS_InMantissa.  Digits are consumed; EOF or any other character
 * emits SCIENTIFIC.
 */
1134 static const TParserStateActionItem actionTPS_InMantissa[] = {
1135 {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
1136 {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
1137 {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
1138 };
1139
/*
 * Rules for TPS_InXMLEntityFirst (first character of an entity body).
 * '#' starts a numeric entity (TPS_InXMLEntityNumFirst); a p_isasclet
 * character, ':' or '_' starts a named entity (TPS_InXMLEntity).
 * Otherwise pop.
 */
1140 static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = {
1141 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1142 {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
1143 {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1144 {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1145 {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1146 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1147 };
1148
/*
 * Rules for TPS_InXMLEntity (name of a named entity).  p_isalnum characters
 * and ':', '_', '.', '-' are consumed; ';' is consumed and moves to
 * TPS_InXMLEntityEnd.  EOF or any other character pops, abandoning the
 * entity attempt.
 */
1149 static const TParserStateActionItem actionTPS_InXMLEntity[] = {
1150 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1151 {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1152 {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1153 {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1154 {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
1155 {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
1156 {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1157 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1158 };
1159
/*
 * Rules for TPS_InXMLEntityNumFirst (character after '#').  'x' or 'X'
 * selects the hexadecimal form (TPS_InXMLEntityHexNumFirst); a digit starts
 * the decimal form (TPS_InXMLEntityNum).  Otherwise pop.
 */
1160 static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = {
1161 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1162 {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1163 {p_iseqC, 'X', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1164 {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1165 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1166 };
1167
/*
 * Rules for TPS_InXMLEntityHexNumFirst.  Requires at least one p_isxdigit
 * character before moving to TPS_InXMLEntityHexNum.  Otherwise pop.
 */
1168 static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = {
1169 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1170 {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1171 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1172 };
1173
/*
 * Rules for TPS_InXMLEntityNum (decimal numeric entity).  Digits are
 * consumed; ';' is consumed and moves to TPS_InXMLEntityEnd.  Otherwise pop.
 */
1174 static const TParserStateActionItem actionTPS_InXMLEntityNum[] = {
1175 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1176 {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1177 {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1178 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1179 };
1180
/*
 * Rules for TPS_InXMLEntityHexNum (hexadecimal numeric entity).  p_isxdigit
 * characters are consumed; ';' is consumed and moves to TPS_InXMLEntityEnd.
 * Otherwise pop.
 */
1181 static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = {
1182 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1183 {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1184 {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1185 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1186 };
1187
/*
 * Rule for TPS_InXMLEntityEnd, reached after the closing ';' was consumed.
 * Unconditionally emits XMLENTITY; A_CLEAR discards the position saved when
 * the entity attempt was pushed.  The current character is not consumed.
 */
1188 static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = {
1189 {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
1190 };
1191
/*
 * Rules for TPS_InTagFirst (presumably the character following '<' --
 * confirm against the TPS_Base table).  Every accepting rule pushes, so a
 * failed tag attempt can unwind: '/' tries a closing tag, '!' a comment or
 * DOCTYPE, '?' an XML declaration, and a p_isasclet character, ':' or '_'
 * starts a tag name.  Otherwise pop.
 */
1192 static const TParserStateActionItem actionTPS_InTagFirst[] = {
1193 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1194 {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
1195 {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
1196 {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
1197 {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
1198 {p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
1199 {p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
1200 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1201 };
1202
/*
 * Rules for TPS_InXMLBegin, pushed from TPS_InTagFirst on '?'.  Only the
 * 'x' is verified before the rest is scanned as ordinary tag content
 * (TPS_InTag).  Otherwise pop.
 */
1203 static const TParserStateActionItem actionTPS_InXMLBegin[] = {
1204 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1205 /* <?xml ... */
1206 /* XXX do we want states for the m and l ? Right now this accepts <?xZ */
1207 {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
1208 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1209 };
1210
/*
 * Rules for TPS_InTagCloseFirst, pushed from TPS_InTagFirst on '/'.
 * A p_isasclet character starts the tag name (TPS_InTagName).
 * Otherwise pop.
 */
1211 static const TParserStateActionItem actionTPS_InTagCloseFirst[] = {
1212 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1213 {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
1214 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1215 };
1216
/*
 * Rules for TPS_InTagName.  '/' moves to TPS_InTagBeginEnd (self-closing
 * form); '>' ends the tag (TPS_InTagEnd); p_isspace ends the name and
 * continues with tag content (TPS_InTag).  The SpecialTags handler (defined
 * elsewhere) is called on the two rules that end the name.  p_isalnum
 * characters and ':', '_', '.', '-' extend the name.  Otherwise pop.
 */
1217 static const TParserStateActionItem actionTPS_InTagName[] = {
1218 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1219 /* <br/> case */
1220 {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
1221 {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1222 {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
1223 {p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
1224 {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1225 {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1226 {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1227 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1228 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1229 };
1230
/*
 * Rules for TPS_InTagBeginEnd (after the '/' of a self-closing tag).
 * Only '>' is accepted, moving to TPS_InTagEnd.  Otherwise pop.
 */
1231 static const TParserStateActionItem actionTPS_InTagBeginEnd[] = {
1232 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1233 {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
1234 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1235 };
1236
/*
 * Rules for TPS_InTag (tag content after the name).  '>' ends the tag
 * (TPS_InTagEnd); a single or double quote enters the matching quoted-value
 * state (TPS_InTagEscapeK / TPS_InTagEscapeKK).  p_isasclet characters,
 * digits, whitespace and the explicitly listed punctuation are consumed
 * with the state kept.  Any character not listed pops, abandoning the tag.
 * SpecialTags (defined elsewhere) is called on '>' and on whitespace.
 */
1237 static const TParserStateActionItem actionTPS_InTag[] = {
1238 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1239 {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1240 {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
1241 {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
1242 {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
1243 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1244 {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
1245 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1246 {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1247 {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
1248 {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
1249 {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1250 {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1251 {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
1252 {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
1253 {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
1254 {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
1255 {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
1256 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1257 };
1258
/*
 * Rules for TPS_InTagEscapeK (inside a single-quoted value within a tag).
 * A backslash pushes TPS_InTagBackSleshed so the next character is taken
 * literally; the closing single quote returns to TPS_InTag; every other
 * character is consumed.  EOF pops.
 */
1259 static const TParserStateActionItem actionTPS_InTagEscapeK[] = {
1260 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1261 {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1262 {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
1263 {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
1264 };
1265
/*
 * Rules for TPS_InTagEscapeKK (inside a double-quoted value within a tag).
 * A backslash pushes TPS_InTagBackSleshed so the next character is taken
 * literally; the closing double quote returns to TPS_InTag; every other
 * character is consumed.  EOF pops.
 */
1266 static const TParserStateActionItem actionTPS_InTagEscapeKK[] = {
1267 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1268 {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1269 {p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
1270 {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
1271 };
1272
/*
 * Rules for TPS_InTagBackSleshed (character after a backslash inside a
 * quoted tag value).  For any character, A_MERGE copies the current
 * position into the saved quoted-value position and returns to it.
 * TParserGet then advances, so the escaped character is consumed without
 * being tested against the quote-closing rule.  EOF pops.
 */
1273 static const TParserStateActionItem actionTPS_InTagBackSleshed[] = {
1274 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1275 {NULL, 0, A_MERGE, TPS_Null, 0, NULL}
1276 };
1277
/*
 * Rule for TPS_InTagEnd, reached after the closing '>' was consumed.
 * Unconditionally emits TAG_T and discards every saved position (A_CLRALL).
 */
1278 static const TParserStateActionItem actionTPS_InTagEnd[] = {
1279 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1280 };
1281
/*
 * Rules for TPS_InCommentFirst, pushed from TPS_InTagFirst on '!'.
 * '-' begins a comment opener (TPS_InCommentLast); 'D' or 'd' is taken as
 * the start of a DOCTYPE declaration and scanned as tag content (only this
 * first letter is checked).  Otherwise pop.
 */
1282 static const TParserStateActionItem actionTPS_InCommentFirst[] = {
1283 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1284 {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
1285 /* <!DOCTYPE ...> */
1286 {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
1287 {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
1288 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1289 };
1290
/*
 * Rules for TPS_InCommentLast (expecting the second '-' of a comment
 * opener).  '-' moves to TPS_InComment.  Otherwise pop.
 */
1291 static const TParserStateActionItem actionTPS_InCommentLast[] = {
1292 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1293 {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
1294 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1295 };
1296
/*
 * Rules for TPS_InComment (comment body).  '-' may begin the terminator and
 * moves to TPS_InCloseCommentFirst; every other character is consumed.
 * EOF pops, so an unterminated comment is not emitted as a tag.
 */
1297 static const TParserStateActionItem actionTPS_InComment[] = {
1298 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1299 {p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
1300 {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1301 };
1302
/*
 * Rules for TPS_InCloseCommentFirst (one '-' seen inside a comment).
 * A second '-' moves to TPS_InCloseCommentLast; any other character is
 * consumed and scanning returns to TPS_InComment.  EOF pops.
 */
1303 static const TParserStateActionItem actionTPS_InCloseCommentFirst[] = {
1304 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1305 {p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
1306 {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1307 };
1308
/*
 * Rules for TPS_InCloseCommentLast (two or more '-' seen inside a comment).
 * Further '-' characters keep the state; '>' completes the comment
 * (TPS_InCommentEnd); any other character is consumed and scanning returns
 * to TPS_InComment.  EOF pops.
 */
1309 static const TParserStateActionItem actionTPS_InCloseCommentLast[] = {
1310 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1311 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1312 {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
1313 {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1314 };
1315
/*
 * Rule for TPS_InCommentEnd, reached after the comment's closing '>' was
 * consumed.  Unconditionally emits the comment as TAG_T and discards every
 * saved position (A_CLRALL).
 */
1316 static const TParserStateActionItem actionTPS_InCommentEnd[] = {
1317 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1318 };
1319
/*
 * Rules for TPS_InHostFirstDomain (first character after a '.' in a host
 * candidate).  A p_isasclet character moves to TPS_InHostDomainSecond; a
 * digit moves to TPS_InHost.  Otherwise pop.
 */
1320 static const TParserStateActionItem actionTPS_InHostFirstDomain[] = {
1321 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1322 {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
1323 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1324 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1325 };
1326
/*
 * Rules for TPS_InHostDomainSecond (one letter of a domain label seen).
 * A second p_isasclet character moves to TPS_InHostDomain, the only host
 * state that can emit HOST; this table has no A_BINGO rule, so a label of a
 * single letter cannot end a host here.  Digits, '-', '_', '.' and '@' push
 * further host/email attempts.  Otherwise pop.
 */
1327 static const TParserStateActionItem actionTPS_InHostDomainSecond[] = {
1328 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1329 {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1330 {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1331 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1332 {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1333 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1334 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1335 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1336 };
1337
/*
 * Rules for TPS_InHostDomain (inside a domain label of two or more letters).
 * EOF and the default rule emit HOST.  Letters extend the label; digits,
 * ':', '-', '_', '.' and '@' push attempts at a longer host, a port or an
 * email address.  When p_isstophost matches, HOST is emitted and the next
 * call continues in TPS_InURLPathStart; '/' pushes a full-URL attempt
 * (TPS_InFURL).
 */
1338 static const TParserStateActionItem actionTPS_InHostDomain[] = {
1339 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1340 {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1341 {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1342 {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
1343 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1344 {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1345 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1346 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
/* a digit gets here only after the TPS_InHost attempt pushed above has popped */
1347 {p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
1348 {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1349 {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1350 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1351 };
1352
/*
 * Rules for TPS_InPortFirst, pushed from TPS_InHostDomain on ':'.
 * A digit moves to TPS_InPort.  Otherwise pop.
 */
1353 static const TParserStateActionItem actionTPS_InPortFirst[] = {
1354 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1355 {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1356 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1357 };
1358
/*
 * Rules for TPS_InPort (digits of a port number).  Digits are consumed.
 * When p_isstophost matches, HOST is emitted and the next call continues in
 * TPS_InURLPathStart; '/' pushes a full-URL attempt (TPS_InFURL).  EOF and
 * the default rule emit HOST, port included.
 */
1359 static const TParserStateActionItem actionTPS_InPort[] = {
1360 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1361 {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1362 {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1363 {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1364 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1365 };
1366
/*
 * Rules for TPS_InHostFirstAN (character after '-' or '_' in a host
 * candidate).  A digit or a p_isasclet character moves to TPS_InHost.
 * Otherwise pop.
 */
1367 static const TParserStateActionItem actionTPS_InHostFirstAN[] = {
1368 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1369 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1370 {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1371 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1372 };
1373
/*
 * Rules for TPS_InHost (host candidate that has not yet reached a domain
 * label).  Digits and p_isasclet characters are consumed; '@', '.', '-' and
 * '_' push further attempts.  There is no A_BINGO rule: EOF or any other
 * character pops.
 */
1374 static const TParserStateActionItem actionTPS_InHost[] = {
1375 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1376 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1377 {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1378 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1379 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1380 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1381 {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1382 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1383 };
1384
/*
 * Rules for TPS_InEmail, pushed on '@'.  If p_isstophost matches, pop; if
 * p_ishost matches, emit EMAIL and discard every saved position (A_CLRALL);
 * otherwise pop.
 * NOTE(review): unlike most tables there is no p_isEOF rule; p_isstophost
 * and p_ishost are defined elsewhere, and p_ishost presumably inspects the
 * text ahead itself -- confirm how they behave at end of input.
 */
1385 static const TParserStateActionItem actionTPS_InEmail[] = {
1386 {p_isstophost, 0, A_POP, TPS_Null, 0, NULL},
1387 {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
1388 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1389 };
1390
/*
 * Rules for TPS_InFileFirst (first character of a path component).
 * A p_isasclet character, digit or '_' moves to TPS_InFile; '.' moves to
 * TPS_InPathFirst; '~' pushes TPS_InFileTwiddle.  Otherwise pop.
 */
1391 static const TParserStateActionItem actionTPS_InFileFirst[] = {
1392 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1393 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1394 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1395 {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
1396 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1397 {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1398 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1399 };
1400
/*
 * Rules for TPS_InFileTwiddle (character after '~' in a path).
 * A p_isasclet character, digit or '_' moves to TPS_InFile; '/' moves to
 * TPS_InFileFirst.  Otherwise pop.
 */
1401 static const TParserStateActionItem actionTPS_InFileTwiddle[] = {
1402 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1403 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1404 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1405 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1406 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1407 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1408 };
1409
/*
 * Rules for TPS_InPathFirst (one '.' seen at the start of a path component).
 * A p_isasclet character, digit or '_' moves to TPS_InFile; a second '.'
 * moves to TPS_InPathSecond; '/' moves to TPS_InFileFirst.  Otherwise pop.
 */
1410 static const TParserStateActionItem actionTPS_InPathFirst[] = {
1411 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1412 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1413 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1414 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1415 {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1416 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1417 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1418 };
1419
/*
 * Rules for TPS_InPathFirstFirst.  Accepts only a '.' (moving to
 * TPS_InPathSecond) or a '/' (moving to TPS_InFileFirst).  Otherwise pop.
 */
1420 static const TParserStateActionItem actionTPS_InPathFirstFirst[] = {
1421 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1422 {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1423 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1424 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1425 };
1426
/*
 * Rules for TPS_InPathSecond (two dots seen).  At EOF or before whitespace
 * the text so far is emitted as FILEPATH.  '/' first pushes an attempt to
 * continue the path (TPS_InFileFirst).  A_CLEAR on the emitting rules drops
 * the position saved by the enclosing push.  Otherwise pop.
 */
1427 static const TParserStateActionItem actionTPS_InPathSecond[] = {
1428 {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1429 {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
/* reached for '/' only after the continuation pushed by the previous rule popped */
1430 {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1431 {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1432 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1433 };
1434
/*
 * Rules for TPS_InFile (inside a file-name component).  p_isasclet
 * characters, digits, '_' and '-' are consumed; '.' pushes TPS_InFileNext
 * and '/' pushes TPS_InFileFirst, so that a trailing '.' or '/' can be
 * given back.  EOF or any other character emits FILEPATH.
 */
1435 static const TParserStateActionItem actionTPS_InFile[] = {
1436 {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
1437 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1438 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1439 {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1440 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1441 {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
1442 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1443 {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
1444 };
1445
/*
 * Rules for TPS_InFileNext, pushed from TPS_InFile on '.'.  A p_isasclet
 * character, digit or '_' commits (A_CLEAR drops the saved position) and
 * returns to TPS_InFile.  Otherwise pop.
 */
1446 static const TParserStateActionItem actionTPS_InFileNext[] = {
1447 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1448 {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
1449 {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
1450 {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
1451 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1452 };
1453
/*
 * Rules for TPS_InURLPathFirst.  A p_isurlchar character moves to
 * TPS_InURLPath.  Otherwise pop.
 */
1454 static const TParserStateActionItem actionTPS_InURLPathFirst[] = {
1455 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1456 {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1457 {NULL, 0, A_POP, TPS_Null, 0, NULL},
1458 };
1459
/*
 * Rule for TPS_InURLPathStart, the state left behind when HOST is emitted
 * on a p_isstophost match.  Unconditionally consumes the current character
 * and continues in TPS_InURLPath.
 */
1460 static const TParserStateActionItem actionTPS_InURLPathStart[] = {
1461 {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
1462 };
1463
/*
 * Rules for TPS_InURLPath.  p_isurlchar characters are consumed; EOF or any
 * other character emits URLPATH.
 */
1464 static const TParserStateActionItem actionTPS_InURLPath[] = {
1465 {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
1466 {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1467 {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
1468 };
1469
/*
 * Rules for TPS_InFURL, pushed on '/' after a host.  If p_isURLPath
 * matches, the SpecialFURL handler (defined elsewhere) is called, URL_T is
 * emitted and every saved position is discarded (A_CLRALL).  Otherwise pop,
 * letting the host state fall through to its later rules.
 */
1470 static const TParserStateActionItem actionTPS_InFURL[] = {
1471 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1472 {p_isURLPath, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL},
1473 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1474 };
1475
/*
 * Rules for TPS_InProtocolFirst (expecting the first '/' of a protocol
 * prefix).  '/' moves to TPS_InProtocolSecond.  Otherwise pop.
 */
1476 static const TParserStateActionItem actionTPS_InProtocolFirst[] = {
1477 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1478 {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
1479 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1480 };
1481
/*
 * Rules for TPS_InProtocolSecond (expecting the second '/' of a protocol
 * prefix).  '/' moves to TPS_InProtocolEnd.  Otherwise pop.
 */
1482 static const TParserStateActionItem actionTPS_InProtocolSecond[] = {
1483 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1484 {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
1485 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1486 };
1487
/*
 * Rule for TPS_InProtocolEnd, reached after both slashes were consumed.
 * Unconditionally emits PROTOCOL and discards every saved position
 * (A_CLRALL).
 */
1488 static const TParserStateActionItem actionTPS_InProtocolEnd[] = {
1489 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
1490 };
1491
/*
 * Rules for TPS_InHyphenAsciiWordFirst, pushed on '-' from
 * TPS_InHyphenAsciiWord.  p_isasclet is tested before p_isalpha so that the
 * all-ASCII classification is kept while possible: a p_isasclet character
 * moves to TPS_InHyphenAsciiWord, any other p_isalpha character to
 * TPS_InHyphenWord, and a digit to TPS_InHyphenDigitLookahead.
 * Otherwise pop.
 */
1492 static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
1493 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1494 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1495 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1496 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1497 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1498 };
1499
/*
 * Rules for TPS_InHyphenAsciiWord (hyphenated word, all ASCII so far).
 * p_isasclet characters keep the state; other p_isalpha or p_isspecial
 * characters downgrade to TPS_InHyphenWord and digits to
 * TPS_InHyphenNumWord; '-' pushes a check of the next part.  EOF or any
 * other character calls SpecialHyphen (defined elsewhere), emits the whole
 * word as ASCIIHWORD, discards all saved positions and leaves the parser in
 * TPS_InParseHyphen for the next call.
 */
1500 static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
1501 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
1502 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1503 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1504 {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1505 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1506 {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1507 {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
1508 };
1509
/*
 * Rules for TPS_InHyphenWordFirst, pushed on '-' from TPS_InWord and
 * TPS_InHyphenWord.  A p_isalpha character moves to TPS_InHyphenWord; a
 * digit moves to TPS_InHyphenDigitLookahead.  Otherwise pop.
 */
1510 static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
1511 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1512 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1513 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1514 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1515 };
1516
/*
 * Rules for TPS_InHyphenWord (hyphenated word of letters).  p_isalpha and
 * p_isspecial characters keep the state; a digit downgrades to
 * TPS_InHyphenNumWord; '-' pushes a check of the next part.  EOF or any
 * other character calls SpecialHyphen (defined elsewhere), emits the whole
 * word as HWORD, discards all saved positions and leaves the parser in
 * TPS_InParseHyphen for the next call.
 */
1517 static const TParserStateActionItem actionTPS_InHyphenWord[] = {
1518 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
1519 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1520 {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1521 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1522 {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1523 {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
1524 };
1525
/*
 * Rules for TPS_InHyphenNumWordFirst, pushed on '-' from
 * TPS_InHyphenNumWord.  A p_isalpha character moves to TPS_InHyphenNumWord;
 * a digit moves to TPS_InHyphenDigitLookahead.  Otherwise pop.
 */
1526 static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
1527 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1528 {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1529 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1530 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1531 };
1532
/*
 * Rules for TPS_InHyphenNumWord (hyphenated word containing digits).
 * p_isalnum and p_isspecial characters keep the state; '-' pushes a check
 * of the next part.  EOF or any other character calls SpecialHyphen
 * (defined elsewhere), emits the whole word as NUMHWORD, discards all saved
 * positions and leaves the parser in TPS_InParseHyphen for the next call.
 */
1533 static const TParserStateActionItem actionTPS_InHyphenNumWord[] = {
1534 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
1535 {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1536 {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1537 {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1538 {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
1539 };
1540
/*
 * Rules for TPS_InHyphenDigitLookahead (a part after '-' that has consisted
 * only of digits so far).  Further digits keep the state; a p_isalpha or
 * p_isspecial character accepts the part and moves to TPS_InHyphenNumWord.
 * EOF or any other character pops, so a part made up solely of digits is
 * not taken as part of the hyphenated word.
 */
1541 static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = {
1542 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1543 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1544 {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1545 {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1546 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1547 };
1548
/*
 * Rules for TPS_InParseHyphen, the state left behind after a whole
 * hyphenated word was emitted; it dispatches the scan of the individual
 * parts.  A p_isasclet character starts an ASCII part, any other p_isalpha
 * character a word part, a digit pushes TPS_InHyphenUnsignedInt, and '-'
 * pushes TPS_InParseHyphenHyphen.  EOF or any other character switches to
 * TPS_Base and re-evaluates there without advancing (A_RERUN).
 */
1549 static const TParserStateActionItem actionTPS_InParseHyphen[] = {
1550 {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
1551 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1552 {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1553 {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
1554 {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
1555 {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
1556 };
1557
/*
 * Rules for TPS_InParseHyphenHyphen, pushed from TPS_InParseHyphen on '-'.
 * If a p_isalnum or p_isspecial character follows, the text consumed so far
 * (the hyphen) is emitted as a SPACE token, the saved position is dropped
 * (A_CLEAR) and parsing of parts resumes in TPS_InParseHyphen.
 * Otherwise pop.
 */
1558 static const TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
1559 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1560 {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1561 {p_isspecial, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1562 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1563 };
1564
/*
 * Rules for TPS_InHyphenWordPart (a part of a hyphenated word, letters so
 * far).  p_isalpha and p_isspecial characters keep the state; a digit
 * moves to TPS_InHyphenNumWordPart.  At EOF the part is emitted as
 * PARTHWORD and the parser returns to TPS_Base; on any other character it
 * is emitted as PARTHWORD and the parser stays in TPS_InParseHyphen to look
 * for further parts.
 */
1565 static const TParserStateActionItem actionTPS_InHyphenWordPart[] = {
1566 {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
1567 {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1568 {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1569 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1570 {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
1571 };
1572
/*
 * Rules for TPS_InHyphenAsciiWordPart (a part of a hyphenated word, all
 * ASCII so far).  p_isasclet characters keep the state; other p_isalpha or
 * p_isspecial characters move to TPS_InHyphenWordPart and digits to
 * TPS_InHyphenNumWordPart.  At EOF the part is emitted as ASCIIPARTHWORD
 * and the parser returns to TPS_Base; on any other character it is emitted
 * as ASCIIPARTHWORD and the parser stays in TPS_InParseHyphen.
 */
1573 static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
1574 {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
1575 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1576 {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1577 {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1578 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1579 {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
1580 };
1581
/*
 * Rules for TPS_InHyphenNumWordPart (a part of a hyphenated word containing
 * digits).  p_isalnum and p_isspecial characters keep the state.  At EOF
 * the part is emitted as NUMPARTHWORD and the parser returns to TPS_Base;
 * on any other character it is emitted as NUMPARTHWORD and the parser stays
 * in TPS_InParseHyphen.
 */
1582 static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
1583 {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
1584 {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1585 {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1586 {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
1587 };
1588
/*
 * Rules for TPS_InHyphenUnsignedInt, pushed from TPS_InParseHyphen on a
 * digit.  Digits keep the state.  A p_isalpha or p_isspecial character
 * commits (A_CLEAR drops the saved position) and continues in
 * TPS_InHyphenNumWordPart.  EOF or any other character pops, so a part made
 * up solely of digits is not emitted as a hyphenated-word part from here.
 */
1589 static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
1590 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1591 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1592 {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1593 {p_isspecial, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1594 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1595 };
1596
1597
1598 /*
1599 * main table of per-state parser actions
1600 */
/*
 * One entry of the Actions[] table: the rule list for a parser state plus
 * the state value itself, which TParserGet uses in an Assert to verify that
 * the table is ordered like the TParserState enum.  With WPARSER_TRACE
 * defined, the state's name is kept as well for the trace output.
 */
1601 typedef struct
1602 {
1603 const TParserStateActionItem *action; /* the actual state info */
1604 TParserState state; /* only for Assert crosscheck */
1605 #ifdef WPARSER_TRACE
1606 const char *state_name; /* only for debug printout */
1607 #endif
1608 } TParserStateAction;
1609
/*
 * Build the TParserStateAction initializer for a state: the rule table is
 * named by joining "action" and the state name through CppConcat (for
 * example actionTPS_Base for TPS_Base), followed by the state value.  The
 * trace build also stores the state name as a string via CppAsString.
 */
1610 #ifdef WPARSER_TRACE
1611 #define TPARSERSTATEACTION(state) \
1612 { CppConcat(action,state), state, CppAsString(state) }
1613 #else
1614 #define TPARSERSTATEACTION(state) \
1615 { CppConcat(action,state), state }
1616 #endif
1617
1618 /*
1619 * order must be the same as in typedef enum {} TParserState!!
1620 */
1621
/*
 * Rule tables for every parser state, indexed by TParserState value.
 * TParserGet looks up Actions[prs->state->state].action and asserts that
 * the entry's .state equals the index used, so an entry added or moved here
 * without the matching enum change trips that Assert.
 */
1622 static const TParserStateAction Actions[] = {
1623 TPARSERSTATEACTION(TPS_Base),
1624 TPARSERSTATEACTION(TPS_InNumWord),
1625 TPARSERSTATEACTION(TPS_InAsciiWord),
1626 TPARSERSTATEACTION(TPS_InWord),
1627 TPARSERSTATEACTION(TPS_InUnsignedInt),
1628 TPARSERSTATEACTION(TPS_InSignedIntFirst),
1629 TPARSERSTATEACTION(TPS_InSignedInt),
1630 TPARSERSTATEACTION(TPS_InSpace),
1631 TPARSERSTATEACTION(TPS_InUDecimalFirst),
1632 TPARSERSTATEACTION(TPS_InUDecimal),
1633 TPARSERSTATEACTION(TPS_InDecimalFirst),
1634 TPARSERSTATEACTION(TPS_InDecimal),
1635 TPARSERSTATEACTION(TPS_InVerVersion),
1636 TPARSERSTATEACTION(TPS_InSVerVersion),
1637 TPARSERSTATEACTION(TPS_InVersionFirst),
1638 TPARSERSTATEACTION(TPS_InVersion),
1639 TPARSERSTATEACTION(TPS_InMantissaFirst),
1640 TPARSERSTATEACTION(TPS_InMantissaSign),
1641 TPARSERSTATEACTION(TPS_InMantissa),
1642 TPARSERSTATEACTION(TPS_InXMLEntityFirst),
1643 TPARSERSTATEACTION(TPS_InXMLEntity),
1644 TPARSERSTATEACTION(TPS_InXMLEntityNumFirst),
1645 TPARSERSTATEACTION(TPS_InXMLEntityNum),
1646 TPARSERSTATEACTION(TPS_InXMLEntityHexNumFirst),
1647 TPARSERSTATEACTION(TPS_InXMLEntityHexNum),
1648 TPARSERSTATEACTION(TPS_InXMLEntityEnd),
1649 TPARSERSTATEACTION(TPS_InTagFirst),
1650 TPARSERSTATEACTION(TPS_InXMLBegin),
1651 TPARSERSTATEACTION(TPS_InTagCloseFirst),
1652 TPARSERSTATEACTION(TPS_InTagName),
1653 TPARSERSTATEACTION(TPS_InTagBeginEnd),
1654 TPARSERSTATEACTION(TPS_InTag),
1655 TPARSERSTATEACTION(TPS_InTagEscapeK),
1656 TPARSERSTATEACTION(TPS_InTagEscapeKK),
1657 TPARSERSTATEACTION(TPS_InTagBackSleshed),
1658 TPARSERSTATEACTION(TPS_InTagEnd),
1659 TPARSERSTATEACTION(TPS_InCommentFirst),
1660 TPARSERSTATEACTION(TPS_InCommentLast),
1661 TPARSERSTATEACTION(TPS_InComment),
1662 TPARSERSTATEACTION(TPS_InCloseCommentFirst),
1663 TPARSERSTATEACTION(TPS_InCloseCommentLast),
1664 TPARSERSTATEACTION(TPS_InCommentEnd),
1665 TPARSERSTATEACTION(TPS_InHostFirstDomain),
1666 TPARSERSTATEACTION(TPS_InHostDomainSecond),
1667 TPARSERSTATEACTION(TPS_InHostDomain),
1668 TPARSERSTATEACTION(TPS_InPortFirst),
1669 TPARSERSTATEACTION(TPS_InPort),
1670 TPARSERSTATEACTION(TPS_InHostFirstAN),
1671 TPARSERSTATEACTION(TPS_InHost),
1672 TPARSERSTATEACTION(TPS_InEmail),
1673 TPARSERSTATEACTION(TPS_InFileFirst),
1674 TPARSERSTATEACTION(TPS_InFileTwiddle),
1675 TPARSERSTATEACTION(TPS_InPathFirst),
1676 TPARSERSTATEACTION(TPS_InPathFirstFirst),
1677 TPARSERSTATEACTION(TPS_InPathSecond),
1678 TPARSERSTATEACTION(TPS_InFile),
1679 TPARSERSTATEACTION(TPS_InFileNext),
1680 TPARSERSTATEACTION(TPS_InURLPathFirst),
1681 TPARSERSTATEACTION(TPS_InURLPathStart),
1682 TPARSERSTATEACTION(TPS_InURLPath),
1683 TPARSERSTATEACTION(TPS_InFURL),
1684 TPARSERSTATEACTION(TPS_InProtocolFirst),
1685 TPARSERSTATEACTION(TPS_InProtocolSecond),
1686 TPARSERSTATEACTION(TPS_InProtocolEnd),
1687 TPARSERSTATEACTION(TPS_InHyphenAsciiWordFirst),
1688 TPARSERSTATEACTION(TPS_InHyphenAsciiWord),
1689 TPARSERSTATEACTION(TPS_InHyphenWordFirst),
1690 TPARSERSTATEACTION(TPS_InHyphenWord),
1691 TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
1692 TPARSERSTATEACTION(TPS_InHyphenNumWord),
1693 TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
1694 TPARSERSTATEACTION(TPS_InParseHyphen),
1695 TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
1696 TPARSERSTATEACTION(TPS_InHyphenWordPart),
1697 TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
1698 TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
1699 TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
1700 };
1701
1702
/*
 * TParserGet
 *		Run the state machine from the current position until a rule flagged
 *		A_BINGO fires or the input is exhausted.
 *
 * When a token is found, prs->token points at its start, prs->lenbytetoken
 * and prs->lenchartoken hold its length, prs->type holds its type, and true
 * is returned.  Returns false if the position is already at or beyond
 * prs->lenstr on entry, or if the loop ends on a rule without A_BINGO.
 *
 * Each iteration looks up the rule list of the current state, takes the
 * first rule whose isclass test accepts the current character (a rule with
 * a NULL isclass ends every list and acts as the default), runs the rule's
 * special handler if any, and then applies the rule's flags and target
 * state.
 */
1703 static bool
TParserGet(TParser * prs)1704 TParserGet(TParser *prs)
1705 {
1706 const TParserStateActionItem *item = NULL;
1707
1708 Assert(prs->state);
1709
1710 if (prs->state->posbyte >= prs->lenstr)
1711 return false;
1712
1713 prs->token = prs->str + prs->state->posbyte;
1714 prs->state->pushedAtAction = NULL;
1715
1716 /* look at string */
/* the bound is inclusive so that one more pass runs at end of input */
1717 while (prs->state->posbyte <= prs->lenstr)
1718 {
/* charlen 0 marks end of input; the "move forward" step below then does nothing */
1719 if (prs->state->posbyte == prs->lenstr)
1720 prs->state->charlen = 0;
1721 else
1722 prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
1723 pg_mblen(prs->str + prs->state->posbyte);
1724
1725 Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
1726 Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
1727 Assert(Actions[prs->state->state].state == prs->state->state);
1728
1729 if (prs->state->pushedAtAction)
1730 {
1731 /* After a POP, pick up at the next test */
1732 item = prs->state->pushedAtAction + 1;
1733 prs->state->pushedAtAction = NULL;
1734 }
1735 else
1736 {
1737 item = Actions[prs->state->state].action;
1738 Assert(item != NULL);
1739 }
1740
1741 /* find action by character class */
/* prs->c carries the rule's character argument to the isclass test */
1742 while (item->isclass)
1743 {
1744 prs->c = item->c;
1745 if (item->isclass(prs) != 0)
1746 break;
1747 item++;
1748 }
1749
1750 #ifdef WPARSER_TRACE
1751 {
1752 TParserPosition *ptr;
1753
1754 fprintf(stderr, "state ");
1755 /* indent according to stack depth */
1756 for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
1757 fprintf(stderr, " ");
1758 fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
1759 if (prs->state->posbyte < prs->lenstr)
1760 fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
1761 else
1762 fprintf(stderr, "at EOF");
1763 fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
1764 (int) (item - Actions[prs->state->state].action),
1765 (item->flags & A_BINGO) ? " BINGO" : "",
1766 (item->flags & A_POP) ? " POP" : "",
1767 (item->flags & A_PUSH) ? " PUSH" : "",
1768 (item->flags & A_RERUN) ? " RERUN" : "",
1769 (item->flags & A_CLEAR) ? " CLEAR" : "",
1770 (item->flags & A_MERGE) ? " MERGE" : "",
1771 (item->flags & A_CLRALL) ? " CLRALL" : "",
1772 (item->tostate != TPS_Null) ? " tostate " : "",
1773 (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
1774 (item->type > 0) ? " type " : "",
1775 tok_alias[item->type]);
1776 }
1777 #endif
1778
1779 /* call special handler if exists */
1780 if (item->special)
1781 item->special(prs);
1782
1783 /* BINGO, token is found */
/* lengths are taken before any advance, so the current character is not part of the token */
1784 if (item->flags & A_BINGO)
1785 {
1786 Assert(item->type > 0);
1787 prs->lenbytetoken = prs->state->lenbytetoken;
1788 prs->lenchartoken = prs->state->lenchartoken;
1789 prs->state->lenbytetoken = prs->state->lenchartoken = 0;
1790 prs->type = item->type;
1791 }
1792
1793 /* do various actions by flags */
/* the tests form an else-if chain: only the first flag present in this order is acted on */
1794 if (item->flags & A_POP)
1795 { /* pop stored state in stack */
1796 TParserPosition *ptr = prs->state->prev;
1797
1798 pfree(prs->state);
1799 prs->state = ptr;
1800 Assert(prs->state);
1801 }
1802 else if (item->flags & A_PUSH)
1803 { /* push (store) state in stack */
1804 prs->state->pushedAtAction = item; /* remember where we push */
1805 prs->state = newTParserPosition(prs->state);
1806 }
1807 else if (item->flags & A_CLEAR)
1808 { /* clear previous pushed state */
1809 TParserPosition *ptr;
1810
1811 Assert(prs->state->prev);
1812 ptr = prs->state->prev->prev;
1813 pfree(prs->state->prev);
1814 prs->state->prev = ptr;
1815 }
1816 else if (item->flags & A_CLRALL)
1817 { /* clear all previous pushed state */
1818 TParserPosition *ptr;
1819
1820 while (prs->state->prev)
1821 {
1822 ptr = prs->state->prev->prev;
1823 pfree(prs->state->prev);
1824 prs->state->prev = ptr;
1825 }
1826 }
1827 else if (item->flags & A_MERGE)
1828 { /* merge posinfo with current and pushed state */
1829 TParserPosition *ptr = prs->state;
1830
1831 Assert(prs->state->prev);
1832 prs->state = prs->state->prev;
1833
1834 prs->state->posbyte = ptr->posbyte;
1835 prs->state->poschar = ptr->poschar;
1836 prs->state->charlen = ptr->charlen;
1837 prs->state->lenbytetoken = ptr->lenbytetoken;
1838 prs->state->lenchartoken = ptr->lenchartoken;
1839 pfree(ptr);
1840 }
1841
1842 /* set new state if pointed */
/* applies to whichever position is current now, e.g. the new one after a PUSH */
1843 if (item->tostate != TPS_Null)
1844 prs->state->state = item->tostate;
1845
1846 /* check for go away */
1847 if ((item->flags & A_BINGO) ||
1848 (prs->state->posbyte >= prs->lenstr &&
1849 (item->flags & A_RERUN) == 0))
1850 break;
1851
1852 /* go to beginning of loop if we should rerun or we just restore state */
1853 if (item->flags & (A_RERUN | A_POP))
1854 continue;
1855
1856 /* move forward */
1857 if (prs->state->charlen)
1858 {
1859 prs->state->posbyte += prs->state->charlen;
1860 prs->state->lenbytetoken += prs->state->charlen;
1861 prs->state->poschar++;
1862 prs->state->lenchartoken++;
1863 }
1864 }
1865
1866 return (item && (item->flags & A_BINGO)) ? true : false;
1867 }
1868
1869 Datum
prsd_lextype(PG_FUNCTION_ARGS)1870 prsd_lextype(PG_FUNCTION_ARGS)
1871 {
1872 LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
1873 int i;
1874
1875 for (i = 1; i <= LASTNUM; i++)
1876 {
1877 descr[i - 1].lexid = i;
1878 descr[i - 1].alias = pstrdup(tok_alias[i]);
1879 descr[i - 1].descr = pstrdup(lex_descr[i]);
1880 }
1881
1882 descr[LASTNUM].lexid = 0;
1883
1884 PG_RETURN_POINTER(descr);
1885 }
1886
1887 Datum
prsd_start(PG_FUNCTION_ARGS)1888 prsd_start(PG_FUNCTION_ARGS)
1889 {
1890 PG_RETURN_POINTER(TParserInit((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
1891 }
1892
1893 Datum
prsd_nexttoken(PG_FUNCTION_ARGS)1894 prsd_nexttoken(PG_FUNCTION_ARGS)
1895 {
1896 TParser *p = (TParser *) PG_GETARG_POINTER(0);
1897 char **t = (char **) PG_GETARG_POINTER(1);
1898 int *tlen = (int *) PG_GETARG_POINTER(2);
1899
1900 if (!TParserGet(p))
1901 PG_RETURN_INT32(0);
1902
1903 *t = p->token;
1904 *tlen = p->lenbytetoken;
1905
1906 PG_RETURN_INT32(p->type);
1907 }
1908
1909 Datum
prsd_end(PG_FUNCTION_ARGS)1910 prsd_end(PG_FUNCTION_ARGS)
1911 {
1912 TParser *p = (TParser *) PG_GETARG_POINTER(0);
1913
1914 TParserClose(p);
1915 PG_RETURN_VOID();
1916 }
1917
1918
1919 /*
1920 * ts_headline support begins here
1921 */
1922
/*
 * Token type classification macros.
 *
 * Each takes a token type code (one of the output token categories defined
 * near the top of this file) and is true when the type belongs to the
 * class.  The argument may be evaluated more than once.
 */
#define LEAVETOKEN(x) \
	((x) == SPACE)
#define COMPLEXTOKEN(x) \
	((x) == URL_T || (x) == NUMHWORD || (x) == ASCIIHWORD || (x) == HWORD)
#define ENDPUNCTOKEN(x) \
	((x) == SPACE)

#define TS_IDIGNORE(x) \
	((x) == TAG_T || (x) == PROTOCOL || (x) == SPACE || (x) == XMLENTITY)
#define HLIDREPLACE(x) \
	((x) == TAG_T)
#define HLIDSKIP(x) \
	((x) == URL_T || (x) == NUMHWORD || (x) == ASCIIHWORD || (x) == HWORD)
#define XMLHLIDSKIP(x) \
	((x) == URL_T || (x) == NUMHWORD || (x) == ASCIIHWORD || (x) == HWORD)
#define NONWORDTOKEN(x) \
	((x) == SPACE || HLIDREPLACE(x) || HLIDSKIP(x))
#define NOENDTOKEN(x) \
	(NONWORDTOKEN(x) || \
	 (x) == SCIENTIFIC || (x) == VERSIONNUMBER || (x) == DECIMAL_T || \
	 (x) == SIGNEDINT || (x) == UNSIGNEDINT || \
	 TS_IDIGNORE(x))
1934
/*
 * Helper macros for headline selection.  They are only usable where a
 * "HeadlineParsedText *prs" describing the text is in scope; BADENDPOINT
 * additionally needs "int shortword", the "short word" length parameter.
 */

/* A word is interesting if it is a search term and not flagged as repeated */
#define INTERESTINGWORD(j) \
	(prs->words[j].item && !prs->words[j].repeated)

/*
 * A word is a bad place to end a headline if it is not interesting and is
 * either a non-ending token type or no longer than shortword.
 */
#define BADENDPOINT(j) \
	(!INTERESTINGWORD(j) && \
	 (NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword))
1949
1950 typedef struct
1951 {
1952 /* one cover (well, really one fragment) for mark_hl_fragments */
1953 int32 startpos; /* fragment's starting word index */
1954 int32 endpos; /* ending word index (inclusive) */
1955 int32 poslen; /* number of interesting words */
1956 int32 curlen; /* total number of words */
1957 bool chosen; /* chosen? */
1958 bool excluded; /* excluded? */
1959 } CoverPos;
1960
1961 typedef struct
1962 {
1963 /* callback data for checkcondition_HL */
1964 HeadlineWordEntry *words;
1965 int len;
1966 } hlCheck;
1967
1968
1969 /*
1970 * TS_execute callback for matching a tsquery operand to headline words
1971 */
1972 static bool
checkcondition_HL(void * opaque,QueryOperand * val,ExecPhraseData * data)1973 checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
1974 {
1975 hlCheck *checkval = (hlCheck *) opaque;
1976 int i;
1977
1978 /* scan words array for marching items */
1979 for (i = 0; i < checkval->len; i++)
1980 {
1981 if (checkval->words[i].item == val)
1982 {
1983 /* if data == NULL, don't need to report positions */
1984 if (!data)
1985 return true;
1986
1987 if (!data->pos)
1988 {
1989 data->pos = palloc(sizeof(WordEntryPos) * checkval->len);
1990 data->allocated = true;
1991 data->npos = 1;
1992 data->pos[0] = checkval->words[i].pos;
1993 }
1994 else if (data->pos[data->npos - 1] < checkval->words[i].pos)
1995 {
1996 data->pos[data->npos++] = checkval->words[i].pos;
1997 }
1998 }
1999 }
2000
2001 if (data && data->npos > 0)
2002 return true;
2003
2004 return false;
2005 }
2006
2007 /*
2008 * hlFirstIndex: find first index >= pos containing any word used in query
2009 *
2010 * Returns -1 if no such index
2011 */
2012 static int
hlFirstIndex(HeadlineParsedText * prs,int pos)2013 hlFirstIndex(HeadlineParsedText *prs, int pos)
2014 {
2015 int i;
2016
2017 for (i = pos; i < prs->curwords; i++)
2018 {
2019 if (prs->words[i].item != NULL)
2020 return i;
2021 }
2022 return -1;
2023 }
2024
2025 /*
2026 * hlCover: try to find a substring of prs' word list that satisfies query
2027 *
2028 * At entry, *p must be the first word index to consider (initialize this
2029 * to zero, or to the next index after a previous successful search).
2030 * We will consider all substrings starting at or after that word, and
2031 * containing no more than max_cover words. (We need a length limit to
2032 * keep this from taking O(N^2) time for a long document with many query
2033 * words but few complete matches. Actually, since checkcondition_HL is
2034 * roughly O(N) in the length of the substring being checked, it's even
2035 * worse than that.)
2036 *
2037 * On success, sets *p to first word index and *q to last word index of the
2038 * cover substring, and returns true.
2039 *
2040 * The result is a minimal cover, in the sense that both *p and *q will be
2041 * words used in the query.
2042 */
2043 static bool
hlCover(HeadlineParsedText * prs,TSQuery query,int max_cover,int * p,int * q)2044 hlCover(HeadlineParsedText *prs, TSQuery query, int max_cover,
2045 int *p, int *q)
2046 {
2047 int pmin,
2048 pmax,
2049 nextpmin,
2050 nextpmax;
2051 hlCheck ch;
2052
2053 /*
2054 * We look for the earliest, shortest substring of prs->words that
2055 * satisfies the query. Both the pmin and pmax indices must be words
2056 * appearing in the query; there's no point in trying endpoints in between
2057 * such points.
2058 */
2059 pmin = hlFirstIndex(prs, *p);
2060 while (pmin >= 0)
2061 {
2062 /* This useless assignment just keeps stupider compilers quiet */
2063 nextpmin = -1;
2064 /* Consider substrings starting at pmin */
2065 ch.words = &(prs->words[pmin]);
2066 /* Consider the length-one substring first, then longer substrings */
2067 pmax = pmin;
2068 do
2069 {
2070 /* Try to match query against pmin .. pmax substring */
2071 ch.len = pmax - pmin + 1;
2072 if (TS_execute(GETQUERY(query), &ch,
2073 TS_EXEC_EMPTY, checkcondition_HL))
2074 {
2075 *p = pmin;
2076 *q = pmax;
2077 return true;
2078 }
2079 /* Nope, so advance pmax to next feasible endpoint */
2080 nextpmax = hlFirstIndex(prs, pmax + 1);
2081
2082 /*
2083 * If this is our first advance past pmin, then the result is also
2084 * the next feasible value of pmin; remember it to save a
2085 * redundant search.
2086 */
2087 if (pmax == pmin)
2088 nextpmin = nextpmax;
2089 pmax = nextpmax;
2090 }
2091 while (pmax >= 0 && pmax - pmin < max_cover);
2092 /* No luck here, so try next feasible startpoint */
2093 pmin = nextpmin;
2094 }
2095 return false;
2096 }
2097
2098 /*
2099 * Apply suitable highlight marking to words selected by headline selector
2100 *
2101 * The words from startpos to endpos inclusive are marked per highlightall
2102 */
2103 static void
mark_fragment(HeadlineParsedText * prs,bool highlightall,int startpos,int endpos)2104 mark_fragment(HeadlineParsedText *prs, bool highlightall,
2105 int startpos, int endpos)
2106 {
2107 int i;
2108
2109 for (i = startpos; i <= endpos; i++)
2110 {
2111 if (prs->words[i].item)
2112 prs->words[i].selected = 1;
2113 if (!highlightall)
2114 {
2115 if (HLIDREPLACE(prs->words[i].type))
2116 prs->words[i].replace = 1;
2117 else if (HLIDSKIP(prs->words[i].type))
2118 prs->words[i].skip = 1;
2119 }
2120 else
2121 {
2122 if (XMLHLIDSKIP(prs->words[i].type))
2123 prs->words[i].skip = 1;
2124 }
2125
2126 prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
2127 }
2128 }
2129
2130 /*
2131 * split a cover substring into fragments not longer than max_words
2132 *
2133 * At entry, *startpos and *endpos are the (remaining) bounds of the cover
2134 * substring. They are updated to hold the bounds of the next fragment.
2135 *
2136 * *curlen and *poslen are set to the fragment's length, in words and
2137 * interesting words respectively.
2138 */
2139 static void
get_next_fragment(HeadlineParsedText * prs,int * startpos,int * endpos,int * curlen,int * poslen,int max_words)2140 get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
2141 int *curlen, int *poslen, int max_words)
2142 {
2143 int i;
2144
2145 /*
2146 * Objective: select a fragment of words between startpos and endpos such
2147 * that it has at most max_words and both ends have query words. If the
2148 * startpos and endpos are the endpoints of the cover and the cover has
2149 * fewer words than max_words, then this function should just return the
2150 * cover
2151 */
2152 /* first move startpos to an item */
2153 for (i = *startpos; i <= *endpos; i++)
2154 {
2155 *startpos = i;
2156 if (INTERESTINGWORD(i))
2157 break;
2158 }
2159 /* cut endpos to have only max_words */
2160 *curlen = 0;
2161 *poslen = 0;
2162 for (i = *startpos; i <= *endpos && *curlen < max_words; i++)
2163 {
2164 if (!NONWORDTOKEN(prs->words[i].type))
2165 *curlen += 1;
2166 if (INTERESTINGWORD(i))
2167 *poslen += 1;
2168 }
2169 /* if the cover was cut then move back endpos to a query item */
2170 if (*endpos > i)
2171 {
2172 *endpos = i;
2173 for (i = *endpos; i >= *startpos; i--)
2174 {
2175 *endpos = i;
2176 if (INTERESTINGWORD(i))
2177 break;
2178 if (!NONWORDTOKEN(prs->words[i].type))
2179 *curlen -= 1;
2180 }
2181 }
2182 }
2183
2184 /*
2185 * Headline selector used when MaxFragments > 0
2186 *
2187 * Note: in this mode, highlightall is disregarded for phrase selection;
2188 * it only controls presentation details.
2189 */
2190 static void
mark_hl_fragments(HeadlineParsedText * prs,TSQuery query,bool highlightall,int shortword,int min_words,int max_words,int max_fragments,int max_cover)2191 mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
2192 int shortword, int min_words,
2193 int max_words, int max_fragments, int max_cover)
2194 {
2195 int32 poslen,
2196 curlen,
2197 i,
2198 f,
2199 num_f = 0;
2200 int32 stretch,
2201 maxstretch,
2202 posmarker;
2203
2204 int32 startpos = 0,
2205 endpos = 0,
2206 p = 0,
2207 q = 0;
2208
2209 int32 numcovers = 0,
2210 maxcovers = 32;
2211
2212 int32 minI,
2213 minwords,
2214 maxitems;
2215 CoverPos *covers;
2216
2217 covers = palloc(maxcovers * sizeof(CoverPos));
2218
2219 /* get all covers */
2220 while (hlCover(prs, query, max_cover, &p, &q))
2221 {
2222 startpos = p;
2223 endpos = q;
2224
2225 /*
2226 * Break the cover into smaller fragments such that each fragment has
2227 * at most max_words. Also ensure that each end of each fragment is a
2228 * query word. This will allow us to stretch the fragment in either
2229 * direction
2230 */
2231
2232 while (startpos <= endpos)
2233 {
2234 get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
2235 if (numcovers >= maxcovers)
2236 {
2237 maxcovers *= 2;
2238 covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
2239 }
2240 covers[numcovers].startpos = startpos;
2241 covers[numcovers].endpos = endpos;
2242 covers[numcovers].curlen = curlen;
2243 covers[numcovers].poslen = poslen;
2244 covers[numcovers].chosen = false;
2245 covers[numcovers].excluded = false;
2246 numcovers++;
2247 startpos = endpos + 1;
2248 endpos = q;
2249 }
2250
2251 /* move p to generate the next cover */
2252 p++;
2253 }
2254
2255 /* choose best covers */
2256 for (f = 0; f < max_fragments; f++)
2257 {
2258 maxitems = 0;
2259 minwords = PG_INT32_MAX;
2260 minI = -1;
2261
2262 /*
2263 * Choose the cover that contains max items. In case of tie choose the
2264 * one with smaller number of words.
2265 */
2266 for (i = 0; i < numcovers; i++)
2267 {
2268 if (!covers[i].chosen && !covers[i].excluded &&
2269 (maxitems < covers[i].poslen ||
2270 (maxitems == covers[i].poslen &&
2271 minwords > covers[i].curlen)))
2272 {
2273 maxitems = covers[i].poslen;
2274 minwords = covers[i].curlen;
2275 minI = i;
2276 }
2277 }
2278 /* if a cover was found mark it */
2279 if (minI >= 0)
2280 {
2281 covers[minI].chosen = true;
2282 /* adjust the size of cover */
2283 startpos = covers[minI].startpos;
2284 endpos = covers[minI].endpos;
2285 curlen = covers[minI].curlen;
2286 /* stretch the cover if cover size is lower than max_words */
2287 if (curlen < max_words)
2288 {
2289 /* divide the stretch on both sides of cover */
2290 maxstretch = (max_words - curlen) / 2;
2291
2292 /*
2293 * first stretch the startpos stop stretching if 1. we hit the
2294 * beginning of document 2. exceed maxstretch 3. we hit an
2295 * already marked fragment
2296 */
2297 stretch = 0;
2298 posmarker = startpos;
2299 for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
2300 {
2301 if (!NONWORDTOKEN(prs->words[i].type))
2302 {
2303 curlen++;
2304 stretch++;
2305 }
2306 posmarker = i;
2307 }
2308 /* cut back startpos till we find a good endpoint */
2309 for (i = posmarker; i < startpos && BADENDPOINT(i); i++)
2310 {
2311 if (!NONWORDTOKEN(prs->words[i].type))
2312 curlen--;
2313 }
2314 startpos = i;
2315 /* now stretch the endpos as much as possible */
2316 posmarker = endpos;
2317 for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
2318 {
2319 if (!NONWORDTOKEN(prs->words[i].type))
2320 curlen++;
2321 posmarker = i;
2322 }
2323 /* cut back endpos till we find a good endpoint */
2324 for (i = posmarker; i > endpos && BADENDPOINT(i); i--)
2325 {
2326 if (!NONWORDTOKEN(prs->words[i].type))
2327 curlen--;
2328 }
2329 endpos = i;
2330 }
2331 covers[minI].startpos = startpos;
2332 covers[minI].endpos = endpos;
2333 covers[minI].curlen = curlen;
2334 /* Mark the chosen fragments (covers) */
2335 mark_fragment(prs, highlightall, startpos, endpos);
2336 num_f++;
2337 /* Exclude covers overlapping this one from future consideration */
2338 for (i = 0; i < numcovers; i++)
2339 {
2340 if (i != minI &&
2341 ((covers[i].startpos >= startpos &&
2342 covers[i].startpos <= endpos) ||
2343 (covers[i].endpos >= startpos &&
2344 covers[i].endpos <= endpos) ||
2345 (covers[i].startpos < startpos &&
2346 covers[i].endpos > endpos)))
2347 covers[i].excluded = true;
2348 }
2349 }
2350 else
2351 break; /* no selectable covers remain */
2352 }
2353
2354 /* show the first min_words words if we have not marked anything */
2355 if (num_f <= 0)
2356 {
2357 startpos = endpos = curlen = 0;
2358 for (i = 0; i < prs->curwords && curlen < min_words; i++)
2359 {
2360 if (!NONWORDTOKEN(prs->words[i].type))
2361 curlen++;
2362 endpos = i;
2363 }
2364 mark_fragment(prs, highlightall, startpos, endpos);
2365 }
2366
2367 pfree(covers);
2368 }
2369
2370 /*
2371 * Headline selector used when MaxFragments == 0
2372 */
2373 static void
mark_hl_words(HeadlineParsedText * prs,TSQuery query,bool highlightall,int shortword,int min_words,int max_words,int max_cover)2374 mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
2375 int shortword, int min_words, int max_words, int max_cover)
2376 {
2377 int p = 0,
2378 q = 0;
2379 int bestb = -1,
2380 beste = -1;
2381 int bestlen = -1;
2382 bool bestcover = false;
2383 int pose,
2384 posb,
2385 poslen,
2386 curlen;
2387 bool poscover;
2388 int i;
2389
2390 if (!highlightall)
2391 {
2392 /* examine all covers, select a headline using the best one */
2393 while (hlCover(prs, query, max_cover, &p, &q))
2394 {
2395 /*
2396 * Count words (curlen) and interesting words (poslen) within
2397 * cover, but stop once we reach max_words. This step doesn't
2398 * consider whether that's a good stopping point. posb and pose
2399 * are set to the start and end indexes of the possible headline.
2400 */
2401 curlen = 0;
2402 poslen = 0;
2403 posb = pose = p;
2404 for (i = p; i <= q && curlen < max_words; i++)
2405 {
2406 if (!NONWORDTOKEN(prs->words[i].type))
2407 curlen++;
2408 if (INTERESTINGWORD(i))
2409 poslen++;
2410 pose = i;
2411 }
2412
2413 if (curlen < max_words)
2414 {
2415 /*
2416 * We have room to lengthen the headline, so search forward
2417 * until it's full or we find a good stopping point. We'll
2418 * reconsider the word at "q", then move forward.
2419 */
2420 for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
2421 {
2422 if (i > q)
2423 {
2424 if (!NONWORDTOKEN(prs->words[i].type))
2425 curlen++;
2426 if (INTERESTINGWORD(i))
2427 poslen++;
2428 }
2429 pose = i;
2430 if (BADENDPOINT(i))
2431 continue;
2432 if (curlen >= min_words)
2433 break;
2434 }
2435 if (curlen < min_words)
2436 {
2437 /*
2438 * Reached end of text and our headline is still shorter
2439 * than min_words, so try to extend it to the left.
2440 */
2441 for (i = p - 1; i >= 0; i--)
2442 {
2443 if (!NONWORDTOKEN(prs->words[i].type))
2444 curlen++;
2445 if (INTERESTINGWORD(i))
2446 poslen++;
2447 if (curlen >= max_words)
2448 break;
2449 if (BADENDPOINT(i))
2450 continue;
2451 if (curlen >= min_words)
2452 break;
2453 }
2454 posb = (i >= 0) ? i : 0;
2455 }
2456 }
2457 else
2458 {
2459 /*
2460 * Can't make headline longer, so consider making it shorter
2461 * if needed to avoid a bad endpoint.
2462 */
2463 if (i > q)
2464 i = q;
2465 for (; curlen > min_words; i--)
2466 {
2467 if (!BADENDPOINT(i))
2468 break;
2469 if (!NONWORDTOKEN(prs->words[i].type))
2470 curlen--;
2471 if (INTERESTINGWORD(i))
2472 poslen--;
2473 pose = i - 1;
2474 }
2475 }
2476
2477 /*
2478 * Check whether the proposed headline includes the original
2479 * cover; it might not if we trimmed it due to max_words.
2480 */
2481 poscover = (posb <= p && pose >= q);
2482
2483 /*
2484 * Adopt this headline if it's better than the last one, giving
2485 * highest priority to headlines including the cover, then to
2486 * headlines with more interesting words, then to headlines with
2487 * good stopping points. (Since bestlen is initially -1, we will
2488 * certainly adopt the first headline.)
2489 */
2490 if (poscover > bestcover ||
2491 (poscover == bestcover && poslen > bestlen) ||
2492 (poscover == bestcover && poslen == bestlen &&
2493 !BADENDPOINT(pose) && BADENDPOINT(beste)))
2494 {
2495 bestb = posb;
2496 beste = pose;
2497 bestlen = poslen;
2498 bestcover = poscover;
2499 }
2500
2501 /* move p to generate the next cover */
2502 p++;
2503 }
2504
2505 /*
2506 * If we found nothing acceptable, select min_words words starting at
2507 * the beginning.
2508 */
2509 if (bestlen < 0)
2510 {
2511 curlen = 0;
2512 pose = 0;
2513 for (i = 0; i < prs->curwords && curlen < min_words; i++)
2514 {
2515 if (!NONWORDTOKEN(prs->words[i].type))
2516 curlen++;
2517 pose = i;
2518 }
2519 bestb = 0;
2520 beste = pose;
2521 }
2522 }
2523 else
2524 {
2525 /* highlightall mode: headline is whole document */
2526 bestb = 0;
2527 beste = prs->curwords - 1;
2528 }
2529
2530 mark_fragment(prs, highlightall, bestb, beste);
2531 }
2532
2533 /*
2534 * Default parser's prsheadline function
2535 */
2536 Datum
prsd_headline(PG_FUNCTION_ARGS)2537 prsd_headline(PG_FUNCTION_ARGS)
2538 {
2539 HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
2540 List *prsoptions = (List *) PG_GETARG_POINTER(1);
2541 TSQuery query = PG_GETARG_TSQUERY(2);
2542
2543 /* default option values: */
2544 int min_words = 15;
2545 int max_words = 35;
2546 int shortword = 3;
2547 int max_fragments = 0;
2548 bool highlightall = false;
2549 int max_cover;
2550 ListCell *l;
2551
2552 /* Extract configuration option values */
2553 prs->startsel = NULL;
2554 prs->stopsel = NULL;
2555 prs->fragdelim = NULL;
2556 foreach(l, prsoptions)
2557 {
2558 DefElem *defel = (DefElem *) lfirst(l);
2559 char *val = defGetString(defel);
2560
2561 if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
2562 max_words = pg_strtoint32(val);
2563 else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
2564 min_words = pg_strtoint32(val);
2565 else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
2566 shortword = pg_strtoint32(val);
2567 else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
2568 max_fragments = pg_strtoint32(val);
2569 else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
2570 prs->startsel = pstrdup(val);
2571 else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
2572 prs->stopsel = pstrdup(val);
2573 else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
2574 prs->fragdelim = pstrdup(val);
2575 else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
2576 highlightall = (pg_strcasecmp(val, "1") == 0 ||
2577 pg_strcasecmp(val, "on") == 0 ||
2578 pg_strcasecmp(val, "true") == 0 ||
2579 pg_strcasecmp(val, "t") == 0 ||
2580 pg_strcasecmp(val, "y") == 0 ||
2581 pg_strcasecmp(val, "yes") == 0);
2582 else
2583 ereport(ERROR,
2584 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2585 errmsg("unrecognized headline parameter: \"%s\"",
2586 defel->defname)));
2587 }
2588
2589 /*
2590 * We might eventually make max_cover a user-settable parameter, but for
2591 * now, just compute a reasonable value based on max_words and
2592 * max_fragments.
2593 */
2594 max_cover = Max(max_words * 10, 100);
2595 if (max_fragments > 0)
2596 max_cover *= max_fragments;
2597
2598 /* in HighlightAll mode these parameters are ignored */
2599 if (!highlightall)
2600 {
2601 if (min_words >= max_words)
2602 ereport(ERROR,
2603 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2604 errmsg("MinWords should be less than MaxWords")));
2605 if (min_words <= 0)
2606 ereport(ERROR,
2607 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2608 errmsg("MinWords should be positive")));
2609 if (shortword < 0)
2610 ereport(ERROR,
2611 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2612 errmsg("ShortWord should be >= 0")));
2613 if (max_fragments < 0)
2614 ereport(ERROR,
2615 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2616 errmsg("MaxFragments should be >= 0")));
2617 }
2618
2619 /* Apply appropriate headline selector */
2620 if (max_fragments == 0)
2621 mark_hl_words(prs, query, highlightall, shortword,
2622 min_words, max_words, max_cover);
2623 else
2624 mark_hl_fragments(prs, query, highlightall, shortword,
2625 min_words, max_words, max_fragments, max_cover);
2626
2627 /* Fill in default values for string options */
2628 if (!prs->startsel)
2629 prs->startsel = pstrdup("<b>");
2630 if (!prs->stopsel)
2631 prs->stopsel = pstrdup("</b>");
2632 if (!prs->fragdelim)
2633 prs->fragdelim = pstrdup(" ... ");
2634
2635 /* Caller will need these lengths, too */
2636 prs->startsellen = strlen(prs->startsel);
2637 prs->stopsellen = strlen(prs->stopsel);
2638 prs->fragdelimlen = strlen(prs->fragdelim);
2639
2640 PG_RETURN_POINTER(prs);
2641 }
2642