1 /*-------------------------------------------------------------------------
2 *
3 * wparser_def.c
4 * Default text search parser
5 *
6 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 *
8 *
9 * IDENTIFICATION
10 * src/backend/tsearch/wparser_def.c
11 *
12 *-------------------------------------------------------------------------
13 */
14
15 #include "postgres.h"
16
17 #include <limits.h>
18
19 #include "catalog/pg_collation.h"
20 #include "commands/defrem.h"
21 #include "tsearch/ts_locale.h"
22 #include "tsearch/ts_public.h"
23 #include "tsearch/ts_type.h"
24 #include "tsearch/ts_utils.h"
25 #include "utils/builtins.h"
26
27
28 /* Define me to enable tracing of parser behavior */
29 /* #define WPARSER_TRACE */
30
31
/*
 * Output token categories.
 *
 * These are the numeric token types the parser reports (see the "type"
 * field of TParser).  The tok_alias[] and lex_descr[] tables that follow are
 * laid out in this same order, with slot 0 left as an empty placeholder, so
 * any change here has to be mirrored in both tables.
 */

#define ASCIIWORD		1
#define WORD_T			2
#define NUMWORD			3
#define EMAIL			4
#define URL_T			5
#define HOST			6
#define SCIENTIFIC		7
#define VERSIONNUMBER	8
#define NUMPARTHWORD	9
#define PARTHWORD		10
#define ASCIIPARTHWORD	11
#define SPACE			12
#define TAG_T			13
#define PROTOCOL		14
#define NUMHWORD		15
#define ASCIIHWORD		16
#define HWORD			17
#define URLPATH			18
#define FILEPATH		19
#define DECIMAL_T		20
#define SIGNEDINT		21
#define UNSIGNEDINT		22
#define XMLENTITY		23

/* Highest token category number defined above */
#define LASTNUM			23
59
/*
 * Short alias for each token category.  Entries are positional: slot N holds
 * the alias for token category N, and slot 0 is an empty placeholder because
 * category numbering starts at 1.
 */
static const char *const tok_alias[] = {
	"",							/* 0: placeholder */
	"asciiword",				/* 1: ASCIIWORD */
	"word",						/* 2: WORD_T */
	"numword",					/* 3: NUMWORD */
	"email",					/* 4: EMAIL */
	"url",						/* 5: URL_T */
	"host",						/* 6: HOST */
	"sfloat",					/* 7: SCIENTIFIC */
	"version",					/* 8: VERSIONNUMBER */
	"hword_numpart",			/* 9: NUMPARTHWORD */
	"hword_part",				/* 10: PARTHWORD */
	"hword_asciipart",			/* 11: ASCIIPARTHWORD */
	"blank",					/* 12: SPACE */
	"tag",						/* 13: TAG_T */
	"protocol",					/* 14: PROTOCOL */
	"numhword",					/* 15: NUMHWORD */
	"asciihword",				/* 16: ASCIIHWORD */
	"hword",					/* 17: HWORD */
	"url_path",					/* 18: URLPATH */
	"file",						/* 19: FILEPATH */
	"float",					/* 20: DECIMAL_T */
	"int",						/* 21: SIGNEDINT */
	"uint",						/* 22: UNSIGNEDINT */
	"entity"					/* 23: XMLENTITY */
};
86
/*
 * Human-readable description of each token category.  Entries are
 * positional: slot N describes token category N, and slot 0 is an empty
 * placeholder because category numbering starts at 1.
 */
static const char *const lex_descr[] = {
	"",							/* 0: placeholder */
	"Word, all ASCII",			/* 1: ASCIIWORD */
	"Word, all letters",		/* 2: WORD_T */
	"Word, letters and digits", /* 3: NUMWORD */
	"Email address",			/* 4: EMAIL */
	"URL",						/* 5: URL_T */
	"Host",						/* 6: HOST */
	"Scientific notation",		/* 7: SCIENTIFIC */
	"Version number",			/* 8: VERSIONNUMBER */
	"Hyphenated word part, letters and digits", /* 9: NUMPARTHWORD */
	"Hyphenated word part, all letters",	/* 10: PARTHWORD */
	"Hyphenated word part, all ASCII",	/* 11: ASCIIPARTHWORD */
	"Space symbols",			/* 12: SPACE */
	"XML tag",					/* 13: TAG_T */
	"Protocol head",			/* 14: PROTOCOL */
	"Hyphenated word, letters and digits",	/* 15: NUMHWORD */
	"Hyphenated word, all ASCII",	/* 16: ASCIIHWORD */
	"Hyphenated word, all letters", /* 17: HWORD */
	"URL path",					/* 18: URLPATH */
	"File or path name",		/* 19: FILEPATH */
	"Decimal notation",			/* 20: DECIMAL_T */
	"Signed integer",			/* 21: SIGNEDINT */
	"Unsigned integer",			/* 22: UNSIGNEDINT */
	"XML entity"				/* 23: XMLENTITY */
};
113
114
/*
 * Parser states.
 *
 * Values are assigned implicitly starting from TPS_Base = 0, so the order of
 * the enumerators is significant: do not reorder or insert entries without
 * checking every place that depends on the numeric values.  TPS_Null is not
 * a real state; it only marks the end of the list.
 */

typedef enum
{
	TPS_Base = 0,
	TPS_InNumWord,
	TPS_InAsciiWord,
	TPS_InWord,
	TPS_InUnsignedInt,
	TPS_InSignedIntFirst,
	TPS_InSignedInt,
	TPS_InSpace,
	TPS_InUDecimalFirst,
	TPS_InUDecimal,
	TPS_InDecimalFirst,
	TPS_InDecimal,
	TPS_InVerVersion,
	TPS_InSVerVersion,
	TPS_InVersionFirst,
	TPS_InVersion,
	TPS_InMantissaFirst,
	TPS_InMantissaSign,
	TPS_InMantissa,
	TPS_InXMLEntityFirst,
	TPS_InXMLEntity,
	TPS_InXMLEntityNumFirst,
	TPS_InXMLEntityNum,
	TPS_InXMLEntityHexNumFirst,
	TPS_InXMLEntityHexNum,
	TPS_InXMLEntityEnd,
	TPS_InTagFirst,
	TPS_InXMLBegin,
	TPS_InTagCloseFirst,
	TPS_InTagName,
	TPS_InTagBeginEnd,
	TPS_InTag,
	TPS_InTagEscapeK,
	TPS_InTagEscapeKK,
	TPS_InTagBackSleshed,
	TPS_InTagEnd,
	TPS_InCommentFirst,
	TPS_InCommentLast,
	TPS_InComment,
	TPS_InCloseCommentFirst,
	TPS_InCloseCommentLast,
	TPS_InCommentEnd,
	TPS_InHostFirstDomain,
	TPS_InHostDomainSecond,
	TPS_InHostDomain,
	TPS_InPortFirst,
	TPS_InPort,
	TPS_InHostFirstAN,
	TPS_InHost,
	TPS_InEmail,
	TPS_InFileFirst,
	TPS_InFileTwiddle,
	TPS_InPathFirst,
	TPS_InPathFirstFirst,
	TPS_InPathSecond,
	TPS_InFile,
	TPS_InFileNext,
	TPS_InURLPathFirst,
	TPS_InURLPathStart,
	TPS_InURLPath,
	TPS_InFURL,
	TPS_InProtocolFirst,
	TPS_InProtocolSecond,
	TPS_InProtocolEnd,
	TPS_InHyphenAsciiWordFirst,
	TPS_InHyphenAsciiWord,
	TPS_InHyphenWordFirst,
	TPS_InHyphenWord,
	TPS_InHyphenNumWordFirst,
	TPS_InHyphenNumWord,
	TPS_InHyphenDigitLookahead,
	TPS_InParseHyphen,
	TPS_InParseHyphenHyphen,
	TPS_InHyphenWordPart,
	TPS_InHyphenAsciiWordPart,
	TPS_InHyphenNumWordPart,
	TPS_InHyphenUnsignedInt,
	TPS_Null					/* last state (fake value) */
} TParserState;
198
/* forward declaration, so the callback types below can refer to the parser */
struct TParser;

/*
 * Character-test callback: any of the p_is* functions except p_iseq (which
 * takes an extra argument).  Returns nonzero when the test succeeds.
 */
typedef int (*TParserCharTest) (struct TParser *);

/* Special handler invoked for special cases in the state tables */
typedef void (*TParserSpecial) (struct TParser *);
206
207 typedef struct
208 {
209 TParserCharTest isclass;
210 char c;
211 uint16 flags;
212 TParserState tostate;
213 int type;
214 TParserSpecial special;
215 } TParserStateActionItem;
216
/*
 * Flag bits in TParserStateActionItem.flags.
 *
 * A_NEXT is zero, i.e. it denotes the absence of every other flag; the rest
 * are distinct single bits and may be OR'ed together.
 */
#define A_NEXT		0x0000
#define A_BINGO		0x0001
#define A_POP		0x0002
#define A_PUSH		0x0004
#define A_RERUN		0x0008
#define A_CLEAR		0x0010
#define A_MERGE		0x0020
#define A_CLRALL	0x0040
226
227 typedef struct TParserPosition
228 {
229 int posbyte; /* position of parser in bytes */
230 int poschar; /* position of parser in characters */
231 int charlen; /* length of current char */
232 int lenbytetoken; /* length of token-so-far in bytes */
233 int lenchartoken; /* and in chars */
234 TParserState state;
235 struct TParserPosition *prev;
236 const TParserStateActionItem *pushedAtAction;
237 } TParserPosition;
238
239 typedef struct TParser
240 {
241 /* string and position information */
242 char *str; /* multibyte string */
243 int lenstr; /* length of mbstring */
244 #ifdef USE_WIDE_UPPER_LOWER
245 wchar_t *wstr; /* wide character string */
246 pg_wchar *pgwstr; /* wide character string for C-locale */
247 bool usewide;
248 #endif
249
250 /* State of parse */
251 int charmaxlen;
252 TParserPosition *state;
253 bool ignore;
254 bool wanthost;
255
256 /* silly char */
257 char c;
258
259 /* out */
260 char *token;
261 int lenbytetoken;
262 int lenchartoken;
263 int type;
264 } TParser;
265
266
267 /* forward decls here */
268 static bool TParserGet(TParser *prs);
269
270
271 static TParserPosition *
newTParserPosition(TParserPosition * prev)272 newTParserPosition(TParserPosition *prev)
273 {
274 TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition));
275
276 if (prev)
277 memcpy(res, prev, sizeof(TParserPosition));
278 else
279 memset(res, 0, sizeof(TParserPosition));
280
281 res->prev = prev;
282
283 res->pushedAtAction = NULL;
284
285 return res;
286 }
287
288 static TParser *
TParserInit(char * str,int len)289 TParserInit(char *str, int len)
290 {
291 TParser *prs = (TParser *) palloc0(sizeof(TParser));
292
293 prs->charmaxlen = pg_database_encoding_max_length();
294 prs->str = str;
295 prs->lenstr = len;
296
297 #ifdef USE_WIDE_UPPER_LOWER
298
299 /*
300 * Use wide char code only when max encoding length > 1.
301 */
302 if (prs->charmaxlen > 1)
303 {
304 Oid collation = DEFAULT_COLLATION_OID; /* TODO */
305 pg_locale_t mylocale = 0; /* TODO */
306
307 prs->usewide = true;
308 if (lc_ctype_is_c(collation))
309 {
310 /*
311 * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
312 * be different from sizeof(wchar_t)
313 */
314 prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
315 pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
316 }
317 else
318 {
319 prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
320 char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
321 mylocale);
322 }
323 }
324 else
325 prs->usewide = false;
326 #endif
327
328 prs->state = newTParserPosition(NULL);
329 prs->state->state = TPS_Base;
330
331 #ifdef WPARSER_TRACE
332
333 /*
334 * Use of %.*s here is a bit risky since it can misbehave if the data is
335 * not in what libc thinks is the prevailing encoding. However, since
336 * this is just a debugging aid, we choose to live with that.
337 */
338 fprintf(stderr, "parsing \"%.*s\"\n", len, str);
339 #endif
340
341 return prs;
342 }
343
344 /*
345 * As an alternative to a full TParserInit one can create a
346 * TParserCopy which basically is a regular TParser without a private
347 * copy of the string - instead it uses the one from another TParser.
348 * This is useful because at some places TParsers are created
349 * recursively and the repeated copying around of the strings can
350 * cause major inefficiency if the source string is long.
351 * The new parser starts parsing at the original's current position.
352 *
353 * Obviously one must not close the original TParser before the copy.
354 */
355 static TParser *
TParserCopyInit(const TParser * orig)356 TParserCopyInit(const TParser *orig)
357 {
358 TParser *prs = (TParser *) palloc0(sizeof(TParser));
359
360 prs->charmaxlen = orig->charmaxlen;
361 prs->str = orig->str + orig->state->posbyte;
362 prs->lenstr = orig->lenstr - orig->state->posbyte;
363
364 #ifdef USE_WIDE_UPPER_LOWER
365 prs->usewide = orig->usewide;
366
367 if (orig->pgwstr)
368 prs->pgwstr = orig->pgwstr + orig->state->poschar;
369 if (orig->wstr)
370 prs->wstr = orig->wstr + orig->state->poschar;
371 #endif
372
373 prs->state = newTParserPosition(NULL);
374 prs->state->state = TPS_Base;
375
376 #ifdef WPARSER_TRACE
377 /* See note above about %.*s */
378 fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str);
379 #endif
380
381 return prs;
382 }
383
384
385 static void
TParserClose(TParser * prs)386 TParserClose(TParser *prs)
387 {
388 while (prs->state)
389 {
390 TParserPosition *ptr = prs->state->prev;
391
392 pfree(prs->state);
393 prs->state = ptr;
394 }
395
396 #ifdef USE_WIDE_UPPER_LOWER
397 if (prs->wstr)
398 pfree(prs->wstr);
399 if (prs->pgwstr)
400 pfree(prs->pgwstr);
401 #endif
402
403 #ifdef WPARSER_TRACE
404 fprintf(stderr, "closing parser\n");
405 #endif
406 pfree(prs);
407 }
408
409 /*
410 * Close a parser created with TParserCopyInit
411 */
412 static void
TParserCopyClose(TParser * prs)413 TParserCopyClose(TParser *prs)
414 {
415 while (prs->state)
416 {
417 TParserPosition *ptr = prs->state->prev;
418
419 pfree(prs->state);
420 prs->state = ptr;
421 }
422
423 #ifdef WPARSER_TRACE
424 fprintf(stderr, "closing parser copy\n");
425 #endif
426 pfree(prs);
427 }
428
429
/*
 * Character-type support functions, equivalent to the is* macros, but
 * working with any possible encoding and locale.  Notes:
 *	- with a multibyte encoding and the C locale, the isw* functions may
 *	  fail or give wrong results.
 *	- a multibyte encoding with the C locale is often used for
 *	  Asian languages.
 *	- if the locale is C then we use pgwstr instead of wstr.
 */
439
440 #ifdef USE_WIDE_UPPER_LOWER
441
/*
 * p_iswhat(type) expands to two predicates on the parser's current
 * character:
 *	p_is<type>()	- nonzero if the character belongs to ctype class <type>
 *	p_isnot<type>() - logical negation of p_is<type>()
 *
 * In wide mode with a pg_wchar string present, any code point above 0x7f is
 * reported as not belonging to the class and other values go through
 * is<type>().  In wide mode without it, isw<type>() is applied to the
 * wchar_t string.  Outside wide mode, is<type>() is applied to the current
 * byte of the multibyte string.
 */
#define p_iswhat(type) \
static int \
p_is##type(TParser *prs) \
{ \
	Assert(prs->state); \
	if (prs->usewide) \
	{ \
		if (prs->pgwstr) \
		{ \
			unsigned int c = *(prs->pgwstr + prs->state->poschar); \
			if (c > 0x7f) \
				return 0; \
			return is##type(c); \
		} \
		return isw##type(*(prs->wstr + prs->state->poschar)); \
	} \
	\
	return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \
} \
\
static int \
p_isnot##type(TParser *prs) \
{ \
	return !p_is##type(prs); \
}
465
466 static int
p_isalnum(TParser * prs)467 p_isalnum(TParser *prs)
468 {
469 Assert(prs->state);
470
471 if (prs->usewide)
472 {
473 if (prs->pgwstr)
474 {
475 unsigned int c = *(prs->pgwstr + prs->state->poschar);
476
477 /*
478 * any non-ascii symbol with multibyte encoding with C-locale is
479 * an alpha character
480 */
481 if (c > 0x7f)
482 return 1;
483
484 return isalnum(c);
485 }
486
487 return iswalnum(*(prs->wstr + prs->state->poschar));
488 }
489
490 return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte));
491 }
492 static int
p_isnotalnum(TParser * prs)493 p_isnotalnum(TParser *prs)
494 {
495 return !p_isalnum(prs);
496 }
497
498 static int
p_isalpha(TParser * prs)499 p_isalpha(TParser *prs)
500 {
501 Assert(prs->state);
502
503 if (prs->usewide)
504 {
505 if (prs->pgwstr)
506 {
507 unsigned int c = *(prs->pgwstr + prs->state->poschar);
508
509 /*
510 * any non-ascii symbol with multibyte encoding with C-locale is
511 * an alpha character
512 */
513 if (c > 0x7f)
514 return 1;
515
516 return isalpha(c);
517 }
518
519 return iswalpha(*(prs->wstr + prs->state->poschar));
520 }
521
522 return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte));
523 }
524
525 static int
p_isnotalpha(TParser * prs)526 p_isnotalpha(TParser *prs)
527 {
528 return !p_isalpha(prs);
529 }
530
531 /* p_iseq should be used only for ascii symbols */
532
533 static int
p_iseq(TParser * prs,char c)534 p_iseq(TParser *prs, char c)
535 {
536 Assert(prs->state);
537 return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
538 }
539 #else /* USE_WIDE_UPPER_LOWER */
540
/*
 * p_iswhat(type) expands to two predicates on the parser's current
 * character:
 *	p_is<type>()	- is<type>() applied to the current byte of the string
 *	p_isnot<type>() - logical negation of p_is<type>()
 *
 * This is the variant for builds without USE_WIDE_UPPER_LOWER; it only ever
 * looks at single bytes.
 */
#define p_iswhat(type) \
static int \
p_is##type(TParser *prs) \
{ \
	Assert(prs->state); \
	return is##type((unsigned char) *(prs->str + prs->state->posbyte)); \
} \
\
static int \
p_isnot##type(TParser *prs) \
{ \
	return !p_is##type(prs); \
}
552
553
554 static int
p_iseq(TParser * prs,char c)555 p_iseq(TParser *prs, char c)
556 {
557 Assert(prs->state);
558 return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
559 }
560
561 p_iswhat(alnum)
p_iswhat(alpha)562 p_iswhat(alpha)
563 #endif /* USE_WIDE_UPPER_LOWER */
564
565 p_iswhat(digit)
566 p_iswhat(lower)
567 p_iswhat(print)
568 p_iswhat(punct)
569 p_iswhat(space)
570 p_iswhat(upper)
571 p_iswhat(xdigit)
572
573 static int
574 p_isEOF(TParser *prs)
575 {
576 Assert(prs->state);
577 return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
578 }
579
580 static int
p_iseqC(TParser * prs)581 p_iseqC(TParser *prs)
582 {
583 return p_iseq(prs, prs->c);
584 }
585
586 static int
p_isneC(TParser * prs)587 p_isneC(TParser *prs)
588 {
589 return !p_iseq(prs, prs->c);
590 }
591
592 static int
p_isascii(TParser * prs)593 p_isascii(TParser *prs)
594 {
595 return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
596 }
597
598 static int
p_isasclet(TParser * prs)599 p_isasclet(TParser *prs)
600 {
601 return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
602 }
603
604 static int
p_isurlchar(TParser * prs)605 p_isurlchar(TParser *prs)
606 {
607 char ch;
608
609 /* no non-ASCII need apply */
610 if (prs->state->charlen != 1)
611 return 0;
612 ch = *(prs->str + prs->state->posbyte);
613 /* no spaces or control characters */
614 if (ch <= 0x20 || ch >= 0x7F)
615 return 0;
616 /* reject characters disallowed by RFC 3986 */
617 switch (ch)
618 {
619 case '"':
620 case '<':
621 case '>':
622 case '\\':
623 case '^':
624 case '`':
625 case '{':
626 case '|':
627 case '}':
628 return 0;
629 }
630 return 1;
631 }
632
633
634 /* deliberately suppress unused-function complaints for the above */
635 void _make_compiler_happy(void);
636 void
_make_compiler_happy(void)637 _make_compiler_happy(void)
638 {
639 p_isalnum(NULL);
640 p_isnotalnum(NULL);
641 p_isalpha(NULL);
642 p_isnotalpha(NULL);
643 p_isdigit(NULL);
644 p_isnotdigit(NULL);
645 p_islower(NULL);
646 p_isnotlower(NULL);
647 p_isprint(NULL);
648 p_isnotprint(NULL);
649 p_ispunct(NULL);
650 p_isnotpunct(NULL);
651 p_isspace(NULL);
652 p_isnotspace(NULL);
653 p_isupper(NULL);
654 p_isnotupper(NULL);
655 p_isxdigit(NULL);
656 p_isnotxdigit(NULL);
657 p_isEOF(NULL);
658 p_iseqC(NULL);
659 p_isneC(NULL);
660 }
661
662
663 static void
SpecialTags(TParser * prs)664 SpecialTags(TParser *prs)
665 {
666 switch (prs->state->lenchartoken)
667 {
668 case 8: /* </script */
669 if (pg_strncasecmp(prs->token, "</script", 8) == 0)
670 prs->ignore = false;
671 break;
672 case 7: /* <script || </style */
673 if (pg_strncasecmp(prs->token, "</style", 7) == 0)
674 prs->ignore = false;
675 else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
676 prs->ignore = true;
677 break;
678 case 6: /* <style */
679 if (pg_strncasecmp(prs->token, "<style", 6) == 0)
680 prs->ignore = true;
681 break;
682 default:
683 break;
684 }
685 }
686
687 static void
SpecialFURL(TParser * prs)688 SpecialFURL(TParser *prs)
689 {
690 prs->wanthost = true;
691 prs->state->posbyte -= prs->state->lenbytetoken;
692 prs->state->poschar -= prs->state->lenchartoken;
693 }
694
695 static void
SpecialHyphen(TParser * prs)696 SpecialHyphen(TParser *prs)
697 {
698 prs->state->posbyte -= prs->state->lenbytetoken;
699 prs->state->poschar -= prs->state->lenchartoken;
700 }
701
702 static void
SpecialVerVersion(TParser * prs)703 SpecialVerVersion(TParser *prs)
704 {
705 prs->state->posbyte -= prs->state->lenbytetoken;
706 prs->state->poschar -= prs->state->lenchartoken;
707 prs->state->lenbytetoken = 0;
708 prs->state->lenchartoken = 0;
709 }
710
711 static int
p_isstophost(TParser * prs)712 p_isstophost(TParser *prs)
713 {
714 if (prs->wanthost)
715 {
716 prs->wanthost = false;
717 return 1;
718 }
719 return 0;
720 }
721
722 static int
p_isignore(TParser * prs)723 p_isignore(TParser *prs)
724 {
725 return (prs->ignore) ? 1 : 0;
726 }
727
728 static int
p_ishost(TParser * prs)729 p_ishost(TParser *prs)
730 {
731 TParser *tmpprs = TParserCopyInit(prs);
732 int res = 0;
733
734 tmpprs->wanthost = true;
735
736 if (TParserGet(tmpprs) && tmpprs->type == HOST)
737 {
738 prs->state->posbyte += tmpprs->lenbytetoken;
739 prs->state->poschar += tmpprs->lenchartoken;
740 prs->state->lenbytetoken += tmpprs->lenbytetoken;
741 prs->state->lenchartoken += tmpprs->lenchartoken;
742 prs->state->charlen = tmpprs->state->charlen;
743 res = 1;
744 }
745 TParserCopyClose(tmpprs);
746
747 return res;
748 }
749
750 static int
p_isURLPath(TParser * prs)751 p_isURLPath(TParser *prs)
752 {
753 TParser *tmpprs = TParserCopyInit(prs);
754 int res = 0;
755
756 tmpprs->state = newTParserPosition(tmpprs->state);
757 tmpprs->state->state = TPS_InURLPathFirst;
758
759 if (TParserGet(tmpprs) && tmpprs->type == URLPATH)
760 {
761 prs->state->posbyte += tmpprs->lenbytetoken;
762 prs->state->poschar += tmpprs->lenchartoken;
763 prs->state->lenbytetoken += tmpprs->lenbytetoken;
764 prs->state->lenchartoken += tmpprs->lenchartoken;
765 prs->state->charlen = tmpprs->state->charlen;
766 res = 1;
767 }
768 TParserCopyClose(tmpprs);
769
770 return res;
771 }
772
773 /*
774 * returns true if current character has zero display length or
775 * it's a special sign in several languages. Such characters
776 * aren't a word-breaker although they aren't an isalpha.
777 * In beginning of word they aren't a part of it.
778 */
779 static int
p_isspecial(TParser * prs)780 p_isspecial(TParser *prs)
781 {
782 /*
783 * pg_dsplen could return -1 which means error or control character
784 */
785 if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
786 return 1;
787
788 #ifdef USE_WIDE_UPPER_LOWER
789
790 /*
791 * Unicode Characters in the 'Mark, Spacing Combining' Category That
792 * characters are not alpha although they are not breakers of word too.
793 * Check that only in utf encoding, because other encodings aren't
794 * supported by postgres or even exists.
795 */
796 if (GetDatabaseEncoding() == PG_UTF8 && prs->usewide)
797 {
798 static const pg_wchar strange_letter[] = {
799 /*
800 * use binary search, so elements should be ordered
801 */
802 0x0903, /* DEVANAGARI SIGN VISARGA */
803 0x093E, /* DEVANAGARI VOWEL SIGN AA */
804 0x093F, /* DEVANAGARI VOWEL SIGN I */
805 0x0940, /* DEVANAGARI VOWEL SIGN II */
806 0x0949, /* DEVANAGARI VOWEL SIGN CANDRA O */
807 0x094A, /* DEVANAGARI VOWEL SIGN SHORT O */
808 0x094B, /* DEVANAGARI VOWEL SIGN O */
809 0x094C, /* DEVANAGARI VOWEL SIGN AU */
810 0x0982, /* BENGALI SIGN ANUSVARA */
811 0x0983, /* BENGALI SIGN VISARGA */
812 0x09BE, /* BENGALI VOWEL SIGN AA */
813 0x09BF, /* BENGALI VOWEL SIGN I */
814 0x09C0, /* BENGALI VOWEL SIGN II */
815 0x09C7, /* BENGALI VOWEL SIGN E */
816 0x09C8, /* BENGALI VOWEL SIGN AI */
817 0x09CB, /* BENGALI VOWEL SIGN O */
818 0x09CC, /* BENGALI VOWEL SIGN AU */
819 0x09D7, /* BENGALI AU LENGTH MARK */
820 0x0A03, /* GURMUKHI SIGN VISARGA */
821 0x0A3E, /* GURMUKHI VOWEL SIGN AA */
822 0x0A3F, /* GURMUKHI VOWEL SIGN I */
823 0x0A40, /* GURMUKHI VOWEL SIGN II */
824 0x0A83, /* GUJARATI SIGN VISARGA */
825 0x0ABE, /* GUJARATI VOWEL SIGN AA */
826 0x0ABF, /* GUJARATI VOWEL SIGN I */
827 0x0AC0, /* GUJARATI VOWEL SIGN II */
828 0x0AC9, /* GUJARATI VOWEL SIGN CANDRA O */
829 0x0ACB, /* GUJARATI VOWEL SIGN O */
830 0x0ACC, /* GUJARATI VOWEL SIGN AU */
831 0x0B02, /* ORIYA SIGN ANUSVARA */
832 0x0B03, /* ORIYA SIGN VISARGA */
833 0x0B3E, /* ORIYA VOWEL SIGN AA */
834 0x0B40, /* ORIYA VOWEL SIGN II */
835 0x0B47, /* ORIYA VOWEL SIGN E */
836 0x0B48, /* ORIYA VOWEL SIGN AI */
837 0x0B4B, /* ORIYA VOWEL SIGN O */
838 0x0B4C, /* ORIYA VOWEL SIGN AU */
839 0x0B57, /* ORIYA AU LENGTH MARK */
840 0x0BBE, /* TAMIL VOWEL SIGN AA */
841 0x0BBF, /* TAMIL VOWEL SIGN I */
842 0x0BC1, /* TAMIL VOWEL SIGN U */
843 0x0BC2, /* TAMIL VOWEL SIGN UU */
844 0x0BC6, /* TAMIL VOWEL SIGN E */
845 0x0BC7, /* TAMIL VOWEL SIGN EE */
846 0x0BC8, /* TAMIL VOWEL SIGN AI */
847 0x0BCA, /* TAMIL VOWEL SIGN O */
848 0x0BCB, /* TAMIL VOWEL SIGN OO */
849 0x0BCC, /* TAMIL VOWEL SIGN AU */
850 0x0BD7, /* TAMIL AU LENGTH MARK */
851 0x0C01, /* TELUGU SIGN CANDRABINDU */
852 0x0C02, /* TELUGU SIGN ANUSVARA */
853 0x0C03, /* TELUGU SIGN VISARGA */
854 0x0C41, /* TELUGU VOWEL SIGN U */
855 0x0C42, /* TELUGU VOWEL SIGN UU */
856 0x0C43, /* TELUGU VOWEL SIGN VOCALIC R */
857 0x0C44, /* TELUGU VOWEL SIGN VOCALIC RR */
858 0x0C82, /* KANNADA SIGN ANUSVARA */
859 0x0C83, /* KANNADA SIGN VISARGA */
860 0x0CBE, /* KANNADA VOWEL SIGN AA */
861 0x0CC0, /* KANNADA VOWEL SIGN II */
862 0x0CC1, /* KANNADA VOWEL SIGN U */
863 0x0CC2, /* KANNADA VOWEL SIGN UU */
864 0x0CC3, /* KANNADA VOWEL SIGN VOCALIC R */
865 0x0CC4, /* KANNADA VOWEL SIGN VOCALIC RR */
866 0x0CC7, /* KANNADA VOWEL SIGN EE */
867 0x0CC8, /* KANNADA VOWEL SIGN AI */
868 0x0CCA, /* KANNADA VOWEL SIGN O */
869 0x0CCB, /* KANNADA VOWEL SIGN OO */
870 0x0CD5, /* KANNADA LENGTH MARK */
871 0x0CD6, /* KANNADA AI LENGTH MARK */
872 0x0D02, /* MALAYALAM SIGN ANUSVARA */
873 0x0D03, /* MALAYALAM SIGN VISARGA */
874 0x0D3E, /* MALAYALAM VOWEL SIGN AA */
875 0x0D3F, /* MALAYALAM VOWEL SIGN I */
876 0x0D40, /* MALAYALAM VOWEL SIGN II */
877 0x0D46, /* MALAYALAM VOWEL SIGN E */
878 0x0D47, /* MALAYALAM VOWEL SIGN EE */
879 0x0D48, /* MALAYALAM VOWEL SIGN AI */
880 0x0D4A, /* MALAYALAM VOWEL SIGN O */
881 0x0D4B, /* MALAYALAM VOWEL SIGN OO */
882 0x0D4C, /* MALAYALAM VOWEL SIGN AU */
883 0x0D57, /* MALAYALAM AU LENGTH MARK */
884 0x0D82, /* SINHALA SIGN ANUSVARAYA */
885 0x0D83, /* SINHALA SIGN VISARGAYA */
886 0x0DCF, /* SINHALA VOWEL SIGN AELA-PILLA */
887 0x0DD0, /* SINHALA VOWEL SIGN KETTI AEDA-PILLA */
888 0x0DD1, /* SINHALA VOWEL SIGN DIGA AEDA-PILLA */
889 0x0DD8, /* SINHALA VOWEL SIGN GAETTA-PILLA */
890 0x0DD9, /* SINHALA VOWEL SIGN KOMBUVA */
891 0x0DDA, /* SINHALA VOWEL SIGN DIGA KOMBUVA */
892 0x0DDB, /* SINHALA VOWEL SIGN KOMBU DEKA */
893 0x0DDC, /* SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA */
894 0x0DDD, /* SINHALA VOWEL SIGN KOMBUVA HAA DIGA
895 * AELA-PILLA */
896 0x0DDE, /* SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA */
897 0x0DDF, /* SINHALA VOWEL SIGN GAYANUKITTA */
898 0x0DF2, /* SINHALA VOWEL SIGN DIGA GAETTA-PILLA */
899 0x0DF3, /* SINHALA VOWEL SIGN DIGA GAYANUKITTA */
900 0x0F3E, /* TIBETAN SIGN YAR TSHES */
901 0x0F3F, /* TIBETAN SIGN MAR TSHES */
902 0x0F7F, /* TIBETAN SIGN RNAM BCAD */
903 0x102B, /* MYANMAR VOWEL SIGN TALL AA */
904 0x102C, /* MYANMAR VOWEL SIGN AA */
905 0x1031, /* MYANMAR VOWEL SIGN E */
906 0x1038, /* MYANMAR SIGN VISARGA */
907 0x103B, /* MYANMAR CONSONANT SIGN MEDIAL YA */
908 0x103C, /* MYANMAR CONSONANT SIGN MEDIAL RA */
909 0x1056, /* MYANMAR VOWEL SIGN VOCALIC R */
910 0x1057, /* MYANMAR VOWEL SIGN VOCALIC RR */
911 0x1062, /* MYANMAR VOWEL SIGN SGAW KAREN EU */
912 0x1063, /* MYANMAR TONE MARK SGAW KAREN HATHI */
913 0x1064, /* MYANMAR TONE MARK SGAW KAREN KE PHO */
914 0x1067, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN EU */
915 0x1068, /* MYANMAR VOWEL SIGN WESTERN PWO KAREN UE */
916 0x1069, /* MYANMAR SIGN WESTERN PWO KAREN TONE-1 */
917 0x106A, /* MYANMAR SIGN WESTERN PWO KAREN TONE-2 */
918 0x106B, /* MYANMAR SIGN WESTERN PWO KAREN TONE-3 */
919 0x106C, /* MYANMAR SIGN WESTERN PWO KAREN TONE-4 */
920 0x106D, /* MYANMAR SIGN WESTERN PWO KAREN TONE-5 */
921 0x1083, /* MYANMAR VOWEL SIGN SHAN AA */
922 0x1084, /* MYANMAR VOWEL SIGN SHAN E */
923 0x1087, /* MYANMAR SIGN SHAN TONE-2 */
924 0x1088, /* MYANMAR SIGN SHAN TONE-3 */
925 0x1089, /* MYANMAR SIGN SHAN TONE-5 */
926 0x108A, /* MYANMAR SIGN SHAN TONE-6 */
927 0x108B, /* MYANMAR SIGN SHAN COUNCIL TONE-2 */
928 0x108C, /* MYANMAR SIGN SHAN COUNCIL TONE-3 */
929 0x108F, /* MYANMAR SIGN RUMAI PALAUNG TONE-5 */
930 0x17B6, /* KHMER VOWEL SIGN AA */
931 0x17BE, /* KHMER VOWEL SIGN OE */
932 0x17BF, /* KHMER VOWEL SIGN YA */
933 0x17C0, /* KHMER VOWEL SIGN IE */
934 0x17C1, /* KHMER VOWEL SIGN E */
935 0x17C2, /* KHMER VOWEL SIGN AE */
936 0x17C3, /* KHMER VOWEL SIGN AI */
937 0x17C4, /* KHMER VOWEL SIGN OO */
938 0x17C5, /* KHMER VOWEL SIGN AU */
939 0x17C7, /* KHMER SIGN REAHMUK */
940 0x17C8, /* KHMER SIGN YUUKALEAPINTU */
941 0x1923, /* LIMBU VOWEL SIGN EE */
942 0x1924, /* LIMBU VOWEL SIGN AI */
943 0x1925, /* LIMBU VOWEL SIGN OO */
944 0x1926, /* LIMBU VOWEL SIGN AU */
945 0x1929, /* LIMBU SUBJOINED LETTER YA */
946 0x192A, /* LIMBU SUBJOINED LETTER RA */
947 0x192B, /* LIMBU SUBJOINED LETTER WA */
948 0x1930, /* LIMBU SMALL LETTER KA */
949 0x1931, /* LIMBU SMALL LETTER NGA */
950 0x1933, /* LIMBU SMALL LETTER TA */
951 0x1934, /* LIMBU SMALL LETTER NA */
952 0x1935, /* LIMBU SMALL LETTER PA */
953 0x1936, /* LIMBU SMALL LETTER MA */
954 0x1937, /* LIMBU SMALL LETTER RA */
955 0x1938, /* LIMBU SMALL LETTER LA */
956 0x19B0, /* NEW TAI LUE VOWEL SIGN VOWEL SHORTENER */
957 0x19B1, /* NEW TAI LUE VOWEL SIGN AA */
958 0x19B2, /* NEW TAI LUE VOWEL SIGN II */
959 0x19B3, /* NEW TAI LUE VOWEL SIGN U */
960 0x19B4, /* NEW TAI LUE VOWEL SIGN UU */
961 0x19B5, /* NEW TAI LUE VOWEL SIGN E */
962 0x19B6, /* NEW TAI LUE VOWEL SIGN AE */
963 0x19B7, /* NEW TAI LUE VOWEL SIGN O */
964 0x19B8, /* NEW TAI LUE VOWEL SIGN OA */
965 0x19B9, /* NEW TAI LUE VOWEL SIGN UE */
966 0x19BA, /* NEW TAI LUE VOWEL SIGN AY */
967 0x19BB, /* NEW TAI LUE VOWEL SIGN AAY */
968 0x19BC, /* NEW TAI LUE VOWEL SIGN UY */
969 0x19BD, /* NEW TAI LUE VOWEL SIGN OY */
970 0x19BE, /* NEW TAI LUE VOWEL SIGN OAY */
971 0x19BF, /* NEW TAI LUE VOWEL SIGN UEY */
972 0x19C0, /* NEW TAI LUE VOWEL SIGN IY */
973 0x19C8, /* NEW TAI LUE TONE MARK-1 */
974 0x19C9, /* NEW TAI LUE TONE MARK-2 */
975 0x1A19, /* BUGINESE VOWEL SIGN E */
976 0x1A1A, /* BUGINESE VOWEL SIGN O */
977 0x1A1B, /* BUGINESE VOWEL SIGN AE */
978 0x1B04, /* BALINESE SIGN BISAH */
979 0x1B35, /* BALINESE VOWEL SIGN TEDUNG */
980 0x1B3B, /* BALINESE VOWEL SIGN RA REPA TEDUNG */
981 0x1B3D, /* BALINESE VOWEL SIGN LA LENGA TEDUNG */
982 0x1B3E, /* BALINESE VOWEL SIGN TALING */
983 0x1B3F, /* BALINESE VOWEL SIGN TALING REPA */
984 0x1B40, /* BALINESE VOWEL SIGN TALING TEDUNG */
985 0x1B41, /* BALINESE VOWEL SIGN TALING REPA TEDUNG */
986 0x1B43, /* BALINESE VOWEL SIGN PEPET TEDUNG */
987 0x1B44, /* BALINESE ADEG ADEG */
988 0x1B82, /* SUNDANESE SIGN PANGWISAD */
989 0x1BA1, /* SUNDANESE CONSONANT SIGN PAMINGKAL */
990 0x1BA6, /* SUNDANESE VOWEL SIGN PANAELAENG */
991 0x1BA7, /* SUNDANESE VOWEL SIGN PANOLONG */
992 0x1BAA, /* SUNDANESE SIGN PAMAAEH */
993 0x1C24, /* LEPCHA SUBJOINED LETTER YA */
994 0x1C25, /* LEPCHA SUBJOINED LETTER RA */
995 0x1C26, /* LEPCHA VOWEL SIGN AA */
996 0x1C27, /* LEPCHA VOWEL SIGN I */
997 0x1C28, /* LEPCHA VOWEL SIGN O */
998 0x1C29, /* LEPCHA VOWEL SIGN OO */
999 0x1C2A, /* LEPCHA VOWEL SIGN U */
1000 0x1C2B, /* LEPCHA VOWEL SIGN UU */
1001 0x1C34, /* LEPCHA CONSONANT SIGN NYIN-DO */
1002 0x1C35, /* LEPCHA CONSONANT SIGN KANG */
1003 0xA823, /* SYLOTI NAGRI VOWEL SIGN A */
1004 0xA824, /* SYLOTI NAGRI VOWEL SIGN I */
1005 0xA827, /* SYLOTI NAGRI VOWEL SIGN OO */
1006 0xA880, /* SAURASHTRA SIGN ANUSVARA */
1007 0xA881, /* SAURASHTRA SIGN VISARGA */
1008 0xA8B4, /* SAURASHTRA CONSONANT SIGN HAARU */
1009 0xA8B5, /* SAURASHTRA VOWEL SIGN AA */
1010 0xA8B6, /* SAURASHTRA VOWEL SIGN I */
1011 0xA8B7, /* SAURASHTRA VOWEL SIGN II */
1012 0xA8B8, /* SAURASHTRA VOWEL SIGN U */
1013 0xA8B9, /* SAURASHTRA VOWEL SIGN UU */
1014 0xA8BA, /* SAURASHTRA VOWEL SIGN VOCALIC R */
1015 0xA8BB, /* SAURASHTRA VOWEL SIGN VOCALIC RR */
1016 0xA8BC, /* SAURASHTRA VOWEL SIGN VOCALIC L */
1017 0xA8BD, /* SAURASHTRA VOWEL SIGN VOCALIC LL */
1018 0xA8BE, /* SAURASHTRA VOWEL SIGN E */
1019 0xA8BF, /* SAURASHTRA VOWEL SIGN EE */
1020 0xA8C0, /* SAURASHTRA VOWEL SIGN AI */
1021 0xA8C1, /* SAURASHTRA VOWEL SIGN O */
1022 0xA8C2, /* SAURASHTRA VOWEL SIGN OO */
1023 0xA8C3, /* SAURASHTRA VOWEL SIGN AU */
1024 0xA952, /* REJANG CONSONANT SIGN H */
1025 0xA953, /* REJANG VIRAMA */
1026 0xAA2F, /* CHAM VOWEL SIGN O */
1027 0xAA30, /* CHAM VOWEL SIGN AI */
1028 0xAA33, /* CHAM CONSONANT SIGN YA */
1029 0xAA34, /* CHAM CONSONANT SIGN RA */
1030 0xAA4D /* CHAM CONSONANT SIGN FINAL H */
1031 };
1032 const pg_wchar *StopLow = strange_letter,
1033 *StopHigh = strange_letter + lengthof(strange_letter),
1034 *StopMiddle;
1035 pg_wchar c;
1036
1037 if (prs->pgwstr)
1038 c = *(prs->pgwstr + prs->state->poschar);
1039 else
1040 c = (pg_wchar) *(prs->wstr + prs->state->poschar);
1041
1042 while (StopLow < StopHigh)
1043 {
1044 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
1045 if (*StopMiddle == c)
1046 return 1;
1047 else if (*StopMiddle < c)
1048 StopLow = StopMiddle + 1;
1049 else
1050 StopHigh = StopMiddle;
1051 }
1052 }
1053 #endif
1054
1055 return 0;
1056 }
1057
1058 /*
1059 * Table of state/action of parser
1060 */
1061
1062 static const TParserStateActionItem actionTPS_Base[] = {
1063 {p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
1064 {p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
1065 {p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
1066 {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
1067 {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
1068 {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
1069 {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
1070 {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
1071 {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
1072 {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1073 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1074 {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
1075 {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
1076 };
1077
1078
1079 static const TParserStateActionItem actionTPS_InNumWord[] = {
1080 {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
1081 {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1082 {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1083 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1084 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1085 {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1086 {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1087 {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
1088 };
1089
/*
 * Rules for TPS_InAsciiWord (token type ASCIIWORD, "Word, all ASCII").
 * ASCII letters keep the state (TPS_Null as target means "no state change").
 *
 * '.', '-' and digits each appear twice.  Rule order matters: the first
 * entry PUSHes a host-related state; the second entry is reached only if
 * that pushed branch POPs back, because TParserGet then resumes at the rule
 * following the one that pushed.
 *
 * EOF or any unmatched character emits an ASCIIWORD token.
 */
1090 static const TParserStateActionItem actionTPS_InAsciiWord[] = {
1091 {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
1092 {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
1093 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1094 {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1095 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1096 {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1097 {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1098 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1099 {p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
1100 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1101 {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1102 {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1103 {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
1104 {p_isspecial, 0, A_NEXT, TPS_InWord, 0, NULL},
1105 {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
1106 };
1107
/*
 * Rules for TPS_InWord (token type WORD_T, "Word, all letters").
 * p_isalpha and p_isspecial characters keep the state; a digit switches to
 * TPS_InNumWord; '-' PUSHes TPS_InHyphenWordFirst.  EOF or any other
 * character emits a WORD_T token.
 */
1108 static const TParserStateActionItem actionTPS_InWord[] = {
1109 {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
1110 {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
1111 {p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL},
1112 {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1113 {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1114 {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
1115 };
1116
/*
 * Rules for TPS_InUnsignedInt (token type UNSIGNEDINT).
 * Digits keep the state.  '.' first PUSHes TPS_InHostFirstDomain and, if
 * that branch POPs back, PUSHes TPS_InUDecimalFirst.  'e'/'E' PUSH
 * TPS_InMantissaFirst; '-' and '_' PUSH TPS_InHostFirstAN; '@' PUSHes
 * TPS_InEmail; '/' PUSHes TPS_InFileFirst.  An ASCII letter PUSHes
 * TPS_InHost; if that fails, the following p_isalpha rule switches to
 * TPS_InNumWord.  EOF or any unmatched character emits UNSIGNEDINT.
 */
1117 static const TParserStateActionItem actionTPS_InUnsignedInt[] = {
1118 {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
1119 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1120 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1121 {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
1122 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1123 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1124 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1125 {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1126 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1127 {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
1128 {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1129 {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1130 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1131 {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
1132 };
1133
/*
 * Rules for TPS_InSignedIntFirst, a state that other tables enter via
 * A_PUSH on '-' or '+'.  A digit commits to the branch: A_CLEAR discards the
 * position saved by the PUSH and the parser continues in TPS_InSignedInt.
 * EOF or anything else POPs back to the saved position.
 */
1134 static const TParserStateActionItem actionTPS_InSignedIntFirst[] = {
1135 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1136 {p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
1137 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1138 };
1139
/*
 * Rules for TPS_InSignedInt (token type SIGNEDINT).
 * Digits keep the state; '.' PUSHes TPS_InDecimalFirst; 'e'/'E' PUSH
 * TPS_InMantissaFirst.  EOF or any other character emits SIGNEDINT.
 */
1140 static const TParserStateActionItem actionTPS_InSignedInt[] = {
1141 {p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
1142 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1143 {p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
1144 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1145 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1146 {NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
1147 };
1148
/*
 * Rules for TPS_InSpace (token type SPACE, alias "blank").
 * The SPACE token is closed at EOF and in front of '<', '-', '+', '&' and
 * '/'.  Note the ordering: '<' is tested before p_isignore, while the other
 * four are tested after it.  p_isignore and p_isnotalnum characters extend
 * the blank run; anything else ends it.
 * NOTE(review): presumably the explicit stop characters exist so that
 * TPS_Base can try to start a new token on them -- confirm against
 * actionTPS_Base.
 */
1149 static const TParserStateActionItem actionTPS_InSpace[] = {
1150 {p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
1151 {p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
1152 {p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
1153 {p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
1154 {p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
1155 {p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
1156 {p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
1157 {p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
1158 {NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
1159 };
1160
/*
 * Rules for TPS_InUDecimalFirst, pushed by TPS_InUnsignedInt on '.'.
 * A digit commits to the decimal branch (A_CLEAR drops the saved position)
 * and switches to TPS_InUDecimal; EOF or anything else POPs back.
 */
1161 static const TParserStateActionItem actionTPS_InUDecimalFirst[] = {
1162 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1163 {p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
1164 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1165 };
1166
/*
 * Rules for TPS_InUDecimal (token type DECIMAL_T, alias "float"), the
 * fractional part that follows an unsigned integer.  Digits keep the state;
 * a further '.' PUSHes TPS_InVersionFirst; 'e'/'E' PUSH TPS_InMantissaFirst.
 * EOF or any other character emits DECIMAL_T.
 */
1167 static const TParserStateActionItem actionTPS_InUDecimal[] = {
1168 {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1169 {p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
1170 {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1171 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1172 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1173 {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1174 };
1175
/*
 * Rules for TPS_InDecimalFirst, pushed by TPS_InSignedInt on '.'.
 * A digit commits to the decimal branch (A_CLEAR drops the saved position)
 * and switches to TPS_InDecimal; EOF or anything else POPs back.
 */
1176 static const TParserStateActionItem actionTPS_InDecimalFirst[] = {
1177 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1178 {p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
1179 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1180 };
1181
/*
 * Rules for TPS_InDecimal (token type DECIMAL_T), the fractional part that
 * follows a signed integer.  Digits keep the state; a further '.' PUSHes
 * TPS_InVerVersion (unlike TPS_InUDecimal, which pushes TPS_InVersionFirst);
 * 'e'/'E' PUSH TPS_InMantissaFirst.  EOF or any other character emits
 * DECIMAL_T.
 */
1182 static const TParserStateActionItem actionTPS_InDecimal[] = {
1183 {p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1184 {p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
1185 {p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
1186 {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1187 {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1188 {NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1189 };
1190
/*
 * Rules for TPS_InVerVersion, pushed by TPS_InDecimal on a second '.'.
 * On a digit the special handler SpecialVerVersion is invoked and the
 * parser moves to TPS_InSVerVersion with A_RERUN set; EOF or anything else
 * POPs back.
 * NOTE(review): the effect of SpecialVerVersion is defined outside this
 * table -- see its definition for what it does to the parse position.
 */
1191 static const TParserStateActionItem actionTPS_InVerVersion[] = {
1192 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1193 {p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
1194 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1195 };
1196
/*
 * Rules for TPS_InSVerVersion, entered from TPS_InVerVersion.
 * On a digit a SPACE token is emitted, all pushed positions are discarded
 * (A_CLRALL) and parsing continues in TPS_InUnsignedInt.  Any other
 * non-EOF character keeps the state; EOF POPs back.
 */
1197 static const TParserStateActionItem actionTPS_InSVerVersion[] = {
1198 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1199 {p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
1200 {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1201 };
1202
1203
/*
 * Rules for TPS_InVersionFirst, pushed on '.' by TPS_InUDecimal and
 * TPS_InVersion.  A digit commits (A_CLEAR drops the saved position) and
 * switches to TPS_InVersion; EOF or anything else POPs back.
 */
1204 static const TParserStateActionItem actionTPS_InVersionFirst[] = {
1205 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1206 {p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
1207 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1208 };
1209
/*
 * Rules for TPS_InVersion (token type VERSIONNUMBER, "Version number").
 * Digits keep the state; each further '.' PUSHes TPS_InVersionFirst to
 * look for another numeric component.  EOF or any other character emits
 * VERSIONNUMBER.
 */
1210 static const TParserStateActionItem actionTPS_InVersion[] = {
1211 {p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
1212 {p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
1213 {p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1214 {NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
1215 };
1216
/*
 * Rules for TPS_InMantissaFirst, pushed on 'e'/'E' by the integer and
 * decimal states.  A digit commits (A_CLEAR) and switches to
 * TPS_InMantissa; '+' or '-' moves to TPS_InMantissaSign without committing
 * yet; EOF or anything else POPs back.
 */
1217 static const TParserStateActionItem actionTPS_InMantissaFirst[] = {
1218 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1219 {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1220 {p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
1221 {p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
1222 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1223 };
1224
/*
 * Rules for TPS_InMantissaSign, reached from TPS_InMantissaFirst after a
 * '+' or '-'.  A digit commits (A_CLEAR) and switches to TPS_InMantissa;
 * EOF or anything else POPs back.
 */
1225 static const TParserStateActionItem actionTPS_InMantissaSign[] = {
1226 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1227 {p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1228 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1229 };
1230
/*
 * Rules for TPS_InMantissa (token type SCIENTIFIC, "Scientific notation").
 * Digits keep the state; EOF or any other character emits SCIENTIFIC.
 */
1231 static const TParserStateActionItem actionTPS_InMantissa[] = {
1232 {p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
1233 {p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
1234 {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
1235 };
1236
/*
 * Rules for TPS_InXMLEntityFirst, a state that is entered via A_PUSH on '&'.
 * '#' leads to the numeric-entity states; an ASCII letter, ':' or '_'
 * starts a named entity (TPS_InXMLEntity).  EOF or anything else POPs back.
 */
1237 static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = {
1238 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1239 {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
1240 {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1241 {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1242 {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1243 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1244 };
1245
/*
 * Rules for TPS_InXMLEntity, the body of a named entity.
 * p_isalnum characters and ':', '_', '.', '-' keep the state; ';' moves to
 * TPS_InXMLEntityEnd.  EOF or anything else POPs back (no entity found).
 */
1246 static const TParserStateActionItem actionTPS_InXMLEntity[] = {
1247 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1248 {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1249 {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1250 {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1251 {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
1252 {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
1253 {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1254 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1255 };
1256
/*
 * Rules for TPS_InXMLEntityNumFirst, reached after "&#".
 * 'x'/'X' selects the hexadecimal form (TPS_InXMLEntityHexNumFirst); a
 * digit starts the decimal form (TPS_InXMLEntityNum).  EOF or anything
 * else POPs back.
 */
1257 static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = {
1258 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1259 {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1260 {p_iseqC, 'X', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1261 {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1262 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1263 };
1264
/*
 * Rules for TPS_InXMLEntityHexNumFirst, reached after "&#x" / "&#X".
 * At least one p_isxdigit character is required to continue in
 * TPS_InXMLEntityHexNum; EOF or anything else POPs back.
 */
1265 static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = {
1266 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1267 {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1268 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1269 };
1270
/*
 * Rules for TPS_InXMLEntityNum, the digits of a decimal numeric entity.
 * Digits keep the state; ';' moves to TPS_InXMLEntityEnd; EOF or anything
 * else POPs back.
 */
1271 static const TParserStateActionItem actionTPS_InXMLEntityNum[] = {
1272 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1273 {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1274 {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1275 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1276 };
1277
/*
 * Rules for TPS_InXMLEntityHexNum, the digits of a hexadecimal numeric
 * entity.  p_isxdigit characters keep the state; ';' moves to
 * TPS_InXMLEntityEnd; EOF or anything else POPs back.
 */
1278 static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = {
1279 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1280 {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1281 {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1282 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1283 };
1284
/*
 * Rules for TPS_InXMLEntityEnd, reached after the terminating ';'.
 * The single unconditional rule emits an XMLENTITY token, discards the
 * position saved by the original PUSH (A_CLEAR) and returns to TPS_Base.
 */
1285 static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = {
1286 {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
1287 };
1288
/*
 * Rules for TPS_InTagFirst, the first character of a candidate tag.
 * NOTE(review): presumably entered after '<' -- confirm in actionTPS_Base.
 * Every alternative is tried via A_PUSH: '/' -> TPS_InTagCloseFirst,
 * '!' -> TPS_InCommentFirst, '?' -> TPS_InXMLBegin, and an ASCII letter,
 * ':' or '_' -> TPS_InTagName.  EOF or anything else POPs back.
 */
1289 static const TParserStateActionItem actionTPS_InTagFirst[] = {
1290 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1291 {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
1292 {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
1293 {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
1294 {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
1295 {p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
1296 {p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
1297 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1298 };
1299
/*
 * Rules for TPS_InXMLBegin, pushed by TPS_InTagFirst on '?'.
 * Only the letter 'x' is checked before handing over to TPS_InTag; EOF or
 * anything else POPs back.
 */
1300 static const TParserStateActionItem actionTPS_InXMLBegin[] = {
1301 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1302 /* <?xml ... */
1303 /* XXX do we want states for the m and l?  Right now this accepts <?xZ */
1304 {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
1305 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1306 };
1307
/*
 * Rules for TPS_InTagCloseFirst, pushed by TPS_InTagFirst on '/'.
 * An ASCII letter starts the tag name (TPS_InTagName); EOF or anything
 * else POPs back.
 */
1308 static const TParserStateActionItem actionTPS_InTagCloseFirst[] = {
1309 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1310 {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
1311 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1312 };
1313
/*
 * Rules for TPS_InTagName, the name part of a tag.
 * '/' moves to TPS_InTagBeginEnd; '>' moves to TPS_InTagEnd and whitespace
 * moves to TPS_InTag, both invoking the SpecialTags handler.  p_isalnum
 * characters and ':', '_', '.', '-' keep the state.  EOF or anything else
 * POPs back.
 */
1314 static const TParserStateActionItem actionTPS_InTagName[] = {
1315 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1316 /* <br/> case */
1317 {p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
1318 {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1319 {p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
1320 {p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
1321 {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1322 {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1323 {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1324 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1325 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1326 };
1327
/*
 * Rules for TPS_InTagBeginEnd, reached from TPS_InTagName on '/'.
 * Only '>' is accepted (-> TPS_InTagEnd); EOF or anything else POPs back.
 */
1328 static const TParserStateActionItem actionTPS_InTagBeginEnd[] = {
1329 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1330 {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
1331 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1332 };
1333
/*
 * Rules for TPS_InTag, the part of a tag after its name.
 * '>' moves to TPS_InTagEnd (invoking SpecialTags).  A single or double
 * quote enters TPS_InTagEscapeK or TPS_InTagEscapeKK respectively.  ASCII
 * letters, digits and the listed punctuation keep the state, as does
 * whitespace (which also invokes SpecialTags).  EOF or any character not
 * listed POPs back, i.e. the input is not accepted as a tag.
 */
1334 static const TParserStateActionItem actionTPS_InTag[] = {
1335 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1336 {p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1337 {p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
1338 {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
1339 {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
1340 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1341 {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
1342 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1343 {p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1344 {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
1345 {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
1346 {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1347 {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1348 {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
1349 {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
1350 {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
1351 {p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
1352 {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
1353 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1354 };
1355
/*
 * Rules for TPS_InTagEscapeK, inside a single-quoted string within a tag.
 * A backslash PUSHes TPS_InTagBackSleshed; the closing single quote
 * returns to TPS_InTag; any other character stays in this state.  EOF POPs
 * back.
 */
1356 static const TParserStateActionItem actionTPS_InTagEscapeK[] = {
1357 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1358 {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1359 {p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
1360 {NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
1361 };
1362
/*
 * Rules for TPS_InTagEscapeKK, inside a double-quoted string within a tag.
 * A backslash PUSHes TPS_InTagBackSleshed; the closing double quote
 * returns to TPS_InTag; any other character stays in this state.  EOF POPs
 * back.
 */
1363 static const TParserStateActionItem actionTPS_InTagEscapeKK[] = {
1364 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1365 {p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1366 {p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
1367 {NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
1368 };
1369
/*
 * Rules for TPS_InTagBackSleshed, pushed on a backslash inside a quoted
 * tag string.  For any non-EOF character A_MERGE copies the current
 * position information into the pushed (quoted-string) position and drops
 * the current one; TPS_Null leaves that restored position's state
 * unchanged.  EOF POPs back instead.
 */
1370 static const TParserStateActionItem actionTPS_InTagBackSleshed[] = {
1371 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1372 {NULL, 0, A_MERGE, TPS_Null, 0, NULL}
1373 };
1374
/*
 * Rules for TPS_InTagEnd, reached after the closing '>'.
 * The single unconditional rule emits a TAG_T token, discards all pushed
 * positions (A_CLRALL) and returns to TPS_Base.
 */
1375 static const TParserStateActionItem actionTPS_InTagEnd[] = {
1376 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1377 };
1378
/*
 * Rules for TPS_InCommentFirst, pushed by TPS_InTagFirst on '!'.
 * '-' begins a comment opener (TPS_InCommentLast); 'D' or 'd' is treated
 * as the start of a DOCTYPE declaration and handed to TPS_InTag.  EOF or
 * anything else POPs back.
 */
1379 static const TParserStateActionItem actionTPS_InCommentFirst[] = {
1380 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1381 {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
1382 /* <!DOCTYPE ...> */
1383 {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
1384 {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
1385 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1386 };
1387
/*
 * Rules for TPS_InCommentLast, expecting the second '-' of a comment
 * opener.  '-' enters TPS_InComment; EOF or anything else POPs back.
 */
1388 static const TParserStateActionItem actionTPS_InCommentLast[] = {
1389 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1390 {p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
1391 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1392 };
1393
/*
 * Rules for TPS_InComment, the body of a comment.
 * '-' moves to TPS_InCloseCommentFirst; every other character keeps the
 * state.  EOF POPs back (unterminated comment).
 */
1394 static const TParserStateActionItem actionTPS_InComment[] = {
1395 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1396 {p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
1397 {NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1398 };
1399
/*
 * Rules for TPS_InCloseCommentFirst, after one '-' inside a comment.
 * A second '-' moves to TPS_InCloseCommentLast; any other character goes
 * back to TPS_InComment.  EOF POPs back.
 */
1400 static const TParserStateActionItem actionTPS_InCloseCommentFirst[] = {
1401 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1402 {p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
1403 {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1404 };
1405
/*
 * Rules for TPS_InCloseCommentLast, after "--" inside a comment.
 * Additional '-' characters keep the state; '>' moves to
 * TPS_InCommentEnd; any other character goes back to TPS_InComment.  EOF
 * POPs back.
 */
1406 static const TParserStateActionItem actionTPS_InCloseCommentLast[] = {
1407 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1408 {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1409 {p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
1410 {NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1411 };
1412
/*
 * Rules for TPS_InCommentEnd, reached after the closing "-->".
 * The single unconditional rule emits the comment as a TAG_T token,
 * discards all pushed positions (A_CLRALL) and returns to TPS_Base.
 */
1413 static const TParserStateActionItem actionTPS_InCommentEnd[] = {
1414 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1415 };
1416
/*
 * Rules for TPS_InHostFirstDomain, pushed on '.' by the word, integer and
 * host states.  An ASCII letter moves to TPS_InHostDomainSecond; a digit
 * moves to TPS_InHost.  EOF or anything else POPs back.
 */
1417 static const TParserStateActionItem actionTPS_InHostFirstDomain[] = {
1418 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1419 {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
1420 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1421 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1422 };
1423
/*
 * Rules for TPS_InHostDomainSecond, after the first ASCII letter of a
 * domain label.  A second ASCII letter moves to TPS_InHostDomain, the only
 * host state that can emit HOST.  A digit, '-', '_', '.' or '@' PUSHes the
 * corresponding host/email state.  EOF or anything else POPs back, so no
 * token is produced from this state directly.
 */
1424 static const TParserStateActionItem actionTPS_InHostDomainSecond[] = {
1425 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1426 {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1427 {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1428 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1429 {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1430 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1431 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1432 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1433 };
1434
/*
 * Rules for TPS_InHostDomain (token type HOST).
 * ASCII letters keep the state.  A digit, ':', '-', '_', '.' and '@' PUSH
 * further host, port or email states.  The second p_isdigit rule (A_POP) is
 * reached only when the TPS_InHost branch pushed by the first one has
 * itself popped back.  p_isstophost emits HOST and continues in
 * TPS_InURLPathStart; '/' PUSHes TPS_InFURL.  EOF or any unmatched
 * character emits HOST and returns to TPS_Base.  All HOST emissions also
 * discard every pushed position (A_CLRALL).
 */
1435 static const TParserStateActionItem actionTPS_InHostDomain[] = {
1436 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1437 {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1438 {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1439 {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
1440 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1441 {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1442 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1443 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1444 {p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
1445 {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1446 {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1447 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1448 };
1449
/*
 * Rules for TPS_InPortFirst, pushed by TPS_InHostDomain on ':'.
 * A digit moves to TPS_InPort; EOF or anything else POPs back.
 */
1450 static const TParserStateActionItem actionTPS_InPortFirst[] = {
1451 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1452 {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1453 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1454 };
1455
/*
 * Rules for TPS_InPort, the port digits after "host:".
 * Digits keep the state.  p_isstophost emits HOST and continues in
 * TPS_InURLPathStart; '/' PUSHes TPS_InFURL.  EOF or any unmatched
 * character emits HOST and returns to TPS_Base.  HOST emissions discard all
 * pushed positions (A_CLRALL).
 */
1456 static const TParserStateActionItem actionTPS_InPort[] = {
1457 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1458 {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1459 {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1460 {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1461 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1462 };
1463
/*
 * Rules for TPS_InHostFirstAN, pushed on '-' or '_' by the word, integer
 * and host states.  The next character must be a digit or an ASCII letter
 * to continue in TPS_InHost; EOF or anything else POPs back.
 */
1464 static const TParserStateActionItem actionTPS_InHostFirstAN[] = {
1465 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1466 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1467 {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1468 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1469 };
1470
/*
 * Rules for TPS_InHost, a host-name label that is not yet known to be
 * followed by a domain.  Digits and ASCII letters keep the state; '@', '.',
 * '-' and '_' PUSH the email, domain and label-continuation states.  This
 * state never emits a token itself: EOF or any other character POPs back.
 */
1471 static const TParserStateActionItem actionTPS_InHost[] = {
1472 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1473 {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1474 {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1475 {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1476 {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1477 {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1478 {p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1479 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1480 };
1481
/*
 * Rules for TPS_InEmail, pushed on '@' by the word, integer and host
 * states.  p_isstophost POPs back; otherwise, if p_ishost succeeds, an
 * EMAIL token is emitted, all pushed positions are discarded and the parser
 * returns to TPS_Base.  Anything else POPs back.  Unlike most tables there
 * is no explicit p_isEOF rule here.
 */
1482 static const TParserStateActionItem actionTPS_InEmail[] = {
1483 {p_isstophost, 0, A_POP, TPS_Null, 0, NULL},
1484 {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
1485 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1486 };
1487
/*
 * Rules for TPS_InFileFirst, the first character of a path component
 * (entered after '/').  An ASCII letter, digit or '_' moves to TPS_InFile;
 * '.' moves to TPS_InPathFirst; '~' PUSHes TPS_InFileTwiddle.  EOF or
 * anything else POPs back.
 */
1488 static const TParserStateActionItem actionTPS_InFileFirst[] = {
1489 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1490 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1491 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1492 {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
1493 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1494 {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1495 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1496 };
1497
/*
 * Rules for TPS_InFileTwiddle, pushed on '~'.
 * An ASCII letter, digit or '_' moves to TPS_InFile; '/' moves to
 * TPS_InFileFirst.  EOF or anything else POPs back.
 */
1498 static const TParserStateActionItem actionTPS_InFileTwiddle[] = {
1499 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1500 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1501 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1502 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1503 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1504 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1505 };
1506
/*
 * Rules for TPS_InPathFirst, reached from TPS_InFileFirst on '.'.
 * An ASCII letter, digit or '_' moves to TPS_InFile; a second '.' moves to
 * TPS_InPathSecond; '/' moves to TPS_InFileFirst.  EOF or anything else
 * POPs back.
 */
1507 static const TParserStateActionItem actionTPS_InPathFirst[] = {
1508 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1509 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1510 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1511 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1512 {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1513 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1514 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1515 };
1516
/*
 * Rules for TPS_InPathFirstFirst, pushed on a leading '.'.
 * Unlike TPS_InPathFirst, a file-name character is not accepted here: only
 * a second '.' (-> TPS_InPathSecond) or '/' (-> TPS_InFileFirst) continues.
 * EOF or anything else POPs back.
 */
1517 static const TParserStateActionItem actionTPS_InPathFirstFirst[] = {
1518 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1519 {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1520 {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1521 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1522 };
1523
/*
 * Rules for TPS_InPathSecond, reached after "..".
 * '/' is listed twice: the first rule PUSHes TPS_InFileFirst to try a
 * longer path; if that branch POPs back, the second rule emits FILEPATH.
 * EOF and whitespace also emit FILEPATH.  Each emission drops one saved
 * position (A_CLEAR).  Anything else POPs back.
 */
1524 static const TParserStateActionItem actionTPS_InPathSecond[] = {
1525 {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1526 {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
1527 {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1528 {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1529 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1530 };
1531
/*
 * Rules for TPS_InFile (token type FILEPATH, alias "file").
 * ASCII letters, digits, '_' and '-' keep the state; '.' PUSHes
 * TPS_InFileNext and '/' PUSHes TPS_InFileFirst.  EOF or any other
 * character emits FILEPATH.
 */
1532 static const TParserStateActionItem actionTPS_InFile[] = {
1533 {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
1534 {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1535 {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1536 {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1537 {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1538 {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
1539 {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1540 {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
1541 };
1542
/*
 * Rules for TPS_InFileNext, pushed on '.' by the word and file states.
 * An ASCII letter, digit or '_' commits to the file branch (A_CLEAR drops
 * the saved position) and switches to TPS_InFile; EOF or anything else
 * POPs back.
 */
1543 static const TParserStateActionItem actionTPS_InFileNext[] = {
1544 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1545 {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
1546 {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
1547 {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
1548 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1549 };
1550
/*
 * Rules for TPS_InURLPathFirst.
 * A p_isurlchar character moves to TPS_InURLPath; EOF or anything else
 * POPs back.
 */
1551 static const TParserStateActionItem actionTPS_InURLPathFirst[] = {
1552 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1553 {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1554 {NULL, 0, A_POP, TPS_Null, 0, NULL},
1555 };
1556
/*
 * Rules for TPS_InURLPathStart, the state the host tables switch to after
 * emitting HOST on a p_isstophost character.  The single unconditional
 * rule moves on to TPS_InURLPath.
 */
1557 static const TParserStateActionItem actionTPS_InURLPathStart[] = {
1558 {NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
1559 };
1560
/*
 * Rules for TPS_InURLPath (token type URLPATH, alias "url_path").
 * p_isurlchar characters keep the state; EOF or any other character emits
 * URLPATH.
 */
1561 static const TParserStateActionItem actionTPS_InURLPath[] = {
1562 {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
1563 {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1564 {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
1565 };
1566
/*
 * Rules for TPS_InFURL, pushed on '/' by TPS_InHostDomain and TPS_InPort.
 * If p_isURLPath succeeds, the SpecialFURL handler is invoked, a URL_T
 * token is emitted, all pushed positions are discarded and the parser
 * returns to TPS_Base.  EOF or anything else POPs back.
 */
1567 static const TParserStateActionItem actionTPS_InFURL[] = {
1568 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1569 {p_isURLPath, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL},
1570 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1571 };
1572
/*
 * Rules for TPS_InProtocolFirst, pushed by TPS_InAsciiWord on ':'.
 * The first '/' moves to TPS_InProtocolSecond; EOF or anything else POPs
 * back.
 */
1573 static const TParserStateActionItem actionTPS_InProtocolFirst[] = {
1574 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1575 {p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
1576 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1577 };
1578
/*
 * Rules for TPS_InProtocolSecond, after ":/".
 * The second '/' moves to TPS_InProtocolEnd; EOF or anything else POPs
 * back.
 */
1579 static const TParserStateActionItem actionTPS_InProtocolSecond[] = {
1580 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1581 {p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
1582 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1583 };
1584
/*
 * Rules for TPS_InProtocolEnd, reached after "://".
 * The single unconditional rule emits a PROTOCOL token, discards all
 * pushed positions (A_CLRALL) and returns to TPS_Base.
 */
1585 static const TParserStateActionItem actionTPS_InProtocolEnd[] = {
1586 {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
1587 };
1588
/*
 * Rules for TPS_InHyphenAsciiWordFirst, pushed on '-' by the ASCII-word
 * states.  The character after the hyphen selects the hyphenated-word
 * state: ASCII letter -> TPS_InHyphenAsciiWord, other letter ->
 * TPS_InHyphenWord, digit -> TPS_InHyphenDigitLookahead.  EOF or anything
 * else POPs back.
 */
1589 static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
1590 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1591 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1592 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1593 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1594 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1595 };
1596
/*
 * Rules for TPS_InHyphenAsciiWord (token type ASCIIHWORD).
 * ASCII letters keep the state; other letters or p_isspecial characters
 * switch to TPS_InHyphenWord; a digit switches to TPS_InHyphenNumWord; a
 * further '-' PUSHes TPS_InHyphenAsciiWordFirst.  EOF or any other
 * character emits ASCIIHWORD, invokes SpecialHyphen, discards all pushed
 * positions and continues in TPS_InParseHyphen (not TPS_Base).
 */
1597 static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
1598 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
1599 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1600 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1601 {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1602 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1603 {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1604 {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
1605 };
1606
/*
 * Rules for TPS_InHyphenWordFirst, pushed on '-' by the all-letters word
 * states.  A letter moves to TPS_InHyphenWord; a digit moves to
 * TPS_InHyphenDigitLookahead.  EOF or anything else POPs back.
 */
1607 static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
1608 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1609 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1610 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1611 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1612 };
1613
/*
 * Rules for TPS_InHyphenWord (token type HWORD).
 * Letters and p_isspecial characters keep the state; a digit switches to
 * TPS_InHyphenNumWord; a further '-' PUSHes TPS_InHyphenWordFirst.  EOF or
 * any other character emits HWORD, invokes SpecialHyphen, discards all
 * pushed positions and continues in TPS_InParseHyphen.
 */
1614 static const TParserStateActionItem actionTPS_InHyphenWord[] = {
1615 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
1616 {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1617 {p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1618 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1619 {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1620 {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
1621 };
1622
/*
 * Rules for TPS_InHyphenNumWordFirst, pushed on '-' by the letters-and-
 * digits word states.  A letter moves to TPS_InHyphenNumWord; a digit
 * moves to TPS_InHyphenDigitLookahead.  EOF or anything else POPs back.
 */
1623 static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
1624 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1625 {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1626 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1627 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1628 };
1629
/*
 * Rules for TPS_InHyphenNumWord (token type NUMHWORD).
 * p_isalnum and p_isspecial characters keep the state; a further '-'
 * PUSHes TPS_InHyphenNumWordFirst.  EOF or any other character emits
 * NUMHWORD, invokes SpecialHyphen, discards all pushed positions and
 * continues in TPS_InParseHyphen.
 */
1630 static const TParserStateActionItem actionTPS_InHyphenNumWord[] = {
1631 {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
1632 {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1633 {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1634 {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1635 {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
1636 };
1637
/*
 * Rules for TPS_InHyphenDigitLookahead, entered when a hyphen is followed
 * by a digit.  Digits keep the state; the run is accepted as part of a
 * hyphenated word only once a letter or p_isspecial character follows
 * (-> TPS_InHyphenNumWord).  EOF or anything else POPs back.
 */
1638 static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = {
1639 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1640 {p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1641 {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1642 {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1643 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1644 };
1645
/*
 * Rules for TPS_InParseHyphen, the state entered after a whole hyphenated
 * word token has been emitted.  It dispatches on the first character of
 * each part: ASCII letter -> TPS_InHyphenAsciiWordPart, other letter ->
 * TPS_InHyphenWordPart, digit PUSHes TPS_InHyphenUnsignedInt, '-' PUSHes
 * TPS_InParseHyphenHyphen.  EOF or anything else switches to TPS_Base with
 * A_RERUN.
 */
1646 static const TParserStateActionItem actionTPS_InParseHyphen[] = {
1647 {p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
1648 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1649 {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1650 {p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
1651 {p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
1652 {NULL, 0, A_RERUN, TPS_Base, 0, NULL}
1653 };
1654
/*
 * Rules for TPS_InParseHyphenHyphen, pushed by TPS_InParseHyphen on '-'.
 * If a p_isalnum or p_isspecial character follows, a SPACE token is
 * emitted, the saved position is dropped (A_CLEAR) and the parser returns
 * to TPS_InParseHyphen.  EOF or anything else POPs back.
 */
1655 static const TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
1656 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1657 {p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1658 {p_isspecial, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1659 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1660 };
1661
/*
 * Rules for TPS_InHyphenWordPart (token type PARTHWORD, "Hyphenated word
 * part, all letters").  Letters and p_isspecial characters keep the state;
 * a digit switches to TPS_InHyphenNumWordPart.  At EOF the part is emitted
 * and the parser returns to TPS_Base; on any other character it is emitted
 * and the parser goes back to TPS_InParseHyphen for the next part.
 */
1662 static const TParserStateActionItem actionTPS_InHyphenWordPart[] = {
1663 {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
1664 {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1665 {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1666 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1667 {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
1668 };
1669
/*
 * Rules for TPS_InHyphenAsciiWordPart (token type ASCIIPARTHWORD).
 * ASCII letters keep the state; other letters or p_isspecial characters
 * switch to TPS_InHyphenWordPart; a digit switches to
 * TPS_InHyphenNumWordPart.  At EOF the part is emitted and the parser
 * returns to TPS_Base; on any other character it is emitted and the parser
 * goes back to TPS_InParseHyphen.
 */
1670 static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
1671 {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
1672 {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1673 {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1674 {p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1675 {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1676 {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
1677 };
1678
/*
 * Rules for TPS_InHyphenNumWordPart (token type NUMPARTHWORD, "Hyphenated
 * word part, letters and digits").  p_isalnum and p_isspecial characters
 * keep the state.  At EOF the part is emitted and the parser returns to
 * TPS_Base; on any other character it is emitted and the parser goes back
 * to TPS_InParseHyphen.
 */
1679 static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
1680 {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
1681 {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1682 {p_isspecial, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1683 {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
1684 };
1685
/*
 * Rules for TPS_InHyphenUnsignedInt, pushed by TPS_InParseHyphen on a
 * digit.  Digits keep the state; a letter or p_isspecial character commits
 * (A_CLEAR drops the saved position) and switches to
 * TPS_InHyphenNumWordPart.  EOF or anything else POPs back, so a part made
 * only of digits is not emitted from here.
 */
1686 static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
1687 {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1688 {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1689 {p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1690 {p_isspecial, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1691 {NULL, 0, A_POP, TPS_Null, 0, NULL}
1692 };
1693
1694
1695 /*
1696 * main table of per-state parser actions
1697 */
/*
 * One entry of the Actions[] dispatch table: the rule array for a parser
 * state, the state value itself (checked by an Assert in TParserGet) and,
 * in WPARSER_TRACE builds only, the state's printable name.
 */
1698 typedef struct
1699 {
1700 const TParserStateActionItem *action; /* the actual state info */
1701 TParserState state; /* only for Assert crosscheck */
1702 #ifdef WPARSER_TRACE
1703 const char *state_name; /* only for debug printout */
1704 #endif
1705 } TParserStateAction;
1706
/*
 * TPARSERSTATEACTION(state) builds one TParserStateAction initializer.
 * The rule array name is formed by pasting "action" onto the state name
 * (e.g. TPS_InWord -> actionTPS_InWord), so every state needs an array
 * with exactly that name.  Trace builds additionally store the state name
 * as a string.
 */
1707 #ifdef WPARSER_TRACE
1708 #define TPARSERSTATEACTION(state) \
1709 { CppConcat(action,state), state, CppAsString(state) }
1710 #else
1711 #define TPARSERSTATEACTION(state) \
1712 { CppConcat(action,state), state }
1713 #endif
1714
1715 /*
1716 * order must be the same as in typedef enum {} TParserState!!
1717 */
1718
/*
 * Dispatch table indexed by TParserState value: TParserGet looks up
 * Actions[prs->state->state].action to find the rules for the current
 * state, and asserts that the entry's stored state equals the index used.
 * Adding, removing or reordering entries therefore has to be mirrored in
 * the TParserState enum.
 */
1719 static const TParserStateAction Actions[] = {
1720 TPARSERSTATEACTION(TPS_Base),
1721 TPARSERSTATEACTION(TPS_InNumWord),
1722 TPARSERSTATEACTION(TPS_InAsciiWord),
1723 TPARSERSTATEACTION(TPS_InWord),
1724 TPARSERSTATEACTION(TPS_InUnsignedInt),
1725 TPARSERSTATEACTION(TPS_InSignedIntFirst),
1726 TPARSERSTATEACTION(TPS_InSignedInt),
1727 TPARSERSTATEACTION(TPS_InSpace),
1728 TPARSERSTATEACTION(TPS_InUDecimalFirst),
1729 TPARSERSTATEACTION(TPS_InUDecimal),
1730 TPARSERSTATEACTION(TPS_InDecimalFirst),
1731 TPARSERSTATEACTION(TPS_InDecimal),
1732 TPARSERSTATEACTION(TPS_InVerVersion),
1733 TPARSERSTATEACTION(TPS_InSVerVersion),
1734 TPARSERSTATEACTION(TPS_InVersionFirst),
1735 TPARSERSTATEACTION(TPS_InVersion),
1736 TPARSERSTATEACTION(TPS_InMantissaFirst),
1737 TPARSERSTATEACTION(TPS_InMantissaSign),
1738 TPARSERSTATEACTION(TPS_InMantissa),
1739 TPARSERSTATEACTION(TPS_InXMLEntityFirst),
1740 TPARSERSTATEACTION(TPS_InXMLEntity),
1741 TPARSERSTATEACTION(TPS_InXMLEntityNumFirst),
1742 TPARSERSTATEACTION(TPS_InXMLEntityNum),
1743 TPARSERSTATEACTION(TPS_InXMLEntityHexNumFirst),
1744 TPARSERSTATEACTION(TPS_InXMLEntityHexNum),
1745 TPARSERSTATEACTION(TPS_InXMLEntityEnd),
1746 TPARSERSTATEACTION(TPS_InTagFirst),
1747 TPARSERSTATEACTION(TPS_InXMLBegin),
1748 TPARSERSTATEACTION(TPS_InTagCloseFirst),
1749 TPARSERSTATEACTION(TPS_InTagName),
1750 TPARSERSTATEACTION(TPS_InTagBeginEnd),
1751 TPARSERSTATEACTION(TPS_InTag),
1752 TPARSERSTATEACTION(TPS_InTagEscapeK),
1753 TPARSERSTATEACTION(TPS_InTagEscapeKK),
1754 TPARSERSTATEACTION(TPS_InTagBackSleshed),
1755 TPARSERSTATEACTION(TPS_InTagEnd),
1756 TPARSERSTATEACTION(TPS_InCommentFirst),
1757 TPARSERSTATEACTION(TPS_InCommentLast),
1758 TPARSERSTATEACTION(TPS_InComment),
1759 TPARSERSTATEACTION(TPS_InCloseCommentFirst),
1760 TPARSERSTATEACTION(TPS_InCloseCommentLast),
1761 TPARSERSTATEACTION(TPS_InCommentEnd),
1762 TPARSERSTATEACTION(TPS_InHostFirstDomain),
1763 TPARSERSTATEACTION(TPS_InHostDomainSecond),
1764 TPARSERSTATEACTION(TPS_InHostDomain),
1765 TPARSERSTATEACTION(TPS_InPortFirst),
1766 TPARSERSTATEACTION(TPS_InPort),
1767 TPARSERSTATEACTION(TPS_InHostFirstAN),
1768 TPARSERSTATEACTION(TPS_InHost),
1769 TPARSERSTATEACTION(TPS_InEmail),
1770 TPARSERSTATEACTION(TPS_InFileFirst),
1771 TPARSERSTATEACTION(TPS_InFileTwiddle),
1772 TPARSERSTATEACTION(TPS_InPathFirst),
1773 TPARSERSTATEACTION(TPS_InPathFirstFirst),
1774 TPARSERSTATEACTION(TPS_InPathSecond),
1775 TPARSERSTATEACTION(TPS_InFile),
1776 TPARSERSTATEACTION(TPS_InFileNext),
1777 TPARSERSTATEACTION(TPS_InURLPathFirst),
1778 TPARSERSTATEACTION(TPS_InURLPathStart),
1779 TPARSERSTATEACTION(TPS_InURLPath),
1780 TPARSERSTATEACTION(TPS_InFURL),
1781 TPARSERSTATEACTION(TPS_InProtocolFirst),
1782 TPARSERSTATEACTION(TPS_InProtocolSecond),
1783 TPARSERSTATEACTION(TPS_InProtocolEnd),
1784 TPARSERSTATEACTION(TPS_InHyphenAsciiWordFirst),
1785 TPARSERSTATEACTION(TPS_InHyphenAsciiWord),
1786 TPARSERSTATEACTION(TPS_InHyphenWordFirst),
1787 TPARSERSTATEACTION(TPS_InHyphenWord),
1788 TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
1789 TPARSERSTATEACTION(TPS_InHyphenNumWord),
1790 TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
1791 TPARSERSTATEACTION(TPS_InParseHyphen),
1792 TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
1793 TPARSERSTATEACTION(TPS_InHyphenWordPart),
1794 TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
1795 TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
1796 TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
1797 };
1798
1799
1800 static bool
TParserGet(TParser * prs)1801 TParserGet(TParser *prs)
1802 {
1803 const TParserStateActionItem *item = NULL;
1804
1805 Assert(prs->state);
1806
1807 if (prs->state->posbyte >= prs->lenstr)
1808 return false;
1809
1810 prs->token = prs->str + prs->state->posbyte;
1811 prs->state->pushedAtAction = NULL;
1812
1813 /* look at string */
1814 while (prs->state->posbyte <= prs->lenstr)
1815 {
1816 if (prs->state->posbyte == prs->lenstr)
1817 prs->state->charlen = 0;
1818 else
1819 prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
1820 pg_mblen(prs->str + prs->state->posbyte);
1821
1822 Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
1823 Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
1824 Assert(Actions[prs->state->state].state == prs->state->state);
1825
1826 if (prs->state->pushedAtAction)
1827 {
1828 /* After a POP, pick up at the next test */
1829 item = prs->state->pushedAtAction + 1;
1830 prs->state->pushedAtAction = NULL;
1831 }
1832 else
1833 {
1834 item = Actions[prs->state->state].action;
1835 Assert(item != NULL);
1836 }
1837
1838 /* find action by character class */
1839 while (item->isclass)
1840 {
1841 prs->c = item->c;
1842 if (item->isclass(prs) != 0)
1843 break;
1844 item++;
1845 }
1846
1847 #ifdef WPARSER_TRACE
1848 {
1849 TParserPosition *ptr;
1850
1851 fprintf(stderr, "state ");
1852 /* indent according to stack depth */
1853 for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
1854 fprintf(stderr, " ");
1855 fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
1856 if (prs->state->posbyte < prs->lenstr)
1857 fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
1858 else
1859 fprintf(stderr, "at EOF");
1860 fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
1861 (int) (item - Actions[prs->state->state].action),
1862 (item->flags & A_BINGO) ? " BINGO" : "",
1863 (item->flags & A_POP) ? " POP" : "",
1864 (item->flags & A_PUSH) ? " PUSH" : "",
1865 (item->flags & A_RERUN) ? " RERUN" : "",
1866 (item->flags & A_CLEAR) ? " CLEAR" : "",
1867 (item->flags & A_MERGE) ? " MERGE" : "",
1868 (item->flags & A_CLRALL) ? " CLRALL" : "",
1869 (item->tostate != TPS_Null) ? " tostate " : "",
1870 (item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
1871 (item->type > 0) ? " type " : "",
1872 tok_alias[item->type]);
1873 }
1874 #endif
1875
1876 /* call special handler if exists */
1877 if (item->special)
1878 item->special(prs);
1879
1880 /* BINGO, token is found */
1881 if (item->flags & A_BINGO)
1882 {
1883 Assert(item->type > 0);
1884 prs->lenbytetoken = prs->state->lenbytetoken;
1885 prs->lenchartoken = prs->state->lenchartoken;
1886 prs->state->lenbytetoken = prs->state->lenchartoken = 0;
1887 prs->type = item->type;
1888 }
1889
1890 /* do various actions by flags */
1891 if (item->flags & A_POP)
1892 { /* pop stored state in stack */
1893 TParserPosition *ptr = prs->state->prev;
1894
1895 pfree(prs->state);
1896 prs->state = ptr;
1897 Assert(prs->state);
1898 }
1899 else if (item->flags & A_PUSH)
1900 { /* push (store) state in stack */
1901 prs->state->pushedAtAction = item; /* remember where we push */
1902 prs->state = newTParserPosition(prs->state);
1903 }
1904 else if (item->flags & A_CLEAR)
1905 { /* clear previous pushed state */
1906 TParserPosition *ptr;
1907
1908 Assert(prs->state->prev);
1909 ptr = prs->state->prev->prev;
1910 pfree(prs->state->prev);
1911 prs->state->prev = ptr;
1912 }
1913 else if (item->flags & A_CLRALL)
1914 { /* clear all previous pushed state */
1915 TParserPosition *ptr;
1916
1917 while (prs->state->prev)
1918 {
1919 ptr = prs->state->prev->prev;
1920 pfree(prs->state->prev);
1921 prs->state->prev = ptr;
1922 }
1923 }
1924 else if (item->flags & A_MERGE)
1925 { /* merge posinfo with current and pushed state */
1926 TParserPosition *ptr = prs->state;
1927
1928 Assert(prs->state->prev);
1929 prs->state = prs->state->prev;
1930
1931 prs->state->posbyte = ptr->posbyte;
1932 prs->state->poschar = ptr->poschar;
1933 prs->state->charlen = ptr->charlen;
1934 prs->state->lenbytetoken = ptr->lenbytetoken;
1935 prs->state->lenchartoken = ptr->lenchartoken;
1936 pfree(ptr);
1937 }
1938
1939 /* set new state if pointed */
1940 if (item->tostate != TPS_Null)
1941 prs->state->state = item->tostate;
1942
1943 /* check for go away */
1944 if ((item->flags & A_BINGO) ||
1945 (prs->state->posbyte >= prs->lenstr &&
1946 (item->flags & A_RERUN) == 0))
1947 break;
1948
1949 /* go to beginning of loop if we should rerun or we just restore state */
1950 if (item->flags & (A_RERUN | A_POP))
1951 continue;
1952
1953 /* move forward */
1954 if (prs->state->charlen)
1955 {
1956 prs->state->posbyte += prs->state->charlen;
1957 prs->state->lenbytetoken += prs->state->charlen;
1958 prs->state->poschar++;
1959 prs->state->lenchartoken++;
1960 }
1961 }
1962
1963 return (item && (item->flags & A_BINGO)) ? true : false;
1964 }
1965
1966 Datum
prsd_lextype(PG_FUNCTION_ARGS)1967 prsd_lextype(PG_FUNCTION_ARGS)
1968 {
1969 LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
1970 int i;
1971
1972 for (i = 1; i <= LASTNUM; i++)
1973 {
1974 descr[i - 1].lexid = i;
1975 descr[i - 1].alias = pstrdup(tok_alias[i]);
1976 descr[i - 1].descr = pstrdup(lex_descr[i]);
1977 }
1978
1979 descr[LASTNUM].lexid = 0;
1980
1981 PG_RETURN_POINTER(descr);
1982 }
1983
1984 Datum
prsd_start(PG_FUNCTION_ARGS)1985 prsd_start(PG_FUNCTION_ARGS)
1986 {
1987 PG_RETURN_POINTER(TParserInit((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
1988 }
1989
1990 Datum
prsd_nexttoken(PG_FUNCTION_ARGS)1991 prsd_nexttoken(PG_FUNCTION_ARGS)
1992 {
1993 TParser *p = (TParser *) PG_GETARG_POINTER(0);
1994 char **t = (char **) PG_GETARG_POINTER(1);
1995 int *tlen = (int *) PG_GETARG_POINTER(2);
1996
1997 if (!TParserGet(p))
1998 PG_RETURN_INT32(0);
1999
2000 *t = p->token;
2001 *tlen = p->lenbytetoken;
2002
2003 PG_RETURN_INT32(p->type);
2004 }
2005
2006 Datum
prsd_end(PG_FUNCTION_ARGS)2007 prsd_end(PG_FUNCTION_ARGS)
2008 {
2009 TParser *p = (TParser *) PG_GETARG_POINTER(0);
2010
2011 TParserClose(p);
2012 PG_RETURN_VOID();
2013 }
2014
2015
2016 /*
2017 * ts_headline support begins here
2018 */
2019
/*
 * Token type classification macros.
 *
 * Each one takes a token type code (one of the output token category
 * constants) and yields nonzero if the type belongs to the class.  Note
 * that the argument is evaluated more than once.
 */
#define LEAVETOKEN(x) \
	((x) == SPACE)
#define COMPLEXTOKEN(x) \
	((x) == URL_T || (x) == NUMHWORD || (x) == ASCIIHWORD || (x) == HWORD)
#define ENDPUNCTOKEN(x) \
	((x) == SPACE)

#define TS_IDIGNORE(x) \
	((x) == TAG_T || (x) == PROTOCOL || (x) == SPACE || (x) == XMLENTITY)
#define HLIDREPLACE(x) \
	((x) == TAG_T)
#define HLIDSKIP(x) \
	((x) == URL_T || (x) == NUMHWORD || (x) == ASCIIHWORD || (x) == HWORD)
#define XMLHLIDSKIP(x) \
	((x) == URL_T || (x) == NUMHWORD || (x) == ASCIIHWORD || (x) == HWORD)
#define NONWORDTOKEN(x) \
	((x) == SPACE || HLIDREPLACE(x) || HLIDSKIP(x))
#define NOENDTOKEN(x) \
	(NONWORDTOKEN(x) || \
	 (x) == SCIENTIFIC || \
	 (x) == VERSIONNUMBER || \
	 (x) == DECIMAL_T || \
	 (x) == SIGNEDINT || \
	 (x) == UNSIGNEDINT || \
	 TS_IDIGNORE(x))
2031
/*
 * Helper macros for headline selection.
 *
 * Both expect "HeadlineParsedText *prs" to be in scope where they are
 * expanded; BADENDPOINT additionally expects "int shortword", the "short
 * word" length parameter.  j is a word index into prs->words.
 */

/* A word is interesting if it is a search term not flagged as repeated */
#define INTERESTINGWORD(j) \
	(prs->words[j].item && !prs->words[j].repeated)

/*
 * A headline should not start or end on a token type listed in NOENDTOKEN,
 * nor on a word of at most shortword length -- unless that word is
 * interesting, in which case it is always an acceptable endpoint.
 */
#define BADENDPOINT(j) \
	(!INTERESTINGWORD(j) && \
	 (NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword))
2046
2047 typedef struct
2048 {
2049 /* one cover (well, really one fragment) for mark_hl_fragments */
2050 int32 startpos; /* fragment's starting word index */
2051 int32 endpos; /* ending word index (inclusive) */
2052 int32 poslen; /* number of interesting words */
2053 int32 curlen; /* total number of words */
2054 bool chosen; /* chosen? */
2055 bool excluded; /* excluded? */
2056 } CoverPos;
2057
2058 typedef struct
2059 {
2060 /* callback data for checkcondition_HL */
2061 HeadlineWordEntry *words;
2062 int len;
2063 } hlCheck;
2064
2065
2066 /*
2067 * TS_execute callback for matching a tsquery operand to headline words
2068 */
2069 static bool
checkcondition_HL(void * opaque,QueryOperand * val,ExecPhraseData * data)2070 checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
2071 {
2072 hlCheck *checkval = (hlCheck *) opaque;
2073 int i;
2074
2075 /* scan words array for marching items */
2076 for (i = 0; i < checkval->len; i++)
2077 {
2078 if (checkval->words[i].item == val)
2079 {
2080 /* if data == NULL, don't need to report positions */
2081 if (!data)
2082 return true;
2083
2084 if (!data->pos)
2085 {
2086 data->pos = palloc(sizeof(WordEntryPos) * checkval->len);
2087 data->allocated = true;
2088 data->npos = 1;
2089 data->pos[0] = checkval->words[i].pos;
2090 }
2091 else if (data->pos[data->npos - 1] < checkval->words[i].pos)
2092 {
2093 data->pos[data->npos++] = checkval->words[i].pos;
2094 }
2095 }
2096 }
2097
2098 if (data && data->npos > 0)
2099 return true;
2100
2101 return false;
2102 }
2103
2104 /*
2105 * hlFirstIndex: find first index >= pos containing any word used in query
2106 *
2107 * Returns -1 if no such index
2108 */
2109 static int
hlFirstIndex(HeadlineParsedText * prs,int pos)2110 hlFirstIndex(HeadlineParsedText *prs, int pos)
2111 {
2112 int i;
2113
2114 for (i = pos; i < prs->curwords; i++)
2115 {
2116 if (prs->words[i].item != NULL)
2117 return i;
2118 }
2119 return -1;
2120 }
2121
2122 /*
2123 * hlCover: try to find a substring of prs' word list that satisfies query
2124 *
2125 * At entry, *p must be the first word index to consider (initialize this
2126 * to zero, or to the next index after a previous successful search).
2127 * We will consider all substrings starting at or after that word, and
2128 * containing no more than max_cover words. (We need a length limit to
2129 * keep this from taking O(N^2) time for a long document with many query
2130 * words but few complete matches. Actually, since checkcondition_HL is
2131 * roughly O(N) in the length of the substring being checked, it's even
2132 * worse than that.)
2133 *
2134 * On success, sets *p to first word index and *q to last word index of the
2135 * cover substring, and returns true.
2136 *
2137 * The result is a minimal cover, in the sense that both *p and *q will be
2138 * words used in the query.
2139 */
2140 static bool
hlCover(HeadlineParsedText * prs,TSQuery query,int max_cover,int * p,int * q)2141 hlCover(HeadlineParsedText *prs, TSQuery query, int max_cover,
2142 int *p, int *q)
2143 {
2144 int pmin,
2145 pmax,
2146 nextpmin,
2147 nextpmax;
2148 hlCheck ch;
2149
2150 /*
2151 * We look for the earliest, shortest substring of prs->words that
2152 * satisfies the query. Both the pmin and pmax indices must be words
2153 * appearing in the query; there's no point in trying endpoints in between
2154 * such points.
2155 */
2156 pmin = hlFirstIndex(prs, *p);
2157 while (pmin >= 0)
2158 {
2159 /* This useless assignment just keeps stupider compilers quiet */
2160 nextpmin = -1;
2161 /* Consider substrings starting at pmin */
2162 ch.words = &(prs->words[pmin]);
2163 /* Consider the length-one substring first, then longer substrings */
2164 pmax = pmin;
2165 do
2166 {
2167 /* Try to match query against pmin .. pmax substring */
2168 ch.len = pmax - pmin + 1;
2169 if (TS_execute(GETQUERY(query), &ch,
2170 TS_EXEC_EMPTY, checkcondition_HL))
2171 {
2172 *p = pmin;
2173 *q = pmax;
2174 return true;
2175 }
2176 /* Nope, so advance pmax to next feasible endpoint */
2177 nextpmax = hlFirstIndex(prs, pmax + 1);
2178
2179 /*
2180 * If this is our first advance past pmin, then the result is also
2181 * the next feasible value of pmin; remember it to save a
2182 * redundant search.
2183 */
2184 if (pmax == pmin)
2185 nextpmin = nextpmax;
2186 pmax = nextpmax;
2187 }
2188 while (pmax >= 0 && pmax - pmin < max_cover);
2189 /* No luck here, so try next feasible startpoint */
2190 pmin = nextpmin;
2191 }
2192 return false;
2193 }
2194
2195 /*
2196 * Apply suitable highlight marking to words selected by headline selector
2197 *
2198 * The words from startpos to endpos inclusive are marked per highlightall
2199 */
2200 static void
mark_fragment(HeadlineParsedText * prs,bool highlightall,int startpos,int endpos)2201 mark_fragment(HeadlineParsedText *prs, bool highlightall,
2202 int startpos, int endpos)
2203 {
2204 int i;
2205
2206 for (i = startpos; i <= endpos; i++)
2207 {
2208 if (prs->words[i].item)
2209 prs->words[i].selected = 1;
2210 if (!highlightall)
2211 {
2212 if (HLIDREPLACE(prs->words[i].type))
2213 prs->words[i].replace = 1;
2214 else if (HLIDSKIP(prs->words[i].type))
2215 prs->words[i].skip = 1;
2216 }
2217 else
2218 {
2219 if (XMLHLIDSKIP(prs->words[i].type))
2220 prs->words[i].skip = 1;
2221 }
2222
2223 prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
2224 }
2225 }
2226
2227 /*
2228 * split a cover substring into fragments not longer than max_words
2229 *
2230 * At entry, *startpos and *endpos are the (remaining) bounds of the cover
2231 * substring. They are updated to hold the bounds of the next fragment.
2232 *
2233 * *curlen and *poslen are set to the fragment's length, in words and
2234 * interesting words respectively.
2235 */
2236 static void
get_next_fragment(HeadlineParsedText * prs,int * startpos,int * endpos,int * curlen,int * poslen,int max_words)2237 get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
2238 int *curlen, int *poslen, int max_words)
2239 {
2240 int i;
2241
2242 /*
2243 * Objective: select a fragment of words between startpos and endpos such
2244 * that it has at most max_words and both ends have query words. If the
2245 * startpos and endpos are the endpoints of the cover and the cover has
2246 * fewer words than max_words, then this function should just return the
2247 * cover
2248 */
2249 /* first move startpos to an item */
2250 for (i = *startpos; i <= *endpos; i++)
2251 {
2252 *startpos = i;
2253 if (INTERESTINGWORD(i))
2254 break;
2255 }
2256 /* cut endpos to have only max_words */
2257 *curlen = 0;
2258 *poslen = 0;
2259 for (i = *startpos; i <= *endpos && *curlen < max_words; i++)
2260 {
2261 if (!NONWORDTOKEN(prs->words[i].type))
2262 *curlen += 1;
2263 if (INTERESTINGWORD(i))
2264 *poslen += 1;
2265 }
2266 /* if the cover was cut then move back endpos to a query item */
2267 if (*endpos > i)
2268 {
2269 *endpos = i;
2270 for (i = *endpos; i >= *startpos; i--)
2271 {
2272 *endpos = i;
2273 if (INTERESTINGWORD(i))
2274 break;
2275 if (!NONWORDTOKEN(prs->words[i].type))
2276 *curlen -= 1;
2277 }
2278 }
2279 }
2280
2281 /*
2282 * Headline selector used when MaxFragments > 0
2283 *
2284 * Note: in this mode, highlightall is disregarded for phrase selection;
2285 * it only controls presentation details.
2286 */
2287 static void
mark_hl_fragments(HeadlineParsedText * prs,TSQuery query,bool highlightall,int shortword,int min_words,int max_words,int max_fragments,int max_cover)2288 mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
2289 int shortword, int min_words,
2290 int max_words, int max_fragments, int max_cover)
2291 {
2292 int32 poslen,
2293 curlen,
2294 i,
2295 f,
2296 num_f = 0;
2297 int32 stretch,
2298 maxstretch,
2299 posmarker;
2300
2301 int32 startpos = 0,
2302 endpos = 0,
2303 p = 0,
2304 q = 0;
2305
2306 int32 numcovers = 0,
2307 maxcovers = 32;
2308
2309 int32 minI,
2310 minwords,
2311 maxitems;
2312 CoverPos *covers;
2313
2314 covers = palloc(maxcovers * sizeof(CoverPos));
2315
2316 /* get all covers */
2317 while (hlCover(prs, query, max_cover, &p, &q))
2318 {
2319 startpos = p;
2320 endpos = q;
2321
2322 /*
2323 * Break the cover into smaller fragments such that each fragment has
2324 * at most max_words. Also ensure that each end of each fragment is a
2325 * query word. This will allow us to stretch the fragment in either
2326 * direction
2327 */
2328
2329 while (startpos <= endpos)
2330 {
2331 get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
2332 if (numcovers >= maxcovers)
2333 {
2334 maxcovers *= 2;
2335 covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
2336 }
2337 covers[numcovers].startpos = startpos;
2338 covers[numcovers].endpos = endpos;
2339 covers[numcovers].curlen = curlen;
2340 covers[numcovers].poslen = poslen;
2341 covers[numcovers].chosen = false;
2342 covers[numcovers].excluded = false;
2343 numcovers++;
2344 startpos = endpos + 1;
2345 endpos = q;
2346 }
2347
2348 /* move p to generate the next cover */
2349 p++;
2350 }
2351
2352 /* choose best covers */
2353 for (f = 0; f < max_fragments; f++)
2354 {
2355 maxitems = 0;
2356 minwords = PG_INT32_MAX;
2357 minI = -1;
2358
2359 /*
2360 * Choose the cover that contains max items. In case of tie choose the
2361 * one with smaller number of words.
2362 */
2363 for (i = 0; i < numcovers; i++)
2364 {
2365 if (!covers[i].chosen && !covers[i].excluded &&
2366 (maxitems < covers[i].poslen ||
2367 (maxitems == covers[i].poslen &&
2368 minwords > covers[i].curlen)))
2369 {
2370 maxitems = covers[i].poslen;
2371 minwords = covers[i].curlen;
2372 minI = i;
2373 }
2374 }
2375 /* if a cover was found mark it */
2376 if (minI >= 0)
2377 {
2378 covers[minI].chosen = true;
2379 /* adjust the size of cover */
2380 startpos = covers[minI].startpos;
2381 endpos = covers[minI].endpos;
2382 curlen = covers[minI].curlen;
2383 /* stretch the cover if cover size is lower than max_words */
2384 if (curlen < max_words)
2385 {
2386 /* divide the stretch on both sides of cover */
2387 maxstretch = (max_words - curlen) / 2;
2388
2389 /*
2390 * first stretch the startpos stop stretching if 1. we hit the
2391 * beginning of document 2. exceed maxstretch 3. we hit an
2392 * already marked fragment
2393 */
2394 stretch = 0;
2395 posmarker = startpos;
2396 for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
2397 {
2398 if (!NONWORDTOKEN(prs->words[i].type))
2399 {
2400 curlen++;
2401 stretch++;
2402 }
2403 posmarker = i;
2404 }
2405 /* cut back startpos till we find a good endpoint */
2406 for (i = posmarker; i < startpos && BADENDPOINT(i); i++)
2407 {
2408 if (!NONWORDTOKEN(prs->words[i].type))
2409 curlen--;
2410 }
2411 startpos = i;
2412 /* now stretch the endpos as much as possible */
2413 posmarker = endpos;
2414 for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
2415 {
2416 if (!NONWORDTOKEN(prs->words[i].type))
2417 curlen++;
2418 posmarker = i;
2419 }
2420 /* cut back endpos till we find a good endpoint */
2421 for (i = posmarker; i > endpos && BADENDPOINT(i); i--)
2422 {
2423 if (!NONWORDTOKEN(prs->words[i].type))
2424 curlen--;
2425 }
2426 endpos = i;
2427 }
2428 covers[minI].startpos = startpos;
2429 covers[minI].endpos = endpos;
2430 covers[minI].curlen = curlen;
2431 /* Mark the chosen fragments (covers) */
2432 mark_fragment(prs, highlightall, startpos, endpos);
2433 num_f++;
2434 /* Exclude covers overlapping this one from future consideration */
2435 for (i = 0; i < numcovers; i++)
2436 {
2437 if (i != minI &&
2438 ((covers[i].startpos >= startpos &&
2439 covers[i].startpos <= endpos) ||
2440 (covers[i].endpos >= startpos &&
2441 covers[i].endpos <= endpos) ||
2442 (covers[i].startpos < startpos &&
2443 covers[i].endpos > endpos)))
2444 covers[i].excluded = true;
2445 }
2446 }
2447 else
2448 break; /* no selectable covers remain */
2449 }
2450
2451 /* show the first min_words words if we have not marked anything */
2452 if (num_f <= 0)
2453 {
2454 startpos = endpos = curlen = 0;
2455 for (i = 0; i < prs->curwords && curlen < min_words; i++)
2456 {
2457 if (!NONWORDTOKEN(prs->words[i].type))
2458 curlen++;
2459 endpos = i;
2460 }
2461 mark_fragment(prs, highlightall, startpos, endpos);
2462 }
2463
2464 pfree(covers);
2465 }
2466
2467 /*
2468 * Headline selector used when MaxFragments == 0
2469 */
2470 static void
mark_hl_words(HeadlineParsedText * prs,TSQuery query,bool highlightall,int shortword,int min_words,int max_words,int max_cover)2471 mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
2472 int shortword, int min_words, int max_words, int max_cover)
2473 {
2474 int p = 0,
2475 q = 0;
2476 int bestb = -1,
2477 beste = -1;
2478 int bestlen = -1;
2479 bool bestcover = false;
2480 int pose,
2481 posb,
2482 poslen,
2483 curlen;
2484 bool poscover;
2485 int i;
2486
2487 if (!highlightall)
2488 {
2489 /* examine all covers, select a headline using the best one */
2490 while (hlCover(prs, query, max_cover, &p, &q))
2491 {
2492 /*
2493 * Count words (curlen) and interesting words (poslen) within
2494 * cover, but stop once we reach max_words. This step doesn't
2495 * consider whether that's a good stopping point. posb and pose
2496 * are set to the start and end indexes of the possible headline.
2497 */
2498 curlen = 0;
2499 poslen = 0;
2500 posb = pose = p;
2501 for (i = p; i <= q && curlen < max_words; i++)
2502 {
2503 if (!NONWORDTOKEN(prs->words[i].type))
2504 curlen++;
2505 if (INTERESTINGWORD(i))
2506 poslen++;
2507 pose = i;
2508 }
2509
2510 if (curlen < max_words)
2511 {
2512 /*
2513 * We have room to lengthen the headline, so search forward
2514 * until it's full or we find a good stopping point. We'll
2515 * reconsider the word at "q", then move forward.
2516 */
2517 for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
2518 {
2519 if (i > q)
2520 {
2521 if (!NONWORDTOKEN(prs->words[i].type))
2522 curlen++;
2523 if (INTERESTINGWORD(i))
2524 poslen++;
2525 }
2526 pose = i;
2527 if (BADENDPOINT(i))
2528 continue;
2529 if (curlen >= min_words)
2530 break;
2531 }
2532 if (curlen < min_words)
2533 {
2534 /*
2535 * Reached end of text and our headline is still shorter
2536 * than min_words, so try to extend it to the left.
2537 */
2538 for (i = p - 1; i >= 0; i--)
2539 {
2540 if (!NONWORDTOKEN(prs->words[i].type))
2541 curlen++;
2542 if (INTERESTINGWORD(i))
2543 poslen++;
2544 if (curlen >= max_words)
2545 break;
2546 if (BADENDPOINT(i))
2547 continue;
2548 if (curlen >= min_words)
2549 break;
2550 }
2551 posb = (i >= 0) ? i : 0;
2552 }
2553 }
2554 else
2555 {
2556 /*
2557 * Can't make headline longer, so consider making it shorter
2558 * if needed to avoid a bad endpoint.
2559 */
2560 if (i > q)
2561 i = q;
2562 for (; curlen > min_words; i--)
2563 {
2564 if (!BADENDPOINT(i))
2565 break;
2566 if (!NONWORDTOKEN(prs->words[i].type))
2567 curlen--;
2568 if (INTERESTINGWORD(i))
2569 poslen--;
2570 pose = i - 1;
2571 }
2572 }
2573
2574 /*
2575 * Check whether the proposed headline includes the original
2576 * cover; it might not if we trimmed it due to max_words.
2577 */
2578 poscover = (posb <= p && pose >= q);
2579
2580 /*
2581 * Adopt this headline if it's better than the last one, giving
2582 * highest priority to headlines including the cover, then to
2583 * headlines with more interesting words, then to headlines with
2584 * good stopping points. (Since bestlen is initially -1, we will
2585 * certainly adopt the first headline.)
2586 */
2587 if (poscover > bestcover ||
2588 (poscover == bestcover && poslen > bestlen) ||
2589 (poscover == bestcover && poslen == bestlen &&
2590 !BADENDPOINT(pose) && BADENDPOINT(beste)))
2591 {
2592 bestb = posb;
2593 beste = pose;
2594 bestlen = poslen;
2595 bestcover = poscover;
2596 }
2597
2598 /* move p to generate the next cover */
2599 p++;
2600 }
2601
2602 /*
2603 * If we found nothing acceptable, select min_words words starting at
2604 * the beginning.
2605 */
2606 if (bestlen < 0)
2607 {
2608 curlen = 0;
2609 pose = 0;
2610 for (i = 0; i < prs->curwords && curlen < min_words; i++)
2611 {
2612 if (!NONWORDTOKEN(prs->words[i].type))
2613 curlen++;
2614 pose = i;
2615 }
2616 bestb = 0;
2617 beste = pose;
2618 }
2619 }
2620 else
2621 {
2622 /* highlightall mode: headline is whole document */
2623 bestb = 0;
2624 beste = prs->curwords - 1;
2625 }
2626
2627 mark_fragment(prs, highlightall, bestb, beste);
2628 }
2629
2630 /*
2631 * Default parser's prsheadline function
2632 */
2633 Datum
prsd_headline(PG_FUNCTION_ARGS)2634 prsd_headline(PG_FUNCTION_ARGS)
2635 {
2636 HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
2637 List *prsoptions = (List *) PG_GETARG_POINTER(1);
2638 TSQuery query = PG_GETARG_TSQUERY(2);
2639
2640 /* default option values: */
2641 int min_words = 15;
2642 int max_words = 35;
2643 int shortword = 3;
2644 int max_fragments = 0;
2645 bool highlightall = false;
2646 int max_cover;
2647 ListCell *l;
2648
2649 /* Extract configuration option values */
2650 prs->startsel = NULL;
2651 prs->stopsel = NULL;
2652 prs->fragdelim = NULL;
2653 foreach(l, prsoptions)
2654 {
2655 DefElem *defel = (DefElem *) lfirst(l);
2656 char *val = defGetString(defel);
2657
2658 if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
2659 max_words = pg_atoi(val, sizeof(int32), 0);
2660 else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
2661 min_words = pg_atoi(val, sizeof(int32), 0);
2662 else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
2663 shortword = pg_atoi(val, sizeof(int32), 0);
2664 else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
2665 max_fragments = pg_atoi(val, sizeof(int32), 0);
2666 else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
2667 prs->startsel = pstrdup(val);
2668 else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
2669 prs->stopsel = pstrdup(val);
2670 else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
2671 prs->fragdelim = pstrdup(val);
2672 else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
2673 highlightall = (pg_strcasecmp(val, "1") == 0 ||
2674 pg_strcasecmp(val, "on") == 0 ||
2675 pg_strcasecmp(val, "true") == 0 ||
2676 pg_strcasecmp(val, "t") == 0 ||
2677 pg_strcasecmp(val, "y") == 0 ||
2678 pg_strcasecmp(val, "yes") == 0);
2679 else
2680 ereport(ERROR,
2681 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2682 errmsg("unrecognized headline parameter: \"%s\"",
2683 defel->defname)));
2684 }
2685
2686 /*
2687 * We might eventually make max_cover a user-settable parameter, but for
2688 * now, just compute a reasonable value based on max_words and
2689 * max_fragments.
2690 */
2691 max_cover = Max(max_words * 10, 100);
2692 if (max_fragments > 0)
2693 max_cover *= max_fragments;
2694
2695 /* in HighlightAll mode these parameters are ignored */
2696 if (!highlightall)
2697 {
2698 if (min_words >= max_words)
2699 ereport(ERROR,
2700 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2701 errmsg("MinWords should be less than MaxWords")));
2702 if (min_words <= 0)
2703 ereport(ERROR,
2704 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2705 errmsg("MinWords should be positive")));
2706 if (shortword < 0)
2707 ereport(ERROR,
2708 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2709 errmsg("ShortWord should be >= 0")));
2710 if (max_fragments < 0)
2711 ereport(ERROR,
2712 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2713 errmsg("MaxFragments should be >= 0")));
2714 }
2715
2716 /* Apply appropriate headline selector */
2717 if (max_fragments == 0)
2718 mark_hl_words(prs, query, highlightall, shortword,
2719 min_words, max_words, max_cover);
2720 else
2721 mark_hl_fragments(prs, query, highlightall, shortword,
2722 min_words, max_words, max_fragments, max_cover);
2723
2724 /* Fill in default values for string options */
2725 if (!prs->startsel)
2726 prs->startsel = pstrdup("<b>");
2727 if (!prs->stopsel)
2728 prs->stopsel = pstrdup("</b>");
2729 if (!prs->fragdelim)
2730 prs->fragdelim = pstrdup(" ... ");
2731
2732 /* Caller will need these lengths, too */
2733 prs->startsellen = strlen(prs->startsel);
2734 prs->stopsellen = strlen(prs->stopsel);
2735 prs->fragdelimlen = strlen(prs->fragdelim);
2736
2737 PG_RETURN_POINTER(prs);
2738 }
2739