1 /*-------------------------------------------------------------------------
2  *
3  * wparser_def.c
4  *		Default text search parser
5  *
6  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *	  src/backend/tsearch/wparser_def.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include <limits.h>
18 
19 #include "catalog/pg_collation.h"
20 #include "commands/defrem.h"
21 #include "tsearch/ts_locale.h"
22 #include "tsearch/ts_public.h"
23 #include "tsearch/ts_type.h"
24 #include "tsearch/ts_utils.h"
25 #include "utils/builtins.h"
26 
27 
28 /* Define me to enable tracing of parser behavior */
29 /* #define WPARSER_TRACE */
30 
31 
/*
 * Output token categories.
 *
 * Category numbers start at 1.  The numbering matches the positions of the
 * corresponding entries in tok_alias[] and lex_descr[] (whose entry 0 is an
 * empty placeholder), so these three lists must be kept in the same order.
 */

#define ASCIIWORD		1
#define WORD_T			2
#define NUMWORD			3
#define EMAIL			4
#define URL_T			5
#define HOST			6
#define SCIENTIFIC		7
#define VERSIONNUMBER	8
#define NUMPARTHWORD	9
#define PARTHWORD		10
#define ASCIIPARTHWORD	11
#define SPACE			12
#define TAG_T			13
#define PROTOCOL		14
#define NUMHWORD		15
#define ASCIIHWORD		16
#define HWORD			17
#define URLPATH			18
#define FILEPATH		19
#define DECIMAL_T		20
#define SIGNEDINT		21
#define UNSIGNEDINT		22
#define XMLENTITY		23

/* highest token category number defined above */
#define LASTNUM			23
59 
/*
 * Short alias of each token category.
 *
 * The array position is the token category number (ASCIIWORD = 1 ...
 * XMLENTITY = 23); entry 0 is an empty placeholder because category
 * numbers start at 1.
 */
static const char *const tok_alias[] = {
	"",							/* 0: unused placeholder */
	"asciiword",				/* ASCIIWORD */
	"word",						/* WORD_T */
	"numword",					/* NUMWORD */
	"email",					/* EMAIL */
	"url",						/* URL_T */
	"host",						/* HOST */
	"sfloat",					/* SCIENTIFIC */
	"version",					/* VERSIONNUMBER */
	"hword_numpart",			/* NUMPARTHWORD */
	"hword_part",				/* PARTHWORD */
	"hword_asciipart",			/* ASCIIPARTHWORD */
	"blank",					/* SPACE */
	"tag",						/* TAG_T */
	"protocol",					/* PROTOCOL */
	"numhword",					/* NUMHWORD */
	"asciihword",				/* ASCIIHWORD */
	"hword",					/* HWORD */
	"url_path",					/* URLPATH */
	"file",						/* FILEPATH */
	"float",					/* DECIMAL_T */
	"int",						/* SIGNEDINT */
	"uint",						/* UNSIGNEDINT */
	"entity"					/* XMLENTITY */
};
86 
/*
 * Human-readable description of each token category.
 *
 * The array position is the token category number (ASCIIWORD = 1 ...
 * XMLENTITY = 23); entry 0 is an empty placeholder because category
 * numbers start at 1.
 */
static const char *const lex_descr[] = {
	"",							/* 0: unused placeholder */
	"Word, all ASCII",			/* ASCIIWORD */
	"Word, all letters",		/* WORD_T */
	"Word, letters and digits", /* NUMWORD */
	"Email address",			/* EMAIL */
	"URL",						/* URL_T */
	"Host",						/* HOST */
	"Scientific notation",		/* SCIENTIFIC */
	"Version number",			/* VERSIONNUMBER */
	"Hyphenated word part, letters and digits", /* NUMPARTHWORD */
	"Hyphenated word part, all letters",	/* PARTHWORD */
	"Hyphenated word part, all ASCII",	/* ASCIIPARTHWORD */
	"Space symbols",			/* SPACE */
	"XML tag",					/* TAG_T */
	"Protocol head",			/* PROTOCOL */
	"Hyphenated word, letters and digits",	/* NUMHWORD */
	"Hyphenated word, all ASCII",	/* ASCIIHWORD */
	"Hyphenated word, all letters", /* HWORD */
	"URL path",					/* URLPATH */
	"File or path name",		/* FILEPATH */
	"Decimal notation",			/* DECIMAL_T */
	"Signed integer",			/* SIGNEDINT */
	"Unsigned integer",			/* UNSIGNEDINT */
	"XML entity"				/* XMLENTITY */
};
113 
114 
/*
 * Parser states.
 *
 * TPS_Base is the starting state (value 0).  TPS_Null is a fake value that
 * marks the end of the list; it does not name a real state.  Values are
 * assigned consecutively, so the order of the members must not be changed
 * casually.
 */

typedef enum
{
	TPS_Base = 0,
	TPS_InNumWord,
	TPS_InAsciiWord,
	TPS_InWord,
	TPS_InUnsignedInt,
	TPS_InSignedIntFirst,
	TPS_InSignedInt,
	TPS_InSpace,
	TPS_InUDecimalFirst,
	TPS_InUDecimal,
	TPS_InDecimalFirst,
	TPS_InDecimal,
	TPS_InVerVersion,
	TPS_InSVerVersion,
	TPS_InVersionFirst,
	TPS_InVersion,
	TPS_InMantissaFirst,
	TPS_InMantissaSign,
	TPS_InMantissa,
	TPS_InXMLEntityFirst,
	TPS_InXMLEntity,
	TPS_InXMLEntityNumFirst,
	TPS_InXMLEntityNum,
	TPS_InXMLEntityHexNumFirst,
	TPS_InXMLEntityHexNum,
	TPS_InXMLEntityEnd,
	TPS_InTagFirst,
	TPS_InXMLBegin,
	TPS_InTagCloseFirst,
	TPS_InTagName,
	TPS_InTagBeginEnd,
	TPS_InTag,
	TPS_InTagEscapeK,
	TPS_InTagEscapeKK,
	TPS_InTagBackSleshed,
	TPS_InTagEnd,
	TPS_InCommentFirst,
	TPS_InCommentLast,
	TPS_InComment,
	TPS_InCloseCommentFirst,
	TPS_InCloseCommentLast,
	TPS_InCommentEnd,
	TPS_InHostFirstDomain,
	TPS_InHostDomainSecond,
	TPS_InHostDomain,
	TPS_InPortFirst,
	TPS_InPort,
	TPS_InHostFirstAN,
	TPS_InHost,
	TPS_InEmail,
	TPS_InFileFirst,
	TPS_InFileTwiddle,
	TPS_InPathFirst,
	TPS_InPathFirstFirst,
	TPS_InPathSecond,
	TPS_InFile,
	TPS_InFileNext,
	TPS_InURLPathFirst,
	TPS_InURLPathStart,
	TPS_InURLPath,
	TPS_InFURL,
	TPS_InProtocolFirst,
	TPS_InProtocolSecond,
	TPS_InProtocolEnd,
	TPS_InHyphenAsciiWordFirst,
	TPS_InHyphenAsciiWord,
	TPS_InHyphenWordFirst,
	TPS_InHyphenWord,
	TPS_InHyphenNumWordFirst,
	TPS_InHyphenNumWord,
	TPS_InHyphenDigitLookahead,
	TPS_InParseHyphen,
	TPS_InParseHyphenHyphen,
	TPS_InHyphenWordPart,
	TPS_InHyphenAsciiWordPart,
	TPS_InHyphenNumWordPart,
	TPS_InHyphenUnsignedInt,
	TPS_Null					/* last state (fake value) */
} TParserState;
198 
199 /* forward declaration */
200 struct TParser;
201 
202 typedef int (*TParserCharTest) (struct TParser *);	/* any p_is* functions
203 													 * except p_iseq */
204 typedef void (*TParserSpecial) (struct TParser *);	/* special handler for
205 													 * special cases... */
206 
207 typedef struct
208 {
209 	TParserCharTest isclass;
210 	char		c;
211 	uint16		flags;
212 	TParserState tostate;
213 	int			type;
214 	TParserSpecial special;
215 } TParserStateActionItem;
216 
/*
 * Flag bits in TParserStateActionItem.flags.
 *
 * A_NEXT is zero, i.e. it stands for "no flag bits set"; the others are
 * distinct single bits.
 */
#define A_NEXT		0x0000
#define A_BINGO		0x0001
#define A_POP		0x0002
#define A_PUSH		0x0004
#define A_RERUN		0x0008
#define A_CLEAR		0x0010
#define A_MERGE		0x0020
#define A_CLRALL	0x0040
226 
227 typedef struct TParserPosition
228 {
229 	int			posbyte;		/* position of parser in bytes */
230 	int			poschar;		/* position of parser in characters */
231 	int			charlen;		/* length of current char */
232 	int			lenbytetoken;	/* length of token-so-far in bytes */
233 	int			lenchartoken;	/* and in chars */
234 	TParserState state;
235 	struct TParserPosition *prev;
236 	const TParserStateActionItem *pushedAtAction;
237 } TParserPosition;
238 
239 typedef struct TParser
240 {
241 	/* string and position information */
242 	char	   *str;			/* multibyte string */
243 	int			lenstr;			/* length of mbstring */
244 	wchar_t    *wstr;			/* wide character string */
245 	pg_wchar   *pgwstr;			/* wide character string for C-locale */
246 	bool		usewide;
247 
248 	/* State of parse */
249 	int			charmaxlen;
250 	TParserPosition *state;
251 	bool		ignore;
252 	bool		wanthost;
253 
254 	/* silly char */
255 	char		c;
256 
257 	/* out */
258 	char	   *token;
259 	int			lenbytetoken;
260 	int			lenchartoken;
261 	int			type;
262 } TParser;
263 
264 
265 /* forward decls here */
266 static bool TParserGet(TParser *prs);
267 
268 
269 static TParserPosition *
newTParserPosition(TParserPosition * prev)270 newTParserPosition(TParserPosition *prev)
271 {
272 	TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition));
273 
274 	if (prev)
275 		memcpy(res, prev, sizeof(TParserPosition));
276 	else
277 		memset(res, 0, sizeof(TParserPosition));
278 
279 	res->prev = prev;
280 
281 	res->pushedAtAction = NULL;
282 
283 	return res;
284 }
285 
286 static TParser *
TParserInit(char * str,int len)287 TParserInit(char *str, int len)
288 {
289 	TParser    *prs = (TParser *) palloc0(sizeof(TParser));
290 
291 	prs->charmaxlen = pg_database_encoding_max_length();
292 	prs->str = str;
293 	prs->lenstr = len;
294 
295 	/*
296 	 * Use wide char code only when max encoding length > 1.
297 	 */
298 	if (prs->charmaxlen > 1)
299 	{
300 		Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
301 		pg_locale_t mylocale = 0;	/* TODO */
302 
303 		prs->usewide = true;
304 		if (lc_ctype_is_c(collation))
305 		{
306 			/*
307 			 * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
308 			 * be different from sizeof(wchar_t)
309 			 */
310 			prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
311 			pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
312 		}
313 		else
314 		{
315 			prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
316 			char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
317 					   mylocale);
318 		}
319 	}
320 	else
321 		prs->usewide = false;
322 
323 	prs->state = newTParserPosition(NULL);
324 	prs->state->state = TPS_Base;
325 
326 #ifdef WPARSER_TRACE
327 
328 	/*
329 	 * Use of %.*s here is a bit risky since it can misbehave if the data is
330 	 * not in what libc thinks is the prevailing encoding.  However, since
331 	 * this is just a debugging aid, we choose to live with that.
332 	 */
333 	fprintf(stderr, "parsing \"%.*s\"\n", len, str);
334 #endif
335 
336 	return prs;
337 }
338 
339 /*
340  * As an alternative to a full TParserInit one can create a
341  * TParserCopy which basically is a regular TParser without a private
342  * copy of the string - instead it uses the one from another TParser.
343  * This is useful because at some places TParsers are created
344  * recursively and the repeated copying around of the strings can
345  * cause major inefficiency if the source string is long.
346  * The new parser starts parsing at the original's current position.
347  *
348  * Obviously one must not close the original TParser before the copy.
349  */
350 static TParser *
TParserCopyInit(const TParser * orig)351 TParserCopyInit(const TParser *orig)
352 {
353 	TParser    *prs = (TParser *) palloc0(sizeof(TParser));
354 
355 	prs->charmaxlen = orig->charmaxlen;
356 	prs->str = orig->str + orig->state->posbyte;
357 	prs->lenstr = orig->lenstr - orig->state->posbyte;
358 	prs->usewide = orig->usewide;
359 
360 	if (orig->pgwstr)
361 		prs->pgwstr = orig->pgwstr + orig->state->poschar;
362 	if (orig->wstr)
363 		prs->wstr = orig->wstr + orig->state->poschar;
364 
365 	prs->state = newTParserPosition(NULL);
366 	prs->state->state = TPS_Base;
367 
368 #ifdef WPARSER_TRACE
369 	/* See note above about %.*s */
370 	fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str);
371 #endif
372 
373 	return prs;
374 }
375 
376 
377 static void
TParserClose(TParser * prs)378 TParserClose(TParser *prs)
379 {
380 	while (prs->state)
381 	{
382 		TParserPosition *ptr = prs->state->prev;
383 
384 		pfree(prs->state);
385 		prs->state = ptr;
386 	}
387 
388 	if (prs->wstr)
389 		pfree(prs->wstr);
390 	if (prs->pgwstr)
391 		pfree(prs->pgwstr);
392 
393 #ifdef WPARSER_TRACE
394 	fprintf(stderr, "closing parser\n");
395 #endif
396 	pfree(prs);
397 }
398 
399 /*
400  * Close a parser created with TParserCopyInit
401  */
402 static void
TParserCopyClose(TParser * prs)403 TParserCopyClose(TParser *prs)
404 {
405 	while (prs->state)
406 	{
407 		TParserPosition *ptr = prs->state->prev;
408 
409 		pfree(prs->state);
410 		prs->state = ptr;
411 	}
412 
413 #ifdef WPARSER_TRACE
414 	fprintf(stderr, "closing parser copy\n");
415 #endif
416 	pfree(prs);
417 }
418 
419 
/*
 * Character-type support functions, equivalent to is* macros, but
 * working with any possible encodings and locales. Notes:
 *	- with multibyte encoding and C-locale isw* function may fail
 *	  or give wrong result.
 *	- multibyte encoding and C-locale often are used for
 *	  Asian languages.
 *	- if locale is C then we use pgwstr instead of wstr.
 *
 * p_iswhat(type, nonascii) defines two functions:
 *	p_is<type>(prs)    - test the character at the parser's current position
 *	p_isnot<type>(prs) - logical negation of p_is<type>
 *
 * Which test is applied depends on the parser:
 *	- usewide with pgwstr: characters above 0x7f yield "nonascii"; others
 *	  go through is<type>().
 *	- usewide without pgwstr: isw<type>() on the wstr character.
 *	- otherwise: is<type>() on the current byte of str, as unsigned char.
 */

#define p_iswhat(type, nonascii)											\
																			\
static int																	\
p_is##type(TParser *prs)													\
{																			\
	Assert(prs->state);														\
	if (prs->usewide)														\
	{																		\
		if (prs->pgwstr)													\
		{																	\
			unsigned int c = *(prs->pgwstr + prs->state->poschar);			\
			if (c > 0x7f)													\
				return nonascii;											\
			return is##type(c);												\
		}																	\
		return isw##type(*(prs->wstr + prs->state->poschar));				\
	}																		\
	return is##type(*(unsigned char *) (prs->str + prs->state->posbyte));	\
}																			\
																			\
static int																	\
p_isnot##type(TParser *prs)													\
{																			\
	return !p_is##type(prs);												\
}
455 
456 /*
457  * In C locale with a multibyte encoding, any non-ASCII symbol is considered
458  * an alpha character, but not a member of other char classes.
459  */
460 p_iswhat(alnum, 1)
461 p_iswhat(alpha, 1)
462 p_iswhat(digit, 0)
463 p_iswhat(lower, 0)
464 p_iswhat(print, 0)
465 p_iswhat(punct, 0)
466 p_iswhat(space, 0)
467 p_iswhat(upper, 0)
468 p_iswhat(xdigit, 0)
469 
470 /* p_iseq should be used only for ascii symbols */
471 
472 static int
p_iseq(TParser * prs,char c)473 p_iseq(TParser *prs, char c)
474 {
475 	Assert(prs->state);
476 	return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
477 }
478 
479 static int
p_isEOF(TParser * prs)480 p_isEOF(TParser *prs)
481 {
482 	Assert(prs->state);
483 	return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
484 }
485 
486 static int
p_iseqC(TParser * prs)487 p_iseqC(TParser *prs)
488 {
489 	return p_iseq(prs, prs->c);
490 }
491 
492 static int
p_isneC(TParser * prs)493 p_isneC(TParser *prs)
494 {
495 	return !p_iseq(prs, prs->c);
496 }
497 
498 static int
p_isascii(TParser * prs)499 p_isascii(TParser *prs)
500 {
501 	return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
502 }
503 
504 static int
p_isasclet(TParser * prs)505 p_isasclet(TParser *prs)
506 {
507 	return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
508 }
509 
510 static int
p_isurlchar(TParser * prs)511 p_isurlchar(TParser *prs)
512 {
513 	char		ch;
514 
515 	/* no non-ASCII need apply */
516 	if (prs->state->charlen != 1)
517 		return 0;
518 	ch = *(prs->str + prs->state->posbyte);
519 	/* no spaces or control characters */
520 	if (ch <= 0x20 || ch >= 0x7F)
521 		return 0;
522 	/* reject characters disallowed by RFC 3986 */
523 	switch (ch)
524 	{
525 		case '"':
526 		case '<':
527 		case '>':
528 		case '\\':
529 		case '^':
530 		case '`':
531 		case '{':
532 		case '|':
533 		case '}':
534 			return 0;
535 	}
536 	return 1;
537 }
538 
539 
/*
 * deliberately suppress unused-function complaints for the above
 *
 * This function only exists to reference every generated p_is*() /
 * p_isnot*() function once.  NOTE: each call passes NULL, which those
 * functions dereference, so this is presumably never meant to be executed
 * -- confirm before calling it from anywhere.
 */
void		_make_compiler_happy(void);
void
_make_compiler_happy(void)
{
	p_isalnum(NULL);
	p_isnotalnum(NULL);
	p_isalpha(NULL);
	p_isnotalpha(NULL);
	p_isdigit(NULL);
	p_isnotdigit(NULL);
	p_islower(NULL);
	p_isnotlower(NULL);
	p_isprint(NULL);
	p_isnotprint(NULL);
	p_ispunct(NULL);
	p_isnotpunct(NULL);
	p_isspace(NULL);
	p_isnotspace(NULL);
	p_isupper(NULL);
	p_isnotupper(NULL);
	p_isxdigit(NULL);
	p_isnotxdigit(NULL);
	p_isEOF(NULL);
	p_iseqC(NULL);
	p_isneC(NULL);
}
567 
568 
569 static void
SpecialTags(TParser * prs)570 SpecialTags(TParser *prs)
571 {
572 	switch (prs->state->lenchartoken)
573 	{
574 		case 8:					/* </script */
575 			if (pg_strncasecmp(prs->token, "</script", 8) == 0)
576 				prs->ignore = false;
577 			break;
578 		case 7:					/* <script || </style */
579 			if (pg_strncasecmp(prs->token, "</style", 7) == 0)
580 				prs->ignore = false;
581 			else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
582 				prs->ignore = true;
583 			break;
584 		case 6:					/* <style */
585 			if (pg_strncasecmp(prs->token, "<style", 6) == 0)
586 				prs->ignore = true;
587 			break;
588 		default:
589 			break;
590 	}
591 }
592 
593 static void
SpecialFURL(TParser * prs)594 SpecialFURL(TParser *prs)
595 {
596 	prs->wanthost = true;
597 	prs->state->posbyte -= prs->state->lenbytetoken;
598 	prs->state->poschar -= prs->state->lenchartoken;
599 }
600 
601 static void
SpecialHyphen(TParser * prs)602 SpecialHyphen(TParser *prs)
603 {
604 	prs->state->posbyte -= prs->state->lenbytetoken;
605 	prs->state->poschar -= prs->state->lenchartoken;
606 }
607 
608 static void
SpecialVerVersion(TParser * prs)609 SpecialVerVersion(TParser *prs)
610 {
611 	prs->state->posbyte -= prs->state->lenbytetoken;
612 	prs->state->poschar -= prs->state->lenchartoken;
613 	prs->state->lenbytetoken = 0;
614 	prs->state->lenchartoken = 0;
615 }
616 
617 static int
p_isstophost(TParser * prs)618 p_isstophost(TParser *prs)
619 {
620 	if (prs->wanthost)
621 	{
622 		prs->wanthost = false;
623 		return 1;
624 	}
625 	return 0;
626 }
627 
628 static int
p_isignore(TParser * prs)629 p_isignore(TParser *prs)
630 {
631 	return (prs->ignore) ? 1 : 0;
632 }
633 
634 static int
p_ishost(TParser * prs)635 p_ishost(TParser *prs)
636 {
637 	TParser    *tmpprs = TParserCopyInit(prs);
638 	int			res = 0;
639 
640 	tmpprs->wanthost = true;
641 
642 	if (TParserGet(tmpprs) && tmpprs->type == HOST)
643 	{
644 		prs->state->posbyte += tmpprs->lenbytetoken;
645 		prs->state->poschar += tmpprs->lenchartoken;
646 		prs->state->lenbytetoken += tmpprs->lenbytetoken;
647 		prs->state->lenchartoken += tmpprs->lenchartoken;
648 		prs->state->charlen = tmpprs->state->charlen;
649 		res = 1;
650 	}
651 	TParserCopyClose(tmpprs);
652 
653 	return res;
654 }
655 
656 static int
p_isURLPath(TParser * prs)657 p_isURLPath(TParser *prs)
658 {
659 	TParser    *tmpprs = TParserCopyInit(prs);
660 	int			res = 0;
661 
662 	tmpprs->state = newTParserPosition(tmpprs->state);
663 	tmpprs->state->state = TPS_InURLPathFirst;
664 
665 	if (TParserGet(tmpprs) && tmpprs->type == URLPATH)
666 	{
667 		prs->state->posbyte += tmpprs->lenbytetoken;
668 		prs->state->poschar += tmpprs->lenchartoken;
669 		prs->state->lenbytetoken += tmpprs->lenbytetoken;
670 		prs->state->lenchartoken += tmpprs->lenchartoken;
671 		prs->state->charlen = tmpprs->state->charlen;
672 		res = 1;
673 	}
674 	TParserCopyClose(tmpprs);
675 
676 	return res;
677 }
678 
679 /*
680  * returns true if current character has zero display length or
681  * it's a special sign in several languages. Such characters
682  * aren't a word-breaker although they aren't an isalpha.
683  * In beginning of word they aren't a part of it.
684  */
685 static int
p_isspecial(TParser * prs)686 p_isspecial(TParser *prs)
687 {
688 	/*
689 	 * pg_dsplen could return -1 which means error or control character
690 	 */
691 	if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
692 		return 1;
693 
694 	/*
695 	 * Unicode Characters in the 'Mark, Spacing Combining' Category That
696 	 * characters are not alpha although they are not breakers of word too.
697 	 * Check that only in utf encoding, because other encodings aren't
698 	 * supported by postgres or even exists.
699 	 */
700 	if (GetDatabaseEncoding() == PG_UTF8 && prs->usewide)
701 	{
702 		static const pg_wchar strange_letter[] = {
703 			/*
704 			 * use binary search, so elements should be ordered
705 			 */
706 			0x0903,				/* DEVANAGARI SIGN VISARGA */
707 			0x093E,				/* DEVANAGARI VOWEL SIGN AA */
708 			0x093F,				/* DEVANAGARI VOWEL SIGN I */
709 			0x0940,				/* DEVANAGARI VOWEL SIGN II */
710 			0x0949,				/* DEVANAGARI VOWEL SIGN CANDRA O */
711 			0x094A,				/* DEVANAGARI VOWEL SIGN SHORT O */
712 			0x094B,				/* DEVANAGARI VOWEL SIGN O */
713 			0x094C,				/* DEVANAGARI VOWEL SIGN AU */
714 			0x0982,				/* BENGALI SIGN ANUSVARA */
715 			0x0983,				/* BENGALI SIGN VISARGA */
716 			0x09BE,				/* BENGALI VOWEL SIGN AA */
717 			0x09BF,				/* BENGALI VOWEL SIGN I */
718 			0x09C0,				/* BENGALI VOWEL SIGN II */
719 			0x09C7,				/* BENGALI VOWEL SIGN E */
720 			0x09C8,				/* BENGALI VOWEL SIGN AI */
721 			0x09CB,				/* BENGALI VOWEL SIGN O */
722 			0x09CC,				/* BENGALI VOWEL SIGN AU */
723 			0x09D7,				/* BENGALI AU LENGTH MARK */
724 			0x0A03,				/* GURMUKHI SIGN VISARGA */
725 			0x0A3E,				/* GURMUKHI VOWEL SIGN AA */
726 			0x0A3F,				/* GURMUKHI VOWEL SIGN I */
727 			0x0A40,				/* GURMUKHI VOWEL SIGN II */
728 			0x0A83,				/* GUJARATI SIGN VISARGA */
729 			0x0ABE,				/* GUJARATI VOWEL SIGN AA */
730 			0x0ABF,				/* GUJARATI VOWEL SIGN I */
731 			0x0AC0,				/* GUJARATI VOWEL SIGN II */
732 			0x0AC9,				/* GUJARATI VOWEL SIGN CANDRA O */
733 			0x0ACB,				/* GUJARATI VOWEL SIGN O */
734 			0x0ACC,				/* GUJARATI VOWEL SIGN AU */
735 			0x0B02,				/* ORIYA SIGN ANUSVARA */
736 			0x0B03,				/* ORIYA SIGN VISARGA */
737 			0x0B3E,				/* ORIYA VOWEL SIGN AA */
738 			0x0B40,				/* ORIYA VOWEL SIGN II */
739 			0x0B47,				/* ORIYA VOWEL SIGN E */
740 			0x0B48,				/* ORIYA VOWEL SIGN AI */
741 			0x0B4B,				/* ORIYA VOWEL SIGN O */
742 			0x0B4C,				/* ORIYA VOWEL SIGN AU */
743 			0x0B57,				/* ORIYA AU LENGTH MARK */
744 			0x0BBE,				/* TAMIL VOWEL SIGN AA */
745 			0x0BBF,				/* TAMIL VOWEL SIGN I */
746 			0x0BC1,				/* TAMIL VOWEL SIGN U */
747 			0x0BC2,				/* TAMIL VOWEL SIGN UU */
748 			0x0BC6,				/* TAMIL VOWEL SIGN E */
749 			0x0BC7,				/* TAMIL VOWEL SIGN EE */
750 			0x0BC8,				/* TAMIL VOWEL SIGN AI */
751 			0x0BCA,				/* TAMIL VOWEL SIGN O */
752 			0x0BCB,				/* TAMIL VOWEL SIGN OO */
753 			0x0BCC,				/* TAMIL VOWEL SIGN AU */
754 			0x0BD7,				/* TAMIL AU LENGTH MARK */
755 			0x0C01,				/* TELUGU SIGN CANDRABINDU */
756 			0x0C02,				/* TELUGU SIGN ANUSVARA */
757 			0x0C03,				/* TELUGU SIGN VISARGA */
758 			0x0C41,				/* TELUGU VOWEL SIGN U */
759 			0x0C42,				/* TELUGU VOWEL SIGN UU */
760 			0x0C43,				/* TELUGU VOWEL SIGN VOCALIC R */
761 			0x0C44,				/* TELUGU VOWEL SIGN VOCALIC RR */
762 			0x0C82,				/* KANNADA SIGN ANUSVARA */
763 			0x0C83,				/* KANNADA SIGN VISARGA */
764 			0x0CBE,				/* KANNADA VOWEL SIGN AA */
765 			0x0CC0,				/* KANNADA VOWEL SIGN II */
766 			0x0CC1,				/* KANNADA VOWEL SIGN U */
767 			0x0CC2,				/* KANNADA VOWEL SIGN UU */
768 			0x0CC3,				/* KANNADA VOWEL SIGN VOCALIC R */
769 			0x0CC4,				/* KANNADA VOWEL SIGN VOCALIC RR */
770 			0x0CC7,				/* KANNADA VOWEL SIGN EE */
771 			0x0CC8,				/* KANNADA VOWEL SIGN AI */
772 			0x0CCA,				/* KANNADA VOWEL SIGN O */
773 			0x0CCB,				/* KANNADA VOWEL SIGN OO */
774 			0x0CD5,				/* KANNADA LENGTH MARK */
775 			0x0CD6,				/* KANNADA AI LENGTH MARK */
776 			0x0D02,				/* MALAYALAM SIGN ANUSVARA */
777 			0x0D03,				/* MALAYALAM SIGN VISARGA */
778 			0x0D3E,				/* MALAYALAM VOWEL SIGN AA */
779 			0x0D3F,				/* MALAYALAM VOWEL SIGN I */
780 			0x0D40,				/* MALAYALAM VOWEL SIGN II */
781 			0x0D46,				/* MALAYALAM VOWEL SIGN E */
782 			0x0D47,				/* MALAYALAM VOWEL SIGN EE */
783 			0x0D48,				/* MALAYALAM VOWEL SIGN AI */
784 			0x0D4A,				/* MALAYALAM VOWEL SIGN O */
785 			0x0D4B,				/* MALAYALAM VOWEL SIGN OO */
786 			0x0D4C,				/* MALAYALAM VOWEL SIGN AU */
787 			0x0D57,				/* MALAYALAM AU LENGTH MARK */
788 			0x0D82,				/* SINHALA SIGN ANUSVARAYA */
789 			0x0D83,				/* SINHALA SIGN VISARGAYA */
790 			0x0DCF,				/* SINHALA VOWEL SIGN AELA-PILLA */
791 			0x0DD0,				/* SINHALA VOWEL SIGN KETTI AEDA-PILLA */
792 			0x0DD1,				/* SINHALA VOWEL SIGN DIGA AEDA-PILLA */
793 			0x0DD8,				/* SINHALA VOWEL SIGN GAETTA-PILLA */
794 			0x0DD9,				/* SINHALA VOWEL SIGN KOMBUVA */
795 			0x0DDA,				/* SINHALA VOWEL SIGN DIGA KOMBUVA */
796 			0x0DDB,				/* SINHALA VOWEL SIGN KOMBU DEKA */
797 			0x0DDC,				/* SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA */
798 			0x0DDD,				/* SINHALA VOWEL SIGN KOMBUVA HAA DIGA
799 								 * AELA-PILLA */
800 			0x0DDE,				/* SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA */
801 			0x0DDF,				/* SINHALA VOWEL SIGN GAYANUKITTA */
802 			0x0DF2,				/* SINHALA VOWEL SIGN DIGA GAETTA-PILLA */
803 			0x0DF3,				/* SINHALA VOWEL SIGN DIGA GAYANUKITTA */
804 			0x0F3E,				/* TIBETAN SIGN YAR TSHES */
805 			0x0F3F,				/* TIBETAN SIGN MAR TSHES */
806 			0x0F7F,				/* TIBETAN SIGN RNAM BCAD */
807 			0x102B,				/* MYANMAR VOWEL SIGN TALL AA */
808 			0x102C,				/* MYANMAR VOWEL SIGN AA */
809 			0x1031,				/* MYANMAR VOWEL SIGN E */
810 			0x1038,				/* MYANMAR SIGN VISARGA */
811 			0x103B,				/* MYANMAR CONSONANT SIGN MEDIAL YA */
812 			0x103C,				/* MYANMAR CONSONANT SIGN MEDIAL RA */
813 			0x1056,				/* MYANMAR VOWEL SIGN VOCALIC R */
814 			0x1057,				/* MYANMAR VOWEL SIGN VOCALIC RR */
815 			0x1062,				/* MYANMAR VOWEL SIGN SGAW KAREN EU */
816 			0x1063,				/* MYANMAR TONE MARK SGAW KAREN HATHI */
817 			0x1064,				/* MYANMAR TONE MARK SGAW KAREN KE PHO */
818 			0x1067,				/* MYANMAR VOWEL SIGN WESTERN PWO KAREN EU */
819 			0x1068,				/* MYANMAR VOWEL SIGN WESTERN PWO KAREN UE */
820 			0x1069,				/* MYANMAR SIGN WESTERN PWO KAREN TONE-1 */
821 			0x106A,				/* MYANMAR SIGN WESTERN PWO KAREN TONE-2 */
822 			0x106B,				/* MYANMAR SIGN WESTERN PWO KAREN TONE-3 */
823 			0x106C,				/* MYANMAR SIGN WESTERN PWO KAREN TONE-4 */
824 			0x106D,				/* MYANMAR SIGN WESTERN PWO KAREN TONE-5 */
825 			0x1083,				/* MYANMAR VOWEL SIGN SHAN AA */
826 			0x1084,				/* MYANMAR VOWEL SIGN SHAN E */
827 			0x1087,				/* MYANMAR SIGN SHAN TONE-2 */
828 			0x1088,				/* MYANMAR SIGN SHAN TONE-3 */
829 			0x1089,				/* MYANMAR SIGN SHAN TONE-5 */
830 			0x108A,				/* MYANMAR SIGN SHAN TONE-6 */
831 			0x108B,				/* MYANMAR SIGN SHAN COUNCIL TONE-2 */
832 			0x108C,				/* MYANMAR SIGN SHAN COUNCIL TONE-3 */
833 			0x108F,				/* MYANMAR SIGN RUMAI PALAUNG TONE-5 */
834 			0x17B6,				/* KHMER VOWEL SIGN AA */
835 			0x17BE,				/* KHMER VOWEL SIGN OE */
836 			0x17BF,				/* KHMER VOWEL SIGN YA */
837 			0x17C0,				/* KHMER VOWEL SIGN IE */
838 			0x17C1,				/* KHMER VOWEL SIGN E */
839 			0x17C2,				/* KHMER VOWEL SIGN AE */
840 			0x17C3,				/* KHMER VOWEL SIGN AI */
841 			0x17C4,				/* KHMER VOWEL SIGN OO */
842 			0x17C5,				/* KHMER VOWEL SIGN AU */
843 			0x17C7,				/* KHMER SIGN REAHMUK */
844 			0x17C8,				/* KHMER SIGN YUUKALEAPINTU */
845 			0x1923,				/* LIMBU VOWEL SIGN EE */
846 			0x1924,				/* LIMBU VOWEL SIGN AI */
847 			0x1925,				/* LIMBU VOWEL SIGN OO */
848 			0x1926,				/* LIMBU VOWEL SIGN AU */
849 			0x1929,				/* LIMBU SUBJOINED LETTER YA */
850 			0x192A,				/* LIMBU SUBJOINED LETTER RA */
851 			0x192B,				/* LIMBU SUBJOINED LETTER WA */
852 			0x1930,				/* LIMBU SMALL LETTER KA */
853 			0x1931,				/* LIMBU SMALL LETTER NGA */
854 			0x1933,				/* LIMBU SMALL LETTER TA */
855 			0x1934,				/* LIMBU SMALL LETTER NA */
856 			0x1935,				/* LIMBU SMALL LETTER PA */
857 			0x1936,				/* LIMBU SMALL LETTER MA */
858 			0x1937,				/* LIMBU SMALL LETTER RA */
859 			0x1938,				/* LIMBU SMALL LETTER LA */
860 			0x19B0,				/* NEW TAI LUE VOWEL SIGN VOWEL SHORTENER */
861 			0x19B1,				/* NEW TAI LUE VOWEL SIGN AA */
862 			0x19B2,				/* NEW TAI LUE VOWEL SIGN II */
863 			0x19B3,				/* NEW TAI LUE VOWEL SIGN U */
864 			0x19B4,				/* NEW TAI LUE VOWEL SIGN UU */
865 			0x19B5,				/* NEW TAI LUE VOWEL SIGN E */
866 			0x19B6,				/* NEW TAI LUE VOWEL SIGN AE */
867 			0x19B7,				/* NEW TAI LUE VOWEL SIGN O */
868 			0x19B8,				/* NEW TAI LUE VOWEL SIGN OA */
869 			0x19B9,				/* NEW TAI LUE VOWEL SIGN UE */
870 			0x19BA,				/* NEW TAI LUE VOWEL SIGN AY */
871 			0x19BB,				/* NEW TAI LUE VOWEL SIGN AAY */
872 			0x19BC,				/* NEW TAI LUE VOWEL SIGN UY */
873 			0x19BD,				/* NEW TAI LUE VOWEL SIGN OY */
874 			0x19BE,				/* NEW TAI LUE VOWEL SIGN OAY */
875 			0x19BF,				/* NEW TAI LUE VOWEL SIGN UEY */
876 			0x19C0,				/* NEW TAI LUE VOWEL SIGN IY */
877 			0x19C8,				/* NEW TAI LUE TONE MARK-1 */
878 			0x19C9,				/* NEW TAI LUE TONE MARK-2 */
879 			0x1A19,				/* BUGINESE VOWEL SIGN E */
880 			0x1A1A,				/* BUGINESE VOWEL SIGN O */
881 			0x1A1B,				/* BUGINESE VOWEL SIGN AE */
882 			0x1B04,				/* BALINESE SIGN BISAH */
883 			0x1B35,				/* BALINESE VOWEL SIGN TEDUNG */
884 			0x1B3B,				/* BALINESE VOWEL SIGN RA REPA TEDUNG */
885 			0x1B3D,				/* BALINESE VOWEL SIGN LA LENGA TEDUNG */
886 			0x1B3E,				/* BALINESE VOWEL SIGN TALING */
887 			0x1B3F,				/* BALINESE VOWEL SIGN TALING REPA */
888 			0x1B40,				/* BALINESE VOWEL SIGN TALING TEDUNG */
889 			0x1B41,				/* BALINESE VOWEL SIGN TALING REPA TEDUNG */
890 			0x1B43,				/* BALINESE VOWEL SIGN PEPET TEDUNG */
891 			0x1B44,				/* BALINESE ADEG ADEG */
892 			0x1B82,				/* SUNDANESE SIGN PANGWISAD */
893 			0x1BA1,				/* SUNDANESE CONSONANT SIGN PAMINGKAL */
894 			0x1BA6,				/* SUNDANESE VOWEL SIGN PANAELAENG */
895 			0x1BA7,				/* SUNDANESE VOWEL SIGN PANOLONG */
896 			0x1BAA,				/* SUNDANESE SIGN PAMAAEH */
897 			0x1C24,				/* LEPCHA SUBJOINED LETTER YA */
898 			0x1C25,				/* LEPCHA SUBJOINED LETTER RA */
899 			0x1C26,				/* LEPCHA VOWEL SIGN AA */
900 			0x1C27,				/* LEPCHA VOWEL SIGN I */
901 			0x1C28,				/* LEPCHA VOWEL SIGN O */
902 			0x1C29,				/* LEPCHA VOWEL SIGN OO */
903 			0x1C2A,				/* LEPCHA VOWEL SIGN U */
904 			0x1C2B,				/* LEPCHA VOWEL SIGN UU */
905 			0x1C34,				/* LEPCHA CONSONANT SIGN NYIN-DO */
906 			0x1C35,				/* LEPCHA CONSONANT SIGN KANG */
907 			0xA823,				/* SYLOTI NAGRI VOWEL SIGN A */
908 			0xA824,				/* SYLOTI NAGRI VOWEL SIGN I */
909 			0xA827,				/* SYLOTI NAGRI VOWEL SIGN OO */
910 			0xA880,				/* SAURASHTRA SIGN ANUSVARA */
911 			0xA881,				/* SAURASHTRA SIGN VISARGA */
912 			0xA8B4,				/* SAURASHTRA CONSONANT SIGN HAARU */
913 			0xA8B5,				/* SAURASHTRA VOWEL SIGN AA */
914 			0xA8B6,				/* SAURASHTRA VOWEL SIGN I */
915 			0xA8B7,				/* SAURASHTRA VOWEL SIGN II */
916 			0xA8B8,				/* SAURASHTRA VOWEL SIGN U */
917 			0xA8B9,				/* SAURASHTRA VOWEL SIGN UU */
918 			0xA8BA,				/* SAURASHTRA VOWEL SIGN VOCALIC R */
919 			0xA8BB,				/* SAURASHTRA VOWEL SIGN VOCALIC RR */
920 			0xA8BC,				/* SAURASHTRA VOWEL SIGN VOCALIC L */
921 			0xA8BD,				/* SAURASHTRA VOWEL SIGN VOCALIC LL */
922 			0xA8BE,				/* SAURASHTRA VOWEL SIGN E */
923 			0xA8BF,				/* SAURASHTRA VOWEL SIGN EE */
924 			0xA8C0,				/* SAURASHTRA VOWEL SIGN AI */
925 			0xA8C1,				/* SAURASHTRA VOWEL SIGN O */
926 			0xA8C2,				/* SAURASHTRA VOWEL SIGN OO */
927 			0xA8C3,				/* SAURASHTRA VOWEL SIGN AU */
928 			0xA952,				/* REJANG CONSONANT SIGN H */
929 			0xA953,				/* REJANG VIRAMA */
930 			0xAA2F,				/* CHAM VOWEL SIGN O */
931 			0xAA30,				/* CHAM VOWEL SIGN AI */
932 			0xAA33,				/* CHAM CONSONANT SIGN YA */
933 			0xAA34,				/* CHAM CONSONANT SIGN RA */
934 			0xAA4D				/* CHAM CONSONANT SIGN FINAL H */
935 		};
936 		const pg_wchar *StopLow = strange_letter,
937 				   *StopHigh = strange_letter + lengthof(strange_letter),
938 				   *StopMiddle;
939 		pg_wchar	c;
940 
941 		if (prs->pgwstr)
942 			c = *(prs->pgwstr + prs->state->poschar);
943 		else
944 			c = (pg_wchar) *(prs->wstr + prs->state->poschar);
945 
946 		while (StopLow < StopHigh)
947 		{
948 			StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
949 			if (*StopMiddle == c)
950 				return 1;
951 			else if (*StopMiddle < c)
952 				StopLow = StopMiddle + 1;
953 			else
954 				StopHigh = StopMiddle;
955 		}
956 	}
957 
958 	return 0;
959 }
960 
961 /*
962  * Table of state/action of parser
963  */
964 
965 static const TParserStateActionItem actionTPS_Base[] = {
966 	{p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
967 	{p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
968 	{p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
969 	{p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
970 	{p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
971 	{p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
972 	{p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
973 	{p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
974 	{p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
975 	{p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
976 	{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
977 	{p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
978 	{NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
979 };
980 
981 
982 static const TParserStateActionItem actionTPS_InNumWord[] = {
983 	{p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
984 	{p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
985 	{p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
986 	{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
987 	{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
988 	{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
989 	{p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
990 	{NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
991 };
992 
/*
 * TPS_InAsciiWord: inside a word consisting only of ASCII letters.
 *
 * '.', '-' and p_isdigit each appear in two consecutive rows.  The first row
 * of each pair is an A_PUSH attempt (host name); TParserGet resumes scanning
 * at the row after pushedAtAction following a POP, so the second row of the
 * pair acts as the fallback (presumably pushedAtAction records the A_PUSH
 * row -- the A_PUSH code is outside this excerpt).
 *
 * At EOF or on an unmatched character the token is reported as ASCIIWORD.
 */
993 static const TParserStateActionItem actionTPS_InAsciiWord[] = {
994 	{p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
995 	{p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
996 	{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
997 	{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
998 	{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
999 	{p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1000 	{p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1001 	{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1002 	{p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
1003 	{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1004 	{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1005 	{p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1006 	{p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
1007 	{p_isspecial, 0, A_NEXT, TPS_InWord, 0, NULL},
1008 	{NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
1009 };
1010 
/*
 * TPS_InWord: inside a word of letters (reached from TPS_Base via p_isalpha
 * or from TPS_InAsciiWord).
 *
 * A digit switches to TPS_InNumWord, '-' pushes a hyphenated-word attempt;
 * at EOF or on an unmatched character the token is reported as WORD_T.
 */
1011 static const TParserStateActionItem actionTPS_InWord[] = {
1012 	{p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
1013 	{p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
1014 	{p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL},
1015 	{p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1016 	{p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1017 	{NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
1018 };
1019 
/*
 * TPS_InUnsignedInt: inside a run of digits with no leading sign.
 *
 * '.' has two rows: a host-name attempt first, then a decimal-fraction
 * attempt that is reached when scanning resumes after a POP.  'e'/'E' push
 * an exponent attempt; '-', '_', '@' and ASCII letters push host/email
 * attempts; other letters turn the token into a NUMWORD candidate.
 * At EOF or on an unmatched character the token is UNSIGNEDINT.
 */
1020 static const TParserStateActionItem actionTPS_InUnsignedInt[] = {
1021 	{p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
1022 	{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1023 	{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1024 	{p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
1025 	{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1026 	{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1027 	{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1028 	{p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1029 	{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1030 	{p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
1031 	{p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1032 	{p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1033 	{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1034 	{NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
1035 };
1036 
/*
 * TPS_InSignedIntFirst: character right after a '+' or '-' pushed from
 * TPS_Base.  Only a digit continues (to TPS_InSignedInt, with A_CLEAR);
 * EOF or anything else pops the pushed state.
 */
1037 static const TParserStateActionItem actionTPS_InSignedIntFirst[] = {
1038 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1039 	{p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
1040 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1041 };
1042 
/*
 * TPS_InSignedInt: digits of a signed integer.  '.' pushes a decimal
 * attempt and 'e'/'E' an exponent attempt; at EOF or on an unmatched
 * character the token is reported as SIGNEDINT.
 */
1043 static const TParserStateActionItem actionTPS_InSignedInt[] = {
1044 	{p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
1045 	{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1046 	{p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
1047 	{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1048 	{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1049 	{NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
1050 };
1051 
/*
 * TPS_InSpace: inside a run of characters reported as a SPACE token.
 *
 * The run is closed (A_BINGO, back to TPS_Base) at EOF and in front of '<',
 * '-', '+', '&' and '/'.  Note that p_isignore is tested before the '-',
 * '+', '&' and '/' rows, and p_isnotalnum only after them.  Any character
 * that matches none of the tests also closes the run.
 */
1052 static const TParserStateActionItem actionTPS_InSpace[] = {
1053 	{p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
1054 	{p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
1055 	{p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
1056 	{p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
1057 	{p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
1058 	{p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
1059 	{p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
1060 	{p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
1061 	{NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
1062 };
1063 
/*
 * TPS_InUDecimalFirst: character after the '.' pushed from
 * TPS_InUnsignedInt.  A digit moves to TPS_InUDecimal with A_CLEAR;
 * EOF or anything else pops.
 */
1064 static const TParserStateActionItem actionTPS_InUDecimalFirst[] = {
1065 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1066 	{p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
1067 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1068 };
1069 
/*
 * TPS_InUDecimal: fraction digits of an unsigned decimal number.  A further
 * '.' pushes a version-number attempt, 'e'/'E' an exponent attempt; at EOF
 * or on an unmatched character the token is reported as DECIMAL_T.
 */
1070 static const TParserStateActionItem actionTPS_InUDecimal[] = {
1071 	{p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1072 	{p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
1073 	{p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1074 	{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1075 	{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1076 	{NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1077 };
1078 
/*
 * TPS_InDecimalFirst: character after the '.' pushed from TPS_InSignedInt.
 * A digit moves to TPS_InDecimal with A_CLEAR; EOF or anything else pops.
 */
1079 static const TParserStateActionItem actionTPS_InDecimalFirst[] = {
1080 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1081 	{p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
1082 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1083 };
1084 
/*
 * TPS_InDecimal: fraction digits of a signed decimal number.  Unlike
 * TPS_InUDecimal, a further '.' pushes TPS_InVerVersion rather than
 * TPS_InVersionFirst.  At EOF or on an unmatched character the token is
 * reported as DECIMAL_T.
 */
1085 static const TParserStateActionItem actionTPS_InDecimal[] = {
1086 	{p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1087 	{p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
1088 	{p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
1089 	{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1090 	{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1091 	{NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1092 };
1093 
/*
 * TPS_InVerVersion: character after a second '.' in a signed decimal
 * (pushed from TPS_InDecimal).  On a digit the SpecialVerVersion handler is
 * invoked and the row is A_RERUN into TPS_InSVerVersion; EOF or anything
 * else pops.  (SpecialVerVersion is defined outside this excerpt.)
 */
1094 static const TParserStateActionItem actionTPS_InVerVersion[] = {
1095 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1096 	{p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
1097 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1098 };
1099 
/*
 * TPS_InSVerVersion: entered from TPS_InVerVersion.  Non-digit characters
 * are stepped over (default row, A_NEXT); on reaching a digit a SPACE token
 * is reported with A_CLRALL and parsing continues in TPS_InUnsignedInt.
 * EOF pops.
 */
1100 static const TParserStateActionItem actionTPS_InSVerVersion[] = {
1101 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1102 	{p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
1103 	{NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1104 };
1105 
1106 
/*
 * TPS_InVersionFirst: character after a '.' pushed from TPS_InUDecimal or
 * TPS_InVersion.  A digit moves to TPS_InVersion with A_CLEAR; EOF or
 * anything else pops.
 */
1107 static const TParserStateActionItem actionTPS_InVersionFirst[] = {
1108 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1109 	{p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
1110 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1111 };
1112 
/*
 * TPS_InVersion: digits of a dotted version number.  Each further '.'
 * pushes TPS_InVersionFirst again; at EOF or on an unmatched character the
 * token is reported as VERSIONNUMBER.
 */
1113 static const TParserStateActionItem actionTPS_InVersion[] = {
1114 	{p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
1115 	{p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
1116 	{p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1117 	{NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
1118 };
1119 
/*
 * TPS_InMantissaFirst: character after an 'e'/'E' pushed from one of the
 * numeric states.  A digit moves to TPS_InMantissa with A_CLEAR; '+' or
 * '-' moves to TPS_InMantissaSign; EOF or anything else pops.
 */
1120 static const TParserStateActionItem actionTPS_InMantissaFirst[] = {
1121 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1122 	{p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1123 	{p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
1124 	{p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
1125 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1126 };
1127 
/*
 * TPS_InMantissaSign: character after the exponent's sign.  A digit moves
 * to TPS_InMantissa with A_CLEAR; EOF or anything else pops.
 */
1128 static const TParserStateActionItem actionTPS_InMantissaSign[] = {
1129 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1130 	{p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1131 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1132 };
1133 
/*
 * TPS_InMantissa: exponent digits.  At EOF or on the first non-digit the
 * token is reported as SCIENTIFIC.
 */
1134 static const TParserStateActionItem actionTPS_InMantissa[] = {
1135 	{p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
1136 	{p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
1137 	{NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
1138 };
1139 
/*
 * TPS_InXMLEntityFirst: character after an '&' pushed from TPS_Base.
 * '#' starts a numeric entity; an ASCII letter, ':' or '_' starts a named
 * entity; EOF or anything else pops.
 */
1140 static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = {
1141 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1142 	{p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
1143 	{p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1144 	{p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1145 	{p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1146 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1147 };
1148 
/*
 * TPS_InXMLEntity: inside the name of a named entity.  p_isalnum
 * characters and ':', '_', '.', '-' extend the name; ';' moves to
 * TPS_InXMLEntityEnd; EOF or anything else pops.
 */
1149 static const TParserStateActionItem actionTPS_InXMLEntity[] = {
1150 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1151 	{p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1152 	{p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1153 	{p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1154 	{p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
1155 	{p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
1156 	{p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1157 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1158 };
1159 
/*
 * TPS_InXMLEntityNumFirst: character after "&#".  'x'/'X' selects the
 * hexadecimal form, a digit the decimal form; EOF or anything else pops.
 */
1160 static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = {
1161 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1162 	{p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1163 	{p_iseqC, 'X', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1164 	{p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1165 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1166 };
1167 
/*
 * TPS_InXMLEntityHexNumFirst: character after "&#x" / "&#X".  At least one
 * p_isxdigit character is required; EOF or anything else pops.
 */
1168 static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = {
1169 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1170 	{p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1171 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1172 };
1173 
/*
 * TPS_InXMLEntityNum: digits of a decimal numeric entity.  ';' moves to
 * TPS_InXMLEntityEnd; EOF or anything else pops.
 */
1174 static const TParserStateActionItem actionTPS_InXMLEntityNum[] = {
1175 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1176 	{p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1177 	{p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1178 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1179 };
1180 
/*
 * TPS_InXMLEntityHexNum: digits of a hexadecimal numeric entity.  ';' moves
 * to TPS_InXMLEntityEnd; EOF or anything else pops.
 */
1181 static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = {
1182 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1183 	{p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1184 	{p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1185 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1186 };
1187 
/*
 * TPS_InXMLEntityEnd: reached after the closing ';'.  The single default
 * row unconditionally reports an XMLENTITY token (with A_CLEAR) and returns
 * to TPS_Base.
 */
1188 static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = {
1189 	{NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
1190 };
1191 
/*
 * TPS_InTagFirst: character after a '<' pushed from TPS_Base.  '/' leads to
 * a closing tag, '!' to a comment or DOCTYPE, '?' to an XML declaration;
 * an ASCII letter, ':' or '_' starts a tag name.  EOF or anything else
 * pops.
 */
1192 static const TParserStateActionItem actionTPS_InTagFirst[] = {
1193 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1194 	{p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
1195 	{p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
1196 	{p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
1197 	{p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
1198 	{p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
1199 	{p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
1200 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1201 };
1202 
/*
 * TPS_InXMLBegin: character after "<?".  Only 'x' is checked before
 * switching to the generic TPS_InTag state; EOF or anything else pops.
 */
1203 static const TParserStateActionItem actionTPS_InXMLBegin[] = {
1204 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1205 	/* <?xml ... */
1206 	/* XXX do we want states for the 'm' and 'l'?  Right now this accepts <?xZ */
1207 	{p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
1208 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1209 };
1210 
/*
 * TPS_InTagCloseFirst: character after "</".  Only an ASCII letter starts
 * the tag name (':' and '_' are not accepted here, unlike in
 * TPS_InTagFirst); EOF or anything else pops.
 */
1211 static const TParserStateActionItem actionTPS_InTagCloseFirst[] = {
1212 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1213 	{p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
1214 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1215 };
1216 
/*
 * TPS_InTagName: inside a tag's name.  p_isalnum characters and ':', '_',
 * '.', '-' extend the name.  '>' ends the tag and whitespace moves on to
 * the attribute area (TPS_InTag); both of those rows also invoke the
 * SpecialTags handler (defined outside this excerpt).  EOF or anything
 * else pops.
 */
1217 static const TParserStateActionItem actionTPS_InTagName[] = {
1218 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1219 	/* <br/> case */
1220 	{p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
1221 	{p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1222 	{p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
1223 	{p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
1224 	{p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1225 	{p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1226 	{p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1227 	{p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1228 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1229 };
1230 
/*
 * TPS_InTagBeginEnd: character after the '/' of a self-closing tag such as
 * <br/>.  Only '>' is accepted; EOF or anything else pops.
 */
1231 static const TParserStateActionItem actionTPS_InTagBeginEnd[] = {
1232 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1233 	{p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
1234 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1235 };
1236 
/*
 * TPS_InTag: inside a tag, after its name (attribute area).
 *
 * '>' ends the tag; a single or double quote enters the matching quoted
 * string state.  Outside quotes only ASCII letters, digits, whitespace and
 * the punctuation listed below are accepted; EOF or any other character
 * pops, i.e. the text is not treated as a tag.  The '>' and whitespace
 * rows invoke the SpecialTags handler.
 */
1237 static const TParserStateActionItem actionTPS_InTag[] = {
1238 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1239 	{p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1240 	{p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
1241 	{p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
1242 	{p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
1243 	{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1244 	{p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
1245 	{p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1246 	{p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1247 	{p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
1248 	{p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
1249 	{p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1250 	{p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1251 	{p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
1252 	{p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
1253 	{p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
1254 	{p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
1255 	{p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
1256 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1257 };
1258 
/*
 * TPS_InTagEscapeK: inside a single-quoted string within a tag.  A
 * backslash pushes TPS_InTagBackSleshed, the closing quote returns to
 * TPS_InTag, every other character is consumed.  EOF pops.
 */
1259 static const TParserStateActionItem actionTPS_InTagEscapeK[] = {
1260 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1261 	{p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1262 	{p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
1263 	{NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
1264 };
1265 
/*
 * TPS_InTagEscapeKK: inside a double-quoted string within a tag.  Same
 * shape as TPS_InTagEscapeK, with '"' as the terminator.
 */
1266 static const TParserStateActionItem actionTPS_InTagEscapeKK[] = {
1267 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1268 	{p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1269 	{p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
1270 	{NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
1271 };
1272 
/*
 * TPS_InTagBackSleshed: character following a backslash inside a quoted
 * string in a tag.  Any character is accepted with A_MERGE (presumably
 * folding this pushed state back into the quoted-string state it came
 * from -- the A_MERGE code is outside this excerpt); EOF pops.
 */
1273 static const TParserStateActionItem actionTPS_InTagBackSleshed[] = {
1274 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1275 	{NULL, 0, A_MERGE, TPS_Null, 0, NULL}
1276 };
1277 
/*
 * TPS_InTagEnd: reached after the closing '>'.  The single default row
 * unconditionally reports a TAG_T token with A_CLRALL and returns to
 * TPS_Base.
 */
1278 static const TParserStateActionItem actionTPS_InTagEnd[] = {
1279 	{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1280 };
1281 
/*
 * TPS_InCommentFirst: character after "<!".  '-' begins a comment opener;
 * 'D' or 'd' is taken as the start of a DOCTYPE declaration and handled as
 * a generic tag.  EOF or anything else pops.
 */
1282 static const TParserStateActionItem actionTPS_InCommentFirst[] = {
1283 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1284 	{p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
1285 	/* <!DOCTYPE ...> */
1286 	{p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
1287 	{p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
1288 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1289 };
1290 
/*
 * TPS_InCommentLast: character after "<!-".  A second '-' completes the
 * comment opener; EOF or anything else pops.
 */
1291 static const TParserStateActionItem actionTPS_InCommentLast[] = {
1292 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1293 	{p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
1294 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1295 };
1296 
/*
 * TPS_InComment: inside the body of a comment.  Every character is
 * consumed; a '-' additionally starts looking for the "-->" terminator.
 * EOF pops (an unterminated comment is not reported as a tag).
 */
1297 static const TParserStateActionItem actionTPS_InComment[] = {
1298 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1299 	{p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
1300 	{NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1301 };
1302 
/*
 * TPS_InCloseCommentFirst: one '-' seen inside a comment.  A second '-'
 * advances toward the terminator; any other character returns to the
 * comment body.  EOF pops.
 */
1303 static const TParserStateActionItem actionTPS_InCloseCommentFirst[] = {
1304 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1305 	{p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
1306 	{NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1307 };
1308 
/*
 * TPS_InCloseCommentLast: "--" seen inside a comment.  Further '-'
 * characters keep us here, '>' completes the terminator, any other
 * character returns to the comment body.  EOF pops.
 */
1309 static const TParserStateActionItem actionTPS_InCloseCommentLast[] = {
1310 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1311 	{p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1312 	{p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
1313 	{NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1314 };
1315 
/*
 * TPS_InCommentEnd: reached after "-->".  The single default row
 * unconditionally reports the comment as a TAG_T token with A_CLRALL and
 * returns to TPS_Base.
 */
1316 static const TParserStateActionItem actionTPS_InCommentEnd[] = {
1317 	{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1318 };
1319 
/*
 * TPS_InHostFirstDomain: character after a '.' in a host-name attempt.
 * An ASCII letter moves to TPS_InHostDomainSecond, a digit back to
 * TPS_InHost; EOF or anything else pops.
 */
1320 static const TParserStateActionItem actionTPS_InHostFirstDomain[] = {
1321 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1322 	{p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
1323 	{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1324 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1325 };
1326 
/*
 * TPS_InHostDomainSecond: second character of a domain label that began
 * with an ASCII letter.  A second ASCII letter moves to TPS_InHostDomain,
 * the only host state that can report a HOST token; note that EOF here
 * pops rather than reporting one.  Digits, '-', '_', '.' and '@' push
 * further host/email attempts.
 */
1327 static const TParserStateActionItem actionTPS_InHostDomainSecond[] = {
1328 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1329 	{p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1330 	{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1331 	{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1332 	{p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1333 	{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1334 	{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1335 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1336 };
1337 
/*
 * TPS_InHostDomain: inside a domain label of at least two ASCII letters.
 *
 * At EOF or on an unmatched character the token is reported as HOST.
 * p_isdigit appears twice: the first row pushes a TPS_InHost attempt; the
 * second (A_POP) can only be reached when scanning resumes after that
 * attempt was popped, and then abandons the host interpretation.
 * p_isstophost reports HOST and continues in TPS_InURLPathStart; '/'
 * pushes a full-URL attempt (TPS_InFURL).
 */
1338 static const TParserStateActionItem actionTPS_InHostDomain[] = {
1339 	{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1340 	{p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1341 	{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1342 	{p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
1343 	{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1344 	{p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1345 	{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1346 	{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1347 	{p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
1348 	{p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1349 	{p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1350 	{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1351 };
1352 
/*
 * TPS_InPortFirst: character after the ':' pushed from TPS_InHostDomain.
 * At least one digit is required; EOF or anything else pops.
 */
1353 static const TParserStateActionItem actionTPS_InPortFirst[] = {
1354 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1355 	{p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1356 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1357 };
1358 
/*
 * TPS_InPort: digits of a port number following a host name.  Ends the
 * same way as TPS_InHostDomain: HOST at EOF or on an unmatched character,
 * HOST followed by TPS_InURLPathStart on p_isstophost, and a full-URL
 * attempt on '/'.
 */
1359 static const TParserStateActionItem actionTPS_InPort[] = {
1360 	{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1361 	{p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1362 	{p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1363 	{p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1364 	{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1365 };
1366 
/*
 * TPS_InHostFirstAN: character after a '-' or '_' in a host-name attempt.
 * It must be a digit or an ASCII letter; EOF or anything else pops.
 */
1367 static const TParserStateActionItem actionTPS_InHostFirstAN[] = {
1368 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1369 	{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1370 	{p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1371 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1372 };
1373 
/*
 * TPS_InHost: inside a host-name label of ASCII letters and digits.  This
 * state never reports a token itself: '@', '.', '-' and '_' push further
 * attempts, and EOF or any other character pops.
 */
1374 static const TParserStateActionItem actionTPS_InHost[] = {
1375 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1376 	{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1377 	{p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1378 	{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1379 	{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1380 	{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1381 	{p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1382 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1383 };
1384 
/*
 * TPS_InEmail: entered after an '@'.  p_isstophost pops; otherwise, if
 * p_ishost succeeds, the token is reported as EMAIL with A_CLRALL;
 * anything else pops.  There is no explicit p_isEOF row here; EOF falls
 * through these tests.  (p_ishost is defined outside this excerpt;
 * presumably it recognizes a whole host name at the current position.)
 */
1385 static const TParserStateActionItem actionTPS_InEmail[] = {
1386 	{p_isstophost, 0, A_POP, TPS_Null, 0, NULL},
1387 	{p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
1388 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1389 };
1390 
/*
 * TPS_InFileFirst: first character of a path component, i.e. right after
 * a '/'.  ASCII letters, digits and '_' start a file name, '.' leads to
 * TPS_InPathFirst, '~' pushes TPS_InFileTwiddle; EOF or anything else
 * pops.
 */
1391 static const TParserStateActionItem actionTPS_InFileFirst[] = {
1392 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1393 	{p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1394 	{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1395 	{p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
1396 	{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1397 	{p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1398 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1399 };
1400 
/*
 * TPS_InFileTwiddle: character after a '~'.  ASCII letters, digits and
 * '_' start a file name; '/' starts a new path component; EOF or anything
 * else pops.
 */
1401 static const TParserStateActionItem actionTPS_InFileTwiddle[] = {
1402 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1403 	{p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1404 	{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1405 	{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1406 	{p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1407 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1408 };
1409 
/*
 * TPS_InPathFirst: character after a '.' that begins a path component
 * (reached from TPS_InFileFirst).  ASCII letters, digits and '_' continue
 * as a file name, a second '.' moves to TPS_InPathSecond, '/' starts the
 * next component; EOF or anything else pops.
 */
1410 static const TParserStateActionItem actionTPS_InPathFirst[] = {
1411 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1412 	{p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1413 	{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1414 	{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1415 	{p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1416 	{p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1417 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1418 };
1419 
/*
 * TPS_InPathFirstFirst: character after a '.' pushed from TPS_Base, i.e. a
 * token that starts with '.'.  Only a second '.' or a '/' continues the
 * path; EOF or anything else pops.
 */
1420 static const TParserStateActionItem actionTPS_InPathFirstFirst[] = {
1421 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1422 	{p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1423 	{p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1424 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1425 };
1426 
/*
 * TPS_InPathSecond: position after "..".
 *
 * '/' has two rows: the first (A_NEXT | A_PUSH) tries to continue with
 * another path component; the second, reached when scanning resumes after
 * that attempt was popped, reports the text so far as FILEPATH.  EOF and
 * whitespace also report FILEPATH; anything else pops.
 */
1427 static const TParserStateActionItem actionTPS_InPathSecond[] = {
1428 	{p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1429 	{p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
1430 	{p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1431 	{p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1432 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1433 };
1434 
/*
 * TPS_InFile: inside a file-name component.  ASCII letters, digits, '_'
 * and '-' extend it; '.' pushes TPS_InFileNext and '/' pushes
 * TPS_InFileFirst for the next component.  At EOF or on an unmatched
 * character the token is reported as FILEPATH.
 */
1435 static const TParserStateActionItem actionTPS_InFile[] = {
1436 	{p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
1437 	{p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1438 	{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1439 	{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1440 	{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1441 	{p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
1442 	{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1443 	{NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
1444 };
1445 
/*
 * TPS_InFileNext: character after a '.' inside a word or file name.  An
 * ASCII letter, digit or '_' moves to TPS_InFile with A_CLEAR; EOF or
 * anything else pops.
 */
1446 static const TParserStateActionItem actionTPS_InFileNext[] = {
1447 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1448 	{p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
1449 	{p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
1450 	{p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
1451 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1452 };
1453 
/*
 * TPS_InURLPathFirst: first character of a URL path.  At least one
 * p_isurlchar character is required; EOF or anything else pops.  No table
 * in this excerpt names this state as a target; presumably it is entered
 * from code elsewhere in the file.
 */
1454 static const TParserStateActionItem actionTPS_InURLPathFirst[] = {
1455 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1456 	{p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1457 	{NULL, 0, A_POP, TPS_Null, 0, NULL},
1458 };
1459 
/*
 * TPS_InURLPathStart: entered right after a HOST token was reported on a
 * p_isstophost character (see TPS_InHostDomain and TPS_InPort).  The single
 * default row accepts that character unconditionally and continues in
 * TPS_InURLPath.
 */
1460 static const TParserStateActionItem actionTPS_InURLPathStart[] = {
1461 	{NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
1462 };
1463 
/*
 * TPS_InURLPath: inside a URL path.  p_isurlchar characters extend it; at
 * EOF or on the first other character the token is reported as URLPATH.
 */
1464 static const TParserStateActionItem actionTPS_InURLPath[] = {
1465 	{p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
1466 	{p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1467 	{NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
1468 };
1469 
/*
 * TPS_InFURL: entered on the '/' that follows a host name or port.  If
 * p_isURLPath succeeds, the SpecialFURL handler is invoked and a URL_T
 * token is reported with A_CLRALL; EOF or a failed test pops.
 * (p_isURLPath and SpecialFURL are defined outside this excerpt.)
 */
1470 static const TParserStateActionItem actionTPS_InFURL[] = {
1471 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1472 	{p_isURLPath, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL},
1473 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1474 };
1475 
/*
 * TPS_InProtocolFirst: character after the ':' pushed from
 * TPS_InAsciiWord.  A '/' is required; EOF or anything else pops.
 */
1476 static const TParserStateActionItem actionTPS_InProtocolFirst[] = {
1477 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1478 	{p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
1479 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1480 };
1481 
/*
 * TPS_InProtocolSecond: character after ":/".  A second '/' is required;
 * EOF or anything else pops.
 */
1482 static const TParserStateActionItem actionTPS_InProtocolSecond[] = {
1483 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1484 	{p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
1485 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1486 };
1487 
/*
 * TPS_InProtocolEnd: reached after "://".  The single default row
 * unconditionally reports a PROTOCOL token with A_CLRALL and returns to
 * TPS_Base.
 */
1488 static const TParserStateActionItem actionTPS_InProtocolEnd[] = {
1489 	{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
1490 };
1491 
/*
 * TPS_InHyphenAsciiWordFirst: character after a '-' that follows an
 * all-ASCII word part.  The character class chooses which hyphenated-word
 * state continues (ASCII, general, or digit lookahead); EOF or anything
 * else pops.
 */
1492 static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
1493 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1494 	{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1495 	{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1496 	{p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1497 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1498 };
1499 
/*
 * TPS_InHyphenAsciiWord: inside a hyphenated word whose parts so far are
 * all ASCII letters.  Non-ASCII letters or p_isspecial characters switch
 * to TPS_InHyphenWord, digits to TPS_InHyphenNumWord; another '-' pushes
 * the next part.  At EOF or on an unmatched character the SpecialHyphen
 * handler is invoked, the whole word is reported as ASCIIHWORD with
 * A_CLRALL, and parsing continues in TPS_InParseHyphen.
 */
1500 static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
1501 	{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
1502 	{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1503 	{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1504 	{p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1505 	{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1506 	{p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1507 	{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
1508 };
1509 
/*
 * TPS_InHyphenWordFirst: character after a '-' in a hyphenated word of
 * letters.  A letter continues in TPS_InHyphenWord, a digit in
 * TPS_InHyphenDigitLookahead; EOF or anything else pops.
 */
1510 static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
1511 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1512 	{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1513 	{p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1514 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1515 };
1516 
/*
 * TPS_InHyphenWord: inside a hyphenated word of letters.  Digits switch to
 * TPS_InHyphenNumWord; another '-' pushes the next part.  At EOF or on an
 * unmatched character the SpecialHyphen handler is invoked, the whole word
 * is reported as HWORD with A_CLRALL, and parsing continues in
 * TPS_InParseHyphen.
 */
1517 static const TParserStateActionItem actionTPS_InHyphenWord[] = {
1518 	{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
1519 	{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1520 	{p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1521 	{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1522 	{p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1523 	{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
1524 };
1525 
/*
 * TPS_InHyphenNumWordFirst: character after a '-' in a hyphenated word
 * that contains digits.  A letter continues in TPS_InHyphenNumWord, a
 * digit in TPS_InHyphenDigitLookahead; EOF or anything else pops.
 */
1526 static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
1527 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1528 	{p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1529 	{p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1530 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1531 };
1532 
/*
 * TPS_InHyphenNumWord: inside a hyphenated word of letters and digits.
 * Another '-' pushes the next part.  At EOF or on an unmatched character
 * the SpecialHyphen handler is invoked, the whole word is reported as
 * NUMHWORD with A_CLRALL, and parsing continues in TPS_InParseHyphen.
 */
1533 static const TParserStateActionItem actionTPS_InHyphenNumWord[] = {
1534 	{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
1535 	{p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1536 	{p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1537 	{p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1538 	{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
1539 };
1540 
/*
 * TPS_InHyphenDigitLookahead: a part after '-' that has consisted only of
 * digits so far.  Further digits keep us here; a letter or p_isspecial
 * character turns the part into a TPS_InHyphenNumWord part.  EOF or
 * anything else pops, so a part made purely of digits is not accepted.
 */
1541 static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = {
1542 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1543 	{p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1544 	{p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1545 	{p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1546 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1547 };
1548 
/*
 * TPS_InParseHyphen: entered from the A_BINGO rows of the TPS_InHyphen*Word
 * tables, after the whole hyphenated word has been reported.  The class of
 * the next character selects the kind of part token to scan; '-' pushes
 * TPS_InParseHyphenHyphen.  At EOF or on any other character the row is
 * A_RERUN back to TPS_Base.
 */
1549 static const TParserStateActionItem actionTPS_InParseHyphen[] = {
1550 	{p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
1551 	{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1552 	{p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1553 	{p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
1554 	{p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
1555 	{NULL, 0, A_RERUN, TPS_Base, 0, NULL}
1556 };
1557 
/*
 * TPS_InParseHyphenHyphen: character after a '-' between parts of a
 * hyphenated word.  If a p_isalnum or p_isspecial character follows, the
 * '-' is reported as a SPACE token (with A_CLEAR) and part scanning resumes
 * in TPS_InParseHyphen; EOF or anything else pops.
 */
1558 static const TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
1559 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1560 	{p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1561 	{p_isspecial, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1562 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1563 };
1564 
/*
 * TPS_InHyphenWordPart: inside one part (letters) of a hyphenated word.  A
 * digit switches to TPS_InHyphenNumWordPart.  The part is reported as
 * PARTHWORD: at EOF parsing returns to TPS_Base, otherwise it continues in
 * TPS_InParseHyphen for the next part.
 */
1565 static const TParserStateActionItem actionTPS_InHyphenWordPart[] = {
1566 	{p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
1567 	{p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1568 	{p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1569 	{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1570 	{NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
1571 };
1572 
/*
 * TPS_InHyphenAsciiWordPart: inside one all-ASCII part of a hyphenated
 * word.  Non-ASCII letters or p_isspecial characters switch to
 * TPS_InHyphenWordPart, digits to TPS_InHyphenNumWordPart.  The part is
 * reported as ASCIIPARTHWORD: at EOF parsing returns to TPS_Base, otherwise
 * it continues in TPS_InParseHyphen.
 */
1573 static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
1574 	{p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
1575 	{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1576 	{p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1577 	{p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1578 	{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1579 	{NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
1580 };
1581 
/*
 * TPS_InHyphenNumWordPart: inside one part (letters and digits) of a
 * hyphenated word.  The part is reported as NUMPARTHWORD: at EOF parsing
 * returns to TPS_Base, otherwise it continues in TPS_InParseHyphen.
 */
1582 static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
1583 	{p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
1584 	{p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1585 	{p_isspecial, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1586 	{NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
1587 };
1588 
/*
 * TPS_InHyphenUnsignedInt: a part of a hyphenated word that starts with
 * digits (pushed from TPS_InParseHyphen).  Once a letter or p_isspecial
 * character shows up the part continues as TPS_InHyphenNumWordPart (with
 * A_CLEAR); EOF or anything else pops, so a digits-only part is not
 * reported from here.
 */
1589 static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
1590 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1591 	{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1592 	{p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1593 	{p_isspecial, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1594 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1595 };
1596 
1597 
1598 /*
1599  * main table of per-state parser actions
1600  */
/*
 * One entry of Actions[]: ties a parser state to its array of action
 * items.  TParserGet indexes Actions[] by the current state, starts
 * scanning at .action, and asserts that .state equals the index used.
 */
1601 typedef struct
1602 {
1603 	const TParserStateActionItem *action;	/* the actual state info */
1604 	TParserState state;			/* only for Assert crosscheck */
1605 #ifdef WPARSER_TRACE
1606 	const char *state_name;		/* only for debug printout */
1607 #endif
1608 } TParserStateAction;
1609 
/*
 * Build one TParserStateAction initializer from a state identifier: the
 * table name is formed by joining "action" with the state name (e.g.
 * TPS_Base -> actionTPS_Base), and when WPARSER_TRACE is defined the state
 * name is also stored as a string.  (CppConcat and CppAsString are defined
 * outside this file excerpt; presumably they are the token-pasting and
 * stringifying helper macros.)
 */
1610 #ifdef WPARSER_TRACE
1611 #define TPARSERSTATEACTION(state) \
1612 	{ CppConcat(action,state), state, CppAsString(state) }
1613 #else
1614 #define TPARSERSTATEACTION(state) \
1615 	{ CppConcat(action,state), state }
1616 #endif
1617 
1618 /*
1619  * order must be the same as in typedef enum {} TParserState!!
1620  */
1621 
/*
 * Master table mapping each parser state to its action-item array.
 * TParserGet looks entries up as Actions[prs->state->state], so entry i
 * must describe the state whose numeric value is i; an Assert in TParserGet
 * cross-checks this through the .state field.  There is no entry for
 * TPS_Null: TParserGet asserts that the current state is below it.
 */
1622 static const TParserStateAction Actions[] = {
1623 	TPARSERSTATEACTION(TPS_Base),
1624 	TPARSERSTATEACTION(TPS_InNumWord),
1625 	TPARSERSTATEACTION(TPS_InAsciiWord),
1626 	TPARSERSTATEACTION(TPS_InWord),
1627 	TPARSERSTATEACTION(TPS_InUnsignedInt),
1628 	TPARSERSTATEACTION(TPS_InSignedIntFirst),
1629 	TPARSERSTATEACTION(TPS_InSignedInt),
1630 	TPARSERSTATEACTION(TPS_InSpace),
1631 	TPARSERSTATEACTION(TPS_InUDecimalFirst),
1632 	TPARSERSTATEACTION(TPS_InUDecimal),
1633 	TPARSERSTATEACTION(TPS_InDecimalFirst),
1634 	TPARSERSTATEACTION(TPS_InDecimal),
1635 	TPARSERSTATEACTION(TPS_InVerVersion),
1636 	TPARSERSTATEACTION(TPS_InSVerVersion),
1637 	TPARSERSTATEACTION(TPS_InVersionFirst),
1638 	TPARSERSTATEACTION(TPS_InVersion),
1639 	TPARSERSTATEACTION(TPS_InMantissaFirst),
1640 	TPARSERSTATEACTION(TPS_InMantissaSign),
1641 	TPARSERSTATEACTION(TPS_InMantissa),
1642 	TPARSERSTATEACTION(TPS_InXMLEntityFirst),
1643 	TPARSERSTATEACTION(TPS_InXMLEntity),
1644 	TPARSERSTATEACTION(TPS_InXMLEntityNumFirst),
1645 	TPARSERSTATEACTION(TPS_InXMLEntityNum),
1646 	TPARSERSTATEACTION(TPS_InXMLEntityHexNumFirst),
1647 	TPARSERSTATEACTION(TPS_InXMLEntityHexNum),
1648 	TPARSERSTATEACTION(TPS_InXMLEntityEnd),
1649 	TPARSERSTATEACTION(TPS_InTagFirst),
1650 	TPARSERSTATEACTION(TPS_InXMLBegin),
1651 	TPARSERSTATEACTION(TPS_InTagCloseFirst),
1652 	TPARSERSTATEACTION(TPS_InTagName),
1653 	TPARSERSTATEACTION(TPS_InTagBeginEnd),
1654 	TPARSERSTATEACTION(TPS_InTag),
1655 	TPARSERSTATEACTION(TPS_InTagEscapeK),
1656 	TPARSERSTATEACTION(TPS_InTagEscapeKK),
1657 	TPARSERSTATEACTION(TPS_InTagBackSleshed),
1658 	TPARSERSTATEACTION(TPS_InTagEnd),
1659 	TPARSERSTATEACTION(TPS_InCommentFirst),
1660 	TPARSERSTATEACTION(TPS_InCommentLast),
1661 	TPARSERSTATEACTION(TPS_InComment),
1662 	TPARSERSTATEACTION(TPS_InCloseCommentFirst),
1663 	TPARSERSTATEACTION(TPS_InCloseCommentLast),
1664 	TPARSERSTATEACTION(TPS_InCommentEnd),
1665 	TPARSERSTATEACTION(TPS_InHostFirstDomain),
1666 	TPARSERSTATEACTION(TPS_InHostDomainSecond),
1667 	TPARSERSTATEACTION(TPS_InHostDomain),
1668 	TPARSERSTATEACTION(TPS_InPortFirst),
1669 	TPARSERSTATEACTION(TPS_InPort),
1670 	TPARSERSTATEACTION(TPS_InHostFirstAN),
1671 	TPARSERSTATEACTION(TPS_InHost),
1672 	TPARSERSTATEACTION(TPS_InEmail),
1673 	TPARSERSTATEACTION(TPS_InFileFirst),
1674 	TPARSERSTATEACTION(TPS_InFileTwiddle),
1675 	TPARSERSTATEACTION(TPS_InPathFirst),
1676 	TPARSERSTATEACTION(TPS_InPathFirstFirst),
1677 	TPARSERSTATEACTION(TPS_InPathSecond),
1678 	TPARSERSTATEACTION(TPS_InFile),
1679 	TPARSERSTATEACTION(TPS_InFileNext),
1680 	TPARSERSTATEACTION(TPS_InURLPathFirst),
1681 	TPARSERSTATEACTION(TPS_InURLPathStart),
1682 	TPARSERSTATEACTION(TPS_InURLPath),
1683 	TPARSERSTATEACTION(TPS_InFURL),
1684 	TPARSERSTATEACTION(TPS_InProtocolFirst),
1685 	TPARSERSTATEACTION(TPS_InProtocolSecond),
1686 	TPARSERSTATEACTION(TPS_InProtocolEnd),
1687 	TPARSERSTATEACTION(TPS_InHyphenAsciiWordFirst),
1688 	TPARSERSTATEACTION(TPS_InHyphenAsciiWord),
1689 	TPARSERSTATEACTION(TPS_InHyphenWordFirst),
1690 	TPARSERSTATEACTION(TPS_InHyphenWord),
1691 	TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
1692 	TPARSERSTATEACTION(TPS_InHyphenNumWord),
1693 	TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
1694 	TPARSERSTATEACTION(TPS_InParseHyphen),
1695 	TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
1696 	TPARSERSTATEACTION(TPS_InHyphenWordPart),
1697 	TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
1698 	TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
1699 	TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
1700 };
1701 
1702 
1703 static bool
TParserGet(TParser * prs)1704 TParserGet(TParser *prs)
1705 {
1706 	const TParserStateActionItem *item = NULL;
1707 
1708 	Assert(prs->state);
1709 
1710 	if (prs->state->posbyte >= prs->lenstr)
1711 		return false;
1712 
1713 	prs->token = prs->str + prs->state->posbyte;
1714 	prs->state->pushedAtAction = NULL;
1715 
1716 	/* look at string */
1717 	while (prs->state->posbyte <= prs->lenstr)
1718 	{
1719 		if (prs->state->posbyte == prs->lenstr)
1720 			prs->state->charlen = 0;
1721 		else
1722 			prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
1723 				pg_mblen(prs->str + prs->state->posbyte);
1724 
1725 		Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
1726 		Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
1727 		Assert(Actions[prs->state->state].state == prs->state->state);
1728 
1729 		if (prs->state->pushedAtAction)
1730 		{
1731 			/* After a POP, pick up at the next test */
1732 			item = prs->state->pushedAtAction + 1;
1733 			prs->state->pushedAtAction = NULL;
1734 		}
1735 		else
1736 		{
1737 			item = Actions[prs->state->state].action;
1738 			Assert(item != NULL);
1739 		}
1740 
1741 		/* find action by character class */
1742 		while (item->isclass)
1743 		{
1744 			prs->c = item->c;
1745 			if (item->isclass(prs) != 0)
1746 				break;
1747 			item++;
1748 		}
1749 
1750 #ifdef WPARSER_TRACE
1751 		{
1752 			TParserPosition *ptr;
1753 
1754 			fprintf(stderr, "state ");
1755 			/* indent according to stack depth */
1756 			for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
1757 				fprintf(stderr, "  ");
1758 			fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
1759 			if (prs->state->posbyte < prs->lenstr)
1760 				fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
1761 			else
1762 				fprintf(stderr, "at EOF");
1763 			fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
1764 					(int) (item - Actions[prs->state->state].action),
1765 					(item->flags & A_BINGO) ? " BINGO" : "",
1766 					(item->flags & A_POP) ? " POP" : "",
1767 					(item->flags & A_PUSH) ? " PUSH" : "",
1768 					(item->flags & A_RERUN) ? " RERUN" : "",
1769 					(item->flags & A_CLEAR) ? " CLEAR" : "",
1770 					(item->flags & A_MERGE) ? " MERGE" : "",
1771 					(item->flags & A_CLRALL) ? " CLRALL" : "",
1772 					(item->tostate != TPS_Null) ? " tostate " : "",
1773 					(item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
1774 					(item->type > 0) ? " type " : "",
1775 					tok_alias[item->type]);
1776 		}
1777 #endif
1778 
1779 		/* call special handler if exists */
1780 		if (item->special)
1781 			item->special(prs);
1782 
1783 		/* BINGO, token is found */
1784 		if (item->flags & A_BINGO)
1785 		{
1786 			Assert(item->type > 0);
1787 			prs->lenbytetoken = prs->state->lenbytetoken;
1788 			prs->lenchartoken = prs->state->lenchartoken;
1789 			prs->state->lenbytetoken = prs->state->lenchartoken = 0;
1790 			prs->type = item->type;
1791 		}
1792 
1793 		/* do various actions by flags */
1794 		if (item->flags & A_POP)
1795 		{						/* pop stored state in stack */
1796 			TParserPosition *ptr = prs->state->prev;
1797 
1798 			pfree(prs->state);
1799 			prs->state = ptr;
1800 			Assert(prs->state);
1801 		}
1802 		else if (item->flags & A_PUSH)
1803 		{						/* push (store) state in stack */
1804 			prs->state->pushedAtAction = item;	/* remember where we push */
1805 			prs->state = newTParserPosition(prs->state);
1806 		}
1807 		else if (item->flags & A_CLEAR)
1808 		{						/* clear previous pushed state */
1809 			TParserPosition *ptr;
1810 
1811 			Assert(prs->state->prev);
1812 			ptr = prs->state->prev->prev;
1813 			pfree(prs->state->prev);
1814 			prs->state->prev = ptr;
1815 		}
1816 		else if (item->flags & A_CLRALL)
1817 		{						/* clear all previous pushed state */
1818 			TParserPosition *ptr;
1819 
1820 			while (prs->state->prev)
1821 			{
1822 				ptr = prs->state->prev->prev;
1823 				pfree(prs->state->prev);
1824 				prs->state->prev = ptr;
1825 			}
1826 		}
1827 		else if (item->flags & A_MERGE)
1828 		{						/* merge posinfo with current and pushed state */
1829 			TParserPosition *ptr = prs->state;
1830 
1831 			Assert(prs->state->prev);
1832 			prs->state = prs->state->prev;
1833 
1834 			prs->state->posbyte = ptr->posbyte;
1835 			prs->state->poschar = ptr->poschar;
1836 			prs->state->charlen = ptr->charlen;
1837 			prs->state->lenbytetoken = ptr->lenbytetoken;
1838 			prs->state->lenchartoken = ptr->lenchartoken;
1839 			pfree(ptr);
1840 		}
1841 
1842 		/* set new state if pointed */
1843 		if (item->tostate != TPS_Null)
1844 			prs->state->state = item->tostate;
1845 
1846 		/* check for go away */
1847 		if ((item->flags & A_BINGO) ||
1848 			(prs->state->posbyte >= prs->lenstr &&
1849 			 (item->flags & A_RERUN) == 0))
1850 			break;
1851 
1852 		/* go to beginning of loop if we should rerun or we just restore state */
1853 		if (item->flags & (A_RERUN | A_POP))
1854 			continue;
1855 
1856 		/* move forward */
1857 		if (prs->state->charlen)
1858 		{
1859 			prs->state->posbyte += prs->state->charlen;
1860 			prs->state->lenbytetoken += prs->state->charlen;
1861 			prs->state->poschar++;
1862 			prs->state->lenchartoken++;
1863 		}
1864 	}
1865 
1866 	return (item && (item->flags & A_BINGO)) ? true : false;
1867 }
1868 
1869 Datum
prsd_lextype(PG_FUNCTION_ARGS)1870 prsd_lextype(PG_FUNCTION_ARGS)
1871 {
1872 	LexDescr   *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
1873 	int			i;
1874 
1875 	for (i = 1; i <= LASTNUM; i++)
1876 	{
1877 		descr[i - 1].lexid = i;
1878 		descr[i - 1].alias = pstrdup(tok_alias[i]);
1879 		descr[i - 1].descr = pstrdup(lex_descr[i]);
1880 	}
1881 
1882 	descr[LASTNUM].lexid = 0;
1883 
1884 	PG_RETURN_POINTER(descr);
1885 }
1886 
1887 Datum
prsd_start(PG_FUNCTION_ARGS)1888 prsd_start(PG_FUNCTION_ARGS)
1889 {
1890 	PG_RETURN_POINTER(TParserInit((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
1891 }
1892 
1893 Datum
prsd_nexttoken(PG_FUNCTION_ARGS)1894 prsd_nexttoken(PG_FUNCTION_ARGS)
1895 {
1896 	TParser    *p = (TParser *) PG_GETARG_POINTER(0);
1897 	char	  **t = (char **) PG_GETARG_POINTER(1);
1898 	int		   *tlen = (int *) PG_GETARG_POINTER(2);
1899 
1900 	if (!TParserGet(p))
1901 		PG_RETURN_INT32(0);
1902 
1903 	*t = p->token;
1904 	*tlen = p->lenbytetoken;
1905 
1906 	PG_RETURN_INT32(p->type);
1907 }
1908 
1909 Datum
prsd_end(PG_FUNCTION_ARGS)1910 prsd_end(PG_FUNCTION_ARGS)
1911 {
1912 	TParser    *p = (TParser *) PG_GETARG_POINTER(0);
1913 
1914 	TParserClose(p);
1915 	PG_RETURN_VOID();
1916 }
1917 
1918 
1919 /*
1920  * ts_headline support begins here
1921  */
1922 
/*
 * Token type classification macros.
 *
 * Each one takes a token type code (one of the output token categories
 * defined at the top of this file) and yields nonzero if the type belongs
 * to the class.  The argument is evaluated more than once.
 */
#define LEAVETOKEN(x)	((x) == SPACE)
#define COMPLEXTOKEN(x) \
	((x) == URL_T || (x) == NUMHWORD || (x) == ASCIIHWORD || (x) == HWORD)
#define ENDPUNCTOKEN(x) ((x) == SPACE)

#define TS_IDIGNORE(x) \
	((x) == TAG_T || (x) == PROTOCOL || (x) == SPACE || (x) == XMLENTITY)
/* token types whose words mark_fragment() flags as "replace" */
#define HLIDREPLACE(x)	((x) == TAG_T)
/* token types whose words mark_fragment() flags as "skip" */
#define HLIDSKIP(x) \
	((x) == URL_T || (x) == NUMHWORD || (x) == ASCIIHWORD || (x) == HWORD)
/* same, but applied by mark_fragment() in highlightall mode */
#define XMLHLIDSKIP(x) \
	((x) == URL_T || (x) == NUMHWORD || (x) == ASCIIHWORD || (x) == HWORD)
/* token types that the headline selectors do not count as words */
#define NONWORDTOKEN(x) \
	((x) == SPACE || HLIDREPLACE(x) || HLIDSKIP(x))
/* token types that are not acceptable as a headline endpoint */
#define NOENDTOKEN(x) \
	(NONWORDTOKEN(x) || (x) == SCIENTIFIC || (x) == VERSIONNUMBER || \
	 (x) == DECIMAL_T || (x) == SIGNEDINT || (x) == UNSIGNEDINT || \
	 TS_IDIGNORE(x))
1934 
/*
 * Helper macros for headline selection.  They can only be used where a
 * "HeadlineParsedText *prs" describing the text is in scope; BADENDPOINT
 * additionally needs "int shortword", the "short word" length parameter.
 */

/* An interesting word is a search term that is not marked as repeated */
#define INTERESTINGWORD(j) \
	(prs->words[(j)].item != NULL && !prs->words[(j)].repeated)

/*
 * A bad endpoint is a non-word token or a word of at most shortword length,
 * except that an interesting word is always an acceptable endpoint.
 */
#define BADENDPOINT(j) \
	((NOENDTOKEN(prs->words[(j)].type) || \
	  prs->words[(j)].len <= shortword) && \
	 !INTERESTINGWORD(j))
1949 
1950 typedef struct
1951 {
1952 	/* one cover (well, really one fragment) for mark_hl_fragments */
1953 	int32		startpos;		/* fragment's starting word index */
1954 	int32		endpos;			/* ending word index (inclusive) */
1955 	int32		poslen;			/* number of interesting words */
1956 	int32		curlen;			/* total number of words */
1957 	bool		chosen;			/* chosen? */
1958 	bool		excluded;		/* excluded? */
1959 } CoverPos;
1960 
1961 typedef struct
1962 {
1963 	/* callback data for checkcondition_HL */
1964 	HeadlineWordEntry *words;
1965 	int			len;
1966 } hlCheck;
1967 
1968 
1969 /*
1970  * TS_execute callback for matching a tsquery operand to headline words
1971  */
1972 static bool
checkcondition_HL(void * opaque,QueryOperand * val,ExecPhraseData * data)1973 checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
1974 {
1975 	hlCheck    *checkval = (hlCheck *) opaque;
1976 	int			i;
1977 
1978 	/* scan words array for marching items */
1979 	for (i = 0; i < checkval->len; i++)
1980 	{
1981 		if (checkval->words[i].item == val)
1982 		{
1983 			/* if data == NULL, don't need to report positions */
1984 			if (!data)
1985 				return true;
1986 
1987 			if (!data->pos)
1988 			{
1989 				data->pos = palloc(sizeof(WordEntryPos) * checkval->len);
1990 				data->allocated = true;
1991 				data->npos = 1;
1992 				data->pos[0] = checkval->words[i].pos;
1993 			}
1994 			else if (data->pos[data->npos - 1] < checkval->words[i].pos)
1995 			{
1996 				data->pos[data->npos++] = checkval->words[i].pos;
1997 			}
1998 		}
1999 	}
2000 
2001 	if (data && data->npos > 0)
2002 		return true;
2003 
2004 	return false;
2005 }
2006 
2007 /*
2008  * hlFirstIndex: find first index >= pos containing any word used in query
2009  *
2010  * Returns -1 if no such index
2011  */
2012 static int
hlFirstIndex(HeadlineParsedText * prs,int pos)2013 hlFirstIndex(HeadlineParsedText *prs, int pos)
2014 {
2015 	int			i;
2016 
2017 	for (i = pos; i < prs->curwords; i++)
2018 	{
2019 		if (prs->words[i].item != NULL)
2020 			return i;
2021 	}
2022 	return -1;
2023 }
2024 
2025 /*
2026  * hlCover: try to find a substring of prs' word list that satisfies query
2027  *
2028  * At entry, *p must be the first word index to consider (initialize this
2029  * to zero, or to the next index after a previous successful search).
2030  * We will consider all substrings starting at or after that word, and
2031  * containing no more than max_cover words.  (We need a length limit to
2032  * keep this from taking O(N^2) time for a long document with many query
2033  * words but few complete matches.  Actually, since checkcondition_HL is
2034  * roughly O(N) in the length of the substring being checked, it's even
2035  * worse than that.)
2036  *
2037  * On success, sets *p to first word index and *q to last word index of the
2038  * cover substring, and returns true.
2039  *
2040  * The result is a minimal cover, in the sense that both *p and *q will be
2041  * words used in the query.
2042  */
2043 static bool
hlCover(HeadlineParsedText * prs,TSQuery query,int max_cover,int * p,int * q)2044 hlCover(HeadlineParsedText *prs, TSQuery query, int max_cover,
2045 		int *p, int *q)
2046 {
2047 	int			pmin,
2048 				pmax,
2049 				nextpmin,
2050 				nextpmax;
2051 	hlCheck		ch;
2052 
2053 	/*
2054 	 * We look for the earliest, shortest substring of prs->words that
2055 	 * satisfies the query.  Both the pmin and pmax indices must be words
2056 	 * appearing in the query; there's no point in trying endpoints in between
2057 	 * such points.
2058 	 */
2059 	pmin = hlFirstIndex(prs, *p);
2060 	while (pmin >= 0)
2061 	{
2062 		/* This useless assignment just keeps stupider compilers quiet */
2063 		nextpmin = -1;
2064 		/* Consider substrings starting at pmin */
2065 		ch.words = &(prs->words[pmin]);
2066 		/* Consider the length-one substring first, then longer substrings */
2067 		pmax = pmin;
2068 		do
2069 		{
2070 			/* Try to match query against pmin .. pmax substring */
2071 			ch.len = pmax - pmin + 1;
2072 			if (TS_execute(GETQUERY(query), &ch,
2073 						   TS_EXEC_EMPTY, checkcondition_HL))
2074 			{
2075 				*p = pmin;
2076 				*q = pmax;
2077 				return true;
2078 			}
2079 			/* Nope, so advance pmax to next feasible endpoint */
2080 			nextpmax = hlFirstIndex(prs, pmax + 1);
2081 
2082 			/*
2083 			 * If this is our first advance past pmin, then the result is also
2084 			 * the next feasible value of pmin; remember it to save a
2085 			 * redundant search.
2086 			 */
2087 			if (pmax == pmin)
2088 				nextpmin = nextpmax;
2089 			pmax = nextpmax;
2090 		}
2091 		while (pmax >= 0 && pmax - pmin < max_cover);
2092 		/* No luck here, so try next feasible startpoint */
2093 		pmin = nextpmin;
2094 	}
2095 	return false;
2096 }
2097 
2098 /*
2099  * Apply suitable highlight marking to words selected by headline selector
2100  *
2101  * The words from startpos to endpos inclusive are marked per highlightall
2102  */
2103 static void
mark_fragment(HeadlineParsedText * prs,bool highlightall,int startpos,int endpos)2104 mark_fragment(HeadlineParsedText *prs, bool highlightall,
2105 			  int startpos, int endpos)
2106 {
2107 	int			i;
2108 
2109 	for (i = startpos; i <= endpos; i++)
2110 	{
2111 		if (prs->words[i].item)
2112 			prs->words[i].selected = 1;
2113 		if (!highlightall)
2114 		{
2115 			if (HLIDREPLACE(prs->words[i].type))
2116 				prs->words[i].replace = 1;
2117 			else if (HLIDSKIP(prs->words[i].type))
2118 				prs->words[i].skip = 1;
2119 		}
2120 		else
2121 		{
2122 			if (XMLHLIDSKIP(prs->words[i].type))
2123 				prs->words[i].skip = 1;
2124 		}
2125 
2126 		prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
2127 	}
2128 }
2129 
2130 /*
2131  * split a cover substring into fragments not longer than max_words
2132  *
2133  * At entry, *startpos and *endpos are the (remaining) bounds of the cover
2134  * substring.  They are updated to hold the bounds of the next fragment.
2135  *
2136  * *curlen and *poslen are set to the fragment's length, in words and
2137  * interesting words respectively.
2138  */
2139 static void
get_next_fragment(HeadlineParsedText * prs,int * startpos,int * endpos,int * curlen,int * poslen,int max_words)2140 get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
2141 				  int *curlen, int *poslen, int max_words)
2142 {
2143 	int			i;
2144 
2145 	/*
2146 	 * Objective: select a fragment of words between startpos and endpos such
2147 	 * that it has at most max_words and both ends have query words. If the
2148 	 * startpos and endpos are the endpoints of the cover and the cover has
2149 	 * fewer words than max_words, then this function should just return the
2150 	 * cover
2151 	 */
2152 	/* first move startpos to an item */
2153 	for (i = *startpos; i <= *endpos; i++)
2154 	{
2155 		*startpos = i;
2156 		if (INTERESTINGWORD(i))
2157 			break;
2158 	}
2159 	/* cut endpos to have only max_words */
2160 	*curlen = 0;
2161 	*poslen = 0;
2162 	for (i = *startpos; i <= *endpos && *curlen < max_words; i++)
2163 	{
2164 		if (!NONWORDTOKEN(prs->words[i].type))
2165 			*curlen += 1;
2166 		if (INTERESTINGWORD(i))
2167 			*poslen += 1;
2168 	}
2169 	/* if the cover was cut then move back endpos to a query item */
2170 	if (*endpos > i)
2171 	{
2172 		*endpos = i;
2173 		for (i = *endpos; i >= *startpos; i--)
2174 		{
2175 			*endpos = i;
2176 			if (INTERESTINGWORD(i))
2177 				break;
2178 			if (!NONWORDTOKEN(prs->words[i].type))
2179 				*curlen -= 1;
2180 		}
2181 	}
2182 }
2183 
2184 /*
2185  * Headline selector used when MaxFragments > 0
2186  *
2187  * Note: in this mode, highlightall is disregarded for phrase selection;
2188  * it only controls presentation details.
2189  */
2190 static void
mark_hl_fragments(HeadlineParsedText * prs,TSQuery query,bool highlightall,int shortword,int min_words,int max_words,int max_fragments,int max_cover)2191 mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
2192 				  int shortword, int min_words,
2193 				  int max_words, int max_fragments, int max_cover)
2194 {
2195 	int32		poslen,
2196 				curlen,
2197 				i,
2198 				f,
2199 				num_f = 0;
2200 	int32		stretch,
2201 				maxstretch,
2202 				posmarker;
2203 
2204 	int32		startpos = 0,
2205 				endpos = 0,
2206 				p = 0,
2207 				q = 0;
2208 
2209 	int32		numcovers = 0,
2210 				maxcovers = 32;
2211 
2212 	int32		minI,
2213 				minwords,
2214 				maxitems;
2215 	CoverPos   *covers;
2216 
2217 	covers = palloc(maxcovers * sizeof(CoverPos));
2218 
2219 	/* get all covers */
2220 	while (hlCover(prs, query, max_cover, &p, &q))
2221 	{
2222 		startpos = p;
2223 		endpos = q;
2224 
2225 		/*
2226 		 * Break the cover into smaller fragments such that each fragment has
2227 		 * at most max_words. Also ensure that each end of each fragment is a
2228 		 * query word. This will allow us to stretch the fragment in either
2229 		 * direction
2230 		 */
2231 
2232 		while (startpos <= endpos)
2233 		{
2234 			get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
2235 			if (numcovers >= maxcovers)
2236 			{
2237 				maxcovers *= 2;
2238 				covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
2239 			}
2240 			covers[numcovers].startpos = startpos;
2241 			covers[numcovers].endpos = endpos;
2242 			covers[numcovers].curlen = curlen;
2243 			covers[numcovers].poslen = poslen;
2244 			covers[numcovers].chosen = false;
2245 			covers[numcovers].excluded = false;
2246 			numcovers++;
2247 			startpos = endpos + 1;
2248 			endpos = q;
2249 		}
2250 
2251 		/* move p to generate the next cover */
2252 		p++;
2253 	}
2254 
2255 	/* choose best covers */
2256 	for (f = 0; f < max_fragments; f++)
2257 	{
2258 		maxitems = 0;
2259 		minwords = PG_INT32_MAX;
2260 		minI = -1;
2261 
2262 		/*
2263 		 * Choose the cover that contains max items. In case of tie choose the
2264 		 * one with smaller number of words.
2265 		 */
2266 		for (i = 0; i < numcovers; i++)
2267 		{
2268 			if (!covers[i].chosen && !covers[i].excluded &&
2269 				(maxitems < covers[i].poslen ||
2270 				 (maxitems == covers[i].poslen &&
2271 				  minwords > covers[i].curlen)))
2272 			{
2273 				maxitems = covers[i].poslen;
2274 				minwords = covers[i].curlen;
2275 				minI = i;
2276 			}
2277 		}
2278 		/* if a cover was found mark it */
2279 		if (minI >= 0)
2280 		{
2281 			covers[minI].chosen = true;
2282 			/* adjust the size of cover */
2283 			startpos = covers[minI].startpos;
2284 			endpos = covers[minI].endpos;
2285 			curlen = covers[minI].curlen;
2286 			/* stretch the cover if cover size is lower than max_words */
2287 			if (curlen < max_words)
2288 			{
2289 				/* divide the stretch on both sides of cover */
2290 				maxstretch = (max_words - curlen) / 2;
2291 
2292 				/*
2293 				 * first stretch the startpos stop stretching if 1. we hit the
2294 				 * beginning of document 2. exceed maxstretch 3. we hit an
2295 				 * already marked fragment
2296 				 */
2297 				stretch = 0;
2298 				posmarker = startpos;
2299 				for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
2300 				{
2301 					if (!NONWORDTOKEN(prs->words[i].type))
2302 					{
2303 						curlen++;
2304 						stretch++;
2305 					}
2306 					posmarker = i;
2307 				}
2308 				/* cut back startpos till we find a good endpoint */
2309 				for (i = posmarker; i < startpos && BADENDPOINT(i); i++)
2310 				{
2311 					if (!NONWORDTOKEN(prs->words[i].type))
2312 						curlen--;
2313 				}
2314 				startpos = i;
2315 				/* now stretch the endpos as much as possible */
2316 				posmarker = endpos;
2317 				for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
2318 				{
2319 					if (!NONWORDTOKEN(prs->words[i].type))
2320 						curlen++;
2321 					posmarker = i;
2322 				}
2323 				/* cut back endpos till we find a good endpoint */
2324 				for (i = posmarker; i > endpos && BADENDPOINT(i); i--)
2325 				{
2326 					if (!NONWORDTOKEN(prs->words[i].type))
2327 						curlen--;
2328 				}
2329 				endpos = i;
2330 			}
2331 			covers[minI].startpos = startpos;
2332 			covers[minI].endpos = endpos;
2333 			covers[minI].curlen = curlen;
2334 			/* Mark the chosen fragments (covers) */
2335 			mark_fragment(prs, highlightall, startpos, endpos);
2336 			num_f++;
2337 			/* Exclude covers overlapping this one from future consideration */
2338 			for (i = 0; i < numcovers; i++)
2339 			{
2340 				if (i != minI &&
2341 					((covers[i].startpos >= startpos &&
2342 					  covers[i].startpos <= endpos) ||
2343 					 (covers[i].endpos >= startpos &&
2344 					  covers[i].endpos <= endpos) ||
2345 					 (covers[i].startpos < startpos &&
2346 					  covers[i].endpos > endpos)))
2347 					covers[i].excluded = true;
2348 			}
2349 		}
2350 		else
2351 			break;				/* no selectable covers remain */
2352 	}
2353 
2354 	/* show the first min_words words if we have not marked anything */
2355 	if (num_f <= 0)
2356 	{
2357 		startpos = endpos = curlen = 0;
2358 		for (i = 0; i < prs->curwords && curlen < min_words; i++)
2359 		{
2360 			if (!NONWORDTOKEN(prs->words[i].type))
2361 				curlen++;
2362 			endpos = i;
2363 		}
2364 		mark_fragment(prs, highlightall, startpos, endpos);
2365 	}
2366 
2367 	pfree(covers);
2368 }
2369 
2370 /*
2371  * Headline selector used when MaxFragments == 0
2372  */
2373 static void
mark_hl_words(HeadlineParsedText * prs,TSQuery query,bool highlightall,int shortword,int min_words,int max_words,int max_cover)2374 mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
2375 			  int shortword, int min_words, int max_words, int max_cover)
2376 {
2377 	int			p = 0,
2378 				q = 0;
2379 	int			bestb = -1,
2380 				beste = -1;
2381 	int			bestlen = -1;
2382 	bool		bestcover = false;
2383 	int			pose,
2384 				posb,
2385 				poslen,
2386 				curlen;
2387 	bool		poscover;
2388 	int			i;
2389 
2390 	if (!highlightall)
2391 	{
2392 		/* examine all covers, select a headline using the best one */
2393 		while (hlCover(prs, query, max_cover, &p, &q))
2394 		{
2395 			/*
2396 			 * Count words (curlen) and interesting words (poslen) within
2397 			 * cover, but stop once we reach max_words.  This step doesn't
2398 			 * consider whether that's a good stopping point.  posb and pose
2399 			 * are set to the start and end indexes of the possible headline.
2400 			 */
2401 			curlen = 0;
2402 			poslen = 0;
2403 			posb = pose = p;
2404 			for (i = p; i <= q && curlen < max_words; i++)
2405 			{
2406 				if (!NONWORDTOKEN(prs->words[i].type))
2407 					curlen++;
2408 				if (INTERESTINGWORD(i))
2409 					poslen++;
2410 				pose = i;
2411 			}
2412 
2413 			if (curlen < max_words)
2414 			{
2415 				/*
2416 				 * We have room to lengthen the headline, so search forward
2417 				 * until it's full or we find a good stopping point.  We'll
2418 				 * reconsider the word at "q", then move forward.
2419 				 */
2420 				for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
2421 				{
2422 					if (i > q)
2423 					{
2424 						if (!NONWORDTOKEN(prs->words[i].type))
2425 							curlen++;
2426 						if (INTERESTINGWORD(i))
2427 							poslen++;
2428 					}
2429 					pose = i;
2430 					if (BADENDPOINT(i))
2431 						continue;
2432 					if (curlen >= min_words)
2433 						break;
2434 				}
2435 				if (curlen < min_words)
2436 				{
2437 					/*
2438 					 * Reached end of text and our headline is still shorter
2439 					 * than min_words, so try to extend it to the left.
2440 					 */
2441 					for (i = p - 1; i >= 0; i--)
2442 					{
2443 						if (!NONWORDTOKEN(prs->words[i].type))
2444 							curlen++;
2445 						if (INTERESTINGWORD(i))
2446 							poslen++;
2447 						if (curlen >= max_words)
2448 							break;
2449 						if (BADENDPOINT(i))
2450 							continue;
2451 						if (curlen >= min_words)
2452 							break;
2453 					}
2454 					posb = (i >= 0) ? i : 0;
2455 				}
2456 			}
2457 			else
2458 			{
2459 				/*
2460 				 * Can't make headline longer, so consider making it shorter
2461 				 * if needed to avoid a bad endpoint.
2462 				 */
2463 				if (i > q)
2464 					i = q;
2465 				for (; curlen > min_words; i--)
2466 				{
2467 					if (!BADENDPOINT(i))
2468 						break;
2469 					if (!NONWORDTOKEN(prs->words[i].type))
2470 						curlen--;
2471 					if (INTERESTINGWORD(i))
2472 						poslen--;
2473 					pose = i - 1;
2474 				}
2475 			}
2476 
2477 			/*
2478 			 * Check whether the proposed headline includes the original
2479 			 * cover; it might not if we trimmed it due to max_words.
2480 			 */
2481 			poscover = (posb <= p && pose >= q);
2482 
2483 			/*
2484 			 * Adopt this headline if it's better than the last one, giving
2485 			 * highest priority to headlines including the cover, then to
2486 			 * headlines with more interesting words, then to headlines with
2487 			 * good stopping points.  (Since bestlen is initially -1, we will
2488 			 * certainly adopt the first headline.)
2489 			 */
2490 			if (poscover > bestcover ||
2491 				(poscover == bestcover && poslen > bestlen) ||
2492 				(poscover == bestcover && poslen == bestlen &&
2493 				 !BADENDPOINT(pose) && BADENDPOINT(beste)))
2494 			{
2495 				bestb = posb;
2496 				beste = pose;
2497 				bestlen = poslen;
2498 				bestcover = poscover;
2499 			}
2500 
2501 			/* move p to generate the next cover */
2502 			p++;
2503 		}
2504 
2505 		/*
2506 		 * If we found nothing acceptable, select min_words words starting at
2507 		 * the beginning.
2508 		 */
2509 		if (bestlen < 0)
2510 		{
2511 			curlen = 0;
2512 			pose = 0;
2513 			for (i = 0; i < prs->curwords && curlen < min_words; i++)
2514 			{
2515 				if (!NONWORDTOKEN(prs->words[i].type))
2516 					curlen++;
2517 				pose = i;
2518 			}
2519 			bestb = 0;
2520 			beste = pose;
2521 		}
2522 	}
2523 	else
2524 	{
2525 		/* highlightall mode: headline is whole document */
2526 		bestb = 0;
2527 		beste = prs->curwords - 1;
2528 	}
2529 
2530 	mark_fragment(prs, highlightall, bestb, beste);
2531 }
2532 
2533 /*
2534  * Default parser's prsheadline function
2535  */
2536 Datum
prsd_headline(PG_FUNCTION_ARGS)2537 prsd_headline(PG_FUNCTION_ARGS)
2538 {
2539 	HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
2540 	List	   *prsoptions = (List *) PG_GETARG_POINTER(1);
2541 	TSQuery		query = PG_GETARG_TSQUERY(2);
2542 
2543 	/* default option values: */
2544 	int			min_words = 15;
2545 	int			max_words = 35;
2546 	int			shortword = 3;
2547 	int			max_fragments = 0;
2548 	bool		highlightall = false;
2549 	int			max_cover;
2550 	ListCell   *l;
2551 
2552 	/* Extract configuration option values */
2553 	prs->startsel = NULL;
2554 	prs->stopsel = NULL;
2555 	prs->fragdelim = NULL;
2556 	foreach(l, prsoptions)
2557 	{
2558 		DefElem    *defel = (DefElem *) lfirst(l);
2559 		char	   *val = defGetString(defel);
2560 
2561 		if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
2562 			max_words = pg_strtoint32(val);
2563 		else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
2564 			min_words = pg_strtoint32(val);
2565 		else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
2566 			shortword = pg_strtoint32(val);
2567 		else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
2568 			max_fragments = pg_strtoint32(val);
2569 		else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
2570 			prs->startsel = pstrdup(val);
2571 		else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
2572 			prs->stopsel = pstrdup(val);
2573 		else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
2574 			prs->fragdelim = pstrdup(val);
2575 		else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
2576 			highlightall = (pg_strcasecmp(val, "1") == 0 ||
2577 							pg_strcasecmp(val, "on") == 0 ||
2578 							pg_strcasecmp(val, "true") == 0 ||
2579 							pg_strcasecmp(val, "t") == 0 ||
2580 							pg_strcasecmp(val, "y") == 0 ||
2581 							pg_strcasecmp(val, "yes") == 0);
2582 		else
2583 			ereport(ERROR,
2584 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2585 					 errmsg("unrecognized headline parameter: \"%s\"",
2586 							defel->defname)));
2587 	}
2588 
2589 	/*
2590 	 * We might eventually make max_cover a user-settable parameter, but for
2591 	 * now, just compute a reasonable value based on max_words and
2592 	 * max_fragments.
2593 	 */
2594 	max_cover = Max(max_words * 10, 100);
2595 	if (max_fragments > 0)
2596 		max_cover *= max_fragments;
2597 
2598 	/* in HighlightAll mode these parameters are ignored */
2599 	if (!highlightall)
2600 	{
2601 		if (min_words >= max_words)
2602 			ereport(ERROR,
2603 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2604 					 errmsg("MinWords should be less than MaxWords")));
2605 		if (min_words <= 0)
2606 			ereport(ERROR,
2607 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2608 					 errmsg("MinWords should be positive")));
2609 		if (shortword < 0)
2610 			ereport(ERROR,
2611 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2612 					 errmsg("ShortWord should be >= 0")));
2613 		if (max_fragments < 0)
2614 			ereport(ERROR,
2615 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2616 					 errmsg("MaxFragments should be >= 0")));
2617 	}
2618 
2619 	/* Apply appropriate headline selector */
2620 	if (max_fragments == 0)
2621 		mark_hl_words(prs, query, highlightall, shortword,
2622 					  min_words, max_words, max_cover);
2623 	else
2624 		mark_hl_fragments(prs, query, highlightall, shortword,
2625 						  min_words, max_words, max_fragments, max_cover);
2626 
2627 	/* Fill in default values for string options */
2628 	if (!prs->startsel)
2629 		prs->startsel = pstrdup("<b>");
2630 	if (!prs->stopsel)
2631 		prs->stopsel = pstrdup("</b>");
2632 	if (!prs->fragdelim)
2633 		prs->fragdelim = pstrdup(" ... ");
2634 
2635 	/* Caller will need these lengths, too */
2636 	prs->startsellen = strlen(prs->startsel);
2637 	prs->stopsellen = strlen(prs->stopsel);
2638 	prs->fragdelimlen = strlen(prs->fragdelim);
2639 
2640 	PG_RETURN_POINTER(prs);
2641 }
2642