1 /*-------------------------------------------------------------------------
2  *
3  * wparser_def.c
4  *		Default text search parser
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  *
8  *
9  * IDENTIFICATION
10  *	  src/backend/tsearch/wparser_def.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres.h"
16 
17 #include <limits.h>
18 
19 #include "catalog/pg_collation.h"
20 #include "commands/defrem.h"
21 #include "tsearch/ts_locale.h"
22 #include "tsearch/ts_public.h"
23 #include "tsearch/ts_type.h"
24 #include "tsearch/ts_utils.h"
25 #include "utils/builtins.h"
26 
27 
28 /* Define me to enable tracing of parser behavior */
29 /* #define WPARSER_TRACE */
30 
31 
/*
 * Output token categories.
 *
 * The numbering lines up with the entries of tok_alias[] and lex_descr[]
 * below (slot 0 of those arrays is an empty placeholder), so a category
 * that is added or renumbered here needs the matching change in both tables.
 */

#define ASCIIWORD		1
#define WORD_T			2
#define NUMWORD			3
#define EMAIL			4
#define URL_T			5
#define HOST			6
#define SCIENTIFIC		7
#define VERSIONNUMBER	8
#define NUMPARTHWORD	9
#define PARTHWORD		10
#define ASCIIPARTHWORD	11
#define SPACE			12
#define TAG_T			13
#define PROTOCOL		14
#define NUMHWORD		15
#define ASCIIHWORD		16
#define HWORD			17
#define URLPATH			18
#define FILEPATH		19
#define DECIMAL_T		20
#define SIGNEDINT		21
#define UNSIGNEDINT		22
#define XMLENTITY		23

/* Highest token category number defined above */
#define LASTNUM			23
59 
/*
 * Short alias of each token category.  The array is laid out so that entry N
 * belongs to token category N (e.g. entry ASCIIWORD is "asciiword"); entry 0
 * is an empty placeholder because category numbers start at 1.
 */
static const char *const tok_alias[] = {
	"",
	"asciiword",
	"word",
	"numword",
	"email",
	"url",
	"host",
	"sfloat",
	"version",
	"hword_numpart",
	"hword_part",
	"hword_asciipart",
	"blank",
	"tag",
	"protocol",
	"numhword",
	"asciihword",
	"hword",
	"url_path",
	"file",
	"float",
	"int",
	"uint",
	"entity"
};
86 
/*
 * Human-readable description of each token category, in the same order as
 * tok_alias[]: entry N describes token category N, and entry 0 is an empty
 * placeholder.
 */
static const char *const lex_descr[] = {
	"",
	"Word, all ASCII",
	"Word, all letters",
	"Word, letters and digits",
	"Email address",
	"URL",
	"Host",
	"Scientific notation",
	"Version number",
	"Hyphenated word part, letters and digits",
	"Hyphenated word part, all letters",
	"Hyphenated word part, all ASCII",
	"Space symbols",
	"XML tag",
	"Protocol head",
	"Hyphenated word, letters and digits",
	"Hyphenated word, all ASCII",
	"Hyphenated word, all letters",
	"URL path",
	"File or path name",
	"Decimal notation",
	"Signed integer",
	"Unsigned integer",
	"XML entity"
};
113 
114 
/*
 * Parser states.
 *
 * These are the states named in the "tostate" column of the action tables
 * further down.  TPS_Base is the state a freshly created parser starts in
 * (see TParserInit); TPS_Null is not a real state but a marker that closes
 * the list.
 */

typedef enum
{
	TPS_Base = 0,
	TPS_InNumWord,
	TPS_InAsciiWord,
	TPS_InWord,
	TPS_InUnsignedInt,
	TPS_InSignedIntFirst,
	TPS_InSignedInt,
	TPS_InSpace,
	TPS_InUDecimalFirst,
	TPS_InUDecimal,
	TPS_InDecimalFirst,
	TPS_InDecimal,
	TPS_InVerVersion,
	TPS_InSVerVersion,
	TPS_InVersionFirst,
	TPS_InVersion,
	TPS_InMantissaFirst,
	TPS_InMantissaSign,
	TPS_InMantissa,
	TPS_InXMLEntityFirst,
	TPS_InXMLEntity,
	TPS_InXMLEntityNumFirst,
	TPS_InXMLEntityNum,
	TPS_InXMLEntityHexNumFirst,
	TPS_InXMLEntityHexNum,
	TPS_InXMLEntityEnd,
	TPS_InTagFirst,
	TPS_InXMLBegin,
	TPS_InTagCloseFirst,
	TPS_InTagName,
	TPS_InTagBeginEnd,
	TPS_InTag,
	TPS_InTagEscapeK,
	TPS_InTagEscapeKK,
	TPS_InTagBackSleshed,
	TPS_InTagEnd,
	TPS_InCommentFirst,
	TPS_InCommentLast,
	TPS_InComment,
	TPS_InCloseCommentFirst,
	TPS_InCloseCommentLast,
	TPS_InCommentEnd,
	TPS_InHostFirstDomain,
	TPS_InHostDomainSecond,
	TPS_InHostDomain,
	TPS_InPortFirst,
	TPS_InPort,
	TPS_InHostFirstAN,
	TPS_InHost,
	TPS_InEmail,
	TPS_InFileFirst,
	TPS_InFileTwiddle,
	TPS_InPathFirst,
	TPS_InPathFirstFirst,
	TPS_InPathSecond,
	TPS_InFile,
	TPS_InFileNext,
	TPS_InURLPathFirst,
	TPS_InURLPathStart,
	TPS_InURLPath,
	TPS_InFURL,
	TPS_InProtocolFirst,
	TPS_InProtocolSecond,
	TPS_InProtocolEnd,
	TPS_InHyphenAsciiWordFirst,
	TPS_InHyphenAsciiWord,
	TPS_InHyphenWordFirst,
	TPS_InHyphenWord,
	TPS_InHyphenNumWordFirst,
	TPS_InHyphenNumWord,
	TPS_InHyphenDigitLookahead,
	TPS_InParseHyphen,
	TPS_InParseHyphenHyphen,
	TPS_InHyphenWordPart,
	TPS_InHyphenAsciiWordPart,
	TPS_InHyphenNumWordPart,
	TPS_InHyphenUnsignedInt,
	TPS_Null					/* last state (fake value) */
} TParserState;
198 
/* forward declaration */
struct TParser;

/*
 * Callback types used by the action tables.  Both receive the parser; a
 * character test returns an int (the p_is* functions in this file return
 * nonzero for "matches"), a special handler returns nothing.
 */
typedef int (*TParserCharTest) (struct TParser *);	/* any p_is* functions
													 * except p_iseq */
typedef void (*TParserSpecial) (struct TParser *);	/* special handler for
													 * special cases... */

/*
 * One row of a per-state action table (see actionTPS_Base and friends).
 */
typedef struct
{
	TParserCharTest isclass;	/* test for this row; the tables visible here
								 * end with a row whose test is NULL */
	char		c;				/* character operand; nonzero only in rows
								 * that use p_iseqC.  Presumably copied to
								 * TParser.c before the test runs -- confirm
								 * in TParserGet */
	uint16		flags;			/* OR of the A_* flag bits */
	TParserState tostate;		/* state named as the target of this row */
	int			type;			/* token category, or 0 when none is given */
	TParserSpecial special;		/* optional extra handler, or NULL */
} TParserStateActionItem;
216 
/*
 * Flag bits in TParserStateActionItem.flags.
 *
 * A_NEXT is zero, i.e. it stands for "no flag bit set"; the others are
 * distinct bits that can be OR'ed together.  The code that interprets them
 * (TParserGet) lies outside this part of the file, so see there for the
 * exact meaning of each bit.
 */
#define A_NEXT		0x0000
#define A_BINGO		0x0001
#define A_POP		0x0002
#define A_PUSH		0x0004
#define A_RERUN		0x0008
#define A_CLEAR		0x0010
#define A_MERGE		0x0020
#define A_CLRALL	0x0040
226 
/*
 * Snapshot of where the parser is in the input and which state it is in.
 *
 * Positions are chained through "prev": newTParserPosition() links each new
 * entry to the one it was created from, and TParserClose() frees the chain
 * by following those links.
 */
typedef struct TParserPosition
{
	int			posbyte;		/* position of parser in bytes */
	int			poschar;		/* position of parser in characters */
	int			charlen;		/* length of current char */
	int			lenbytetoken;	/* length of token-so-far in bytes */
	int			lenchartoken;	/* and in chars */
	TParserState state;
	struct TParserPosition *prev;	/* entry this one was created from, or
									 * NULL for the first entry */
	const TParserStateActionItem *pushedAtAction;	/* reset to NULL by
													 * newTParserPosition() */
} TParserPosition;
238 
/*
 * Complete parser object: the input text, the chain of positions, a few
 * mode flags, and the most recently produced token.
 */
typedef struct TParser
{
	/* string and position information */
	char	   *str;			/* multibyte string */
	int			lenstr;			/* length of mbstring */
#ifdef USE_WIDE_UPPER_LOWER
	wchar_t    *wstr;			/* wide character string */
	pg_wchar   *pgwstr;			/* wide character string for C-locale */
	bool		usewide;		/* true when the max encoding length is > 1
								 * (see TParserInit) */
#endif

	/* State of parse */
	int			charmaxlen;		/* pg_database_encoding_max_length() */
	TParserPosition *state;		/* current position; head of the prev chain */
	bool		ignore;			/* switched on/off by SpecialTags() for
								 * <script>/<style> sections */
	bool		wanthost;		/* set by SpecialFURL() and p_ishost(),
								 * consumed by p_isstophost() */

	/* silly char */
	char		c;				/* character that p_iseqC/p_isneC compare
								 * against */

	/* out */
	char	   *token;
	int			lenbytetoken;
	int			lenchartoken;
	int			type;
} TParser;
265 
266 
267 /* forward decls here */
268 static bool TParserGet(TParser *prs);
269 
270 
271 static TParserPosition *
newTParserPosition(TParserPosition * prev)272 newTParserPosition(TParserPosition *prev)
273 {
274 	TParserPosition *res = (TParserPosition *) palloc(sizeof(TParserPosition));
275 
276 	if (prev)
277 		memcpy(res, prev, sizeof(TParserPosition));
278 	else
279 		memset(res, 0, sizeof(TParserPosition));
280 
281 	res->prev = prev;
282 
283 	res->pushedAtAction = NULL;
284 
285 	return res;
286 }
287 
288 static TParser *
TParserInit(char * str,int len)289 TParserInit(char *str, int len)
290 {
291 	TParser    *prs = (TParser *) palloc0(sizeof(TParser));
292 
293 	prs->charmaxlen = pg_database_encoding_max_length();
294 	prs->str = str;
295 	prs->lenstr = len;
296 
297 #ifdef USE_WIDE_UPPER_LOWER
298 
299 	/*
300 	 * Use wide char code only when max encoding length > 1.
301 	 */
302 	if (prs->charmaxlen > 1)
303 	{
304 		Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
305 		pg_locale_t mylocale = 0;	/* TODO */
306 
307 		prs->usewide = true;
308 		if (lc_ctype_is_c(collation))
309 		{
310 			/*
311 			 * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
312 			 * be different from sizeof(wchar_t)
313 			 */
314 			prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
315 			pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
316 		}
317 		else
318 		{
319 			prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
320 			char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
321 					   mylocale);
322 		}
323 	}
324 	else
325 		prs->usewide = false;
326 #endif
327 
328 	prs->state = newTParserPosition(NULL);
329 	prs->state->state = TPS_Base;
330 
331 #ifdef WPARSER_TRACE
332 
333 	/*
334 	 * Use of %.*s here is a bit risky since it can misbehave if the data is
335 	 * not in what libc thinks is the prevailing encoding.  However, since
336 	 * this is just a debugging aid, we choose to live with that.
337 	 */
338 	fprintf(stderr, "parsing \"%.*s\"\n", len, str);
339 #endif
340 
341 	return prs;
342 }
343 
344 /*
345  * As an alternative to a full TParserInit one can create a
346  * TParserCopy which basically is a regular TParser without a private
347  * copy of the string - instead it uses the one from another TParser.
348  * This is useful because at some places TParsers are created
349  * recursively and the repeated copying around of the strings can
350  * cause major inefficiency if the source string is long.
351  * The new parser starts parsing at the original's current position.
352  *
353  * Obviously one must not close the original TParser before the copy.
354  */
355 static TParser *
TParserCopyInit(const TParser * orig)356 TParserCopyInit(const TParser *orig)
357 {
358 	TParser    *prs = (TParser *) palloc0(sizeof(TParser));
359 
360 	prs->charmaxlen = orig->charmaxlen;
361 	prs->str = orig->str + orig->state->posbyte;
362 	prs->lenstr = orig->lenstr - orig->state->posbyte;
363 
364 #ifdef USE_WIDE_UPPER_LOWER
365 	prs->usewide = orig->usewide;
366 
367 	if (orig->pgwstr)
368 		prs->pgwstr = orig->pgwstr + orig->state->poschar;
369 	if (orig->wstr)
370 		prs->wstr = orig->wstr + orig->state->poschar;
371 #endif
372 
373 	prs->state = newTParserPosition(NULL);
374 	prs->state->state = TPS_Base;
375 
376 #ifdef WPARSER_TRACE
377 	/* See note above about %.*s */
378 	fprintf(stderr, "parsing copy of \"%.*s\"\n", prs->lenstr, prs->str);
379 #endif
380 
381 	return prs;
382 }
383 
384 
385 static void
TParserClose(TParser * prs)386 TParserClose(TParser *prs)
387 {
388 	while (prs->state)
389 	{
390 		TParserPosition *ptr = prs->state->prev;
391 
392 		pfree(prs->state);
393 		prs->state = ptr;
394 	}
395 
396 #ifdef USE_WIDE_UPPER_LOWER
397 	if (prs->wstr)
398 		pfree(prs->wstr);
399 	if (prs->pgwstr)
400 		pfree(prs->pgwstr);
401 #endif
402 
403 #ifdef WPARSER_TRACE
404 	fprintf(stderr, "closing parser\n");
405 #endif
406 	pfree(prs);
407 }
408 
409 /*
410  * Close a parser created with TParserCopyInit
411  */
412 static void
TParserCopyClose(TParser * prs)413 TParserCopyClose(TParser *prs)
414 {
415 	while (prs->state)
416 	{
417 		TParserPosition *ptr = prs->state->prev;
418 
419 		pfree(prs->state);
420 		prs->state = ptr;
421 	}
422 
423 #ifdef WPARSER_TRACE
424 	fprintf(stderr, "closing parser copy\n");
425 #endif
426 	pfree(prs);
427 }
428 
429 
/*
 * Character-type support functions, equivalent to the is* macros, but
 * working with any possible encoding and locale. Notes:
 *	- with a multibyte encoding and C locale, the isw* functions may fail
 *	  or give wrong results.
 *	- multibyte encodings combined with C locale are often used for
 *	  Asian languages.
 *	- if the locale is C then we use pgwstr instead of wstr.
 */
439 
440 #ifdef USE_WIDE_UPPER_LOWER
441 
/*
 * p_iswhat(type) emits two functions, p_is<type>() and p_isnot<type>(),
 * testing the character at the parser's current position:
 *	- single-byte encodings: is<type>() on the current byte;
 *	- wide mode with pgwstr (C locale): is<type>() for code points up to
 *	  0x7f, anything above is reported as "no";
 *	- wide mode with wstr: isw<type>() on the current wide character.
 */
#define p_iswhat(type)														\
static int																	\
p_is##type(TParser *prs)													\
{																			\
	Assert(prs->state);														\
	if (!prs->usewide)														\
		return is##type(*(unsigned char *) (prs->str + prs->state->posbyte)); \
	if (prs->pgwstr)														\
	{																		\
		unsigned int wc = prs->pgwstr[prs->state->poschar];					\
		if (wc > 0x7f)														\
			return 0;														\
		return is##type(wc);												\
	}																		\
	return isw##type(prs->wstr[prs->state->poschar]);						\
}																			\
																			\
static int																	\
p_isnot##type(TParser *prs)													\
{																			\
	return !p_is##type(prs);												\
}
465 
466 static int
p_isalnum(TParser * prs)467 p_isalnum(TParser *prs)
468 {
469 	Assert(prs->state);
470 
471 	if (prs->usewide)
472 	{
473 		if (prs->pgwstr)
474 		{
475 			unsigned int c = *(prs->pgwstr + prs->state->poschar);
476 
477 			/*
478 			 * any non-ascii symbol with multibyte encoding with C-locale is
479 			 * an alpha character
480 			 */
481 			if (c > 0x7f)
482 				return 1;
483 
484 			return isalnum(c);
485 		}
486 
487 		return iswalnum(*(prs->wstr + prs->state->poschar));
488 	}
489 
490 	return isalnum(*(unsigned char *) (prs->str + prs->state->posbyte));
491 }
492 static int
p_isnotalnum(TParser * prs)493 p_isnotalnum(TParser *prs)
494 {
495 	return !p_isalnum(prs);
496 }
497 
498 static int
p_isalpha(TParser * prs)499 p_isalpha(TParser *prs)
500 {
501 	Assert(prs->state);
502 
503 	if (prs->usewide)
504 	{
505 		if (prs->pgwstr)
506 		{
507 			unsigned int c = *(prs->pgwstr + prs->state->poschar);
508 
509 			/*
510 			 * any non-ascii symbol with multibyte encoding with C-locale is
511 			 * an alpha character
512 			 */
513 			if (c > 0x7f)
514 				return 1;
515 
516 			return isalpha(c);
517 		}
518 
519 		return iswalpha(*(prs->wstr + prs->state->poschar));
520 	}
521 
522 	return isalpha(*(unsigned char *) (prs->str + prs->state->posbyte));
523 }
524 
525 static int
p_isnotalpha(TParser * prs)526 p_isnotalpha(TParser *prs)
527 {
528 	return !p_isalpha(prs);
529 }
530 
531 /* p_iseq should be used only for ascii symbols */
532 
533 static int
p_iseq(TParser * prs,char c)534 p_iseq(TParser *prs, char c)
535 {
536 	Assert(prs->state);
537 	return ((prs->state->charlen == 1 && *(prs->str + prs->state->posbyte) == c)) ? 1 : 0;
538 }
539 #else							/* USE_WIDE_UPPER_LOWER */
540 
/*
 * p_iswhat(type) emits p_is<type>() and p_isnot<type>().  Without
 * wide-character support the test is simply is<type>() applied to the byte
 * at the parser's current position.
 */
#define p_iswhat(type)														\
static int																	\
p_is##type(TParser *prs)													\
{																			\
	unsigned char ch;														\
																			\
	Assert(prs->state);														\
	ch = (unsigned char) prs->str[prs->state->posbyte];						\
	return is##type(ch);													\
}																			\
																			\
static int																	\
p_isnot##type(TParser *prs)													\
{																			\
	return !p_is##type(prs);												\
}
552 
553 
554 static int
p_iseq(TParser * prs,char c)555 p_iseq(TParser *prs, char c)
556 {
557 	Assert(prs->state);
558 	return (*(prs->str + prs->state->posbyte) == c) ? 1 : 0;
559 }
560 
/*
 * Without wide-character support, p_isalnum/p_isalpha (and their p_isnot*
 * counterparts) come from the generic macro instead of being hand-written.
 */
p_iswhat(alnum)
p_iswhat(alpha)
563 #endif							/* USE_WIDE_UPPER_LOWER */
564 
/*
 * Generate p_is<class>() and p_isnot<class>() for the remaining character
 * classes, using whichever p_iswhat definition was selected above.
 */
p_iswhat(digit)
p_iswhat(lower)
p_iswhat(print)
p_iswhat(punct)
p_iswhat(space)
p_iswhat(upper)
p_iswhat(xdigit)
572 
573 static int
574 p_isEOF(TParser *prs)
575 {
576 	Assert(prs->state);
577 	return (prs->state->posbyte == prs->lenstr || prs->state->charlen == 0) ? 1 : 0;
578 }
579 
580 static int
p_iseqC(TParser * prs)581 p_iseqC(TParser *prs)
582 {
583 	return p_iseq(prs, prs->c);
584 }
585 
586 static int
p_isneC(TParser * prs)587 p_isneC(TParser *prs)
588 {
589 	return !p_iseq(prs, prs->c);
590 }
591 
592 static int
p_isascii(TParser * prs)593 p_isascii(TParser *prs)
594 {
595 	return (prs->state->charlen == 1 && isascii((unsigned char) *(prs->str + prs->state->posbyte))) ? 1 : 0;
596 }
597 
598 static int
p_isasclet(TParser * prs)599 p_isasclet(TParser *prs)
600 {
601 	return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
602 }
603 
604 static int
p_isurlchar(TParser * prs)605 p_isurlchar(TParser *prs)
606 {
607 	char		ch;
608 
609 	/* no non-ASCII need apply */
610 	if (prs->state->charlen != 1)
611 		return 0;
612 	ch = *(prs->str + prs->state->posbyte);
613 	/* no spaces or control characters */
614 	if (ch <= 0x20 || ch >= 0x7F)
615 		return 0;
616 	/* reject characters disallowed by RFC 3986 */
617 	switch (ch)
618 	{
619 		case '"':
620 		case '<':
621 		case '>':
622 		case '\\':
623 		case '^':
624 		case '`':
625 		case '{':
626 		case '|':
627 		case '}':
628 			return 0;
629 	}
630 	return 1;
631 }
632 
633 
/*
 * References every character-test function once, purely so that the
 * compiler does not complain about the unused static functions defined
 * above.
 */
void		_make_compiler_happy(void);
void
_make_compiler_happy(void)
{
	(void) p_isalnum(NULL);
	(void) p_isnotalnum(NULL);
	(void) p_isalpha(NULL);
	(void) p_isnotalpha(NULL);
	(void) p_isdigit(NULL);
	(void) p_isnotdigit(NULL);
	(void) p_islower(NULL);
	(void) p_isnotlower(NULL);
	(void) p_isprint(NULL);
	(void) p_isnotprint(NULL);
	(void) p_ispunct(NULL);
	(void) p_isnotpunct(NULL);
	(void) p_isspace(NULL);
	(void) p_isnotspace(NULL);
	(void) p_isupper(NULL);
	(void) p_isnotupper(NULL);
	(void) p_isxdigit(NULL);
	(void) p_isnotxdigit(NULL);
	(void) p_isEOF(NULL);
	(void) p_iseqC(NULL);
	(void) p_isneC(NULL);
}
661 
662 
663 static void
SpecialTags(TParser * prs)664 SpecialTags(TParser *prs)
665 {
666 	switch (prs->state->lenchartoken)
667 	{
668 		case 8:					/* </script */
669 			if (pg_strncasecmp(prs->token, "</script", 8) == 0)
670 				prs->ignore = false;
671 			break;
672 		case 7:					/* <script || </style */
673 			if (pg_strncasecmp(prs->token, "</style", 7) == 0)
674 				prs->ignore = false;
675 			else if (pg_strncasecmp(prs->token, "<script", 7) == 0)
676 				prs->ignore = true;
677 			break;
678 		case 6:					/* <style */
679 			if (pg_strncasecmp(prs->token, "<style", 6) == 0)
680 				prs->ignore = true;
681 			break;
682 		default:
683 			break;
684 	}
685 }
686 
687 static void
SpecialFURL(TParser * prs)688 SpecialFURL(TParser *prs)
689 {
690 	prs->wanthost = true;
691 	prs->state->posbyte -= prs->state->lenbytetoken;
692 	prs->state->poschar -= prs->state->lenchartoken;
693 }
694 
695 static void
SpecialHyphen(TParser * prs)696 SpecialHyphen(TParser *prs)
697 {
698 	prs->state->posbyte -= prs->state->lenbytetoken;
699 	prs->state->poschar -= prs->state->lenchartoken;
700 }
701 
702 static void
SpecialVerVersion(TParser * prs)703 SpecialVerVersion(TParser *prs)
704 {
705 	prs->state->posbyte -= prs->state->lenbytetoken;
706 	prs->state->poschar -= prs->state->lenchartoken;
707 	prs->state->lenbytetoken = 0;
708 	prs->state->lenchartoken = 0;
709 }
710 
711 static int
p_isstophost(TParser * prs)712 p_isstophost(TParser *prs)
713 {
714 	if (prs->wanthost)
715 	{
716 		prs->wanthost = false;
717 		return 1;
718 	}
719 	return 0;
720 }
721 
722 static int
p_isignore(TParser * prs)723 p_isignore(TParser *prs)
724 {
725 	return (prs->ignore) ? 1 : 0;
726 }
727 
728 static int
p_ishost(TParser * prs)729 p_ishost(TParser *prs)
730 {
731 	TParser    *tmpprs = TParserCopyInit(prs);
732 	int			res = 0;
733 
734 	tmpprs->wanthost = true;
735 
736 	if (TParserGet(tmpprs) && tmpprs->type == HOST)
737 	{
738 		prs->state->posbyte += tmpprs->lenbytetoken;
739 		prs->state->poschar += tmpprs->lenchartoken;
740 		prs->state->lenbytetoken += tmpprs->lenbytetoken;
741 		prs->state->lenchartoken += tmpprs->lenchartoken;
742 		prs->state->charlen = tmpprs->state->charlen;
743 		res = 1;
744 	}
745 	TParserCopyClose(tmpprs);
746 
747 	return res;
748 }
749 
750 static int
p_isURLPath(TParser * prs)751 p_isURLPath(TParser *prs)
752 {
753 	TParser    *tmpprs = TParserCopyInit(prs);
754 	int			res = 0;
755 
756 	tmpprs->state = newTParserPosition(tmpprs->state);
757 	tmpprs->state->state = TPS_InURLPathFirst;
758 
759 	if (TParserGet(tmpprs) && tmpprs->type == URLPATH)
760 	{
761 		prs->state->posbyte += tmpprs->lenbytetoken;
762 		prs->state->poschar += tmpprs->lenchartoken;
763 		prs->state->lenbytetoken += tmpprs->lenbytetoken;
764 		prs->state->lenchartoken += tmpprs->lenchartoken;
765 		prs->state->charlen = tmpprs->state->charlen;
766 		res = 1;
767 	}
768 	TParserCopyClose(tmpprs);
769 
770 	return res;
771 }
772 
773 /*
774  * returns true if current character has zero display length or
775  * it's a special sign in several languages. Such characters
776  * aren't a word-breaker although they aren't an isalpha.
777  * In beginning of word they aren't a part of it.
778  */
779 static int
p_isspecial(TParser * prs)780 p_isspecial(TParser *prs)
781 {
782 	/*
783 	 * pg_dsplen could return -1 which means error or control character
784 	 */
785 	if (pg_dsplen(prs->str + prs->state->posbyte) == 0)
786 		return 1;
787 
788 #ifdef USE_WIDE_UPPER_LOWER
789 
790 	/*
791 	 * Unicode Characters in the 'Mark, Spacing Combining' Category That
792 	 * characters are not alpha although they are not breakers of word too.
793 	 * Check that only in utf encoding, because other encodings aren't
794 	 * supported by postgres or even exists.
795 	 */
796 	if (GetDatabaseEncoding() == PG_UTF8 && prs->usewide)
797 	{
798 		static const pg_wchar strange_letter[] = {
799 			/*
800 			 * use binary search, so elements should be ordered
801 			 */
802 			0x0903,				/* DEVANAGARI SIGN VISARGA */
803 			0x093E,				/* DEVANAGARI VOWEL SIGN AA */
804 			0x093F,				/* DEVANAGARI VOWEL SIGN I */
805 			0x0940,				/* DEVANAGARI VOWEL SIGN II */
806 			0x0949,				/* DEVANAGARI VOWEL SIGN CANDRA O */
807 			0x094A,				/* DEVANAGARI VOWEL SIGN SHORT O */
808 			0x094B,				/* DEVANAGARI VOWEL SIGN O */
809 			0x094C,				/* DEVANAGARI VOWEL SIGN AU */
810 			0x0982,				/* BENGALI SIGN ANUSVARA */
811 			0x0983,				/* BENGALI SIGN VISARGA */
812 			0x09BE,				/* BENGALI VOWEL SIGN AA */
813 			0x09BF,				/* BENGALI VOWEL SIGN I */
814 			0x09C0,				/* BENGALI VOWEL SIGN II */
815 			0x09C7,				/* BENGALI VOWEL SIGN E */
816 			0x09C8,				/* BENGALI VOWEL SIGN AI */
817 			0x09CB,				/* BENGALI VOWEL SIGN O */
818 			0x09CC,				/* BENGALI VOWEL SIGN AU */
819 			0x09D7,				/* BENGALI AU LENGTH MARK */
820 			0x0A03,				/* GURMUKHI SIGN VISARGA */
821 			0x0A3E,				/* GURMUKHI VOWEL SIGN AA */
822 			0x0A3F,				/* GURMUKHI VOWEL SIGN I */
823 			0x0A40,				/* GURMUKHI VOWEL SIGN II */
824 			0x0A83,				/* GUJARATI SIGN VISARGA */
825 			0x0ABE,				/* GUJARATI VOWEL SIGN AA */
826 			0x0ABF,				/* GUJARATI VOWEL SIGN I */
827 			0x0AC0,				/* GUJARATI VOWEL SIGN II */
828 			0x0AC9,				/* GUJARATI VOWEL SIGN CANDRA O */
829 			0x0ACB,				/* GUJARATI VOWEL SIGN O */
830 			0x0ACC,				/* GUJARATI VOWEL SIGN AU */
831 			0x0B02,				/* ORIYA SIGN ANUSVARA */
832 			0x0B03,				/* ORIYA SIGN VISARGA */
833 			0x0B3E,				/* ORIYA VOWEL SIGN AA */
834 			0x0B40,				/* ORIYA VOWEL SIGN II */
835 			0x0B47,				/* ORIYA VOWEL SIGN E */
836 			0x0B48,				/* ORIYA VOWEL SIGN AI */
837 			0x0B4B,				/* ORIYA VOWEL SIGN O */
838 			0x0B4C,				/* ORIYA VOWEL SIGN AU */
839 			0x0B57,				/* ORIYA AU LENGTH MARK */
840 			0x0BBE,				/* TAMIL VOWEL SIGN AA */
841 			0x0BBF,				/* TAMIL VOWEL SIGN I */
842 			0x0BC1,				/* TAMIL VOWEL SIGN U */
843 			0x0BC2,				/* TAMIL VOWEL SIGN UU */
844 			0x0BC6,				/* TAMIL VOWEL SIGN E */
845 			0x0BC7,				/* TAMIL VOWEL SIGN EE */
846 			0x0BC8,				/* TAMIL VOWEL SIGN AI */
847 			0x0BCA,				/* TAMIL VOWEL SIGN O */
848 			0x0BCB,				/* TAMIL VOWEL SIGN OO */
849 			0x0BCC,				/* TAMIL VOWEL SIGN AU */
850 			0x0BD7,				/* TAMIL AU LENGTH MARK */
851 			0x0C01,				/* TELUGU SIGN CANDRABINDU */
852 			0x0C02,				/* TELUGU SIGN ANUSVARA */
853 			0x0C03,				/* TELUGU SIGN VISARGA */
854 			0x0C41,				/* TELUGU VOWEL SIGN U */
855 			0x0C42,				/* TELUGU VOWEL SIGN UU */
856 			0x0C43,				/* TELUGU VOWEL SIGN VOCALIC R */
857 			0x0C44,				/* TELUGU VOWEL SIGN VOCALIC RR */
858 			0x0C82,				/* KANNADA SIGN ANUSVARA */
859 			0x0C83,				/* KANNADA SIGN VISARGA */
860 			0x0CBE,				/* KANNADA VOWEL SIGN AA */
861 			0x0CC0,				/* KANNADA VOWEL SIGN II */
862 			0x0CC1,				/* KANNADA VOWEL SIGN U */
863 			0x0CC2,				/* KANNADA VOWEL SIGN UU */
864 			0x0CC3,				/* KANNADA VOWEL SIGN VOCALIC R */
865 			0x0CC4,				/* KANNADA VOWEL SIGN VOCALIC RR */
866 			0x0CC7,				/* KANNADA VOWEL SIGN EE */
867 			0x0CC8,				/* KANNADA VOWEL SIGN AI */
868 			0x0CCA,				/* KANNADA VOWEL SIGN O */
869 			0x0CCB,				/* KANNADA VOWEL SIGN OO */
870 			0x0CD5,				/* KANNADA LENGTH MARK */
871 			0x0CD6,				/* KANNADA AI LENGTH MARK */
872 			0x0D02,				/* MALAYALAM SIGN ANUSVARA */
873 			0x0D03,				/* MALAYALAM SIGN VISARGA */
874 			0x0D3E,				/* MALAYALAM VOWEL SIGN AA */
875 			0x0D3F,				/* MALAYALAM VOWEL SIGN I */
876 			0x0D40,				/* MALAYALAM VOWEL SIGN II */
877 			0x0D46,				/* MALAYALAM VOWEL SIGN E */
878 			0x0D47,				/* MALAYALAM VOWEL SIGN EE */
879 			0x0D48,				/* MALAYALAM VOWEL SIGN AI */
880 			0x0D4A,				/* MALAYALAM VOWEL SIGN O */
881 			0x0D4B,				/* MALAYALAM VOWEL SIGN OO */
882 			0x0D4C,				/* MALAYALAM VOWEL SIGN AU */
883 			0x0D57,				/* MALAYALAM AU LENGTH MARK */
884 			0x0D82,				/* SINHALA SIGN ANUSVARAYA */
885 			0x0D83,				/* SINHALA SIGN VISARGAYA */
886 			0x0DCF,				/* SINHALA VOWEL SIGN AELA-PILLA */
887 			0x0DD0,				/* SINHALA VOWEL SIGN KETTI AEDA-PILLA */
888 			0x0DD1,				/* SINHALA VOWEL SIGN DIGA AEDA-PILLA */
889 			0x0DD8,				/* SINHALA VOWEL SIGN GAETTA-PILLA */
890 			0x0DD9,				/* SINHALA VOWEL SIGN KOMBUVA */
891 			0x0DDA,				/* SINHALA VOWEL SIGN DIGA KOMBUVA */
892 			0x0DDB,				/* SINHALA VOWEL SIGN KOMBU DEKA */
893 			0x0DDC,				/* SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA */
894 			0x0DDD,				/* SINHALA VOWEL SIGN KOMBUVA HAA DIGA
895 								 * AELA-PILLA */
896 			0x0DDE,				/* SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA */
897 			0x0DDF,				/* SINHALA VOWEL SIGN GAYANUKITTA */
898 			0x0DF2,				/* SINHALA VOWEL SIGN DIGA GAETTA-PILLA */
899 			0x0DF3,				/* SINHALA VOWEL SIGN DIGA GAYANUKITTA */
900 			0x0F3E,				/* TIBETAN SIGN YAR TSHES */
901 			0x0F3F,				/* TIBETAN SIGN MAR TSHES */
902 			0x0F7F,				/* TIBETAN SIGN RNAM BCAD */
903 			0x102B,				/* MYANMAR VOWEL SIGN TALL AA */
904 			0x102C,				/* MYANMAR VOWEL SIGN AA */
905 			0x1031,				/* MYANMAR VOWEL SIGN E */
906 			0x1038,				/* MYANMAR SIGN VISARGA */
907 			0x103B,				/* MYANMAR CONSONANT SIGN MEDIAL YA */
908 			0x103C,				/* MYANMAR CONSONANT SIGN MEDIAL RA */
909 			0x1056,				/* MYANMAR VOWEL SIGN VOCALIC R */
910 			0x1057,				/* MYANMAR VOWEL SIGN VOCALIC RR */
911 			0x1062,				/* MYANMAR VOWEL SIGN SGAW KAREN EU */
912 			0x1063,				/* MYANMAR TONE MARK SGAW KAREN HATHI */
913 			0x1064,				/* MYANMAR TONE MARK SGAW KAREN KE PHO */
914 			0x1067,				/* MYANMAR VOWEL SIGN WESTERN PWO KAREN EU */
915 			0x1068,				/* MYANMAR VOWEL SIGN WESTERN PWO KAREN UE */
916 			0x1069,				/* MYANMAR SIGN WESTERN PWO KAREN TONE-1 */
917 			0x106A,				/* MYANMAR SIGN WESTERN PWO KAREN TONE-2 */
918 			0x106B,				/* MYANMAR SIGN WESTERN PWO KAREN TONE-3 */
919 			0x106C,				/* MYANMAR SIGN WESTERN PWO KAREN TONE-4 */
920 			0x106D,				/* MYANMAR SIGN WESTERN PWO KAREN TONE-5 */
921 			0x1083,				/* MYANMAR VOWEL SIGN SHAN AA */
922 			0x1084,				/* MYANMAR VOWEL SIGN SHAN E */
923 			0x1087,				/* MYANMAR SIGN SHAN TONE-2 */
924 			0x1088,				/* MYANMAR SIGN SHAN TONE-3 */
925 			0x1089,				/* MYANMAR SIGN SHAN TONE-5 */
926 			0x108A,				/* MYANMAR SIGN SHAN TONE-6 */
927 			0x108B,				/* MYANMAR SIGN SHAN COUNCIL TONE-2 */
928 			0x108C,				/* MYANMAR SIGN SHAN COUNCIL TONE-3 */
929 			0x108F,				/* MYANMAR SIGN RUMAI PALAUNG TONE-5 */
930 			0x17B6,				/* KHMER VOWEL SIGN AA */
931 			0x17BE,				/* KHMER VOWEL SIGN OE */
932 			0x17BF,				/* KHMER VOWEL SIGN YA */
933 			0x17C0,				/* KHMER VOWEL SIGN IE */
934 			0x17C1,				/* KHMER VOWEL SIGN E */
935 			0x17C2,				/* KHMER VOWEL SIGN AE */
936 			0x17C3,				/* KHMER VOWEL SIGN AI */
937 			0x17C4,				/* KHMER VOWEL SIGN OO */
938 			0x17C5,				/* KHMER VOWEL SIGN AU */
939 			0x17C7,				/* KHMER SIGN REAHMUK */
940 			0x17C8,				/* KHMER SIGN YUUKALEAPINTU */
941 			0x1923,				/* LIMBU VOWEL SIGN EE */
942 			0x1924,				/* LIMBU VOWEL SIGN AI */
943 			0x1925,				/* LIMBU VOWEL SIGN OO */
944 			0x1926,				/* LIMBU VOWEL SIGN AU */
945 			0x1929,				/* LIMBU SUBJOINED LETTER YA */
946 			0x192A,				/* LIMBU SUBJOINED LETTER RA */
947 			0x192B,				/* LIMBU SUBJOINED LETTER WA */
948 			0x1930,				/* LIMBU SMALL LETTER KA */
949 			0x1931,				/* LIMBU SMALL LETTER NGA */
950 			0x1933,				/* LIMBU SMALL LETTER TA */
951 			0x1934,				/* LIMBU SMALL LETTER NA */
952 			0x1935,				/* LIMBU SMALL LETTER PA */
953 			0x1936,				/* LIMBU SMALL LETTER MA */
954 			0x1937,				/* LIMBU SMALL LETTER RA */
955 			0x1938,				/* LIMBU SMALL LETTER LA */
956 			0x19B0,				/* NEW TAI LUE VOWEL SIGN VOWEL SHORTENER */
957 			0x19B1,				/* NEW TAI LUE VOWEL SIGN AA */
958 			0x19B2,				/* NEW TAI LUE VOWEL SIGN II */
959 			0x19B3,				/* NEW TAI LUE VOWEL SIGN U */
960 			0x19B4,				/* NEW TAI LUE VOWEL SIGN UU */
961 			0x19B5,				/* NEW TAI LUE VOWEL SIGN E */
962 			0x19B6,				/* NEW TAI LUE VOWEL SIGN AE */
963 			0x19B7,				/* NEW TAI LUE VOWEL SIGN O */
964 			0x19B8,				/* NEW TAI LUE VOWEL SIGN OA */
965 			0x19B9,				/* NEW TAI LUE VOWEL SIGN UE */
966 			0x19BA,				/* NEW TAI LUE VOWEL SIGN AY */
967 			0x19BB,				/* NEW TAI LUE VOWEL SIGN AAY */
968 			0x19BC,				/* NEW TAI LUE VOWEL SIGN UY */
969 			0x19BD,				/* NEW TAI LUE VOWEL SIGN OY */
970 			0x19BE,				/* NEW TAI LUE VOWEL SIGN OAY */
971 			0x19BF,				/* NEW TAI LUE VOWEL SIGN UEY */
972 			0x19C0,				/* NEW TAI LUE VOWEL SIGN IY */
973 			0x19C8,				/* NEW TAI LUE TONE MARK-1 */
974 			0x19C9,				/* NEW TAI LUE TONE MARK-2 */
975 			0x1A19,				/* BUGINESE VOWEL SIGN E */
976 			0x1A1A,				/* BUGINESE VOWEL SIGN O */
977 			0x1A1B,				/* BUGINESE VOWEL SIGN AE */
978 			0x1B04,				/* BALINESE SIGN BISAH */
979 			0x1B35,				/* BALINESE VOWEL SIGN TEDUNG */
980 			0x1B3B,				/* BALINESE VOWEL SIGN RA REPA TEDUNG */
981 			0x1B3D,				/* BALINESE VOWEL SIGN LA LENGA TEDUNG */
982 			0x1B3E,				/* BALINESE VOWEL SIGN TALING */
983 			0x1B3F,				/* BALINESE VOWEL SIGN TALING REPA */
984 			0x1B40,				/* BALINESE VOWEL SIGN TALING TEDUNG */
985 			0x1B41,				/* BALINESE VOWEL SIGN TALING REPA TEDUNG */
986 			0x1B43,				/* BALINESE VOWEL SIGN PEPET TEDUNG */
987 			0x1B44,				/* BALINESE ADEG ADEG */
988 			0x1B82,				/* SUNDANESE SIGN PANGWISAD */
989 			0x1BA1,				/* SUNDANESE CONSONANT SIGN PAMINGKAL */
990 			0x1BA6,				/* SUNDANESE VOWEL SIGN PANAELAENG */
991 			0x1BA7,				/* SUNDANESE VOWEL SIGN PANOLONG */
992 			0x1BAA,				/* SUNDANESE SIGN PAMAAEH */
993 			0x1C24,				/* LEPCHA SUBJOINED LETTER YA */
994 			0x1C25,				/* LEPCHA SUBJOINED LETTER RA */
995 			0x1C26,				/* LEPCHA VOWEL SIGN AA */
996 			0x1C27,				/* LEPCHA VOWEL SIGN I */
997 			0x1C28,				/* LEPCHA VOWEL SIGN O */
998 			0x1C29,				/* LEPCHA VOWEL SIGN OO */
999 			0x1C2A,				/* LEPCHA VOWEL SIGN U */
1000 			0x1C2B,				/* LEPCHA VOWEL SIGN UU */
1001 			0x1C34,				/* LEPCHA CONSONANT SIGN NYIN-DO */
1002 			0x1C35,				/* LEPCHA CONSONANT SIGN KANG */
1003 			0xA823,				/* SYLOTI NAGRI VOWEL SIGN A */
1004 			0xA824,				/* SYLOTI NAGRI VOWEL SIGN I */
1005 			0xA827,				/* SYLOTI NAGRI VOWEL SIGN OO */
1006 			0xA880,				/* SAURASHTRA SIGN ANUSVARA */
1007 			0xA881,				/* SAURASHTRA SIGN VISARGA */
1008 			0xA8B4,				/* SAURASHTRA CONSONANT SIGN HAARU */
1009 			0xA8B5,				/* SAURASHTRA VOWEL SIGN AA */
1010 			0xA8B6,				/* SAURASHTRA VOWEL SIGN I */
1011 			0xA8B7,				/* SAURASHTRA VOWEL SIGN II */
1012 			0xA8B8,				/* SAURASHTRA VOWEL SIGN U */
1013 			0xA8B9,				/* SAURASHTRA VOWEL SIGN UU */
1014 			0xA8BA,				/* SAURASHTRA VOWEL SIGN VOCALIC R */
1015 			0xA8BB,				/* SAURASHTRA VOWEL SIGN VOCALIC RR */
1016 			0xA8BC,				/* SAURASHTRA VOWEL SIGN VOCALIC L */
1017 			0xA8BD,				/* SAURASHTRA VOWEL SIGN VOCALIC LL */
1018 			0xA8BE,				/* SAURASHTRA VOWEL SIGN E */
1019 			0xA8BF,				/* SAURASHTRA VOWEL SIGN EE */
1020 			0xA8C0,				/* SAURASHTRA VOWEL SIGN AI */
1021 			0xA8C1,				/* SAURASHTRA VOWEL SIGN O */
1022 			0xA8C2,				/* SAURASHTRA VOWEL SIGN OO */
1023 			0xA8C3,				/* SAURASHTRA VOWEL SIGN AU */
1024 			0xA952,				/* REJANG CONSONANT SIGN H */
1025 			0xA953,				/* REJANG VIRAMA */
1026 			0xAA2F,				/* CHAM VOWEL SIGN O */
1027 			0xAA30,				/* CHAM VOWEL SIGN AI */
1028 			0xAA33,				/* CHAM CONSONANT SIGN YA */
1029 			0xAA34,				/* CHAM CONSONANT SIGN RA */
1030 			0xAA4D				/* CHAM CONSONANT SIGN FINAL H */
1031 		};
1032 		const pg_wchar *StopLow = strange_letter,
1033 				   *StopHigh = strange_letter + lengthof(strange_letter),
1034 				   *StopMiddle;
1035 		pg_wchar	c;
1036 
1037 		if (prs->pgwstr)
1038 			c = *(prs->pgwstr + prs->state->poschar);
1039 		else
1040 			c = (pg_wchar) *(prs->wstr + prs->state->poschar);
1041 
1042 		while (StopLow < StopHigh)
1043 		{
1044 			StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
1045 			if (*StopMiddle == c)
1046 				return 1;
1047 			else if (*StopMiddle < c)
1048 				StopLow = StopMiddle + 1;
1049 			else
1050 				StopHigh = StopMiddle;
1051 		}
1052 	}
1053 #endif
1054 
1055 	return 0;
1056 }
1057 
1058 /*
1059  * State/action tables of the parser: one actionTPS_<state>[] array per TParserState value
1060  */
1061 
/*
 * TPS_Base: dispatches on the current character.  Letters and digits are
 * A_NEXT into the word / unsigned-int states; '<', '-', '+', '&', '~', '/'
 * and '.' are A_PUSH into the first state of the tag, signed-int, XML
 * entity and file/path recognizers.  The EOF row is A_NEXT; the final row
 * (NULL test, presumably the catch-all) names TPS_InSpace.
 */
1062 static const TParserStateActionItem actionTPS_Base[] = {
1063 	{p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
1064 	{p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
1065 	{p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
1066 	{p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
1067 	{p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
1068 	{p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
1069 	{p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
1070 	{p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
1071 	{p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
1072 	{p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1073 	{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1074 	{p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
1075 	{NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
1076 };
1077 
1078 
/*
 * TPS_InNumWord: p_isalnum and p_isspecial rows stay in this state; '@',
 * '/', '.' and '-' are A_PUSH into the email, file and hyphenated-numword
 * recognizers.  The EOF row and the final row are A_BINGO typed NUMWORD
 * with TPS_Base as the following state.
 */
1079 static const TParserStateActionItem actionTPS_InNumWord[] = {
1080 	{p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
1081 	{p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1082 	{p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1083 	{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1084 	{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1085 	{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1086 	{p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1087 	{NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
1088 };
1089 
/*
 * TPS_InAsciiWord: '.', '-' and digits each appear in two consecutive
 * rows.  TParserGet resumes at the row following the one that pushed once
 * the pushed state POPs, so the second row of each pair is the fallback
 * (host first, then file name / hyphenated word / numword).  The EOF row
 * and the final row are A_BINGO typed ASCIIWORD.
 */
1090 static const TParserStateActionItem actionTPS_InAsciiWord[] = {
1091 	{p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
1092 	{p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
1093 	{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1094 	{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1095 	{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1096 	{p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1097 	{p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1098 	{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1099 	{p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
1100 	{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1101 	{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1102 	{p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1103 	{p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
1104 	{p_isspecial, 0, A_NEXT, TPS_InWord, 0, NULL},
1105 	{NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
1106 };
1107 
/*
 * TPS_InWord: p_isalpha and p_isspecial rows are A_NEXT, a digit switches
 * to TPS_InNumWord, and '-' is A_PUSH into TPS_InHyphenWordFirst.  The EOF
 * row and the final row are A_BINGO typed WORD_T.
 */
1108 static const TParserStateActionItem actionTPS_InWord[] = {
1109 	{p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
1110 	{p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
1111 	{p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL},
1112 	{p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1113 	{p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1114 	{NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
1115 };
1116 
/*
 * TPS_InUnsignedInt: digits are A_NEXT.  '.' has two A_PUSH rows (host
 * domain first, then decimal fraction); the second is reached when the
 * first pushed state POPs, since TParserGet resumes at the row after the
 * one that pushed.  'e'/'E' push TPS_InMantissaFirst; '-', '_', '@' and
 * ASCII letters push host/email states.  The EOF row and the final row are
 * A_BINGO typed UNSIGNEDINT.
 */
1117 static const TParserStateActionItem actionTPS_InUnsignedInt[] = {
1118 	{p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
1119 	{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1120 	{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1121 	{p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
1122 	{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1123 	{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1124 	{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1125 	{p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1126 	{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1127 	{p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
1128 	{p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1129 	{p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
1130 	{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1131 	{NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
1132 };
1133 
/*
 * TPS_InSignedIntFirst: pushed from TPS_Base on '-' or '+'.  A digit is
 * A_NEXT | A_CLEAR into TPS_InSignedInt; the EOF row and the final row are
 * A_POP.
 */
1134 static const TParserStateActionItem actionTPS_InSignedIntFirst[] = {
1135 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1136 	{p_isdigit, 0, A_NEXT | A_CLEAR, TPS_InSignedInt, 0, NULL},
1137 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1138 };
1139 
/*
 * TPS_InSignedInt: digits are A_NEXT; '.' pushes TPS_InDecimalFirst and
 * 'e'/'E' push TPS_InMantissaFirst.  The EOF row and the final row are
 * A_BINGO typed SIGNEDINT.
 */
1140 static const TParserStateActionItem actionTPS_InSignedInt[] = {
1141 	{p_isEOF, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL},
1142 	{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1143 	{p_iseqC, '.', A_PUSH, TPS_InDecimalFirst, 0, NULL},
1144 	{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1145 	{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1146 	{NULL, 0, A_BINGO, TPS_Base, SIGNEDINT, NULL}
1147 };
1148 
/*
 * TPS_InSpace: p_isignore and p_isnotalnum rows are A_NEXT.  '<', '-',
 * '+', '&' and '/' are A_BINGO typed SPACE with TPS_Base next; these are
 * characters on which TPS_Base has A_PUSH rows ('~' and '.' are not listed
 * here).  The EOF row and the final row are likewise A_BINGO typed SPACE.
 */
1149 static const TParserStateActionItem actionTPS_InSpace[] = {
1150 	{p_isEOF, 0, A_BINGO, TPS_Base, SPACE, NULL},
1151 	{p_iseqC, '<', A_BINGO, TPS_Base, SPACE, NULL},
1152 	{p_isignore, 0, A_NEXT, TPS_Null, 0, NULL},
1153 	{p_iseqC, '-', A_BINGO, TPS_Base, SPACE, NULL},
1154 	{p_iseqC, '+', A_BINGO, TPS_Base, SPACE, NULL},
1155 	{p_iseqC, '&', A_BINGO, TPS_Base, SPACE, NULL},
1156 	{p_iseqC, '/', A_BINGO, TPS_Base, SPACE, NULL},
1157 	{p_isnotalnum, 0, A_NEXT, TPS_InSpace, 0, NULL},
1158 	{NULL, 0, A_BINGO, TPS_Base, SPACE, NULL}
1159 };
1160 
/*
 * TPS_InUDecimalFirst: pushed from TPS_InUnsignedInt on '.'.  A digit is
 * A_CLEAR into TPS_InUDecimal; the EOF row and the final row are A_POP.
 */
1161 static const TParserStateActionItem actionTPS_InUDecimalFirst[] = {
1162 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1163 	{p_isdigit, 0, A_CLEAR, TPS_InUDecimal, 0, NULL},
1164 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1165 };
1166 
/*
 * TPS_InUDecimal: digits stay here; a further '.' pushes
 * TPS_InVersionFirst and 'e'/'E' push TPS_InMantissaFirst.  The EOF row
 * and the final row are A_BINGO typed DECIMAL_T.
 */
1167 static const TParserStateActionItem actionTPS_InUDecimal[] = {
1168 	{p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1169 	{p_isdigit, 0, A_NEXT, TPS_InUDecimal, 0, NULL},
1170 	{p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1171 	{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1172 	{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1173 	{NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1174 };
1175 
/*
 * TPS_InDecimalFirst: pushed from TPS_InSignedInt on '.'.  A digit is
 * A_CLEAR into TPS_InDecimal; the EOF row and the final row are A_POP.
 */
1176 static const TParserStateActionItem actionTPS_InDecimalFirst[] = {
1177 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1178 	{p_isdigit, 0, A_CLEAR, TPS_InDecimal, 0, NULL},
1179 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1180 };
1181 
/*
 * TPS_InDecimal: signed counterpart of TPS_InUDecimal.  Digits stay here;
 * a further '.' pushes TPS_InVerVersion (not TPS_InVersionFirst) and
 * 'e'/'E' push TPS_InMantissaFirst.  The EOF row and the final row are
 * A_BINGO typed DECIMAL_T.
 */
1182 static const TParserStateActionItem actionTPS_InDecimal[] = {
1183 	{p_isEOF, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL},
1184 	{p_isdigit, 0, A_NEXT, TPS_InDecimal, 0, NULL},
1185 	{p_iseqC, '.', A_PUSH, TPS_InVerVersion, 0, NULL},
1186 	{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1187 	{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
1188 	{NULL, 0, A_BINGO, TPS_Base, DECIMAL_T, NULL}
1189 };
1190 
/*
 * TPS_InVerVersion: pushed from TPS_InDecimal on a second '.'.  A digit is
 * A_RERUN into TPS_InSVerVersion with SpecialVerVersion in the last
 * column; the EOF row and the final row are A_POP.
 */
1191 static const TParserStateActionItem actionTPS_InVerVersion[] = {
1192 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1193 	{p_isdigit, 0, A_RERUN, TPS_InSVerVersion, 0, SpecialVerVersion},
1194 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1195 };
1196 
/*
 * TPS_InSVerVersion: entered from TPS_InVerVersion.  A digit is
 * A_BINGO | A_CLRALL typed SPACE with TPS_InUnsignedInt as the following
 * state; the final row is A_NEXT and the EOF row is A_POP.
 */
1197 static const TParserStateActionItem actionTPS_InSVerVersion[] = {
1198 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1199 	{p_isdigit, 0, A_BINGO | A_CLRALL, TPS_InUnsignedInt, SPACE, NULL},
1200 	{NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1201 };
1202 
1203 
/*
 * TPS_InVersionFirst: pushed on '.' from TPS_InUDecimal and
 * TPS_InVersion.  A digit is A_CLEAR into TPS_InVersion; the EOF row and
 * the final row are A_POP.
 */
1204 static const TParserStateActionItem actionTPS_InVersionFirst[] = {
1205 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1206 	{p_isdigit, 0, A_CLEAR, TPS_InVersion, 0, NULL},
1207 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1208 };
1209 
/*
 * TPS_InVersion: digits stay here and each further '.' pushes
 * TPS_InVersionFirst again.  The EOF row and the final row are A_BINGO
 * typed VERSIONNUMBER.
 */
1210 static const TParserStateActionItem actionTPS_InVersion[] = {
1211 	{p_isEOF, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL},
1212 	{p_isdigit, 0, A_NEXT, TPS_InVersion, 0, NULL},
1213 	{p_iseqC, '.', A_PUSH, TPS_InVersionFirst, 0, NULL},
1214 	{NULL, 0, A_BINGO, TPS_Base, VERSIONNUMBER, NULL}
1215 };
1216 
/*
 * TPS_InMantissaFirst: pushed on 'e'/'E' from the integer and decimal
 * states, so despite the name it handles what follows the exponent
 * marker.  A digit is A_CLEAR into TPS_InMantissa, '+'/'-' go to
 * TPS_InMantissaSign; the EOF row and the final row are A_POP.
 */
1217 static const TParserStateActionItem actionTPS_InMantissaFirst[] = {
1218 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1219 	{p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1220 	{p_iseqC, '+', A_NEXT, TPS_InMantissaSign, 0, NULL},
1221 	{p_iseqC, '-', A_NEXT, TPS_InMantissaSign, 0, NULL},
1222 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1223 };
1224 
/*
 * TPS_InMantissaSign: entered from TPS_InMantissaFirst after '+' or '-'.
 * A digit is A_CLEAR into TPS_InMantissa; the EOF row and the final row
 * are A_POP.
 */
1225 static const TParserStateActionItem actionTPS_InMantissaSign[] = {
1226 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1227 	{p_isdigit, 0, A_CLEAR, TPS_InMantissa, 0, NULL},
1228 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1229 };
1230 
/*
 * TPS_InMantissa: digits stay here; the EOF row and the final row are
 * A_BINGO typed SCIENTIFIC with TPS_Base as the following state.
 */
1231 static const TParserStateActionItem actionTPS_InMantissa[] = {
1232 	{p_isEOF, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL},
1233 	{p_isdigit, 0, A_NEXT, TPS_InMantissa, 0, NULL},
1234 	{NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL}
1235 };
1236 
/*
 * TPS_InXMLEntityFirst: pushed from TPS_Base on '&'.  '#' goes to
 * TPS_InXMLEntityNumFirst; an ASCII letter, ':' or '_' goes to
 * TPS_InXMLEntity; the EOF row and the final row are A_POP.
 */
1237 static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = {
1238 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1239 	{p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL},
1240 	{p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1241 	{p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1242 	{p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1243 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1244 };
1245 
/*
 * TPS_InXMLEntity: p_isalnum, ':', '_', '.' and '-' stay here; ';' goes
 * to TPS_InXMLEntityEnd; the EOF row and the final row are A_POP.
 */
1246 static const TParserStateActionItem actionTPS_InXMLEntity[] = {
1247 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1248 	{p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL},
1249 	{p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL},
1250 	{p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL},
1251 	{p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL},
1252 	{p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL},
1253 	{p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1254 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1255 };
1256 
/*
 * TPS_InXMLEntityNumFirst: entered after "&#".  'x'/'X' go to
 * TPS_InXMLEntityHexNumFirst, a digit goes to TPS_InXMLEntityNum; the EOF
 * row and the final row are A_POP.
 */
1257 static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = {
1258 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1259 	{p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1260 	{p_iseqC, 'X', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL},
1261 	{p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1262 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1263 };
1264 
/*
 * TPS_InXMLEntityHexNumFirst: entered after "&#x" / "&#X".  A p_isxdigit
 * character goes to TPS_InXMLEntityHexNum; the EOF row and the final row
 * are A_POP.
 */
1265 static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = {
1266 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1267 	{p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1268 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1269 };
1270 
/*
 * TPS_InXMLEntityNum: digits stay here, ';' goes to TPS_InXMLEntityEnd;
 * the EOF row and the final row are A_POP.
 */
1271 static const TParserStateActionItem actionTPS_InXMLEntityNum[] = {
1272 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1273 	{p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL},
1274 	{p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1275 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1276 };
1277 
/*
 * TPS_InXMLEntityHexNum: p_isxdigit characters stay here, ';' goes to
 * TPS_InXMLEntityEnd; the EOF row and the final row are A_POP.
 */
1278 static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = {
1279 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1280 	{p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL},
1281 	{p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL},
1282 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1283 };
1284 
/*
 * TPS_InXMLEntityEnd: entered after the closing ';'.  Its single row has
 * no test and is A_BINGO | A_CLEAR typed XMLENTITY with TPS_Base next.
 */
1285 static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = {
1286 	{NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL}
1287 };
1288 
/*
 * TPS_InTagFirst: pushed from TPS_Base on '<'.  '/', '!' and '?' push the
 * close-tag, comment/DOCTYPE and "<?x" states; an ASCII letter, ':' or
 * '_' pushes TPS_InTagName; the EOF row and the final row are A_POP.
 */
1289 static const TParserStateActionItem actionTPS_InTagFirst[] = {
1290 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1291 	{p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
1292 	{p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
1293 	{p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
1294 	{p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
1295 	{p_iseqC, ':', A_PUSH, TPS_InTagName, 0, NULL},
1296 	{p_iseqC, '_', A_PUSH, TPS_InTagName, 0, NULL},
1297 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1298 };
1299 
/*
 * TPS_InXMLBegin: pushed from TPS_InTagFirst on '?'.  Only the 'x' is
 * checked before handing over to TPS_InTag; the EOF row and the final row
 * are A_POP.
 */
1300 static const TParserStateActionItem actionTPS_InXMLBegin[] = {
1301 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1302 	/* <?xml ... */
1303 	/* XXX do we want states for the m and l?  Right now this accepts <?xZ */
1304 	{p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
1305 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1306 };
1307 
/*
 * TPS_InTagCloseFirst: pushed from TPS_InTagFirst on '/'.  An ASCII
 * letter goes to TPS_InTagName; the EOF row and the final row are A_POP.
 */
1308 static const TParserStateActionItem actionTPS_InTagCloseFirst[] = {
1309 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1310 	{p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
1311 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1312 };
1313 
/*
 * TPS_InTagName: p_isalnum, ':', '_', '.' and '-' are A_NEXT.  '/' goes to
 * TPS_InTagBeginEnd; '>' goes to TPS_InTagEnd and whitespace to TPS_InTag,
 * both with SpecialTags in the last column.  The EOF row and the final
 * row are A_POP.
 */
1314 static const TParserStateActionItem actionTPS_InTagName[] = {
1315 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1316 	/* <br/> case */
1317 	{p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
1318 	{p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1319 	{p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
1320 	{p_isalnum, 0, A_NEXT, TPS_Null, 0, NULL},
1321 	{p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1322 	{p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1323 	{p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1324 	{p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1325 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1326 };
1327 
/*
 * TPS_InTagBeginEnd: entered from TPS_InTagName on '/'.  Only '>' is
 * accepted (to TPS_InTagEnd); the EOF row and the final row are A_POP.
 */
1328 static const TParserStateActionItem actionTPS_InTagBeginEnd[] = {
1329 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1330 	{p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, NULL},
1331 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1332 };
1333 
/*
 * TPS_InTag: '>' goes to TPS_InTagEnd (with SpecialTags); a single or
 * double quote goes to TPS_InTagEscapeK / TPS_InTagEscapeKK.  ASCII
 * letters, digits, whitespace and the punctuation listed below
 * ('=', '-', '_', '#', '/', ':', '.', '&', '?', '%', '~') are A_NEXT.  The
 * EOF row and the final row are A_POP.
 */
1334 static const TParserStateActionItem actionTPS_InTag[] = {
1335 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1336 	{p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
1337 	{p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
1338 	{p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
1339 	{p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
1340 	{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1341 	{p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
1342 	{p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1343 	{p_iseqC, '_', A_NEXT, TPS_Null, 0, NULL},
1344 	{p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
1345 	{p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
1346 	{p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
1347 	{p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
1348 	{p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
1349 	{p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
1350 	{p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
1351 	{p_iseqC, '~', A_NEXT, TPS_Null, 0, NULL},
1352 	{p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
1353 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1354 };
1355 
/*
 * TPS_InTagEscapeK: entered from TPS_InTag on a single quote.  A
 * backslash pushes TPS_InTagBackSleshed, the closing single quote returns
 * to TPS_InTag, and the final row stays here.  The EOF row is A_POP.
 */
1356 static const TParserStateActionItem actionTPS_InTagEscapeK[] = {
1357 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1358 	{p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1359 	{p_iseqC, '\'', A_NEXT, TPS_InTag, 0, NULL},
1360 	{NULL, 0, A_NEXT, TPS_InTagEscapeK, 0, NULL}
1361 };
1362 
/*
 * TPS_InTagEscapeKK: entered from TPS_InTag on a double quote.  A
 * backslash pushes TPS_InTagBackSleshed, the closing double quote returns
 * to TPS_InTag, and the final row stays here.  The EOF row is A_POP.
 */
1363 static const TParserStateActionItem actionTPS_InTagEscapeKK[] = {
1364 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1365 	{p_iseqC, '\\', A_PUSH, TPS_InTagBackSleshed, 0, NULL},
1366 	{p_iseqC, '"', A_NEXT, TPS_InTag, 0, NULL},
1367 	{NULL, 0, A_NEXT, TPS_InTagEscapeKK, 0, NULL}
1368 };
1369 
/*
 * TPS_InTagBackSleshed: pushed on a backslash from the two quoted-value
 * states.  The only non-EOF row has no test and is A_MERGE; the EOF row is
 * A_POP.
 */
1370 static const TParserStateActionItem actionTPS_InTagBackSleshed[] = {
1371 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1372 	{NULL, 0, A_MERGE, TPS_Null, 0, NULL}
1373 };
1374 
/*
 * TPS_InTagEnd: entered after the closing '>'.  Its single row has no
 * test and is A_BINGO | A_CLRALL typed TAG_T with TPS_Base next.
 */
1375 static const TParserStateActionItem actionTPS_InTagEnd[] = {
1376 	{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1377 };
1378 
/*
 * TPS_InCommentFirst: pushed from TPS_InTagFirst on '!'.  '-' goes to
 * TPS_InCommentLast; 'D' or 'd' goes to TPS_InTag, and only that one
 * letter is checked.  The EOF row and the final row are A_POP.
 */
1379 static const TParserStateActionItem actionTPS_InCommentFirst[] = {
1380 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1381 	{p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
1382 	/* <!DOCTYPE ...> */
1383 	{p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
1384 	{p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
1385 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1386 };
1387 
/*
 * TPS_InCommentLast: entered after "<!-".  A second '-' goes to
 * TPS_InComment; the EOF row and the final row are A_POP.
 */
1388 static const TParserStateActionItem actionTPS_InCommentLast[] = {
1389 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1390 	{p_iseqC, '-', A_NEXT, TPS_InComment, 0, NULL},
1391 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1392 };
1393 
/*
 * TPS_InComment: '-' goes to TPS_InCloseCommentFirst; the final row is
 * A_NEXT; the EOF row is A_POP.
 */
1394 static const TParserStateActionItem actionTPS_InComment[] = {
1395 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1396 	{p_iseqC, '-', A_NEXT, TPS_InCloseCommentFirst, 0, NULL},
1397 	{NULL, 0, A_NEXT, TPS_Null, 0, NULL}
1398 };
1399 
/*
 * TPS_InCloseCommentFirst: entered from TPS_InComment on '-'.  A second
 * '-' goes to TPS_InCloseCommentLast; the final row returns to
 * TPS_InComment; the EOF row is A_POP.
 */
1400 static const TParserStateActionItem actionTPS_InCloseCommentFirst[] = {
1401 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1402 	{p_iseqC, '-', A_NEXT, TPS_InCloseCommentLast, 0, NULL},
1403 	{NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1404 };
1405 
/*
 * TPS_InCloseCommentLast: entered after "--" inside a comment.  Further
 * '-' characters are A_NEXT, '>' goes to TPS_InCommentEnd, and the final
 * row returns to TPS_InComment.  The EOF row is A_POP.
 */
1406 static const TParserStateActionItem actionTPS_InCloseCommentLast[] = {
1407 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1408 	{p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
1409 	{p_iseqC, '>', A_NEXT, TPS_InCommentEnd, 0, NULL},
1410 	{NULL, 0, A_NEXT, TPS_InComment, 0, NULL}
1411 };
1412 
/*
 * TPS_InCommentEnd: entered after the "-->" that closes a comment.  Its
 * single row has no test and is A_BINGO | A_CLRALL typed TAG_T, the same
 * token type TPS_InTagEnd uses.
 */
1413 static const TParserStateActionItem actionTPS_InCommentEnd[] = {
1414 	{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
1415 };
1416 
/*
 * TPS_InHostFirstDomain: pushed on '.' from the word, integer and host
 * states.  An ASCII letter goes to TPS_InHostDomainSecond, a digit to
 * TPS_InHost; the EOF row and the final row are A_POP.
 */
1417 static const TParserStateActionItem actionTPS_InHostFirstDomain[] = {
1418 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1419 	{p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
1420 	{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1421 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1422 };
1423 
/*
 * TPS_InHostDomainSecond: entered after the first ASCII letter of a
 * domain label.  A second ASCII letter goes to TPS_InHostDomain; a digit,
 * '-', '_', '.' and '@' push host/email states.  The EOF row and the
 * final row are A_POP; this table has no A_BINGO row.
 */
1424 static const TParserStateActionItem actionTPS_InHostDomainSecond[] = {
1425 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1426 	{p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1427 	{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1428 	{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1429 	{p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1430 	{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1431 	{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1432 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1433 };
1434 
/*
 * TPS_InHostDomain: ASCII letters stay here.  p_isdigit appears twice:
 * first as A_PUSH into TPS_InHost, then as A_POP; TParserGet resumes at
 * the row after the one that pushed, so rows below the push are only
 * reached once the pushed state has POPped.  p_isstophost is A_BINGO
 * typed HOST with TPS_InURLPathStart next; '/' pushes TPS_InFURL.  The
 * EOF row and the final row are A_BINGO | A_CLRALL typed HOST.
 */
1435 static const TParserStateActionItem actionTPS_InHostDomain[] = {
1436 	{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1437 	{p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
1438 	{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
1439 	{p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
1440 	{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1441 	{p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1442 	{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1443 	{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1444 	{p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
1445 	{p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1446 	{p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1447 	{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1448 };
1449 
/*
 * TPS_InPortFirst: pushed from TPS_InHostDomain on ':'.  A digit goes to
 * TPS_InPort; the EOF row and the final row are A_POP.
 */
1450 static const TParserStateActionItem actionTPS_InPortFirst[] = {
1451 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1452 	{p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1453 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1454 };
1455 
/*
 * TPS_InPort: digits stay here.  p_isstophost is A_BINGO typed HOST with
 * TPS_InURLPathStart next; '/' pushes TPS_InFURL.  The EOF row and the
 * final row are A_BINGO | A_CLRALL typed HOST, so the port is part of the
 * HOST-typed rows.
 */
1456 static const TParserStateActionItem actionTPS_InPort[] = {
1457 	{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
1458 	{p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
1459 	{p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
1460 	{p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
1461 	{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
1462 };
1463 
/*
 * TPS_InHostFirstAN: pushed on '-' or '_' from the word, integer and host
 * states.  A digit or ASCII letter goes to TPS_InHost; the EOF row and
 * the final row are A_POP.
 */
1464 static const TParserStateActionItem actionTPS_InHostFirstAN[] = {
1465 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1466 	{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1467 	{p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1468 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1469 };
1470 
/*
 * TPS_InHost: digits and ASCII letters stay here; '@', '.', '-' and '_'
 * push the email / domain / alphanumeric-follow-up states.  The EOF row
 * and the final row are A_POP; this table has no A_BINGO row.
 */
1471 static const TParserStateActionItem actionTPS_InHost[] = {
1472 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1473 	{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
1474 	{p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
1475 	{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
1476 	{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
1477 	{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1478 	{p_iseqC, '_', A_PUSH, TPS_InHostFirstAN, 0, NULL},
1479 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1480 };
1481 
/*
 * TPS_InEmail: pushed on '@'.  Unlike most tables it has no p_isEOF row:
 * p_isstophost is A_POP, p_ishost is A_BINGO | A_CLRALL typed EMAIL, and
 * the final row is A_POP.
 */
1482 static const TParserStateActionItem actionTPS_InEmail[] = {
1483 	{p_isstophost, 0, A_POP, TPS_Null, 0, NULL},
1484 	{p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
1485 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1486 };
1487 
/*
 * TPS_InFileFirst: entered after '/'.  An ASCII letter, digit or '_' goes
 * to TPS_InFile, '.' to TPS_InPathFirst, and '~' pushes
 * TPS_InFileTwiddle.  The EOF row and the final row are A_POP.
 */
1488 static const TParserStateActionItem actionTPS_InFileFirst[] = {
1489 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1490 	{p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1491 	{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1492 	{p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
1493 	{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1494 	{p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
1495 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1496 };
1497 
/*
 * TPS_InFileTwiddle: pushed on '~'.  An ASCII letter, digit or '_' goes
 * to TPS_InFile and '/' to TPS_InFileFirst; the EOF row and the final row
 * are A_POP.
 */
1498 static const TParserStateActionItem actionTPS_InFileTwiddle[] = {
1499 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1500 	{p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1501 	{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1502 	{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1503 	{p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1504 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1505 };
1506 
/*
 * TPS_InPathFirst: entered from TPS_InFileFirst on '.' (i.e. after
 * "/.").  An ASCII letter, digit or '_' goes to TPS_InFile, a second '.'
 * to TPS_InPathSecond, '/' back to TPS_InFileFirst.  The EOF row and the
 * final row are A_POP.
 */
1507 static const TParserStateActionItem actionTPS_InPathFirst[] = {
1508 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1509 	{p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1510 	{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1511 	{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1512 	{p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1513 	{p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1514 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1515 };
1516 
/*
 * TPS_InPathFirstFirst: pushed from TPS_Base on a leading '.'.  A second
 * '.' goes to TPS_InPathSecond and '/' to TPS_InFileFirst; unlike
 * TPS_InPathFirst there are no letter/digit rows.  The EOF row and the
 * final row are A_POP.
 */
1517 static const TParserStateActionItem actionTPS_InPathFirstFirst[] = {
1518 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1519 	{p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
1520 	{p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
1521 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1522 };
1523 
/*
 * TPS_InPathSecond: entered after "..".  '/' has two rows: first
 * A_NEXT | A_PUSH into TPS_InFileFirst, then A_BINGO | A_CLEAR typed
 * FILEPATH; TParserGet resumes at the row after the one that pushed, so
 * the second is reached when the pushed state POPs.  EOF and whitespace
 * are also A_BINGO | A_CLEAR typed FILEPATH; the final row is A_POP.
 */
1524 static const TParserStateActionItem actionTPS_InPathSecond[] = {
1525 	{p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1526 	{p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
1527 	{p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1528 	{p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
1529 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1530 };
1531 
/*
 * TPS_InFile: ASCII letters, digits, '_' and '-' stay here; '.' pushes
 * TPS_InFileNext and '/' pushes TPS_InFileFirst.  The EOF row and the
 * final row are A_BINGO typed FILEPATH.
 */
1532 static const TParserStateActionItem actionTPS_InFile[] = {
1533 	{p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
1534 	{p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
1535 	{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
1536 	{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
1537 	{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
1538 	{p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
1539 	{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
1540 	{NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
1541 };
1542 
/*
 * TPS_InFileNext: pushed on '.' from the word and file states.  An ASCII
 * letter, digit or '_' is A_CLEAR into TPS_InFile; the EOF row and the
 * final row are A_POP.
 */
1543 static const TParserStateActionItem actionTPS_InFileNext[] = {
1544 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1545 	{p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
1546 	{p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
1547 	{p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
1548 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1549 };
1550 
/*
 * TPS_InURLPathFirst: a p_isurlchar character goes to TPS_InURLPath; the
 * EOF row and the final row are A_POP.  No other table in this part of
 * the file names this state as a target.
 */
1551 static const TParserStateActionItem actionTPS_InURLPathFirst[] = {
1552 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1553 	{p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1554 	{NULL, 0, A_POP, TPS_Null, 0, NULL},
1555 };
1556 
/*
 * TPS_InURLPathStart: follow-up state of the p_isstophost HOST rows in
 * TPS_InHostDomain and TPS_InPort.  Its single row has no test and is
 * A_NEXT into TPS_InURLPath.
 */
1557 static const TParserStateActionItem actionTPS_InURLPathStart[] = {
1558 	{NULL, 0, A_NEXT, TPS_InURLPath, 0, NULL}
1559 };
1560 
/*
 * TPS_InURLPath: p_isurlchar characters stay here; the EOF row and the
 * final row are A_BINGO typed URLPATH with TPS_Base next.
 */
1561 static const TParserStateActionItem actionTPS_InURLPath[] = {
1562 	{p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
1563 	{p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
1564 	{NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
1565 };
1566 
/*
 * TPS_InFURL: pushed on '/' from TPS_InHostDomain and TPS_InPort.
 * p_isURLPath is A_BINGO | A_CLRALL typed URL_T with SpecialFURL in the
 * last column; the EOF row and the final row are A_POP.
 */
1567 static const TParserStateActionItem actionTPS_InFURL[] = {
1568 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1569 	{p_isURLPath, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL},
1570 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1571 };
1572 
/*
 * TPS_InProtocolFirst: pushed from TPS_InAsciiWord on ':'.  '/' goes to
 * TPS_InProtocolSecond; the EOF row and the final row are A_POP.
 */
1573 static const TParserStateActionItem actionTPS_InProtocolFirst[] = {
1574 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1575 	{p_iseqC, '/', A_NEXT, TPS_InProtocolSecond, 0, NULL},
1576 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1577 };
1578 
/*
 * TPS_InProtocolSecond: entered after ":/".  A second '/' goes to
 * TPS_InProtocolEnd; the EOF row and the final row are A_POP.
 */
1579 static const TParserStateActionItem actionTPS_InProtocolSecond[] = {
1580 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1581 	{p_iseqC, '/', A_NEXT, TPS_InProtocolEnd, 0, NULL},
1582 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1583 };
1584 
/*
 * TPS_InProtocolEnd: entered after "://".  Its single row has no test and
 * is A_BINGO | A_CLRALL typed PROTOCOL with TPS_Base next.
 */
1585 static const TParserStateActionItem actionTPS_InProtocolEnd[] = {
1586 	{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
1587 };
1588 
/*
 * TPS_InHyphenAsciiWordFirst: pushed on '-' from TPS_InAsciiWord and
 * TPS_InHyphenAsciiWord.  An ASCII letter goes to TPS_InHyphenAsciiWord,
 * another letter to TPS_InHyphenWord, a digit to
 * TPS_InHyphenDigitLookahead; the EOF row and the final row are A_POP.
 */
1589 static const TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
1590 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1591 	{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1592 	{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1593 	{p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1594 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1595 };
1596 
/*
 * TPS_InHyphenAsciiWord: ASCII letters stay here; p_isalpha / p_isspecial
 * switch to TPS_InHyphenWord and a digit to TPS_InHyphenNumWord; '-'
 * pushes TPS_InHyphenAsciiWordFirst.  The EOF row and the final row are
 * A_BINGO | A_CLRALL typed ASCIIHWORD with SpecialHyphen in the last
 * column and TPS_InParseHyphen as the following state.
 */
1597 static const TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
1598 	{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
1599 	{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
1600 	{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1601 	{p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1602 	{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1603 	{p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
1604 	{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
1605 };
1606 
/*
 * TPS_InHyphenWordFirst: pushed on '-' from TPS_InWord and
 * TPS_InHyphenWord.  A letter goes to TPS_InHyphenWord, a digit to
 * TPS_InHyphenDigitLookahead; the EOF row and the final row are A_POP.
 */
1607 static const TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
1608 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1609 	{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1610 	{p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1611 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1612 };
1613 
/*
 * TPS_InHyphenWord: p_isalpha / p_isspecial stay here, a digit switches
 * to TPS_InHyphenNumWord, '-' pushes TPS_InHyphenWordFirst.  The EOF row
 * and the final row are A_BINGO | A_CLRALL typed HWORD with SpecialHyphen
 * in the last column and TPS_InParseHyphen as the following state.
 */
1614 static const TParserStateActionItem actionTPS_InHyphenWord[] = {
1615 	{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
1616 	{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1617 	{p_isspecial, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
1618 	{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1619 	{p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
1620 	{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
1621 };
1622 
/*
 * TPS_InHyphenNumWordFirst: pushed on '-' from TPS_InNumWord and
 * TPS_InHyphenNumWord.  A letter goes to TPS_InHyphenNumWord, a digit to
 * TPS_InHyphenDigitLookahead; the EOF row and the final row are A_POP.
 */
1623 static const TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
1624 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1625 	{p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1626 	{p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1627 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1628 };
1629 
/*
 * TPS_InHyphenNumWord: p_isalnum / p_isspecial stay here and '-' pushes
 * TPS_InHyphenNumWordFirst.  The EOF row and the final row are
 * A_BINGO | A_CLRALL typed NUMHWORD with SpecialHyphen in the last column
 * and TPS_InParseHyphen as the following state.
 */
1630 static const TParserStateActionItem actionTPS_InHyphenNumWord[] = {
1631 	{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
1632 	{p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1633 	{p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1634 	{p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
1635 	{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
1636 };
1637 
/*
 * TPS_InHyphenDigitLookahead: entered when a digit follows the hyphen.
 * Further digits stay here; only a p_isalpha / p_isspecial character
 * leads on to TPS_InHyphenNumWord.  The EOF row and the final row are
 * A_POP, so a hyphen followed by digits alone takes the A_POP path.
 */
1638 static const TParserStateActionItem actionTPS_InHyphenDigitLookahead[] = {
1639 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1640 	{p_isdigit, 0, A_NEXT, TPS_InHyphenDigitLookahead, 0, NULL},
1641 	{p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1642 	{p_isspecial, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
1643 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1644 };
1645 
/*
 * TPS_InParseHyphen: the state named as follow-up by the A_BINGO rows of
 * the three TPS_InHyphen*Word tables.  Letters go to the *WordPart
 * states, a digit pushes TPS_InHyphenUnsignedInt, '-' pushes
 * TPS_InParseHyphenHyphen.  The EOF row and the final row are A_RERUN
 * with TPS_Base.
 */
1646 static const TParserStateActionItem actionTPS_InParseHyphen[] = {
1647 	{p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
1648 	{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1649 	{p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1650 	{p_isdigit, 0, A_PUSH, TPS_InHyphenUnsignedInt, 0, NULL},
1651 	{p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
1652 	{NULL, 0, A_RERUN, TPS_Base, 0, NULL}
1653 };
1654 
/*
 * TPS_InParseHyphenHyphen: pushed from TPS_InParseHyphen on '-'.  When a
 * p_isalnum or p_isspecial character follows, the row is
 * A_BINGO | A_CLEAR typed SPACE with TPS_InParseHyphen next; the EOF row
 * and the final row are A_POP.
 */
1655 static const TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
1656 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1657 	{p_isalnum, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1658 	{p_isspecial, 0, A_BINGO | A_CLEAR, TPS_InParseHyphen, SPACE, NULL},
1659 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1660 };
1661 
/*
 * TPS_InHyphenWordPart: p_isalpha / p_isspecial stay here; a digit
 * switches to TPS_InHyphenNumWordPart.  Both A_BINGO rows are typed
 * PARTHWORD, but the EOF row names TPS_Base while the final row names
 * TPS_InParseHyphen as the following state.
 */
1662 static const TParserStateActionItem actionTPS_InHyphenWordPart[] = {
1663 	{p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
1664 	{p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1665 	{p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1666 	{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1667 	{NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
1668 };
1669 
/*
 * TPS_InHyphenAsciiWordPart: ASCII letters stay here; p_isalpha /
 * p_isspecial switch to TPS_InHyphenWordPart and a digit to
 * TPS_InHyphenNumWordPart.  Both A_BINGO rows are typed ASCIIPARTHWORD;
 * the EOF row names TPS_Base, the final row TPS_InParseHyphen.
 */
1670 static const TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
1671 	{p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
1672 	{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
1673 	{p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1674 	{p_isspecial, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
1675 	{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1676 	{NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
1677 };
1678 
/*
 * TPS_InHyphenNumWordPart: p_isalnum / p_isspecial stay here.  Both
 * A_BINGO rows are typed NUMPARTHWORD; the EOF row names TPS_Base, the
 * final row TPS_InParseHyphen.
 */
1679 static const TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
1680 	{p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
1681 	{p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1682 	{p_isspecial, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
1683 	{NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
1684 };
1685 
/*
 * TPS_InHyphenUnsignedInt: pushed from TPS_InParseHyphen on a digit.
 * Further digits are A_NEXT; a p_isalpha / p_isspecial character is
 * A_CLEAR into TPS_InHyphenNumWordPart.  The EOF row and the final row
 * are A_POP, so an all-digit part takes the A_POP path.
 */
1686 static const TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
1687 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
1688 	{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
1689 	{p_isalpha, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1690 	{p_isspecial, 0, A_CLEAR, TPS_InHyphenNumWordPart, 0, NULL},
1691 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
1692 };
1693 
1694 
1695 /*
1696  * Element type of the main table of per-state parser actions (Actions[]): pairs a state's action table with the state it belongs to
1697  */
1698 typedef struct
1699 {
1700 	const TParserStateActionItem *action;	/* first row of that state's action table */
1701 	TParserState state;			/* only for Assert crosscheck against the array index */
1702 #ifdef WPARSER_TRACE
1703 	const char *state_name;		/* only for debug printout */
1704 #endif
1705 } TParserStateAction;
1706 
/*
 * Builds one TParserStateAction initializer from a state name: the table
 * pointer is formed by pasting "action" in front of the state name
 * (TPS_Foo -> actionTPS_Foo), and the WPARSER_TRACE variant also stores
 * the state name as a string.
 */
1707 #ifdef WPARSER_TRACE
1708 #define TPARSERSTATEACTION(state) \
1709 	{ CppConcat(action,state), state, CppAsString(state) }
1710 #else
1711 #define TPARSERSTATEACTION(state) \
1712 	{ CppConcat(action,state), state }
1713 #endif
1714 
1715 /*
1716  * order must be the same as in typedef enum {} TParserState!!
1717  */
1718 
/*
 * Indexed by TParserState value: TParserGet fetches
 * Actions[prs->state->state].action and Asserts that the stored state
 * equals the index, so entries must stay in enum order.
 */
1719 static const TParserStateAction Actions[] = {
1720 	TPARSERSTATEACTION(TPS_Base),
1721 	TPARSERSTATEACTION(TPS_InNumWord),
1722 	TPARSERSTATEACTION(TPS_InAsciiWord),
1723 	TPARSERSTATEACTION(TPS_InWord),
1724 	TPARSERSTATEACTION(TPS_InUnsignedInt),
1725 	TPARSERSTATEACTION(TPS_InSignedIntFirst),
1726 	TPARSERSTATEACTION(TPS_InSignedInt),
1727 	TPARSERSTATEACTION(TPS_InSpace),
1728 	TPARSERSTATEACTION(TPS_InUDecimalFirst),
1729 	TPARSERSTATEACTION(TPS_InUDecimal),
1730 	TPARSERSTATEACTION(TPS_InDecimalFirst),
1731 	TPARSERSTATEACTION(TPS_InDecimal),
1732 	TPARSERSTATEACTION(TPS_InVerVersion),
1733 	TPARSERSTATEACTION(TPS_InSVerVersion),
1734 	TPARSERSTATEACTION(TPS_InVersionFirst),
1735 	TPARSERSTATEACTION(TPS_InVersion),
1736 	TPARSERSTATEACTION(TPS_InMantissaFirst),
1737 	TPARSERSTATEACTION(TPS_InMantissaSign),
1738 	TPARSERSTATEACTION(TPS_InMantissa),
1739 	TPARSERSTATEACTION(TPS_InXMLEntityFirst),
1740 	TPARSERSTATEACTION(TPS_InXMLEntity),
1741 	TPARSERSTATEACTION(TPS_InXMLEntityNumFirst),
1742 	TPARSERSTATEACTION(TPS_InXMLEntityNum),
1743 	TPARSERSTATEACTION(TPS_InXMLEntityHexNumFirst),
1744 	TPARSERSTATEACTION(TPS_InXMLEntityHexNum),
1745 	TPARSERSTATEACTION(TPS_InXMLEntityEnd),
1746 	TPARSERSTATEACTION(TPS_InTagFirst),
1747 	TPARSERSTATEACTION(TPS_InXMLBegin),
1748 	TPARSERSTATEACTION(TPS_InTagCloseFirst),
1749 	TPARSERSTATEACTION(TPS_InTagName),
1750 	TPARSERSTATEACTION(TPS_InTagBeginEnd),
1751 	TPARSERSTATEACTION(TPS_InTag),
1752 	TPARSERSTATEACTION(TPS_InTagEscapeK),
1753 	TPARSERSTATEACTION(TPS_InTagEscapeKK),
1754 	TPARSERSTATEACTION(TPS_InTagBackSleshed),
1755 	TPARSERSTATEACTION(TPS_InTagEnd),
1756 	TPARSERSTATEACTION(TPS_InCommentFirst),
1757 	TPARSERSTATEACTION(TPS_InCommentLast),
1758 	TPARSERSTATEACTION(TPS_InComment),
1759 	TPARSERSTATEACTION(TPS_InCloseCommentFirst),
1760 	TPARSERSTATEACTION(TPS_InCloseCommentLast),
1761 	TPARSERSTATEACTION(TPS_InCommentEnd),
1762 	TPARSERSTATEACTION(TPS_InHostFirstDomain),
1763 	TPARSERSTATEACTION(TPS_InHostDomainSecond),
1764 	TPARSERSTATEACTION(TPS_InHostDomain),
1765 	TPARSERSTATEACTION(TPS_InPortFirst),
1766 	TPARSERSTATEACTION(TPS_InPort),
1767 	TPARSERSTATEACTION(TPS_InHostFirstAN),
1768 	TPARSERSTATEACTION(TPS_InHost),
1769 	TPARSERSTATEACTION(TPS_InEmail),
1770 	TPARSERSTATEACTION(TPS_InFileFirst),
1771 	TPARSERSTATEACTION(TPS_InFileTwiddle),
1772 	TPARSERSTATEACTION(TPS_InPathFirst),
1773 	TPARSERSTATEACTION(TPS_InPathFirstFirst),
1774 	TPARSERSTATEACTION(TPS_InPathSecond),
1775 	TPARSERSTATEACTION(TPS_InFile),
1776 	TPARSERSTATEACTION(TPS_InFileNext),
1777 	TPARSERSTATEACTION(TPS_InURLPathFirst),
1778 	TPARSERSTATEACTION(TPS_InURLPathStart),
1779 	TPARSERSTATEACTION(TPS_InURLPath),
1780 	TPARSERSTATEACTION(TPS_InFURL),
1781 	TPARSERSTATEACTION(TPS_InProtocolFirst),
1782 	TPARSERSTATEACTION(TPS_InProtocolSecond),
1783 	TPARSERSTATEACTION(TPS_InProtocolEnd),
1784 	TPARSERSTATEACTION(TPS_InHyphenAsciiWordFirst),
1785 	TPARSERSTATEACTION(TPS_InHyphenAsciiWord),
1786 	TPARSERSTATEACTION(TPS_InHyphenWordFirst),
1787 	TPARSERSTATEACTION(TPS_InHyphenWord),
1788 	TPARSERSTATEACTION(TPS_InHyphenNumWordFirst),
1789 	TPARSERSTATEACTION(TPS_InHyphenNumWord),
1790 	TPARSERSTATEACTION(TPS_InHyphenDigitLookahead),
1791 	TPARSERSTATEACTION(TPS_InParseHyphen),
1792 	TPARSERSTATEACTION(TPS_InParseHyphenHyphen),
1793 	TPARSERSTATEACTION(TPS_InHyphenWordPart),
1794 	TPARSERSTATEACTION(TPS_InHyphenAsciiWordPart),
1795 	TPARSERSTATEACTION(TPS_InHyphenNumWordPart),
1796 	TPARSERSTATEACTION(TPS_InHyphenUnsignedInt)
1797 };
1798 
1799 
/*
 * TParserGet: scan the next token from the parser's input string
 *
 * Runs the state machine described by the Actions[] table, starting at
 * the current position prs->state->posbyte.  On entry prs->token is set to
 * that position.  When a rule carrying A_BINGO fires, prs->lenbytetoken,
 * prs->lenchartoken and prs->type are filled in from the current state and
 * the matched rule.
 *
 * Returns true if the last rule applied had A_BINGO set (a token was
 * found); returns false if the position was already at or beyond
 * prs->lenstr on entry, or if the scan stopped without an A_BINGO rule.
 */
static bool
TParserGet(TParser *prs)
{
	const TParserStateActionItem *item = NULL;

	Assert(prs->state);

	/* nothing left to scan */
	if (prs->state->posbyte >= prs->lenstr)
		return false;

	prs->token = prs->str + prs->state->posbyte;
	prs->state->pushedAtAction = NULL;

	/* look at string */
	/* (the loop also runs once with posbyte == lenstr, i.e. at end of input) */
	while (prs->state->posbyte <= prs->lenstr)
	{
		/*
		 * Length in bytes of the current character: zero at end of input,
		 * charmaxlen itself when that is 1, else whatever pg_mblen reports.
		 */
		if (prs->state->posbyte == prs->lenstr)
			prs->state->charlen = 0;
		else
			prs->state->charlen = (prs->charmaxlen == 1) ? prs->charmaxlen :
				pg_mblen(prs->str + prs->state->posbyte);

		Assert(prs->state->posbyte + prs->state->charlen <= prs->lenstr);
		Assert(prs->state->state >= TPS_Base && prs->state->state < TPS_Null);
		Assert(Actions[prs->state->state].state == prs->state->state);

		if (prs->state->pushedAtAction)
		{
			/* After a POP, pick up at the next test */
			item = prs->state->pushedAtAction + 1;
			prs->state->pushedAtAction = NULL;
		}
		else
		{
			/* otherwise start with the first rule of the current state */
			item = Actions[prs->state->state].action;
			Assert(item != NULL);
		}

		/* find action by character class */
		/*
		 * The scan stops at the first rule whose isclass test returns
		 * nonzero, or at a rule with a NULL isclass, which therefore acts
		 * as the default rule for the state.
		 */
		while (item->isclass)
		{
			prs->c = item->c;
			if (item->isclass(prs) != 0)
				break;
			item++;
		}

#ifdef WPARSER_TRACE
		{
			TParserPosition *ptr;

			fprintf(stderr, "state ");
			/* indent according to stack depth */
			for (ptr = prs->state->prev; ptr; ptr = ptr->prev)
				fprintf(stderr, "  ");
			fprintf(stderr, "%s ", Actions[prs->state->state].state_name);
			if (prs->state->posbyte < prs->lenstr)
				fprintf(stderr, "at %c", *(prs->str + prs->state->posbyte));
			else
				fprintf(stderr, "at EOF");
			fprintf(stderr, " matched rule %d flags%s%s%s%s%s%s%s%s%s%s%s\n",
					(int) (item - Actions[prs->state->state].action),
					(item->flags & A_BINGO) ? " BINGO" : "",
					(item->flags & A_POP) ? " POP" : "",
					(item->flags & A_PUSH) ? " PUSH" : "",
					(item->flags & A_RERUN) ? " RERUN" : "",
					(item->flags & A_CLEAR) ? " CLEAR" : "",
					(item->flags & A_MERGE) ? " MERGE" : "",
					(item->flags & A_CLRALL) ? " CLRALL" : "",
					(item->tostate != TPS_Null) ? " tostate " : "",
					(item->tostate != TPS_Null) ? Actions[item->tostate].state_name : "",
					(item->type > 0) ? " type " : "",
					tok_alias[item->type]);
		}
#endif

		/* call special handler if exists */
		if (item->special)
			item->special(prs);

		/* BINGO, token is found */
		if (item->flags & A_BINGO)
		{
			Assert(item->type > 0);
			/* publish the token length and reset the per-state counters */
			prs->lenbytetoken = prs->state->lenbytetoken;
			prs->lenchartoken = prs->state->lenchartoken;
			prs->state->lenbytetoken = prs->state->lenchartoken = 0;
			prs->type = item->type;
		}

		/* do various actions by flags */
		/*
		 * Note this is an else-if chain: only the first of POP, PUSH, CLEAR,
		 * CLRALL, MERGE that is set in the rule's flags is acted on.
		 */
		if (item->flags & A_POP)
		{						/* pop stored state in stack */
			TParserPosition *ptr = prs->state->prev;

			pfree(prs->state);
			prs->state = ptr;
			Assert(prs->state);
		}
		else if (item->flags & A_PUSH)
		{						/* push (store) state in stack */
			prs->state->pushedAtAction = item;	/* remember where we push */
			prs->state = newTParserPosition(prs->state);
		}
		else if (item->flags & A_CLEAR)
		{						/* clear previous pushed state */
			TParserPosition *ptr;

			Assert(prs->state->prev);
			ptr = prs->state->prev->prev;
			pfree(prs->state->prev);
			prs->state->prev = ptr;
		}
		else if (item->flags & A_CLRALL)
		{						/* clear all previous pushed state */
			TParserPosition *ptr;

			while (prs->state->prev)
			{
				ptr = prs->state->prev->prev;
				pfree(prs->state->prev);
				prs->state->prev = ptr;
			}
		}
		else if (item->flags & A_MERGE)
		{						/* merge posinfo with current and pushed state */
			TParserPosition *ptr = prs->state;

			Assert(prs->state->prev);
			prs->state = prs->state->prev;

			/* the saved entry keeps its own state but adopts our position */
			prs->state->posbyte = ptr->posbyte;
			prs->state->poschar = ptr->poschar;
			prs->state->charlen = ptr->charlen;
			prs->state->lenbytetoken = ptr->lenbytetoken;
			prs->state->lenchartoken = ptr->lenchartoken;
			pfree(ptr);
		}

		/* set new state if pointed */
		if (item->tostate != TPS_Null)
			prs->state->state = item->tostate;

		/* check for go away */
		/*
		 * Stop when a token was found, or when we are at end of input and
		 * the rule did not ask to be rerun.
		 */
		if ((item->flags & A_BINGO) ||
			(prs->state->posbyte >= prs->lenstr &&
			 (item->flags & A_RERUN) == 0))
			break;

		/* go to beginning of loop if we should rerun or we just restore state */
		if (item->flags & (A_RERUN | A_POP))
			continue;

		/* move forward */
		/* (consume the current character, counting it in bytes and chars) */
		if (prs->state->charlen)
		{
			prs->state->posbyte += prs->state->charlen;
			prs->state->lenbytetoken += prs->state->charlen;
			prs->state->poschar++;
			prs->state->lenchartoken++;
		}
	}

	return (item && (item->flags & A_BINGO)) ? true : false;
}
1965 
1966 Datum
prsd_lextype(PG_FUNCTION_ARGS)1967 prsd_lextype(PG_FUNCTION_ARGS)
1968 {
1969 	LexDescr   *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
1970 	int			i;
1971 
1972 	for (i = 1; i <= LASTNUM; i++)
1973 	{
1974 		descr[i - 1].lexid = i;
1975 		descr[i - 1].alias = pstrdup(tok_alias[i]);
1976 		descr[i - 1].descr = pstrdup(lex_descr[i]);
1977 	}
1978 
1979 	descr[LASTNUM].lexid = 0;
1980 
1981 	PG_RETURN_POINTER(descr);
1982 }
1983 
1984 Datum
prsd_start(PG_FUNCTION_ARGS)1985 prsd_start(PG_FUNCTION_ARGS)
1986 {
1987 	PG_RETURN_POINTER(TParserInit((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
1988 }
1989 
1990 Datum
prsd_nexttoken(PG_FUNCTION_ARGS)1991 prsd_nexttoken(PG_FUNCTION_ARGS)
1992 {
1993 	TParser    *p = (TParser *) PG_GETARG_POINTER(0);
1994 	char	  **t = (char **) PG_GETARG_POINTER(1);
1995 	int		   *tlen = (int *) PG_GETARG_POINTER(2);
1996 
1997 	if (!TParserGet(p))
1998 		PG_RETURN_INT32(0);
1999 
2000 	*t = p->token;
2001 	*tlen = p->lenbytetoken;
2002 
2003 	PG_RETURN_INT32(p->type);
2004 }
2005 
2006 Datum
prsd_end(PG_FUNCTION_ARGS)2007 prsd_end(PG_FUNCTION_ARGS)
2008 {
2009 	TParser    *p = (TParser *) PG_GETARG_POINTER(0);
2010 
2011 	TParserClose(p);
2012 	PG_RETURN_VOID();
2013 }
2014 
2015 
2016 /*
2017  * ts_headline support begins here
2018  */
2019 
/*
 * Token type classification macros.
 *
 * Each takes one of the token type codes #defined near the top of this file
 * (ASCIIWORD .. XMLENTITY).  Note that most of them evaluate their argument
 * more than once.
 */
/* blank tokens only */
#define LEAVETOKEN(x)	( (x)==SPACE )
/* URLs and whole hyphenated words */
#define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* blank tokens only (same test as LEAVETOKEN) */
#define ENDPUNCTOKEN(x) ( (x)==SPACE )

/* tags, protocols, blanks and XML entities */
#define TS_IDIGNORE(x)	( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
/* tags: mark_fragment sets "replace" on these when not in highlightall mode */
#define HLIDREPLACE(x)	( (x)==TAG_T )
/* mark_fragment sets "skip" on these when not in highlightall mode */
#define HLIDSKIP(x)		( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* as HLIDSKIP, but consulted in highlightall mode; currently the same set */
#define XMLHLIDSKIP(x)	( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
/* tokens that are not counted as words when measuring headline length */
#define NONWORDTOKEN(x) ( (x)==SPACE || HLIDREPLACE(x) || HLIDSKIP(x) )
/* token types at which a headline should preferably not end */
#define NOENDTOKEN(x)	( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL_T || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )

/*
 * Macros useful in headline selection.  These rely on availability of
 * "HeadlineParsedText *prs" describing some text, and "int shortword"
 * describing the "short word" length parameter.
 */

/* Interesting words are non-repeated search terms */
/* (i.e. words[j].item is set and words[j].repeated is not) */
#define INTERESTINGWORD(j) \
	(prs->words[j].item && !prs->words[j].repeated)

/* Don't want to end at a non-word or a short word, unless interesting */
/* ("short" means len <= shortword) */
#define BADENDPOINT(j) \
	((NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword) && \
	 !INTERESTINGWORD(j))
2046 
/*
 * CoverPos: bookkeeping for one candidate fragment in mark_hl_fragments.
 *
 * startpos/endpos/curlen/poslen are first filled from get_next_fragment's
 * outputs; startpos, endpos and curlen are overwritten with the stretched
 * values once the fragment has been chosen.
 */
typedef struct
{
	/* one cover (well, really one fragment) for mark_hl_fragments */
	int32		startpos;		/* fragment's starting word index */
	int32		endpos;			/* ending word index (inclusive) */
	int32		poslen;			/* number of interesting words */
	int32		curlen;			/* total number of words */
	bool		chosen;			/* chosen? */
	bool		excluded;		/* excluded? (set when it overlaps a chosen
								 * fragment) */
} CoverPos;
2057 
/*
 * hlCheck: the window of headline words that checkcondition_HL searches.
 * hlCover points "words" at the first word of the candidate substring and
 * sets "len" to the number of words in it.
 */
typedef struct
{
	/* callback data for checkcondition_HL */
	HeadlineWordEntry *words;	/* first word of the window */
	int			len;			/* number of words in the window */
} hlCheck;
2064 
2065 
2066 /*
2067  * TS_execute callback for matching a tsquery operand to headline words
2068  */
2069 static bool
checkcondition_HL(void * opaque,QueryOperand * val,ExecPhraseData * data)2070 checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
2071 {
2072 	hlCheck    *checkval = (hlCheck *) opaque;
2073 	int			i;
2074 
2075 	/* scan words array for marching items */
2076 	for (i = 0; i < checkval->len; i++)
2077 	{
2078 		if (checkval->words[i].item == val)
2079 		{
2080 			/* if data == NULL, don't need to report positions */
2081 			if (!data)
2082 				return true;
2083 
2084 			if (!data->pos)
2085 			{
2086 				data->pos = palloc(sizeof(WordEntryPos) * checkval->len);
2087 				data->allocated = true;
2088 				data->npos = 1;
2089 				data->pos[0] = checkval->words[i].pos;
2090 			}
2091 			else if (data->pos[data->npos - 1] < checkval->words[i].pos)
2092 			{
2093 				data->pos[data->npos++] = checkval->words[i].pos;
2094 			}
2095 		}
2096 	}
2097 
2098 	if (data && data->npos > 0)
2099 		return true;
2100 
2101 	return false;
2102 }
2103 
2104 /*
2105  * hlFirstIndex: find first index >= pos containing any word used in query
2106  *
2107  * Returns -1 if no such index
2108  */
2109 static int
hlFirstIndex(HeadlineParsedText * prs,int pos)2110 hlFirstIndex(HeadlineParsedText *prs, int pos)
2111 {
2112 	int			i;
2113 
2114 	for (i = pos; i < prs->curwords; i++)
2115 	{
2116 		if (prs->words[i].item != NULL)
2117 			return i;
2118 	}
2119 	return -1;
2120 }
2121 
2122 /*
2123  * hlCover: try to find a substring of prs' word list that satisfies query
2124  *
2125  * At entry, *p must be the first word index to consider (initialize this
2126  * to zero, or to the next index after a previous successful search).
2127  * We will consider all substrings starting at or after that word, and
2128  * containing no more than max_cover words.  (We need a length limit to
2129  * keep this from taking O(N^2) time for a long document with many query
2130  * words but few complete matches.  Actually, since checkcondition_HL is
2131  * roughly O(N) in the length of the substring being checked, it's even
2132  * worse than that.)
2133  *
2134  * On success, sets *p to first word index and *q to last word index of the
2135  * cover substring, and returns true.
2136  *
2137  * The result is a minimal cover, in the sense that both *p and *q will be
2138  * words used in the query.
2139  */
2140 static bool
hlCover(HeadlineParsedText * prs,TSQuery query,int max_cover,int * p,int * q)2141 hlCover(HeadlineParsedText *prs, TSQuery query, int max_cover,
2142 		int *p, int *q)
2143 {
2144 	int			pmin,
2145 				pmax,
2146 				nextpmin,
2147 				nextpmax;
2148 	hlCheck		ch;
2149 
2150 	/*
2151 	 * We look for the earliest, shortest substring of prs->words that
2152 	 * satisfies the query.  Both the pmin and pmax indices must be words
2153 	 * appearing in the query; there's no point in trying endpoints in between
2154 	 * such points.
2155 	 */
2156 	pmin = hlFirstIndex(prs, *p);
2157 	while (pmin >= 0)
2158 	{
2159 		/* This useless assignment just keeps stupider compilers quiet */
2160 		nextpmin = -1;
2161 		/* Consider substrings starting at pmin */
2162 		ch.words = &(prs->words[pmin]);
2163 		/* Consider the length-one substring first, then longer substrings */
2164 		pmax = pmin;
2165 		do
2166 		{
2167 			/* Try to match query against pmin .. pmax substring */
2168 			ch.len = pmax - pmin + 1;
2169 			if (TS_execute(GETQUERY(query), &ch,
2170 						   TS_EXEC_EMPTY, checkcondition_HL))
2171 			{
2172 				*p = pmin;
2173 				*q = pmax;
2174 				return true;
2175 			}
2176 			/* Nope, so advance pmax to next feasible endpoint */
2177 			nextpmax = hlFirstIndex(prs, pmax + 1);
2178 
2179 			/*
2180 			 * If this is our first advance past pmin, then the result is also
2181 			 * the next feasible value of pmin; remember it to save a
2182 			 * redundant search.
2183 			 */
2184 			if (pmax == pmin)
2185 				nextpmin = nextpmax;
2186 			pmax = nextpmax;
2187 		}
2188 		while (pmax >= 0 && pmax - pmin < max_cover);
2189 		/* No luck here, so try next feasible startpoint */
2190 		pmin = nextpmin;
2191 	}
2192 	return false;
2193 }
2194 
2195 /*
2196  * Apply suitable highlight marking to words selected by headline selector
2197  *
2198  * The words from startpos to endpos inclusive are marked per highlightall
2199  */
2200 static void
mark_fragment(HeadlineParsedText * prs,bool highlightall,int startpos,int endpos)2201 mark_fragment(HeadlineParsedText *prs, bool highlightall,
2202 			  int startpos, int endpos)
2203 {
2204 	int			i;
2205 
2206 	for (i = startpos; i <= endpos; i++)
2207 	{
2208 		if (prs->words[i].item)
2209 			prs->words[i].selected = 1;
2210 		if (!highlightall)
2211 		{
2212 			if (HLIDREPLACE(prs->words[i].type))
2213 				prs->words[i].replace = 1;
2214 			else if (HLIDSKIP(prs->words[i].type))
2215 				prs->words[i].skip = 1;
2216 		}
2217 		else
2218 		{
2219 			if (XMLHLIDSKIP(prs->words[i].type))
2220 				prs->words[i].skip = 1;
2221 		}
2222 
2223 		prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
2224 	}
2225 }
2226 
2227 /*
2228  * split a cover substring into fragments not longer than max_words
2229  *
2230  * At entry, *startpos and *endpos are the (remaining) bounds of the cover
2231  * substring.  They are updated to hold the bounds of the next fragment.
2232  *
2233  * *curlen and *poslen are set to the fragment's length, in words and
2234  * interesting words respectively.
2235  */
2236 static void
get_next_fragment(HeadlineParsedText * prs,int * startpos,int * endpos,int * curlen,int * poslen,int max_words)2237 get_next_fragment(HeadlineParsedText *prs, int *startpos, int *endpos,
2238 				  int *curlen, int *poslen, int max_words)
2239 {
2240 	int			i;
2241 
2242 	/*
2243 	 * Objective: select a fragment of words between startpos and endpos such
2244 	 * that it has at most max_words and both ends have query words. If the
2245 	 * startpos and endpos are the endpoints of the cover and the cover has
2246 	 * fewer words than max_words, then this function should just return the
2247 	 * cover
2248 	 */
2249 	/* first move startpos to an item */
2250 	for (i = *startpos; i <= *endpos; i++)
2251 	{
2252 		*startpos = i;
2253 		if (INTERESTINGWORD(i))
2254 			break;
2255 	}
2256 	/* cut endpos to have only max_words */
2257 	*curlen = 0;
2258 	*poslen = 0;
2259 	for (i = *startpos; i <= *endpos && *curlen < max_words; i++)
2260 	{
2261 		if (!NONWORDTOKEN(prs->words[i].type))
2262 			*curlen += 1;
2263 		if (INTERESTINGWORD(i))
2264 			*poslen += 1;
2265 	}
2266 	/* if the cover was cut then move back endpos to a query item */
2267 	if (*endpos > i)
2268 	{
2269 		*endpos = i;
2270 		for (i = *endpos; i >= *startpos; i--)
2271 		{
2272 			*endpos = i;
2273 			if (INTERESTINGWORD(i))
2274 				break;
2275 			if (!NONWORDTOKEN(prs->words[i].type))
2276 				*curlen -= 1;
2277 		}
2278 	}
2279 }
2280 
2281 /*
2282  * Headline selector used when MaxFragments > 0
2283  *
2284  * Note: in this mode, highlightall is disregarded for phrase selection;
2285  * it only controls presentation details.
2286  */
2287 static void
mark_hl_fragments(HeadlineParsedText * prs,TSQuery query,bool highlightall,int shortword,int min_words,int max_words,int max_fragments,int max_cover)2288 mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, bool highlightall,
2289 				  int shortword, int min_words,
2290 				  int max_words, int max_fragments, int max_cover)
2291 {
2292 	int32		poslen,
2293 				curlen,
2294 				i,
2295 				f,
2296 				num_f = 0;
2297 	int32		stretch,
2298 				maxstretch,
2299 				posmarker;
2300 
2301 	int32		startpos = 0,
2302 				endpos = 0,
2303 				p = 0,
2304 				q = 0;
2305 
2306 	int32		numcovers = 0,
2307 				maxcovers = 32;
2308 
2309 	int32		minI,
2310 				minwords,
2311 				maxitems;
2312 	CoverPos   *covers;
2313 
2314 	covers = palloc(maxcovers * sizeof(CoverPos));
2315 
2316 	/* get all covers */
2317 	while (hlCover(prs, query, max_cover, &p, &q))
2318 	{
2319 		startpos = p;
2320 		endpos = q;
2321 
2322 		/*
2323 		 * Break the cover into smaller fragments such that each fragment has
2324 		 * at most max_words. Also ensure that each end of each fragment is a
2325 		 * query word. This will allow us to stretch the fragment in either
2326 		 * direction
2327 		 */
2328 
2329 		while (startpos <= endpos)
2330 		{
2331 			get_next_fragment(prs, &startpos, &endpos, &curlen, &poslen, max_words);
2332 			if (numcovers >= maxcovers)
2333 			{
2334 				maxcovers *= 2;
2335 				covers = repalloc(covers, sizeof(CoverPos) * maxcovers);
2336 			}
2337 			covers[numcovers].startpos = startpos;
2338 			covers[numcovers].endpos = endpos;
2339 			covers[numcovers].curlen = curlen;
2340 			covers[numcovers].poslen = poslen;
2341 			covers[numcovers].chosen = false;
2342 			covers[numcovers].excluded = false;
2343 			numcovers++;
2344 			startpos = endpos + 1;
2345 			endpos = q;
2346 		}
2347 
2348 		/* move p to generate the next cover */
2349 		p++;
2350 	}
2351 
2352 	/* choose best covers */
2353 	for (f = 0; f < max_fragments; f++)
2354 	{
2355 		maxitems = 0;
2356 		minwords = PG_INT32_MAX;
2357 		minI = -1;
2358 
2359 		/*
2360 		 * Choose the cover that contains max items. In case of tie choose the
2361 		 * one with smaller number of words.
2362 		 */
2363 		for (i = 0; i < numcovers; i++)
2364 		{
2365 			if (!covers[i].chosen && !covers[i].excluded &&
2366 				(maxitems < covers[i].poslen ||
2367 				 (maxitems == covers[i].poslen &&
2368 				  minwords > covers[i].curlen)))
2369 			{
2370 				maxitems = covers[i].poslen;
2371 				minwords = covers[i].curlen;
2372 				minI = i;
2373 			}
2374 		}
2375 		/* if a cover was found mark it */
2376 		if (minI >= 0)
2377 		{
2378 			covers[minI].chosen = true;
2379 			/* adjust the size of cover */
2380 			startpos = covers[minI].startpos;
2381 			endpos = covers[minI].endpos;
2382 			curlen = covers[minI].curlen;
2383 			/* stretch the cover if cover size is lower than max_words */
2384 			if (curlen < max_words)
2385 			{
2386 				/* divide the stretch on both sides of cover */
2387 				maxstretch = (max_words - curlen) / 2;
2388 
2389 				/*
2390 				 * first stretch the startpos stop stretching if 1. we hit the
2391 				 * beginning of document 2. exceed maxstretch 3. we hit an
2392 				 * already marked fragment
2393 				 */
2394 				stretch = 0;
2395 				posmarker = startpos;
2396 				for (i = startpos - 1; i >= 0 && stretch < maxstretch && !prs->words[i].in; i--)
2397 				{
2398 					if (!NONWORDTOKEN(prs->words[i].type))
2399 					{
2400 						curlen++;
2401 						stretch++;
2402 					}
2403 					posmarker = i;
2404 				}
2405 				/* cut back startpos till we find a good endpoint */
2406 				for (i = posmarker; i < startpos && BADENDPOINT(i); i++)
2407 				{
2408 					if (!NONWORDTOKEN(prs->words[i].type))
2409 						curlen--;
2410 				}
2411 				startpos = i;
2412 				/* now stretch the endpos as much as possible */
2413 				posmarker = endpos;
2414 				for (i = endpos + 1; i < prs->curwords && curlen < max_words && !prs->words[i].in; i++)
2415 				{
2416 					if (!NONWORDTOKEN(prs->words[i].type))
2417 						curlen++;
2418 					posmarker = i;
2419 				}
2420 				/* cut back endpos till we find a good endpoint */
2421 				for (i = posmarker; i > endpos && BADENDPOINT(i); i--)
2422 				{
2423 					if (!NONWORDTOKEN(prs->words[i].type))
2424 						curlen--;
2425 				}
2426 				endpos = i;
2427 			}
2428 			covers[minI].startpos = startpos;
2429 			covers[minI].endpos = endpos;
2430 			covers[minI].curlen = curlen;
2431 			/* Mark the chosen fragments (covers) */
2432 			mark_fragment(prs, highlightall, startpos, endpos);
2433 			num_f++;
2434 			/* Exclude covers overlapping this one from future consideration */
2435 			for (i = 0; i < numcovers; i++)
2436 			{
2437 				if (i != minI &&
2438 					((covers[i].startpos >= startpos &&
2439 					  covers[i].startpos <= endpos) ||
2440 					 (covers[i].endpos >= startpos &&
2441 					  covers[i].endpos <= endpos) ||
2442 					 (covers[i].startpos < startpos &&
2443 					  covers[i].endpos > endpos)))
2444 					covers[i].excluded = true;
2445 			}
2446 		}
2447 		else
2448 			break;				/* no selectable covers remain */
2449 	}
2450 
2451 	/* show the first min_words words if we have not marked anything */
2452 	if (num_f <= 0)
2453 	{
2454 		startpos = endpos = curlen = 0;
2455 		for (i = 0; i < prs->curwords && curlen < min_words; i++)
2456 		{
2457 			if (!NONWORDTOKEN(prs->words[i].type))
2458 				curlen++;
2459 			endpos = i;
2460 		}
2461 		mark_fragment(prs, highlightall, startpos, endpos);
2462 	}
2463 
2464 	pfree(covers);
2465 }
2466 
2467 /*
2468  * Headline selector used when MaxFragments == 0
2469  */
2470 static void
mark_hl_words(HeadlineParsedText * prs,TSQuery query,bool highlightall,int shortword,int min_words,int max_words,int max_cover)2471 mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
2472 			  int shortword, int min_words, int max_words, int max_cover)
2473 {
2474 	int			p = 0,
2475 				q = 0;
2476 	int			bestb = -1,
2477 				beste = -1;
2478 	int			bestlen = -1;
2479 	bool		bestcover = false;
2480 	int			pose,
2481 				posb,
2482 				poslen,
2483 				curlen;
2484 	bool		poscover;
2485 	int			i;
2486 
2487 	if (!highlightall)
2488 	{
2489 		/* examine all covers, select a headline using the best one */
2490 		while (hlCover(prs, query, max_cover, &p, &q))
2491 		{
2492 			/*
2493 			 * Count words (curlen) and interesting words (poslen) within
2494 			 * cover, but stop once we reach max_words.  This step doesn't
2495 			 * consider whether that's a good stopping point.  posb and pose
2496 			 * are set to the start and end indexes of the possible headline.
2497 			 */
2498 			curlen = 0;
2499 			poslen = 0;
2500 			posb = pose = p;
2501 			for (i = p; i <= q && curlen < max_words; i++)
2502 			{
2503 				if (!NONWORDTOKEN(prs->words[i].type))
2504 					curlen++;
2505 				if (INTERESTINGWORD(i))
2506 					poslen++;
2507 				pose = i;
2508 			}
2509 
2510 			if (curlen < max_words)
2511 			{
2512 				/*
2513 				 * We have room to lengthen the headline, so search forward
2514 				 * until it's full or we find a good stopping point.  We'll
2515 				 * reconsider the word at "q", then move forward.
2516 				 */
2517 				for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
2518 				{
2519 					if (i > q)
2520 					{
2521 						if (!NONWORDTOKEN(prs->words[i].type))
2522 							curlen++;
2523 						if (INTERESTINGWORD(i))
2524 							poslen++;
2525 					}
2526 					pose = i;
2527 					if (BADENDPOINT(i))
2528 						continue;
2529 					if (curlen >= min_words)
2530 						break;
2531 				}
2532 				if (curlen < min_words)
2533 				{
2534 					/*
2535 					 * Reached end of text and our headline is still shorter
2536 					 * than min_words, so try to extend it to the left.
2537 					 */
2538 					for (i = p - 1; i >= 0; i--)
2539 					{
2540 						if (!NONWORDTOKEN(prs->words[i].type))
2541 							curlen++;
2542 						if (INTERESTINGWORD(i))
2543 							poslen++;
2544 						if (curlen >= max_words)
2545 							break;
2546 						if (BADENDPOINT(i))
2547 							continue;
2548 						if (curlen >= min_words)
2549 							break;
2550 					}
2551 					posb = (i >= 0) ? i : 0;
2552 				}
2553 			}
2554 			else
2555 			{
2556 				/*
2557 				 * Can't make headline longer, so consider making it shorter
2558 				 * if needed to avoid a bad endpoint.
2559 				 */
2560 				if (i > q)
2561 					i = q;
2562 				for (; curlen > min_words; i--)
2563 				{
2564 					if (!BADENDPOINT(i))
2565 						break;
2566 					if (!NONWORDTOKEN(prs->words[i].type))
2567 						curlen--;
2568 					if (INTERESTINGWORD(i))
2569 						poslen--;
2570 					pose = i - 1;
2571 				}
2572 			}
2573 
2574 			/*
2575 			 * Check whether the proposed headline includes the original
2576 			 * cover; it might not if we trimmed it due to max_words.
2577 			 */
2578 			poscover = (posb <= p && pose >= q);
2579 
2580 			/*
2581 			 * Adopt this headline if it's better than the last one, giving
2582 			 * highest priority to headlines including the cover, then to
2583 			 * headlines with more interesting words, then to headlines with
2584 			 * good stopping points.  (Since bestlen is initially -1, we will
2585 			 * certainly adopt the first headline.)
2586 			 */
2587 			if (poscover > bestcover ||
2588 				(poscover == bestcover && poslen > bestlen) ||
2589 				(poscover == bestcover && poslen == bestlen &&
2590 				 !BADENDPOINT(pose) && BADENDPOINT(beste)))
2591 			{
2592 				bestb = posb;
2593 				beste = pose;
2594 				bestlen = poslen;
2595 				bestcover = poscover;
2596 			}
2597 
2598 			/* move p to generate the next cover */
2599 			p++;
2600 		}
2601 
2602 		/*
2603 		 * If we found nothing acceptable, select min_words words starting at
2604 		 * the beginning.
2605 		 */
2606 		if (bestlen < 0)
2607 		{
2608 			curlen = 0;
2609 			pose = 0;
2610 			for (i = 0; i < prs->curwords && curlen < min_words; i++)
2611 			{
2612 				if (!NONWORDTOKEN(prs->words[i].type))
2613 					curlen++;
2614 				pose = i;
2615 			}
2616 			bestb = 0;
2617 			beste = pose;
2618 		}
2619 	}
2620 	else
2621 	{
2622 		/* highlightall mode: headline is whole document */
2623 		bestb = 0;
2624 		beste = prs->curwords - 1;
2625 	}
2626 
2627 	mark_fragment(prs, highlightall, bestb, beste);
2628 }
2629 
2630 /*
2631  * Default parser's prsheadline function
2632  */
2633 Datum
prsd_headline(PG_FUNCTION_ARGS)2634 prsd_headline(PG_FUNCTION_ARGS)
2635 {
2636 	HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
2637 	List	   *prsoptions = (List *) PG_GETARG_POINTER(1);
2638 	TSQuery		query = PG_GETARG_TSQUERY(2);
2639 
2640 	/* default option values: */
2641 	int			min_words = 15;
2642 	int			max_words = 35;
2643 	int			shortword = 3;
2644 	int			max_fragments = 0;
2645 	bool		highlightall = false;
2646 	int			max_cover;
2647 	ListCell   *l;
2648 
2649 	/* Extract configuration option values */
2650 	prs->startsel = NULL;
2651 	prs->stopsel = NULL;
2652 	prs->fragdelim = NULL;
2653 	foreach(l, prsoptions)
2654 	{
2655 		DefElem    *defel = (DefElem *) lfirst(l);
2656 		char	   *val = defGetString(defel);
2657 
2658 		if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
2659 			max_words = pg_atoi(val, sizeof(int32), 0);
2660 		else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
2661 			min_words = pg_atoi(val, sizeof(int32), 0);
2662 		else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
2663 			shortword = pg_atoi(val, sizeof(int32), 0);
2664 		else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
2665 			max_fragments = pg_atoi(val, sizeof(int32), 0);
2666 		else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
2667 			prs->startsel = pstrdup(val);
2668 		else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
2669 			prs->stopsel = pstrdup(val);
2670 		else if (pg_strcasecmp(defel->defname, "FragmentDelimiter") == 0)
2671 			prs->fragdelim = pstrdup(val);
2672 		else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
2673 			highlightall = (pg_strcasecmp(val, "1") == 0 ||
2674 							pg_strcasecmp(val, "on") == 0 ||
2675 							pg_strcasecmp(val, "true") == 0 ||
2676 							pg_strcasecmp(val, "t") == 0 ||
2677 							pg_strcasecmp(val, "y") == 0 ||
2678 							pg_strcasecmp(val, "yes") == 0);
2679 		else
2680 			ereport(ERROR,
2681 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2682 					 errmsg("unrecognized headline parameter: \"%s\"",
2683 							defel->defname)));
2684 	}
2685 
2686 	/*
2687 	 * We might eventually make max_cover a user-settable parameter, but for
2688 	 * now, just compute a reasonable value based on max_words and
2689 	 * max_fragments.
2690 	 */
2691 	max_cover = Max(max_words * 10, 100);
2692 	if (max_fragments > 0)
2693 		max_cover *= max_fragments;
2694 
2695 	/* in HighlightAll mode these parameters are ignored */
2696 	if (!highlightall)
2697 	{
2698 		if (min_words >= max_words)
2699 			ereport(ERROR,
2700 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2701 					 errmsg("MinWords should be less than MaxWords")));
2702 		if (min_words <= 0)
2703 			ereport(ERROR,
2704 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2705 					 errmsg("MinWords should be positive")));
2706 		if (shortword < 0)
2707 			ereport(ERROR,
2708 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2709 					 errmsg("ShortWord should be >= 0")));
2710 		if (max_fragments < 0)
2711 			ereport(ERROR,
2712 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2713 					 errmsg("MaxFragments should be >= 0")));
2714 	}
2715 
2716 	/* Apply appropriate headline selector */
2717 	if (max_fragments == 0)
2718 		mark_hl_words(prs, query, highlightall, shortword,
2719 					  min_words, max_words, max_cover);
2720 	else
2721 		mark_hl_fragments(prs, query, highlightall, shortword,
2722 						  min_words, max_words, max_fragments, max_cover);
2723 
2724 	/* Fill in default values for string options */
2725 	if (!prs->startsel)
2726 		prs->startsel = pstrdup("<b>");
2727 	if (!prs->stopsel)
2728 		prs->stopsel = pstrdup("</b>");
2729 	if (!prs->fragdelim)
2730 		prs->fragdelim = pstrdup(" ... ");
2731 
2732 	/* Caller will need these lengths, too */
2733 	prs->startsellen = strlen(prs->startsel);
2734 	prs->stopsellen = strlen(prs->stopsel);
2735 	prs->fragdelimlen = strlen(prs->fragdelim);
2736 
2737 	PG_RETURN_POINTER(prs);
2738 }
2739