xref: /openbsd/usr.bin/lex/parse.y (revision 20c29e2b)
1 /*	$OpenBSD: parse.y,v 1.11 2024/11/09 18:03:44 op Exp $	*/
2 
3 /* parse.y - parser for flex input */
4 
5 %token CHAR NUMBER SECTEND SCDECL XSCDECL NAME PREVCCL EOF_OP
6 %token OPTION_OP OPT_OUTFILE OPT_PREFIX OPT_YYCLASS OPT_HEADER OPT_EXTRA_TYPE
7 %token OPT_TABLES
8 
9 %token CCE_ALNUM CCE_ALPHA CCE_BLANK CCE_CNTRL CCE_DIGIT CCE_GRAPH
10 %token CCE_LOWER CCE_PRINT CCE_PUNCT CCE_SPACE CCE_UPPER CCE_XDIGIT
11 
12 %token CCE_NEG_ALNUM CCE_NEG_ALPHA CCE_NEG_BLANK CCE_NEG_CNTRL CCE_NEG_DIGIT CCE_NEG_GRAPH
13 %token CCE_NEG_LOWER CCE_NEG_PRINT CCE_NEG_PUNCT CCE_NEG_SPACE CCE_NEG_UPPER CCE_NEG_XDIGIT
14 
15 %left CCL_OP_DIFF CCL_OP_UNION
16 
/*
 * POSIX and AT&T lex place the precedence of the repeat operator, {},
 * below that of concatenation.  Thus, ab{3} is ababab.  Most other
 * POSIX utilities use an Extended Regular Expression (ERE) precedence
 * that has the repeat operator higher than concatenation.  This causes
 * ab{3} to yield abbb.
 *
 * In order to support both the POSIX/AT&T precedence and the flex
 * precedence, we define two token sets for the begin and end tokens of
 * the repeat operator, '{' and '}'.  The lexical scanner chooses
 * which tokens to return based on whether posix_compat or lex_compat
 * is specified.  Specifying either posix_compat or lex_compat will
 * cause flex to parse scanner files as per the AT&T and
 * POSIX-mandated behavior.
 */
32 
33 %token BEGIN_REPEAT_POSIX END_REPEAT_POSIX BEGIN_REPEAT_FLEX END_REPEAT_FLEX
34 
35 
36 %{
37 /*  Copyright (c) 1990 The Regents of the University of California. */
38 /*  All rights reserved. */
39 
40 /*  This code is derived from software contributed to Berkeley by */
41 /*  Vern Paxson. */
42 
43 /*  The United States Government has rights in this work pursuant */
44 /*  to contract no. DE-AC03-76SF00098 between the United States */
45 /*  Department of Energy and the University of California. */
46 
47 /*  This file is part of flex. */
48 
49 /*  Redistribution and use in source and binary forms, with or without */
50 /*  modification, are permitted provided that the following conditions */
51 /*  are met: */
52 
53 /*  1. Redistributions of source code must retain the above copyright */
54 /*     notice, this list of conditions and the following disclaimer. */
55 /*  2. Redistributions in binary form must reproduce the above copyright */
56 /*     notice, this list of conditions and the following disclaimer in the */
57 /*     documentation and/or other materials provided with the distribution. */
58 
59 /*  Neither the name of the University nor the names of its contributors */
60 /*  may be used to endorse or promote products derived from this software */
61 /*  without specific prior written permission. */
62 
63 /*  THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR */
64 /*  IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED */
65 /*  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR */
66 /*  PURPOSE. */
67 
68 #include "flexdef.h"
69 #include "tables.h"
70 
/* Scratch state shared by the grammar actions below.  'i' is used as a
 * loop index by several actions; the rule-length and trailing-context
 * variables are reset for each rule by the initforrule action.
 */
int pat, scnum, eps, headcnt, trailcnt, lastchar, i, rulelen;
int trlcontxt, xcluflg, currccl, cclsorted, varlength, variable_trail_rule;

/* Stack of active start-condition numbers.  It is allocated by the
 * sect1end action with lastsc + 1 entries; entries 1..scon_stk_ptr are
 * in use, and scon_stk_ptr == 0 means no start condition is active.
 */
int *scon_stk;
int scon_stk_ptr;

static int madeany = false;  /* whether we've made the '.' character class */
static int ccldot, cclany;	/* ccl numbers created lazily by the '.' singleton */
int previous_continued_action;	/* whether the previous rule's action was '|' */
80 
/* Issue a warning built from a printf-style format and two arguments.
 * The text is formatted into a MAXLINE-sized stack buffer and handed
 * to warn().
 */
#define format_warn3(fmt, a1, a2) \
	do { \
		char fw3_msg[MAXLINE]; \
		snprintf( fw3_msg, sizeof(fw3_msg), (fmt), (a1), (a2) ); \
		warn( fw3_msg ); \
	} while (0)
87 
/* Expand a POSIX character class expression: every character below
 * csize that is ASCII and satisfies the <ctype.h>-style predicate
 * 'func' is added to the character class currccl.
 */
#define CCL_EXPR(func) \
	do { \
		int cce_ch; \
		for ( cce_ch = 0; cce_ch < csize; ++cce_ch ) \
			{ \
			if ( isascii(cce_ch) && func(cce_ch) ) \
				ccladd( currccl, cce_ch ); \
			} \
	} while (0)
96 
/* Negated class expression: every character below csize that does NOT
 * satisfy the predicate 'func' is added to currccl.  Unlike CCL_EXPR,
 * no isascii() filter is applied here.
 */
#define CCL_NEG_EXPR(func) \
	do { \
		int cce_ch; \
		for ( cce_ch = 0; cce_ch < csize; ++cce_ch ) \
			{ \
			if ( !func(cce_ch) ) \
				ccladd( currccl, cce_ch ); \
			} \
	} while (0)
105 
106 /* On some over-ambitious machines, such as DEC Alpha's, the default
107  * token type is "long" instead of "int"; this leads to problems with
108  * declaring yylval in flexdef.h.  But so far, all the yacc's I've seen
109  * wrap their definitions of YYSTYPE with "#ifndef YYSTYPE"'s, so the
110  * following should ensure that the default token type is "int".
111  */
112 #define YYSTYPE int
113 
114 %}
115 
116 %%
/* goal - the whole input: section 1, the section separator, then the
 * rules of section 2.  After everything has been parsed, the action
 * appends the default rule and its action text.
 */
goal		:  initlex sect1 sect1end sect2 initforrule
			{ /* add default rule */
			int def_rule;

			/* The default rule's pattern is a fresh, negated
			 * character class.
			 */
			pat = cclinit();
			cclnegate( pat );

			def_rule = mkstate( -pat );

			/* Remember the number of the default rule so we
			 * don't generate "can't match" warnings for it.
			 */
			default_rule = num_rules;

			finish_rule( def_rule, false, 0, 0, 0);

			/* Branch every start condition to the default rule. */
			for ( i = 1; i <= lastsc; ++i )
				scset[i] = mkbranch( scset[i], def_rule );

			/* spprdflt selects a fatal error, rather than ECHO,
			 * as the default rule's action.
			 */
			if ( spprdflt )
				add_action(
				"YY_FATAL_ERROR( \"flex scanner jammed\" )" );
			else
				add_action( "ECHO" );

			add_action( ";\n\tYY_BREAK\n" );
			}
		;
145 
/* initlex - empty production; its action runs before section 1 is
 * parsed and installs the "INITIAL" start condition with the exclusive
 * flag false.
 */
initlex		:
			{ /* initialize for processing rules */

			/* Create default DFA start condition. */
			scinstal( "INITIAL", false );
			}
		;
153 
/* sect1 - section 1: any sequence of start-condition declarations and
 * option lists (possibly empty).  A parse error here is reported
 * through synerr().
 */
sect1		:  sect1 startconddecl namelist1
		|  sect1 options
		|
		|  error
			{ synerr( _("unknown error processing section 1") ); }
		;
160 
/* sect1end - the SECTEND token that closes section 1.  Options are
 * checked, and the start-condition stack is allocated with lastsc + 1
 * entries and emptied.
 */
sect1end	:  SECTEND
			{
			check_options();
			scon_stk = allocate_integer_array( lastsc + 1 );
			scon_stk_ptr = 0;
			}
		;
168 
/* startconddecl - start-condition declaration keyword.  Records in
 * xcluflg whether the names that follow (see namelist1) are installed
 * as exclusive (XSCDECL) or not (SCDECL).
 */
startconddecl	:  SCDECL
			{ xcluflg = false; }

		|  XSCDECL
			{ xcluflg = true; }
		;
175 
/* namelist1 - one or more start-condition names in a declaration.
 * Each NAME (its text is in nmstr) is installed with the exclusive
 * flag chosen by the preceding startconddecl.
 */
namelist1	:  namelist1 NAME
			{ scinstal( nmstr, xcluflg ); }

		|  NAME
			{ scinstal( nmstr, xcluflg ); }

		|  error
			{ synerr( _("bad start condition list") ); }
		;
185 
/* options - an OPTION_OP token followed by a list of options. */
options		:  OPTION_OP optionlist
		;

/* optionlist - zero or more 'option' entries. */
optionlist	:  optionlist option
		|
		;
192 
/* option - an option of the form KEYWORD '=' NAME.  Each action stores
 * a copy of the NAME text (nmstr) in the matching global; OPT_OUTFILE
 * also sets did_outfilename and OPT_TABLES also sets tablesext.
 */
option		:  OPT_OUTFILE '=' NAME
			{
			outfilename = copy_string( nmstr );
			did_outfilename = 1;
			}
		|  OPT_EXTRA_TYPE '=' NAME
			{ extra_type = copy_string( nmstr ); }
		|  OPT_PREFIX '=' NAME
			{ prefix = copy_string( nmstr ); }
		|  OPT_YYCLASS '=' NAME
			{ yyclass = copy_string( nmstr ); }
		|  OPT_HEADER '=' NAME
			{ headerfilename = copy_string( nmstr ); }
	    |  OPT_TABLES '=' NAME
            { tablesext = true; tablesfilename = copy_string( nmstr ); }
		;
209 
/* sect2 - section 2: a sequence of rules, each optionally prefixed by
 * a start-condition list, or a start-condition list applied to a
 * braced group of rules.
 *
 * $2 is the value of 'scon': the scon_stk_ptr that was in effect
 * before that scon pushed its start conditions.  Assigning it back
 * pops them once the rule or group is finished.
 */
sect2		:  sect2 scon initforrule flexrule '\n'
			{ scon_stk_ptr = $2; }
		|  sect2 scon '{' sect2 '}'
			{ scon_stk_ptr = $2; }
		|
		;
216 
/* initforrule - empty production whose action resets the per-rule
 * state before a rule is parsed and then calls new_rule().
 */
initforrule	:
			{
			/* Initialize for a parse of one rule. */
			trlcontxt = variable_trail_rule = varlength = false;
			trailcnt = headcnt = rulelen = 0;
			current_state_type = STATE_NORMAL;
			/* Capture continued_action before this rule is
			 * processed; the rule actions below consult it.
			 */
			previous_continued_action = continued_action;
			in_rule = true;

			new_rule();
			}
		;
229 
/* flexrule - one complete rule.
 *
 *   '^' rule  - finished pattern is branched into scbol[]
 *   rule      - finished pattern is branched into scset[]
 *   EOF_OP    - an <<EOF>> rule, handled by build_eof_action()
 *
 * For the first two forms the pattern is added to the start conditions
 * on scon_stk if there are any, and otherwise to every start condition
 * whose scxclu[] entry is false.
 */
flexrule	:  '^' rule
			{
			pat = $2;
			finish_rule( pat, variable_trail_rule,
				headcnt, trailcnt , previous_continued_action);

			if ( scon_stk_ptr > 0 )
				{
				for ( i = 1; i <= scon_stk_ptr; ++i )
					scbol[scon_stk[i]] =
						mkbranch( scbol[scon_stk[i]],
								pat );
				}

			else
				{
				/* Add to all non-exclusive start conditions,
				 * including the default (0) start condition.
				 */

				for ( i = 1; i <= lastsc; ++i )
					if ( ! scxclu[i] )
						scbol[i] = mkbranch( scbol[i],
									pat );
				}

			/* Note the first use of '^' only; the performance
			 * message is issued at most once.
			 */
			if ( ! bol_needed )
				{
				bol_needed = true;

				if ( performance_report > 1 )
					pinpoint_message(
			"'^' operator results in sub-optimal performance" );
				}
			}

		|  rule
			{
			pat = $1;
			finish_rule( pat, variable_trail_rule,
				headcnt, trailcnt , previous_continued_action);

			if ( scon_stk_ptr > 0 )
				{
				for ( i = 1; i <= scon_stk_ptr; ++i )
					scset[scon_stk[i]] =
						mkbranch( scset[scon_stk[i]],
								pat );
				}

			else
				{
				for ( i = 1; i <= lastsc; ++i )
					if ( ! scxclu[i] )
						scset[i] =
							mkbranch( scset[i],
								pat );
				}
			}

		|  EOF_OP
			{
			if ( scon_stk_ptr > 0 )
				build_eof_action();

			else
				{
				/* This EOF applies to all start conditions
				 * which don't already have EOF actions.
				 */
				for ( i = 1; i <= lastsc; ++i )
					if ( ! sceof[i] )
						scon_stk[++scon_stk_ptr] = i;

				if ( scon_stk_ptr == 0 )
					warn(
			"all start conditions already have <<EOF>> rules" );

				else
					build_eof_action();
				}
			}

		|  error
			{ synerr( _("unrecognized rule") ); }
		;
316 
/* scon_stk_ptr - empty marker production.  Its value is the global
 * scon_stk_ptr at the time it is reduced, so 'scon' can hand back the
 * stack depth from before namelist2 pushed anything.
 */
scon_stk_ptr	:
			{ $$ = scon_stk_ptr; }
		;
320 
/* scon - optional start-condition prefix of a rule.  In every form the
 * value is the scon_stk_ptr from before this prefix pushed anything.
 *
 *   '<' names '>'  - pushes the named start conditions (via sconname)
 *   '<' '*' '>'    - pushes every start condition 1..lastsc that is
 *                    not already on the stack
 *   (empty)        - pushes nothing
 */
scon		:  '<' scon_stk_ptr namelist2 '>'
			{ $$ = $2; }

		|  '<' '*' '>'
			{
			$$ = scon_stk_ptr;

			for ( i = 1; i <= lastsc; ++i )
				{
				int j;

				/* Look for i on the stack; only push it if
				 * the search runs off the end.
				 */
				for ( j = 1; j <= scon_stk_ptr; ++j )
					if ( scon_stk[j] == i )
						break;

				if ( j > scon_stk_ptr )
					scon_stk[++scon_stk_ptr] = i;
				}
			}

		|
			{ $$ = scon_stk_ptr; }
		;
344 
/* namelist2 - comma-separated list of start-condition names inside
 * '<' ... '>'.  The work is done by the sconname actions.
 */
namelist2	:  namelist2 ',' sconname

		|  sconname

		|  error
			{ synerr( _("bad start condition list") ); }
		;
352 
/* sconname - a single start-condition name in a '<' ... '>' list.
 * A name for which sclookup() returns 0 is reported as undeclared.
 * Otherwise its number is pushed on scon_stk, unless it is already
 * there, in which case a "specified twice" warning is issued instead.
 */
sconname	:  NAME
			{
			if ( (scnum = sclookup( nmstr )) == 0 )
				format_pinpoint_message(
					"undeclared start condition %s",
					nmstr );
			else
				{
				for ( i = 1; i <= scon_stk_ptr; ++i )
					if ( scon_stk[i] == scnum )
						{
						format_warn(
							"<%s> specified twice",
							scname[scnum] );
						break;
						}

				/* i ran past the top: scnum was not found. */
				if ( i > scon_stk_ptr )
					scon_stk[++scon_stk_ptr] = scnum;
				}
			}
		;
375 
/* rule - the pattern part of a rule, with or without trailing context.
 *
 *   re2 re      - head (re2, i.e. "re '/'") followed by its trailing
 *                 context
 *   re2 re '$'  - error: trailing context given twice
 *   re '$'      - pattern followed by '$'; built as trailing context
 *                 consisting of a '\n' state
 *   re          - plain pattern
 *
 * The actions maintain headcnt, trailcnt, varlength and
 * variable_trail_rule, which flexrule later passes to finish_rule().
 */
rule		:  re2 re
			{
			if ( transchar[lastst[$2]] != SYM_EPSILON )
				/* Provide final transition \now/ so it
				 * will be marked as a trailing context
				 * state.
				 */
				$2 = link_machines( $2,
						mkstate( SYM_EPSILON ) );

			mark_beginning_as_normal( $2 );
			current_state_type = STATE_NORMAL;

			if ( previous_continued_action )
				{
				/* We need to treat this as variable trailing
				 * context so that the backup does not happen
				 * in the action but before the action switch
				 * statement.  If the backup happens in the
				 * action, then the rules "falling into" this
				 * one's action will *also* do the backup,
				 * erroneously.
				 */
				if ( ! varlength || headcnt != 0 )
					warn(
		"trailing context made variable due to preceding '|' action" );

				/* Mark as variable. */
				varlength = true;
				headcnt = 0;

				}

			if ( lex_compat || (varlength && headcnt == 0) )
				{ /* variable trailing context rule */
				/* Mark the first part of the rule as the
				 * accepting "head" part of a trailing
				 * context rule.
				 *
				 * By the way, we didn't do this at the
				 * beginning of this production because back
				 * then current_state_type was set up for a
				 * trail rule, and add_accept() can create
				 * a new state ...
				 */
				add_accept( $1,
					num_rules | YY_TRAILING_HEAD_MASK );
				variable_trail_rule = true;
				}

			else
				trailcnt = rulelen;

			$$ = link_machines( $1, $2 );
			}

		|  re2 re '$'
			{ synerr( _("trailing context used twice") ); }

		|  re '$'
			{
			/* The trailing context is treated as a single
			 * character of fixed length.
			 */
			headcnt = 0;
			trailcnt = 1;
			rulelen = 1;
			varlength = false;

			current_state_type = STATE_TRAILING_CONTEXT;

			if ( trlcontxt )
				{
				synerr( _("trailing context used twice") );
				/* NOTE(review): this value is overwritten by
				 * the assignment to $$ at the end of the
				 * action.
				 */
				$$ = mkstate( SYM_EPSILON );
				}

			else if ( previous_continued_action )
				{
				/* See the comment in the rule for "re2 re"
				 * above.
				 */
				warn(
		"trailing context made variable due to preceding '|' action" );

				varlength = true;
				}

			if ( lex_compat || varlength )
				{
				/* Again, see the comment in the rule for
				 * "re2 re" above.
				 */
				add_accept( $1,
					num_rules | YY_TRAILING_HEAD_MASK );
				variable_trail_rule = true;
				}

			trlcontxt = true;

			eps = mkstate( SYM_EPSILON );
			$$ = link_machines( $1,
				link_machines( eps, mkstate( '\n' ) ) );
			}

		|  re
			{
			$$ = $1;

			/* trlcontxt can already be set here by an earlier
			 * "re '/'" reduction (see re2).
			 */
			if ( trlcontxt )
				{
				if ( lex_compat || (varlength && headcnt == 0) )
					/* Both head and trail are
					 * variable-length.
					 */
					variable_trail_rule = true;
				else
					trailcnt = rulelen;
				}
			}
		;
494 
495 
/* re - a regular expression: one or more 'series' separated by '|'.
 * Any alternation marks the pattern as variable-length.
 */
re		:  re '|' series
			{
			varlength = true;
			$$ = mkor( $1, $3 );
			}

		|  series
			{ $$ = $1; }
		;
505 
506 
/* re2 - the head of a trailing-context rule: "re '/'".  The action
 * records the head's length in headcnt when it is fixed, resets
 * rulelen and varlength so they describe the trailing part that
 * follows, and switches current_state_type to
 * STATE_TRAILING_CONTEXT.
 */
re2		:  re '/'
			{
			/* This rule is written separately so the
			 * reduction will occur before the trailing
			 * series is parsed.
			 */

			if ( trlcontxt )
				synerr( _("trailing context used twice") );
			else
				trlcontxt = true;

			if ( varlength )
				/* We hope the trailing context is
				 * fixed-length.
				 */
				varlength = false;
			else
				headcnt = rulelen;

			rulelen = 0;

			current_state_type = STATE_TRAILING_CONTEXT;
			$$ = $1;
			}
		;
533 
/* series - concatenation of singletons, plus the POSIX-precedence
 * forms of the repeat operator, which apply to the whole series seen
 * so far (tokens BEGIN_REPEAT_POSIX / END_REPEAT_POSIX):
 *
 *   series {n,m}   series {n,}   series {n}
 *
 * Every repeat form sets varlength.  On bad iteration values a syntax
 * error is reported and the series is passed through unchanged.
 */
series		:  series singleton
			{
			/* This is where concatenation of adjacent patterns
			 * gets done.
			 */
			$$ = link_machines( $1, $2 );
			}

		|  singleton
			{ $$ = $1; }

		|  series BEGIN_REPEAT_POSIX NUMBER ',' NUMBER END_REPEAT_POSIX
			{
			varlength = true;

			if ( $3 > $5 || $3 < 0 )
				{
				synerr( _("bad iteration values") );
				$$ = $1;
				}
			else
				{
				if ( $3 == 0 )
					{
					if ( $5 <= 0 )
						{
						synerr(
						_("bad iteration values") );
						$$ = $1;
						}
					else
						/* {0,m} is built as an optional
						 * {1,m}.
						 */
						$$ = mkopt(
							mkrep( $1, 1, $5 ) );
					}
				else
					$$ = mkrep( $1, $3, $5 );
				}
			}

		|  series BEGIN_REPEAT_POSIX NUMBER ',' END_REPEAT_POSIX
			{
			varlength = true;

			if ( $3 <= 0 )
				{
				synerr( _("iteration value must be positive") );
				$$ = $1;
				}

			else
				$$ = mkrep( $1, $3, INFINITE_REPEAT );
			}

		|  series BEGIN_REPEAT_POSIX NUMBER END_REPEAT_POSIX
			{
			/* The series could be something like "(foo)",
			 * in which case we have no idea what its length
			 * is, so we punt here.
			 */
			varlength = true;

			if ( $3 <= 0 )
				{
				  synerr( _("iteration value must be positive")
					  );
				$$ = $1;
				}

			else
				$$ = link_machines( $1,
						copysingl( $1, $3 - 1 ) );
			}

		;
608 
/* singleton - the smallest pattern unit and its postfix operators:
 *
 *   singleton '*' / '+' / '?'        closures and option
 *   singleton {n,m} / {n,} / {n}     flex-precedence repeat (tokens
 *                                    BEGIN_REPEAT_FLEX / END_REPEAT_FLEX)
 *   '.'                              any-character class
 *   fullccl, PREVCCL                 character classes
 *   '"' string '"'                   quoted string
 *   '(' re ')'                       grouping
 *   CHAR                             single character
 *
 * The '.', fullccl, PREVCCL and CHAR forms each add one to rulelen;
 * all postfix operators set varlength.
 */
singleton	:  singleton '*'
			{
			varlength = true;

			$$ = mkclos( $1 );
			}

		|  singleton '+'
			{
			varlength = true;
			$$ = mkposcl( $1 );
			}

		|  singleton '?'
			{
			varlength = true;
			$$ = mkopt( $1 );
			}

		|  singleton BEGIN_REPEAT_FLEX NUMBER ',' NUMBER END_REPEAT_FLEX
			{
			varlength = true;

			if ( $3 > $5 || $3 < 0 )
				{
				synerr( _("bad iteration values") );
				$$ = $1;
				}
			else
				{
				if ( $3 == 0 )
					{
					if ( $5 <= 0 )
						{
						synerr(
						_("bad iteration values") );
						$$ = $1;
						}
					else
						/* {0,m} is built as an optional
						 * {1,m}.
						 */
						$$ = mkopt(
							mkrep( $1, 1, $5 ) );
					}
				else
					$$ = mkrep( $1, $3, $5 );
				}
			}

		|  singleton BEGIN_REPEAT_FLEX NUMBER ',' END_REPEAT_FLEX
			{
			varlength = true;

			if ( $3 <= 0 )
				{
				synerr( _("iteration value must be positive") );
				$$ = $1;
				}

			else
				$$ = mkrep( $1, $3, INFINITE_REPEAT );
			}

		|  singleton BEGIN_REPEAT_FLEX NUMBER END_REPEAT_FLEX
			{
			/* The singleton could be something like "(foo)",
			 * in which case we have no idea what its length
			 * is, so we punt here.
			 */
			varlength = true;

			if ( $3 <= 0 )
				{
				synerr( _("iteration value must be positive") );
				$$ = $1;
				}

			else
				$$ = link_machines( $1,
						copysingl( $1, $3 - 1 ) );
			}

		|  '.'
			{
			/* Both classes are built once, on first use, and
			 * reused afterwards (guarded by madeany).
			 */
			if ( ! madeany )
				{
				/* Create the '.' character class. */
                    ccldot = cclinit();
                    ccladd( ccldot, '\n' );
                    cclnegate( ccldot );

                    if ( useecs )
                        mkeccl( ccltbl + cclmap[ccldot],
                            ccllen[ccldot], nextecm,
                            ecgroup, csize, csize );

				/* Create the (?s:'.') character class. */
                    cclany = cclinit();
                    cclnegate( cclany );

                    if ( useecs )
                        mkeccl( ccltbl + cclmap[cclany],
                            ccllen[cclany], nextecm,
                            ecgroup, csize, csize );

				madeany = true;
				}

			++rulelen;

			/* sf_dot_all() selects the class that was built
			 * without excluding '\n'.
			 */
            if (sf_dot_all())
                $$ = mkstate( -cclany );
            else
                $$ = mkstate( -ccldot );
			}

		|  fullccl
			{
				/* Sort characters for fast searching.
				 */
				qsort( ccltbl + cclmap[$1], ccllen[$1], sizeof (*ccltbl), cclcmp );

			if ( useecs )
				mkeccl( ccltbl + cclmap[$1], ccllen[$1],
					nextecm, ecgroup, csize, csize );

			++rulelen;

			if (ccl_has_nl[$1])
				rule_has_nl[num_rules] = true;

			$$ = mkstate( -$1 );
			}

		|  PREVCCL
			{
			++rulelen;

			if (ccl_has_nl[$1])
				rule_has_nl[num_rules] = true;

			$$ = mkstate( -$1 );
			}

		|  '"' string '"'
			{ $$ = $2; }

		|  '(' re ')'
			{ $$ = $2; }

		|  CHAR
			{
			++rulelen;

			if ($1 == nlch)
				rule_has_nl[num_rules] = true;

            if (sf_case_ins() && has_case($1))
                /* create an alternation, as in (a|A) */
                $$ = mkor (mkstate($1), mkstate(reverse_case($1)));
            else
                $$ = mkstate( $1 );
			}
		;
/* fullccl - a bracketed character class, optionally combined with
 * further bracketed classes by the set-difference (CCL_OP_DIFF) or
 * set-union (CCL_OP_UNION) operators.  Both operators are declared
 * %left, so chains group left to right.  The last alternative has no
 * action, so yacc's default of $$ = $1 applies.
 */
fullccl:
        fullccl CCL_OP_DIFF  braceccl  { $$ = ccl_set_diff  ($1, $3); }
    |   fullccl CCL_OP_UNION braceccl  { $$ = ccl_set_union ($1, $3); }
    |   braceccl
    ;
776 
/* braceccl - a character class in square brackets.  The '^' form
 * negates the class built by 'ccl' before returning it.
 */
braceccl:

            '[' ccl ']' { $$ = $2; }

		|  '[' '^' ccl ']'
			{
			cclnegate( $3 );
			$$ = $3;
			}
		;
787 
/* ccl - the contents of a bracketed character class, built left to
 * right.  The empty alternative creates the class and makes it the
 * current one (currccl); the others add a range, a single character or
 * a [:class:] expression to it.
 *
 * cclsorted and lastchar track whether the characters have been added
 * in increasing order.  When sf_case_ins() is true, characters that
 * have case are added in both cases.
 */
ccl		:  ccl CHAR '-' CHAR
			{

			if (sf_case_ins())
			  {

			    /* If one end of the range has case and the other
			     * does not, or the cases are different, then we're not
			     * sure what range the user is trying to express.
			     * Examples: [@-z] or [S-t]
			     */
			    if (has_case ($2) != has_case ($4)
				     || (has_case ($2) && (b_islower ($2) != b_islower ($4)))
				     || (has_case ($2) && (b_isupper ($2) != b_isupper ($4))))
			      format_warn3 (
			      _("the character range [%c-%c] is ambiguous in a case-insensitive scanner"),
					    $2, $4);

			    /* If the range spans uppercase characters but not
			     * lowercase (or vice-versa), then should we automatically
			     * include lowercase characters in the range?
			     * Example: [@-_] spans [a-z] but not [A-Z]
			     */
			    else if (!has_case ($2) && !has_case ($4) && !range_covers_case ($2, $4))
			      format_warn3 (
			      _("the character range [%c-%c] is ambiguous in a case-insensitive scanner"),
					    $2, $4);
			  }

			if ( $2 > $4 )
				synerr( _("negative range in character class") );

			else
				{
				for ( i = $2; i <= $4; ++i )
					ccladd( $1, i );

				/* Keep track if this ccl is staying in
				 * alphabetical order.
				 */
				cclsorted = cclsorted && ($2 > lastchar);
				lastchar = $4;

                /* Do it again for upper/lowercase */
                if (sf_case_ins() && has_case($2) && has_case($4)){
                    /* $2 and $4 are overwritten with the
                     * case-reversed endpoints.
                     */
                    $2 = reverse_case ($2);
                    $4 = reverse_case ($4);

                    for ( i = $2; i <= $4; ++i )
                        ccladd( $1, i );

                    cclsorted = cclsorted && ($2 > lastchar);
                    lastchar = $4;
                }

				}

			$$ = $1;
			}

		|  ccl CHAR
			{
			ccladd( $1, $2 );
			cclsorted = cclsorted && ($2 > lastchar);
			lastchar = $2;

            /* Do it again for upper/lowercase */
            if (sf_case_ins() && has_case($2)){
                $2 = reverse_case ($2);
                ccladd ($1, $2);

                cclsorted = cclsorted && ($2 > lastchar);
                lastchar = $2;
            }

			$$ = $1;
			}

		|  ccl ccl_expr
			{
			/* Too hard to properly maintain cclsorted. */
			cclsorted = false;
			$$ = $1;
			}

		|
			{
			/* Start of a new class: it becomes currccl, which
			 * the ccl_expr macros add characters to.
			 */
			cclsorted = true;
			lastchar = 0;
			currccl = $$ = cclinit();
			}
		;
880 
/* ccl_expr - a character class expression token inside a bracketed
 * class.  Each CCE_* token adds the characters accepted by the
 * matching <ctype.h> predicate to currccl (CCL_EXPR); each CCE_NEG_*
 * token adds the characters rejected by it (CCL_NEG_EXPR).
 *
 * When sf_case_ins() is true, CCE_LOWER and CCE_UPPER add both cases,
 * while CCE_NEG_LOWER and CCE_NEG_UPPER only warn and add nothing.
 */
ccl_expr:
           CCE_ALNUM	{ CCL_EXPR(isalnum); }
		|  CCE_ALPHA	{ CCL_EXPR(isalpha); }
		|  CCE_BLANK	{ CCL_EXPR(isblank); }
		|  CCE_CNTRL	{ CCL_EXPR(iscntrl); }
		|  CCE_DIGIT	{ CCL_EXPR(isdigit); }
		|  CCE_GRAPH	{ CCL_EXPR(isgraph); }
		|  CCE_LOWER	{
                          CCL_EXPR(islower);
                          if (sf_case_ins())
                              CCL_EXPR(isupper);
                        }
		|  CCE_PRINT	{ CCL_EXPR(isprint); }
		|  CCE_PUNCT	{ CCL_EXPR(ispunct); }
		|  CCE_SPACE	{ CCL_EXPR(isspace); }
		|  CCE_XDIGIT	{ CCL_EXPR(isxdigit); }
		|  CCE_UPPER	{
                    CCL_EXPR(isupper);
                    if (sf_case_ins())
                        CCL_EXPR(islower);
				}

        |  CCE_NEG_ALNUM	{ CCL_NEG_EXPR(isalnum); }
		|  CCE_NEG_ALPHA	{ CCL_NEG_EXPR(isalpha); }
		|  CCE_NEG_BLANK	{ CCL_NEG_EXPR(isblank); }
		|  CCE_NEG_CNTRL	{ CCL_NEG_EXPR(iscntrl); }
		|  CCE_NEG_DIGIT	{ CCL_NEG_EXPR(isdigit); }
		|  CCE_NEG_GRAPH	{ CCL_NEG_EXPR(isgraph); }
		|  CCE_NEG_PRINT	{ CCL_NEG_EXPR(isprint); }
		|  CCE_NEG_PUNCT	{ CCL_NEG_EXPR(ispunct); }
		|  CCE_NEG_SPACE	{ CCL_NEG_EXPR(isspace); }
		|  CCE_NEG_XDIGIT	{ CCL_NEG_EXPR(isxdigit); }
		|  CCE_NEG_LOWER	{
				if ( sf_case_ins() )
					warn(_("[:^lower:] is ambiguous in case insensitive scanner"));
				else
					CCL_NEG_EXPR(islower);
				}
		|  CCE_NEG_UPPER	{
				if ( sf_case_ins() )
					warn(_("[:^upper:] ambiguous in case insensitive scanner"));
				else
					CCL_NEG_EXPR(isupper);
				}
		;
926 
/* string - the contents of a quoted string, built one CHAR at a time.
 * The empty alternative starts the machine with an epsilon state; each
 * CHAR adds one to rulelen and is linked onto the machine built so
 * far.  When sf_case_ins() is true, a character that has case becomes
 * an alternation of both cases.
 */
string		:  string CHAR
			{
			if ( $2 == nlch )
				rule_has_nl[num_rules] = true;

			++rulelen;

            if (sf_case_ins() && has_case($2))
                $$ = mkor (mkstate($2), mkstate(reverse_case($2)));
            else
                $$ = mkstate ($2);

			/* $$ holds the state for this character; append
			 * it to the string built so far.
			 */
			$$ = link_machines( $1, $$);
			}

		|
			{ $$ = mkstate( SYM_EPSILON ); }
		;
945 
946 %%
947 
948 
949 /* build_eof_action - build the "<<EOF>>" action for the active start
950  *                    conditions
951  */
952 
953 void build_eof_action(void)
954 	{
955 	int i;
956 	char action_text[MAXLINE];
957 
958 	for ( i = 1; i <= scon_stk_ptr; ++i )
959 		{
960 		if ( sceof[scon_stk[i]] )
961 			format_pinpoint_message(
962 				"multiple <<EOF>> rules for start condition %s",
963 				scname[scon_stk[i]] );
964 
965 		else
966 			{
967 			sceof[scon_stk[i]] = true;
968 
969 			if (previous_continued_action /* && previous action was regular */)
970 				add_action("YY_RULE_SETUP\n");
971 
972 			snprintf( action_text, sizeof(action_text), "case YY_STATE_EOF(%s):\n",
973 				scname[scon_stk[i]] );
974 			add_action( action_text );
975 			}
976 		}
977 
978 	line_directive_out( (FILE *) 0, 1 );
979 
980 	/* This isn't a normal rule after all - don't count it as
981 	 * such, so we don't have any holes in the rule numbering
982 	 * (which make generating "rule can never match" warnings
983 	 * more difficult.
984 	 */
985 	--num_rules;
986 	++num_eof_rules;
987 	}
988 
989 
990 /* format_synerr - write out formatted syntax error */
991 
format_synerr(const char * msg,const char arg[])992 void format_synerr(const char *msg, const char arg[])
993 	{
994 	char errmsg[MAXLINE];
995 
996 	(void) snprintf( errmsg, sizeof(errmsg), msg, arg );
997 	synerr( errmsg );
998 	}
999 
1000 
1001 /* synerr - report a syntax error */
1002 
synerr(const char * str)1003 void synerr(const char *str)
1004 	{
1005 	syntaxerror = true;
1006 	pinpoint_message( str );
1007 	}
1008 
1009 
1010 /* format_warn - write out formatted warning */
1011 
format_warn(const char * msg,const char arg[])1012 void format_warn(const char *msg, const char arg[])
1013 	{
1014 	char warn_msg[MAXLINE];
1015 
1016 	snprintf( warn_msg, sizeof(warn_msg), msg, arg );
1017 	warn( warn_msg );
1018 	}
1019 
1020 
1021 /* warn - report a warning, unless -w was given */
1022 
warn(const char * str)1023 void warn(const char *str)
1024 	{
1025 	line_warning( str, linenum );
1026 	}
1027 
1028 /* format_pinpoint_message - write out a message formatted with one string,
1029  *			     pinpointing its location
1030  */
1031 
format_pinpoint_message(const char * msg,const char arg[])1032 void format_pinpoint_message(const char *msg, const char arg[])
1033 	{
1034 	char errmsg[MAXLINE];
1035 
1036 	snprintf( errmsg, sizeof(errmsg), msg, arg );
1037 	pinpoint_message( errmsg );
1038 	}
1039 
1040 
1041 /* pinpoint_message - write out a message, pinpointing its location */
1042 
pinpoint_message(const char * str)1043 void pinpoint_message(const char *str)
1044 	{
1045 	line_pinpoint( str, linenum );
1046 	}
1047 
1048 
1049 /* line_warning - report a warning at a given line, unless -w was given */
1050 
line_warning(const char * str,int line)1051 void line_warning(const char *str, int line)
1052 	{
1053 	char warning[MAXLINE];
1054 
1055 	if ( ! nowarn )
1056 		{
1057 		snprintf( warning, sizeof(warning), "warning, %s", str );
1058 		line_pinpoint( warning, line );
1059 		}
1060 	}
1061 
1062 
1063 /* line_pinpoint - write out a message, pinpointing it at the given line */
1064 
line_pinpoint(const char * str,int line)1065 void line_pinpoint(const char *str, int line)
1066 	{
1067 	fprintf( stderr, "%s:%d: %s\n", infilename, line, str );
1068 	}
1069 
1070 
/* yyerror - eat up an error message from the parser;
 *	     currently, messages are ignored
 *
 * The grammar reports syntax errors itself, through synerr() in the
 * actions of its 'error' productions, so the parser's own message is
 * dropped here.
 */

void yyerror(const char *msg)
	{
	/* Intentionally unused; the cast silences unused-parameter
	 * warnings without changing behavior.
	 */
	(void) msg;
	}
1078