xref: /386bsd/usr/src/usr.bin/lex/parse.y (revision a2142627)
1 /* parse.y - parser for flex input */
2 
3 %token CHAR NUMBER SECTEND SCDECL XSCDECL WHITESPACE NAME PREVCCL EOF_OP
4 
5 %{
6 /*-
7  * Copyright (c) 1990 The Regents of the University of California.
8  * All rights reserved.
9  *
10  * This code is derived from software contributed to Berkeley by
11  * Vern Paxson.
12  *
13  * The United States Government has rights in this work pursuant
14  * to contract no. DE-AC03-76SF00098 between the United States
15  * Department of Energy and the University of California.
16  *
17  * Redistribution and use in source and binary forms are permitted provided
18  * that: (1) source distributions retain this entire copyright notice and
19  * comment, and (2) distributions including binaries display the following
20  * acknowledgement:  ``This product includes software developed by the
21  * University of California, Berkeley and its contributors'' in the
22  * documentation or other materials provided with the distribution and in
23  * all advertising materials mentioning features or use of this software.
24  * Neither the name of the University nor the names of its contributors may
25  * be used to endorse or promote products derived from this software without
26  * specific prior written permission.
27  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED
28  * WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
29  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
30  */
31 
32 /* $Header: /home/daffy/u0/vern/flex/RCS/parse.y,v 2.15 93/12/09 13:57:23 vern Exp $ */
33 
34 
35 /* Some versions of bison are broken in that they use alloca() but don't
36  * declare it properly.  The following is the patented (just kidding!)
37  * #ifdef chud to fix the problem, courtesy of Francois Pinard.
38  */
39 #ifdef YYBISON
40 /* AIX requires this to be the first thing in the file.  */
41 #ifdef __GNUC__
42 #define alloca __builtin_alloca
43 #else /* not __GNUC__ */
44 #if HAVE_ALLOCA_H
45 #include <alloca.h>
46 #else /* not HAVE_ALLOCA_H */
47 #ifdef _AIX
48  #pragma alloca
49 #else /* not _AIX */
50 char *alloca ();
51 #endif /* not _AIX */
52 #endif /* not HAVE_ALLOCA_H */
53 #endif /* not __GNUC__ */
54 #endif /* YYBISON */
55 
56 /* Bletch, ^^^^ that was ugly! */
57 
58 
59 #include "flexdef.h"
60 
61 int pat, scnum, eps, headcnt, trailcnt, anyccl, lastchar, i, actvp, rulelen;
62 int trlcontxt, xcluflg, cclsorted, varlength, variable_trail_rule;
63 int *active_ss;
64 Char clower();
65 void build_eof_action();
66 void yyerror();
67 
68 static int madeany = false;  /* whether we've made the '.' character class */
69 int previous_continued_action;	/* whether the previous rule's action was '|' */
70 
71 /* On some over-ambitious machines, such as DEC Alpha's, the default
72  * token type is "long" instead of "int"; this leads to problems with
73  * declaring yylval in flexdef.h.  But so far, all the yacc's I've seen
74  * wrap their definitions of YYSTYPE with "#ifndef YYSTYPE"'s, so the
75  * following should ensure that the default token type is "int".
76  */
77 #define YYSTYPE int
78 
79 %}
80 
81 %%
82 goal		:  initlex sect1 sect1end sect2 initforrule
83 			{ /* add default rule */
84 			int def_rule;
85 
86 			pat = cclinit();
87 			cclnegate( pat );
88 
89 			def_rule = mkstate( -pat );
90 
91 			/* Remember the number of the default rule so we
92 			 * don't generate "can't match" warnings for it.
93 			 */
94 			default_rule = num_rules;
95 
96 			finish_rule( def_rule, false, 0, 0 );
97 
98 			for ( i = 1; i <= lastsc; ++i )
99 				scset[i] = mkbranch( scset[i], def_rule );
100 
101 			if ( spprdflt )
102 				add_action(
103 				"YY_FATAL_ERROR( \"flex scanner jammed\" )" );
104 			else
105 				add_action( "ECHO" );
106 
107 			add_action( ";\n\tYY_BREAK\n" );
108 			}
109 		;
110 
111 initlex		:
112 			{ /* initialize for processing rules */
113 
114 			/* Create default DFA start condition. */
115 			scinstal( "INITIAL", false );
116 
117 			/* Initially, the start condition scoping is
118 			 * "no start conditions active".
119 			 */
120 			actvp = 0;
121 			}
122 		;
123 
124 sect1		:  sect1 startconddecl WHITESPACE namelist1 '\n'
125 		|
126 		|  error '\n'
127 			{ synerr( "unknown error processing section 1" ); }
128 		;
129 
130 sect1end	:  SECTEND
131 			{
132 			/* We now know how many start conditions there
133 			 * are, so create the "activity" map indicating
134 			 * which conditions are active.
135 			 */
136 			active_ss = allocate_integer_array( lastsc + 1 );
137 
138 			for ( i = 1; i <= lastsc; ++i )
139 				active_ss[i] = 0;
140 			}
141 		;
142 
143 startconddecl	:  SCDECL
144 			{ xcluflg = false; }
145 
146 		|  XSCDECL
147 			{ xcluflg = true; }
148 		;
149 
150 namelist1	:  namelist1 WHITESPACE NAME
151 			{ scinstal( nmstr, xcluflg ); }
152 
153 		|  NAME
154 			{ scinstal( nmstr, xcluflg ); }
155 
156 		|  error
157 			{ synerr( "bad start condition list" ); }
158 		;
159 
160 sect2		:  sect2 initforrule flexrule '\n'
161 		|
162 		;
163 
164 initforrule	:
165 			{
166 			/* Initialize for a parse of one rule. */
167 			trlcontxt = variable_trail_rule = varlength = false;
168 			trailcnt = headcnt = rulelen = 0;
169 			current_state_type = STATE_NORMAL;
170 			previous_continued_action = continued_action;
171 			new_rule();
172 			}
173 		;
174 
175 flexrule	:  scon '^' rule
176 			{
177 			pat = $3;
178 			finish_rule( pat, variable_trail_rule,
179 				headcnt, trailcnt );
180 
181 			for ( i = 1; i <= actvp; ++i )
182 				scbol[actvsc[i]] =
183 					mkbranch( scbol[actvsc[i]], pat );
184 
185 			if ( ! bol_needed )
186 				{
187 				bol_needed = true;
188 
189 				if ( performance_report > 1 )
190 					pinpoint_message(
191 			"'^' operator results in sub-optimal performance" );
192 				}
193 			}
194 
195 		|  scon rule
196 			{
197 			pat = $2;
198 			finish_rule( pat, variable_trail_rule,
199 				headcnt, trailcnt );
200 
201 			for ( i = 1; i <= actvp; ++i )
202 				scset[actvsc[i]] =
203 					mkbranch( scset[actvsc[i]], pat );
204 			}
205 
206 		|  '^' rule
207 			{
208 			pat = $2;
209 			finish_rule( pat, variable_trail_rule,
210 				headcnt, trailcnt );
211 
212 			/* Add to all non-exclusive start conditions,
213 			 * including the default (0) start condition.
214 			 */
215 
216 			for ( i = 1; i <= lastsc; ++i )
217 				if ( ! scxclu[i] )
218 					scbol[i] = mkbranch( scbol[i], pat );
219 
220 			if ( ! bol_needed )
221 				{
222 				bol_needed = true;
223 
224 				if ( performance_report > 1 )
225 					pinpoint_message(
226 			"'^' operator results in sub-optimal performance" );
227 				}
228 			}
229 
230 		|  rule
231 			{
232 			pat = $1;
233 			finish_rule( pat, variable_trail_rule,
234 				headcnt, trailcnt );
235 
236 			for ( i = 1; i <= lastsc; ++i )
237 				if ( ! scxclu[i] )
238 					scset[i] = mkbranch( scset[i], pat );
239 			}
240 
241 		|  scon EOF_OP
242 			{ build_eof_action(); }
243 
244 		|  EOF_OP
245 			{
246 			/* This EOF applies to all start conditions
247 			 * which don't already have EOF actions.
248 			 */
249 			actvp = 0;
250 
251 			for ( i = 1; i <= lastsc; ++i )
252 				if ( ! sceof[i] )
253 					actvsc[++actvp] = i;
254 
255 			if ( actvp == 0 )
256 				warn(
257 			"all start conditions already have <<EOF>> rules" );
258 
259 			else
260 				build_eof_action();
261 			}
262 
263 		|  error
264 			{ synerr( "unrecognized rule" ); }
265 		;
266 
267 scon		:  '<' namelist2 '>'
268 
269 		|  '<' '*' '>'
270 			{
271 			actvp = 0;
272 
273 			for ( i = 1; i <= lastsc; ++i )
274 				actvsc[++actvp] = i;
275 			}
276 		;
277 
278 namelist2	:  namelist2 ',' sconname
279 
280 		|  { actvp = 0; } sconname
281 
282 		|  error
283 			{ synerr( "bad start condition list" ); }
284 		;
285 
286 sconname	:  NAME
287 			{
288 			if ( (scnum = sclookup( nmstr )) == 0 )
289 				format_pinpoint_message(
290 					"undeclared start condition %s",
291 					nmstr );
292 			else
293 				{
294 				if ( ++actvp >= current_max_scs )
295 					/* Some bozo has included multiple
296 					 * instances of start condition names.
297 					 */
298 					pinpoint_message(
299 				"too many start conditions in <> construct!" );
300 
301 				else
302 					actvsc[actvp] = scnum;
303 				}
304 			}
305 		;
306 
307 rule		:  re2 re
308 			{
309 			if ( transchar[lastst[$2]] != SYM_EPSILON )
310 				/* Provide final transition \now/ so it
311 				 * will be marked as a trailing context
312 				 * state.
313 				 */
314 				$2 = link_machines( $2,
315 						mkstate( SYM_EPSILON ) );
316 
317 			mark_beginning_as_normal( $2 );
318 			current_state_type = STATE_NORMAL;
319 
320 			if ( previous_continued_action )
321 				{
322 				/* We need to treat this as variable trailing
323 				 * context so that the backup does not happen
324 				 * in the action but before the action switch
325 				 * statement.  If the backup happens in the
326 				 * action, then the rules "falling into" this
327 				 * one's action will *also* do the backup,
328 				 * erroneously.
329 				 */
330 				if ( ! varlength || headcnt != 0 )
331 					warn(
332 		"trailing context made variable due to preceding '|' action" );
333 
334 				/* Mark as variable. */
335 				varlength = true;
336 				headcnt = 0;
337 				}
338 
339 			if ( lex_compat || (varlength && headcnt == 0) )
340 				{ /* variable trailing context rule */
341 				/* Mark the first part of the rule as the
342 				 * accepting "head" part of a trailing
343 				 * context rule.
344 				 *
345 				 * By the way, we didn't do this at the
346 				 * beginning of this production because back
347 				 * then current_state_type was set up for a
348 				 * trail rule, and add_accept() can create
349 				 * a new state ...
350 				 */
351 				add_accept( $1,
352 					num_rules | YY_TRAILING_HEAD_MASK );
353 				variable_trail_rule = true;
354 				}
355 
356 			else
357 				trailcnt = rulelen;
358 
359 			$$ = link_machines( $1, $2 );
360 			}
361 
362 		|  re2 re '$'
363 			{ synerr( "trailing context used twice" ); }
364 
365 		|  re '$'
366 			{
367 			headcnt = 0;
368 			trailcnt = 1;
369 			rulelen = 1;
370 			varlength = false;
371 
372 			current_state_type = STATE_TRAILING_CONTEXT;
373 
374 			if ( trlcontxt )
375 				{
376 				synerr( "trailing context used twice" );
377 				$$ = mkstate( SYM_EPSILON );
378 				}
379 
380 			else if ( previous_continued_action )
381 				{
382 				/* See the comment in the rule for "re2 re"
383 				 * above.
384 				 */
385 				warn(
386 		"trailing context made variable due to preceding '|' action" );
387 
388 				varlength = true;
389 				}
390 
391 			if ( lex_compat || varlength )
392 				{
393 				/* Again, see the comment in the rule for
394 				 * "re2 re" above.
395 				 */
396 				add_accept( $1,
397 					num_rules | YY_TRAILING_HEAD_MASK );
398 				variable_trail_rule = true;
399 				}
400 
401 			trlcontxt = true;
402 
403 			eps = mkstate( SYM_EPSILON );
404 			$$ = link_machines( $1,
405 				link_machines( eps, mkstate( '\n' ) ) );
406 			}
407 
408 		|  re
409 			{
410 			$$ = $1;
411 
412 			if ( trlcontxt )
413 				{
414 				if ( lex_compat || (varlength && headcnt == 0) )
415 					/* Both head and trail are
416 					 * variable-length.
417 					 */
418 					variable_trail_rule = true;
419 				else
420 					trailcnt = rulelen;
421 				}
422 			}
423 		;
424 
425 
426 re		:  re '|' series
427 			{
428 			varlength = true;
429 			$$ = mkor( $1, $3 );
430 			}
431 
432 		|  series
433 			{ $$ = $1; }
434 		;
435 
436 
437 re2		:  re '/'
438 			{
439 			/* This rule is written separately so the
440 			 * reduction will occur before the trailing
441 			 * series is parsed.
442 			 */
443 
444 			if ( trlcontxt )
445 				synerr( "trailing context used twice" );
446 			else
447 				trlcontxt = true;
448 
449 			if ( varlength )
450 				/* We hope the trailing context is
451 				 * fixed-length.
452 				 */
453 				varlength = false;
454 			else
455 				headcnt = rulelen;
456 
457 			rulelen = 0;
458 
459 			current_state_type = STATE_TRAILING_CONTEXT;
460 			$$ = $1;
461 			}
462 		;
463 
464 series		:  series singleton
465 			{
466 			/* This is where concatenation of adjacent patterns
467 			 * gets done.
468 			 */
469 			$$ = link_machines( $1, $2 );
470 			}
471 
472 		|  singleton
473 			{ $$ = $1; }
474 		;
475 
476 singleton	:  singleton '*'
477 			{
478 			varlength = true;
479 
480 			$$ = mkclos( $1 );
481 			}
482 
483 		|  singleton '+'
484 			{
485 			varlength = true;
486 			$$ = mkposcl( $1 );
487 			}
488 
489 		|  singleton '?'
490 			{
491 			varlength = true;
492 			$$ = mkopt( $1 );
493 			}
494 
495 		|  singleton '{' NUMBER ',' NUMBER '}'
496 			{
497 			varlength = true;
498 
499 			if ( $3 > $5 || $3 < 0 )
500 				{
501 				synerr( "bad iteration values" );
502 				$$ = $1;
503 				}
504 			else
505 				{
506 				if ( $3 == 0 )
507 					{
508 					if ( $5 <= 0 )
509 						{
510 						synerr(
511 						"bad iteration values" );
512 						$$ = $1;
513 						}
514 					else
515 						$$ = mkopt(
516 							mkrep( $1, 1, $5 ) );
517 					}
518 				else
519 					$$ = mkrep( $1, $3, $5 );
520 				}
521 			}
522 
523 		|  singleton '{' NUMBER ',' '}'
524 			{
525 			varlength = true;
526 
527 			if ( $3 <= 0 )
528 				{
529 				synerr( "iteration value must be positive" );
530 				$$ = $1;
531 				}
532 
533 			else
534 				$$ = mkrep( $1, $3, INFINITY );
535 			}
536 
537 		|  singleton '{' NUMBER '}'
538 			{
539 			/* The singleton could be something like "(foo)",
540 			 * in which case we have no idea what its length
541 			 * is, so we punt here.
542 			 */
543 			varlength = true;
544 
545 			if ( $3 <= 0 )
546 				{
547 				synerr( "iteration value must be positive" );
548 				$$ = $1;
549 				}
550 
551 			else
552 				$$ = link_machines( $1,
553 						copysingl( $1, $3 - 1 ) );
554 			}
555 
556 		|  '.'
557 			{
558 			if ( ! madeany )
559 				{
560 				/* Create the '.' character class. */
561 				anyccl = cclinit();
562 				ccladd( anyccl, '\n' );
563 				cclnegate( anyccl );
564 
565 				if ( useecs )
566 					mkeccl( ccltbl + cclmap[anyccl],
567 						ccllen[anyccl], nextecm,
568 						ecgroup, csize, csize );
569 
570 				madeany = true;
571 				}
572 
573 			++rulelen;
574 
575 			$$ = mkstate( -anyccl );
576 			}
577 
578 		|  fullccl
579 			{
580 			if ( ! cclsorted )
581 				/* Sort characters for fast searching.  We
582 				 * use a shell sort since this list could
583 				 * be large.
584 				 */
585 				cshell( ccltbl + cclmap[$1], ccllen[$1], true );
586 
587 			if ( useecs )
588 				mkeccl( ccltbl + cclmap[$1], ccllen[$1],
589 					nextecm, ecgroup, csize, csize );
590 
591 			++rulelen;
592 
593 			$$ = mkstate( -$1 );
594 			}
595 
596 		|  PREVCCL
597 			{
598 			++rulelen;
599 
600 			$$ = mkstate( -$1 );
601 			}
602 
603 		|  '"' string '"'
604 			{ $$ = $2; }
605 
606 		|  '(' re ')'
607 			{ $$ = $2; }
608 
609 		|  CHAR
610 			{
611 			++rulelen;
612 
613 			if ( caseins && $1 >= 'A' && $1 <= 'Z' )
614 				$1 = clower( $1 );
615 
616 			$$ = mkstate( $1 );
617 			}
618 		;
619 
620 fullccl		:  '[' ccl ']'
621 			{ $$ = $2; }
622 
623 		|  '[' '^' ccl ']'
624 			{
625 			cclnegate( $3 );
626 			$$ = $3;
627 			}
628 		;
629 
630 ccl		:  ccl CHAR '-' CHAR
631 			{
632 			if ( caseins )
633 				{
634 				if ( $2 >= 'A' && $2 <= 'Z' )
635 					$2 = clower( $2 );
636 				if ( $4 >= 'A' && $4 <= 'Z' )
637 					$4 = clower( $4 );
638 				}
639 
640 			if ( $2 > $4 )
641 				synerr( "negative range in character class" );
642 
643 			else
644 				{
645 				for ( i = $2; i <= $4; ++i )
646 					ccladd( $1, i );
647 
648 				/* Keep track if this ccl is staying in
649 				 * alphabetical order.
650 				 */
651 				cclsorted = cclsorted && ($2 > lastchar);
652 				lastchar = $4;
653 				}
654 
655 			$$ = $1;
656 			}
657 
658 		|  ccl CHAR
659 			{
660 			if ( caseins && $2 >= 'A' && $2 <= 'Z' )
661 				$2 = clower( $2 );
662 
663 			ccladd( $1, $2 );
664 			cclsorted = cclsorted && ($2 > lastchar);
665 			lastchar = $2;
666 			$$ = $1;
667 			}
668 
669 		|
670 			{
671 			cclsorted = true;
672 			lastchar = 0;
673 			$$ = cclinit();
674 			}
675 		;
676 
677 string		:  string CHAR
678 			{
679 			if ( caseins && $2 >= 'A' && $2 <= 'Z' )
680 				$2 = clower( $2 );
681 
682 			++rulelen;
683 
684 			$$ = link_machines( $1, mkstate( $2 ) );
685 			}
686 
687 		|
688 			{ $$ = mkstate( SYM_EPSILON ); }
689 		;
690 
691 %%
692 
693 
694 /* build_eof_action - build the "<<EOF>>" action for the active start
695  *                    conditions
696  */
697 
698 void build_eof_action()
699 	{
700 	register int i;
701 	char action_text[MAXLINE];
702 
703 	for ( i = 1; i <= actvp; ++i )
704 		{
705 		if ( sceof[actvsc[i]] )
706 			format_pinpoint_message(
707 				"multiple <<EOF>> rules for start condition %s",
708 				scname[actvsc[i]] );
709 
710 		else
711 			{
712 			sceof[actvsc[i]] = true;
713 			sprintf( action_text, "case YY_STATE_EOF(%s):\n",
714 			scname[actvsc[i]] );
715 			add_action( action_text );
716 			}
717 		}
718 
719 	line_directive_out( (FILE *) 0 );
720 
721 	/* This isn't a normal rule after all - don't count it as
722 	 * such, so we don't have any holes in the rule numbering
723 	 * (which make generating "rule can never match" warnings
724 	 * more difficult.
725 	 */
726 	--num_rules;
727 	++num_eof_rules;
728 	}
729 
730 
731 /* format_synerr - write out formatted syntax error */
732 
format_synerr(msg,arg)733 void format_synerr( msg, arg )
734 char msg[], arg[];
735 	{
736 	char errmsg[MAXLINE];
737 
738 	(void) sprintf( errmsg, msg, arg );
739 	synerr( errmsg );
740 	}
741 
742 
743 /* synerr - report a syntax error */
744 
synerr(str)745 void synerr( str )
746 char str[];
747 	{
748 	syntaxerror = true;
749 	pinpoint_message( str );
750 	}
751 
752 
753 /* warn - report a warning, unless -w was given */
754 
warn(str)755 void warn( str )
756 char str[];
757 	{
758 	line_warning( str, linenum );
759 	}
760 
761 /* format_pinpoint_message - write out a message formatted with one string,
762  *			     pinpointing its location
763  */
764 
format_pinpoint_message(msg,arg)765 void format_pinpoint_message( msg, arg )
766 char msg[], arg[];
767 	{
768 	char errmsg[MAXLINE];
769 
770 	(void) sprintf( errmsg, msg, arg );
771 	pinpoint_message( errmsg );
772 	}
773 
774 
775 /* pinpoint_message - write out a message, pinpointing its location */
776 
pinpoint_message(str)777 void pinpoint_message( str )
778 char str[];
779 	{
780 	line_pinpoint( str, linenum );
781 	}
782 
783 
784 /* line_warning - report a warning at a given line, unless -w was given */
785 
line_warning(str,line)786 void line_warning( str, line )
787 char str[];
788 int line;
789 	{
790 	char warning[MAXLINE];
791 
792 	if ( ! nowarn )
793 		{
794 		sprintf( warning, "warning, %s", str );
795 		line_pinpoint( warning, line );
796 		}
797 	}
798 
799 
800 /* line_pinpoint - write out a message, pinpointing it at the given line */
801 
line_pinpoint(str,line)802 void line_pinpoint( str, line )
803 char str[];
804 int line;
805 	{
806 	fprintf( stderr, "\"%s\", line %d: %s\n", infilename, line, str );
807 	}
808 
809 
810 /* yyerror - eat up an error message from the parser;
811  *	     currently, messages are ignore
812  */
813 
yyerror(msg)814 void yyerror( msg )
815 char msg[];
816 	{
817 	}
818