xref: /openbsd/usr.bin/lex/nfa.c (revision 4bdff4be)
1 /*	$OpenBSD: nfa.c,v 1.11 2015/11/19 22:52:40 tedu Exp $	*/
2 
3 /* nfa - NFA construction routines */
4 
5 /*  Copyright (c) 1990 The Regents of the University of California. */
6 /*  All rights reserved. */
7 
8 /*  This code is derived from software contributed to Berkeley by */
9 /*  Vern Paxson. */
10 
11 /*  The United States Government has rights in this work pursuant */
12 /*  to contract no. DE-AC03-76SF00098 between the United States */
13 /*  Department of Energy and the University of California. */
14 
15 /*  This file is part of flex. */
16 
17 /*  Redistribution and use in source and binary forms, with or without */
18 /*  modification, are permitted provided that the following conditions */
19 /*  are met: */
20 
21 /*  1. Redistributions of source code must retain the above copyright */
22 /*     notice, this list of conditions and the following disclaimer. */
23 /*  2. Redistributions in binary form must reproduce the above copyright */
24 /*     notice, this list of conditions and the following disclaimer in the */
25 /*     documentation and/or other materials provided with the distribution. */
26 
27 /*  Neither the name of the University nor the names of its contributors */
28 /*  may be used to endorse or promote products derived from this software */
29 /*  without specific prior written permission. */
30 
31 /*  THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR */
32 /*  IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED */
33 /*  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR */
34 /*  PURPOSE. */
35 
36 #include "flexdef.h"
37 
38 
39 /* declare functions that have forward references */
40 
41 int dupmachine PROTO((int));
42 void mkxtion PROTO((int, int));
43 
44 
45 /* add_accept - add an accepting state to a machine
46  *
47  * accepting_number becomes mach's accepting number.
48  */
49 
50 void
51 add_accept(mach, accepting_number)
52 	int mach, accepting_number;
53 {
54 	/*
55 	 * Hang the accepting number off an epsilon state.  if it is
56 	 * associated with a state that has a non-epsilon out-transition,
57 	 * then the state will accept BEFORE it makes that transition, i.e.,
58 	 * one character too soon.
59 	 */
60 
61 	if (transchar[finalst[mach]] == SYM_EPSILON)
62 		accptnum[finalst[mach]] = accepting_number;
63 
64 	else {
65 		int astate = mkstate(SYM_EPSILON);
66 
67 		accptnum[astate] = accepting_number;
68 		(void) link_machines(mach, astate);
69 	}
70 }
71 
72 
73 /* copysingl - make a given number of copies of a singleton machine
74  *
75  * synopsis
76  *
77  *   newsng = copysingl( singl, num );
78  *
79  *     newsng - a new singleton composed of num copies of singl
80  *     singl  - a singleton machine
81  *     num    - the number of copies of singl to be present in newsng
82  */
83 
84 int
85 copysingl(singl, num)
86 	int singl, num;
87 {
88 	int copy, i;
89 
90 	copy = mkstate(SYM_EPSILON);
91 
92 	for (i = 1; i <= num; ++i)
93 		copy = link_machines(copy, dupmachine(singl));
94 
95 	return copy;
96 }
97 
98 
99 /* dumpnfa - debugging routine to write out an nfa */
100 
101 void
102 dumpnfa(state1)
103 	int state1;
104 
105 {
106 	int sym, tsp1, tsp2, anum, ns;
107 
108 	fprintf(stderr,
109 	    _
110 	    ("\n\n********** beginning dump of nfa with start state %d\n"),
111 	    state1);
112 
113 	/*
114 	 * We probably should loop starting at firstst[state1] and going to
115 	 * lastst[state1], but they're not maintained properly when we "or"
116 	 * all of the rules together.  So we use our knowledge that the
117 	 * machine starts at state 1 and ends at lastnfa.
118 	 */
119 
120 	/* for ( ns = firstst[state1]; ns <= lastst[state1]; ++ns ) */
121 	for (ns = 1; ns <= lastnfa; ++ns) {
122 		fprintf(stderr, _("state # %4d\t"), ns);
123 
124 		sym = transchar[ns];
125 		tsp1 = trans1[ns];
126 		tsp2 = trans2[ns];
127 		anum = accptnum[ns];
128 
129 		fprintf(stderr, "%3d:  %4d, %4d", sym, tsp1, tsp2);
130 
131 		if (anum != NIL)
132 			fprintf(stderr, "  [%d]", anum);
133 
134 		fprintf(stderr, "\n");
135 	}
136 
137 	fprintf(stderr, _("********** end of dump\n"));
138 }
139 
140 
141 /* dupmachine - make a duplicate of a given machine
142  *
143  * synopsis
144  *
145  *   copy = dupmachine( mach );
146  *
147  *     copy - holds duplicate of mach
148  *     mach - machine to be duplicated
149  *
150  * note that the copy of mach is NOT an exact duplicate; rather, all the
151  * transition states values are adjusted so that the copy is self-contained,
152  * as the original should have been.
153  *
154  * also note that the original MUST be contiguous, with its low and high
155  * states accessible by the arrays firstst and lastst
156  */
157 
158 int
159 dupmachine(mach)
160 	int mach;
161 {
162 	int i, init, state_offset;
163 	int state = 0;
164 	int last = lastst[mach];
165 
166 	for (i = firstst[mach]; i <= last; ++i) {
167 		state = mkstate(transchar[i]);
168 
169 		if (trans1[i] != NO_TRANSITION) {
170 			mkxtion(finalst[state], trans1[i] + state - i);
171 
172 			if (transchar[i] == SYM_EPSILON &&
173 			    trans2[i] != NO_TRANSITION)
174 				mkxtion(finalst[state],
175 				    trans2[i] + state - i);
176 		}
177 		accptnum[state] = accptnum[i];
178 	}
179 
180 	if (state == 0)
181 		flexfatal(_("empty machine in dupmachine()"));
182 
183 	state_offset = state - i + 1;
184 
185 	init = mach + state_offset;
186 	firstst[init] = firstst[mach] + state_offset;
187 	finalst[init] = finalst[mach] + state_offset;
188 	lastst[init] = lastst[mach] + state_offset;
189 
190 	return init;
191 }
192 
193 
194 /* finish_rule - finish up the processing for a rule
195  *
196  * An accepting number is added to the given machine.  If variable_trail_rule
197  * is true then the rule has trailing context and both the head and trail
198  * are variable size.  Otherwise if headcnt or trailcnt is non-zero then
199  * the machine recognizes a pattern with trailing context and headcnt is
200  * the number of characters in the matched part of the pattern, or zero
201  * if the matched part has variable length.  trailcnt is the number of
202  * trailing context characters in the pattern, or zero if the trailing
203  * context has variable length.
204  */
205 
206 void
207 finish_rule(mach, variable_trail_rule, headcnt, trailcnt,
208     pcont_act)
209 	int mach, variable_trail_rule, headcnt, trailcnt, pcont_act;
210 {
211 	char action_text[MAXLINE];
212 
213 	add_accept(mach, num_rules);
214 
215 	/*
216 	 * We did this in new_rule(), but it often gets the wrong number
217 	 * because we do it before we start parsing the current rule.
218 	 */
219 	rule_linenum[num_rules] = linenum;
220 
221 	/*
222 	 * If this is a continued action, then the line-number has already
223 	 * been updated, giving us the wrong number.
224 	 */
225 	if (continued_action)
226 		--rule_linenum[num_rules];
227 
228 
229 	/*
230 	 * If the previous rule was continued action, then we inherit the
231 	 * previous newline flag, possibly overriding the current one.
232 	 */
233 	if (pcont_act && rule_has_nl[num_rules - 1])
234 		rule_has_nl[num_rules] = true;
235 
236 	snprintf(action_text, sizeof(action_text), "case %d:\n", num_rules);
237 	add_action(action_text);
238 	if (rule_has_nl[num_rules]) {
239 		snprintf(action_text, sizeof(action_text), "/* rule %d can match eol */\n",
240 		    num_rules);
241 		add_action(action_text);
242 	}
243 	if (variable_trail_rule) {
244 		rule_type[num_rules] = RULE_VARIABLE;
245 
246 		if (performance_report > 0)
247 			fprintf(stderr,
248 			    _
249 			    ("Variable trailing context rule at line %d\n"),
250 			    rule_linenum[num_rules]);
251 
252 		variable_trailing_context_rules = true;
253 	} else {
254 		rule_type[num_rules] = RULE_NORMAL;
255 
256 		if (headcnt > 0 || trailcnt > 0) {
257 			/*
258 			 * Do trailing context magic to not match the
259 			 * trailing characters.
260 			 */
261 			char *scanner_cp = "YY_G(yy_c_buf_p) = yy_cp";
262 			char *scanner_bp = "yy_bp";
263 
264 			add_action
265 			    ("*yy_cp = YY_G(yy_hold_char); /* undo effects of setting up yytext */\n");
266 
267 			if (headcnt > 0) {
268 				if (rule_has_nl[num_rules]) {
269 					snprintf(action_text, sizeof(action_text),
270 					    "YY_LINENO_REWIND_TO(%s + %d);\n", scanner_bp, headcnt);
271 					add_action(action_text);
272 				}
273 				snprintf(action_text, sizeof(action_text), "%s = %s + %d;\n",
274 				    scanner_cp, scanner_bp, headcnt);
275 				add_action(action_text);
276 			} else {
277 				if (rule_has_nl[num_rules]) {
278 					snprintf(action_text, sizeof(action_text),
279 					    "YY_LINENO_REWIND_TO(yy_cp - %d);\n", trailcnt);
280 					add_action(action_text);
281 				}
282 				snprintf(action_text, sizeof(action_text), "%s -= %d;\n",
283 				    scanner_cp, trailcnt);
284 				add_action(action_text);
285 			}
286 
287 			add_action
288 			    ("YY_DO_BEFORE_ACTION; /* set up yytext again */\n");
289 		}
290 	}
291 
292 	/*
293 	 * Okay, in the action code at this point yytext and yyleng have
294 	 * their proper final values for this rule, so here's the point to do
295 	 * any user action.  But don't do it for continued actions, as
296 	 * that'll result in multiple YY_RULE_SETUP's.
297 	 */
298 	if (!continued_action)
299 		add_action("YY_RULE_SETUP\n");
300 
301 	line_directive_out((FILE *) 0, 1);
302 }
303 
304 
305 /* link_machines - connect two machines together
306  *
307  * synopsis
308  *
309  *   new = link_machines( first, last );
310  *
311  *     new    - a machine constructed by connecting first to last
312  *     first  - the machine whose successor is to be last
313  *     last   - the machine whose predecessor is to be first
314  *
315  * note: this routine concatenates the machine first with the machine
316  *  last to produce a machine new which will pattern-match first first
317  *  and then last, and will fail if either of the sub-patterns fails.
318  *  FIRST is set to new by the operation.  last is unmolested.
319  */
320 
321 int
322 link_machines(first, last)
323 	int first, last;
324 {
325 	if (first == NIL)
326 		return last;
327 
328 	else if (last == NIL)
329 		return first;
330 
331 	else {
332 		mkxtion(finalst[first], last);
333 		finalst[first] = finalst[last];
334 		lastst[first] = MAX(lastst[first], lastst[last]);
335 		firstst[first] = MIN(firstst[first], firstst[last]);
336 
337 		return first;
338 	}
339 }
340 
341 
342 /* mark_beginning_as_normal - mark each "beginning" state in a machine
343  *                            as being a "normal" (i.e., not trailing context-
344  *                            associated) states
345  *
346  * The "beginning" states are the epsilon closure of the first state
347  */
348 
349 void
350 mark_beginning_as_normal(mach)
351 	int mach;
352 {
353 	switch (state_type[mach]) {
354 	case STATE_NORMAL:
355 		/* Oh, we've already visited here. */
356 		return;
357 
358 	case STATE_TRAILING_CONTEXT:
359 		state_type[mach] = STATE_NORMAL;
360 
361 		if (transchar[mach] == SYM_EPSILON) {
362 			if (trans1[mach] != NO_TRANSITION)
363 				mark_beginning_as_normal(trans1[mach]);
364 
365 			if (trans2[mach] != NO_TRANSITION)
366 				mark_beginning_as_normal(trans2[mach]);
367 		}
368 		break;
369 
370 	default:
371 		flexerror(_
372 		    ("bad state type in mark_beginning_as_normal()"));
373 		break;
374 	}
375 }
376 
377 
378 /* mkbranch - make a machine that branches to two machines
379  *
380  * synopsis
381  *
382  *   branch = mkbranch( first, second );
383  *
384  *     branch - a machine which matches either first's pattern or second's
385  *     first, second - machines whose patterns are to be or'ed (the | operator)
386  *
387  * Note that first and second are NEITHER destroyed by the operation.  Also,
388  * the resulting machine CANNOT be used with any other "mk" operation except
389  * more mkbranch's.  Compare with mkor()
390  */
391 
392 int
393 mkbranch(first, second)
394 	int first, second;
395 {
396 	int eps;
397 
398 	if (first == NO_TRANSITION)
399 		return second;
400 
401 	else if (second == NO_TRANSITION)
402 		return first;
403 
404 	eps = mkstate(SYM_EPSILON);
405 
406 	mkxtion(eps, first);
407 	mkxtion(eps, second);
408 
409 	return eps;
410 }
411 
412 
413 /* mkclos - convert a machine into a closure
414  *
415  * synopsis
416  *   new = mkclos( state );
417  *
418  * new - a new state which matches the closure of "state"
419  */
420 
421 int
422 mkclos(state)
423 	int state;
424 {
425 	return mkopt(mkposcl(state));
426 }
427 
428 
429 /* mkopt - make a machine optional
430  *
431  * synopsis
432  *
433  *   new = mkopt( mach );
434  *
435  *     new  - a machine which optionally matches whatever mach matched
436  *     mach - the machine to make optional
437  *
438  * notes:
439  *     1. mach must be the last machine created
440  *     2. mach is destroyed by the call
441  */
442 
443 int
444 mkopt(mach)
445 	int mach;
446 {
447 	int eps;
448 
449 	if (!SUPER_FREE_EPSILON(finalst[mach])) {
450 		eps = mkstate(SYM_EPSILON);
451 		mach = link_machines(mach, eps);
452 	}
453 	/*
454 	 * Can't skimp on the following if FREE_EPSILON(mach) is true because
455 	 * some state interior to "mach" might point back to the beginning
456 	 * for a closure.
457 	 */
458 	eps = mkstate(SYM_EPSILON);
459 	mach = link_machines(eps, mach);
460 
461 	mkxtion(mach, finalst[mach]);
462 
463 	return mach;
464 }
465 
466 
467 /* mkor - make a machine that matches either one of two machines
468  *
469  * synopsis
470  *
471  *   new = mkor( first, second );
472  *
473  *     new - a machine which matches either first's pattern or second's
474  *     first, second - machines whose patterns are to be or'ed (the | operator)
475  *
476  * note that first and second are both destroyed by the operation
477  * the code is rather convoluted because an attempt is made to minimize
478  * the number of epsilon states needed
479  */
480 
481 int
482 mkor(first, second)
483 	int first, second;
484 {
485 	int eps, orend;
486 
487 	if (first == NIL)
488 		return second;
489 
490 	else if (second == NIL)
491 		return first;
492 
493 	else {
494 		/*
495 		 * See comment in mkopt() about why we can't use the first
496 		 * state of "first" or "second" if they satisfy
497 		 * "FREE_EPSILON".
498 		 */
499 		eps = mkstate(SYM_EPSILON);
500 
501 		first = link_machines(eps, first);
502 
503 		mkxtion(first, second);
504 
505 		if (SUPER_FREE_EPSILON(finalst[first]) &&
506 		    accptnum[finalst[first]] == NIL) {
507 			orend = finalst[first];
508 			mkxtion(finalst[second], orend);
509 		} else if (SUPER_FREE_EPSILON(finalst[second]) &&
510 		    accptnum[finalst[second]] == NIL) {
511 			orend = finalst[second];
512 			mkxtion(finalst[first], orend);
513 		} else {
514 			eps = mkstate(SYM_EPSILON);
515 
516 			first = link_machines(first, eps);
517 			orend = finalst[first];
518 
519 			mkxtion(finalst[second], orend);
520 		}
521 	}
522 
523 	finalst[first] = orend;
524 	return first;
525 }
526 
527 
528 /* mkposcl - convert a machine into a positive closure
529  *
530  * synopsis
531  *   new = mkposcl( state );
532  *
533  *    new - a machine matching the positive closure of "state"
534  */
535 
536 int
537 mkposcl(state)
538 	int state;
539 {
540 	int eps;
541 
542 	if (SUPER_FREE_EPSILON(finalst[state])) {
543 		mkxtion(finalst[state], state);
544 		return state;
545 	} else {
546 		eps = mkstate(SYM_EPSILON);
547 		mkxtion(eps, state);
548 		return link_machines(state, eps);
549 	}
550 }
551 
552 
553 /* mkrep - make a replicated machine
554  *
555  * synopsis
556  *   new = mkrep( mach, lb, ub );
557  *
558  *    new - a machine that matches whatever "mach" matched from "lb"
559  *          number of times to "ub" number of times
560  *
561  * note
562  *   if "ub" is INFINITE_REPEAT then "new" matches "lb" or more occurrences of "mach"
563  */
564 
565 int
566 mkrep(mach, lb, ub)
567 	int mach, lb, ub;
568 {
569 	int base_mach, tail, copy, i;
570 
571 	base_mach = copysingl(mach, lb - 1);
572 
573 	if (ub == INFINITE_REPEAT) {
574 		copy = dupmachine(mach);
575 		mach = link_machines(mach,
576 		    link_machines(base_mach,
577 			mkclos(copy)));
578 	} else {
579 		tail = mkstate(SYM_EPSILON);
580 
581 		for (i = lb; i < ub; ++i) {
582 			copy = dupmachine(mach);
583 			tail = mkopt(link_machines(copy, tail));
584 		}
585 
586 		mach =
587 		    link_machines(mach,
588 		    link_machines(base_mach, tail));
589 	}
590 
591 	return mach;
592 }
593 
594 
595 /* mkstate - create a state with a transition on a given symbol
596  *
597  * synopsis
598  *
599  *   state = mkstate( sym );
600  *
601  *     state - a new state matching sym
602  *     sym   - the symbol the new state is to have an out-transition on
603  *
604  * note that this routine makes new states in ascending order through the
605  * state array (and increments LASTNFA accordingly).  The routine DUPMACHINE
606  * relies on machines being made in ascending order and that they are
607  * CONTIGUOUS.  Change it and you will have to rewrite DUPMACHINE (kludge
608  * that it admittedly is)
609  */
610 
611 int
612 mkstate(sym)
613 	int sym;
614 {
615 	if (++lastnfa >= current_mns) {
616 		if ((current_mns += MNS_INCREMENT) >= maximum_mns)
617 			lerrif(_
618 			    ("input rules are too complicated (>= %d NFA states)"),
619 			    current_mns);
620 
621 		++num_reallocs;
622 
623 		firstst = reallocate_integer_array(firstst, current_mns);
624 		lastst = reallocate_integer_array(lastst, current_mns);
625 		finalst = reallocate_integer_array(finalst, current_mns);
626 		transchar =
627 		    reallocate_integer_array(transchar, current_mns);
628 		trans1 = reallocate_integer_array(trans1, current_mns);
629 		trans2 = reallocate_integer_array(trans2, current_mns);
630 		accptnum =
631 		    reallocate_integer_array(accptnum, current_mns);
632 		assoc_rule =
633 		    reallocate_integer_array(assoc_rule, current_mns);
634 		state_type =
635 		    reallocate_integer_array(state_type, current_mns);
636 	}
637 	firstst[lastnfa] = lastnfa;
638 	finalst[lastnfa] = lastnfa;
639 	lastst[lastnfa] = lastnfa;
640 	transchar[lastnfa] = sym;
641 	trans1[lastnfa] = NO_TRANSITION;
642 	trans2[lastnfa] = NO_TRANSITION;
643 	accptnum[lastnfa] = NIL;
644 	assoc_rule[lastnfa] = num_rules;
645 	state_type[lastnfa] = current_state_type;
646 
647 	/*
648 	 * Fix up equivalence classes base on this transition.  Note that any
649 	 * character which has its own transition gets its own equivalence
650 	 * class.  Thus only characters which are only in character classes
651 	 * have a chance at being in the same equivalence class.  E.g. "a|b"
652 	 * puts 'a' and 'b' into two different equivalence classes.  "[ab]"
653 	 * puts them in the same equivalence class (barring other differences
654 	 * elsewhere in the input).
655 	 */
656 
657 	if (sym < 0) {
658 		/*
659 		 * We don't have to update the equivalence classes since that
660 		 * was already done when the ccl was created for the first
661 		 * time.
662 		 */
663 	} else if (sym == SYM_EPSILON)
664 		++numeps;
665 
666 	else {
667 		check_char(sym);
668 
669 		if (useecs)
670 			/* Map NUL's to csize. */
671 			mkechar(sym ? sym : csize, nextecm, ecgroup);
672 	}
673 
674 	return lastnfa;
675 }
676 
677 
678 /* mkxtion - make a transition from one state to another
679  *
680  * synopsis
681  *
682  *   mkxtion( statefrom, stateto );
683  *
684  *     statefrom - the state from which the transition is to be made
685  *     stateto   - the state to which the transition is to be made
686  */
687 
688 void
689 mkxtion(statefrom, stateto)
690 	int statefrom, stateto;
691 {
692 	if (trans1[statefrom] == NO_TRANSITION)
693 		trans1[statefrom] = stateto;
694 
695 	else if ((transchar[statefrom] != SYM_EPSILON) ||
696 	    (trans2[statefrom] != NO_TRANSITION))
697 		flexfatal(_("found too many transitions in mkxtion()"));
698 
699 	else {			/* second out-transition for an epsilon state */
700 		++eps2;
701 		trans2[statefrom] = stateto;
702 	}
703 }
704 
705 /* new_rule - initialize for a new rule */
706 
707 void
708 new_rule()
709 {
710 	if (++num_rules >= current_max_rules) {
711 		++num_reallocs;
712 		current_max_rules += MAX_RULES_INCREMENT;
713 		rule_type = reallocate_integer_array(rule_type,
714 		    current_max_rules);
715 		rule_linenum = reallocate_integer_array(rule_linenum,
716 		    current_max_rules);
717 		rule_useful = reallocate_integer_array(rule_useful,
718 		    current_max_rules);
719 		rule_has_nl = reallocate_bool_array(rule_has_nl,
720 		    current_max_rules);
721 	}
722 	if (num_rules > MAX_RULE)
723 		lerrif(_("too many rules (> %d)!"), MAX_RULE);
724 
725 	rule_linenum[num_rules] = linenum;
726 	rule_useful[num_rules] = false;
727 	rule_has_nl[num_rules] = false;
728 }
729