xref: /minix/external/bsd/flex/dist/nfa.c (revision 0a6a1f1d)
1 /*	$NetBSD: nfa.c,v 1.3 2014/10/30 18:44:05 christos Exp $	*/
2 
3 /* nfa - NFA construction routines */
4 
5 /*  Copyright (c) 1990 The Regents of the University of California. */
6 /*  All rights reserved. */
7 
8 /*  This code is derived from software contributed to Berkeley by */
9 /*  Vern Paxson. */
10 
11 /*  The United States Government has rights in this work pursuant */
12 /*  to contract no. DE-AC03-76SF00098 between the United States */
13 /*  Department of Energy and the University of California. */
14 
15 /*  This file is part of flex. */
16 
17 /*  Redistribution and use in source and binary forms, with or without */
18 /*  modification, are permitted provided that the following conditions */
19 /*  are met: */
20 
21 /*  1. Redistributions of source code must retain the above copyright */
22 /*     notice, this list of conditions and the following disclaimer. */
23 /*  2. Redistributions in binary form must reproduce the above copyright */
24 /*     notice, this list of conditions and the following disclaimer in the */
25 /*     documentation and/or other materials provided with the distribution. */
26 
27 /*  Neither the name of the University nor the names of its contributors */
28 /*  may be used to endorse or promote products derived from this software */
29 /*  without specific prior written permission. */
30 
31 /*  THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR */
32 /*  IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED */
33 /*  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR */
34 /*  PURPOSE. */
35 #include "flexdef.h"
36 __RCSID("$NetBSD: nfa.c,v 1.3 2014/10/30 18:44:05 christos Exp $");
37 
38 
39 
40 /* declare functions that have forward references */
41 
42 int dupmachine PROTO ((int));
43 void mkxtion PROTO ((int, int));
44 
45 
46 /* add_accept - add an accepting state to a machine
47  *
48  * accepting_number becomes mach's accepting number.
49  */
50 
add_accept(mach,accepting_number)51 void    add_accept (mach, accepting_number)
52      int     mach, accepting_number;
53 {
54 	/* Hang the accepting number off an epsilon state.  if it is associated
55 	 * with a state that has a non-epsilon out-transition, then the state
56 	 * will accept BEFORE it makes that transition, i.e., one character
57 	 * too soon.
58 	 */
59 
60 	if (transchar[finalst[mach]] == SYM_EPSILON)
61 		accptnum[finalst[mach]] = accepting_number;
62 
63 	else {
64 		int     astate = mkstate (SYM_EPSILON);
65 
66 		accptnum[astate] = accepting_number;
67 		(void) link_machines (mach, astate);
68 	}
69 }
70 
71 
72 /* copysingl - make a given number of copies of a singleton machine
73  *
74  * synopsis
75  *
76  *   newsng = copysingl( singl, num );
77  *
78  *     newsng - a new singleton composed of num copies of singl
79  *     singl  - a singleton machine
80  *     num    - the number of copies of singl to be present in newsng
81  */
82 
copysingl(singl,num)83 int     copysingl (singl, num)
84      int     singl, num;
85 {
86 	int     copy, i;
87 
88 	copy = mkstate (SYM_EPSILON);
89 
90 	for (i = 1; i <= num; ++i)
91 		copy = link_machines (copy, dupmachine (singl));
92 
93 	return copy;
94 }
95 
96 
97 /* dumpnfa - debugging routine to write out an nfa */
98 
dumpnfa(state1)99 void    dumpnfa (state1)
100      int     state1;
101 
102 {
103 	int     sym, tsp1, tsp2, anum, ns;
104 
105 	fprintf (stderr,
106 		 _
107 		 ("\n\n********** beginning dump of nfa with start state %d\n"),
108 		 state1);
109 
110 	/* We probably should loop starting at firstst[state1] and going to
111 	 * lastst[state1], but they're not maintained properly when we "or"
112 	 * all of the rules together.  So we use our knowledge that the machine
113 	 * starts at state 1 and ends at lastnfa.
114 	 */
115 
116 	/* for ( ns = firstst[state1]; ns <= lastst[state1]; ++ns ) */
117 	for (ns = 1; ns <= lastnfa; ++ns) {
118 		fprintf (stderr, _("state # %4d\t"), ns);
119 
120 		sym = transchar[ns];
121 		tsp1 = trans1[ns];
122 		tsp2 = trans2[ns];
123 		anum = accptnum[ns];
124 
125 		fprintf (stderr, "%3d:  %4d, %4d", sym, tsp1, tsp2);
126 
127 		if (anum != NIL)
128 			fprintf (stderr, "  [%d]", anum);
129 
130 		fprintf (stderr, "\n");
131 	}
132 
133 	fprintf (stderr, _("********** end of dump\n"));
134 }
135 
136 
137 /* dupmachine - make a duplicate of a given machine
138  *
139  * synopsis
140  *
141  *   copy = dupmachine( mach );
142  *
143  *     copy - holds duplicate of mach
144  *     mach - machine to be duplicated
145  *
146  * note that the copy of mach is NOT an exact duplicate; rather, all the
147  * transition states values are adjusted so that the copy is self-contained,
148  * as the original should have been.
149  *
150  * also note that the original MUST be contiguous, with its low and high
151  * states accessible by the arrays firstst and lastst
152  */
153 
dupmachine(mach)154 int     dupmachine (mach)
155      int     mach;
156 {
157 	int     i, init, state_offset;
158 	int     state = 0;
159 	int     last = lastst[mach];
160 
161 	for (i = firstst[mach]; i <= last; ++i) {
162 		state = mkstate (transchar[i]);
163 
164 		if (trans1[i] != NO_TRANSITION) {
165 			mkxtion (finalst[state], trans1[i] + state - i);
166 
167 			if (transchar[i] == SYM_EPSILON &&
168 			    trans2[i] != NO_TRANSITION)
169 					mkxtion (finalst[state],
170 						 trans2[i] + state - i);
171 		}
172 
173 		accptnum[state] = accptnum[i];
174 	}
175 
176 	if (state == 0)
177 		flexfatal (_("empty machine in dupmachine()"));
178 
179 	state_offset = state - i + 1;
180 
181 	init = mach + state_offset;
182 	firstst[init] = firstst[mach] + state_offset;
183 	finalst[init] = finalst[mach] + state_offset;
184 	lastst[init] = lastst[mach] + state_offset;
185 
186 	return init;
187 }
188 
189 
190 /* finish_rule - finish up the processing for a rule
191  *
192  * An accepting number is added to the given machine.  If variable_trail_rule
193  * is true then the rule has trailing context and both the head and trail
194  * are variable size.  Otherwise if headcnt or trailcnt is non-zero then
195  * the machine recognizes a pattern with trailing context and headcnt is
196  * the number of characters in the matched part of the pattern, or zero
197  * if the matched part has variable length.  trailcnt is the number of
198  * trailing context characters in the pattern, or zero if the trailing
199  * context has variable length.
200  */
201 
finish_rule(mach,variable_trail_rule,headcnt,trailcnt,pcont_act)202 void    finish_rule (mach, variable_trail_rule, headcnt, trailcnt,
203 		     pcont_act)
204      int     mach, variable_trail_rule, headcnt, trailcnt, pcont_act;
205 {
206 	char    action_text[MAXLINE];
207 
208 	add_accept (mach, num_rules);
209 
210 	/* We did this in new_rule(), but it often gets the wrong
211 	 * number because we do it before we start parsing the current rule.
212 	 */
213 	rule_linenum[num_rules] = linenum;
214 
215 	/* If this is a continued action, then the line-number has already
216 	 * been updated, giving us the wrong number.
217 	 */
218 	if (continued_action)
219 		--rule_linenum[num_rules];
220 
221 
222 	/* If the previous rule was continued action, then we inherit the
223 	 * previous newline flag, possibly overriding the current one.
224 	 */
225 	if (pcont_act && rule_has_nl[num_rules - 1])
226 		rule_has_nl[num_rules] = true;
227 
228 	snprintf (action_text, sizeof(action_text), "case %d:\n", num_rules);
229 	add_action (action_text);
230 	if (rule_has_nl[num_rules]) {
231 		snprintf (action_text, sizeof(action_text), "/* rule %d can match eol */\n",
232 			 num_rules);
233 		add_action (action_text);
234 	}
235 
236 
237 	if (variable_trail_rule) {
238 		rule_type[num_rules] = RULE_VARIABLE;
239 
240 		if (performance_report > 0)
241 			fprintf (stderr,
242 				 _
243 				 ("Variable trailing context rule at line %d\n"),
244 				 rule_linenum[num_rules]);
245 
246 		variable_trailing_context_rules = true;
247 	}
248 
249 	else {
250 		rule_type[num_rules] = RULE_NORMAL;
251 
252 		if (headcnt > 0 || trailcnt > 0) {
253 			/* Do trailing context magic to not match the trailing
254 			 * characters.
255 			 */
256 			char   *scanner_cp = "YY_G(yy_c_buf_p) = yy_cp";
257 			char   *scanner_bp = "yy_bp";
258 
259 			add_action
260 				("*yy_cp = YY_G(yy_hold_char); /* undo effects of setting up yytext */\n");
261 
262 			if (headcnt > 0) {
263 				if (rule_has_nl[num_rules]) {
264 					snprintf (action_text, sizeof(action_text),
265 						"YY_LINENO_REWIND_TO(%s + %d);\n", scanner_bp, headcnt);
266 					add_action (action_text);
267 				}
268 				snprintf (action_text, sizeof(action_text), "%s = %s + %d;\n",
269 					 scanner_cp, scanner_bp, headcnt);
270 				add_action (action_text);
271 			}
272 
273 			else {
274 				if (rule_has_nl[num_rules]) {
275 					snprintf (action_text, sizeof(action_text),
276 						 "YY_LINENO_REWIND_TO(yy_cp - %d);\n", trailcnt);
277 					add_action (action_text);
278 				}
279 
280 				snprintf (action_text, sizeof(action_text), "%s -= %d;\n",
281 					 scanner_cp, trailcnt);
282 				add_action (action_text);
283 			}
284 
285 			add_action
286 				("YY_DO_BEFORE_ACTION; /* set up yytext again */\n");
287 		}
288 	}
289 
290 	/* Okay, in the action code at this point yytext and yyleng have
291 	 * their proper final values for this rule, so here's the point
292 	 * to do any user action.  But don't do it for continued actions,
293 	 * as that'll result in multiple YY_RULE_SETUP's.
294 	 */
295 	if (!continued_action)
296 		add_action ("YY_RULE_SETUP\n");
297 
298 	line_directive_out ((FILE *) 0, 1);
299 }
300 
301 
302 /* link_machines - connect two machines together
303  *
304  * synopsis
305  *
306  *   new = link_machines( first, last );
307  *
308  *     new    - a machine constructed by connecting first to last
309  *     first  - the machine whose successor is to be last
310  *     last   - the machine whose predecessor is to be first
311  *
312  * note: this routine concatenates the machine first with the machine
313  *  last to produce a machine new which will pattern-match first first
314  *  and then last, and will fail if either of the sub-patterns fails.
315  *  FIRST is set to new by the operation.  last is unmolested.
316  */
317 
link_machines(first,last)318 int     link_machines (first, last)
319      int     first, last;
320 {
321 	if (first == NIL)
322 		return last;
323 
324 	else if (last == NIL)
325 		return first;
326 
327 	else {
328 		mkxtion (finalst[first], last);
329 		finalst[first] = finalst[last];
330 		lastst[first] = MAX (lastst[first], lastst[last]);
331 		firstst[first] = MIN (firstst[first], firstst[last]);
332 
333 		return first;
334 	}
335 }
336 
337 
338 /* mark_beginning_as_normal - mark each "beginning" state in a machine
339  *                            as being a "normal" (i.e., not trailing context-
340  *                            associated) states
341  *
342  * The "beginning" states are the epsilon closure of the first state
343  */
344 
mark_beginning_as_normal(mach)345 void    mark_beginning_as_normal (mach)
346      register int mach;
347 {
348 	switch (state_type[mach]) {
349 	case STATE_NORMAL:
350 		/* Oh, we've already visited here. */
351 		return;
352 
353 	case STATE_TRAILING_CONTEXT:
354 		state_type[mach] = STATE_NORMAL;
355 
356 		if (transchar[mach] == SYM_EPSILON) {
357 			if (trans1[mach] != NO_TRANSITION)
358 				mark_beginning_as_normal (trans1[mach]);
359 
360 			if (trans2[mach] != NO_TRANSITION)
361 				mark_beginning_as_normal (trans2[mach]);
362 		}
363 		break;
364 
365 	default:
366 		flexerror (_
367 			   ("bad state type in mark_beginning_as_normal()"));
368 		break;
369 	}
370 }
371 
372 
373 /* mkbranch - make a machine that branches to two machines
374  *
375  * synopsis
376  *
377  *   branch = mkbranch( first, second );
378  *
379  *     branch - a machine which matches either first's pattern or second's
380  *     first, second - machines whose patterns are to be or'ed (the | operator)
381  *
382  * Note that first and second are NEITHER destroyed by the operation.  Also,
383  * the resulting machine CANNOT be used with any other "mk" operation except
384  * more mkbranch's.  Compare with mkor()
385  */
386 
mkbranch(first,second)387 int     mkbranch (first, second)
388      int     first, second;
389 {
390 	int     eps;
391 
392 	if (first == NO_TRANSITION)
393 		return second;
394 
395 	else if (second == NO_TRANSITION)
396 		return first;
397 
398 	eps = mkstate (SYM_EPSILON);
399 
400 	mkxtion (eps, first);
401 	mkxtion (eps, second);
402 
403 	return eps;
404 }
405 
406 
407 /* mkclos - convert a machine into a closure
408  *
409  * synopsis
410  *   new = mkclos( state );
411  *
412  * new - a new state which matches the closure of "state"
413  */
414 
mkclos(state)415 int     mkclos (state)
416      int     state;
417 {
418 	return mkopt (mkposcl (state));
419 }
420 
421 
422 /* mkopt - make a machine optional
423  *
424  * synopsis
425  *
426  *   new = mkopt( mach );
427  *
428  *     new  - a machine which optionally matches whatever mach matched
429  *     mach - the machine to make optional
430  *
431  * notes:
432  *     1. mach must be the last machine created
433  *     2. mach is destroyed by the call
434  */
435 
mkopt(mach)436 int     mkopt (mach)
437      int     mach;
438 {
439 	int     eps;
440 
441 	if (!SUPER_FREE_EPSILON (finalst[mach])) {
442 		eps = mkstate (SYM_EPSILON);
443 		mach = link_machines (mach, eps);
444 	}
445 
446 	/* Can't skimp on the following if FREE_EPSILON(mach) is true because
447 	 * some state interior to "mach" might point back to the beginning
448 	 * for a closure.
449 	 */
450 	eps = mkstate (SYM_EPSILON);
451 	mach = link_machines (eps, mach);
452 
453 	mkxtion (mach, finalst[mach]);
454 
455 	return mach;
456 }
457 
458 
459 /* mkor - make a machine that matches either one of two machines
460  *
461  * synopsis
462  *
463  *   new = mkor( first, second );
464  *
465  *     new - a machine which matches either first's pattern or second's
466  *     first, second - machines whose patterns are to be or'ed (the | operator)
467  *
468  * note that first and second are both destroyed by the operation
469  * the code is rather convoluted because an attempt is made to minimize
470  * the number of epsilon states needed
471  */
472 
mkor(first,second)473 int     mkor (first, second)
474      int     first, second;
475 {
476 	int     eps, orend;
477 
478 	if (first == NIL)
479 		return second;
480 
481 	else if (second == NIL)
482 		return first;
483 
484 	else {
485 		/* See comment in mkopt() about why we can't use the first
486 		 * state of "first" or "second" if they satisfy "FREE_EPSILON".
487 		 */
488 		eps = mkstate (SYM_EPSILON);
489 
490 		first = link_machines (eps, first);
491 
492 		mkxtion (first, second);
493 
494 		if (SUPER_FREE_EPSILON (finalst[first]) &&
495 		    accptnum[finalst[first]] == NIL) {
496 			orend = finalst[first];
497 			mkxtion (finalst[second], orend);
498 		}
499 
500 		else if (SUPER_FREE_EPSILON (finalst[second]) &&
501 			 accptnum[finalst[second]] == NIL) {
502 			orend = finalst[second];
503 			mkxtion (finalst[first], orend);
504 		}
505 
506 		else {
507 			eps = mkstate (SYM_EPSILON);
508 
509 			first = link_machines (first, eps);
510 			orend = finalst[first];
511 
512 			mkxtion (finalst[second], orend);
513 		}
514 	}
515 
516 	finalst[first] = orend;
517 	return first;
518 }
519 
520 
521 /* mkposcl - convert a machine into a positive closure
522  *
523  * synopsis
524  *   new = mkposcl( state );
525  *
526  *    new - a machine matching the positive closure of "state"
527  */
528 
mkposcl(state)529 int     mkposcl (state)
530      int     state;
531 {
532 	int     eps;
533 
534 	if (SUPER_FREE_EPSILON (finalst[state])) {
535 		mkxtion (finalst[state], state);
536 		return state;
537 	}
538 
539 	else {
540 		eps = mkstate (SYM_EPSILON);
541 		mkxtion (eps, state);
542 		return link_machines (state, eps);
543 	}
544 }
545 
546 
547 /* mkrep - make a replicated machine
548  *
549  * synopsis
550  *   new = mkrep( mach, lb, ub );
551  *
552  *    new - a machine that matches whatever "mach" matched from "lb"
553  *          number of times to "ub" number of times
554  *
555  * note
556  *   if "ub" is INFINITE_REPEAT then "new" matches "lb" or more occurrences of "mach"
557  */
558 
mkrep(mach,lb,ub)559 int     mkrep (mach, lb, ub)
560      int     mach, lb, ub;
561 {
562 	int     base_mach, tail, copy, i;
563 
564 	base_mach = copysingl (mach, lb - 1);
565 
566 	if (ub == INFINITE_REPEAT) {
567 		copy = dupmachine (mach);
568 		mach = link_machines (mach,
569 				      link_machines (base_mach,
570 						     mkclos (copy)));
571 	}
572 
573 	else {
574 		tail = mkstate (SYM_EPSILON);
575 
576 		for (i = lb; i < ub; ++i) {
577 			copy = dupmachine (mach);
578 			tail = mkopt (link_machines (copy, tail));
579 		}
580 
581 		mach =
582 			link_machines (mach,
583 				       link_machines (base_mach, tail));
584 	}
585 
586 	return mach;
587 }
588 
589 
590 /* mkstate - create a state with a transition on a given symbol
591  *
592  * synopsis
593  *
594  *   state = mkstate( sym );
595  *
596  *     state - a new state matching sym
597  *     sym   - the symbol the new state is to have an out-transition on
598  *
599  * note that this routine makes new states in ascending order through the
600  * state array (and increments LASTNFA accordingly).  The routine DUPMACHINE
601  * relies on machines being made in ascending order and that they are
602  * CONTIGUOUS.  Change it and you will have to rewrite DUPMACHINE (kludge
603  * that it admittedly is)
604  */
605 
mkstate(sym)606 int     mkstate (sym)
607      int     sym;
608 {
609 	if (++lastnfa >= current_mns) {
610 		if ((current_mns += MNS_INCREMENT) >= maximum_mns)
611 			lerrif (_
612 				("input rules are too complicated (>= %d NFA states)"),
613 current_mns);
614 
615 		++num_reallocs;
616 
617 		firstst = reallocate_integer_array (firstst, current_mns);
618 		lastst = reallocate_integer_array (lastst, current_mns);
619 		finalst = reallocate_integer_array (finalst, current_mns);
620 		transchar =
621 			reallocate_integer_array (transchar, current_mns);
622 		trans1 = reallocate_integer_array (trans1, current_mns);
623 		trans2 = reallocate_integer_array (trans2, current_mns);
624 		accptnum =
625 			reallocate_integer_array (accptnum, current_mns);
626 		assoc_rule =
627 			reallocate_integer_array (assoc_rule, current_mns);
628 		state_type =
629 			reallocate_integer_array (state_type, current_mns);
630 	}
631 
632 	firstst[lastnfa] = lastnfa;
633 	finalst[lastnfa] = lastnfa;
634 	lastst[lastnfa] = lastnfa;
635 	transchar[lastnfa] = sym;
636 	trans1[lastnfa] = NO_TRANSITION;
637 	trans2[lastnfa] = NO_TRANSITION;
638 	accptnum[lastnfa] = NIL;
639 	assoc_rule[lastnfa] = num_rules;
640 	state_type[lastnfa] = current_state_type;
641 
642 	/* Fix up equivalence classes base on this transition.  Note that any
643 	 * character which has its own transition gets its own equivalence
644 	 * class.  Thus only characters which are only in character classes
645 	 * have a chance at being in the same equivalence class.  E.g. "a|b"
646 	 * puts 'a' and 'b' into two different equivalence classes.  "[ab]"
647 	 * puts them in the same equivalence class (barring other differences
648 	 * elsewhere in the input).
649 	 */
650 
651 	if (sym < 0) {
652 		/* We don't have to update the equivalence classes since
653 		 * that was already done when the ccl was created for the
654 		 * first time.
655 		 */
656 	}
657 
658 	else if (sym == SYM_EPSILON)
659 		++numeps;
660 
661 	else {
662 		check_char (sym);
663 
664 		if (useecs)
665 			/* Map NUL's to csize. */
666 			mkechar (sym ? sym : csize, nextecm, ecgroup);
667 	}
668 
669 	return lastnfa;
670 }
671 
672 
673 /* mkxtion - make a transition from one state to another
674  *
675  * synopsis
676  *
677  *   mkxtion( statefrom, stateto );
678  *
679  *     statefrom - the state from which the transition is to be made
680  *     stateto   - the state to which the transition is to be made
681  */
682 
mkxtion(statefrom,stateto)683 void    mkxtion (statefrom, stateto)
684      int     statefrom, stateto;
685 {
686 	if (trans1[statefrom] == NO_TRANSITION)
687 		trans1[statefrom] = stateto;
688 
689 	else if ((transchar[statefrom] != SYM_EPSILON) ||
690 		 (trans2[statefrom] != NO_TRANSITION))
691 		flexfatal (_("found too many transitions in mkxtion()"));
692 
693 	else {			/* second out-transition for an epsilon state */
694 		++eps2;
695 		trans2[statefrom] = stateto;
696 	}
697 }
698 
699 /* new_rule - initialize for a new rule */
700 
new_rule()701 void    new_rule ()
702 {
703 	if (++num_rules >= current_max_rules) {
704 		++num_reallocs;
705 		current_max_rules += MAX_RULES_INCREMENT;
706 		rule_type = reallocate_integer_array (rule_type,
707 						      current_max_rules);
708 		rule_linenum = reallocate_integer_array (rule_linenum,
709 							 current_max_rules);
710 		rule_useful = reallocate_integer_array (rule_useful,
711 							current_max_rules);
712 		rule_has_nl = reallocate_bool_array (rule_has_nl,
713 						     current_max_rules);
714 	}
715 
716 	if (num_rules > MAX_RULE)
717 		lerrif (_("too many rules (> %d)!"), MAX_RULE);
718 
719 	rule_linenum[num_rules] = linenum;
720 	rule_useful[num_rules] = false;
721 	rule_has_nl[num_rules] = false;
722 }
723