xref: /openbsd/usr.bin/lex/nfa.c (revision 20c29e2b)
1 /*	$OpenBSD: nfa.c,v 1.12 2024/11/09 18:03:44 op Exp $	*/
2 
3 /* nfa - NFA construction routines */
4 
5 /*  Copyright (c) 1990 The Regents of the University of California. */
6 /*  All rights reserved. */
7 
8 /*  This code is derived from software contributed to Berkeley by */
9 /*  Vern Paxson. */
10 
11 /*  The United States Government has rights in this work pursuant */
12 /*  to contract no. DE-AC03-76SF00098 between the United States */
13 /*  Department of Energy and the University of California. */
14 
15 /*  This file is part of flex. */
16 
17 /*  Redistribution and use in source and binary forms, with or without */
18 /*  modification, are permitted provided that the following conditions */
19 /*  are met: */
20 
21 /*  1. Redistributions of source code must retain the above copyright */
22 /*     notice, this list of conditions and the following disclaimer. */
23 /*  2. Redistributions in binary form must reproduce the above copyright */
24 /*     notice, this list of conditions and the following disclaimer in the */
25 /*     documentation and/or other materials provided with the distribution. */
26 
27 /*  Neither the name of the University nor the names of its contributors */
28 /*  may be used to endorse or promote products derived from this software */
29 /*  without specific prior written permission. */
30 
31 /*  THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR */
32 /*  IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED */
33 /*  WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR */
34 /*  PURPOSE. */
35 
36 #include "flexdef.h"
37 
38 
39 /* declare functions that have forward references */
40 
41 int dupmachine PROTO((int));
42 void mkxtion PROTO((int, int));
43 
44 
45 /* add_accept - add an accepting state to a machine
46  *
47  * accepting_number becomes mach's accepting number.
48  */
49 
50 void
add_accept(int mach,int accepting_number)51 add_accept(int mach, int accepting_number)
52 {
53 	/*
54 	 * Hang the accepting number off an epsilon state.  if it is
55 	 * associated with a state that has a non-epsilon out-transition,
56 	 * then the state will accept BEFORE it makes that transition, i.e.,
57 	 * one character too soon.
58 	 */
59 
60 	if (transchar[finalst[mach]] == SYM_EPSILON)
61 		accptnum[finalst[mach]] = accepting_number;
62 
63 	else {
64 		int astate = mkstate(SYM_EPSILON);
65 
66 		accptnum[astate] = accepting_number;
67 		(void) link_machines(mach, astate);
68 	}
69 }
70 
71 
72 /* copysingl - make a given number of copies of a singleton machine
73  *
74  * synopsis
75  *
76  *   newsng = copysingl( singl, num );
77  *
78  *     newsng - a new singleton composed of num copies of singl
79  *     singl  - a singleton machine
80  *     num    - the number of copies of singl to be present in newsng
81  */
82 
83 int
copysingl(int singl,int num)84 copysingl(int singl, int num)
85 {
86 	int copy, i;
87 
88 	copy = mkstate(SYM_EPSILON);
89 
90 	for (i = 1; i <= num; ++i)
91 		copy = link_machines(copy, dupmachine(singl));
92 
93 	return copy;
94 }
95 
96 
97 /* dumpnfa - debugging routine to write out an nfa */
98 
99 void
dumpnfa(int state1)100 dumpnfa(int state1)
101 {
102 	int sym, tsp1, tsp2, anum, ns;
103 
104 	fprintf(stderr,
105 	    _
106 	    ("\n\n********** beginning dump of nfa with start state %d\n"),
107 	    state1);
108 
109 	/*
110 	 * We probably should loop starting at firstst[state1] and going to
111 	 * lastst[state1], but they're not maintained properly when we "or"
112 	 * all of the rules together.  So we use our knowledge that the
113 	 * machine starts at state 1 and ends at lastnfa.
114 	 */
115 
116 	/* for ( ns = firstst[state1]; ns <= lastst[state1]; ++ns ) */
117 	for (ns = 1; ns <= lastnfa; ++ns) {
118 		fprintf(stderr, _("state # %4d\t"), ns);
119 
120 		sym = transchar[ns];
121 		tsp1 = trans1[ns];
122 		tsp2 = trans2[ns];
123 		anum = accptnum[ns];
124 
125 		fprintf(stderr, "%3d:  %4d, %4d", sym, tsp1, tsp2);
126 
127 		if (anum != NIL)
128 			fprintf(stderr, "  [%d]", anum);
129 
130 		fprintf(stderr, "\n");
131 	}
132 
133 	fprintf(stderr, _("********** end of dump\n"));
134 }
135 
136 
137 /* dupmachine - make a duplicate of a given machine
138  *
139  * synopsis
140  *
141  *   copy = dupmachine( mach );
142  *
143  *     copy - holds duplicate of mach
144  *     mach - machine to be duplicated
145  *
146  * note that the copy of mach is NOT an exact duplicate; rather, all the
147  * transition states values are adjusted so that the copy is self-contained,
148  * as the original should have been.
149  *
150  * also note that the original MUST be contiguous, with its low and high
151  * states accessible by the arrays firstst and lastst
152  */
153 
154 int
dupmachine(int mach)155 dupmachine(int mach)
156 {
157 	int i, init, state_offset;
158 	int state = 0;
159 	int last = lastst[mach];
160 
161 	for (i = firstst[mach]; i <= last; ++i) {
162 		state = mkstate(transchar[i]);
163 
164 		if (trans1[i] != NO_TRANSITION) {
165 			mkxtion(finalst[state], trans1[i] + state - i);
166 
167 			if (transchar[i] == SYM_EPSILON &&
168 			    trans2[i] != NO_TRANSITION)
169 				mkxtion(finalst[state],
170 				    trans2[i] + state - i);
171 		}
172 		accptnum[state] = accptnum[i];
173 	}
174 
175 	if (state == 0)
176 		flexfatal(_("empty machine in dupmachine()"));
177 
178 	state_offset = state - i + 1;
179 
180 	init = mach + state_offset;
181 	firstst[init] = firstst[mach] + state_offset;
182 	finalst[init] = finalst[mach] + state_offset;
183 	lastst[init] = lastst[mach] + state_offset;
184 
185 	return init;
186 }
187 
188 
189 /* finish_rule - finish up the processing for a rule
190  *
191  * An accepting number is added to the given machine.  If variable_trail_rule
192  * is true then the rule has trailing context and both the head and trail
193  * are variable size.  Otherwise if headcnt or trailcnt is non-zero then
194  * the machine recognizes a pattern with trailing context and headcnt is
195  * the number of characters in the matched part of the pattern, or zero
196  * if the matched part has variable length.  trailcnt is the number of
197  * trailing context characters in the pattern, or zero if the trailing
198  * context has variable length.
199  */
200 
201 void
finish_rule(int mach,int variable_trail_rule,int headcnt,int trailcnt,int pcont_act)202 finish_rule(int mach, int variable_trail_rule, int headcnt, int trailcnt,
203     int pcont_act)
204 {
205 	char action_text[MAXLINE];
206 
207 	add_accept(mach, num_rules);
208 
209 	/*
210 	 * We did this in new_rule(), but it often gets the wrong number
211 	 * because we do it before we start parsing the current rule.
212 	 */
213 	rule_linenum[num_rules] = linenum;
214 
215 	/*
216 	 * If this is a continued action, then the line-number has already
217 	 * been updated, giving us the wrong number.
218 	 */
219 	if (continued_action)
220 		--rule_linenum[num_rules];
221 
222 
223 	/*
224 	 * If the previous rule was continued action, then we inherit the
225 	 * previous newline flag, possibly overriding the current one.
226 	 */
227 	if (pcont_act && rule_has_nl[num_rules - 1])
228 		rule_has_nl[num_rules] = true;
229 
230 	snprintf(action_text, sizeof(action_text), "case %d:\n", num_rules);
231 	add_action(action_text);
232 	if (rule_has_nl[num_rules]) {
233 		snprintf(action_text, sizeof(action_text), "/* rule %d can match eol */\n",
234 		    num_rules);
235 		add_action(action_text);
236 	}
237 	if (variable_trail_rule) {
238 		rule_type[num_rules] = RULE_VARIABLE;
239 
240 		if (performance_report > 0)
241 			fprintf(stderr,
242 			    _
243 			    ("Variable trailing context rule at line %d\n"),
244 			    rule_linenum[num_rules]);
245 
246 		variable_trailing_context_rules = true;
247 	} else {
248 		rule_type[num_rules] = RULE_NORMAL;
249 
250 		if (headcnt > 0 || trailcnt > 0) {
251 			/*
252 			 * Do trailing context magic to not match the
253 			 * trailing characters.
254 			 */
255 			char *scanner_cp = "YY_G(yy_c_buf_p) = yy_cp";
256 			char *scanner_bp = "yy_bp";
257 
258 			add_action
259 			    ("*yy_cp = YY_G(yy_hold_char); /* undo effects of setting up yytext */\n");
260 
261 			if (headcnt > 0) {
262 				if (rule_has_nl[num_rules]) {
263 					snprintf(action_text, sizeof(action_text),
264 					    "YY_LINENO_REWIND_TO(%s + %d);\n", scanner_bp, headcnt);
265 					add_action(action_text);
266 				}
267 				snprintf(action_text, sizeof(action_text), "%s = %s + %d;\n",
268 				    scanner_cp, scanner_bp, headcnt);
269 				add_action(action_text);
270 			} else {
271 				if (rule_has_nl[num_rules]) {
272 					snprintf(action_text, sizeof(action_text),
273 					    "YY_LINENO_REWIND_TO(yy_cp - %d);\n", trailcnt);
274 					add_action(action_text);
275 				}
276 				snprintf(action_text, sizeof(action_text), "%s -= %d;\n",
277 				    scanner_cp, trailcnt);
278 				add_action(action_text);
279 			}
280 
281 			add_action
282 			    ("YY_DO_BEFORE_ACTION; /* set up yytext again */\n");
283 		}
284 	}
285 
286 	/*
287 	 * Okay, in the action code at this point yytext and yyleng have
288 	 * their proper final values for this rule, so here's the point to do
289 	 * any user action.  But don't do it for continued actions, as
290 	 * that'll result in multiple YY_RULE_SETUP's.
291 	 */
292 	if (!continued_action)
293 		add_action("YY_RULE_SETUP\n");
294 
295 	line_directive_out((FILE *) 0, 1);
296 }
297 
298 
299 /* link_machines - connect two machines together
300  *
301  * synopsis
302  *
303  *   new = link_machines( first, last );
304  *
305  *     new    - a machine constructed by connecting first to last
306  *     first  - the machine whose successor is to be last
307  *     last   - the machine whose predecessor is to be first
308  *
309  * note: this routine concatenates the machine first with the machine
310  *  last to produce a machine new which will pattern-match first first
311  *  and then last, and will fail if either of the sub-patterns fails.
312  *  FIRST is set to new by the operation.  last is unmolested.
313  */
314 
315 int
link_machines(int first,int last)316 link_machines(int first, int last)
317 {
318 	if (first == NIL)
319 		return last;
320 
321 	else if (last == NIL)
322 		return first;
323 
324 	else {
325 		mkxtion(finalst[first], last);
326 		finalst[first] = finalst[last];
327 		lastst[first] = MAX(lastst[first], lastst[last]);
328 		firstst[first] = MIN(firstst[first], firstst[last]);
329 
330 		return first;
331 	}
332 }
333 
334 
335 /* mark_beginning_as_normal - mark each "beginning" state in a machine
336  *                            as being a "normal" (i.e., not trailing context-
337  *                            associated) states
338  *
339  * The "beginning" states are the epsilon closure of the first state
340  */
341 
342 void
mark_beginning_as_normal(int mach)343 mark_beginning_as_normal(int mach)
344 {
345 	switch (state_type[mach]) {
346 	case STATE_NORMAL:
347 		/* Oh, we've already visited here. */
348 		return;
349 
350 	case STATE_TRAILING_CONTEXT:
351 		state_type[mach] = STATE_NORMAL;
352 
353 		if (transchar[mach] == SYM_EPSILON) {
354 			if (trans1[mach] != NO_TRANSITION)
355 				mark_beginning_as_normal(trans1[mach]);
356 
357 			if (trans2[mach] != NO_TRANSITION)
358 				mark_beginning_as_normal(trans2[mach]);
359 		}
360 		break;
361 
362 	default:
363 		flexerror(_
364 		    ("bad state type in mark_beginning_as_normal()"));
365 		break;
366 	}
367 }
368 
369 
370 /* mkbranch - make a machine that branches to two machines
371  *
372  * synopsis
373  *
374  *   branch = mkbranch( first, second );
375  *
376  *     branch - a machine which matches either first's pattern or second's
377  *     first, second - machines whose patterns are to be or'ed (the | operator)
378  *
379  * Note that first and second are NEITHER destroyed by the operation.  Also,
380  * the resulting machine CANNOT be used with any other "mk" operation except
381  * more mkbranch's.  Compare with mkor()
382  */
383 
384 int
mkbranch(int first,int second)385 mkbranch(int first, int second)
386 {
387 	int eps;
388 
389 	if (first == NO_TRANSITION)
390 		return second;
391 
392 	else if (second == NO_TRANSITION)
393 		return first;
394 
395 	eps = mkstate(SYM_EPSILON);
396 
397 	mkxtion(eps, first);
398 	mkxtion(eps, second);
399 
400 	return eps;
401 }
402 
403 
404 /* mkclos - convert a machine into a closure
405  *
406  * synopsis
407  *   new = mkclos( state );
408  *
409  * new - a new state which matches the closure of "state"
410  */
411 
412 int
mkclos(int state)413 mkclos(int state)
414 {
415 	return mkopt(mkposcl(state));
416 }
417 
418 
419 /* mkopt - make a machine optional
420  *
421  * synopsis
422  *
423  *   new = mkopt( mach );
424  *
425  *     new  - a machine which optionally matches whatever mach matched
426  *     mach - the machine to make optional
427  *
428  * notes:
429  *     1. mach must be the last machine created
430  *     2. mach is destroyed by the call
431  */
432 
433 int
mkopt(int mach)434 mkopt(int mach)
435 {
436 	int eps;
437 
438 	if (!SUPER_FREE_EPSILON(finalst[mach])) {
439 		eps = mkstate(SYM_EPSILON);
440 		mach = link_machines(mach, eps);
441 	}
442 	/*
443 	 * Can't skimp on the following if FREE_EPSILON(mach) is true because
444 	 * some state interior to "mach" might point back to the beginning
445 	 * for a closure.
446 	 */
447 	eps = mkstate(SYM_EPSILON);
448 	mach = link_machines(eps, mach);
449 
450 	mkxtion(mach, finalst[mach]);
451 
452 	return mach;
453 }
454 
455 
456 /* mkor - make a machine that matches either one of two machines
457  *
458  * synopsis
459  *
460  *   new = mkor( first, second );
461  *
462  *     new - a machine which matches either first's pattern or second's
463  *     first, second - machines whose patterns are to be or'ed (the | operator)
464  *
465  * note that first and second are both destroyed by the operation
466  * the code is rather convoluted because an attempt is made to minimize
467  * the number of epsilon states needed
468  */
469 
470 int
mkor(int first,int second)471 mkor(int first, int second)
472 {
473 	int eps, orend;
474 
475 	if (first == NIL)
476 		return second;
477 
478 	else if (second == NIL)
479 		return first;
480 
481 	else {
482 		/*
483 		 * See comment in mkopt() about why we can't use the first
484 		 * state of "first" or "second" if they satisfy
485 		 * "FREE_EPSILON".
486 		 */
487 		eps = mkstate(SYM_EPSILON);
488 
489 		first = link_machines(eps, first);
490 
491 		mkxtion(first, second);
492 
493 		if (SUPER_FREE_EPSILON(finalst[first]) &&
494 		    accptnum[finalst[first]] == NIL) {
495 			orend = finalst[first];
496 			mkxtion(finalst[second], orend);
497 		} else if (SUPER_FREE_EPSILON(finalst[second]) &&
498 		    accptnum[finalst[second]] == NIL) {
499 			orend = finalst[second];
500 			mkxtion(finalst[first], orend);
501 		} else {
502 			eps = mkstate(SYM_EPSILON);
503 
504 			first = link_machines(first, eps);
505 			orend = finalst[first];
506 
507 			mkxtion(finalst[second], orend);
508 		}
509 	}
510 
511 	finalst[first] = orend;
512 	return first;
513 }
514 
515 
516 /* mkposcl - convert a machine into a positive closure
517  *
518  * synopsis
519  *   new = mkposcl( state );
520  *
521  *    new - a machine matching the positive closure of "state"
522  */
523 
524 int
mkposcl(int state)525 mkposcl(int state)
526 {
527 	int eps;
528 
529 	if (SUPER_FREE_EPSILON(finalst[state])) {
530 		mkxtion(finalst[state], state);
531 		return state;
532 	} else {
533 		eps = mkstate(SYM_EPSILON);
534 		mkxtion(eps, state);
535 		return link_machines(state, eps);
536 	}
537 }
538 
539 
540 /* mkrep - make a replicated machine
541  *
542  * synopsis
543  *   new = mkrep( mach, lb, ub );
544  *
545  *    new - a machine that matches whatever "mach" matched from "lb"
546  *          number of times to "ub" number of times
547  *
548  * note
549  *   if "ub" is INFINITE_REPEAT then "new" matches "lb" or more occurrences of "mach"
550  */
551 
552 int
mkrep(int mach,int lb,int ub)553 mkrep(int mach, int lb, int ub)
554 {
555 	int base_mach, tail, copy, i;
556 
557 	base_mach = copysingl(mach, lb - 1);
558 
559 	if (ub == INFINITE_REPEAT) {
560 		copy = dupmachine(mach);
561 		mach = link_machines(mach,
562 		    link_machines(base_mach,
563 			mkclos(copy)));
564 	} else {
565 		tail = mkstate(SYM_EPSILON);
566 
567 		for (i = lb; i < ub; ++i) {
568 			copy = dupmachine(mach);
569 			tail = mkopt(link_machines(copy, tail));
570 		}
571 
572 		mach =
573 		    link_machines(mach,
574 		    link_machines(base_mach, tail));
575 	}
576 
577 	return mach;
578 }
579 
580 
581 /* mkstate - create a state with a transition on a given symbol
582  *
583  * synopsis
584  *
585  *   state = mkstate( sym );
586  *
587  *     state - a new state matching sym
588  *     sym   - the symbol the new state is to have an out-transition on
589  *
590  * note that this routine makes new states in ascending order through the
591  * state array (and increments LASTNFA accordingly).  The routine DUPMACHINE
592  * relies on machines being made in ascending order and that they are
593  * CONTIGUOUS.  Change it and you will have to rewrite DUPMACHINE (kludge
594  * that it admittedly is)
595  */
596 
597 int
mkstate(int sym)598 mkstate(int sym)
599 {
600 	if (++lastnfa >= current_mns) {
601 		if ((current_mns += MNS_INCREMENT) >= maximum_mns)
602 			lerrif(_
603 			    ("input rules are too complicated (>= %d NFA states)"),
604 			    current_mns);
605 
606 		++num_reallocs;
607 
608 		firstst = reallocate_integer_array(firstst, current_mns);
609 		lastst = reallocate_integer_array(lastst, current_mns);
610 		finalst = reallocate_integer_array(finalst, current_mns);
611 		transchar =
612 		    reallocate_integer_array(transchar, current_mns);
613 		trans1 = reallocate_integer_array(trans1, current_mns);
614 		trans2 = reallocate_integer_array(trans2, current_mns);
615 		accptnum =
616 		    reallocate_integer_array(accptnum, current_mns);
617 		assoc_rule =
618 		    reallocate_integer_array(assoc_rule, current_mns);
619 		state_type =
620 		    reallocate_integer_array(state_type, current_mns);
621 	}
622 	firstst[lastnfa] = lastnfa;
623 	finalst[lastnfa] = lastnfa;
624 	lastst[lastnfa] = lastnfa;
625 	transchar[lastnfa] = sym;
626 	trans1[lastnfa] = NO_TRANSITION;
627 	trans2[lastnfa] = NO_TRANSITION;
628 	accptnum[lastnfa] = NIL;
629 	assoc_rule[lastnfa] = num_rules;
630 	state_type[lastnfa] = current_state_type;
631 
632 	/*
633 	 * Fix up equivalence classes base on this transition.  Note that any
634 	 * character which has its own transition gets its own equivalence
635 	 * class.  Thus only characters which are only in character classes
636 	 * have a chance at being in the same equivalence class.  E.g. "a|b"
637 	 * puts 'a' and 'b' into two different equivalence classes.  "[ab]"
638 	 * puts them in the same equivalence class (barring other differences
639 	 * elsewhere in the input).
640 	 */
641 
642 	if (sym < 0) {
643 		/*
644 		 * We don't have to update the equivalence classes since that
645 		 * was already done when the ccl was created for the first
646 		 * time.
647 		 */
648 	} else if (sym == SYM_EPSILON)
649 		++numeps;
650 
651 	else {
652 		check_char(sym);
653 
654 		if (useecs)
655 			/* Map NUL's to csize. */
656 			mkechar(sym ? sym : csize, nextecm, ecgroup);
657 	}
658 
659 	return lastnfa;
660 }
661 
662 
663 /* mkxtion - make a transition from one state to another
664  *
665  * synopsis
666  *
667  *   mkxtion( statefrom, stateto );
668  *
669  *     statefrom - the state from which the transition is to be made
670  *     stateto   - the state to which the transition is to be made
671  */
672 
673 void
mkxtion(int statefrom,int stateto)674 mkxtion(int statefrom, int stateto)
675 {
676 	if (trans1[statefrom] == NO_TRANSITION)
677 		trans1[statefrom] = stateto;
678 
679 	else if ((transchar[statefrom] != SYM_EPSILON) ||
680 	    (trans2[statefrom] != NO_TRANSITION))
681 		flexfatal(_("found too many transitions in mkxtion()"));
682 
683 	else {			/* second out-transition for an epsilon state */
684 		++eps2;
685 		trans2[statefrom] = stateto;
686 	}
687 }
688 
689 /* new_rule - initialize for a new rule */
690 
691 void
new_rule(void)692 new_rule(void)
693 {
694 	if (++num_rules >= current_max_rules) {
695 		++num_reallocs;
696 		current_max_rules += MAX_RULES_INCREMENT;
697 		rule_type = reallocate_integer_array(rule_type,
698 		    current_max_rules);
699 		rule_linenum = reallocate_integer_array(rule_linenum,
700 		    current_max_rules);
701 		rule_useful = reallocate_integer_array(rule_useful,
702 		    current_max_rules);
703 		rule_has_nl = reallocate_bool_array(rule_has_nl,
704 		    current_max_rules);
705 	}
706 	if (num_rules > MAX_RULE)
707 		lerrif(_("too many rules (> %d)!"), MAX_RULE);
708 
709 	rule_linenum[num_rules] = linenum;
710 	rule_useful[num_rules] = false;
711 	rule_has_nl[num_rules] = false;
712 }
713