1 /* $OpenBSD: nfa.c,v 1.12 2024/11/09 18:03:44 op Exp $ */
2
3 /* nfa - NFA construction routines */
4
5 /* Copyright (c) 1990 The Regents of the University of California. */
6 /* All rights reserved. */
7
8 /* This code is derived from software contributed to Berkeley by */
9 /* Vern Paxson. */
10
11 /* The United States Government has rights in this work pursuant */
12 /* to contract no. DE-AC03-76SF00098 between the United States */
13 /* Department of Energy and the University of California. */
14
15 /* This file is part of flex. */
16
17 /* Redistribution and use in source and binary forms, with or without */
18 /* modification, are permitted provided that the following conditions */
19 /* are met: */
20
21 /* 1. Redistributions of source code must retain the above copyright */
22 /* notice, this list of conditions and the following disclaimer. */
23 /* 2. Redistributions in binary form must reproduce the above copyright */
24 /* notice, this list of conditions and the following disclaimer in the */
25 /* documentation and/or other materials provided with the distribution. */
26
27 /* Neither the name of the University nor the names of its contributors */
28 /* may be used to endorse or promote products derived from this software */
29 /* without specific prior written permission. */
30
31 /* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR */
32 /* IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED */
33 /* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR */
34 /* PURPOSE. */
35
36 #include "flexdef.h"
37
38
39 /* declare functions that have forward references */
40
41 int dupmachine PROTO((int));
42 void mkxtion PROTO((int, int));
43
44
45 /* add_accept - add an accepting state to a machine
46 *
47 * accepting_number becomes mach's accepting number.
48 */
49
50 void
add_accept(int mach,int accepting_number)51 add_accept(int mach, int accepting_number)
52 {
53 /*
54 * Hang the accepting number off an epsilon state. if it is
55 * associated with a state that has a non-epsilon out-transition,
56 * then the state will accept BEFORE it makes that transition, i.e.,
57 * one character too soon.
58 */
59
60 if (transchar[finalst[mach]] == SYM_EPSILON)
61 accptnum[finalst[mach]] = accepting_number;
62
63 else {
64 int astate = mkstate(SYM_EPSILON);
65
66 accptnum[astate] = accepting_number;
67 (void) link_machines(mach, astate);
68 }
69 }
70
71
72 /* copysingl - make a given number of copies of a singleton machine
73 *
74 * synopsis
75 *
76 * newsng = copysingl( singl, num );
77 *
78 * newsng - a new singleton composed of num copies of singl
79 * singl - a singleton machine
80 * num - the number of copies of singl to be present in newsng
81 */
82
83 int
copysingl(int singl,int num)84 copysingl(int singl, int num)
85 {
86 int copy, i;
87
88 copy = mkstate(SYM_EPSILON);
89
90 for (i = 1; i <= num; ++i)
91 copy = link_machines(copy, dupmachine(singl));
92
93 return copy;
94 }
95
96
97 /* dumpnfa - debugging routine to write out an nfa */
98
99 void
dumpnfa(int state1)100 dumpnfa(int state1)
101 {
102 int sym, tsp1, tsp2, anum, ns;
103
104 fprintf(stderr,
105 _
106 ("\n\n********** beginning dump of nfa with start state %d\n"),
107 state1);
108
109 /*
110 * We probably should loop starting at firstst[state1] and going to
111 * lastst[state1], but they're not maintained properly when we "or"
112 * all of the rules together. So we use our knowledge that the
113 * machine starts at state 1 and ends at lastnfa.
114 */
115
116 /* for ( ns = firstst[state1]; ns <= lastst[state1]; ++ns ) */
117 for (ns = 1; ns <= lastnfa; ++ns) {
118 fprintf(stderr, _("state # %4d\t"), ns);
119
120 sym = transchar[ns];
121 tsp1 = trans1[ns];
122 tsp2 = trans2[ns];
123 anum = accptnum[ns];
124
125 fprintf(stderr, "%3d: %4d, %4d", sym, tsp1, tsp2);
126
127 if (anum != NIL)
128 fprintf(stderr, " [%d]", anum);
129
130 fprintf(stderr, "\n");
131 }
132
133 fprintf(stderr, _("********** end of dump\n"));
134 }
135
136
137 /* dupmachine - make a duplicate of a given machine
138 *
139 * synopsis
140 *
141 * copy = dupmachine( mach );
142 *
143 * copy - holds duplicate of mach
144 * mach - machine to be duplicated
145 *
146 * note that the copy of mach is NOT an exact duplicate; rather, all the
147 * transition states values are adjusted so that the copy is self-contained,
148 * as the original should have been.
149 *
150 * also note that the original MUST be contiguous, with its low and high
151 * states accessible by the arrays firstst and lastst
152 */
153
154 int
dupmachine(int mach)155 dupmachine(int mach)
156 {
157 int i, init, state_offset;
158 int state = 0;
159 int last = lastst[mach];
160
161 for (i = firstst[mach]; i <= last; ++i) {
162 state = mkstate(transchar[i]);
163
164 if (trans1[i] != NO_TRANSITION) {
165 mkxtion(finalst[state], trans1[i] + state - i);
166
167 if (transchar[i] == SYM_EPSILON &&
168 trans2[i] != NO_TRANSITION)
169 mkxtion(finalst[state],
170 trans2[i] + state - i);
171 }
172 accptnum[state] = accptnum[i];
173 }
174
175 if (state == 0)
176 flexfatal(_("empty machine in dupmachine()"));
177
178 state_offset = state - i + 1;
179
180 init = mach + state_offset;
181 firstst[init] = firstst[mach] + state_offset;
182 finalst[init] = finalst[mach] + state_offset;
183 lastst[init] = lastst[mach] + state_offset;
184
185 return init;
186 }
187
188
189 /* finish_rule - finish up the processing for a rule
190 *
191 * An accepting number is added to the given machine. If variable_trail_rule
192 * is true then the rule has trailing context and both the head and trail
193 * are variable size. Otherwise if headcnt or trailcnt is non-zero then
194 * the machine recognizes a pattern with trailing context and headcnt is
195 * the number of characters in the matched part of the pattern, or zero
196 * if the matched part has variable length. trailcnt is the number of
197 * trailing context characters in the pattern, or zero if the trailing
198 * context has variable length.
199 */
200
201 void
finish_rule(int mach,int variable_trail_rule,int headcnt,int trailcnt,int pcont_act)202 finish_rule(int mach, int variable_trail_rule, int headcnt, int trailcnt,
203 int pcont_act)
204 {
205 char action_text[MAXLINE];
206
207 add_accept(mach, num_rules);
208
209 /*
210 * We did this in new_rule(), but it often gets the wrong number
211 * because we do it before we start parsing the current rule.
212 */
213 rule_linenum[num_rules] = linenum;
214
215 /*
216 * If this is a continued action, then the line-number has already
217 * been updated, giving us the wrong number.
218 */
219 if (continued_action)
220 --rule_linenum[num_rules];
221
222
223 /*
224 * If the previous rule was continued action, then we inherit the
225 * previous newline flag, possibly overriding the current one.
226 */
227 if (pcont_act && rule_has_nl[num_rules - 1])
228 rule_has_nl[num_rules] = true;
229
230 snprintf(action_text, sizeof(action_text), "case %d:\n", num_rules);
231 add_action(action_text);
232 if (rule_has_nl[num_rules]) {
233 snprintf(action_text, sizeof(action_text), "/* rule %d can match eol */\n",
234 num_rules);
235 add_action(action_text);
236 }
237 if (variable_trail_rule) {
238 rule_type[num_rules] = RULE_VARIABLE;
239
240 if (performance_report > 0)
241 fprintf(stderr,
242 _
243 ("Variable trailing context rule at line %d\n"),
244 rule_linenum[num_rules]);
245
246 variable_trailing_context_rules = true;
247 } else {
248 rule_type[num_rules] = RULE_NORMAL;
249
250 if (headcnt > 0 || trailcnt > 0) {
251 /*
252 * Do trailing context magic to not match the
253 * trailing characters.
254 */
255 char *scanner_cp = "YY_G(yy_c_buf_p) = yy_cp";
256 char *scanner_bp = "yy_bp";
257
258 add_action
259 ("*yy_cp = YY_G(yy_hold_char); /* undo effects of setting up yytext */\n");
260
261 if (headcnt > 0) {
262 if (rule_has_nl[num_rules]) {
263 snprintf(action_text, sizeof(action_text),
264 "YY_LINENO_REWIND_TO(%s + %d);\n", scanner_bp, headcnt);
265 add_action(action_text);
266 }
267 snprintf(action_text, sizeof(action_text), "%s = %s + %d;\n",
268 scanner_cp, scanner_bp, headcnt);
269 add_action(action_text);
270 } else {
271 if (rule_has_nl[num_rules]) {
272 snprintf(action_text, sizeof(action_text),
273 "YY_LINENO_REWIND_TO(yy_cp - %d);\n", trailcnt);
274 add_action(action_text);
275 }
276 snprintf(action_text, sizeof(action_text), "%s -= %d;\n",
277 scanner_cp, trailcnt);
278 add_action(action_text);
279 }
280
281 add_action
282 ("YY_DO_BEFORE_ACTION; /* set up yytext again */\n");
283 }
284 }
285
286 /*
287 * Okay, in the action code at this point yytext and yyleng have
288 * their proper final values for this rule, so here's the point to do
289 * any user action. But don't do it for continued actions, as
290 * that'll result in multiple YY_RULE_SETUP's.
291 */
292 if (!continued_action)
293 add_action("YY_RULE_SETUP\n");
294
295 line_directive_out((FILE *) 0, 1);
296 }
297
298
299 /* link_machines - connect two machines together
300 *
301 * synopsis
302 *
303 * new = link_machines( first, last );
304 *
305 * new - a machine constructed by connecting first to last
306 * first - the machine whose successor is to be last
307 * last - the machine whose predecessor is to be first
308 *
309 * note: this routine concatenates the machine first with the machine
310 * last to produce a machine new which will pattern-match first first
311 * and then last, and will fail if either of the sub-patterns fails.
312 * FIRST is set to new by the operation. last is unmolested.
313 */
314
315 int
link_machines(int first,int last)316 link_machines(int first, int last)
317 {
318 if (first == NIL)
319 return last;
320
321 else if (last == NIL)
322 return first;
323
324 else {
325 mkxtion(finalst[first], last);
326 finalst[first] = finalst[last];
327 lastst[first] = MAX(lastst[first], lastst[last]);
328 firstst[first] = MIN(firstst[first], firstst[last]);
329
330 return first;
331 }
332 }
333
334
335 /* mark_beginning_as_normal - mark each "beginning" state in a machine
336 * as being a "normal" (i.e., not trailing context-
337 * associated) states
338 *
339 * The "beginning" states are the epsilon closure of the first state
340 */
341
342 void
mark_beginning_as_normal(int mach)343 mark_beginning_as_normal(int mach)
344 {
345 switch (state_type[mach]) {
346 case STATE_NORMAL:
347 /* Oh, we've already visited here. */
348 return;
349
350 case STATE_TRAILING_CONTEXT:
351 state_type[mach] = STATE_NORMAL;
352
353 if (transchar[mach] == SYM_EPSILON) {
354 if (trans1[mach] != NO_TRANSITION)
355 mark_beginning_as_normal(trans1[mach]);
356
357 if (trans2[mach] != NO_TRANSITION)
358 mark_beginning_as_normal(trans2[mach]);
359 }
360 break;
361
362 default:
363 flexerror(_
364 ("bad state type in mark_beginning_as_normal()"));
365 break;
366 }
367 }
368
369
370 /* mkbranch - make a machine that branches to two machines
371 *
372 * synopsis
373 *
374 * branch = mkbranch( first, second );
375 *
376 * branch - a machine which matches either first's pattern or second's
377 * first, second - machines whose patterns are to be or'ed (the | operator)
378 *
379 * Note that first and second are NEITHER destroyed by the operation. Also,
380 * the resulting machine CANNOT be used with any other "mk" operation except
381 * more mkbranch's. Compare with mkor()
382 */
383
384 int
mkbranch(int first,int second)385 mkbranch(int first, int second)
386 {
387 int eps;
388
389 if (first == NO_TRANSITION)
390 return second;
391
392 else if (second == NO_TRANSITION)
393 return first;
394
395 eps = mkstate(SYM_EPSILON);
396
397 mkxtion(eps, first);
398 mkxtion(eps, second);
399
400 return eps;
401 }
402
403
404 /* mkclos - convert a machine into a closure
405 *
406 * synopsis
407 * new = mkclos( state );
408 *
409 * new - a new state which matches the closure of "state"
410 */
411
412 int
mkclos(int state)413 mkclos(int state)
414 {
415 return mkopt(mkposcl(state));
416 }
417
418
419 /* mkopt - make a machine optional
420 *
421 * synopsis
422 *
423 * new = mkopt( mach );
424 *
425 * new - a machine which optionally matches whatever mach matched
426 * mach - the machine to make optional
427 *
428 * notes:
429 * 1. mach must be the last machine created
430 * 2. mach is destroyed by the call
431 */
432
433 int
mkopt(int mach)434 mkopt(int mach)
435 {
436 int eps;
437
438 if (!SUPER_FREE_EPSILON(finalst[mach])) {
439 eps = mkstate(SYM_EPSILON);
440 mach = link_machines(mach, eps);
441 }
442 /*
443 * Can't skimp on the following if FREE_EPSILON(mach) is true because
444 * some state interior to "mach" might point back to the beginning
445 * for a closure.
446 */
447 eps = mkstate(SYM_EPSILON);
448 mach = link_machines(eps, mach);
449
450 mkxtion(mach, finalst[mach]);
451
452 return mach;
453 }
454
455
456 /* mkor - make a machine that matches either one of two machines
457 *
458 * synopsis
459 *
460 * new = mkor( first, second );
461 *
462 * new - a machine which matches either first's pattern or second's
463 * first, second - machines whose patterns are to be or'ed (the | operator)
464 *
465 * note that first and second are both destroyed by the operation
466 * the code is rather convoluted because an attempt is made to minimize
467 * the number of epsilon states needed
468 */
469
470 int
mkor(int first,int second)471 mkor(int first, int second)
472 {
473 int eps, orend;
474
475 if (first == NIL)
476 return second;
477
478 else if (second == NIL)
479 return first;
480
481 else {
482 /*
483 * See comment in mkopt() about why we can't use the first
484 * state of "first" or "second" if they satisfy
485 * "FREE_EPSILON".
486 */
487 eps = mkstate(SYM_EPSILON);
488
489 first = link_machines(eps, first);
490
491 mkxtion(first, second);
492
493 if (SUPER_FREE_EPSILON(finalst[first]) &&
494 accptnum[finalst[first]] == NIL) {
495 orend = finalst[first];
496 mkxtion(finalst[second], orend);
497 } else if (SUPER_FREE_EPSILON(finalst[second]) &&
498 accptnum[finalst[second]] == NIL) {
499 orend = finalst[second];
500 mkxtion(finalst[first], orend);
501 } else {
502 eps = mkstate(SYM_EPSILON);
503
504 first = link_machines(first, eps);
505 orend = finalst[first];
506
507 mkxtion(finalst[second], orend);
508 }
509 }
510
511 finalst[first] = orend;
512 return first;
513 }
514
515
516 /* mkposcl - convert a machine into a positive closure
517 *
518 * synopsis
519 * new = mkposcl( state );
520 *
521 * new - a machine matching the positive closure of "state"
522 */
523
524 int
mkposcl(int state)525 mkposcl(int state)
526 {
527 int eps;
528
529 if (SUPER_FREE_EPSILON(finalst[state])) {
530 mkxtion(finalst[state], state);
531 return state;
532 } else {
533 eps = mkstate(SYM_EPSILON);
534 mkxtion(eps, state);
535 return link_machines(state, eps);
536 }
537 }
538
539
540 /* mkrep - make a replicated machine
541 *
542 * synopsis
543 * new = mkrep( mach, lb, ub );
544 *
545 * new - a machine that matches whatever "mach" matched from "lb"
546 * number of times to "ub" number of times
547 *
548 * note
549 * if "ub" is INFINITE_REPEAT then "new" matches "lb" or more occurrences of "mach"
550 */
551
552 int
mkrep(int mach,int lb,int ub)553 mkrep(int mach, int lb, int ub)
554 {
555 int base_mach, tail, copy, i;
556
557 base_mach = copysingl(mach, lb - 1);
558
559 if (ub == INFINITE_REPEAT) {
560 copy = dupmachine(mach);
561 mach = link_machines(mach,
562 link_machines(base_mach,
563 mkclos(copy)));
564 } else {
565 tail = mkstate(SYM_EPSILON);
566
567 for (i = lb; i < ub; ++i) {
568 copy = dupmachine(mach);
569 tail = mkopt(link_machines(copy, tail));
570 }
571
572 mach =
573 link_machines(mach,
574 link_machines(base_mach, tail));
575 }
576
577 return mach;
578 }
579
580
581 /* mkstate - create a state with a transition on a given symbol
582 *
583 * synopsis
584 *
585 * state = mkstate( sym );
586 *
587 * state - a new state matching sym
588 * sym - the symbol the new state is to have an out-transition on
589 *
590 * note that this routine makes new states in ascending order through the
591 * state array (and increments LASTNFA accordingly). The routine DUPMACHINE
592 * relies on machines being made in ascending order and that they are
593 * CONTIGUOUS. Change it and you will have to rewrite DUPMACHINE (kludge
594 * that it admittedly is)
595 */
596
597 int
mkstate(int sym)598 mkstate(int sym)
599 {
600 if (++lastnfa >= current_mns) {
601 if ((current_mns += MNS_INCREMENT) >= maximum_mns)
602 lerrif(_
603 ("input rules are too complicated (>= %d NFA states)"),
604 current_mns);
605
606 ++num_reallocs;
607
608 firstst = reallocate_integer_array(firstst, current_mns);
609 lastst = reallocate_integer_array(lastst, current_mns);
610 finalst = reallocate_integer_array(finalst, current_mns);
611 transchar =
612 reallocate_integer_array(transchar, current_mns);
613 trans1 = reallocate_integer_array(trans1, current_mns);
614 trans2 = reallocate_integer_array(trans2, current_mns);
615 accptnum =
616 reallocate_integer_array(accptnum, current_mns);
617 assoc_rule =
618 reallocate_integer_array(assoc_rule, current_mns);
619 state_type =
620 reallocate_integer_array(state_type, current_mns);
621 }
622 firstst[lastnfa] = lastnfa;
623 finalst[lastnfa] = lastnfa;
624 lastst[lastnfa] = lastnfa;
625 transchar[lastnfa] = sym;
626 trans1[lastnfa] = NO_TRANSITION;
627 trans2[lastnfa] = NO_TRANSITION;
628 accptnum[lastnfa] = NIL;
629 assoc_rule[lastnfa] = num_rules;
630 state_type[lastnfa] = current_state_type;
631
632 /*
633 * Fix up equivalence classes base on this transition. Note that any
634 * character which has its own transition gets its own equivalence
635 * class. Thus only characters which are only in character classes
636 * have a chance at being in the same equivalence class. E.g. "a|b"
637 * puts 'a' and 'b' into two different equivalence classes. "[ab]"
638 * puts them in the same equivalence class (barring other differences
639 * elsewhere in the input).
640 */
641
642 if (sym < 0) {
643 /*
644 * We don't have to update the equivalence classes since that
645 * was already done when the ccl was created for the first
646 * time.
647 */
648 } else if (sym == SYM_EPSILON)
649 ++numeps;
650
651 else {
652 check_char(sym);
653
654 if (useecs)
655 /* Map NUL's to csize. */
656 mkechar(sym ? sym : csize, nextecm, ecgroup);
657 }
658
659 return lastnfa;
660 }
661
662
663 /* mkxtion - make a transition from one state to another
664 *
665 * synopsis
666 *
667 * mkxtion( statefrom, stateto );
668 *
669 * statefrom - the state from which the transition is to be made
670 * stateto - the state to which the transition is to be made
671 */
672
673 void
mkxtion(int statefrom,int stateto)674 mkxtion(int statefrom, int stateto)
675 {
676 if (trans1[statefrom] == NO_TRANSITION)
677 trans1[statefrom] = stateto;
678
679 else if ((transchar[statefrom] != SYM_EPSILON) ||
680 (trans2[statefrom] != NO_TRANSITION))
681 flexfatal(_("found too many transitions in mkxtion()"));
682
683 else { /* second out-transition for an epsilon state */
684 ++eps2;
685 trans2[statefrom] = stateto;
686 }
687 }
688
689 /* new_rule - initialize for a new rule */
690
691 void
new_rule(void)692 new_rule(void)
693 {
694 if (++num_rules >= current_max_rules) {
695 ++num_reallocs;
696 current_max_rules += MAX_RULES_INCREMENT;
697 rule_type = reallocate_integer_array(rule_type,
698 current_max_rules);
699 rule_linenum = reallocate_integer_array(rule_linenum,
700 current_max_rules);
701 rule_useful = reallocate_integer_array(rule_useful,
702 current_max_rules);
703 rule_has_nl = reallocate_bool_array(rule_has_nl,
704 current_max_rules);
705 }
706 if (num_rules > MAX_RULE)
707 lerrif(_("too many rules (> %d)!"), MAX_RULE);
708
709 rule_linenum[num_rules] = linenum;
710 rule_useful[num_rules] = false;
711 rule_has_nl[num_rules] = false;
712 }
713