1 /* $OpenBSD: nfa.c,v 1.11 2015/11/19 22:52:40 tedu Exp $ */ 2 3 /* nfa - NFA construction routines */ 4 5 /* Copyright (c) 1990 The Regents of the University of California. */ 6 /* All rights reserved. */ 7 8 /* This code is derived from software contributed to Berkeley by */ 9 /* Vern Paxson. */ 10 11 /* The United States Government has rights in this work pursuant */ 12 /* to contract no. DE-AC03-76SF00098 between the United States */ 13 /* Department of Energy and the University of California. */ 14 15 /* This file is part of flex. */ 16 17 /* Redistribution and use in source and binary forms, with or without */ 18 /* modification, are permitted provided that the following conditions */ 19 /* are met: */ 20 21 /* 1. Redistributions of source code must retain the above copyright */ 22 /* notice, this list of conditions and the following disclaimer. */ 23 /* 2. Redistributions in binary form must reproduce the above copyright */ 24 /* notice, this list of conditions and the following disclaimer in the */ 25 /* documentation and/or other materials provided with the distribution. */ 26 27 /* Neither the name of the University nor the names of its contributors */ 28 /* may be used to endorse or promote products derived from this software */ 29 /* without specific prior written permission. */ 30 31 /* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR */ 32 /* IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED */ 33 /* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR */ 34 /* PURPOSE. */ 35 36 #include "flexdef.h" 37 38 39 /* declare functions that have forward references */ 40 41 int dupmachine PROTO((int)); 42 void mkxtion PROTO((int, int)); 43 44 45 /* add_accept - add an accepting state to a machine 46 * 47 * accepting_number becomes mach's accepting number. 48 */ 49 50 void 51 add_accept(mach, accepting_number) 52 int mach, accepting_number; 53 { 54 /* 55 * Hang the accepting number off an epsilon state. if it is 56 * associated with a state that has a non-epsilon out-transition, 57 * then the state will accept BEFORE it makes that transition, i.e., 58 * one character too soon. 59 */ 60 61 if (transchar[finalst[mach]] == SYM_EPSILON) 62 accptnum[finalst[mach]] = accepting_number; 63 64 else { 65 int astate = mkstate(SYM_EPSILON); 66 67 accptnum[astate] = accepting_number; 68 (void) link_machines(mach, astate); 69 } 70 } 71 72 73 /* copysingl - make a given number of copies of a singleton machine 74 * 75 * synopsis 76 * 77 * newsng = copysingl( singl, num ); 78 * 79 * newsng - a new singleton composed of num copies of singl 80 * singl - a singleton machine 81 * num - the number of copies of singl to be present in newsng 82 */ 83 84 int 85 copysingl(singl, num) 86 int singl, num; 87 { 88 int copy, i; 89 90 copy = mkstate(SYM_EPSILON); 91 92 for (i = 1; i <= num; ++i) 93 copy = link_machines(copy, dupmachine(singl)); 94 95 return copy; 96 } 97 98 99 /* dumpnfa - debugging routine to write out an nfa */ 100 101 void 102 dumpnfa(state1) 103 int state1; 104 105 { 106 int sym, tsp1, tsp2, anum, ns; 107 108 fprintf(stderr, 109 _ 110 ("\n\n********** beginning dump of nfa with start state %d\n"), 111 state1); 112 113 /* 114 * We probably should loop starting at firstst[state1] and going to 115 * lastst[state1], but they're not maintained properly when we "or" 116 * all of the rules together. So we use our knowledge that the 117 * machine starts at state 1 and ends at lastnfa. 118 */ 119 120 /* for ( ns = firstst[state1]; ns <= lastst[state1]; ++ns ) */ 121 for (ns = 1; ns <= lastnfa; ++ns) { 122 fprintf(stderr, _("state # %4d\t"), ns); 123 124 sym = transchar[ns]; 125 tsp1 = trans1[ns]; 126 tsp2 = trans2[ns]; 127 anum = accptnum[ns]; 128 129 fprintf(stderr, "%3d: %4d, %4d", sym, tsp1, tsp2); 130 131 if (anum != NIL) 132 fprintf(stderr, " [%d]", anum); 133 134 fprintf(stderr, "\n"); 135 } 136 137 fprintf(stderr, _("********** end of dump\n")); 138 } 139 140 141 /* dupmachine - make a duplicate of a given machine 142 * 143 * synopsis 144 * 145 * copy = dupmachine( mach ); 146 * 147 * copy - holds duplicate of mach 148 * mach - machine to be duplicated 149 * 150 * note that the copy of mach is NOT an exact duplicate; rather, all the 151 * transition states values are adjusted so that the copy is self-contained, 152 * as the original should have been. 153 * 154 * also note that the original MUST be contiguous, with its low and high 155 * states accessible by the arrays firstst and lastst 156 */ 157 158 int 159 dupmachine(mach) 160 int mach; 161 { 162 int i, init, state_offset; 163 int state = 0; 164 int last = lastst[mach]; 165 166 for (i = firstst[mach]; i <= last; ++i) { 167 state = mkstate(transchar[i]); 168 169 if (trans1[i] != NO_TRANSITION) { 170 mkxtion(finalst[state], trans1[i] + state - i); 171 172 if (transchar[i] == SYM_EPSILON && 173 trans2[i] != NO_TRANSITION) 174 mkxtion(finalst[state], 175 trans2[i] + state - i); 176 } 177 accptnum[state] = accptnum[i]; 178 } 179 180 if (state == 0) 181 flexfatal(_("empty machine in dupmachine()")); 182 183 state_offset = state - i + 1; 184 185 init = mach + state_offset; 186 firstst[init] = firstst[mach] + state_offset; 187 finalst[init] = finalst[mach] + state_offset; 188 lastst[init] = lastst[mach] + state_offset; 189 190 return init; 191 } 192 193 194 /* finish_rule - finish up the processing for a rule 195 * 196 * An accepting number is added to the given machine. If variable_trail_rule 197 * is true then the rule has trailing context and both the head and trail 198 * are variable size. Otherwise if headcnt or trailcnt is non-zero then 199 * the machine recognizes a pattern with trailing context and headcnt is 200 * the number of characters in the matched part of the pattern, or zero 201 * if the matched part has variable length. trailcnt is the number of 202 * trailing context characters in the pattern, or zero if the trailing 203 * context has variable length. 204 */ 205 206 void 207 finish_rule(mach, variable_trail_rule, headcnt, trailcnt, 208 pcont_act) 209 int mach, variable_trail_rule, headcnt, trailcnt, pcont_act; 210 { 211 char action_text[MAXLINE]; 212 213 add_accept(mach, num_rules); 214 215 /* 216 * We did this in new_rule(), but it often gets the wrong number 217 * because we do it before we start parsing the current rule. 218 */ 219 rule_linenum[num_rules] = linenum; 220 221 /* 222 * If this is a continued action, then the line-number has already 223 * been updated, giving us the wrong number. 224 */ 225 if (continued_action) 226 --rule_linenum[num_rules]; 227 228 229 /* 230 * If the previous rule was continued action, then we inherit the 231 * previous newline flag, possibly overriding the current one. 232 */ 233 if (pcont_act && rule_has_nl[num_rules - 1]) 234 rule_has_nl[num_rules] = true; 235 236 snprintf(action_text, sizeof(action_text), "case %d:\n", num_rules); 237 add_action(action_text); 238 if (rule_has_nl[num_rules]) { 239 snprintf(action_text, sizeof(action_text), "/* rule %d can match eol */\n", 240 num_rules); 241 add_action(action_text); 242 } 243 if (variable_trail_rule) { 244 rule_type[num_rules] = RULE_VARIABLE; 245 246 if (performance_report > 0) 247 fprintf(stderr, 248 _ 249 ("Variable trailing context rule at line %d\n"), 250 rule_linenum[num_rules]); 251 252 variable_trailing_context_rules = true; 253 } else { 254 rule_type[num_rules] = RULE_NORMAL; 255 256 if (headcnt > 0 || trailcnt > 0) { 257 /* 258 * Do trailing context magic to not match the 259 * trailing characters. 260 */ 261 char *scanner_cp = "YY_G(yy_c_buf_p) = yy_cp"; 262 char *scanner_bp = "yy_bp"; 263 264 add_action 265 ("*yy_cp = YY_G(yy_hold_char); /* undo effects of setting up yytext */\n"); 266 267 if (headcnt > 0) { 268 if (rule_has_nl[num_rules]) { 269 snprintf(action_text, sizeof(action_text), 270 "YY_LINENO_REWIND_TO(%s + %d);\n", scanner_bp, headcnt); 271 add_action(action_text); 272 } 273 snprintf(action_text, sizeof(action_text), "%s = %s + %d;\n", 274 scanner_cp, scanner_bp, headcnt); 275 add_action(action_text); 276 } else { 277 if (rule_has_nl[num_rules]) { 278 snprintf(action_text, sizeof(action_text), 279 "YY_LINENO_REWIND_TO(yy_cp - %d);\n", trailcnt); 280 add_action(action_text); 281 } 282 snprintf(action_text, sizeof(action_text), "%s -= %d;\n", 283 scanner_cp, trailcnt); 284 add_action(action_text); 285 } 286 287 add_action 288 ("YY_DO_BEFORE_ACTION; /* set up yytext again */\n"); 289 } 290 } 291 292 /* 293 * Okay, in the action code at this point yytext and yyleng have 294 * their proper final values for this rule, so here's the point to do 295 * any user action. But don't do it for continued actions, as 296 * that'll result in multiple YY_RULE_SETUP's. 297 */ 298 if (!continued_action) 299 add_action("YY_RULE_SETUP\n"); 300 301 line_directive_out((FILE *) 0, 1); 302 } 303 304 305 /* link_machines - connect two machines together 306 * 307 * synopsis 308 * 309 * new = link_machines( first, last ); 310 * 311 * new - a machine constructed by connecting first to last 312 * first - the machine whose successor is to be last 313 * last - the machine whose predecessor is to be first 314 * 315 * note: this routine concatenates the machine first with the machine 316 * last to produce a machine new which will pattern-match first first 317 * and then last, and will fail if either of the sub-patterns fails. 318 * FIRST is set to new by the operation. last is unmolested. 319 */ 320 321 int 322 link_machines(first, last) 323 int first, last; 324 { 325 if (first == NIL) 326 return last; 327 328 else if (last == NIL) 329 return first; 330 331 else { 332 mkxtion(finalst[first], last); 333 finalst[first] = finalst[last]; 334 lastst[first] = MAX(lastst[first], lastst[last]); 335 firstst[first] = MIN(firstst[first], firstst[last]); 336 337 return first; 338 } 339 } 340 341 342 /* mark_beginning_as_normal - mark each "beginning" state in a machine 343 * as being a "normal" (i.e., not trailing context- 344 * associated) states 345 * 346 * The "beginning" states are the epsilon closure of the first state 347 */ 348 349 void 350 mark_beginning_as_normal(mach) 351 int mach; 352 { 353 switch (state_type[mach]) { 354 case STATE_NORMAL: 355 /* Oh, we've already visited here. */ 356 return; 357 358 case STATE_TRAILING_CONTEXT: 359 state_type[mach] = STATE_NORMAL; 360 361 if (transchar[mach] == SYM_EPSILON) { 362 if (trans1[mach] != NO_TRANSITION) 363 mark_beginning_as_normal(trans1[mach]); 364 365 if (trans2[mach] != NO_TRANSITION) 366 mark_beginning_as_normal(trans2[mach]); 367 } 368 break; 369 370 default: 371 flexerror(_ 372 ("bad state type in mark_beginning_as_normal()")); 373 break; 374 } 375 } 376 377 378 /* mkbranch - make a machine that branches to two machines 379 * 380 * synopsis 381 * 382 * branch = mkbranch( first, second ); 383 * 384 * branch - a machine which matches either first's pattern or second's 385 * first, second - machines whose patterns are to be or'ed (the | operator) 386 * 387 * Note that first and second are NEITHER destroyed by the operation. Also, 388 * the resulting machine CANNOT be used with any other "mk" operation except 389 * more mkbranch's. Compare with mkor() 390 */ 391 392 int 393 mkbranch(first, second) 394 int first, second; 395 { 396 int eps; 397 398 if (first == NO_TRANSITION) 399 return second; 400 401 else if (second == NO_TRANSITION) 402 return first; 403 404 eps = mkstate(SYM_EPSILON); 405 406 mkxtion(eps, first); 407 mkxtion(eps, second); 408 409 return eps; 410 } 411 412 413 /* mkclos - convert a machine into a closure 414 * 415 * synopsis 416 * new = mkclos( state ); 417 * 418 * new - a new state which matches the closure of "state" 419 */ 420 421 int 422 mkclos(state) 423 int state; 424 { 425 return mkopt(mkposcl(state)); 426 } 427 428 429 /* mkopt - make a machine optional 430 * 431 * synopsis 432 * 433 * new = mkopt( mach ); 434 * 435 * new - a machine which optionally matches whatever mach matched 436 * mach - the machine to make optional 437 * 438 * notes: 439 * 1. mach must be the last machine created 440 * 2. mach is destroyed by the call 441 */ 442 443 int 444 mkopt(mach) 445 int mach; 446 { 447 int eps; 448 449 if (!SUPER_FREE_EPSILON(finalst[mach])) { 450 eps = mkstate(SYM_EPSILON); 451 mach = link_machines(mach, eps); 452 } 453 /* 454 * Can't skimp on the following if FREE_EPSILON(mach) is true because 455 * some state interior to "mach" might point back to the beginning 456 * for a closure. 457 */ 458 eps = mkstate(SYM_EPSILON); 459 mach = link_machines(eps, mach); 460 461 mkxtion(mach, finalst[mach]); 462 463 return mach; 464 } 465 466 467 /* mkor - make a machine that matches either one of two machines 468 * 469 * synopsis 470 * 471 * new = mkor( first, second ); 472 * 473 * new - a machine which matches either first's pattern or second's 474 * first, second - machines whose patterns are to be or'ed (the | operator) 475 * 476 * note that first and second are both destroyed by the operation 477 * the code is rather convoluted because an attempt is made to minimize 478 * the number of epsilon states needed 479 */ 480 481 int 482 mkor(first, second) 483 int first, second; 484 { 485 int eps, orend; 486 487 if (first == NIL) 488 return second; 489 490 else if (second == NIL) 491 return first; 492 493 else { 494 /* 495 * See comment in mkopt() about why we can't use the first 496 * state of "first" or "second" if they satisfy 497 * "FREE_EPSILON". 498 */ 499 eps = mkstate(SYM_EPSILON); 500 501 first = link_machines(eps, first); 502 503 mkxtion(first, second); 504 505 if (SUPER_FREE_EPSILON(finalst[first]) && 506 accptnum[finalst[first]] == NIL) { 507 orend = finalst[first]; 508 mkxtion(finalst[second], orend); 509 } else if (SUPER_FREE_EPSILON(finalst[second]) && 510 accptnum[finalst[second]] == NIL) { 511 orend = finalst[second]; 512 mkxtion(finalst[first], orend); 513 } else { 514 eps = mkstate(SYM_EPSILON); 515 516 first = link_machines(first, eps); 517 orend = finalst[first]; 518 519 mkxtion(finalst[second], orend); 520 } 521 } 522 523 finalst[first] = orend; 524 return first; 525 } 526 527 528 /* mkposcl - convert a machine into a positive closure 529 * 530 * synopsis 531 * new = mkposcl( state ); 532 * 533 * new - a machine matching the positive closure of "state" 534 */ 535 536 int 537 mkposcl(state) 538 int state; 539 { 540 int eps; 541 542 if (SUPER_FREE_EPSILON(finalst[state])) { 543 mkxtion(finalst[state], state); 544 return state; 545 } else { 546 eps = mkstate(SYM_EPSILON); 547 mkxtion(eps, state); 548 return link_machines(state, eps); 549 } 550 } 551 552 553 /* mkrep - make a replicated machine 554 * 555 * synopsis 556 * new = mkrep( mach, lb, ub ); 557 * 558 * new - a machine that matches whatever "mach" matched from "lb" 559 * number of times to "ub" number of times 560 * 561 * note 562 * if "ub" is INFINITE_REPEAT then "new" matches "lb" or more occurrences of "mach" 563 */ 564 565 int 566 mkrep(mach, lb, ub) 567 int mach, lb, ub; 568 { 569 int base_mach, tail, copy, i; 570 571 base_mach = copysingl(mach, lb - 1); 572 573 if (ub == INFINITE_REPEAT) { 574 copy = dupmachine(mach); 575 mach = link_machines(mach, 576 link_machines(base_mach, 577 mkclos(copy))); 578 } else { 579 tail = mkstate(SYM_EPSILON); 580 581 for (i = lb; i < ub; ++i) { 582 copy = dupmachine(mach); 583 tail = mkopt(link_machines(copy, tail)); 584 } 585 586 mach = 587 link_machines(mach, 588 link_machines(base_mach, tail)); 589 } 590 591 return mach; 592 } 593 594 595 /* mkstate - create a state with a transition on a given symbol 596 * 597 * synopsis 598 * 599 * state = mkstate( sym ); 600 * 601 * state - a new state matching sym 602 * sym - the symbol the new state is to have an out-transition on 603 * 604 * note that this routine makes new states in ascending order through the 605 * state array (and increments LASTNFA accordingly). The routine DUPMACHINE 606 * relies on machines being made in ascending order and that they are 607 * CONTIGUOUS. Change it and you will have to rewrite DUPMACHINE (kludge 608 * that it admittedly is) 609 */ 610 611 int 612 mkstate(sym) 613 int sym; 614 { 615 if (++lastnfa >= current_mns) { 616 if ((current_mns += MNS_INCREMENT) >= maximum_mns) 617 lerrif(_ 618 ("input rules are too complicated (>= %d NFA states)"), 619 current_mns); 620 621 ++num_reallocs; 622 623 firstst = reallocate_integer_array(firstst, current_mns); 624 lastst = reallocate_integer_array(lastst, current_mns); 625 finalst = reallocate_integer_array(finalst, current_mns); 626 transchar = 627 reallocate_integer_array(transchar, current_mns); 628 trans1 = reallocate_integer_array(trans1, current_mns); 629 trans2 = reallocate_integer_array(trans2, current_mns); 630 accptnum = 631 reallocate_integer_array(accptnum, current_mns); 632 assoc_rule = 633 reallocate_integer_array(assoc_rule, current_mns); 634 state_type = 635 reallocate_integer_array(state_type, current_mns); 636 } 637 firstst[lastnfa] = lastnfa; 638 finalst[lastnfa] = lastnfa; 639 lastst[lastnfa] = lastnfa; 640 transchar[lastnfa] = sym; 641 trans1[lastnfa] = NO_TRANSITION; 642 trans2[lastnfa] = NO_TRANSITION; 643 accptnum[lastnfa] = NIL; 644 assoc_rule[lastnfa] = num_rules; 645 state_type[lastnfa] = current_state_type; 646 647 /* 648 * Fix up equivalence classes base on this transition. Note that any 649 * character which has its own transition gets its own equivalence 650 * class. Thus only characters which are only in character classes 651 * have a chance at being in the same equivalence class. E.g. "a|b" 652 * puts 'a' and 'b' into two different equivalence classes. "[ab]" 653 * puts them in the same equivalence class (barring other differences 654 * elsewhere in the input). 655 */ 656 657 if (sym < 0) { 658 /* 659 * We don't have to update the equivalence classes since that 660 * was already done when the ccl was created for the first 661 * time. 662 */ 663 } else if (sym == SYM_EPSILON) 664 ++numeps; 665 666 else { 667 check_char(sym); 668 669 if (useecs) 670 /* Map NUL's to csize. */ 671 mkechar(sym ? sym : csize, nextecm, ecgroup); 672 } 673 674 return lastnfa; 675 } 676 677 678 /* mkxtion - make a transition from one state to another 679 * 680 * synopsis 681 * 682 * mkxtion( statefrom, stateto ); 683 * 684 * statefrom - the state from which the transition is to be made 685 * stateto - the state to which the transition is to be made 686 */ 687 688 void 689 mkxtion(statefrom, stateto) 690 int statefrom, stateto; 691 { 692 if (trans1[statefrom] == NO_TRANSITION) 693 trans1[statefrom] = stateto; 694 695 else if ((transchar[statefrom] != SYM_EPSILON) || 696 (trans2[statefrom] != NO_TRANSITION)) 697 flexfatal(_("found too many transitions in mkxtion()")); 698 699 else { /* second out-transition for an epsilon state */ 700 ++eps2; 701 trans2[statefrom] = stateto; 702 } 703 } 704 705 /* new_rule - initialize for a new rule */ 706 707 void 708 new_rule() 709 { 710 if (++num_rules >= current_max_rules) { 711 ++num_reallocs; 712 current_max_rules += MAX_RULES_INCREMENT; 713 rule_type = reallocate_integer_array(rule_type, 714 current_max_rules); 715 rule_linenum = reallocate_integer_array(rule_linenum, 716 current_max_rules); 717 rule_useful = reallocate_integer_array(rule_useful, 718 current_max_rules); 719 rule_has_nl = reallocate_bool_array(rule_has_nl, 720 current_max_rules); 721 } 722 if (num_rules > MAX_RULE) 723 lerrif(_("too many rules (> %d)!"), MAX_RULE); 724 725 rule_linenum[num_rules] = linenum; 726 rule_useful[num_rules] = false; 727 rule_has_nl[num_rules] = false; 728 } 729