1-- Copyright (c) 1990 Regents of the University of California.
2-- All rights reserved.
3--
4-- This software was developed by John Self of the Arcadia project
5-- at the University of California, Irvine.
6--
7-- Redistribution and use in source and binary forms are permitted
8-- provided that the above copyright notice and this paragraph are
9-- duplicated in all such forms and that any documentation,
10-- advertising materials, and other materials related to such
11-- distribution and use acknowledge that the software was developed
12-- by the University of California, Irvine.  The name of the
13-- University may not be used to endorse or promote products derived
14-- from this software without specific prior written permission.
15-- THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
16-- IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
17-- WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
18
19-- TITLE NFA construction routines
20-- AUTHOR: John Self (UCI)
21-- DESCRIPTION builds the NFA.
22-- NOTES this file mirrors flex as closely as possible.
23-- $Header: /co/ua/self/arcadia/aflex/ada/src/RCS/nfaB.a,v 1.6 90/01/12 15:20:27 self Exp Locker: self $
24with Ada.Characters.Wide_Wide_Latin_1;
25with Ada.Integer_Wide_Wide_Text_IO;
26with Ada.Wide_Wide_Text_IO;
27
28with MISC_DEFS, NFA, MISC, ECS;
29use MISC_DEFS;
30
31package body NFA is
32
33   use Ada.Integer_Wide_Wide_Text_IO;
34--     use Ada.Text_IO;
35   use Ada.Wide_Wide_Text_IO;
36
37-- add_accept - add an accepting state to a machine
38--
39-- accepting_number becomes mach's accepting number.
40
41  procedure ADD_ACCEPT(MACH             : in out INTEGER;
42                       ACCEPTING_NUMBER : in INTEGER) is
43  -- hang the accepting number off an epsilon state.  if it is associated
44  -- with a state that has a non-epsilon out-transition, then the state
45  -- will accept BEFORE it makes that transition, i.e., one character
46  -- too soon
47    ASTATE : INTEGER;
48  begin
49    if (TRANSCHAR(FINALST(MACH)) = SYM_EPSILON) then
50      ACCPTNUM(FINALST(MACH)) := ACCEPTING_NUMBER;
51    else
52      ASTATE := MKSTATE(SYM_EPSILON);
53      ACCPTNUM(ASTATE) := ACCEPTING_NUMBER;
54      MACH := LINK_MACHINES(MACH, ASTATE);
55    end if;
56  end ADD_ACCEPT;
57
58
59  -- copysingl - make a given number of copies of a singleton machine
60  --
61  --     newsng - a new singleton composed of num copies of singl
62  --     singl  - a singleton machine
63  --     num    - the number of copies of singl to be present in newsng
64
65  function COPYSINGL(SINGL, NUM : in INTEGER) return INTEGER is
66    COPY : INTEGER;
67  begin
68    COPY := MKSTATE(SYM_EPSILON);
69
70    for I in 1 .. NUM loop
71      COPY := LINK_MACHINES(COPY, DUPMACHINE(SINGL));
72    end loop;
73
74    return COPY;
75  end COPYSINGL;
76
77
78  -- dumpnfa - debugging routine to write out an nfa
79
80  procedure DUMPNFA(STATE1 : in INTEGER) is
81    SYM, TSP1, TSP2, ANUM : INTEGER;
82  begin
83    NEW_LINE(STANDARD_ERROR);
84    NEW_LINE(STANDARD_ERROR);
85    PUT(STANDARD_ERROR,
86      "********** beginning dump of nfa with start state ");
87    PUT(STANDARD_ERROR, STATE1, 0);
88    NEW_LINE(STANDARD_ERROR);
89
90    -- we probably should loop starting at firstst[state1] and going to
91    -- lastst[state1], but they're not maintained properly when we "or"
92    -- all of the rules together.  So we use our knowledge that the machine
93    -- starts at state 1 and ends at lastnfa.
94    for NS in 1 .. LASTNFA loop
95      PUT(STANDARD_ERROR, "state # ");
96      PUT(STANDARD_ERROR, NS, 4);
97      PUT(Ada.Characters.Wide_Wide_Latin_1.HT);
98      SYM := TRANSCHAR(NS);
99      TSP1 := TRANS1(NS);
100      TSP2 := TRANS2(NS);
101      ANUM := ACCPTNUM(NS);
102
103      PUT(STANDARD_ERROR, SYM, 5);
104      PUT(STANDARD_ERROR, ":    ");
105      PUT(STANDARD_ERROR, TSP1, 4);
106      PUT(STANDARD_ERROR, ",");
107      PUT(STANDARD_ERROR, TSP2, 4);
108      if (ANUM /= NIL) then
109        PUT(STANDARD_ERROR, "  [");
110        PUT(STANDARD_ERROR, ANUM, 0);
111        PUT(STANDARD_ERROR, "]");
112      end if;
113      NEW_LINE(STANDARD_ERROR);
114    end loop;
115
116    PUT(STANDARD_ERROR, "********** end of dump");
117    NEW_LINE(STANDARD_ERROR);
118  end DUMPNFA;
119
120  -- dupmachine - make a duplicate of a given machine
121  --
122  --     copy - holds duplicate of mach
123  --     mach - machine to be duplicated
124  --
125  -- note that the copy of mach is NOT an exact duplicate; rather, all the
126  -- transition states values are adjusted so that the copy is self-contained,
127  -- as the original should have been.
128  --
129  -- also note that the original MUST be contiguous, with its low and high
130  -- states accessible by the arrays firstst and lastst
131
132  function DUPMACHINE(MACH : in INTEGER) return INTEGER is
133    INIT, STATE_OFFSET : INTEGER;
134    STATE              : INTEGER := 0;
135    LAST               : constant INTEGER := LASTST(MACH);
136    I                  : INTEGER;
137  begin
138    I := FIRSTST(MACH);
139    while (I <= LAST) loop
140      STATE := MKSTATE(TRANSCHAR(I));
141
142      if (TRANS1(I) /= NO_TRANSITION) then
143        MKXTION(FINALST(STATE), TRANS1(I) + STATE - I);
144
145        if ((TRANSCHAR(I) = SYM_EPSILON) and (TRANS2(I) /= NO_TRANSITION)) then
146          MKXTION(FINALST(STATE), TRANS2(I) + STATE - I);
147        end if;
148      end if;
149
150      ACCPTNUM(STATE) := ACCPTNUM(I);
151      I := I + 1;
152    end loop;
153
154    if (STATE = 0) then
155      Misc.Aflex_Fatal ("empty machine in dupmachine()");
156    end if;
157
158    STATE_OFFSET := STATE - I + 1;
159
160    INIT := MACH + STATE_OFFSET;
161    FIRSTST(INIT) := FIRSTST(MACH) + STATE_OFFSET;
162    FINALST(INIT) := FINALST(MACH) + STATE_OFFSET;
163    LASTST(INIT) := LASTST(MACH) + STATE_OFFSET;
164
165    return INIT;
166  end DUPMACHINE;
167
168  -- finish_rule - finish up the processing for a rule
169  --
170  -- An accepting number is added to the given machine.  If variable_trail_rule
171  -- is true then the rule has trailing context and both the head and trail
172  -- are variable size.  Otherwise if headcnt or trailcnt is non-zero then
173  -- the machine recognizes a pattern with trailing context and headcnt is
174  -- the number of characters in the matched part of the pattern, or zero
175  -- if the matched part has variable length.  trailcnt is the number of
176  -- trailing context characters in the pattern, or zero if the trailing
177  -- context has variable length.
178
179  procedure FINISH_RULE(MACH                : in INTEGER;
180                        VARIABLE_TRAIL_RULE : in BOOLEAN;
181                        HEADCNT, TRAILCNT   : in INTEGER) is
182    P_MACH : INTEGER;
183  begin
184    P_MACH := MACH;
185    ADD_ACCEPT(P_MACH, NUM_RULES);
186
187    -- we did this in new_rule(), but it often gets the wrong
188    -- number because we do it before we start parsing the current rule
189    RULE_LINENUM(NUM_RULES) := LINENUM;
190
191    PUT(TEMP_ACTION_FILE, "            when ");
192    PUT(TEMP_ACTION_FILE, NUM_RULES, 1);
193    PUT_LINE(TEMP_ACTION_FILE, " => ");
194
195    if (VARIABLE_TRAIL_RULE) then
196      RULE_TYPE(NUM_RULES) := RULE_VARIABLE;
197
198      if (PERFORMANCE_REPORT) then
199        PUT(STANDARD_ERROR, "Variable trailing context rule at line ");
200        PUT(STANDARD_ERROR, RULE_LINENUM(NUM_RULES), 1);
201        NEW_LINE(STANDARD_ERROR);
202      end if;
203
204      VARIABLE_TRAILING_CONTEXT_RULES := TRUE;
205    else
206      RULE_TYPE(NUM_RULES) := RULE_NORMAL;
207
208      if ((HEADCNT > 0) or (TRAILCNT > 0)) then
209
210        -- do trailing context magic to not match the trailing characters
211
212        if (HEADCNT > 0) then
213          PUT(TEMP_ACTION_FILE, "yy_cp := yy_bp + ");
214          PUT(TEMP_ACTION_FILE, HEADCNT, 1);
215          PUT_LINE(TEMP_ACTION_FILE, ";");
216        else
217          PUT(TEMP_ACTION_FILE, "yy_cp := yy_cp - ");
218          PUT(TEMP_ACTION_FILE, TRAILCNT, 1);
219          PUT_LINE(TEMP_ACTION_FILE, ";");
220        end if;
221
222        PUT_LINE(TEMP_ACTION_FILE, "yy_c_buf_p := yy_cp;");
223        PUT_LINE(TEMP_ACTION_FILE,
224          "YY_DO_BEFORE_ACTION; -- set up yytext again");
225      end if;
226    end if;
227
228    MISC.LINE_DIRECTIVE_OUT(TEMP_ACTION_FILE);
229  end FINISH_RULE;
230
231  -- link_machines - connect two machines together
232  --
233  --     new    - a machine constructed by connecting first to last
234  --     first  - the machine whose successor is to be last
235  --     last   - the machine whose predecessor is to be first
236  --
237  -- note: this routine concatenates the machine first with the machine
238  --  last to produce a machine new which will pattern-match first first
239  --  and then last, and will fail if either of the sub-patterns fails.
240  --  FIRST is set to new by the operation.  last is unmolested.
241
242  function LINK_MACHINES(FIRST, LAST : in INTEGER) return INTEGER is
243  begin
244    if (FIRST = NIL) then
245      return LAST;
246    else
247      if (LAST = NIL) then
248        return FIRST;
249      else
250        MKXTION(FINALST(FIRST), LAST);
251        FINALST(FIRST) := FINALST(LAST);
252        LASTST(FIRST) := Integer'Max (LASTST(FIRST), LASTST(LAST));
253        FIRSTST(FIRST) := Integer'Min(FIRSTST(FIRST), FIRSTST(LAST));
254        return (FIRST);
255      end if;
256    end if;
257  end LINK_MACHINES;
258
259
260  -- mark_beginning_as_normal - mark each "beginning" state in a machine
261--                            as being a "normal" (i.e., not trailing context-
262  --                            associated) states
263  --
264  -- The "beginning" states are the epsilon closure of the first state
265
266  procedure MARK_BEGINNING_AS_NORMAL(MACH : in INTEGER) is
267  begin
268    case (STATE_TYPE(MACH)) is
269      when STATE_NORMAL =>
270
271        -- oh, we've already visited here
272        return;
273
274      when STATE_TRAILING_CONTEXT =>
275        STATE_TYPE(MACH) := STATE_NORMAL;
276
277        if (TRANSCHAR(MACH) = SYM_EPSILON) then
278          if (TRANS1(MACH) /= NO_TRANSITION) then
279            MARK_BEGINNING_AS_NORMAL(TRANS1(MACH));
280          end if;
281
282          if (TRANS2(MACH) /= NO_TRANSITION) then
283            MARK_BEGINNING_AS_NORMAL(TRANS2(MACH));
284          end if;
285        end if;
286    end case;
287  end MARK_BEGINNING_AS_NORMAL;
288
289  -- mkbranch - make a machine that branches to two machines
290  --
291  --     branch - a machine which matches either first's pattern or second's
292--     first, second - machines whose patterns are to be or'ed (the | operator)
293  --
294  -- note that first and second are NEITHER destroyed by the operation.  Also,
295  -- the resulting machine CANNOT be used with any other "mk" operation except
296  -- more mkbranch's.  Compare with mkor()
297  function MKBRANCH(FIRST, SECOND : in INTEGER) return INTEGER is
298    EPS : INTEGER;
299  begin
300    if (FIRST = NO_TRANSITION) then
301      return SECOND;
302    else
303      if (SECOND = NO_TRANSITION) then
304        return FIRST;
305      end if;
306    end if;
307
308    EPS := MKSTATE(SYM_EPSILON);
309
310    MKXTION(EPS, FIRST);
311    MKXTION(EPS, SECOND);
312
313    return EPS;
314  end MKBRANCH;
315
316
317  -- mkclos - convert a machine into a closure
318  --
319  --     new - a new state which matches the closure of "state"
320
321  function MKCLOS(STATE : in INTEGER) return INTEGER is
322  begin
323    return NFA.MKOPT(MKPOSCL(STATE));
324  end MKCLOS;
325
326
327  -- mkopt - make a machine optional
328  --
329  --     new  - a machine which optionally matches whatever mach matched
330  --     mach - the machine to make optional
331  --
332  -- notes:
333  --     1. mach must be the last machine created
334  --     2. mach is destroyed by the call
335
336  function MKOPT(MACH : in INTEGER) return INTEGER is
337    EPS    : INTEGER;
338    RESULT : INTEGER;
339  begin
340    RESULT := MACH;
341    if (not SUPER_FREE_EPSILON(FINALST(RESULT))) then
342      EPS := NFA.MKSTATE(SYM_EPSILON);
343      RESULT := NFA.LINK_MACHINES(RESULT, EPS);
344    end if;
345
346    -- can't skimp on the following if FREE_EPSILON(mach) is true because
347    -- some state interior to "mach" might point back to the beginning
348    -- for a closure
349    EPS := NFA.MKSTATE(SYM_EPSILON);
350    RESULT := NFA.LINK_MACHINES(EPS, RESULT);
351
352    NFA.MKXTION(RESULT, FINALST(RESULT));
353
354    return RESULT;
355  end MKOPT;
356
357
358  -- mkor - make a machine that matches either one of two machines
359  --
360  --     new - a machine which matches either first's pattern or second's
361--     first, second - machines whose patterns are to be or'ed (the | operator)
362  --
363  -- note that first and second are both destroyed by the operation
364  -- the code is rather convoluted because an attempt is made to minimize
365  -- the number of epsilon states needed
366
367  function MKOR(FIRST, SECOND : in INTEGER) return INTEGER is
368    EPS, OREND : INTEGER;
369    P_FIRST    : INTEGER;
370  begin
371    P_FIRST := FIRST;
372    if (P_FIRST = NIL) then
373      return SECOND;
374    else
375      if (SECOND = NIL) then
376        return P_FIRST;
377      else
378
379        -- see comment in mkopt() about why we can't use the first state
380        -- of "first" or "second" if they satisfy "FREE_EPSILON"
381        EPS := MKSTATE(SYM_EPSILON);
382
383        P_FIRST := LINK_MACHINES(EPS, P_FIRST);
384
385        MKXTION(P_FIRST, SECOND);
386
387        if ((SUPER_FREE_EPSILON(FINALST(P_FIRST))) and (ACCPTNUM(FINALST(P_FIRST
388          )) = NIL)) then
389          OREND := FINALST(P_FIRST);
390          MKXTION(FINALST(SECOND), OREND);
391        else
392          if ((SUPER_FREE_EPSILON(FINALST(SECOND))) and (ACCPTNUM(FINALST(SECOND
393            )) = NIL)) then
394            OREND := FINALST(SECOND);
395            MKXTION(FINALST(P_FIRST), OREND);
396          else
397            EPS := MKSTATE(SYM_EPSILON);
398            P_FIRST := LINK_MACHINES(P_FIRST, EPS);
399            OREND := FINALST(P_FIRST);
400
401            MKXTION(FINALST(SECOND), OREND);
402          end if;
403        end if;
404      end if;
405    end if;
406
407    FINALST(P_FIRST) := OREND;
408    return P_FIRST;
409  end MKOR;
410
411
412  -- mkposcl - convert a machine into a positive closure
413  --
414  --    new - a machine matching the positive closure of "state"
415
416  function MKPOSCL(STATE : in INTEGER) return INTEGER is
417    EPS : INTEGER;
418  begin
419    if (SUPER_FREE_EPSILON(FINALST(STATE))) then
420      MKXTION(FINALST(STATE), STATE);
421      return (STATE);
422    else
423      EPS := MKSTATE(SYM_EPSILON);
424      MKXTION(EPS, STATE);
425      return (LINK_MACHINES(STATE, EPS));
426    end if;
427  end MKPOSCL;
428
429  -- mkrep - make a replicated machine
430  --
431  --    new - a machine that matches whatever "mach" matched from "lb"
432  --          number of times to "ub" number of times
433  --
434  -- note
435--   if "ub" is INFINITY then "new" matches "lb" or more occurrences of "mach"
436
437  function MKREP(MACH, LB, UB : in INTEGER) return INTEGER is
438    BASE_MACH, TAIL, COPY : INTEGER;
439    P_MACH                : INTEGER;
440  begin
441    P_MACH := MACH;
442    BASE_MACH := COPYSINGL(P_MACH, LB - 1);
443
444    if (UB = INFINITY) then
445      COPY := DUPMACHINE(P_MACH);
446      P_MACH := LINK_MACHINES(P_MACH, LINK_MACHINES(BASE_MACH, MKCLOS(COPY)));
447    else
448      TAIL := MKSTATE(SYM_EPSILON);
449
450      for I in LB .. UB - 1 loop
451        COPY := DUPMACHINE(P_MACH);
452        TAIL := MKOPT(LINK_MACHINES(COPY, TAIL));
453      end loop;
454
455      P_MACH := LINK_MACHINES(P_MACH, LINK_MACHINES(BASE_MACH, TAIL));
456    end if;
457
458    return P_MACH;
459  end MKREP;
460
461  -- mkstate - create a state with a transition on a given symbol
462  --
463  --     state - a new state matching sym
464  --     sym   - the symbol the new state is to have an out-transition on
465  --
466  -- note that this routine makes new states in ascending order through the
467  -- state array (and increments LASTNFA accordingly).  The routine DUPMACHINE
468  -- relies on machines being made in ascending order and that they are
469  -- CONTIGUOUS.  Change it and you will have to rewrite DUPMACHINE (kludge
470  -- that it admittedly is)
471
472  function MKSTATE(SYM : in INTEGER) return INTEGER is
473  begin
474    LASTNFA := LASTNFA + 1;
475    if (LASTNFA >= CURRENT_MNS) then
476      CURRENT_MNS := CURRENT_MNS + MNS_INCREMENT;
477      if (CURRENT_MNS >= MAXIMUM_MNS) then
478            Misc.Aflex_Error
479              ("input rules are too complicated (>= "
480               & INTEGER'Wide_Wide_Image (CURRENT_MNS) & " NFA states) )");
481      end if;
482
483      NUM_REALLOCS := NUM_REALLOCS + 1;
484
485      REALLOCATE_INTEGER_ARRAY(FIRSTST, CURRENT_MNS);
486      REALLOCATE_INTEGER_ARRAY(LASTST, CURRENT_MNS);
487      REALLOCATE_INTEGER_ARRAY(FINALST, CURRENT_MNS);
488      REALLOCATE_INTEGER_ARRAY(TRANSCHAR, CURRENT_MNS);
489      REALLOCATE_INTEGER_ARRAY(TRANS1, CURRENT_MNS);
490      REALLOCATE_INTEGER_ARRAY(TRANS2, CURRENT_MNS);
491      REALLOCATE_INTEGER_ARRAY(ACCPTNUM, CURRENT_MNS);
492      REALLOCATE_INTEGER_ARRAY(ASSOC_RULE, CURRENT_MNS);
493      REALLOCATE_STATE_ENUM_ARRAY(STATE_TYPE, CURRENT_MNS);
494    end if;
495
496    FIRSTST(LASTNFA) := LASTNFA;
497    FINALST(LASTNFA) := LASTNFA;
498    LASTST(LASTNFA) := LASTNFA;
499    TRANSCHAR(LASTNFA) := SYM;
500    TRANS1(LASTNFA) := NO_TRANSITION;
501    TRANS2(LASTNFA) := NO_TRANSITION;
502    ACCPTNUM(LASTNFA) := NIL;
503    ASSOC_RULE(LASTNFA) := NUM_RULES;
504    STATE_TYPE(LASTNFA) := CURRENT_STATE_ENUM;
505
506    -- fix up equivalence classes base on this transition.  Note that any
507    -- character which has its own transition gets its own equivalence class.
508    -- Thus only characters which are only in character classes have a chance
509    -- at being in the same equivalence class.  E.g. "a|b" puts 'a' and 'b'
510    -- into two different equivalence classes.  "[ab]" puts them in the same
511    -- equivalence class (barring other differences elsewhere in the input).
512    if (SYM < 0) then
513
514      -- we don't have to update the equivalence classes since that was
515      -- already done when the ccl was created for the first time
516      null;
517    else
518      if (SYM = SYM_EPSILON) then
519        NUMEPS := NUMEPS + 1;
520      else
521        if (USEECS) then
522          ECS.MKECHAR(SYM, NEXTECM, ECGROUP);
523        end if;
524      end if;
525    end if;
526
527    return LASTNFA;
528  end MKSTATE;
529
530  -- mkxtion - make a transition from one state to another
531  --
532  --     statefrom - the state from which the transition is to be made
533  --     stateto   - the state to which the transition is to be made
534
535  procedure MKXTION(STATEFROM, STATETO : in INTEGER) is
536  begin
537    if (TRANS1(STATEFROM) = NO_TRANSITION) then
538      TRANS1(STATEFROM) := STATETO;
539    else
540      if ((TRANSCHAR(STATEFROM) /= SYM_EPSILON) or (TRANS2(STATEFROM) /=
541        NO_TRANSITION)) then
542        Misc.Aflex_Fatal ("found too many transitions in mkxtion()");
543      else
544
545        -- second out-transition for an epsilon state
546        EPS2 := EPS2 + 1;
547        TRANS2(STATEFROM) := STATETO;
548      end if;
549    end if;
550  end MKXTION;
551
552  -- new_rule - initialize for a new rule
553  --
554  -- the global num_rules is incremented and the any corresponding dynamic
555  -- arrays (such as rule_type()) are grown as needed.
556
557  procedure NEW_RULE is
558  begin
559    NUM_RULES := NUM_RULES + 1;
560    if (NUM_RULES >= CURRENT_MAX_RULES) then
561      NUM_REALLOCS := NUM_REALLOCS + 1;
562      CURRENT_MAX_RULES := CURRENT_MAX_RULES + MAX_RULES_INCREMENT;
563      REALLOCATE_RULE_ENUM_ARRAY(RULE_TYPE, CURRENT_MAX_RULES);
564      REALLOCATE_INTEGER_ARRAY(RULE_LINENUM, CURRENT_MAX_RULES);
565    end if;
566
567    if (NUM_RULES > MAX_RULE) then
568         Misc.Aflex_Error
569           ("too many rules  (> "
570            & INTEGER'Wide_Wide_Image (MAX_RULE) & ")!");
571    end if;
572
573    RULE_LINENUM(NUM_RULES) := LINENUM;
574  end NEW_RULE;
575
576end NFA;
577