1------------------------------------------------------------------------------
2--                                                                          --
3--                         GNAT COMPILER COMPONENTS                         --
4--                                                                          --
5--                          G N A T . R E G E X P                           --
6--                                                                          --
7--                                 B o d y                                  --
8--                                                                          --
9--            Copyright (C) 1999-2002 Ada Core Technologies, Inc.           --
10--                                                                          --
11-- GNAT is free software;  you can  redistribute it  and/or modify it under --
12-- terms of the  GNU General Public License as published  by the Free Soft- --
13-- ware  Foundation;  either version 2,  or (at your option) any later ver- --
14-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
15-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
16-- or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License --
17-- for  more details.  You should have  received  a copy of the GNU General --
18-- Public License  distributed with GNAT;  see file COPYING.  If not, write --
19-- to  the Free Software Foundation,  59 Temple Place - Suite 330,  Boston, --
20-- MA 02111-1307, USA.                                                      --
21--                                                                          --
22-- As a special exception,  if other files  instantiate  generics from this --
23-- unit, or you link  this unit with other files  to produce an executable, --
24-- this  unit  does not  by itself cause  the resulting  executable  to  be --
25-- covered  by the  GNU  General  Public  License.  This exception does not --
26-- however invalidate  any other reasons why  the executable file  might be --
27-- covered by the  GNU Public License.                                      --
28--                                                                          --
29-- GNAT was originally developed  by the GNAT team at  New York University. --
30-- Extensive contributions were provided by Ada Core Technologies Inc.      --
31--                                                                          --
32------------------------------------------------------------------------------
33
34with Unchecked_Deallocation;
35with Ada.Exceptions;
36with GNAT.Case_Util;
37
38package body GNAT.Regexp is
39
40   Open_Paren    : constant Character := '(';
41   Close_Paren   : constant Character := ')';
42   Open_Bracket  : constant Character := '[';
43   Close_Bracket : constant Character := ']';
44
45   type State_Index is new Natural;
46   type Column_Index is new Natural;
47
48   type Regexp_Array is array
49     (State_Index range <>, Column_Index range <>) of State_Index;
50   --  First index is for the state number
51   --  Second index is for the character type
52   --  Contents is the new State
53
54   type Regexp_Array_Access is access Regexp_Array;
55   --  Use this type through the functions Set below, so that it
56   --  can grow dynamically depending on the needs.
57
58   type Mapping is array (Character'Range) of Column_Index;
59   --  Mapping between characters and column in the Regexp_Array
60
61   type Boolean_Array is array (State_Index range <>) of Boolean;
62
63   type Regexp_Value
64     (Alphabet_Size : Column_Index;
65      Num_States    : State_Index) is
66   record
67      Map            : Mapping;
68      States         : Regexp_Array (1 .. Num_States, 0 .. Alphabet_Size);
69      Is_Final       : Boolean_Array (1 .. Num_States);
70      Case_Sensitive : Boolean;
71   end record;
72   --  Deterministic finite-state machine
73
74   -----------------------
75   -- Local Subprograms --
76   -----------------------
77
78   procedure Set
79     (Table  : in out Regexp_Array_Access;
80      State  : State_Index;
81      Column : Column_Index;
82      Value  : State_Index);
83   --  Sets a value in the table. If the table is too small, reallocate it
84   --  dynamically so that (State, Column) is a valid index in it.
85
86   function Get
87     (Table  : Regexp_Array_Access;
88      State  : State_Index;
89      Column : Column_Index)
90      return   State_Index;
91   --  Returns the value in the table at (State, Column).
92   --  If this index does not exist in the table, returns 0
93
94   procedure Free is new Unchecked_Deallocation
95     (Regexp_Array, Regexp_Array_Access);
96
97   ------------
98   -- Adjust --
99   ------------
100
101   procedure Adjust (R : in out Regexp) is
102      Tmp : Regexp_Access;
103
104   begin
105      Tmp := new Regexp_Value (Alphabet_Size => R.R.Alphabet_Size,
106                               Num_States    => R.R.Num_States);
107      Tmp.all := R.R.all;
108      R.R := Tmp;
109   end Adjust;
110
111   -------------
112   -- Compile --
113   -------------
114
115   function Compile
116     (Pattern        : String;
117      Glob           : Boolean := False;
118      Case_Sensitive : Boolean := True)
119      return           Regexp
120   is
121      S : String := Pattern;
122      --  The pattern which is really compiled (when the pattern is case
123      --  insensitive, we convert this string to lower-cases
124
125      Map : Mapping := (others => 0);
126      --  Mapping between characters and columns in the tables
127
128      Alphabet_Size : Column_Index := 0;
129      --  Number of significant characters in the regular expression.
130      --  This total does not include special operators, such as *, (, ...
131
132      procedure Create_Mapping;
133      --  Creates a mapping between characters in the regexp and columns
134      --  in the tables representing the regexp. Test that the regexp is
135      --  well-formed Modifies Alphabet_Size and Map
136
137      procedure Create_Primary_Table
138        (Table       : out Regexp_Array_Access;
139         Num_States  : out State_Index;
140         Start_State : out State_Index;
141         End_State   : out State_Index);
142      --  Creates the first version of the regexp (this is a non determinist
143      --  finite state machine, which is unadapted for a fast pattern
144      --  matching algorithm). We use a recursive algorithm to process the
145      --  parenthesis sub-expressions.
146      --
147      --  Table : at the end of the procedure : Column 0 is for any character
148      --  ('.') and the last columns are for no character (closure)
149      --  Num_States is set to the number of states in the table
150      --  Start_State is the number of the starting state in the regexp
151      --  End_State is the number of the final state when the regexp matches
152
153      procedure Create_Primary_Table_Glob
154        (Table       : out Regexp_Array_Access;
155         Num_States  : out State_Index;
156         Start_State : out State_Index;
157         End_State   : out State_Index);
158      --  Same function as above, but it deals with the second possible
159      --  grammar for 'globbing pattern', which is a kind of subset of the
160      --  whole regular expression grammar.
161
162      function Create_Secondary_Table
163        (First_Table : Regexp_Array_Access;
164         Num_States  : State_Index;
165         Start_State : State_Index;
166         End_State   : State_Index)
167         return        Regexp;
168      --  Creates the definitive table representing the regular expression
169      --  This is actually a transformation of the primary table First_Table,
170      --  where every state is grouped with the states in its 'no-character'
171      --  columns. The transitions between the new states are then recalculated
172      --  and if necessary some new states are created.
173      --
174      --  Note that the resulting finite-state machine is not optimized in
175      --  terms of the number of states : it would be more time-consuming to
176      --  add a third pass to reduce the number of states in the machine, with
177      --  no speed improvement...
178
179      procedure Raise_Exception
180        (M     : String;
181         Index : Integer);
182      pragma No_Return (Raise_Exception);
183      --  Raise an exception, indicating an error at character Index in S.
184
185      --------------------
186      -- Create_Mapping --
187      --------------------
188
189      procedure Create_Mapping is
190
191         procedure Add_In_Map (C : Character);
192         --  Add a character in the mapping, if it is not already defined
193
194         -----------------
195         --  Add_In_Map --
196         -----------------
197
198         procedure Add_In_Map (C : Character) is
199         begin
200            if Map (C) = 0 then
201               Alphabet_Size := Alphabet_Size + 1;
202               Map (C) := Alphabet_Size;
203            end if;
204         end Add_In_Map;
205
206         J                 : Integer := S'First;
207         Parenthesis_Level : Integer := 0;
208         Curly_Level       : Integer := 0;
209
210      --  Start of processing for Create_Mapping
211
212      begin
213         while J <= S'Last loop
214            case S (J) is
215               when Open_Bracket =>
216                  J := J + 1;
217
218                  if S (J) = '^' then
219                     J := J + 1;
220                  end if;
221
222                  if S (J) = ']' or S (J) = '-' then
223                     J := J + 1;
224                  end if;
225
226                  --  The first character never has a special meaning
227
228                  loop
229                     if J > S'Last then
230                        Raise_Exception
231                          ("Ran out of characters while parsing ", J);
232                     end if;
233
234                     exit when S (J) = Close_Bracket;
235
236                     if S (J) = '-'
237                       and then S (J + 1) /= Close_Bracket
238                     then
239                        declare
240                           Start : constant Integer := J - 1;
241
242                        begin
243                           J := J + 1;
244
245                           if S (J) = '\' then
246                              J := J + 1;
247                           end if;
248
249                           for Char in S (Start) .. S (J) loop
250                              Add_In_Map (Char);
251                           end loop;
252                        end;
253                     else
254                        if S (J) = '\' then
255                           J := J + 1;
256                        end if;
257
258                        Add_In_Map (S (J));
259                     end if;
260
261                     J := J + 1;
262                  end loop;
263
264                  --  A close bracket must follow a open_bracket,
265                  --  and cannot be found alone on the line
266
267               when Close_Bracket =>
268                  Raise_Exception
269                    ("Incorrect character ']' in regular expression", J);
270
271               when '\' =>
272                  if J < S'Last  then
273                     J := J + 1;
274                     Add_In_Map (S (J));
275
276                  else
277                     --  \ not allowed at the end of the regexp
278
279                     Raise_Exception
280                       ("Incorrect character '\' in regular expression", J);
281                  end if;
282
283               when Open_Paren =>
284                  if not Glob then
285                     Parenthesis_Level := Parenthesis_Level + 1;
286                  else
287                     Add_In_Map (Open_Paren);
288                  end if;
289
290               when Close_Paren =>
291                  if not Glob then
292                     Parenthesis_Level := Parenthesis_Level - 1;
293
294                     if Parenthesis_Level < 0 then
295                        Raise_Exception
296                          ("')' is not associated with '(' in regular "
297                           & "expression", J);
298                     end if;
299
300                     if S (J - 1) = Open_Paren then
301                        Raise_Exception
302                          ("Empty parenthesis not allowed in regular "
303                           & "expression", J);
304                     end if;
305
306                  else
307                     Add_In_Map (Close_Paren);
308                  end if;
309
310               when '.' =>
311                  if Glob then
312                     Add_In_Map ('.');
313                  end if;
314
315               when '{' =>
316                  if not Glob then
317                     Add_In_Map (S (J));
318                  else
319                     Curly_Level := Curly_Level + 1;
320                  end if;
321
322               when '}' =>
323                  if not Glob then
324                     Add_In_Map (S (J));
325                  else
326                     Curly_Level := Curly_Level - 1;
327                  end if;
328
329               when '*' | '?' =>
330                  if not Glob then
331                     if J = S'First then
332                        Raise_Exception
333                          ("'*', '+', '?' and '|' operators can not be in "
334                           & "first position in regular expression", J);
335                     end if;
336                  end if;
337
338               when '|' | '+' =>
339                  if not Glob then
340                     if J = S'First then
341
342                        --  These operators must apply to a sub-expression,
343                        --  and cannot be found at the beginning of the line
344
345                        Raise_Exception
346                          ("'*', '+', '?' and '|' operators can not be in "
347                           & "first position in regular expression", J);
348                     end if;
349
350                  else
351                     Add_In_Map (S (J));
352                  end if;
353
354               when others =>
355                  Add_In_Map (S (J));
356            end case;
357
358            J := J + 1;
359         end loop;
360
361         --  A closing parenthesis must follow an open parenthesis
362
363         if Parenthesis_Level /= 0 then
364            Raise_Exception
365              ("'(' must always be associated with a ')'", J);
366         end if;
367
368         if Curly_Level /= 0 then
369            Raise_Exception
370              ("'{' must always be associated with a '}'", J);
371         end if;
372      end Create_Mapping;
373
374      --------------------------
375      -- Create_Primary_Table --
376      --------------------------
377
378      procedure Create_Primary_Table
379        (Table       : out Regexp_Array_Access;
380         Num_States  : out State_Index;
381         Start_State : out State_Index;
382         End_State   : out State_Index)
383      is
384         Empty_Char : constant Column_Index := Alphabet_Size + 1;
385
386         Current_State : State_Index := 0;
387         --  Index of the last created state
388
389         procedure Add_Empty_Char
390           (State    : State_Index;
391            To_State : State_Index);
392         --  Add a empty-character transition from State to To_State.
393
394         procedure Create_Repetition
395           (Repetition : Character;
396            Start_Prev : State_Index;
397            End_Prev   : State_Index;
398            New_Start  : out State_Index;
399            New_End    : in out State_Index);
400         --  Create the table in case we have a '*', '+' or '?'.
401         --  Start_Prev .. End_Prev should indicate respectively the start and
402         --  end index of the previous expression, to which '*', '+' or '?' is
403         --  applied.
404
405         procedure Create_Simple
406           (Start_Index : Integer;
407            End_Index   : Integer;
408            Start_State : out State_Index;
409            End_State   : out State_Index);
410         --  Fill the table for the regexp Simple.
411         --  This is the recursive procedure called to handle () expressions
412         --  If End_State = 0, then the call to Create_Simple creates an
413         --  independent regexp, not a concatenation
414         --  Start_Index .. End_Index is the starting index in the string S.
415         --
416         --  Warning: it may look like we are creating too many empty-string
417         --  transitions, but they are needed to get the correct regexp.
418         --  The table is filled as follow ( s means start-state, e means
419         --  end-state) :
420         --
421         --  regexp   state_num | a b * empty_string
422         --  -------  ---------------------------------------
423         --    a          1 (s) | 2 - - -
424         --               2 (e) | - - - -
425         --
426         --    ab         1 (s) | 2 - - -
427         --               2     | - - - 3
428         --               3     | - 4 - -
429         --               4 (e) | - - - -
430         --
431         --    a|b        1     | 2 - - -
432         --               2     | - - - 6
433         --               3     | - 4 - -
434         --               4     | - - - 6
435         --               5 (s) | - - - 1,3
436         --               6 (e) | - - - -
437         --
438         --    a*         1     | 2 - - -
439         --               2     | - - - 4
440         --               3 (s) | - - - 1,4
441         --               4 (e) | - - - 3
442         --
443         --    (a)        1 (s) | 2 - - -
444         --               2 (e) | - - - -
445         --
446         --    a+         1     | 2 - - -
447         --               2     | - - - 4
448         --               3 (s) | - - - 1
449         --               4 (e) | - - - 3
450         --
451         --    a?         1     | 2 - - -
452         --               2     | - - - 4
453         --               3 (s) | - - - 1,4
454         --               4 (e) | - - - -
455         --
456         --    .          1 (s) | 2 2 2 -
457         --               2 (e) | - - - -
458
459         function Next_Sub_Expression
460           (Start_Index : Integer;
461            End_Index   : Integer)
462            return        Integer;
463         --  Returns the index of the last character of the next sub-expression
464         --  in Simple. Index can not be greater than End_Index
465
466         --------------------
467         -- Add_Empty_Char --
468         --------------------
469
470         procedure Add_Empty_Char
471           (State    : State_Index;
472            To_State : State_Index)
473         is
474            J : Column_Index := Empty_Char;
475
476         begin
477            while Get (Table, State, J) /= 0 loop
478               J := J + 1;
479            end loop;
480
481            Set (Table, State, J, To_State);
482         end Add_Empty_Char;
483
484         -----------------------
485         -- Create_Repetition --
486         -----------------------
487
488         procedure Create_Repetition
489           (Repetition : Character;
490            Start_Prev : State_Index;
491            End_Prev   : State_Index;
492            New_Start  : out State_Index;
493            New_End    : in out State_Index)
494         is
495         begin
496            New_Start := Current_State + 1;
497
498            if New_End /= 0 then
499               Add_Empty_Char (New_End, New_Start);
500            end if;
501
502            Current_State := Current_State + 2;
503            New_End   := Current_State;
504
505            Add_Empty_Char (End_Prev, New_End);
506            Add_Empty_Char (New_Start, Start_Prev);
507
508            if Repetition /= '+' then
509               Add_Empty_Char (New_Start, New_End);
510            end if;
511
512            if Repetition /= '?' then
513               Add_Empty_Char (New_End, New_Start);
514            end if;
515         end Create_Repetition;
516
517         -------------------
518         -- Create_Simple --
519         -------------------
520
521         procedure Create_Simple
522           (Start_Index : Integer;
523            End_Index   : Integer;
524            Start_State : out State_Index;
525            End_State   : out State_Index)
526         is
527            J          : Integer := Start_Index;
528            Last_Start : State_Index := 0;
529
530         begin
531            Start_State := 0;
532            End_State   := 0;
533            while J <= End_Index loop
534               case S (J) is
535                  when Open_Paren =>
536                     declare
537                        J_Start    : constant Integer := J + 1;
538                        Next_Start : State_Index;
539                        Next_End   : State_Index;
540
541                     begin
542                        J := Next_Sub_Expression (J, End_Index);
543                        Create_Simple (J_Start, J - 1, Next_Start, Next_End);
544
545                        if J < End_Index
546                          and then (S (J + 1) = '*' or else
547                                    S (J + 1) = '+' or else
548                                    S (J + 1) = '?')
549                        then
550                           J := J + 1;
551                           Create_Repetition
552                             (S (J),
553                              Next_Start,
554                              Next_End,
555                              Last_Start,
556                              End_State);
557
558                        else
559                           Last_Start := Next_Start;
560
561                           if End_State /= 0 then
562                              Add_Empty_Char (End_State, Last_Start);
563                           end if;
564
565                           End_State := Next_End;
566                        end if;
567                     end;
568
569                  when '|' =>
570                     declare
571                        Start_Prev : constant State_Index := Start_State;
572                        End_Prev   : constant State_Index := End_State;
573                        Start_J    : constant Integer     := J + 1;
574                        Start_Next : State_Index := 0;
575                        End_Next   : State_Index := 0;
576
577                     begin
578                        J := Next_Sub_Expression (J, End_Index);
579
580                        --  Create a new state for the start of the alternative
581
582                        Current_State := Current_State + 1;
583                        Last_Start := Current_State;
584                        Start_State := Last_Start;
585
586                        --  Create the tree for the second part of alternative
587
588                        Create_Simple (Start_J, J, Start_Next, End_Next);
589
590                        --  Create the end state
591
592                        Add_Empty_Char (Last_Start, Start_Next);
593                        Add_Empty_Char (Last_Start, Start_Prev);
594                        Current_State := Current_State + 1;
595                        End_State := Current_State;
596                        Add_Empty_Char (End_Prev, End_State);
597                        Add_Empty_Char (End_Next, End_State);
598                     end;
599
600                  when Open_Bracket =>
601                     Current_State := Current_State + 1;
602
603                     declare
604                        Next_State : State_Index := Current_State + 1;
605
606                     begin
607                        J := J + 1;
608
609                        if S (J) = '^' then
610                           J := J + 1;
611
612                           Next_State := 0;
613
614                           for Column in 0 .. Alphabet_Size loop
615                              Set (Table, Current_State, Column,
616                                   Value => Current_State + 1);
617                           end loop;
618                        end if;
619
620                        --  Automatically add the first character
621
622                        if S (J) = '-' or S (J) = ']' then
623                           Set (Table, Current_State, Map (S (J)),
624                                Value => Next_State);
625                           J := J + 1;
626                        end if;
627
628                        --  Loop till closing bracket found
629
630                        loop
631                           exit when S (J) = Close_Bracket;
632
633                           if S (J) = '-'
634                             and then S (J + 1) /= ']'
635                           then
636                              declare
637                                 Start : constant Integer := J - 1;
638
639                              begin
640                                 J := J + 1;
641
642                                 if S (J) = '\' then
643                                    J := J + 1;
644                                 end if;
645
646                                 for Char in S (Start) .. S (J) loop
647                                    Set (Table, Current_State, Map (Char),
648                                         Value => Next_State);
649                                 end loop;
650                              end;
651
652                           else
653                              if S (J) = '\' then
654                                 J := J + 1;
655                              end if;
656
657                              Set (Table, Current_State, Map (S (J)),
658                                   Value => Next_State);
659                           end if;
660                           J := J + 1;
661                        end loop;
662                     end;
663
664                     Current_State := Current_State + 1;
665
666                     --  If the next symbol is a special symbol
667
668                     if J < End_Index
669                       and then (S (J + 1) = '*' or else
670                                 S (J + 1) = '+' or else
671                                 S (J + 1) = '?')
672                     then
673                        J := J + 1;
674                        Create_Repetition
675                          (S (J),
676                           Current_State - 1,
677                           Current_State,
678                           Last_Start,
679                           End_State);
680
681                     else
682                        Last_Start := Current_State - 1;
683
684                        if End_State /= 0 then
685                           Add_Empty_Char (End_State, Last_Start);
686                        end if;
687
688                        End_State := Current_State;
689                     end if;
690
691                  when '*' | '+' | '?' | Close_Paren | Close_Bracket =>
692                     Raise_Exception
693                       ("Incorrect character in regular expression :", J);
694
695                  when others =>
696                     Current_State := Current_State + 1;
697
698                     --  Create the state for the symbol S (J)
699
700                     if S (J) = '.' then
701                        for K in 0 .. Alphabet_Size loop
702                           Set (Table, Current_State, K,
703                                Value => Current_State + 1);
704                        end loop;
705
706                     else
707                        if S (J) = '\' then
708                           J := J + 1;
709                        end if;
710
711                        Set (Table, Current_State, Map (S (J)),
712                             Value => Current_State + 1);
713                     end if;
714
715                     Current_State := Current_State + 1;
716
717                     --  If the next symbol is a special symbol
718
719                     if J < End_Index
720                       and then (S (J + 1) = '*' or else
721                                 S (J + 1) = '+' or else
722                                 S (J + 1) = '?')
723                     then
724                        J := J + 1;
725                        Create_Repetition
726                          (S (J),
727                           Current_State - 1,
728                           Current_State,
729                           Last_Start,
730                           End_State);
731
732                     else
733                        Last_Start := Current_State - 1;
734
735                        if End_State /= 0 then
736                           Add_Empty_Char (End_State, Last_Start);
737                        end if;
738
739                        End_State := Current_State;
740                     end if;
741
742               end case;
743
744               if Start_State = 0 then
745                  Start_State := Last_Start;
746               end if;
747
748               J := J + 1;
749            end loop;
750         end Create_Simple;
751
752         -------------------------
753         -- Next_Sub_Expression --
754         -------------------------
755
756         function Next_Sub_Expression
757           (Start_Index : Integer;
758            End_Index   : Integer)
759            return        Integer
760         is
761            J              : Integer := Start_Index;
762            Start_On_Alter : Boolean := False;
763
764         begin
765            if S (J) = '|' then
766               Start_On_Alter := True;
767            end if;
768
769            loop
770               exit when J = End_Index;
771               J := J + 1;
772
773               case S (J) is
774                  when '\' =>
775                     J := J + 1;
776
777                  when Open_Bracket =>
778                     loop
779                        J := J + 1;
780                        exit when S (J) = Close_Bracket;
781
782                        if S (J) = '\' then
783                           J := J + 1;
784                        end if;
785                     end loop;
786
787                  when Open_Paren =>
788                     J := Next_Sub_Expression (J, End_Index);
789
790                  when Close_Paren =>
791                     return J;
792
793                  when '|' =>
794                     if Start_On_Alter then
795                        return J - 1;
796                     end if;
797
798                  when others =>
799                     null;
800               end case;
801            end loop;
802
803            return J;
804         end Next_Sub_Expression;
805
806      --  Start of Create_Primary_Table
807
808      begin
809         Table.all := (others => (others => 0));
810         Create_Simple (S'First, S'Last, Start_State, End_State);
811         Num_States := Current_State;
812      end Create_Primary_Table;
813
814      -------------------------------
815      -- Create_Primary_Table_Glob --
816      -------------------------------
817
818      procedure Create_Primary_Table_Glob
819        (Table       : out Regexp_Array_Access;
820         Num_States  : out State_Index;
821         Start_State : out State_Index;
822         End_State   : out State_Index)
823      is
824         Empty_Char : constant Column_Index := Alphabet_Size + 1;
825
826         Current_State : State_Index := 0;
827         --  Index of the last created state
828
829         procedure Add_Empty_Char
830           (State    : State_Index;
831            To_State : State_Index);
832         --  Add a empty-character transition from State to To_State.
833
834         procedure Create_Simple
835           (Start_Index : Integer;
836            End_Index   : Integer;
837            Start_State : out State_Index;
838            End_State   : out State_Index);
839         --  Fill the table for the S (Start_Index .. End_Index).
840         --  This is the recursive procedure called to handle () expressions
841
842         --------------------
843         -- Add_Empty_Char --
844         --------------------
845
846         procedure Add_Empty_Char
847           (State    : State_Index;
848            To_State : State_Index)
849         is
850            J : Column_Index := Empty_Char;
851
852         begin
853            while Get (Table, State, J) /= 0 loop
854               J := J + 1;
855            end loop;
856
857            Set (Table, State, J,
858                 Value => To_State);
859         end Add_Empty_Char;
860
861         -------------------
862         -- Create_Simple --
863         -------------------
864
865         procedure Create_Simple
866           (Start_Index : Integer;
867            End_Index   : Integer;
868            Start_State : out State_Index;
869            End_State   : out State_Index)
870         is
871            J          : Integer := Start_Index;
872            Last_Start : State_Index := 0;
873
874         begin
875            Start_State := 0;
876            End_State   := 0;
877
878            while J <= End_Index loop
879               case S (J) is
880
881                  when Open_Bracket =>
882                     Current_State := Current_State + 1;
883
884                     declare
885                        Next_State : State_Index := Current_State + 1;
886
887                     begin
888                        J := J + 1;
889
890                        if S (J) = '^' then
891                           J := J + 1;
892                           Next_State := 0;
893
894                           for Column in 0 .. Alphabet_Size loop
895                              Set (Table, Current_State, Column,
896                                   Value => Current_State + 1);
897                           end loop;
898                        end if;
899
900                        --  Automatically add the first character
901
902                        if S (J) = '-' or S (J) = ']' then
903                           Set (Table, Current_State, Map (S (J)),
904                                Value => Current_State);
905                           J := J + 1;
906                        end if;
907
908                        --  Loop till closing bracket found
909
910                        loop
911                           exit when S (J) = Close_Bracket;
912
913                           if S (J) = '-'
914                             and then S (J + 1) /= ']'
915                           then
916                              declare
917                                 Start : constant Integer := J - 1;
918                              begin
919                                 J := J + 1;
920
921                                 if S (J) = '\' then
922                                    J := J + 1;
923                                 end if;
924
925                                 for Char in S (Start) .. S (J) loop
926                                    Set (Table, Current_State, Map (Char),
927                                         Value => Next_State);
928                                 end loop;
929                              end;
930
931                           else
932                              if S (J) = '\' then
933                                 J := J + 1;
934                              end if;
935
936                              Set (Table, Current_State, Map (S (J)),
937                                   Value => Next_State);
938                           end if;
939                           J := J + 1;
940                        end loop;
941                     end;
942
943                     Last_Start := Current_State;
944                     Current_State := Current_State + 1;
945
946                     if End_State /= 0 then
947                        Add_Empty_Char (End_State, Last_Start);
948                     end if;
949
950                     End_State := Current_State;
951
952                  when '{' =>
953                     declare
954                        End_Sub          : Integer;
955                        Start_Regexp_Sub : State_Index;
956                        End_Regexp_Sub   : State_Index;
957                        Create_Start     : State_Index := 0;
958
959                        Create_End : State_Index := 0;
960                        --  Initialized to avoid junk warning
961
962                     begin
963                        while S (J) /= '}' loop
964
965                           --  First step : find sub pattern
966
967                           End_Sub := J + 1;
968                           while S (End_Sub) /= ','
969                             and then S (End_Sub) /= '}'
970                           loop
971                              End_Sub := End_Sub + 1;
972                           end loop;
973
974                           --  Second step : create a sub pattern
975
976                           Create_Simple
977                             (J + 1,
978                              End_Sub - 1,
979                              Start_Regexp_Sub,
980                              End_Regexp_Sub);
981
982                           J := End_Sub;
983
984                           --  Third step : create an alternative
985
986                           if Create_Start = 0 then
987                              Current_State := Current_State + 1;
988                              Create_Start := Current_State;
989                              Add_Empty_Char (Create_Start, Start_Regexp_Sub);
990                              Current_State := Current_State + 1;
991                              Create_End := Current_State;
992                              Add_Empty_Char (End_Regexp_Sub, Create_End);
993
994                           else
995                              Current_State := Current_State + 1;
996                              Add_Empty_Char (Current_State, Create_Start);
997                              Create_Start := Current_State;
998                              Add_Empty_Char (Create_Start, Start_Regexp_Sub);
999                              Add_Empty_Char (End_Regexp_Sub, Create_End);
1000                           end if;
1001                        end loop;
1002
1003                        if End_State /= 0 then
1004                           Add_Empty_Char (End_State, Create_Start);
1005                        end if;
1006
1007                        End_State := Create_End;
1008                        Last_Start := Create_Start;
1009                     end;
1010
1011                  when '*' =>
1012                     Current_State := Current_State + 1;
1013
1014                     if End_State /= 0 then
1015                        Add_Empty_Char (End_State, Current_State);
1016                     end if;
1017
1018                     Add_Empty_Char (Current_State, Current_State + 1);
1019                     Add_Empty_Char (Current_State, Current_State + 3);
1020                     Last_Start := Current_State;
1021
1022                     Current_State := Current_State + 1;
1023
1024                     for K in 0 .. Alphabet_Size loop
1025                        Set (Table, Current_State, K,
1026                             Value => Current_State + 1);
1027                     end loop;
1028
1029                     Current_State := Current_State + 1;
1030                     Add_Empty_Char (Current_State, Current_State + 1);
1031
1032                     Current_State := Current_State + 1;
1033                     Add_Empty_Char (Current_State,  Last_Start);
1034                     End_State := Current_State;
1035
1036                  when others =>
1037                     Current_State := Current_State + 1;
1038
1039                     if S (J) = '?' then
1040                        for K in 0 .. Alphabet_Size loop
1041                           Set (Table, Current_State, K,
1042                                Value => Current_State + 1);
1043                        end loop;
1044
1045                     else
1046                        if S (J) = '\' then
1047                           J := J + 1;
1048                        end if;
1049
1050                        --  Create the state for the symbol S (J)
1051
1052                        Set (Table, Current_State, Map (S (J)),
1053                             Value => Current_State + 1);
1054                     end if;
1055
1056                     Last_Start := Current_State;
1057                     Current_State := Current_State + 1;
1058
1059                     if End_State /= 0 then
1060                        Add_Empty_Char (End_State, Last_Start);
1061                     end if;
1062
1063                     End_State := Current_State;
1064
1065               end case;
1066
1067               if Start_State = 0 then
1068                  Start_State := Last_Start;
1069               end if;
1070
1071               J := J + 1;
1072            end loop;
1073         end Create_Simple;
1074
1075      --  Start of processing for Create_Primary_Table_Glob
1076
1077      begin
1078         Table.all := (others => (others => 0));
1079         Create_Simple (S'First, S'Last, Start_State, End_State);
1080         Num_States := Current_State;
1081      end Create_Primary_Table_Glob;
1082
1083      ----------------------------
1084      -- Create_Secondary_Table --
1085      ----------------------------
1086
1087      function Create_Secondary_Table
1088        (First_Table : Regexp_Array_Access;
1089         Num_States  : State_Index;
1090         Start_State : State_Index;
1091         End_State   : State_Index)
1092         return        Regexp
1093      is
1094         pragma Warnings (Off, Num_States);
1095
1096         Last_Index : constant State_Index := First_Table'Last (1);
1097         type Meta_State is array (1 .. Last_Index) of Boolean;
1098
1099         Table : Regexp_Array (1 .. Last_Index, 0 .. Alphabet_Size) :=
1100                   (others => (others => 0));
1101
1102         Meta_States : array (1 .. Last_Index + 1) of Meta_State :=
1103                         (others => (others => False));
1104
1105         Temp_State_Not_Null : Boolean;
1106
1107         Is_Final : Boolean_Array (1 .. Last_Index) := (others => False);
1108
1109         Current_State       : State_Index := 1;
1110         Nb_State            : State_Index := 1;
1111
1112         procedure Closure
1113           (State : in out Meta_State;
1114            Item  :        State_Index);
1115         --  Compute the closure of the state (that is every other state which
1116         --  has a empty-character transition) and add it to the state
1117
1118         -------------
1119         -- Closure --
1120         -------------
1121
1122         procedure Closure
1123           (State : in out Meta_State;
1124            Item  : State_Index)
1125         is
1126         begin
1127            if State (Item) then
1128               return;
1129            end if;
1130
1131            State (Item) := True;
1132
1133            for Column in Alphabet_Size + 1 .. First_Table'Last (2) loop
1134               if First_Table (Item, Column) = 0 then
1135                  return;
1136               end if;
1137
1138               Closure (State, First_Table (Item, Column));
1139            end loop;
1140         end Closure;
1141
1142      --  Start of procesing for Create_Secondary_Table
1143
1144      begin
1145         --  Create a new state
1146
1147         Closure (Meta_States (Current_State), Start_State);
1148
1149         while Current_State <= Nb_State loop
1150
1151            --  If this new meta-state includes the primary table end state,
1152            --  then this meta-state will be a final state in the regexp
1153
1154            if Meta_States (Current_State)(End_State) then
1155               Is_Final (Current_State) := True;
1156            end if;
1157
1158            --  For every character in the regexp, calculate the possible
1159            --  transitions from Current_State
1160
1161            for Column in 0 .. Alphabet_Size loop
1162               Meta_States (Nb_State + 1) := (others => False);
1163               Temp_State_Not_Null := False;
1164
1165               for K in Meta_States (Current_State)'Range loop
1166                  if Meta_States (Current_State)(K)
1167                    and then First_Table (K, Column) /= 0
1168                  then
1169                     Closure
1170                       (Meta_States (Nb_State + 1), First_Table (K, Column));
1171                     Temp_State_Not_Null := True;
1172                  end if;
1173               end loop;
1174
1175               --  If at least one transition existed
1176
1177               if Temp_State_Not_Null then
1178
1179                  --  Check if this new state corresponds to an old one
1180
1181                  for K in 1 .. Nb_State loop
1182                     if Meta_States (K) = Meta_States (Nb_State + 1) then
1183                        Table (Current_State, Column) := K;
1184                        exit;
1185                     end if;
1186                  end loop;
1187
1188                  --  If not, create a new state
1189
1190                  if Table (Current_State, Column) = 0 then
1191                     Nb_State := Nb_State + 1;
1192                     Table (Current_State, Column) := Nb_State;
1193                  end if;
1194               end if;
1195            end loop;
1196
1197            Current_State := Current_State + 1;
1198         end loop;
1199
1200         --  Returns the regexp
1201
1202         declare
1203            R : Regexp_Access;
1204
1205         begin
1206            R := new Regexp_Value (Alphabet_Size => Alphabet_Size,
1207                                   Num_States    => Nb_State);
1208            R.Map            := Map;
1209            R.Is_Final       := Is_Final (1 .. Nb_State);
1210            R.Case_Sensitive := Case_Sensitive;
1211
1212            for State in 1 .. Nb_State loop
1213               for K in 0 .. Alphabet_Size loop
1214                  R.States (State, K) := Table (State, K);
1215               end loop;
1216            end loop;
1217
1218            return (Ada.Finalization.Controlled with R => R);
1219         end;
1220      end Create_Secondary_Table;
1221
1222      ---------------------
1223      -- Raise_Exception --
1224      ---------------------
1225
1226      procedure Raise_Exception
1227        (M     : String;
1228         Index : Integer)
1229      is
1230      begin
1231         Ada.Exceptions.Raise_Exception
1232           (Error_In_Regexp'Identity, M & " at offset " & Index'Img);
1233      end Raise_Exception;
1234
1235   --  Start of processing for Compile
1236
1237   begin
1238      --  Special case for the empty string: it always matches, and the
1239      --  following processing would fail on it.
1240      if S = "" then
1241         return (Ada.Finalization.Controlled with
1242                 R => new Regexp_Value'
1243                      (Alphabet_Size => 0,
1244                       Num_States    => 1,
1245                       Map           => (others => 0),
1246                       States        => (others => (others => 1)),
1247                       Is_Final      => (others => True),
1248                       Case_Sensitive => True));
1249      end if;
1250
1251      if not Case_Sensitive then
1252         GNAT.Case_Util.To_Lower (S);
1253      end if;
1254
1255      Create_Mapping;
1256
1257      --  Creates the primary table
1258
1259      declare
1260         Table : Regexp_Array_Access;
1261         Num_States  : State_Index;
1262         Start_State : State_Index;
1263         End_State   : State_Index;
1264         R           : Regexp;
1265
1266      begin
1267         Table := new Regexp_Array (1 .. 100,
1268                                    0 .. Alphabet_Size + 10);
1269         if not Glob then
1270            Create_Primary_Table (Table, Num_States, Start_State, End_State);
1271         else
1272            Create_Primary_Table_Glob
1273              (Table, Num_States, Start_State, End_State);
1274         end if;
1275
1276         --  Creates the secondary table
1277
1278         R := Create_Secondary_Table
1279           (Table, Num_States, Start_State, End_State);
1280         Free (Table);
1281         return R;
1282      end;
1283   end Compile;
1284
1285   --------------
1286   -- Finalize --
1287   --------------
1288
1289   procedure Finalize (R : in out Regexp) is
1290      procedure Free is new
1291        Unchecked_Deallocation (Regexp_Value, Regexp_Access);
1292
1293   begin
1294      Free (R.R);
1295   end Finalize;
1296
1297   ---------
1298   -- Get --
1299   ---------
1300
1301   function Get
1302     (Table  : Regexp_Array_Access;
1303      State  : State_Index;
1304      Column : Column_Index)
1305      return   State_Index
1306   is
1307   begin
1308      if State <= Table'Last (1)
1309        and then Column <= Table'Last (2)
1310      then
1311         return Table (State, Column);
1312      else
1313         return 0;
1314      end if;
1315   end Get;
1316
1317   -----------
1318   -- Match --
1319   -----------
1320
1321   function Match (S : String; R : Regexp) return Boolean is
1322      Current_State : State_Index := 1;
1323
1324   begin
1325      if R.R = null then
1326         raise Constraint_Error;
1327      end if;
1328
1329      for Char in S'Range loop
1330
1331         if R.R.Case_Sensitive then
1332            Current_State := R.R.States (Current_State, R.R.Map (S (Char)));
1333         else
1334            Current_State :=
1335              R.R.States (Current_State,
1336                          R.R.Map (GNAT.Case_Util.To_Lower (S (Char))));
1337         end if;
1338
1339         if Current_State = 0 then
1340            return False;
1341         end if;
1342
1343      end loop;
1344
1345      return R.R.Is_Final (Current_State);
1346   end Match;
1347
1348   ---------
1349   -- Set --
1350   ---------
1351
1352   procedure Set
1353     (Table  : in out Regexp_Array_Access;
1354      State  : State_Index;
1355      Column : Column_Index;
1356      Value  : State_Index)
1357   is
1358      New_Lines   : State_Index;
1359      New_Columns : Column_Index;
1360      New_Table   : Regexp_Array_Access;
1361
1362   begin
1363      if State <= Table'Last (1)
1364        and then Column <= Table'Last (2)
1365      then
1366         Table (State, Column) := Value;
1367      else
1368         --  Doubles the size of the table until it is big enough that
1369         --  (State, Column) is a valid index
1370
1371         New_Lines := Table'Last (1) * (State / Table'Last (1) + 1);
1372         New_Columns := Table'Last (2) * (Column / Table'Last (2) + 1);
1373         New_Table := new Regexp_Array (Table'First (1) .. New_Lines,
1374                                        Table'First (2) .. New_Columns);
1375         New_Table.all := (others => (others => 0));
1376
1377         for J in Table'Range (1) loop
1378            for K in Table'Range (2) loop
1379               New_Table (J, K) := Table (J, K);
1380            end loop;
1381         end loop;
1382
1383         Free (Table);
1384         Table := New_Table;
1385         Table (State, Column) := Value;
1386      end if;
1387   end Set;
1388
1389end GNAT.Regexp;
1390