1------------------------------------------------------------------------------
2--                                                                          --
3--                         GNAT COMPILER COMPONENTS                         --
4--                                                                          --
5--                        S Y S T E M . R E G E X P                         --
6--                                                                          --
7--                                 B o d y                                  --
8--                                                                          --
9--                     Copyright (C) 1999-2012, AdaCore                     --
10--                                                                          --
11-- GNAT is free software;  you can  redistribute it  and/or modify it under --
12-- terms of the  GNU General Public License as published  by the Free Soft- --
13-- ware  Foundation;  either version 3,  or (at your option) any later ver- --
14-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
15-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
16-- or FITNESS FOR A PARTICULAR PURPOSE.                                     --
17--                                                                          --
18-- As a special exception under Section 7 of GPL version 3, you are granted --
19-- additional permissions described in the GCC Runtime Library Exception,   --
20-- version 3.1, as published by the Free Software Foundation.               --
21--                                                                          --
22-- You should have received a copy of the GNU General Public License and    --
23-- a copy of the GCC Runtime Library Exception along with this program;     --
24-- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
25-- <http://www.gnu.org/licenses/>.                                          --
26--                                                                          --
27-- GNAT was originally developed  by the GNAT team at  New York University. --
28-- Extensive contributions were provided by Ada Core Technologies Inc.      --
29--                                                                          --
30------------------------------------------------------------------------------
31
32with Ada.Unchecked_Deallocation;
33
34with System.Case_Util;
35
36package body System.Regexp is
37
38   Open_Paren    : constant Character := '(';
39   Close_Paren   : constant Character := ')';
40   Open_Bracket  : constant Character := '[';
41   Close_Bracket : constant Character := ']';
42
43   type State_Index is new Natural;
44   type Column_Index is new Natural;
45
46   type Regexp_Array is array
47     (State_Index range <>, Column_Index range <>) of State_Index;
48   --  First index is for the state number
49   --  Second index is for the character type
50   --  Contents is the new State
51
52   type Regexp_Array_Access is access Regexp_Array;
53   --  Use this type through the functions Set below, so that it
54   --  can grow dynamically depending on the needs.
55
56   type Mapping is array (Character'Range) of Column_Index;
57   --  Mapping between characters and column in the Regexp_Array
58
59   type Boolean_Array is array (State_Index range <>) of Boolean;
60
61   type Regexp_Value
62     (Alphabet_Size : Column_Index;
63      Num_States    : State_Index) is
64   record
65      Map            : Mapping;
66      States         : Regexp_Array (1 .. Num_States, 0 .. Alphabet_Size);
67      Is_Final       : Boolean_Array (1 .. Num_States);
68      Case_Sensitive : Boolean;
69   end record;
70   --  Deterministic finite-state machine
71
72   -----------------------
73   -- Local Subprograms --
74   -----------------------
75
76   procedure Set
77     (Table  : in out Regexp_Array_Access;
78      State  : State_Index;
79      Column : Column_Index;
80      Value  : State_Index);
81   --  Sets a value in the table. If the table is too small, reallocate it
82   --  dynamically so that (State, Column) is a valid index in it.
83
84   function Get
85     (Table  : Regexp_Array_Access;
86      State  : State_Index;
87      Column : Column_Index)
88      return   State_Index;
89   --  Returns the value in the table at (State, Column).
90   --  If this index does not exist in the table, returns 0
91
92   procedure Free is new Ada.Unchecked_Deallocation
93     (Regexp_Array, Regexp_Array_Access);
94
95   ------------
96   -- Adjust --
97   ------------
98
99   procedure Adjust (R : in out Regexp) is
100      Tmp : Regexp_Access;
101
102   begin
103      if R.R /= null then
104         Tmp := new Regexp_Value (Alphabet_Size => R.R.Alphabet_Size,
105                                  Num_States    => R.R.Num_States);
106         Tmp.all := R.R.all;
107         R.R := Tmp;
108      end if;
109   end Adjust;
110
111   -------------
112   -- Compile --
113   -------------
114
115   function Compile
116     (Pattern        : String;
117      Glob           : Boolean := False;
118      Case_Sensitive : Boolean := True)
119      return           Regexp
120   is
121      S : String := Pattern;
122      --  The pattern which is really compiled (when the pattern is case
123      --  insensitive, we convert this string to lower-cases
124
125      Map : Mapping := (others => 0);
126      --  Mapping between characters and columns in the tables
127
128      Alphabet_Size : Column_Index := 0;
129      --  Number of significant characters in the regular expression.
130      --  This total does not include special operators, such as *, (, ...
131
132      procedure Check_Well_Formed_Pattern;
133      --  Check that the pattern to compile is well-formed, so that subsequent
134      --  code can rely on this without performing each time the checks to
135      --  avoid accessing the pattern outside its bounds. However, not all
136      --  well-formedness rules are checked. In particular, rules about special
137      --  characters not being treated as regular characters are not checked.
138
139      procedure Create_Mapping;
140      --  Creates a mapping between characters in the regexp and columns
141      --  in the tables representing the regexp. Test that the regexp is
142      --  well-formed Modifies Alphabet_Size and Map
143
144      procedure Create_Primary_Table
145        (Table       : out Regexp_Array_Access;
146         Num_States  : out State_Index;
147         Start_State : out State_Index;
148         End_State   : out State_Index);
149      --  Creates the first version of the regexp (this is a non deterministic
150      --  finite state machine, which is unadapted for a fast pattern
151      --  matching algorithm). We use a recursive algorithm to process the
152      --  parenthesis sub-expressions.
153      --
154      --  Table : at the end of the procedure : Column 0 is for any character
155      --  ('.') and the last columns are for no character (closure)
156      --  Num_States is set to the number of states in the table
157      --  Start_State is the number of the starting state in the regexp
158      --  End_State is the number of the final state when the regexp matches
159
160      procedure Create_Primary_Table_Glob
161        (Table       : out Regexp_Array_Access;
162         Num_States  : out State_Index;
163         Start_State : out State_Index;
164         End_State   : out State_Index);
165      --  Same function as above, but it deals with the second possible
166      --  grammar for 'globbing pattern', which is a kind of subset of the
167      --  whole regular expression grammar.
168
169      function Create_Secondary_Table
170        (First_Table : Regexp_Array_Access;
171         Num_States  : State_Index;
172         Start_State : State_Index;
173         End_State   : State_Index)
174         return        Regexp;
175      --  Creates the definitive table representing the regular expression
176      --  This is actually a transformation of the primary table First_Table,
177      --  where every state is grouped with the states in its 'no-character'
178      --  columns. The transitions between the new states are then recalculated
179      --  and if necessary some new states are created.
180      --
181      --  Note that the resulting finite-state machine is not optimized in
182      --  terms of the number of states : it would be more time-consuming to
183      --  add a third pass to reduce the number of states in the machine, with
184      --  no speed improvement...
185
186      procedure Raise_Exception (M : String; Index : Integer);
187      pragma No_Return (Raise_Exception);
188      --  Raise an exception, indicating an error at character Index in S
189
190      -------------------------------
191      -- Check_Well_Formed_Pattern --
192      -------------------------------
193
194      procedure Check_Well_Formed_Pattern is
195         J : Integer;
196
197         Past_Elmt : Boolean := False;
198         --  Set to True everywhere an elmt has been parsed, if Glob=False,
199         --  meaning there can be now an occurrence of '*', '+' and '?'.
200
201         Past_Term : Boolean := False;
202         --  Set to True everywhere a term has been parsed, if Glob=False,
203         --  meaning there can be now an occurrence of '|'.
204
205         Parenthesis_Level : Integer := 0;
206         Curly_Level       : Integer := 0;
207
208         Last_Open : Integer := S'First - 1;
209         --  The last occurrence of an opening parenthesis, if Glob=False,
210         --  or the last occurrence of an opening curly brace, if Glob=True.
211
212         procedure Raise_Exception_If_No_More_Chars (K : Integer := 0);
213         --  If no more characters are raised, call Raise_Exception
214
215         --------------------------------------
216         -- Raise_Exception_If_No_More_Chars --
217         --------------------------------------
218
219         procedure Raise_Exception_If_No_More_Chars (K : Integer := 0) is
220         begin
221            if J + K > S'Last then
222               Raise_Exception ("Ill-formed pattern while parsing", J);
223            end if;
224         end Raise_Exception_If_No_More_Chars;
225
226      --  Start of processing for Check_Well_Formed_Pattern
227
228      begin
229         J := S'First;
230         while J <= S'Last loop
231            case S (J) is
232               when Open_Bracket =>
233                  J := J + 1;
234                  Raise_Exception_If_No_More_Chars;
235
236                  if not Glob then
237                     if S (J) = '^' then
238                        J := J + 1;
239                        Raise_Exception_If_No_More_Chars;
240                     end if;
241                  end if;
242
243                  --  The first character never has a special meaning
244
245                  if S (J) = ']' or else S (J) = '-' then
246                     J := J + 1;
247                     Raise_Exception_If_No_More_Chars;
248                  end if;
249
250                  --  The set of characters cannot be empty
251
252                  if S (J) = ']' then
253                     Raise_Exception
254                       ("Set of characters cannot be empty in regular "
255                          & "expression", J);
256                  end if;
257
258                  declare
259                     Possible_Range_Start : Boolean := True;
260                     --  Set True everywhere a range character '-' can occur
261
262                  begin
263                     loop
264                        exit when S (J) = Close_Bracket;
265
266                        --  The current character should be followed by a
267                        --  closing bracket.
268
269                        Raise_Exception_If_No_More_Chars (1);
270
271                        if S (J) = '-'
272                          and then S (J + 1) /= Close_Bracket
273                        then
274                           if not Possible_Range_Start then
275                              Raise_Exception
276                                ("No mix of ranges is allowed in "
277                                   & "regular expression", J);
278                           end if;
279
280                           J := J + 1;
281                           Raise_Exception_If_No_More_Chars;
282
283                           --  Range cannot be followed by '-' character,
284                           --  except as last character in the set.
285
286                           Possible_Range_Start := False;
287
288                        else
289                           Possible_Range_Start := True;
290                        end if;
291
292                        if S (J) = '\' then
293                           J := J + 1;
294                           Raise_Exception_If_No_More_Chars;
295                        end if;
296
297                        J := J + 1;
298                     end loop;
299                  end;
300
301                  --  A closing bracket can end an elmt or term
302
303                  Past_Elmt := True;
304                  Past_Term := True;
305
306               when Close_Bracket =>
307
308                  --  A close bracket must follow a open_bracket, and cannot be
309                  --  found alone on the line.
310
311                  Raise_Exception
312                    ("Incorrect character ']' in regular expression", J);
313
314               when '\' =>
315                  if J < S'Last then
316                     J := J + 1;
317
318                     --  Any character can be an elmt or a term
319
320                     Past_Elmt := True;
321                     Past_Term := True;
322
323                  else
324                     --  \ not allowed at the end of the regexp
325
326                     Raise_Exception
327                       ("Incorrect character '\' in regular expression", J);
328                  end if;
329
330               when Open_Paren =>
331                  if not Glob then
332                     Parenthesis_Level := Parenthesis_Level + 1;
333                     Last_Open := J;
334
335                     --  An open parenthesis does not end an elmt or term
336
337                     Past_Elmt := False;
338                     Past_Term := False;
339                  end if;
340
341               when Close_Paren =>
342                  if not Glob then
343                     Parenthesis_Level := Parenthesis_Level - 1;
344
345                     if Parenthesis_Level < 0 then
346                        Raise_Exception
347                          ("')' is not associated with '(' in regular "
348                           & "expression", J);
349                     end if;
350
351                     if J = Last_Open + 1 then
352                        Raise_Exception
353                          ("Empty parentheses not allowed in regular "
354                           & "expression", J);
355                     end if;
356
357                     if not Past_Term then
358                        Raise_Exception
359                          ("Closing parenthesis not allowed here in regular "
360                             & "expression", J);
361                     end if;
362
363                     --  A closing parenthesis can end an elmt or term
364
365                     Past_Elmt := True;
366                     Past_Term := True;
367                  end if;
368
369               when '{' =>
370                  if Glob then
371                     Curly_Level := Curly_Level + 1;
372                     Last_Open := J;
373
374                  else
375                     --  Any character can be an elmt or a term
376
377                     Past_Elmt := True;
378                     Past_Term := True;
379                  end if;
380
381                  --  No need to check for ',' as the code always accepts them
382
383               when '}' =>
384                  if Glob then
385                     Curly_Level := Curly_Level - 1;
386
387                     if Curly_Level < 0 then
388                        Raise_Exception
389                          ("'}' is not associated with '{' in regular "
390                           & "expression", J);
391                     end if;
392
393                     if J = Last_Open + 1 then
394                        Raise_Exception
395                          ("Empty curly braces not allowed in regular "
396                           & "expression", J);
397                     end if;
398
399                  else
400                     --  Any character can be an elmt or a term
401
402                     Past_Elmt := True;
403                     Past_Term := True;
404                  end if;
405
406               when '*' | '?' | '+' =>
407                  if not Glob then
408
409                     --  These operators must apply to an elmt sub-expression,
410                     --  and cannot be found if one has not just been parsed.
411
412                     if not Past_Elmt then
413                        Raise_Exception
414                          ("'*', '+' and '?' operators must be "
415                           & "applied to an element in regular expression", J);
416                     end if;
417
418                     Past_Elmt := False;
419                     Past_Term := True;
420                  end if;
421
422               when '|' =>
423                  if not Glob then
424
425                     --  This operator must apply to a term sub-expression,
426                     --  and cannot be found if one has not just been parsed.
427
428                     if not Past_Term then
429                        Raise_Exception
430                          ("'|' operator must be "
431                           & "applied to a term in regular expression", J);
432                     end if;
433
434                     Past_Elmt := False;
435                     Past_Term := False;
436                  end if;
437
438               when others =>
439                  if not Glob then
440
441                     --  Any character can be an elmt or a term
442
443                     Past_Elmt := True;
444                     Past_Term := True;
445                  end if;
446            end case;
447
448            J := J + 1;
449         end loop;
450
451         --  A closing parenthesis must follow an open parenthesis
452
453         if Parenthesis_Level /= 0 then
454            Raise_Exception
455              ("'(' must always be associated with a ')'", J);
456         end if;
457
458         --  A closing curly brace must follow an open curly brace
459
460         if Curly_Level /= 0 then
461            Raise_Exception
462              ("'{' must always be associated with a '}'", J);
463         end if;
464      end Check_Well_Formed_Pattern;
465
466      --------------------
467      -- Create_Mapping --
468      --------------------
469
470      procedure Create_Mapping is
471
472         procedure Add_In_Map (C : Character);
473         --  Add a character in the mapping, if it is not already defined
474
475         ----------------
476         -- Add_In_Map --
477         ----------------
478
479         procedure Add_In_Map (C : Character) is
480         begin
481            if Map (C) = 0 then
482               Alphabet_Size := Alphabet_Size + 1;
483               Map (C) := Alphabet_Size;
484            end if;
485         end Add_In_Map;
486
487         J                 : Integer := S'First;
488         Parenthesis_Level : Integer := 0;
489         Curly_Level       : Integer := 0;
490         Last_Open         : Integer := S'First - 1;
491
492      --  Start of processing for Create_Mapping
493
494      begin
495         while J <= S'Last loop
496            case S (J) is
497               when Open_Bracket =>
498                  J := J + 1;
499
500                  if S (J) = '^' then
501                     J := J + 1;
502                  end if;
503
504                  if S (J) = ']' or else S (J) = '-' then
505                     J := J + 1;
506                  end if;
507
508                  --  The first character never has a special meaning
509
510                  loop
511                     if J > S'Last then
512                        Raise_Exception
513                          ("Ran out of characters while parsing ", J);
514                     end if;
515
516                     exit when S (J) = Close_Bracket;
517
518                     if S (J) = '-'
519                       and then S (J + 1) /= Close_Bracket
520                     then
521                        declare
522                           Start : constant Integer := J - 1;
523
524                        begin
525                           J := J + 1;
526
527                           if S (J) = '\' then
528                              J := J + 1;
529                           end if;
530
531                           for Char in S (Start) .. S (J) loop
532                              Add_In_Map (Char);
533                           end loop;
534                        end;
535                     else
536                        if S (J) = '\' then
537                           J := J + 1;
538                        end if;
539
540                        Add_In_Map (S (J));
541                     end if;
542
543                     J := J + 1;
544                  end loop;
545
546                  --  A close bracket must follow a open_bracket,
547                  --  and cannot be found alone on the line
548
549               when Close_Bracket =>
550                  Raise_Exception
551                    ("Incorrect character ']' in regular expression", J);
552
553               when '\' =>
554                  if J < S'Last  then
555                     J := J + 1;
556                     Add_In_Map (S (J));
557
558                  else
559                     --  \ not allowed at the end of the regexp
560
561                     Raise_Exception
562                       ("Incorrect character '\' in regular expression", J);
563                  end if;
564
565               when Open_Paren =>
566                  if not Glob then
567                     Parenthesis_Level := Parenthesis_Level + 1;
568                     Last_Open := J;
569                  else
570                     Add_In_Map (Open_Paren);
571                  end if;
572
573               when Close_Paren =>
574                  if not Glob then
575                     Parenthesis_Level := Parenthesis_Level - 1;
576
577                     if Parenthesis_Level < 0 then
578                        Raise_Exception
579                          ("')' is not associated with '(' in regular "
580                           & "expression", J);
581                     end if;
582
583                     if J = Last_Open + 1 then
584                        Raise_Exception
585                          ("Empty parenthesis not allowed in regular "
586                           & "expression", J);
587                     end if;
588
589                  else
590                     Add_In_Map (Close_Paren);
591                  end if;
592
593               when '.' =>
594                  if Glob then
595                     Add_In_Map ('.');
596                  end if;
597
598               when '{' =>
599                  if not Glob then
600                     Add_In_Map (S (J));
601                  else
602                     Curly_Level := Curly_Level + 1;
603                  end if;
604
605               when '}' =>
606                  if not Glob then
607                     Add_In_Map (S (J));
608                  else
609                     Curly_Level := Curly_Level - 1;
610                  end if;
611
612               when '*' | '?' =>
613                  if not Glob then
614                     if J = S'First then
615                        Raise_Exception
616                          ("'*', '+', '?' and '|' operators cannot be in "
617                           & "first position in regular expression", J);
618                     end if;
619                  end if;
620
621               when '|' | '+' =>
622                  if not Glob then
623                     if J = S'First then
624
625                        --  These operators must apply to a sub-expression,
626                        --  and cannot be found at the beginning of the line
627
628                        Raise_Exception
629                          ("'*', '+', '?' and '|' operators cannot be in "
630                           & "first position in regular expression", J);
631                     end if;
632
633                  else
634                     Add_In_Map (S (J));
635                  end if;
636
637               when others =>
638                  Add_In_Map (S (J));
639            end case;
640
641            J := J + 1;
642         end loop;
643
644         --  A closing parenthesis must follow an open parenthesis
645
646         if Parenthesis_Level /= 0 then
647            Raise_Exception
648              ("'(' must always be associated with a ')'", J);
649         end if;
650
651         if Curly_Level /= 0 then
652            Raise_Exception
653              ("'{' must always be associated with a '}'", J);
654         end if;
655      end Create_Mapping;
656
657      --------------------------
658      -- Create_Primary_Table --
659      --------------------------
660
661      procedure Create_Primary_Table
662        (Table       : out Regexp_Array_Access;
663         Num_States  : out State_Index;
664         Start_State : out State_Index;
665         End_State   : out State_Index)
666      is
667         Empty_Char : constant Column_Index := Alphabet_Size + 1;
668
669         Current_State : State_Index := 0;
670         --  Index of the last created state
671
672         procedure Add_Empty_Char
673           (State    : State_Index;
674            To_State : State_Index);
675         --  Add a empty-character transition from State to To_State
676
677         procedure Create_Repetition
678           (Repetition : Character;
679            Start_Prev : State_Index;
680            End_Prev   : State_Index;
681            New_Start  : out State_Index;
682            New_End    : in out State_Index);
683         --  Create the table in case we have a '*', '+' or '?'.
684         --  Start_Prev .. End_Prev should indicate respectively the start and
685         --  end index of the previous expression, to which '*', '+' or '?' is
686         --  applied.
687
688         procedure Create_Simple
689           (Start_Index : Integer;
690            End_Index   : Integer;
691            Start_State : out State_Index;
692            End_State   : out State_Index);
693         --  Fill the table for the regexp Simple.
694         --  This is the recursive procedure called to handle () expressions
695         --  If End_State = 0, then the call to Create_Simple creates an
696         --  independent regexp, not a concatenation
697         --  Start_Index .. End_Index is the starting index in the string S.
698         --
699         --  Warning: it may look like we are creating too many empty-string
700         --  transitions, but they are needed to get the correct regexp.
701         --  The table is filled as follow ( s means start-state, e means
702         --  end-state) :
703         --
704         --  regexp   state_num | a b * empty_string
705         --  -------  ------------------------------
706         --    a          1 (s) | 2 - - -
707         --               2 (e) | - - - -
708         --
709         --    ab         1 (s) | 2 - - -
710         --               2     | - - - 3
711         --               3     | - 4 - -
712         --               4 (e) | - - - -
713         --
714         --    a|b        1     | 2 - - -
715         --               2     | - - - 6
716         --               3     | - 4 - -
717         --               4     | - - - 6
718         --               5 (s) | - - - 1,3
719         --               6 (e) | - - - -
720         --
721         --    a*         1     | 2 - - -
722         --               2     | - - - 4
723         --               3 (s) | - - - 1,4
724         --               4 (e) | - - - 3
725         --
726         --    (a)        1 (s) | 2 - - -
727         --               2 (e) | - - - -
728         --
729         --    a+         1     | 2 - - -
730         --               2     | - - - 4
731         --               3 (s) | - - - 1
732         --               4 (e) | - - - 3
733         --
734         --    a?         1     | 2 - - -
735         --               2     | - - - 4
736         --               3 (s) | - - - 1,4
737         --               4 (e) | - - - -
738         --
739         --    .          1 (s) | 2 2 2 -
740         --               2 (e) | - - - -
741
742         function Next_Sub_Expression
743           (Start_Index : Integer;
744            End_Index   : Integer)
745            return        Integer;
746         --  Returns the index of the last character of the next sub-expression
747         --  in Simple. Index cannot be greater than End_Index.
748
749         --------------------
750         -- Add_Empty_Char --
751         --------------------
752
753         procedure Add_Empty_Char
754           (State    : State_Index;
755            To_State : State_Index)
756         is
757            J : Column_Index := Empty_Char;
758
759         begin
760            while Get (Table, State, J) /= 0 loop
761               J := J + 1;
762            end loop;
763
764            Set (Table, State, J, To_State);
765         end Add_Empty_Char;
766
767         -----------------------
768         -- Create_Repetition --
769         -----------------------
770
771         procedure Create_Repetition
772           (Repetition : Character;
773            Start_Prev : State_Index;
774            End_Prev   : State_Index;
775            New_Start  : out State_Index;
776            New_End    : in out State_Index)
777         is
778         begin
779            New_Start := Current_State + 1;
780
781            if New_End /= 0 then
782               Add_Empty_Char (New_End, New_Start);
783            end if;
784
785            Current_State := Current_State + 2;
786            New_End   := Current_State;
787
788            Add_Empty_Char (End_Prev, New_End);
789            Add_Empty_Char (New_Start, Start_Prev);
790
791            if Repetition /= '+' then
792               Add_Empty_Char (New_Start, New_End);
793            end if;
794
795            if Repetition /= '?' then
796               Add_Empty_Char (New_End, New_Start);
797            end if;
798         end Create_Repetition;
799
800         -------------------
801         -- Create_Simple --
802         -------------------
803
804         procedure Create_Simple
805           (Start_Index : Integer;
806            End_Index   : Integer;
807            Start_State : out State_Index;
808            End_State   : out State_Index)
809         is
810            J          : Integer := Start_Index;
811            Last_Start : State_Index := 0;
812
813         begin
814            Start_State := 0;
815            End_State   := 0;
816            while J <= End_Index loop
817               case S (J) is
818                  when Open_Paren =>
819                     declare
820                        J_Start    : constant Integer := J + 1;
821                        Next_Start : State_Index;
822                        Next_End   : State_Index;
823
824                     begin
825                        J := Next_Sub_Expression (J, End_Index);
826                        Create_Simple (J_Start, J - 1, Next_Start, Next_End);
827
828                        if J < End_Index
829                          and then (S (J + 1) = '*' or else
830                                    S (J + 1) = '+' or else
831                                    S (J + 1) = '?')
832                        then
833                           J := J + 1;
834                           Create_Repetition
835                             (S (J),
836                              Next_Start,
837                              Next_End,
838                              Last_Start,
839                              End_State);
840
841                        else
842                           Last_Start := Next_Start;
843
844                           if End_State /= 0 then
845                              Add_Empty_Char (End_State, Last_Start);
846                           end if;
847
848                           End_State := Next_End;
849                        end if;
850                     end;
851
852                  when '|' =>
853                     declare
854                        Start_Prev : constant State_Index := Start_State;
855                        End_Prev   : constant State_Index := End_State;
856                        Start_J    : constant Integer     := J + 1;
857                        Start_Next : State_Index := 0;
858                        End_Next   : State_Index := 0;
859
860                     begin
861                        J := Next_Sub_Expression (J, End_Index);
862
863                        --  Create a new state for the start of the alternative
864
865                        Current_State := Current_State + 1;
866                        Last_Start := Current_State;
867                        Start_State := Last_Start;
868
869                        --  Create the tree for the second part of alternative
870
871                        Create_Simple (Start_J, J, Start_Next, End_Next);
872
873                        --  Create the end state
874
875                        Add_Empty_Char (Last_Start, Start_Next);
876                        Add_Empty_Char (Last_Start, Start_Prev);
877                        Current_State := Current_State + 1;
878                        End_State := Current_State;
879                        Add_Empty_Char (End_Prev, End_State);
880                        Add_Empty_Char (End_Next, End_State);
881                     end;
882
883                  when Open_Bracket =>
884                     Current_State := Current_State + 1;
885
886                     declare
887                        Next_State : State_Index := Current_State + 1;
888
889                     begin
890                        J := J + 1;
891
892                        if S (J) = '^' then
893                           J := J + 1;
894
895                           Next_State := 0;
896
897                           for Column in 0 .. Alphabet_Size loop
898                              Set (Table, Current_State, Column,
899                                   Value => Current_State + 1);
900                           end loop;
901                        end if;
902
903                        --  Automatically add the first character
904
905                        if S (J) = '-' or else S (J) = ']' then
906                           Set (Table, Current_State, Map (S (J)),
907                                Value => Next_State);
908                           J := J + 1;
909                        end if;
910
911                        --  Loop till closing bracket found
912
913                        loop
914                           exit when S (J) = Close_Bracket;
915
916                           if S (J) = '-'
917                             and then S (J + 1) /= ']'
918                           then
919                              declare
920                                 Start : constant Integer := J - 1;
921
922                              begin
923                                 J := J + 1;
924
925                                 if S (J) = '\' then
926                                    J := J + 1;
927                                 end if;
928
929                                 for Char in S (Start) .. S (J) loop
930                                    Set (Table, Current_State, Map (Char),
931                                         Value => Next_State);
932                                 end loop;
933                              end;
934
935                           else
936                              if S (J) = '\' then
937                                 J := J + 1;
938                              end if;
939
940                              Set (Table, Current_State, Map (S (J)),
941                                   Value => Next_State);
942                           end if;
943                           J := J + 1;
944                        end loop;
945                     end;
946
947                     Current_State := Current_State + 1;
948
949                     --  If the next symbol is a special symbol
950
951                     if J < End_Index
952                       and then (S (J + 1) = '*' or else
953                                 S (J + 1) = '+' or else
954                                 S (J + 1) = '?')
955                     then
956                        J := J + 1;
957                        Create_Repetition
958                          (S (J),
959                           Current_State - 1,
960                           Current_State,
961                           Last_Start,
962                           End_State);
963
964                     else
965                        Last_Start := Current_State - 1;
966
967                        if End_State /= 0 then
968                           Add_Empty_Char (End_State, Last_Start);
969                        end if;
970
971                        End_State := Current_State;
972                     end if;
973
974                  when '*' | '+' | '?' | Close_Paren | Close_Bracket =>
975                     Raise_Exception
976                       ("Incorrect character in regular expression :", J);
977
978                  when others =>
979                     Current_State := Current_State + 1;
980
981                     --  Create the state for the symbol S (J)
982
983                     if S (J) = '.' then
984                        for K in 0 .. Alphabet_Size loop
985                           Set (Table, Current_State, K,
986                                Value => Current_State + 1);
987                        end loop;
988
989                     else
990                        if S (J) = '\' then
991                           J := J + 1;
992                        end if;
993
994                        Set (Table, Current_State, Map (S (J)),
995                             Value => Current_State + 1);
996                     end if;
997
998                     Current_State := Current_State + 1;
999
1000                     --  If the next symbol is a special symbol
1001
1002                     if J < End_Index
1003                       and then (S (J + 1) = '*' or else
1004                                 S (J + 1) = '+' or else
1005                                 S (J + 1) = '?')
1006                     then
1007                        J := J + 1;
1008                        Create_Repetition
1009                          (S (J),
1010                           Current_State - 1,
1011                           Current_State,
1012                           Last_Start,
1013                           End_State);
1014
1015                     else
1016                        Last_Start := Current_State - 1;
1017
1018                        if End_State /= 0 then
1019                           Add_Empty_Char (End_State, Last_Start);
1020                        end if;
1021
1022                        End_State := Current_State;
1023                     end if;
1024
1025               end case;
1026
1027               if Start_State = 0 then
1028                  Start_State := Last_Start;
1029               end if;
1030
1031               J := J + 1;
1032            end loop;
1033         end Create_Simple;
1034
1035         -------------------------
1036         -- Next_Sub_Expression --
1037         -------------------------
1038
1039         function Next_Sub_Expression
1040           (Start_Index : Integer;
1041            End_Index   : Integer)
1042            return        Integer
1043         is
1044            J              : Integer := Start_Index;
1045            Start_On_Alter : Boolean := False;
1046
1047         begin
1048            if S (J) = '|' then
1049               Start_On_Alter := True;
1050            end if;
1051
1052            loop
1053               exit when J = End_Index;
1054               J := J + 1;
1055
1056               case S (J) is
1057                  when '\' =>
1058                     J := J + 1;
1059
1060                  when Open_Bracket =>
1061                     loop
1062                        J := J + 1;
1063                        exit when S (J) = Close_Bracket;
1064
1065                        if S (J) = '\' then
1066                           J := J + 1;
1067                        end if;
1068                     end loop;
1069
1070                  when Open_Paren =>
1071                     J := Next_Sub_Expression (J, End_Index);
1072
1073                  when Close_Paren =>
1074                     return J;
1075
1076                  when '|' =>
1077                     if Start_On_Alter then
1078                        return J - 1;
1079                     end if;
1080
1081                  when others =>
1082                     null;
1083               end case;
1084            end loop;
1085
1086            return J;
1087         end Next_Sub_Expression;
1088
1089      --  Start of Create_Primary_Table
1090
1091      begin
1092         Table.all := (others => (others => 0));
1093         Create_Simple (S'First, S'Last, Start_State, End_State);
1094         Num_States := Current_State;
1095      end Create_Primary_Table;
1096
1097      -------------------------------
1098      -- Create_Primary_Table_Glob --
1099      -------------------------------
1100
1101      procedure Create_Primary_Table_Glob
1102        (Table       : out Regexp_Array_Access;
1103         Num_States  : out State_Index;
1104         Start_State : out State_Index;
1105         End_State   : out State_Index)
1106      is
1107         Empty_Char : constant Column_Index := Alphabet_Size + 1;
1108
1109         Current_State : State_Index := 0;
1110         --  Index of the last created state
1111
1112         procedure Add_Empty_Char
1113           (State    : State_Index;
1114            To_State : State_Index);
1115         --  Add a empty-character transition from State to To_State
1116
1117         procedure Create_Simple
1118           (Start_Index : Integer;
1119            End_Index   : Integer;
1120            Start_State : out State_Index;
1121            End_State   : out State_Index);
1122         --  Fill the table for the S (Start_Index .. End_Index).
1123         --  This is the recursive procedure called to handle () expressions
1124
1125         --------------------
1126         -- Add_Empty_Char --
1127         --------------------
1128
1129         procedure Add_Empty_Char
1130           (State    : State_Index;
1131            To_State : State_Index)
1132         is
1133            J : Column_Index := Empty_Char;
1134
1135         begin
1136            while Get (Table, State, J) /= 0 loop
1137               J := J + 1;
1138            end loop;
1139
1140            Set (Table, State, J,
1141                 Value => To_State);
1142         end Add_Empty_Char;
1143
1144         -------------------
1145         -- Create_Simple --
1146         -------------------
1147
1148         procedure Create_Simple
1149           (Start_Index : Integer;
1150            End_Index   : Integer;
1151            Start_State : out State_Index;
1152            End_State   : out State_Index)
1153         is
1154            J          : Integer := Start_Index;
1155            Last_Start : State_Index := 0;
1156
1157         begin
1158            Start_State := 0;
1159            End_State   := 0;
1160
1161            while J <= End_Index loop
1162               case S (J) is
1163
1164                  when Open_Bracket =>
1165                     Current_State := Current_State + 1;
1166
1167                     declare
1168                        Next_State : State_Index := Current_State + 1;
1169
1170                     begin
1171                        J := J + 1;
1172
1173                        if S (J) = '^' then
1174                           J := J + 1;
1175                           Next_State := 0;
1176
1177                           for Column in 0 .. Alphabet_Size loop
1178                              Set (Table, Current_State, Column,
1179                                   Value => Current_State + 1);
1180                           end loop;
1181                        end if;
1182
1183                        --  Automatically add the first character
1184
1185                        if S (J) = '-' or else S (J) = ']' then
1186                           Set (Table, Current_State, Map (S (J)),
1187                                Value => Current_State);
1188                           J := J + 1;
1189                        end if;
1190
1191                        --  Loop till closing bracket found
1192
1193                        loop
1194                           exit when S (J) = Close_Bracket;
1195
1196                           if S (J) = '-'
1197                             and then S (J + 1) /= ']'
1198                           then
1199                              declare
1200                                 Start : constant Integer := J - 1;
1201                              begin
1202                                 J := J + 1;
1203
1204                                 if S (J) = '\' then
1205                                    J := J + 1;
1206                                 end if;
1207
1208                                 for Char in S (Start) .. S (J) loop
1209                                    Set (Table, Current_State, Map (Char),
1210                                         Value => Next_State);
1211                                 end loop;
1212                              end;
1213
1214                           else
1215                              if S (J) = '\' then
1216                                 J := J + 1;
1217                              end if;
1218
1219                              Set (Table, Current_State, Map (S (J)),
1220                                   Value => Next_State);
1221                           end if;
1222                           J := J + 1;
1223                        end loop;
1224                     end;
1225
1226                     Last_Start := Current_State;
1227                     Current_State := Current_State + 1;
1228
1229                     if End_State /= 0 then
1230                        Add_Empty_Char (End_State, Last_Start);
1231                     end if;
1232
1233                     End_State := Current_State;
1234
1235                  when '{' =>
1236                     declare
1237                        End_Sub          : Integer;
1238                        Start_Regexp_Sub : State_Index;
1239                        End_Regexp_Sub   : State_Index;
1240                        Create_Start     : State_Index := 0;
1241
1242                        Create_End : State_Index := 0;
1243                        --  Initialized to avoid junk warning
1244
1245                     begin
1246                        while S (J) /= '}' loop
1247
1248                           --  First step : find sub pattern
1249
1250                           End_Sub := J + 1;
1251                           while S (End_Sub) /= ','
1252                             and then S (End_Sub) /= '}'
1253                           loop
1254                              End_Sub := End_Sub + 1;
1255                           end loop;
1256
1257                           --  Second step : create a sub pattern
1258
1259                           Create_Simple
1260                             (J + 1,
1261                              End_Sub - 1,
1262                              Start_Regexp_Sub,
1263                              End_Regexp_Sub);
1264
1265                           J := End_Sub;
1266
1267                           --  Third step : create an alternative
1268
1269                           if Create_Start = 0 then
1270                              Current_State := Current_State + 1;
1271                              Create_Start := Current_State;
1272                              Add_Empty_Char (Create_Start, Start_Regexp_Sub);
1273                              Current_State := Current_State + 1;
1274                              Create_End := Current_State;
1275                              Add_Empty_Char (End_Regexp_Sub, Create_End);
1276
1277                           else
1278                              Current_State := Current_State + 1;
1279                              Add_Empty_Char (Current_State, Create_Start);
1280                              Create_Start := Current_State;
1281                              Add_Empty_Char (Create_Start, Start_Regexp_Sub);
1282                              Add_Empty_Char (End_Regexp_Sub, Create_End);
1283                           end if;
1284                        end loop;
1285
1286                        if End_State /= 0 then
1287                           Add_Empty_Char (End_State, Create_Start);
1288                        end if;
1289
1290                        End_State := Create_End;
1291                        Last_Start := Create_Start;
1292                     end;
1293
1294                  when '*' =>
1295                     Current_State := Current_State + 1;
1296
1297                     if End_State /= 0 then
1298                        Add_Empty_Char (End_State, Current_State);
1299                     end if;
1300
1301                     Add_Empty_Char (Current_State, Current_State + 1);
1302                     Add_Empty_Char (Current_State, Current_State + 3);
1303                     Last_Start := Current_State;
1304
1305                     Current_State := Current_State + 1;
1306
1307                     for K in 0 .. Alphabet_Size loop
1308                        Set (Table, Current_State, K,
1309                             Value => Current_State + 1);
1310                     end loop;
1311
1312                     Current_State := Current_State + 1;
1313                     Add_Empty_Char (Current_State, Current_State + 1);
1314
1315                     Current_State := Current_State + 1;
1316                     Add_Empty_Char (Current_State,  Last_Start);
1317                     End_State := Current_State;
1318
1319                  when others =>
1320                     Current_State := Current_State + 1;
1321
1322                     if S (J) = '?' then
1323                        for K in 0 .. Alphabet_Size loop
1324                           Set (Table, Current_State, K,
1325                                Value => Current_State + 1);
1326                        end loop;
1327
1328                     else
1329                        if S (J) = '\' then
1330                           J := J + 1;
1331                        end if;
1332
1333                        --  Create the state for the symbol S (J)
1334
1335                        Set (Table, Current_State, Map (S (J)),
1336                             Value => Current_State + 1);
1337                     end if;
1338
1339                     Last_Start := Current_State;
1340                     Current_State := Current_State + 1;
1341
1342                     if End_State /= 0 then
1343                        Add_Empty_Char (End_State, Last_Start);
1344                     end if;
1345
1346                     End_State := Current_State;
1347
1348               end case;
1349
1350               if Start_State = 0 then
1351                  Start_State := Last_Start;
1352               end if;
1353
1354               J := J + 1;
1355            end loop;
1356         end Create_Simple;
1357
1358      --  Start of processing for Create_Primary_Table_Glob
1359
1360      begin
1361         Table.all := (others => (others => 0));
1362         Create_Simple (S'First, S'Last, Start_State, End_State);
1363         Num_States := Current_State;
1364      end Create_Primary_Table_Glob;
1365
1366      ----------------------------
1367      -- Create_Secondary_Table --
1368      ----------------------------
1369
1370      function Create_Secondary_Table
1371        (First_Table : Regexp_Array_Access;
1372         Num_States  : State_Index;
1373         Start_State : State_Index;
1374         End_State   : State_Index) return Regexp
1375      is
1376         pragma Warnings (Off, Num_States);
1377
1378         Last_Index : constant State_Index := First_Table'Last (1);
1379         type Meta_State is array (1 .. Last_Index) of Boolean;
1380
1381         Table : Regexp_Array (1 .. Last_Index, 0 .. Alphabet_Size) :=
1382                   (others => (others => 0));
1383
1384         Meta_States : array (1 .. Last_Index + 1) of Meta_State :=
1385                         (others => (others => False));
1386
1387         Temp_State_Not_Null : Boolean;
1388
1389         Is_Final : Boolean_Array (1 .. Last_Index) := (others => False);
1390
1391         Current_State       : State_Index := 1;
1392         Nb_State            : State_Index := 1;
1393
1394         procedure Closure
1395           (State : in out Meta_State;
1396            Item  :        State_Index);
1397         --  Compute the closure of the state (that is every other state which
1398         --  has a empty-character transition) and add it to the state
1399
1400         -------------
1401         -- Closure --
1402         -------------
1403
1404         procedure Closure
1405           (State : in out Meta_State;
1406            Item  : State_Index)
1407         is
1408         begin
1409            if State (Item) then
1410               return;
1411            end if;
1412
1413            State (Item) := True;
1414
1415            for Column in Alphabet_Size + 1 .. First_Table'Last (2) loop
1416               if First_Table (Item, Column) = 0 then
1417                  return;
1418               end if;
1419
1420               Closure (State, First_Table (Item, Column));
1421            end loop;
1422         end Closure;
1423
1424      --  Start of processing for Create_Secondary_Table
1425
1426      begin
1427         --  Create a new state
1428
1429         Closure (Meta_States (Current_State), Start_State);
1430
1431         while Current_State <= Nb_State loop
1432
1433            --  If this new meta-state includes the primary table end state,
1434            --  then this meta-state will be a final state in the regexp
1435
1436            if Meta_States (Current_State)(End_State) then
1437               Is_Final (Current_State) := True;
1438            end if;
1439
1440            --  For every character in the regexp, calculate the possible
1441            --  transitions from Current_State
1442
1443            for Column in 0 .. Alphabet_Size loop
1444               Meta_States (Nb_State + 1) := (others => False);
1445               Temp_State_Not_Null := False;
1446
1447               for K in Meta_States (Current_State)'Range loop
1448                  if Meta_States (Current_State)(K)
1449                    and then First_Table (K, Column) /= 0
1450                  then
1451                     Closure
1452                       (Meta_States (Nb_State + 1), First_Table (K, Column));
1453                     Temp_State_Not_Null := True;
1454                  end if;
1455               end loop;
1456
1457               --  If at least one transition existed
1458
1459               if Temp_State_Not_Null then
1460
1461                  --  Check if this new state corresponds to an old one
1462
1463                  for K in 1 .. Nb_State loop
1464                     if Meta_States (K) = Meta_States (Nb_State + 1) then
1465                        Table (Current_State, Column) := K;
1466                        exit;
1467                     end if;
1468                  end loop;
1469
1470                  --  If not, create a new state
1471
1472                  if Table (Current_State, Column) = 0 then
1473                     Nb_State := Nb_State + 1;
1474                     Table (Current_State, Column) := Nb_State;
1475                  end if;
1476               end if;
1477            end loop;
1478
1479            Current_State := Current_State + 1;
1480         end loop;
1481
1482         --  Returns the regexp
1483
1484         declare
1485            R : Regexp_Access;
1486
1487         begin
1488            R := new Regexp_Value (Alphabet_Size => Alphabet_Size,
1489                                   Num_States    => Nb_State);
1490            R.Map            := Map;
1491            R.Is_Final       := Is_Final (1 .. Nb_State);
1492            R.Case_Sensitive := Case_Sensitive;
1493
1494            for State in 1 .. Nb_State loop
1495               for K in 0 .. Alphabet_Size loop
1496                  R.States (State, K) := Table (State, K);
1497               end loop;
1498            end loop;
1499
1500            return (Ada.Finalization.Controlled with R => R);
1501         end;
1502      end Create_Secondary_Table;
1503
1504      ---------------------
1505      -- Raise_Exception --
1506      ---------------------
1507
1508      procedure Raise_Exception (M : String; Index : Integer) is
1509      begin
1510         raise Error_In_Regexp with M & " at offset" & Index'Img;
1511      end Raise_Exception;
1512
1513   --  Start of processing for Compile
1514
1515   begin
1516      --  Special case for the empty string: it always matches, and the
1517      --  following processing would fail on it.
1518      if S = "" then
1519         return (Ada.Finalization.Controlled with
1520                 R => new Regexp_Value'
1521                      (Alphabet_Size => 0,
1522                       Num_States    => 1,
1523                       Map           => (others => 0),
1524                       States        => (others => (others => 1)),
1525                       Is_Final      => (others => True),
1526                       Case_Sensitive => True));
1527      end if;
1528
1529      if not Case_Sensitive then
1530         System.Case_Util.To_Lower (S);
1531      end if;
1532
1533      --  Check the pattern is well-formed before any treatment
1534
1535      Check_Well_Formed_Pattern;
1536
1537      Create_Mapping;
1538
1539      --  Creates the primary table
1540
1541      declare
1542         Table       : Regexp_Array_Access;
1543         Num_States  : State_Index;
1544         Start_State : State_Index;
1545         End_State   : State_Index;
1546         R           : Regexp;
1547
1548      begin
1549         Table := new Regexp_Array (1 .. 100,
1550                                    0 .. Alphabet_Size + 10);
1551         if not Glob then
1552            Create_Primary_Table (Table, Num_States, Start_State, End_State);
1553         else
1554            Create_Primary_Table_Glob
1555              (Table, Num_States, Start_State, End_State);
1556         end if;
1557
1558         --  Creates the secondary table
1559
1560         R := Create_Secondary_Table
1561           (Table, Num_States, Start_State, End_State);
1562         Free (Table);
1563         return R;
1564      end;
1565   end Compile;
1566
1567   --------------
1568   -- Finalize --
1569   --------------
1570
1571   procedure Finalize (R : in out Regexp) is
1572      procedure Free is new
1573        Ada.Unchecked_Deallocation (Regexp_Value, Regexp_Access);
1574
1575   begin
1576      Free (R.R);
1577   end Finalize;
1578
1579   ---------
1580   -- Get --
1581   ---------
1582
1583   function Get
1584     (Table  : Regexp_Array_Access;
1585      State  : State_Index;
1586      Column : Column_Index) return State_Index
1587   is
1588   begin
1589      if State <= Table'Last (1)
1590        and then Column <= Table'Last (2)
1591      then
1592         return Table (State, Column);
1593      else
1594         return 0;
1595      end if;
1596   end Get;
1597
1598   -----------
1599   -- Match --
1600   -----------
1601
1602   function Match (S : String; R : Regexp) return Boolean is
1603      Current_State : State_Index := 1;
1604
1605   begin
1606      if R.R = null then
1607         raise Constraint_Error;
1608      end if;
1609
1610      for Char in S'Range loop
1611
1612         if R.R.Case_Sensitive then
1613            Current_State := R.R.States (Current_State, R.R.Map (S (Char)));
1614         else
1615            Current_State :=
1616              R.R.States (Current_State,
1617                          R.R.Map (System.Case_Util.To_Lower (S (Char))));
1618         end if;
1619
1620         if Current_State = 0 then
1621            return False;
1622         end if;
1623
1624      end loop;
1625
1626      return R.R.Is_Final (Current_State);
1627   end Match;
1628
1629   ---------
1630   -- Set --
1631   ---------
1632
1633   procedure Set
1634     (Table  : in out Regexp_Array_Access;
1635      State  : State_Index;
1636      Column : Column_Index;
1637      Value  : State_Index)
1638   is
1639      New_Lines   : State_Index;
1640      New_Columns : Column_Index;
1641      New_Table   : Regexp_Array_Access;
1642
1643   begin
1644      if State <= Table'Last (1)
1645        and then Column <= Table'Last (2)
1646      then
1647         Table (State, Column) := Value;
1648      else
1649         --  Doubles the size of the table until it is big enough that
1650         --  (State, Column) is a valid index
1651
1652         New_Lines := Table'Last (1) * (State / Table'Last (1) + 1);
1653         New_Columns := Table'Last (2) * (Column / Table'Last (2) + 1);
1654         New_Table := new Regexp_Array (Table'First (1) .. New_Lines,
1655                                        Table'First (2) .. New_Columns);
1656         New_Table.all := (others => (others => 0));
1657
1658         for J in Table'Range (1) loop
1659            for K in Table'Range (2) loop
1660               New_Table (J, K) := Table (J, K);
1661            end loop;
1662         end loop;
1663
1664         Free (Table);
1665         Table := New_Table;
1666         Table (State, Column) := Value;
1667      end if;
1668   end Set;
1669
1670end System.Regexp;
1671