1------------------------------------------------------------------------------
2--                                                                          --
3--                         GNAT LIBRARY COMPONENTS                          --
4--                                                                          --
5--                          G N A T . R E G P A T                           --
6--                                                                          --
7--                                 B o d y                                  --
8--                                                                          --
9--               Copyright (C) 1986 by University of Toronto.               --
10--           Copyright (C) 1996-2003 Ada Core Technologies, Inc.            --
11--                                                                          --
12-- GNAT is free software;  you can  redistribute it  and/or modify it under --
13-- terms of the  GNU General Public License as published  by the Free Soft- --
14-- ware  Foundation;  either version 2,  or (at your option) any later ver- --
15-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
16-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
17-- or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License --
18-- for  more details.  You should have  received  a copy of the GNU General --
19-- Public License  distributed with GNAT;  see file COPYING.  If not, write --
20-- to  the Free Software Foundation,  59 Temple Place - Suite 330,  Boston, --
21-- MA 02111-1307, USA.                                                      --
22--                                                                          --
23-- As a special exception,  if other files  instantiate  generics from this --
24-- unit, or you link  this unit with other files  to produce an executable, --
25-- this  unit  does not  by itself cause  the resulting  executable  to  be --
26-- covered  by the  GNU  General  Public  License.  This exception does not --
27-- however invalidate  any other reasons why  the executable file  might be --
28-- covered by the  GNU Public License.                                      --
29--                                                                          --
30-- GNAT was originally developed  by the GNAT team at  New York University. --
31-- Extensive contributions were provided by Ada Core Technologies Inc.      --
32--                                                                          --
33------------------------------------------------------------------------------
34
35--  This is an altered Ada 95 version of the original V8 style regular
36--  expression library written in C by Henry Spencer. Apart from the
37--  translation to Ada, the interface has been considerably changed to
38--  use the Ada String type instead of C-style nul-terminated strings.
39
40--  Beware that some of this code is subtly aware of the way operator
41--  precedence is structured in regular expressions. Serious changes in
42--  regular-expression syntax might require a total rethink.
43
44with System.IO;               use System.IO;
45with Ada.Characters.Handling; use Ada.Characters.Handling;
46with Unchecked_Conversion;
47
48package body GNAT.Regpat is
49
50   MAGIC : constant Character := Character'Val (10#0234#);
51   --  The first byte of the regexp internal "program" is actually
52   --  this magic number; the start node begins in the second byte.
53   --
54   --  This is used to make sure that a regular expression was correctly
55   --  compiled.
56
57   ----------------------------
58   -- Implementation details --
59   ----------------------------
60
61   --  This is essentially a linear encoding of a nondeterministic
62   --  finite-state machine, also known as syntax charts or
63   --  "railroad normal form" in parsing technology.
64
65   --  Each node is an opcode plus a "next" pointer, possibly plus an
66   --  operand. "Next" pointers of all nodes except BRANCH implement
67   --  concatenation; a "next" pointer with a BRANCH on both ends of it
68   --  is connecting two alternatives.
69
70   --  The operand of some types of node is a literal string; for others,
71   --  it is a node leading into a sub-FSM. In particular, the operand of
72   --  a BRANCH node is the first node of the branch.
73   --  (NB this is *not* a tree structure:  the tail of the branch connects
74   --  to the thing following the set of BRANCHes).
75
76   --  You can see the exact byte-compiled version by using the Dump
77   --  subprogram. However, here are a few examples:
78
79   --  (a|b):  1 : MAGIC
80   --          2 : BRANCH  (next at  10)
81   --          5 :    EXACT  (next at  18)   operand=a
82   --         10 : BRANCH  (next at  18)
83   --         13 :    EXACT  (next at  18)   operand=b
84   --         18 : EOP  (next at 0)
85   --
86   --  (ab)*:  1 : MAGIC
87   --          2 : CURLYX  (next at  26)  { 0, 32767}
88   --          9 :    OPEN 1  (next at  13)
89   --         13 :       EXACT  (next at  19)   operand=ab
90   --         19 :    CLOSE 1  (next at  23)
91   --         23 :    WHILEM  (next at 0)
92   --         26 : NOTHING  (next at  29)
93   --         29 : EOP  (next at 0)
94
95   --  The opcodes are:
96
97   type Opcode is
98
99      --  Name          Operand?  Meaning
100
101     (EOP,        -- no        End of program
102      MINMOD,     -- no        Next operator is not greedy
103
104      --  Classes of characters
105
106      ANY,        -- no        Match any one character except newline
107      SANY,       -- no        Match any character, including new line
108      ANYOF,      -- class     Match any character in this class
109      EXACT,      -- str       Match this string exactly
110      EXACTF,     -- str       Match this string (case-folding is one)
111      NOTHING,    -- no        Match empty string
112      SPACE,      -- no        Match any whitespace character
113      NSPACE,     -- no        Match any non-whitespace character
114      DIGIT,      -- no        Match any numeric character
115      NDIGIT,     -- no        Match any non-numeric character
116      ALNUM,      -- no        Match any alphanumeric character
117      NALNUM,     -- no        Match any non-alphanumeric character
118
119      --  Branches
120
121      BRANCH,     -- node      Match this alternative, or the next
122
123      --  Simple loops (when the following node is one character in length)
124
125      STAR,       -- node      Match this simple thing 0 or more times
126      PLUS,       -- node      Match this simple thing 1 or more times
127      CURLY,      -- 2num node Match this simple thing between n and m times.
128
129      --  Complex loops
130
131      CURLYX,     -- 2num node Match this complex thing {n,m} times
132      --                       The nums are coded on two characters each.
133
134      WHILEM,     -- no        Do curly processing and see if rest matches
135
136      --  Matches after or before a word
137
138      BOL,        -- no        Match "" at beginning of line
139      MBOL,       -- no        Same, assuming mutiline (match after \n)
140      SBOL,       -- no        Same, assuming single line (don't match at \n)
141      EOL,        -- no        Match "" at end of line
142      MEOL,       -- no        Same, assuming mutiline (match before \n)
143      SEOL,       -- no        Same, assuming single line (don't match at \n)
144
145      BOUND,      -- no        Match "" at any word boundary
146      NBOUND,     -- no        Match "" at any word non-boundary
147
148      --  Parenthesis groups handling
149
150      REFF,       -- num       Match some already matched string, folded
151      OPEN,       -- num       Mark this point in input as start of #n
152      CLOSE);     -- num       Analogous to OPEN
153
154   for Opcode'Size use 8;
155
156   --  Opcode notes:
157
158   --  BRANCH
159   --    The set of branches constituting a single choice are hooked
160   --    together with their "next" pointers, since precedence prevents
161   --    anything being concatenated to any individual branch. The
162   --    "next" pointer of the last BRANCH in a choice points to the
163   --    thing following the whole choice. This is also where the
164   --    final "next" pointer of each individual branch points; each
165   --    branch starts with the operand node of a BRANCH node.
166
167   --  STAR,PLUS
168   --    '?', and complex '*' and '+', are implemented with CURLYX.
169   --    branches. Simple cases (one character per match) are implemented with
170   --    STAR and PLUS for speed and to minimize recursive plunges.
171
172   --  OPEN,CLOSE
173   --    ...are numbered at compile time.
174
175   --  EXACT, EXACTF
176   --    There are in fact two arguments, the first one is the length (minus
177   --    one of the string argument), coded on one character, the second
178   --    argument is the string itself, coded on length + 1 characters.
179
180   --  A node is one char of opcode followed by two chars of "next" pointer.
181   --  "Next" pointers are stored as two 8-bit pieces, high order first. The
182   --  value is a positive offset from the opcode of the node containing it.
183   --  An operand, if any, simply follows the node. (Note that much of the
184   --  code generation knows about this implicit relationship.)
185
186   --  Using two bytes for the "next" pointer is vast overkill for most
187   --  things, but allows patterns to get big without disasters.
188
189   -----------------------
190   -- Character classes --
191   -----------------------
192   --  This is the implementation for character classes ([...]) in the
193   --  syntax for regular expressions. Each character (0..256) has an
194   --  entry into the table. This makes for a very fast matching
195   --  algorithm.
196
197   type Class_Byte is mod 256;
198   type Character_Class is array (Class_Byte range 0 .. 31) of Class_Byte;
199
200   type Bit_Conversion_Array is array (Class_Byte range 0 .. 7) of Class_Byte;
201   Bit_Conversion : constant Bit_Conversion_Array :=
202                      (1, 2, 4, 8, 16, 32, 64, 128);
203
204   type Std_Class is (ANYOF_NONE,
205                      ANYOF_ALNUM,   --  Alphanumeric class [a-zA-Z0-9]
206                      ANYOF_NALNUM,
207                      ANYOF_SPACE,   --  Space class [ \t\n\r\f]
208                      ANYOF_NSPACE,
209                      ANYOF_DIGIT,   --  Digit class [0-9]
210                      ANYOF_NDIGIT,
211                      ANYOF_ALNUMC,  --  Alphanumeric class [a-zA-Z0-9]
212                      ANYOF_NALNUMC,
213                      ANYOF_ALPHA,   --  Alpha class [a-zA-Z]
214                      ANYOF_NALPHA,
215                      ANYOF_ASCII,   --  Ascii class (7 bits) 0..127
216                      ANYOF_NASCII,
217                      ANYOF_CNTRL,   --  Control class
218                      ANYOF_NCNTRL,
219                      ANYOF_GRAPH,   --  Graphic class
220                      ANYOF_NGRAPH,
221                      ANYOF_LOWER,   --  Lower case class [a-z]
222                      ANYOF_NLOWER,
223                      ANYOF_PRINT,   --  printable class
224                      ANYOF_NPRINT,
225                      ANYOF_PUNCT,   --
226                      ANYOF_NPUNCT,
227                      ANYOF_UPPER,   --  Upper case class [A-Z]
228                      ANYOF_NUPPER,
229                      ANYOF_XDIGIT,  --  Hexadecimal digit
230                      ANYOF_NXDIGIT
231                      );
232
233   procedure Set_In_Class
234     (Bitmap : in out Character_Class;
235      C      : Character);
236   --  Set the entry to True for C in the class Bitmap.
237
238   function Get_From_Class
239     (Bitmap : Character_Class;
240      C      : Character) return Boolean;
241   --  Return True if the entry is set for C in the class Bitmap.
242
243   procedure Reset_Class (Bitmap : out Character_Class);
244   --  Clear all the entries in the class Bitmap.
245
246   pragma Inline (Set_In_Class);
247   pragma Inline (Get_From_Class);
248   pragma Inline (Reset_Class);
249
250   -----------------------
251   -- Local Subprograms --
252   -----------------------
253
254   function "=" (Left : Character; Right : Opcode) return Boolean;
255
256   function Is_Alnum (C : Character) return Boolean;
257   --  Return True if C is an alphanum character or an underscore ('_')
258
259   function Is_White_Space (C : Character) return Boolean;
260   --  Return True if C is a whitespace character
261
262   function Is_Printable (C : Character) return Boolean;
263   --  Return True if C is a printable character
264
265   function Operand (P : Pointer) return Pointer;
266   --  Return a pointer to the first operand of the node at P
267
268   function String_Length
269     (Program : Program_Data;
270      P       : Pointer) return Program_Size;
271   --  Return the length of the string argument of the node at P
272
273   function String_Operand (P : Pointer) return Pointer;
274   --  Return a pointer to the string argument of the node at P
275
276   procedure Bitmap_Operand
277     (Program : Program_Data;
278      P       : Pointer;
279      Op      : out Character_Class);
280   --  Return a pointer to the string argument of the node at P
281
282   function Get_Next_Offset
283     (Program : Program_Data;
284      IP      : Pointer) return Pointer;
285   --  Get the offset field of a node. Used by Get_Next.
286
287   function Get_Next
288     (Program : Program_Data;
289      IP      : Pointer) return Pointer;
290   --  Dig the next instruction pointer out of a node
291
292   procedure Optimize (Self : in out Pattern_Matcher);
293   --  Optimize a Pattern_Matcher by noting certain special cases
294
295   function Read_Natural
296     (Program : Program_Data;
297      IP      : Pointer) return Natural;
298   --  Return the 2-byte natural coded at position IP.
299
300   --  All of the subprograms above are tiny and should be inlined
301
302   pragma Inline ("=");
303   pragma Inline (Is_Alnum);
304   pragma Inline (Is_White_Space);
305   pragma Inline (Get_Next);
306   pragma Inline (Get_Next_Offset);
307   pragma Inline (Operand);
308   pragma Inline (Read_Natural);
309   pragma Inline (String_Length);
310   pragma Inline (String_Operand);
311
312   type Expression_Flags is record
313      Has_Width,            -- Known never to match null string
314      Simple,               -- Simple enough to be STAR/PLUS operand
315      SP_Start  : Boolean;  -- Starts with * or +
316   end record;
317
318   Worst_Expression : constant Expression_Flags := (others => False);
319   --  Worst case
320
321   ---------
322   -- "=" --
323   ---------
324
325   function "=" (Left : Character; Right : Opcode) return Boolean is
326   begin
327      return Character'Pos (Left) = Opcode'Pos (Right);
328   end "=";
329
330   --------------------
331   -- Bitmap_Operand --
332   --------------------
333
334   procedure Bitmap_Operand
335     (Program : Program_Data;
336      P       : Pointer;
337      Op      : out Character_Class)
338   is
339      function Convert is new Unchecked_Conversion
340        (Program_Data, Character_Class);
341
342   begin
343      Op (0 .. 31) := Convert (Program (P + 3 .. P + 34));
344   end Bitmap_Operand;
345
346   -------------
347   -- Compile --
348   -------------
349
350   procedure Compile
351     (Matcher         : out Pattern_Matcher;
352      Expression      : String;
353      Final_Code_Size : out Program_Size;
354      Flags           : Regexp_Flags := No_Flags)
355   is
356      --  We can't allocate space until we know how big the compiled form
357      --  will be, but we can't compile it (and thus know how big it is)
358      --  until we've got a place to put the code. So we cheat: we compile
359      --  it twice, once with code generation turned off and size counting
360      --  turned on, and once "for real".
361
362      --  This also means that we don't allocate space until we are sure
363      --  that the thing really will compile successfully, and we never
364      --  have to move the code and thus invalidate pointers into it.
365
366      --  Beware that the optimization-preparation code in here knows
367      --  about some of the structure of the compiled regexp.
368
369      PM        : Pattern_Matcher renames Matcher;
370      Program   : Program_Data renames PM.Program;
371
372      Emit_Code : constant Boolean := PM.Size > 0;
373      Emit_Ptr  : Pointer := Program_First;
374
375      Parse_Pos : Natural := Expression'First; -- Input-scan pointer
376      Parse_End : constant Natural := Expression'Last;
377
378      ----------------------------
379      -- Subprograms for Create --
380      ----------------------------
381
382      procedure Emit (B : Character);
383      --  Output the Character B to the Program. If code-generation is
384      --  disabled, simply increments the program counter.
385
386      function  Emit_Node (Op : Opcode) return Pointer;
387      --  If code-generation is enabled, Emit_Node outputs the
388      --  opcode Op and reserves space for a pointer to the next node.
389      --  Return value is the location of new opcode, ie old Emit_Ptr.
390
391      procedure Emit_Natural (IP : Pointer; N : Natural);
392      --  Split N on two characters at position IP.
393
394      procedure Emit_Class (Bitmap : Character_Class);
395      --  Emits a character class.
396
397      procedure Case_Emit (C : Character);
398      --  Emit C, after converting is to lower-case if the regular
399      --  expression is case insensitive.
400
401      procedure Parse
402        (Parenthesized : Boolean;
403         Flags         : out Expression_Flags;
404         IP            : out Pointer);
405      --  Parse regular expression, i.e. main body or parenthesized thing
406      --  Caller must absorb opening parenthesis.
407
408      procedure Parse_Branch
409        (Flags         : out Expression_Flags;
410         First         : Boolean;
411         IP            : out Pointer);
412      --  Implements the concatenation operator and handles '|'
413      --  First should be true if this is the first item of the alternative.
414
415      procedure Parse_Piece
416        (Expr_Flags : out Expression_Flags;
417         IP         : out Pointer);
418      --  Parse something followed by possible [*+?]
419
420      procedure Parse_Atom
421        (Expr_Flags : out Expression_Flags;
422         IP         : out Pointer);
423      --  Parse_Atom is the lowest level parse procedure.
424      --  Optimization:  gobbles an entire sequence of ordinary characters
425      --  so that it can turn them into a single node, which is smaller to
426      --  store and faster to run. Backslashed characters are exceptions,
427      --  each becoming a separate node; the code is simpler that way and
428      --  it's not worth fixing.
429
430      procedure Insert_Operator
431        (Op       : Opcode;
432         Operand  : Pointer;
433         Greedy   : Boolean := True);
434      --  Insert_Operator inserts an operator in front of an
435      --  already-emitted operand and relocates the operand.
436      --  This applies to PLUS and STAR.
437      --  If Minmod is True, then the operator is non-greedy.
438
439      procedure Insert_Curly_Operator
440        (Op      : Opcode;
441         Min     : Natural;
442         Max     : Natural;
443         Operand : Pointer;
444         Greedy  : Boolean := True);
445      --  Insert an operator for CURLY ({Min}, {Min,} or {Min,Max}).
446      --  If Minmod is True, then the operator is non-greedy.
447
448      procedure Link_Tail (P, Val : Pointer);
449      --  Link_Tail sets the next-pointer at the end of a node chain
450
451      procedure Link_Operand_Tail (P, Val : Pointer);
452      --  Link_Tail on operand of first argument; nop if operandless
453
454      function  Next_Instruction (P : Pointer) return Pointer;
455      --  Dig the "next" pointer out of a node
456
457      procedure Fail (M : in String);
458      pragma No_Return (Fail);
459      --  Fail with a diagnostic message, if possible
460
461      function Is_Curly_Operator (IP : Natural) return Boolean;
462      --  Return True if IP is looking at a '{' that is the beginning
463      --  of a curly operator, ie it matches {\d+,?\d*}
464
465      function Is_Mult (IP : Natural) return Boolean;
466      --  Return True if C is a regexp multiplier: '+', '*' or '?'
467
468      procedure Get_Curly_Arguments
469        (IP     : Natural;
470         Min    : out Natural;
471         Max    : out Natural;
472         Greedy : out Boolean);
473      --  Parse the argument list for a curly operator.
474      --  It is assumed that IP is indeed pointing at a valid operator.
475      --  So what is IP and how come IP is not referenced in the body ???
476
477      procedure Parse_Character_Class (IP : out Pointer);
478      --  Parse a character class.
479      --  The calling subprogram should consume the opening '[' before.
480
481      procedure Parse_Literal
482        (Expr_Flags : out Expression_Flags;
483         IP         : out Pointer);
484      --  Parse_Literal encodes a string of characters to be matched exactly
485
486      function Parse_Posix_Character_Class return Std_Class;
487      --  Parse a posic character class, like [:alpha:] or [:^alpha:].
488      --  The called is suppoed to absorbe the opening [.
489
490      pragma Inline (Is_Mult);
491      pragma Inline (Emit_Natural);
492      pragma Inline (Parse_Character_Class); --  since used only once
493
494      ---------------
495      -- Case_Emit --
496      ---------------
497
498      procedure Case_Emit (C : Character) is
499      begin
500         if (Flags and Case_Insensitive) /= 0 then
501            Emit (To_Lower (C));
502
503         else
504            --  Dump current character
505
506            Emit (C);
507         end if;
508      end Case_Emit;
509
510      ----------
511      -- Emit --
512      ----------
513
514      procedure Emit (B : Character) is
515      begin
516         if Emit_Code then
517            Program (Emit_Ptr) := B;
518         end if;
519
520         Emit_Ptr := Emit_Ptr + 1;
521      end Emit;
522
523      ----------------
524      -- Emit_Class --
525      ----------------
526
527      procedure Emit_Class (Bitmap : Character_Class) is
528         subtype Program31 is Program_Data (0 .. 31);
529
530         function Convert is new Unchecked_Conversion
531           (Character_Class, Program31);
532
533      begin
534         if Emit_Code then
535            Program (Emit_Ptr .. Emit_Ptr + 31) := Convert (Bitmap);
536         end if;
537
538         Emit_Ptr := Emit_Ptr + 32;
539      end Emit_Class;
540
541      ------------------
542      -- Emit_Natural --
543      ------------------
544
545      procedure Emit_Natural (IP : Pointer; N : Natural) is
546      begin
547         if Emit_Code then
548            Program (IP + 1) := Character'Val (N / 256);
549            Program (IP) := Character'Val (N mod 256);
550         end if;
551      end Emit_Natural;
552
553      ---------------
554      -- Emit_Node --
555      ---------------
556
557      function Emit_Node (Op : Opcode) return Pointer is
558         Result : constant Pointer := Emit_Ptr;
559
560      begin
561         if Emit_Code then
562            Program (Emit_Ptr) := Character'Val (Opcode'Pos (Op));
563            Program (Emit_Ptr + 1) := ASCII.NUL;
564            Program (Emit_Ptr + 2) := ASCII.NUL;
565         end if;
566
567         Emit_Ptr := Emit_Ptr + 3;
568         return Result;
569      end Emit_Node;
570
571      ----------
572      -- Fail --
573      ----------
574
575      procedure Fail (M : in String) is
576      begin
577         raise Expression_Error;
578      end Fail;
579
580      -------------------------
581      -- Get_Curly_Arguments --
582      -------------------------
583
584      procedure Get_Curly_Arguments
585        (IP     : Natural;
586         Min    : out Natural;
587         Max    : out Natural;
588         Greedy : out Boolean)
589      is
590         pragma Unreferenced (IP);
591
592         Save_Pos : Natural := Parse_Pos + 1;
593
594      begin
595         Min := 0;
596         Max := Max_Curly_Repeat;
597
598         while Expression (Parse_Pos) /= '}'
599           and then Expression (Parse_Pos) /= ','
600         loop
601            Parse_Pos := Parse_Pos + 1;
602         end loop;
603
604         Min := Natural'Value (Expression (Save_Pos .. Parse_Pos - 1));
605
606         if Expression (Parse_Pos) = ',' then
607            Save_Pos := Parse_Pos + 1;
608            while Expression (Parse_Pos) /= '}' loop
609               Parse_Pos := Parse_Pos + 1;
610            end loop;
611
612            if Save_Pos /= Parse_Pos then
613               Max := Natural'Value (Expression (Save_Pos .. Parse_Pos - 1));
614            end if;
615
616         else
617            Max := Min;
618         end if;
619
620         if Parse_Pos < Expression'Last
621           and then Expression (Parse_Pos + 1) = '?'
622         then
623            Greedy := False;
624            Parse_Pos := Parse_Pos + 1;
625
626         else
627            Greedy := True;
628         end if;
629      end Get_Curly_Arguments;
630
631      ---------------------------
632      -- Insert_Curly_Operator --
633      ---------------------------
634
635      procedure Insert_Curly_Operator
636        (Op      : Opcode;
637         Min     : Natural;
638         Max     : Natural;
639         Operand : Pointer;
640         Greedy  : Boolean := True)
641      is
642         Dest   : constant Pointer := Emit_Ptr;
643         Old    : Pointer;
644         Size   : Pointer := 7;
645
646      begin
647         --  If the operand is not greedy, insert an extra operand before it
648
649         if not Greedy then
650            Size := Size + 3;
651         end if;
652
653         --  Move the operand in the byte-compilation, so that we can insert
654         --  the operator before it.
655
656         if Emit_Code then
657            Program (Operand + Size .. Emit_Ptr + Size) :=
658              Program (Operand .. Emit_Ptr);
659         end if;
660
661         --  Insert the operator at the position previously occupied by the
662         --  operand.
663
664         Emit_Ptr := Operand;
665
666         if not Greedy then
667            Old := Emit_Node (MINMOD);
668            Link_Tail (Old, Old + 3);
669         end if;
670
671         Old := Emit_Node (Op);
672         Emit_Natural (Old + 3, Min);
673         Emit_Natural (Old + 5, Max);
674
675         Emit_Ptr := Dest + Size;
676      end Insert_Curly_Operator;
677
678      ---------------------
679      -- Insert_Operator --
680      ---------------------
681
682      procedure Insert_Operator
683        (Op      : Opcode;
684         Operand : Pointer;
685         Greedy  : Boolean := True)
686      is
687         Dest   : constant Pointer := Emit_Ptr;
688         Old    : Pointer;
689         Size   : Pointer := 3;
690
691      begin
692         --  If not greedy, we have to emit another opcode first
693
694         if not Greedy then
695            Size := Size + 3;
696         end if;
697
698         --  Move the operand in the byte-compilation, so that we can insert
699         --  the operator before it.
700
701         if Emit_Code then
702            Program (Operand + Size .. Emit_Ptr + Size) :=
703              Program (Operand .. Emit_Ptr);
704         end if;
705
706         --  Insert the operator at the position previously occupied by the
707         --  operand.
708
709         Emit_Ptr := Operand;
710
711         if not Greedy then
712            Old := Emit_Node (MINMOD);
713            Link_Tail (Old, Old + 3);
714         end if;
715
716         Old := Emit_Node (Op);
717         Emit_Ptr := Dest + Size;
718      end Insert_Operator;
719
720      -----------------------
721      -- Is_Curly_Operator --
722      -----------------------
723
724      function Is_Curly_Operator (IP : Natural) return Boolean is
725         Scan : Natural := IP;
726
727      begin
728         if Expression (Scan) /= '{'
729           or else Scan + 2 > Expression'Last
730           or else not Is_Digit (Expression (Scan + 1))
731         then
732            return False;
733         end if;
734
735         Scan := Scan + 1;
736
737         --  The first digit
738
739         loop
740            Scan := Scan + 1;
741
742            if Scan > Expression'Last then
743               return False;
744            end if;
745
746            exit when not Is_Digit (Expression (Scan));
747         end loop;
748
749         if Expression (Scan) = ',' then
750            loop
751               Scan := Scan + 1;
752
753               if Scan > Expression'Last then
754                  return False;
755               end if;
756
757               exit when not Is_Digit (Expression (Scan));
758            end loop;
759         end if;
760
761         return Expression (Scan) = '}';
762      end Is_Curly_Operator;
763
764      -------------
765      -- Is_Mult --
766      -------------
767
768      function Is_Mult (IP : Natural) return Boolean is
769         C : constant Character := Expression (IP);
770
771      begin
772         return     C = '*'
773           or else  C = '+'
774           or else  C = '?'
775           or else (C = '{' and then Is_Curly_Operator (IP));
776      end Is_Mult;
777
778      -----------------------
779      -- Link_Operand_Tail --
780      -----------------------
781
782      procedure Link_Operand_Tail (P, Val : Pointer) is
783      begin
784         if Emit_Code and then Program (P) = BRANCH then
785            Link_Tail (Operand (P), Val);
786         end if;
787      end Link_Operand_Tail;
788
789      ---------------
790      -- Link_Tail --
791      ---------------
792
793      procedure Link_Tail (P, Val : Pointer) is
794         Scan   : Pointer;
795         Temp   : Pointer;
796         Offset : Pointer;
797
798      begin
799         if not Emit_Code then
800            return;
801         end if;
802
803         --  Find last node
804
805         Scan := P;
806         loop
807            Temp := Next_Instruction (Scan);
808            exit when Temp = 0;
809            Scan := Temp;
810         end loop;
811
812         Offset := Val - Scan;
813
814         Emit_Natural (Scan + 1, Natural (Offset));
815      end Link_Tail;
816
817      ----------------------
818      -- Next_Instruction --
819      ----------------------
820
821      function Next_Instruction (P : Pointer) return Pointer is
822         Offset : Pointer;
823
824      begin
825         if not Emit_Code then
826            return 0;
827         end if;
828
829         Offset := Get_Next_Offset (Program, P);
830
831         if Offset = 0 then
832            return 0;
833         end if;
834
835         return P + Offset;
836      end Next_Instruction;
837
838      -----------
839      -- Parse --
840      -----------
841
842      --  Combining parenthesis handling with the base level
843      --  of regular expression is a trifle forced, but the
844      --  need to tie the tails of the branches to what follows
845      --  makes it hard to avoid.
846
847      procedure Parse
848        (Parenthesized  : in Boolean;
849         Flags          : out Expression_Flags;
850         IP             : out Pointer)
851      is
852         E              : String renames Expression;
853         Br             : Pointer;
854         Ender          : Pointer;
855         Par_No         : Natural;
856         New_Flags      : Expression_Flags;
857         Have_Branch    : Boolean := False;
858
859      begin
860         Flags := (Has_Width => True, others => False);  -- Tentatively
861
862         --  Make an OPEN node, if parenthesized
863
864         if Parenthesized then
865            if Matcher.Paren_Count > Max_Paren_Count then
866               Fail ("too many ()");
867            end if;
868
869            Par_No := Matcher.Paren_Count + 1;
870            Matcher.Paren_Count := Matcher.Paren_Count + 1;
871            IP := Emit_Node (OPEN);
872            Emit (Character'Val (Par_No));
873
874         else
875            IP := 0;
876            Par_No := 0;
877         end if;
878
879         --  Pick up the branches, linking them together
880
881         Parse_Branch (New_Flags, True, Br);
882
883         if Br = 0 then
884            IP := 0;
885            return;
886         end if;
887
888         if Parse_Pos <= Parse_End
889           and then E (Parse_Pos) = '|'
890         then
891            Insert_Operator (BRANCH, Br);
892            Have_Branch := True;
893         end if;
894
895         if IP /= 0 then
896            Link_Tail (IP, Br);   -- OPEN -> first
897         else
898            IP := Br;
899         end if;
900
901         if not New_Flags.Has_Width then
902            Flags.Has_Width := False;
903         end if;
904
905         Flags.SP_Start := Flags.SP_Start or New_Flags.SP_Start;
906
907         while Parse_Pos <= Parse_End
908           and then (E (Parse_Pos) = '|')
909         loop
910            Parse_Pos := Parse_Pos + 1;
911            Parse_Branch (New_Flags, False, Br);
912
913            if Br = 0 then
914               IP := 0;
915               return;
916            end if;
917
918            Link_Tail (IP, Br);   -- BRANCH -> BRANCH
919
920            if not New_Flags.Has_Width then
921               Flags.Has_Width := False;
922            end if;
923
924            Flags.SP_Start := Flags.SP_Start or New_Flags.SP_Start;
925         end loop;
926
927         --  Make a closing node, and hook it on the end
928
929         if Parenthesized then
930            Ender := Emit_Node (CLOSE);
931            Emit (Character'Val (Par_No));
932         else
933            Ender := Emit_Node (EOP);
934         end if;
935
936         Link_Tail (IP, Ender);
937
938         if Have_Branch then
939
940            --  Hook the tails of the branches to the closing node
941
942            Br := IP;
943            loop
944               exit when Br = 0;
945               Link_Operand_Tail (Br, Ender);
946               Br := Next_Instruction (Br);
947            end loop;
948         end if;
949
950         --  Check for proper termination
951
952         if Parenthesized then
953            if Parse_Pos > Parse_End or else E (Parse_Pos) /= ')' then
954               Fail ("unmatched ()");
955            end if;
956
957            Parse_Pos := Parse_Pos + 1;
958
959         elsif Parse_Pos <= Parse_End then
960            if E (Parse_Pos) = ')'  then
961               Fail ("unmatched ()");
962            else
963               Fail ("junk on end");         -- "Can't happen"
964            end if;
965         end if;
966      end Parse;
967
968      ----------------
969      -- Parse_Atom --
970      ----------------
971
972      procedure Parse_Atom
973        (Expr_Flags : out Expression_Flags;
974         IP         : out Pointer)
975      is
976         C : Character;
977
978      begin
979         --  Tentatively set worst expression case
980
981         Expr_Flags := Worst_Expression;
982
983         C := Expression (Parse_Pos);
984         Parse_Pos := Parse_Pos + 1;
985
986         case (C) is
987            when '^' =>
988               if (Flags and Multiple_Lines) /= 0  then
989                  IP := Emit_Node (MBOL);
990               elsif (Flags and Single_Line) /= 0 then
991                  IP := Emit_Node (SBOL);
992               else
993                  IP := Emit_Node (BOL);
994               end if;
995
996            when '$' =>
997               if (Flags and Multiple_Lines) /= 0  then
998                  IP := Emit_Node (MEOL);
999               elsif (Flags and Single_Line) /= 0 then
1000                  IP := Emit_Node (SEOL);
1001               else
1002                  IP := Emit_Node (EOL);
1003               end if;
1004
1005            when '.' =>
1006               if (Flags and Single_Line) /= 0 then
1007                  IP := Emit_Node (SANY);
1008               else
1009                  IP := Emit_Node (ANY);
1010               end if;
1011
1012               Expr_Flags.Has_Width := True;
1013               Expr_Flags.Simple := True;
1014
1015            when '[' =>
1016               Parse_Character_Class (IP);
1017               Expr_Flags.Has_Width := True;
1018               Expr_Flags.Simple := True;
1019
1020            when '(' =>
1021               declare
1022                  New_Flags : Expression_Flags;
1023
1024               begin
1025                  Parse (True, New_Flags, IP);
1026
1027                  if IP = 0 then
1028                     return;
1029                  end if;
1030
1031                  Expr_Flags.Has_Width :=
1032                    Expr_Flags.Has_Width or New_Flags.Has_Width;
1033                  Expr_Flags.SP_Start :=
1034                    Expr_Flags.SP_Start or New_Flags.SP_Start;
1035               end;
1036
1037            when '|' | ASCII.LF | ')' =>
1038               Fail ("internal urp");  --  Supposed to be caught earlier
1039
1040            when '?' | '+' | '*' =>
1041               Fail (C & " follows nothing");
1042
1043            when '{' =>
1044               if Is_Curly_Operator (Parse_Pos - 1) then
1045                  Fail (C & " follows nothing");
1046               else
1047                  Parse_Literal (Expr_Flags, IP);
1048               end if;
1049
1050            when '\' =>
1051               if Parse_Pos > Parse_End then
1052                  Fail ("trailing \");
1053               end if;
1054
1055               Parse_Pos := Parse_Pos + 1;
1056
1057               case Expression (Parse_Pos - 1) is
1058                  when 'b'        =>
1059                     IP := Emit_Node (BOUND);
1060
1061                  when 'B'        =>
1062                     IP := Emit_Node (NBOUND);
1063
1064                  when 's'        =>
1065                     IP := Emit_Node (SPACE);
1066                     Expr_Flags.Simple := True;
1067                     Expr_Flags.Has_Width := True;
1068
1069                  when 'S'        =>
1070                     IP := Emit_Node (NSPACE);
1071                     Expr_Flags.Simple := True;
1072                     Expr_Flags.Has_Width := True;
1073
1074                  when 'd'        =>
1075                     IP := Emit_Node (DIGIT);
1076                     Expr_Flags.Simple := True;
1077                     Expr_Flags.Has_Width := True;
1078
1079                  when 'D'        =>
1080                     IP := Emit_Node (NDIGIT);
1081                     Expr_Flags.Simple := True;
1082                     Expr_Flags.Has_Width := True;
1083
1084                  when 'w'        =>
1085                     IP := Emit_Node (ALNUM);
1086                     Expr_Flags.Simple := True;
1087                     Expr_Flags.Has_Width := True;
1088
1089                  when 'W'        =>
1090                     IP := Emit_Node (NALNUM);
1091                     Expr_Flags.Simple := True;
1092                     Expr_Flags.Has_Width := True;
1093
1094                  when 'A'        =>
1095                     IP := Emit_Node (SBOL);
1096
1097                  when 'G'        =>
1098                     IP := Emit_Node (SEOL);
1099
1100                  when '0' .. '9' =>
1101                     IP := Emit_Node (REFF);
1102
1103                     declare
1104                        Save : constant Natural := Parse_Pos - 1;
1105
1106                     begin
1107                        while Parse_Pos <= Expression'Last
1108                          and then Is_Digit (Expression (Parse_Pos))
1109                        loop
1110                           Parse_Pos := Parse_Pos + 1;
1111                        end loop;
1112
1113                        Emit (Character'Val (Natural'Value
1114                               (Expression (Save .. Parse_Pos - 1))));
1115                     end;
1116
1117                  when others =>
1118                     Parse_Pos := Parse_Pos - 1;
1119                     Parse_Literal (Expr_Flags, IP);
1120               end case;
1121
1122            when others =>
1123               Parse_Literal (Expr_Flags, IP);
1124         end case;
1125      end Parse_Atom;
1126
1127      ------------------
1128      -- Parse_Branch --
1129      ------------------
1130
1131      procedure Parse_Branch
1132        (Flags : out Expression_Flags;
1133         First : Boolean;
1134         IP    : out Pointer)
1135      is
1136         E         : String renames Expression;
1137         Chain     : Pointer;
1138         Last      : Pointer;
1139         New_Flags : Expression_Flags;
1140
1141         Discard : Pointer;
1142         pragma Warnings (Off, Discard);
1143
1144      begin
1145         Flags := Worst_Expression;    -- Tentatively
1146
1147         if First then
1148            IP := Emit_Ptr;
1149         else
1150            IP := Emit_Node (BRANCH);
1151         end if;
1152
1153         Chain := 0;
1154
1155         while Parse_Pos <= Parse_End
1156           and then E (Parse_Pos) /= ')'
1157           and then E (Parse_Pos) /= ASCII.LF
1158           and then E (Parse_Pos) /= '|'
1159         loop
1160            Parse_Piece (New_Flags, Last);
1161
1162            if Last = 0 then
1163               IP := 0;
1164               return;
1165            end if;
1166
1167            Flags.Has_Width := Flags.Has_Width or New_Flags.Has_Width;
1168
1169            if Chain = 0 then            -- First piece
1170               Flags.SP_Start := Flags.SP_Start or New_Flags.SP_Start;
1171            else
1172               Link_Tail (Chain, Last);
1173            end if;
1174
1175            Chain := Last;
1176         end loop;
1177
1178         --  Case where loop ran zero CURLY
1179
1180         if Chain = 0 then
1181            Discard := Emit_Node (NOTHING);
1182         end if;
1183      end Parse_Branch;
1184
1185      ---------------------------
1186      -- Parse_Character_Class --
1187      ---------------------------
1188
1189      procedure Parse_Character_Class (IP : out Pointer) is
1190         Bitmap      : Character_Class;
1191         Invert      : Boolean := False;
1192         In_Range    : Boolean := False;
1193         Named_Class : Std_Class := ANYOF_NONE;
1194         Value       : Character;
1195         Last_Value  : Character := ASCII.Nul;
1196
1197      begin
1198         Reset_Class (Bitmap);
1199
1200         --  Do we have an invert character class ?
1201
1202         if Parse_Pos <= Parse_End
1203           and then Expression (Parse_Pos) = '^'
1204         then
1205            Invert := True;
1206            Parse_Pos := Parse_Pos + 1;
1207         end if;
1208
1209         --  First character can be ] or -, without closing the class.
1210
1211         if Parse_Pos <= Parse_End
1212           and then (Expression (Parse_Pos) = ']'
1213                      or else Expression (Parse_Pos) = '-')
1214         then
1215            Set_In_Class (Bitmap, Expression (Parse_Pos));
1216            Parse_Pos := Parse_Pos + 1;
1217         end if;
1218
1219         --  While we don't have the end of the class
1220
1221         while Parse_Pos <= Parse_End
1222           and then Expression (Parse_Pos) /= ']'
1223         loop
1224            Named_Class := ANYOF_NONE;
1225            Value := Expression (Parse_Pos);
1226            Parse_Pos := Parse_Pos + 1;
1227
1228            --  Do we have a Posix character class
1229            if Value = '[' then
1230               Named_Class := Parse_Posix_Character_Class;
1231
1232            elsif Value = '\' then
1233               if Parse_Pos = Parse_End then
1234                  Fail ("Trailing \");
1235               end if;
1236               Value := Expression (Parse_Pos);
1237               Parse_Pos := Parse_Pos + 1;
1238
1239               case Value is
1240                  when 'w' => Named_Class := ANYOF_ALNUM;
1241                  when 'W' => Named_Class := ANYOF_NALNUM;
1242                  when 's' => Named_Class := ANYOF_SPACE;
1243                  when 'S' => Named_Class := ANYOF_NSPACE;
1244                  when 'd' => Named_Class := ANYOF_DIGIT;
1245                  when 'D' => Named_Class := ANYOF_NDIGIT;
1246                  when 'n' => Value := ASCII.LF;
1247                  when 'r' => Value := ASCII.CR;
1248                  when 't' => Value := ASCII.HT;
1249                  when 'f' => Value := ASCII.FF;
1250                  when 'e' => Value := ASCII.ESC;
1251                  when 'a' => Value := ASCII.BEL;
1252
1253                  --  when 'x'  => ??? hexadecimal value
1254                  --  when 'c'  => ??? control character
1255                  --  when '0'..'9' => ??? octal character
1256
1257                  when others => null;
1258               end case;
1259            end if;
1260
1261            --  Do we have a character class?
1262
1263            if Named_Class /= ANYOF_NONE then
1264
1265               --  A range like 'a-\d' or 'a-[:digit:] is not a range
1266
1267               if In_Range then
1268                  Set_In_Class (Bitmap, Last_Value);
1269                  Set_In_Class (Bitmap, '-');
1270                  In_Range := False;
1271               end if;
1272
1273               --  Expand the range
1274
1275               case Named_Class is
1276                  when ANYOF_NONE => null;
1277
1278                  when ANYOF_ALNUM | ANYOF_ALNUMC =>
1279                     for Value in Class_Byte'Range loop
1280                        if Is_Alnum (Character'Val (Value)) then
1281                           Set_In_Class (Bitmap, Character'Val (Value));
1282                        end if;
1283                     end loop;
1284
1285                  when ANYOF_NALNUM | ANYOF_NALNUMC =>
1286                     for Value in Class_Byte'Range loop
1287                        if not Is_Alnum (Character'Val (Value)) then
1288                           Set_In_Class (Bitmap, Character'Val (Value));
1289                        end if;
1290                     end loop;
1291
1292                  when ANYOF_SPACE =>
1293                     for Value in Class_Byte'Range loop
1294                        if Is_White_Space (Character'Val (Value)) then
1295                           Set_In_Class (Bitmap, Character'Val (Value));
1296                        end if;
1297                     end loop;
1298
1299                  when ANYOF_NSPACE =>
1300                     for Value in Class_Byte'Range loop
1301                        if not Is_White_Space (Character'Val (Value)) then
1302                           Set_In_Class (Bitmap, Character'Val (Value));
1303                        end if;
1304                     end loop;
1305
1306                  when ANYOF_DIGIT =>
1307                     for Value in Class_Byte'Range loop
1308                        if Is_Digit (Character'Val (Value)) then
1309                           Set_In_Class (Bitmap, Character'Val (Value));
1310                        end if;
1311                     end loop;
1312
1313                  when ANYOF_NDIGIT =>
1314                     for Value in Class_Byte'Range loop
1315                        if not Is_Digit (Character'Val (Value)) then
1316                           Set_In_Class (Bitmap, Character'Val (Value));
1317                        end if;
1318                     end loop;
1319
1320                  when ANYOF_ALPHA =>
1321                     for Value in Class_Byte'Range loop
1322                        if Is_Letter (Character'Val (Value)) then
1323                           Set_In_Class (Bitmap, Character'Val (Value));
1324                        end if;
1325                     end loop;
1326
1327                  when ANYOF_NALPHA =>
1328                     for Value in Class_Byte'Range loop
1329                        if not Is_Letter (Character'Val (Value)) then
1330                           Set_In_Class (Bitmap, Character'Val (Value));
1331                        end if;
1332                     end loop;
1333
1334                  when ANYOF_ASCII =>
1335                     for Value in 0 .. 127 loop
1336                        Set_In_Class (Bitmap, Character'Val (Value));
1337                     end loop;
1338
1339                  when ANYOF_NASCII =>
1340                     for Value in 128 .. 255 loop
1341                        Set_In_Class (Bitmap, Character'Val (Value));
1342                     end loop;
1343
1344                  when ANYOF_CNTRL =>
1345                     for Value in Class_Byte'Range loop
1346                        if Is_Control (Character'Val (Value)) then
1347                           Set_In_Class (Bitmap, Character'Val (Value));
1348                        end if;
1349                     end loop;
1350
1351                  when ANYOF_NCNTRL =>
1352                     for Value in Class_Byte'Range loop
1353                        if not Is_Control (Character'Val (Value)) then
1354                           Set_In_Class (Bitmap, Character'Val (Value));
1355                        end if;
1356                     end loop;
1357
1358                  when ANYOF_GRAPH =>
1359                     for Value in Class_Byte'Range loop
1360                        if Is_Graphic (Character'Val (Value)) then
1361                           Set_In_Class (Bitmap, Character'Val (Value));
1362                        end if;
1363                     end loop;
1364
1365                  when ANYOF_NGRAPH =>
1366                     for Value in Class_Byte'Range loop
1367                        if not Is_Graphic (Character'Val (Value)) then
1368                           Set_In_Class (Bitmap, Character'Val (Value));
1369                        end if;
1370                     end loop;
1371
1372                  when ANYOF_LOWER =>
1373                     for Value in Class_Byte'Range loop
1374                        if Is_Lower (Character'Val (Value)) then
1375                           Set_In_Class (Bitmap, Character'Val (Value));
1376                        end if;
1377                     end loop;
1378
1379                  when ANYOF_NLOWER =>
1380                     for Value in Class_Byte'Range loop
1381                        if not Is_Lower (Character'Val (Value)) then
1382                           Set_In_Class (Bitmap, Character'Val (Value));
1383                        end if;
1384                     end loop;
1385
1386                  when ANYOF_PRINT =>
1387                     for Value in Class_Byte'Range loop
1388                        if Is_Printable (Character'Val (Value)) then
1389                           Set_In_Class (Bitmap, Character'Val (Value));
1390                        end if;
1391                     end loop;
1392
1393                  when ANYOF_NPRINT =>
1394                     for Value in Class_Byte'Range loop
1395                        if not Is_Printable (Character'Val (Value)) then
1396                           Set_In_Class (Bitmap, Character'Val (Value));
1397                        end if;
1398                     end loop;
1399
1400                  when ANYOF_PUNCT =>
1401                     for Value in Class_Byte'Range loop
1402                        if Is_Printable (Character'Val (Value))
1403                          and then not Is_White_Space (Character'Val (Value))
1404                          and then not Is_Alnum (Character'Val (Value))
1405                        then
1406                           Set_In_Class (Bitmap, Character'Val (Value));
1407                        end if;
1408                     end loop;
1409
1410                  when ANYOF_NPUNCT =>
1411                     for Value in Class_Byte'Range loop
1412                        if not Is_Printable (Character'Val (Value))
1413                          or else Is_White_Space (Character'Val (Value))
1414                          or else Is_Alnum (Character'Val (Value))
1415                        then
1416                           Set_In_Class (Bitmap, Character'Val (Value));
1417                        end if;
1418                     end loop;
1419
1420                  when ANYOF_UPPER =>
1421                     for Value in Class_Byte'Range loop
1422                        if Is_Upper (Character'Val (Value)) then
1423                           Set_In_Class (Bitmap, Character'Val (Value));
1424                        end if;
1425                     end loop;
1426
1427                  when ANYOF_NUPPER =>
1428                     for Value in Class_Byte'Range loop
1429                        if not Is_Upper (Character'Val (Value)) then
1430                           Set_In_Class (Bitmap, Character'Val (Value));
1431                        end if;
1432                     end loop;
1433
1434                  when ANYOF_XDIGIT =>
1435                     for Value in Class_Byte'Range loop
1436                        if Is_Hexadecimal_Digit (Character'Val (Value)) then
1437                           Set_In_Class (Bitmap, Character'Val (Value));
1438                        end if;
1439                     end loop;
1440
1441                  when ANYOF_NXDIGIT =>
1442                     for Value in Class_Byte'Range loop
1443                        if not Is_Hexadecimal_Digit
1444                          (Character'Val (Value))
1445                        then
1446                           Set_In_Class (Bitmap, Character'Val (Value));
1447                        end if;
1448                     end loop;
1449
1450               end case;
1451
1452            --  Not a character range
1453
1454            elsif not In_Range then
1455               Last_Value := Value;
1456
1457               if Expression (Parse_Pos) = '-'
1458                 and then Parse_Pos < Parse_End
1459                 and then Expression (Parse_Pos + 1) /= ']'
1460               then
1461                  Parse_Pos := Parse_Pos + 1;
1462
1463                  --  Do we have a range like '\d-a' and '[:space:]-a'
1464                  --  which is not a real range
1465
1466                  if Named_Class /= ANYOF_NONE then
1467                     Set_In_Class (Bitmap, '-');
1468                  else
1469                     In_Range := True;
1470                  end if;
1471
1472               else
1473                  Set_In_Class (Bitmap, Value);
1474
1475               end if;
1476
1477            --  Else in a character range
1478
1479            else
1480               if Last_Value > Value then
1481                  Fail ("Invalid Range [" & Last_Value'Img
1482                        & "-" & Value'Img & "]");
1483               end if;
1484
1485               while Last_Value <= Value loop
1486                  Set_In_Class (Bitmap, Last_Value);
1487                  Last_Value := Character'Succ (Last_Value);
1488               end loop;
1489
1490               In_Range := False;
1491
1492            end if;
1493
1494         end loop;
1495
1496         --  Optimize case-insensitive ranges (put the upper case or lower
1497         --  case character into the bitmap)
1498
1499         if (Flags and Case_Insensitive) /= 0 then
1500            for C in Character'Range loop
1501               if Get_From_Class (Bitmap, C) then
1502                  Set_In_Class (Bitmap, To_Lower (C));
1503                  Set_In_Class (Bitmap, To_Upper (C));
1504               end if;
1505            end loop;
1506         end if;
1507
1508         --  Optimize inverted classes
1509
1510         if Invert then
1511            for J in Bitmap'Range loop
1512               Bitmap (J) := not Bitmap (J);
1513            end loop;
1514         end if;
1515
1516         Parse_Pos := Parse_Pos + 1;
1517
1518         --  Emit the class
1519
1520         IP := Emit_Node (ANYOF);
1521         Emit_Class (Bitmap);
1522      end Parse_Character_Class;
1523
1524      -------------------
1525      -- Parse_Literal --
1526      -------------------
1527
1528      --  This is a bit tricky due to quoted chars and due to
1529      --  the multiplier characters '*', '+', and '?' that
1530      --  take the SINGLE char previous as their operand.
1531
1532      --  On entry, the character at Parse_Pos - 1 is going to go
1533      --  into the string, no matter what it is. It could be
1534      --  following a \ if Parse_Atom was entered from the '\' case.
1535
1536      --  Basic idea is to pick up a good char in C and examine
1537      --  the next char. If Is_Mult (C) then twiddle, if it's a \
1538      --  then frozzle and if it's another magic char then push C and
1539      --  terminate the string. If none of the above, push C on the
1540      --  string and go around again.
1541
1542      --  Start_Pos is used to remember where "the current character"
1543      --  starts in the string, if due to an Is_Mult we need to back
1544      --  up and put the current char in a separate 1-character string.
1545      --  When Start_Pos is 0, C is the only char in the string;
1546      --  this is used in Is_Mult handling, and in setting the SIMPLE
1547      --  flag at the end.
1548
1549      procedure Parse_Literal
1550        (Expr_Flags : out Expression_Flags;
1551         IP         : out Pointer)
1552      is
1553         Start_Pos  : Natural := 0;
1554         C          : Character;
1555         Length_Ptr : Pointer;
1556
1557         Has_Special_Operator : Boolean := False;
1558
1559      begin
1560         Parse_Pos := Parse_Pos - 1;      --  Look at current character
1561
1562         if (Flags and Case_Insensitive) /= 0 then
1563            IP := Emit_Node (EXACTF);
1564         else
1565            IP := Emit_Node (EXACT);
1566         end if;
1567
1568         Length_Ptr := Emit_Ptr;
1569         Emit_Ptr := String_Operand (IP);
1570
1571         Parse_Loop :
1572         loop
1573            C := Expression (Parse_Pos); --  Get current character
1574
1575            case C is
1576               when '.' | '[' | '(' | ')' | '|' | ASCII.LF | '$' | '^' =>
1577
1578                  if Start_Pos = 0 then
1579                     Start_Pos := Parse_Pos;
1580                     Emit (C);         --  First character is always emitted
1581                  else
1582                     exit Parse_Loop;  --  Else we are done
1583                  end if;
1584
1585               when '?' | '+' | '*' | '{' =>
1586
1587                  if Start_Pos = 0 then
1588                     Start_Pos := Parse_Pos;
1589                     Emit (C);         --  First character is always emitted
1590
1591                  --  Are we looking at an operator, or is this
1592                  --  simply a normal character ?
1593
1594                  elsif not Is_Mult (Parse_Pos) then
1595                     Start_Pos := Parse_Pos;
1596                     Case_Emit (C);
1597
1598                  else
1599                     --  We've got something like "abc?d".  Mark this as a
1600                     --  special case. What we want to emit is a first
1601                     --  constant string for "ab", then one for "c" that will
1602                     --  ultimately be transformed with a CURLY operator, A
1603                     --  special case has to be handled for "a?", since there
1604                     --  is no initial string to emit.
1605
1606                     Has_Special_Operator := True;
1607                     exit Parse_Loop;
1608                  end if;
1609
1610               when '\' =>
1611                  Start_Pos := Parse_Pos;
1612
1613                  if Parse_Pos = Parse_End then
1614                     Fail ("Trailing \");
1615
1616                  else
1617                     case Expression (Parse_Pos + 1) is
1618                        when 'b' | 'B' | 's' | 'S' | 'd' | 'D'
1619                          | 'w' | 'W' | '0' .. '9' | 'G' | 'A'
1620                          => exit Parse_Loop;
1621                        when 'n'         => Emit (ASCII.LF);
1622                        when 't'         => Emit (ASCII.HT);
1623                        when 'r'         => Emit (ASCII.CR);
1624                        when 'f'         => Emit (ASCII.FF);
1625                        when 'e'         => Emit (ASCII.ESC);
1626                        when 'a'         => Emit (ASCII.BEL);
1627                        when others      => Emit (Expression (Parse_Pos + 1));
1628                     end case;
1629
1630                     Parse_Pos := Parse_Pos + 1;
1631                  end if;
1632
1633               when others =>
1634                  Start_Pos := Parse_Pos;
1635                  Case_Emit (C);
1636            end case;
1637
1638            exit Parse_Loop when Emit_Ptr - Length_Ptr = 254;
1639
1640            Parse_Pos := Parse_Pos + 1;
1641
1642            exit Parse_Loop when Parse_Pos > Parse_End;
1643         end loop Parse_Loop;
1644
1645         --  Is the string followed by a '*+?{' operator ? If yes, and if there
1646         --  is an initial string to emit, do it now.
1647
1648         if Has_Special_Operator
1649           and then Emit_Ptr >= Length_Ptr + 3
1650         then
1651            Emit_Ptr := Emit_Ptr - 1;
1652            Parse_Pos := Start_Pos;
1653         end if;
1654
1655         if Emit_Code then
1656            Program (Length_Ptr) := Character'Val (Emit_Ptr - Length_Ptr - 2);
1657         end if;
1658
1659         Expr_Flags.Has_Width := True;
1660
1661         --  Slight optimization when there is a single character
1662
1663         if Emit_Ptr = Length_Ptr + 2 then
1664            Expr_Flags.Simple := True;
1665         end if;
1666      end Parse_Literal;
1667
1668      -----------------
1669      -- Parse_Piece --
1670      -----------------
1671
1672      --  Note that the branching code sequences used for '?' and the
1673      --  general cases of '*' and + are somewhat optimized: they use
1674      --  the same NOTHING node as both the endmarker for their branch
1675      --  list and the body of the last branch. It might seem that
1676      --  this node could be dispensed with entirely, but the endmarker
1677      --  role is not redundant.
1678
1679      procedure Parse_Piece
1680        (Expr_Flags : out Expression_Flags;
1681         IP         : out Pointer)
1682      is
1683         Op        : Character;
1684         New_Flags : Expression_Flags;
1685         Greedy    : Boolean := True;
1686
1687      begin
1688         Parse_Atom (New_Flags, IP);
1689
1690         if IP = 0 then
1691            return;
1692         end if;
1693
1694         if Parse_Pos > Parse_End
1695           or else not Is_Mult (Parse_Pos)
1696         then
1697            Expr_Flags := New_Flags;
1698            return;
1699         end if;
1700
1701         Op := Expression (Parse_Pos);
1702
1703         if Op /= '+' then
1704            Expr_Flags := (SP_Start => True, others => False);
1705         else
1706            Expr_Flags := (Has_Width => True, others => False);
1707         end if;
1708
1709         --  Detect non greedy operators in the easy cases
1710
1711         if Op /= '{'
1712           and then Parse_Pos + 1 <= Parse_End
1713           and then Expression (Parse_Pos + 1) = '?'
1714         then
1715            Greedy := False;
1716            Parse_Pos := Parse_Pos + 1;
1717         end if;
1718
1719         --  Generate the byte code
1720
1721         case Op is
1722            when '*' =>
1723
1724               if New_Flags.Simple then
1725                  Insert_Operator (STAR, IP, Greedy);
1726               else
1727                  Link_Tail (IP, Emit_Node (WHILEM));
1728                  Insert_Curly_Operator
1729                    (CURLYX, 0, Max_Curly_Repeat, IP, Greedy);
1730                  Link_Tail (IP, Emit_Node (NOTHING));
1731               end if;
1732
1733            when '+' =>
1734
1735               if New_Flags.Simple then
1736                  Insert_Operator (PLUS, IP, Greedy);
1737               else
1738                  Link_Tail (IP, Emit_Node (WHILEM));
1739                  Insert_Curly_Operator
1740                    (CURLYX, 1, Max_Curly_Repeat, IP, Greedy);
1741                  Link_Tail (IP, Emit_Node (NOTHING));
1742               end if;
1743
1744            when '?' =>
1745               if New_Flags.Simple then
1746                  Insert_Curly_Operator (CURLY, 0, 1, IP, Greedy);
1747               else
1748                  Link_Tail (IP, Emit_Node (WHILEM));
1749                  Insert_Curly_Operator (CURLYX, 0, 1, IP, Greedy);
1750                  Link_Tail (IP, Emit_Node (NOTHING));
1751               end if;
1752
1753            when '{' =>
1754               declare
1755                  Min, Max : Natural;
1756
1757               begin
1758                  Get_Curly_Arguments (Parse_Pos, Min, Max, Greedy);
1759
1760                  if New_Flags.Simple then
1761                     Insert_Curly_Operator (CURLY, Min, Max, IP, Greedy);
1762                  else
1763                     Link_Tail (IP, Emit_Node (WHILEM));
1764                     Insert_Curly_Operator (CURLYX, Min, Max, IP, Greedy);
1765                     Link_Tail (IP, Emit_Node (NOTHING));
1766                  end if;
1767               end;
1768
1769            when others =>
1770               null;
1771         end case;
1772
1773         Parse_Pos := Parse_Pos + 1;
1774
1775         if Parse_Pos <= Parse_End
1776           and then Is_Mult (Parse_Pos)
1777         then
1778            Fail ("nested *+{");
1779         end if;
1780      end Parse_Piece;
1781
1782      ---------------------------------
1783      -- Parse_Posix_Character_Class --
1784      ---------------------------------
1785
1786      function Parse_Posix_Character_Class return Std_Class is
1787         Invert : Boolean := False;
1788         Class  : Std_Class := ANYOF_NONE;
1789         E      : String renames Expression;
1790
1791         --  Class names. Note that code assumes that the length of all
1792         --  classes starting with the same letter have the same length.
1793
1794         Alnum   : constant String := "alnum:]";
1795         Alpha   : constant String := "alpha:]";
1796         Ascii_C : constant String := "ascii:]";
1797         Cntrl   : constant String := "cntrl:]";
1798         Digit   : constant String := "digit:]";
1799         Graph   : constant String := "graph:]";
1800         Lower   : constant String := "lower:]";
1801         Print   : constant String := "print:]";
1802         Punct   : constant String := "punct:]";
1803         Space   : constant String := "space:]";
1804         Upper   : constant String := "upper:]";
1805         Word    : constant String := "word:]";
1806         Xdigit  : constant String := "xdigit:]";
1807
1808      begin
1809         --  Case of character class specified
1810
1811         if Parse_Pos <= Parse_End
1812           and then Expression (Parse_Pos) = ':'
1813         then
1814            Parse_Pos := Parse_Pos + 1;
1815
1816            --  Do we have something like:  [[:^alpha:]]
1817
1818            if Parse_Pos <= Parse_End
1819              and then Expression (Parse_Pos) = '^'
1820            then
1821               Invert := True;
1822               Parse_Pos := Parse_Pos + 1;
1823            end if;
1824
1825            --  Check for class names based on first letter
1826
1827            case Expression (Parse_Pos) is
1828
1829               when 'a' =>
1830
1831                  --  All 'a' classes have the same length (Alnum'Length)
1832
1833                  if Parse_Pos + Alnum'Length - 1 <= Parse_End then
1834
1835                     if E (Parse_Pos .. Parse_Pos + Alnum'Length - 1) =
1836                                                                      Alnum
1837                     then
1838                        if Invert then
1839                           Class := ANYOF_NALNUMC;
1840                        else
1841                           Class := ANYOF_ALNUMC;
1842                        end if;
1843
1844                        Parse_Pos := Parse_Pos + Alnum'Length;
1845
1846                     elsif E (Parse_Pos .. Parse_Pos + Alpha'Length - 1) =
1847                                                                      Alpha
1848                     then
1849                        if Invert then
1850                           Class := ANYOF_NALPHA;
1851                        else
1852                           Class := ANYOF_ALPHA;
1853                        end if;
1854
1855                        Parse_Pos := Parse_Pos + Alpha'Length;
1856
1857                     elsif E (Parse_Pos .. Parse_Pos + Ascii_C'Length - 1) =
1858                                                                      Ascii_C
1859                     then
1860                        if Invert then
1861                           Class := ANYOF_NASCII;
1862                        else
1863                           Class := ANYOF_ASCII;
1864                        end if;
1865
1866                        Parse_Pos := Parse_Pos + Ascii_C'Length;
1867                     end if;
1868                  end if;
1869
1870               when 'c' =>
1871                  if Parse_Pos + Cntrl'Length - 1 <= Parse_End
1872                    and then E (Parse_Pos .. Parse_Pos + Cntrl'Length - 1) =
1873                                                                      Cntrl
1874                  then
1875                     if Invert then
1876                        Class := ANYOF_NCNTRL;
1877                     else
1878                        Class := ANYOF_CNTRL;
1879                     end if;
1880
1881                     Parse_Pos := Parse_Pos + Cntrl'Length;
1882                  end if;
1883
1884               when 'd' =>
1885                  if Parse_Pos + Digit'Length - 1 <= Parse_End
1886                    and then E (Parse_Pos .. Parse_Pos + Digit'Length - 1) =
1887                                                                      Digit
1888                  then
1889                     if Invert then
1890                        Class := ANYOF_NDIGIT;
1891                     else
1892                        Class := ANYOF_DIGIT;
1893                     end if;
1894
1895                     Parse_Pos := Parse_Pos + Digit'Length;
1896                  end if;
1897
1898               when 'g' =>
1899                  if Parse_Pos + Graph'Length - 1 <= Parse_End
1900                    and then E (Parse_Pos .. Parse_Pos + Graph'Length - 1) =
1901                                                                      Graph
1902                  then
1903                     if Invert then
1904                        Class := ANYOF_NGRAPH;
1905                     else
1906                        Class := ANYOF_GRAPH;
1907                     end if;
1908                     Parse_Pos := Parse_Pos + Graph'Length;
1909                  end if;
1910
1911               when 'l' =>
1912                  if Parse_Pos + Lower'Length - 1 <= Parse_End
1913                    and then E (Parse_Pos .. Parse_Pos + Lower'Length - 1) =
1914                                                                      Lower
1915                  then
1916                     if Invert then
1917                        Class := ANYOF_NLOWER;
1918                     else
1919                        Class := ANYOF_LOWER;
1920                     end if;
1921                     Parse_Pos := Parse_Pos + Lower'Length;
1922                  end if;
1923
1924               when 'p' =>
1925
1926                  --  All 'p' classes have the same length
1927
1928                  if Parse_Pos + Print'Length - 1 <= Parse_End then
1929                     if E (Parse_Pos .. Parse_Pos + Print'Length - 1) =
1930                                                                      Print
1931                     then
1932                        if Invert then
1933                           Class := ANYOF_NPRINT;
1934                        else
1935                           Class := ANYOF_PRINT;
1936                        end if;
1937
1938                        Parse_Pos := Parse_Pos + Print'Length;
1939
1940                     elsif E (Parse_Pos .. Parse_Pos + Punct'Length - 1) =
1941                                                                      Punct
1942                     then
1943                        if Invert then
1944                           Class := ANYOF_NPUNCT;
1945                        else
1946                           Class := ANYOF_PUNCT;
1947                        end if;
1948
1949                        Parse_Pos := Parse_Pos + Punct'Length;
1950                     end if;
1951                  end if;
1952
1953               when 's' =>
1954                  if Parse_Pos + Space'Length - 1 <= Parse_End
1955                    and then E (Parse_Pos .. Parse_Pos + Space'Length - 1) =
1956                                                                      Space
1957                  then
1958                     if Invert then
1959                        Class := ANYOF_NSPACE;
1960                     else
1961                        Class := ANYOF_SPACE;
1962                     end if;
1963
1964                     Parse_Pos := Parse_Pos + Space'Length;
1965                  end if;
1966
1967               when 'u' =>
1968
1969                  if Parse_Pos + Upper'Length - 1 <= Parse_End
1970                    and then E (Parse_Pos .. Parse_Pos + Upper'Length - 1) =
1971                    Upper
1972                  then
1973                     if Invert then
1974                        Class := ANYOF_NUPPER;
1975                     else
1976                        Class := ANYOF_UPPER;
1977                     end if;
1978                     Parse_Pos := Parse_Pos + Upper'Length;
1979                  end if;
1980
1981               when 'w' =>
1982
1983                  if Parse_Pos + Word'Length - 1 <= Parse_End
1984                    and then E (Parse_Pos .. Parse_Pos + Word'Length - 1) =
1985                    Word
1986                  then
1987                     if Invert then
1988                        Class := ANYOF_NALNUM;
1989                     else
1990                        Class := ANYOF_ALNUM;
1991                     end if;
1992                     Parse_Pos := Parse_Pos + Word'Length;
1993                  end if;
1994
1995               when 'x' =>
1996
1997                  if Parse_Pos + Xdigit'Length - 1 <= Parse_End
1998                    and then E (Parse_Pos .. Parse_Pos + Xdigit'Length - 1)
1999                    = Digit
2000                  then
2001                     if Invert then
2002                        Class := ANYOF_NXDIGIT;
2003                     else
2004                        Class := ANYOF_XDIGIT;
2005                     end if;
2006
2007                     Parse_Pos := Parse_Pos + Xdigit'Length;
2008                  end if;
2009
2010               when others =>
2011                  Fail ("Invalid character class");
2012            end case;
2013
2014         --  Character class not specified
2015
2016         else
2017            return ANYOF_NONE;
2018         end if;
2019
2020         return Class;
2021      end Parse_Posix_Character_Class;
2022
2023      Expr_Flags : Expression_Flags;
2024      Result     : Pointer;
2025
2026   --  Start of processing for Compile
2027
2028   begin
2029      Emit (MAGIC);
2030      Parse (False, Expr_Flags, Result);
2031
2032      if Result = 0 then
2033         Fail ("Couldn't compile expression");
2034      end if;
2035
2036      Final_Code_Size := Emit_Ptr - 1;
2037
2038      --  Do we want to actually compile the expression, or simply get the
2039      --  code size ???
2040
2041      if Emit_Code then
2042         Optimize (PM);
2043      end if;
2044
2045      PM.Flags := Flags;
2046   end Compile;
2047
2048   function Compile
2049     (Expression : String;
2050      Flags      : Regexp_Flags := No_Flags) return Pattern_Matcher
2051   is
2052      Size  : Program_Size;
2053      Dummy : Pattern_Matcher (0);
2054
2055   begin
2056      Compile (Dummy, Expression, Size, Flags);
2057
2058      declare
2059         Result : Pattern_Matcher (Size);
2060      begin
2061         Compile (Result, Expression, Size, Flags);
2062         return Result;
2063      end;
2064   end Compile;
2065
2066   procedure Compile
2067     (Matcher    : out Pattern_Matcher;
2068      Expression : String;
2069      Flags      : Regexp_Flags := No_Flags)
2070   is
2071      Size : Program_Size;
2072
2073   begin
2074      Compile (Matcher, Expression, Size, Flags);
2075   end Compile;
2076
2077   ----------
2078   -- Dump --
2079   ----------
2080
2081   procedure Dump (Self : Pattern_Matcher) is
2082
2083      --  Index  : Pointer := Program_First + 1;
2084      --  What is the above line for ???
2085
2086      Op      : Opcode;
2087      Program : Program_Data renames Self.Program;
2088
2089      procedure Dump_Until
2090        (Start  : Pointer;
2091         Till   : Pointer;
2092         Indent : Natural := 0);
2093      --  Dump the program until the node Till (not included) is met.
2094      --  Every line is indented with Index spaces at the beginning
2095      --  Dumps till the end if Till is 0.
2096
2097      ----------------
2098      -- Dump_Until --
2099      ----------------
2100
2101      procedure Dump_Until
2102        (Start  : Pointer;
2103         Till   : Pointer;
2104         Indent : Natural := 0)
2105      is
2106         Next : Pointer;
2107         Index : Pointer := Start;
2108         Local_Indent : Natural := Indent;
2109         Length : Pointer;
2110
2111      begin
2112         while Index < Till loop
2113
2114            Op := Opcode'Val (Character'Pos ((Self.Program (Index))));
2115
2116            if Op = CLOSE then
2117               Local_Indent := Local_Indent - 3;
2118            end if;
2119
2120            declare
2121               Point : constant String := Pointer'Image (Index);
2122
2123            begin
2124               for J in 1 .. 6 - Point'Length loop
2125                  Put (' ');
2126               end loop;
2127
2128               Put (Point
2129                    & " : "
2130                    & (1 .. Local_Indent => ' ')
2131                    & Opcode'Image (Op));
2132            end;
2133
2134            --  Print the parenthesis number
2135
2136            if Op = OPEN or else Op = CLOSE or else Op = REFF then
2137               Put (Natural'Image (Character'Pos (Program (Index + 3))));
2138            end if;
2139
2140            Next := Index + Get_Next_Offset (Program, Index);
2141
2142            if Next = Index then
2143               Put ("  (next at 0)");
2144            else
2145               Put ("  (next at " & Pointer'Image (Next) & ")");
2146            end if;
2147
2148            case Op is
2149
2150               --  Character class operand
2151
2152               when ANYOF =>  null;
2153                  declare
2154                     Bitmap  : Character_Class;
2155                     Last    : Character := ASCII.Nul;
2156                     Current : Natural := 0;
2157
2158                     Current_Char : Character;
2159
2160                  begin
2161                     Bitmap_Operand (Program, Index, Bitmap);
2162                     Put ("   operand=");
2163
2164                     while Current <= 255 loop
2165                        Current_Char := Character'Val (Current);
2166
2167                        --  First item in a range
2168
2169                        if Get_From_Class (Bitmap, Current_Char) then
2170                           Last := Current_Char;
2171
2172                           --  Search for the last item in the range
2173
2174                           loop
2175                              Current := Current + 1;
2176                              exit when Current > 255;
2177                              Current_Char := Character'Val (Current);
2178                              exit when
2179                                not Get_From_Class (Bitmap, Current_Char);
2180
2181                           end loop;
2182
2183                           if Last <= ' ' then
2184                              Put (Last'Img);
2185                           else
2186                              Put (Last);
2187                           end if;
2188
2189                           if Character'Succ (Last) /= Current_Char then
2190                              Put ("-" & Character'Pred (Current_Char));
2191                           end if;
2192
2193                        else
2194                           Current := Current + 1;
2195                        end if;
2196                     end loop;
2197
2198                     New_Line;
2199                     Index := Index + 3 + Bitmap'Length;
2200                  end;
2201
2202               --  string operand
2203
2204               when EXACT | EXACTF =>
2205                  Length := String_Length (Program, Index);
2206                  Put ("   operand (length:" & Program_Size'Image (Length + 1)
2207                       & ") ="
2208                       & String (Program (String_Operand (Index)
2209                                          .. String_Operand (Index)
2210                                          + Length)));
2211                  Index := String_Operand (Index) + Length + 1;
2212                  New_Line;
2213
2214               --  Node operand
2215
2216               when BRANCH =>
2217                  New_Line;
2218                  Dump_Until (Index + 3, Next, Local_Indent + 3);
2219                  Index := Next;
2220
2221               when STAR | PLUS =>
2222                  New_Line;
2223
2224                  --  Only one instruction
2225
2226                  Dump_Until (Index + 3, Index + 4, Local_Indent + 3);
2227                  Index := Next;
2228
2229               when CURLY | CURLYX =>
2230                  Put ("  {"
2231                       & Natural'Image (Read_Natural (Program, Index + 3))
2232                       & ","
2233                       & Natural'Image (Read_Natural (Program, Index + 5))
2234                       & "}");
2235                  New_Line;
2236                  Dump_Until (Index + 7, Next, Local_Indent + 3);
2237                  Index := Next;
2238
2239               when OPEN =>
2240                  New_Line;
2241                  Index := Index + 4;
2242                  Local_Indent := Local_Indent + 3;
2243
2244               when CLOSE | REFF =>
2245                  New_Line;
2246                  Index := Index + 4;
2247
2248               when EOP =>
2249                  Index := Index + 3;
2250                  New_Line;
2251                  exit;
2252
2253               --  No operand
2254
2255               when others =>
2256                  Index := Index + 3;
2257                  New_Line;
2258            end case;
2259         end loop;
2260      end Dump_Until;
2261
2262   --  Start of processing for Dump
2263
2264   begin
2265      pragma Assert (Self.Program (Program_First) = MAGIC,
2266                     "Corrupted Pattern_Matcher");
2267
2268      Put_Line ("Must start with (Self.First) = "
2269                & Character'Image (Self.First));
2270
2271      if (Self.Flags and Case_Insensitive) /= 0 then
2272         Put_Line ("  Case_Insensitive mode");
2273      end if;
2274
2275      if (Self.Flags and Single_Line) /= 0 then
2276         Put_Line ("  Single_Line mode");
2277      end if;
2278
2279      if (Self.Flags and Multiple_Lines) /= 0 then
2280         Put_Line ("  Multiple_Lines mode");
2281      end if;
2282
2283      Put_Line ("     1 : MAGIC");
2284      Dump_Until (Program_First + 1, Self.Program'Last + 1);
2285   end Dump;
2286
2287   --------------------
2288   -- Get_From_Class --
2289   --------------------
2290
2291   function Get_From_Class
2292     (Bitmap : Character_Class;
2293      C      : Character) return Boolean
2294   is
2295      Value : constant Class_Byte := Character'Pos (C);
2296
2297   begin
2298      return
2299        (Bitmap (Value / 8) and Bit_Conversion (Value mod 8)) /= 0;
2300   end Get_From_Class;
2301
2302   --------------
2303   -- Get_Next --
2304   --------------
2305
2306   function Get_Next (Program : Program_Data; IP : Pointer) return Pointer is
2307      Offset : constant Pointer := Get_Next_Offset (Program, IP);
2308
2309   begin
2310      if Offset = 0 then
2311         return 0;
2312      else
2313         return IP + Offset;
2314      end if;
2315   end Get_Next;
2316
2317   ---------------------
2318   -- Get_Next_Offset --
2319   ---------------------
2320
2321   function Get_Next_Offset
2322     (Program : Program_Data;
2323      IP      : Pointer) return Pointer
2324   is
2325   begin
2326      return Pointer (Read_Natural (Program, IP + 1));
2327   end Get_Next_Offset;
2328
2329   --------------
2330   -- Is_Alnum --
2331   --------------
2332
2333   function Is_Alnum (C : Character) return Boolean is
2334   begin
2335      return Is_Alphanumeric (C) or else C = '_';
2336   end Is_Alnum;
2337
2338   ------------------
2339   -- Is_Printable --
2340   ------------------
2341
2342   function Is_Printable (C : Character) return Boolean is
2343   begin
2344      --  Printable if space or graphic character or other whitespace
2345      --  Other white space includes (HT/LF/VT/FF/CR = codes 9-13)
2346
2347      return C in Character'Val (32) .. Character'Val (126)
2348        or else C in ASCII.HT .. ASCII.CR;
2349   end Is_Printable;
2350
2351   --------------------
2352   -- Is_White_Space --
2353   --------------------
2354
2355   function Is_White_Space (C : Character) return Boolean is
2356   begin
2357      --  Note: HT = 9, LF = 10, VT = 11, FF = 12, CR = 13
2358
2359      return C = ' ' or else C in ASCII.HT .. ASCII.CR;
2360   end Is_White_Space;
2361
2362   -----------
2363   -- Match --
2364   -----------
2365
2366   procedure Match
2367     (Self    : Pattern_Matcher;
2368      Data    : String;
2369      Matches : out Match_Array;
2370      Data_First : Integer := -1;
2371      Data_Last  : Positive := Positive'Last)
2372   is
2373      Program   : Program_Data renames Self.Program; -- Shorter notation
2374
2375      First_In_Data : constant Integer := Integer'Max (Data_First, Data'First);
2376      Last_In_Data  : constant Integer := Integer'Min (Data_Last, Data'Last);
2377
2378      --  Global work variables
2379
2380      Input_Pos : Natural;          -- String-input pointer
2381      BOL_Pos   : Natural;          -- Beginning of input, for ^ check
2382      Matched   : Boolean := False;  -- Until proven True
2383
2384      Matches_Full : Match_Array (0 .. Natural'Max (Self.Paren_Count,
2385                                                    Matches'Last));
2386      --  Stores the value of all the parenthesis pairs.
2387      --  We do not use directly Matches, so that we can also use back
2388      --  references (REFF) even if Matches is too small.
2389
2390      type Natural_Array is array (Match_Count range <>) of Natural;
2391      Matches_Tmp : Natural_Array (Matches_Full'Range);
2392      --  Save the opening position of parenthesis.
2393
2394      Last_Paren  : Natural := 0;
2395      --  Last parenthesis seen
2396
2397      Greedy : Boolean := True;
2398      --  True if the next operator should be greedy
2399
2400      type Current_Curly_Record;
2401      type Current_Curly_Access is access all Current_Curly_Record;
2402      type Current_Curly_Record is record
2403         Paren_Floor : Natural;  --  How far back to strip parenthesis data
2404         Cur         : Integer;  --  How many instances of scan we've matched
2405         Min         : Natural;  --  Minimal number of scans to match
2406         Max         : Natural;  --  Maximal number of scans to match
2407         Greedy      : Boolean;  --  Whether to work our way up or down
2408         Scan        : Pointer;  --  The thing to match
2409         Next        : Pointer;  --  What has to match after it
2410         Lastloc     : Natural;  --  Where we started matching this scan
2411         Old_Cc      : Current_Curly_Access; --  Before we started this one
2412      end record;
2413      --  Data used to handle the curly operator and the plus and star
2414      --  operators for complex expressions.
2415
2416      Current_Curly : Current_Curly_Access := null;
2417      --  The curly currently being processed.
2418
2419      -----------------------
2420      -- Local Subprograms --
2421      -----------------------
2422
2423      function Index (Start : Positive; C : Character) return Natural;
2424      --  Find character C in Data starting at Start and return position
2425
2426      function Repeat
2427        (IP  : Pointer;
2428         Max : Natural := Natural'Last) return Natural;
2429      --  Repeatedly match something simple, report how many
2430      --  It only matches on things of length 1.
2431      --  Starting from Input_Pos, it matches at most Max CURLY.
2432
2433      function Try (Pos : in Positive) return Boolean;
2434      --  Try to match at specific point
2435
2436      function Match (IP : Pointer) return Boolean;
2437      --  This is the main matching routine. Conceptually the strategy
2438      --  is simple:  check to see whether the current node matches,
2439      --  call self recursively to see whether the rest matches,
2440      --  and then act accordingly.
2441      --
2442      --  In practice Match makes some effort to avoid recursion, in
2443      --  particular by going through "ordinary" nodes (that don't
2444      --  need to know whether the rest of the match failed) by
2445      --  using a loop instead of recursion.
2446      --  Why is the above comment part of the spec rather than body ???
2447
2448      function Match_Whilem (IP : Pointer) return Boolean;
2449      --  Return True if a WHILEM matches
2450      --  How come IP is unreferenced in the body ???
2451
2452      function Recurse_Match (IP : Pointer; From : Natural) return Boolean;
2453      pragma Inline (Recurse_Match);
2454      --  Calls Match recursively. It saves and restores the parenthesis
2455      --  status and location in the input stream correctly, so that
2456      --  backtracking is possible
2457
2458      function Match_Simple_Operator
2459        (Op     : Opcode;
2460         Scan   : Pointer;
2461         Next   : Pointer;
2462         Greedy : Boolean) return Boolean;
2463      --  Return True it the simple operator (possibly non-greedy) matches
2464
2465      pragma Inline (Index);
2466      pragma Inline (Repeat);
2467
2468      --  These are two complex functions, but used only once.
2469
2470      pragma Inline (Match_Whilem);
2471      pragma Inline (Match_Simple_Operator);
2472
2473      -----------
2474      -- Index --
2475      -----------
2476
2477      function Index (Start : Positive; C : Character) return Natural is
2478      begin
2479         for J in Start .. Last_In_Data loop
2480            if Data (J) = C then
2481               return J;
2482            end if;
2483         end loop;
2484
2485         return 0;
2486      end Index;
2487
2488      -------------------
2489      -- Recurse_Match --
2490      -------------------
2491
2492      function Recurse_Match (IP : Pointer; From : Natural) return Boolean is
2493         L : constant Natural := Last_Paren;
2494
2495         Tmp_F : constant Match_Array :=
2496                   Matches_Full (From + 1 .. Matches_Full'Last);
2497
2498         Start : constant Natural_Array :=
2499                   Matches_Tmp (From + 1 .. Matches_Tmp'Last);
2500         Input : constant Natural := Input_Pos;
2501
2502      begin
2503         if Match (IP) then
2504            return True;
2505         end if;
2506
2507         Last_Paren := L;
2508         Matches_Full (Tmp_F'Range) := Tmp_F;
2509         Matches_Tmp (Start'Range) := Start;
2510         Input_Pos := Input;
2511         return False;
2512      end Recurse_Match;
2513
2514      -----------
2515      -- Match --
2516      -----------
2517
2518      function Match (IP : Pointer) return Boolean is
2519         Scan   : Pointer := IP;
2520         Next   : Pointer;
2521         Op     : Opcode;
2522
2523      begin
2524         State_Machine :
2525         loop
2526            pragma Assert (Scan /= 0);
2527
2528            --  Determine current opcode and count its usage in debug mode
2529
2530            Op := Opcode'Val (Character'Pos (Program (Scan)));
2531
2532            --  Calculate offset of next instruction.
2533            --  Second character is most significant in Program_Data.
2534
2535            Next := Get_Next (Program, Scan);
2536
2537            case Op is
2538               when EOP =>
2539                  return True;  --  Success !
2540
2541               when BRANCH =>
2542                  if Program (Next) /= BRANCH then
2543                     Next := Operand (Scan); -- No choice, avoid recursion
2544
2545                  else
2546                     loop
2547                        if Recurse_Match (Operand (Scan), 0) then
2548                           return True;
2549                        end if;
2550
2551                        Scan := Get_Next (Program, Scan);
2552                        exit when Scan = 0 or Program (Scan) /= BRANCH;
2553                     end loop;
2554
2555                     exit State_Machine;
2556                  end if;
2557
2558               when NOTHING =>
2559                  null;
2560
2561               when BOL =>
2562                  exit State_Machine when Input_Pos /= BOL_Pos
2563                    and then ((Self.Flags and Multiple_Lines) = 0
2564                              or else Data (Input_Pos - 1) /= ASCII.LF);
2565
2566               when MBOL =>
2567                  exit State_Machine when Input_Pos /= BOL_Pos
2568                    and then Data (Input_Pos - 1) /= ASCII.LF;
2569
2570               when SBOL =>
2571                  exit State_Machine when Input_Pos /= BOL_Pos;
2572
2573               when EOL =>
2574                  exit State_Machine when Input_Pos <= Data'Last
2575                    and then ((Self.Flags and Multiple_Lines) = 0
2576                              or else Data (Input_Pos) /= ASCII.LF);
2577
2578               when MEOL =>
2579                  exit State_Machine when Input_Pos <= Data'Last
2580                    and then Data (Input_Pos) /= ASCII.LF;
2581
2582               when SEOL =>
2583                  exit State_Machine when Input_Pos <= Data'Last;
2584
2585               when BOUND | NBOUND =>
2586
2587                  --  Was last char in word ?
2588
2589                  declare
2590                     N  : Boolean := False;
2591                     Ln : Boolean := False;
2592
2593                  begin
2594                     if Input_Pos /= First_In_Data then
2595                        N := Is_Alnum (Data (Input_Pos - 1));
2596                     end if;
2597
2598                     if Input_Pos > Last_In_Data then
2599                        Ln := False;
2600                     else
2601                        Ln := Is_Alnum (Data (Input_Pos));
2602                     end if;
2603
2604                     if Op = BOUND then
2605                        if N = Ln then
2606                           exit State_Machine;
2607                        end if;
2608                     else
2609                        if N /= Ln then
2610                           exit State_Machine;
2611                        end if;
2612                     end if;
2613                  end;
2614
2615               when SPACE =>
2616                  exit State_Machine when Input_Pos > Last_In_Data
2617                    or else not Is_White_Space (Data (Input_Pos));
2618                  Input_Pos := Input_Pos + 1;
2619
2620               when NSPACE =>
2621                  exit State_Machine when Input_Pos > Last_In_Data
2622                    or else Is_White_Space (Data (Input_Pos));
2623                  Input_Pos := Input_Pos + 1;
2624
2625               when DIGIT =>
2626                  exit State_Machine when Input_Pos > Last_In_Data
2627                    or else not Is_Digit (Data (Input_Pos));
2628                  Input_Pos := Input_Pos + 1;
2629
2630               when NDIGIT =>
2631                  exit State_Machine when Input_Pos > Last_In_Data
2632                    or else Is_Digit (Data (Input_Pos));
2633                  Input_Pos := Input_Pos + 1;
2634
2635               when ALNUM =>
2636                  exit State_Machine when Input_Pos > Last_In_Data
2637                    or else not Is_Alnum (Data (Input_Pos));
2638                  Input_Pos := Input_Pos + 1;
2639
2640               when NALNUM =>
2641                  exit State_Machine when Input_Pos > Last_In_Data
2642                    or else Is_Alnum (Data (Input_Pos));
2643                  Input_Pos := Input_Pos + 1;
2644
2645               when ANY =>
2646                  exit State_Machine when Input_Pos > Last_In_Data
2647                    or else Data (Input_Pos) = ASCII.LF;
2648                  Input_Pos := Input_Pos + 1;
2649
2650               when SANY =>
2651                  exit State_Machine when Input_Pos > Last_In_Data;
2652                  Input_Pos := Input_Pos + 1;
2653
2654               when EXACT =>
2655                  declare
2656                     Opnd    : Pointer  := String_Operand (Scan);
2657                     Current : Positive := Input_Pos;
2658
2659                     Last    : constant Pointer :=
2660                                 Opnd + String_Length (Program, Scan);
2661
2662                  begin
2663                     while Opnd <= Last loop
2664                        exit State_Machine when Current > Last_In_Data
2665                          or else Program (Opnd) /= Data (Current);
2666                        Current := Current + 1;
2667                        Opnd := Opnd + 1;
2668                     end loop;
2669
2670                     Input_Pos := Current;
2671                  end;
2672
2673               when EXACTF =>
2674                  declare
2675                     Opnd    : Pointer  := String_Operand (Scan);
2676                     Current : Positive := Input_Pos;
2677
2678                     Last    : constant Pointer :=
2679                                 Opnd + String_Length (Program, Scan);
2680
2681                  begin
2682                     while Opnd <= Last loop
2683                        exit State_Machine when Current > Last_In_Data
2684                          or else Program (Opnd) /= To_Lower (Data (Current));
2685                        Current := Current + 1;
2686                        Opnd := Opnd + 1;
2687                     end loop;
2688
2689                     Input_Pos := Current;
2690                  end;
2691
2692               when ANYOF =>
2693                  declare
2694                     Bitmap : Character_Class;
2695
2696                  begin
2697                     Bitmap_Operand (Program, Scan, Bitmap);
2698                     exit State_Machine when Input_Pos > Last_In_Data
2699                       or else not Get_From_Class (Bitmap, Data (Input_Pos));
2700                     Input_Pos := Input_Pos + 1;
2701                  end;
2702
2703               when OPEN =>
2704                  declare
2705                     No : constant Natural :=
2706                            Character'Pos (Program (Operand (Scan)));
2707
2708                  begin
2709                     Matches_Tmp (No) := Input_Pos;
2710                  end;
2711
2712               when CLOSE =>
2713                  declare
2714                     No : constant Natural :=
2715                            Character'Pos (Program (Operand (Scan)));
2716
2717                  begin
2718                     Matches_Full (No) := (Matches_Tmp (No), Input_Pos - 1);
2719
2720                     if Last_Paren < No then
2721                        Last_Paren := No;
2722                     end if;
2723                  end;
2724
2725               when REFF =>
2726                  declare
2727                     No : constant Natural :=
2728                            Character'Pos (Program (Operand (Scan)));
2729
2730                     Data_Pos : Natural;
2731
2732                  begin
2733                     --  If we haven't seen that parenthesis yet
2734
2735                     if Last_Paren < No then
2736                        return False;
2737                     end if;
2738
2739                     Data_Pos := Matches_Full (No).First;
2740
2741                     while Data_Pos <= Matches_Full (No).Last loop
2742                        if Input_Pos > Last_In_Data
2743                          or else Data (Input_Pos) /= Data (Data_Pos)
2744                        then
2745                           return False;
2746                        end if;
2747
2748                        Input_Pos := Input_Pos + 1;
2749                        Data_Pos := Data_Pos + 1;
2750                     end loop;
2751                  end;
2752
2753               when MINMOD =>
2754                  Greedy := False;
2755
2756               when STAR | PLUS | CURLY =>
2757                  declare
2758                     Greed : constant Boolean := Greedy;
2759
2760                  begin
2761                     Greedy := True;
2762                     return Match_Simple_Operator (Op, Scan, Next, Greed);
2763                  end;
2764
2765               when CURLYX =>
2766
2767                  --  Looking at something like:
2768
2769                  --    1: CURLYX {n,m}  (->4)
2770                  --    2:   code for complex thing  (->3)
2771                  --    3:   WHILEM (->0)
2772                  --    4: NOTHING
2773
2774                  declare
2775                     Min : constant Natural :=
2776                             Read_Natural (Program, Scan + 3);
2777                     Max : constant Natural :=
2778                             Read_Natural (Program, Scan + 5);
2779                     Cc  : aliased Current_Curly_Record;
2780
2781                     Has_Match : Boolean;
2782
2783                  begin
2784                     Cc := (Paren_Floor => Last_Paren,
2785                            Cur         => -1,
2786                            Min         => Min,
2787                            Max         => Max,
2788                            Greedy      => Greedy,
2789                            Scan        => Scan + 7,
2790                            Next        => Next,
2791                            Lastloc     => 0,
2792                            Old_Cc      => Current_Curly);
2793                     Current_Curly := Cc'Unchecked_Access;
2794
2795                     Has_Match := Match (Next - 3);
2796
2797                     --  Start on the WHILEM
2798
2799                     Current_Curly := Cc.Old_Cc;
2800                     return Has_Match;
2801                  end;
2802
2803               when WHILEM =>
2804                  return Match_Whilem (IP);
2805            end case;
2806
2807            Scan := Next;
2808         end loop State_Machine;
2809
2810         --  If we get here, there is no match.
2811         --  For successful matches when EOP is the terminating point.
2812
2813         return False;
2814      end Match;
2815
2816      ---------------------------
2817      -- Match_Simple_Operator --
2818      ---------------------------
2819
2820      function Match_Simple_Operator
2821        (Op     : Opcode;
2822         Scan   : Pointer;
2823         Next   : Pointer;
2824         Greedy : Boolean) return Boolean
2825      is
2826         Next_Char       : Character := ASCII.Nul;
2827         Next_Char_Known : Boolean := False;
2828         No              : Integer;  --  Can be negative
2829         Min             : Natural;
2830         Max             : Natural := Natural'Last;
2831         Operand_Code    : Pointer;
2832         Old             : Natural;
2833         Last_Pos        : Natural;
2834         Save            : constant Natural := Input_Pos;
2835
2836      begin
2837         --  Lookahead to avoid useless match attempts
2838         --  when we know what character comes next.
2839
2840         if Program (Next) = EXACT then
2841            Next_Char := Program (String_Operand (Next));
2842            Next_Char_Known := True;
2843         end if;
2844
2845         --  Find the minimal and maximal values for the operator
2846
2847         case Op is
2848            when STAR =>
2849               Min := 0;
2850               Operand_Code := Operand (Scan);
2851
2852            when PLUS =>
2853               Min := 1;
2854               Operand_Code := Operand (Scan);
2855
2856            when others =>
2857               Min := Read_Natural (Program, Scan + 3);
2858               Max := Read_Natural (Program, Scan + 5);
2859               Operand_Code := Scan + 7;
2860         end case;
2861
2862         --  Non greedy operators
2863
2864         if not Greedy then
2865
2866            --  Test the minimal repetitions
2867
2868            if Min /= 0
2869              and then Repeat (Operand_Code, Min) < Min
2870            then
2871               return False;
2872            end if;
2873
2874            Old := Input_Pos;
2875
2876            --  Find the place where 'next' could work
2877
2878            if Next_Char_Known then
2879               --  Last position to check
2880
2881               Last_Pos := Input_Pos + Max;
2882
2883               if Last_Pos > Last_In_Data
2884                 or else Max = Natural'Last
2885               then
2886                  Last_Pos := Last_In_Data;
2887               end if;
2888
2889               --  Look for the first possible opportunity
2890
2891               loop
2892                  --  Find the next possible position
2893
2894                  while Input_Pos <= Last_Pos
2895                    and then Data (Input_Pos) /= Next_Char
2896                  loop
2897                     Input_Pos := Input_Pos + 1;
2898                  end loop;
2899
2900                  if Input_Pos > Last_Pos then
2901                     return False;
2902                  end if;
2903
2904                  --  Check that we still match if we stop
2905                  --  at the position we just found.
2906
2907                  declare
2908                     Num : constant Natural := Input_Pos - Old;
2909
2910                  begin
2911                     Input_Pos := Old;
2912
2913                     if Repeat (Operand_Code, Num) < Num then
2914                        return False;
2915                     end if;
2916                  end;
2917
2918                  --  Input_Pos now points to the new position
2919
2920                  if Match (Get_Next (Program, Scan)) then
2921                     return True;
2922                  end if;
2923
2924                  Old := Input_Pos;
2925                  Input_Pos := Input_Pos + 1;
2926               end loop;
2927
2928            --  We know what the next character is
2929
2930            else
2931               while Max >= Min loop
2932
2933                  --  If the next character matches
2934
2935                  if Match (Next) then
2936                     return True;
2937                  end if;
2938
2939                  Input_Pos := Save + Min;
2940
2941                  --  Could not or did not match -- move forward
2942
2943                  if Repeat (Operand_Code, 1) /= 0 then
2944                     Min := Min + 1;
2945                  else
2946                     return False;
2947                  end if;
2948               end loop;
2949            end if;
2950
2951            return False;
2952
2953         --  Greedy operators
2954
2955         else
2956            No := Repeat (Operand_Code, Max);
2957
2958            --  ??? Perl has some special code here in case the
2959            --  next instruction is of type EOL, since $ and \Z
2960            --  can match before *and* after newline at the end.
2961
2962            --  ??? Perl has some special code here in case (paren)
2963            --  is True.
2964
2965            --  Else, if we don't have any parenthesis
2966
2967            while No >= Min loop
2968               if not Next_Char_Known
2969                 or else (Input_Pos <= Last_In_Data
2970                           and then Data (Input_Pos) = Next_Char)
2971               then
2972                  if Match (Next) then
2973                     return True;
2974                  end if;
2975               end if;
2976
2977               --  Could not or did not work, we back up
2978
2979               No := No - 1;
2980               Input_Pos := Save + No;
2981            end loop;
2982
2983            return False;
2984         end if;
2985      end Match_Simple_Operator;
2986
2987      ------------------
2988      -- Match_Whilem --
2989      ------------------
2990
2991      --  This is really hard to understand, because after we match what we
2992      --  are trying to match, we must make sure the rest of the REx is going
2993      --  to match for sure, and to do that we have to go back UP the parse
2994      --  tree by recursing ever deeper.  And if it fails, we have to reset
2995      --  our parent's current state that we can try again after backing off.
2996
2997      function Match_Whilem (IP : Pointer) return Boolean is
2998         pragma Unreferenced (IP);
2999
3000         Cc : constant Current_Curly_Access := Current_Curly;
3001         N  : constant Natural              := Cc.Cur + 1;
3002         Ln : Natural                       := 0;
3003
3004         Lastloc : constant Natural := Cc.Lastloc;
3005         --  Detection of 0-len.
3006
3007      begin
3008         --  If degenerate scan matches "", assume scan done.
3009
3010         if Input_Pos = Cc.Lastloc
3011           and then N >= Cc.Min
3012         then
3013            --  Temporarily restore the old context, and check that we
3014            --  match was comes after CURLYX.
3015
3016            Current_Curly := Cc.Old_Cc;
3017
3018            if Current_Curly /= null then
3019               Ln := Current_Curly.Cur;
3020            end if;
3021
3022            if Match (Cc.Next) then
3023               return True;
3024            end if;
3025
3026            if Current_Curly /= null then
3027               Current_Curly.Cur := Ln;
3028            end if;
3029
3030            Current_Curly := Cc;
3031            return False;
3032         end if;
3033
3034         --  First, just match a string of min scans.
3035
3036         if N < Cc.Min then
3037            Cc.Cur := N;
3038            Cc.Lastloc := Input_Pos;
3039
3040            if Match (Cc.Scan) then
3041               return True;
3042            end if;
3043
3044            Cc.Cur := N - 1;
3045            Cc.Lastloc := Lastloc;
3046            return False;
3047         end if;
3048
3049         --  Prefer next over scan for minimal matching.
3050
3051         if not Cc.Greedy then
3052            Current_Curly := Cc.Old_Cc;
3053
3054            if Current_Curly /= null then
3055               Ln := Current_Curly.Cur;
3056            end if;
3057
3058            if Recurse_Match (Cc.Next, Cc.Paren_Floor) then
3059               return True;
3060            end if;
3061
3062            if Current_Curly /= null then
3063               Current_Curly.Cur := Ln;
3064            end if;
3065
3066            Current_Curly := Cc;
3067
3068            --  Maximum greed exceeded ?
3069
3070            if N >= Cc.Max then
3071               return False;
3072            end if;
3073
3074            --  Try scanning more and see if it helps
3075            Cc.Cur := N;
3076            Cc.Lastloc := Input_Pos;
3077
3078            if Recurse_Match (Cc.Scan, Cc.Paren_Floor) then
3079               return True;
3080            end if;
3081
3082            Cc.Cur := N - 1;
3083            Cc.Lastloc := Lastloc;
3084            return False;
3085         end if;
3086
3087         --  Prefer scan over next for maximal matching
3088
3089         if N < Cc.Max then   --  more greed allowed ?
3090            Cc.Cur := N;
3091            Cc.Lastloc := Input_Pos;
3092
3093            if Recurse_Match (Cc.Scan, Cc.Paren_Floor) then
3094               return True;
3095            end if;
3096         end if;
3097
3098         --  Failed deeper matches of scan, so see if this one works
3099
3100         Current_Curly := Cc.Old_Cc;
3101
3102         if Current_Curly /= null then
3103            Ln := Current_Curly.Cur;
3104         end if;
3105
3106         if Match (Cc.Next) then
3107            return True;
3108         end if;
3109
3110         if Current_Curly /= null then
3111            Current_Curly.Cur := Ln;
3112         end if;
3113
3114         Current_Curly := Cc;
3115         Cc.Cur := N - 1;
3116         Cc.Lastloc := Lastloc;
3117         return False;
3118      end Match_Whilem;
3119
3120      ------------
3121      -- Repeat --
3122      ------------
3123
3124      function Repeat
3125        (IP  : Pointer;
3126         Max : Natural := Natural'Last) return Natural
3127      is
3128         Scan  : Natural := Input_Pos;
3129         Last  : Natural;
3130         Op    : constant Opcode := Opcode'Val (Character'Pos (Program (IP)));
3131         Count : Natural;
3132         C     : Character;
3133         Is_First : Boolean := True;
3134         Bitmap   : Character_Class;
3135
3136      begin
3137         if Max = Natural'Last or else Scan + Max - 1 > Last_In_Data then
3138            Last := Last_In_Data;
3139         else
3140            Last := Scan + Max - 1;
3141         end if;
3142
3143         case Op is
3144            when ANY =>
3145               while Scan <= Last
3146                 and then Data (Scan) /= ASCII.LF
3147               loop
3148                  Scan := Scan + 1;
3149               end loop;
3150
3151            when SANY =>
3152               Scan := Last + 1;
3153
3154            when EXACT =>
3155
3156               --  The string has only one character if Repeat was called
3157
3158               C := Program (String_Operand (IP));
3159               while Scan <= Last
3160                 and then C = Data (Scan)
3161               loop
3162                  Scan := Scan + 1;
3163               end loop;
3164
3165            when EXACTF =>
3166
3167               --  The string has only one character if Repeat was called
3168
3169               C := Program (String_Operand (IP));
3170               while Scan <= Last
3171                 and then To_Lower (C) = Data (Scan)
3172               loop
3173                  Scan := Scan + 1;
3174               end loop;
3175
3176            when ANYOF =>
3177               if Is_First then
3178                  Bitmap_Operand (Program, IP, Bitmap);
3179                  Is_First := False;
3180               end if;
3181
3182               while Scan <= Last
3183                 and then Get_From_Class (Bitmap, Data (Scan))
3184               loop
3185                  Scan := Scan + 1;
3186               end loop;
3187
3188            when ALNUM =>
3189               while Scan <= Last
3190                 and then Is_Alnum (Data (Scan))
3191               loop
3192                  Scan := Scan + 1;
3193               end loop;
3194
3195            when NALNUM =>
3196               while Scan <= Last
3197                 and then not Is_Alnum (Data (Scan))
3198               loop
3199                  Scan := Scan + 1;
3200               end loop;
3201
3202            when SPACE =>
3203               while Scan <= Last
3204                 and then Is_White_Space (Data (Scan))
3205               loop
3206                  Scan := Scan + 1;
3207               end loop;
3208
3209            when NSPACE =>
3210               while Scan <= Last
3211                 and then not Is_White_Space (Data (Scan))
3212               loop
3213                  Scan := Scan + 1;
3214               end loop;
3215
3216            when DIGIT  =>
3217               while Scan <= Last
3218                 and then Is_Digit (Data (Scan))
3219               loop
3220                  Scan := Scan + 1;
3221               end loop;
3222
3223            when NDIGIT  =>
3224               while Scan <= Last
3225                 and then not Is_Digit (Data (Scan))
3226               loop
3227                  Scan := Scan + 1;
3228               end loop;
3229
3230            when others =>
3231               raise Program_Error;
3232         end case;
3233
3234         Count := Scan - Input_Pos;
3235         Input_Pos := Scan;
3236         return Count;
3237      end Repeat;
3238
3239      ---------
3240      -- Try --
3241      ---------
3242
3243      function Try (Pos : in Positive) return Boolean is
3244      begin
3245         Input_Pos  := Pos;
3246         Last_Paren := 0;
3247         Matches_Full := (others => No_Match);
3248
3249         if Match (Program_First + 1) then
3250            Matches_Full (0) := (Pos, Input_Pos - 1);
3251            return True;
3252         end if;
3253
3254         return False;
3255      end Try;
3256
3257   --  Start of processing for Match
3258
3259   begin
3260      --  Do we have the regexp Never_Match?
3261
3262      if Self.Size = 0 then
3263         Matches (0) := No_Match;
3264         return;
3265      end if;
3266
3267      --  Check validity of program
3268
3269      pragma Assert
3270        (Program (Program_First) = MAGIC,
3271         "Corrupted Pattern_Matcher");
3272
3273      --  If there is a "must appear" string, look for it
3274
3275      if Self.Must_Have_Length > 0 then
3276         declare
3277            First      : constant Character := Program (Self.Must_Have);
3278            Must_First : constant Pointer := Self.Must_Have;
3279            Must_Last  : constant Pointer :=
3280                           Must_First + Pointer (Self.Must_Have_Length - 1);
3281            Next_Try   : Natural := Index (First_In_Data, First);
3282
3283         begin
3284            while Next_Try /= 0
3285              and then Data (Next_Try .. Next_Try + Self.Must_Have_Length - 1)
3286                          = String (Program (Must_First .. Must_Last))
3287            loop
3288               Next_Try := Index (Next_Try + 1, First);
3289            end loop;
3290
3291            if Next_Try = 0 then
3292               Matches_Full := (others => No_Match);
3293               return;                  -- Not present
3294            end if;
3295         end;
3296      end if;
3297
3298      --  Mark beginning of line for ^
3299
3300      BOL_Pos := Data'First;
3301
3302      --  Simplest case first: an anchored match need be tried only once
3303
3304      if Self.Anchored and then (Self.Flags and Multiple_Lines) = 0 then
3305         Matched := Try (First_In_Data);
3306
3307      elsif Self.Anchored then
3308         declare
3309            Next_Try : Natural := First_In_Data;
3310         begin
3311            --  Test the first position in the buffer
3312            Matched := Try (Next_Try);
3313
3314            --  Else only test after newlines
3315
3316            if not Matched then
3317               while Next_Try <= Last_In_Data loop
3318                  while Next_Try <= Last_In_Data
3319                    and then Data (Next_Try) /= ASCII.LF
3320                  loop
3321                     Next_Try := Next_Try + 1;
3322                  end loop;
3323
3324                  Next_Try := Next_Try + 1;
3325
3326                  if Next_Try <= Last_In_Data then
3327                     Matched := Try (Next_Try);
3328                     exit when Matched;
3329                  end if;
3330               end loop;
3331            end if;
3332         end;
3333
3334      elsif Self.First /= ASCII.NUL then
3335         --  We know what char it must start with
3336
3337         declare
3338            Next_Try : Natural := Index (First_In_Data, Self.First);
3339
3340         begin
3341            while Next_Try /= 0 loop
3342               Matched := Try (Next_Try);
3343               exit when Matched;
3344               Next_Try := Index (Next_Try + 1, Self.First);
3345            end loop;
3346         end;
3347
3348      else
3349         --  Messy cases: try all locations (including for the empty string)
3350
3351         Matched := Try (First_In_Data);
3352
3353         if not Matched then
3354            for S in First_In_Data + 1 .. Last_In_Data loop
3355               Matched := Try (S);
3356               exit when Matched;
3357            end loop;
3358         end if;
3359      end if;
3360
3361      --  Matched has its value
3362
3363      for J in Last_Paren + 1 .. Matches'Last loop
3364         Matches_Full (J) := No_Match;
3365      end loop;
3366
3367      Matches := Matches_Full (Matches'Range);
3368      return;
3369   end Match;
3370
3371   -----------
3372   -- Match --
3373   -----------
3374
3375   function Match
3376     (Self       : Pattern_Matcher;
3377      Data       : String;
3378      Data_First : Integer := -1;
3379      Data_Last  : Positive := Positive'Last) return Natural
3380   is
3381      Matches : Match_Array (0 .. 0);
3382
3383   begin
3384      Match (Self, Data, Matches, Data_First, Data_Last);
3385      if Matches (0) = No_Match then
3386         return Data'First - 1;
3387      else
3388         return Matches (0).First;
3389      end if;
3390   end Match;
3391
3392   function Match
3393     (Self       : Pattern_Matcher;
3394      Data       : String;
3395      Data_First : Integer  := -1;
3396      Data_Last  : Positive := Positive'Last) return Boolean
3397   is
3398      Matches : Match_Array (0 .. 0);
3399
3400   begin
3401      Match (Self, Data, Matches, Data_First, Data_Last);
3402      return Matches (0).First >= Data'First;
3403   end Match;
3404
3405   procedure Match
3406     (Expression : String;
3407      Data       : String;
3408      Matches    : out Match_Array;
3409      Size       : Program_Size := 0;
3410      Data_First : Integer := -1;
3411      Data_Last  : Positive := Positive'Last)
3412   is
3413      PM            : Pattern_Matcher (Size);
3414      Finalize_Size : Program_Size;
3415
3416   begin
3417      if Size = 0 then
3418         Match (Compile (Expression), Data, Matches, Data_First, Data_Last);
3419      else
3420         Compile (PM, Expression, Finalize_Size);
3421         Match (PM, Data, Matches, Data_First, Data_Last);
3422      end if;
3423   end Match;
3424
3425   -----------
3426   -- Match --
3427   -----------
3428
3429   function  Match
3430     (Expression : String;
3431      Data       : String;
3432      Size       : Program_Size := 0;
3433      Data_First : Integer := -1;
3434      Data_Last  : Positive := Positive'Last) return Natural
3435   is
3436      PM         : Pattern_Matcher (Size);
3437      Final_Size : Program_Size; -- unused
3438
3439   begin
3440      if Size = 0 then
3441         return Match (Compile (Expression), Data, Data_First, Data_Last);
3442      else
3443         Compile (PM, Expression, Final_Size);
3444         return Match (PM, Data, Data_First, Data_Last);
3445      end if;
3446   end Match;
3447
3448   -----------
3449   -- Match --
3450   -----------
3451
3452   function  Match
3453     (Expression : String;
3454      Data       : String;
3455      Size       : Program_Size := 0;
3456      Data_First : Integer := -1;
3457      Data_Last  : Positive := Positive'Last) return Boolean
3458   is
3459      Matches    : Match_Array (0 .. 0);
3460      PM         : Pattern_Matcher (Size);
3461      Final_Size : Program_Size; -- unused
3462
3463   begin
3464      if Size = 0 then
3465         Match (Compile (Expression), Data, Matches, Data_First, Data_Last);
3466      else
3467         Compile (PM, Expression, Final_Size);
3468         Match (PM, Data, Matches, Data_First, Data_Last);
3469      end if;
3470
3471      return Matches (0).First >= Data'First;
3472   end Match;
3473
3474   -------------
3475   -- Operand --
3476   -------------
3477
3478   function Operand (P : Pointer) return Pointer is
3479   begin
3480      return P + 3;
3481   end Operand;
3482
3483   --------------
3484   -- Optimize --
3485   --------------
3486
3487   procedure Optimize (Self : in out Pattern_Matcher) is
3488      Max_Length  : Program_Size;
3489      This_Length : Program_Size;
3490      Longest     : Pointer;
3491      Scan        : Pointer;
3492      Program     : Program_Data renames Self.Program;
3493
3494   begin
3495      --  Start with safe defaults (no optimization):
3496      --    *  No known first character of match
3497      --    *  Does not necessarily start at beginning of line
3498      --    *  No string known that has to appear in data
3499
3500      Self.First := ASCII.NUL;
3501      Self.Anchored := False;
3502      Self.Must_Have := Program'Last + 1;
3503      Self.Must_Have_Length := 0;
3504
3505      Scan := Program_First + 1;  --  First instruction (can be anything)
3506
3507      if Program (Scan) = EXACT then
3508         Self.First := Program (String_Operand (Scan));
3509
3510      elsif Program (Scan) = BOL
3511        or else Program (Scan) = SBOL
3512        or else Program (Scan) = MBOL
3513      then
3514         Self.Anchored := True;
3515      end if;
3516
3517      --  If there's something expensive in the regexp, find the
3518      --  longest literal string that must appear and make it the
3519      --  regmust. Resolve ties in favor of later strings, since
3520      --  the regstart check works with the beginning of the regexp.
3521      --  and avoiding duplication strengthens checking. Not a
3522      --  strong reason, but sufficient in the absence of others.
3523
3524      if False then -- if Flags.SP_Start then ???
3525         Longest := 0;
3526         Max_Length := 0;
3527         while Scan /= 0 loop
3528            if Program (Scan) = EXACT or else Program (Scan) = EXACTF then
3529               This_Length := String_Length (Program, Scan);
3530
3531               if This_Length >= Max_Length then
3532                  Longest := String_Operand (Scan);
3533                  Max_Length := This_Length;
3534               end if;
3535            end if;
3536
3537            Scan := Get_Next (Program, Scan);
3538         end loop;
3539
3540         Self.Must_Have        := Longest;
3541         Self.Must_Have_Length := Natural (Max_Length) + 1;
3542      end if;
3543   end Optimize;
3544
3545   -----------------
3546   -- Paren_Count --
3547   -----------------
3548
3549   function Paren_Count (Regexp : Pattern_Matcher) return Match_Count is
3550   begin
3551      return Regexp.Paren_Count;
3552   end Paren_Count;
3553
3554   -----------
3555   -- Quote --
3556   -----------
3557
3558   function Quote (Str : String) return String is
3559      S    : String (1 .. Str'Length * 2);
3560      Last : Natural := 0;
3561
3562   begin
3563      for J in Str'Range loop
3564         case Str (J) is
3565            when '^' | '$' | '|' | '*' | '+' | '?' | '{' |
3566                 '}' | '[' | ']' | '(' | ')' | '\' =>
3567
3568               S (Last + 1) := '\';
3569               S (Last + 2) := Str (J);
3570               Last := Last + 2;
3571
3572            when others =>
3573               S (Last + 1) := Str (J);
3574               Last := Last + 1;
3575         end case;
3576      end loop;
3577
3578      return S (1 .. Last);
3579   end Quote;
3580
3581   ------------------
3582   -- Read_Natural --
3583   ------------------
3584
3585   function Read_Natural
3586     (Program : Program_Data;
3587      IP      : Pointer) return Natural
3588   is
3589   begin
3590      return Character'Pos (Program (IP)) +
3591               256 * Character'Pos (Program (IP + 1));
3592   end Read_Natural;
3593
3594   -----------------
3595   -- Reset_Class --
3596   -----------------
3597
3598   procedure Reset_Class (Bitmap : out Character_Class) is
3599   begin
3600      Bitmap := (others => 0);
3601   end Reset_Class;
3602
3603   ------------------
3604   -- Set_In_Class --
3605   ------------------
3606
3607   procedure Set_In_Class
3608     (Bitmap : in out Character_Class;
3609      C      : Character)
3610   is
3611      Value : constant Class_Byte := Character'Pos (C);
3612   begin
3613      Bitmap (Value / 8) := Bitmap (Value / 8)
3614        or Bit_Conversion (Value mod 8);
3615   end Set_In_Class;
3616
3617   -------------------
3618   -- String_Length --
3619   -------------------
3620
3621   function String_Length
3622     (Program : Program_Data;
3623      P       : Pointer) return Program_Size
3624   is
3625   begin
3626      pragma Assert (Program (P) = EXACT or else Program (P) = EXACTF);
3627      return Character'Pos (Program (P + 3));
3628   end String_Length;
3629
3630   --------------------
3631   -- String_Operand --
3632   --------------------
3633
3634   function String_Operand (P : Pointer) return Pointer is
3635   begin
3636      return P + 4;
3637   end String_Operand;
3638
3639end GNAT.Regpat;
3640