1------------------------------------------------------------------------------
2--                                                                          --
3--                         GNAT LIBRARY COMPONENTS                          --
4--                                                                          --
5--                G N A T . S P I T B O L . P A T T E R N S                 --
6--                                                                          --
7--                                 S p e c                                  --
8--                                                                          --
9--                     Copyright (C) 1997-2013, AdaCore                     --
10--                                                                          --
11-- GNAT is free software;  you can  redistribute it  and/or modify it under --
12-- terms of the  GNU General Public License as published  by the Free Soft- --
13-- ware  Foundation;  either version 3,  or (at your option) any later ver- --
14-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
15-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
16-- or FITNESS FOR A PARTICULAR PURPOSE.                                     --
17--                                                                          --
18-- As a special exception under Section 7 of GPL version 3, you are granted --
19-- additional permissions described in the GCC Runtime Library Exception,   --
20-- version 3.1, as published by the Free Software Foundation.               --
21--                                                                          --
22-- You should have received a copy of the GNU General Public License and    --
23-- a copy of the GCC Runtime Library Exception along with this program;     --
24-- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
25-- <http://www.gnu.org/licenses/>.                                          --
26--                                                                          --
27-- GNAT was originally developed  by the GNAT team at  New York University. --
28-- Extensive contributions were provided by Ada Core Technologies Inc.      --
29--                                                                          --
30------------------------------------------------------------------------------
31
32--  SPITBOL-like pattern construction and matching
33
34--  This child package of GNAT.SPITBOL provides a complete implementation
35--  of the SPITBOL-like pattern construction and matching operations. This
36--  package is based on Macro-SPITBOL created by Robert Dewar.
37
38------------------------------------------------------------
39-- Summary of Pattern Matching Packages in GNAT Hierarchy --
40------------------------------------------------------------
41
42--  There are three related packages that perform pattern matching functions.
43--  the following is an outline of these packages, to help you determine
44--  which is best for your needs.
45
46--     GNAT.Regexp (files g-regexp.ads/g-regexp.adb)
47--       This is a simple package providing Unix-style regular expression
48--       matching with the restriction that it matches entire strings. It
49--       is particularly useful for file name matching, and in particular
50--       it provides "globbing patterns" that are useful in implementing
51--       unix or DOS style wild card matching for file names.
52
53--     GNAT.Regpat (files g-regpat.ads/g-regpat.adb)
54--       This is a more complete implementation of Unix-style regular
55--       expressions, copied from the original V7 style regular expression
56--       library written in C by Henry Spencer. It is functionally the
57--       same as this library, and uses the same internal data structures
58--       stored in a binary compatible manner.
59
60--     GNAT.Spitbol.Patterns (files g-spipat.ads/g-spipat.adb)
61--       This is a completely general patterm matching package based on the
62--       pattern language of SNOBOL4, as implemented in SPITBOL. The pattern
63--       language is modeled on context free grammars, with context sensitive
64--       extensions that provide full (type 0) computational capabilities.
65
66with Ada.Strings.Maps; use Ada.Strings.Maps;
67with Ada.Text_IO;      use Ada.Text_IO;
68
69package GNAT.Spitbol.Patterns is
70   pragma Elaborate_Body;
71
72   -------------------------------
73   -- Pattern Matching Tutorial --
74   -------------------------------
75
76   --  A pattern matching operation (a call to one of the Match subprograms)
77   --  takes a subject string and a pattern, and optionally a replacement
78   --  string. The replacement string option is only allowed if the subject
79   --  is a variable.
80
81   --  The pattern is matched against the subject string, and either the
82   --  match fails, or it succeeds matching a contiguous substring. If a
83   --  replacement string is specified, then the subject string is modified
84   --  by replacing the matched substring with the given replacement.
85
86   --  Concatenation and Alternation
87   --  =============================
88
89   --    A pattern consists of a series of pattern elements. The pattern is
90   --    built up using either the concatenation operator:
91
92   --       A & B
93
94   --    which means match A followed immediately by matching B, or the
95   --    alternation operator:
96
97   --       A or B
98
99   --    which means first attempt to match A, and then if that does not
100   --    succeed, match B.
101
102   --    There is full backtracking, which means that if a given pattern
103   --    element fails to match, then previous alternatives are matched.
104   --    For example if we have the pattern:
105
106   --      (A or B) & (C or D) & (E or F)
107
108   --    First we attempt to match A, if that succeeds, then we go on to try
109   --    to match C, and if that succeeds, we go on to try to match E. If E
110   --    fails, then we try F. If F fails, then we go back and try matching
111   --    D instead of C. Let's make this explicit using a specific example,
112   --    and introducing the simplest kind of pattern element, which is a
113   --    literal string. The meaning of this pattern element is simply to
114   --    match the characters that correspond to the string characters. Now
115   --    let's rewrite the above pattern form with specific string literals
116   --    as the pattern elements:
117
118   --      ("ABC" or "AB") & ("DEF" or "CDE") & ("GH" or "IJ")
119
120   --    The following strings will be attempted in sequence:
121
122   --       ABC . DEF . GH
123   --       ABC . DEF . IJ
124   --       ABC . CDE . GH
125   --       ABC . CDE . IJ
126   --       AB . DEF . GH
127   --       AB . DEF . IJ
128   --       AB . CDE . GH
129   --       AB . CDE . IJ
130
131   --    Here we use the dot simply to separate the pieces of the string
132   --    matched by the three separate elements.
133
134   --  Moving the Start Point
135   --  ======================
136
137   --    A pattern is not required to match starting at the first character
138   --    of the string, and is not required to match to the end of the string.
139   --    The first attempt does indeed attempt to match starting at the first
140   --    character of the string, trying all the possible alternatives. But
141   --    if all alternatives fail, then the starting point of the match is
142   --    moved one character, and all possible alternatives are attempted at
143   --    the new anchor point.
144
145   --    The entire match fails only when every possible starting point has
146   --    been attempted. As an example, suppose that we had the subject
147   --    string
148
149   --      "ABABCDEIJKL"
150
151   --    matched using the pattern in the previous example:
152
153   --      ("ABC" or "AB") & ("DEF" or "CDE") & ("GH" or "IJ")
154
155   --    would succeed, after two anchor point moves:
156
157   --      "ABABCDEIJKL"
158   --         ^^^^^^^
159   --         matched
160   --         section
161
162   --    This mode of pattern matching is called the unanchored mode. It is
163   --    also possible to put the pattern matcher into anchored mode by
164   --    setting the global variable Anchored_Mode to True. This will cause
165   --    all subsequent matches to be performed in anchored mode, where the
166   --    match is required to start at the first character.
167
168   --    We will also see later how the effect of an anchored match can be
169   --    obtained for a single specified anchor point if this is desired.
170
171   --  Other Pattern Elements
172   --  ======================
173
174   --    In addition to strings (or single characters), there are many special
175   --    pattern elements that correspond to special predefined alternations:
176
177   --      Arb       Matches any string. First it matches the null string, and
178   --                then on a subsequent failure, matches one character, and
179   --                then two characters, and so on. It only fails if the
180   --                entire remaining string is matched.
181
182   --      Bal       Matches a non-empty string that is parentheses balanced
183   --                with respect to ordinary () characters. Examples of
184   --                balanced strings are "ABC", "A((B)C)", and "A(B)C(D)E".
185   --                Bal matches the shortest possible balanced string on the
186   --                first attempt, and if there is a subsequent failure,
187   --                attempts to extend the string.
188
189   --      Cancel    Immediately aborts the entire pattern match, signalling
190   --                failure. This is a specialized pattern element, which is
191   --                useful in conjunction with some of the special pattern
192   --                elements that have side effects.
193
194   --      Fail      The null alternation. Matches no possible strings, so it
195   --                always signals failure. This is a specialized pattern
196   --                element, which is useful in conjunction with some of the
197   --                special pattern elements that have side effects.
198
199   --      Fence     Matches the null string at first, and then if a failure
200   --                causes alternatives to be sought, aborts the match (like
201   --                a Cancel). Note that using Fence at the start of a pattern
202   --                has the same effect as matching in anchored mode.
203
204   --      Rest      Matches from the current point to the last character in
205   --                the string. This is a specialized pattern element, which
206   --                is useful in conjunction with some of the special pattern
207   --                elements that have side effects.
208
209   --      Succeed   Repeatedly matches the null string (it is equivalent to
210   --                the alternation ("" or "" or "" ....). This is a special
211   --                pattern element, which is useful in conjunction with some
212   --                of the special pattern elements that have side effects.
213
214   --  Pattern Construction Functions
215   --  ==============================
216
217   --    The following functions construct additional pattern elements
218
219   --      Any(S)    Where S is a string, matches a single character that is
220   --                any one of the characters in S. Fails if the current
221   --                character is not one of the given set of characters.
222
223   --      Arbno(P)  Where P is any pattern, matches any number of instances
224   --                of the pattern, starting with zero occurrences. It is
225   --                thus equivalent to ("" or (P & ("" or (P & ("" ....)))).
226   --                The pattern P may contain any number of pattern elements
227   --                including the use of alternation and concatenation.
228
229   --      Break(S)  Where S is a string, matches a string of zero or more
230   --                characters up to but not including a break character
231   --                that is one of the characters given in the string S.
232   --                Can match the null string, but cannot match the last
233   --                character in the string, since a break character is
234   --                required to be present.
235
236   --      BreakX(S) Where S is a string, behaves exactly like Break(S) when
237   --                it first matches, but if a string is successfully matched,
238   --                then a subsequent failure causes an attempt to extend the
239   --                matched string.
240
241   --      Fence(P)  Where P is a pattern, attempts to match the pattern P
242   --                including trying all possible alternatives of P. If none
243   --                of these alternatives succeeds, then the Fence pattern
244   --                fails. If one alternative succeeds, then the pattern
245   --                match proceeds, but on a subsequent failure, no attempt
246   --                is made to search for alternative matches of P. The
247   --                pattern P may contain any number of pattern elements
248   --                including the use of alternation and concatenation.
249
250   --      Len(N)    Where N is a natural number, matches the given number of
251   --                characters. For example, Len(10) matches any string that
252   --                is exactly ten characters long.
253
254   --      NotAny(S) Where S is a string, matches a single character that is
255   --                not one of the characters of S. Fails if the current
256   --                character is one of the given set of characters.
257
258   --      NSpan(S)  Where S is a string, matches a string of zero or more
259   --                characters that is among the characters given in the
260   --                string. Always matches the longest possible such string.
261   --                Always succeeds, since it can match the null string.
262
263   --      Pos(N)    Where N is a natural number, matches the null string
264   --                if exactly N characters have been matched so far, and
265   --                otherwise fails.
266
267   --      Rpos(N)   Where N is a natural number, matches the null string
268   --                if exactly N characters remain to be matched, and
269   --                otherwise fails.
270
271   --      Rtab(N)   Where N is a natural number, matches characters from
272   --                the current position until exactly N characters remain
273   --                to be matched in the string. Fails if fewer than N
274   --                unmatched characters remain in the string.
275
276   --      Tab(N)    Where N is a natural number, matches characters from
277   --                the current position until exactly N characters have
278   --                been matched in all. Fails if more than N characters
279   --                have already been matched.
280
281   --      Span(S)   Where S is a string, matches a string of one or more
282   --                characters that is among the characters given in the
283   --                string. Always matches the longest possible such string.
284   --                Fails if the current character is not one of the given
285   --                set of characters.
286
287   --  Recursive Pattern Matching
288   --  ==========================
289
290   --    The plus operator (+P) where P is a pattern variable, creates
291   --    a recursive pattern that will, at pattern matching time, follow
292   --    the pointer to obtain the referenced pattern, and then match this
293   --    pattern. This may be used to construct recursive patterns. Consider
294   --    for example:
295
296   --       P := ("A" or ("B" & (+P)))
297
298   --    On the first attempt, this pattern attempts to match the string "A".
299   --    If this fails, then the alternative matches a "B", followed by an
300   --    attempt to match P again. This second attempt first attempts to
301   --    match "A", and so on. The result is a pattern that will match a
302   --    string of B's followed by a single A.
303
304   --    This particular example could simply be written as NSpan('B') & 'A',
305   --    but the use of recursive patterns in the general case can construct
306   --    complex patterns which could not otherwise be built.
307
308   --  Pattern Assignment Operations
309   --  =============================
310
311   --    In addition to the overall result of a pattern match, which indicates
312   --    success or failure, it is often useful to be able to keep track of
313   --    the pieces of the subject string that are matched by individual
314   --    pattern elements, or subsections of the pattern.
315
316   --    The pattern assignment operators allow this capability. The first
317   --    form is the immediate assignment:
318
319   --       P * S
320
321   --    Here P is an arbitrary pattern, and S is a variable of type VString
322   --    that will be set to the substring matched by P. This assignment
323   --    happens during pattern matching, so if P matches more than once,
324   --    then the assignment happens more than once.
325
326   --    The deferred assignment operation:
327
328   --      P ** S
329
330   --    avoids these multiple assignments by deferring the assignment to the
331   --    end of the match. If the entire match is successful, and if the
332   --    pattern P was part of the successful match, then at the end of the
333   --    matching operation the assignment to S of the string matching P is
334   --    performed.
335
336   --    The cursor assignment operation:
337
338   --      Setcur(N'Access)
339
340   --    assigns the current cursor position to the natural variable N. The
341   --    cursor position is defined as the count of characters that have been
342   --    matched so far (including any start point moves).
343
344   --    Finally the operations * and ** may be used with values of type
345   --    Text_IO.File_Access. The effect is to do a Put_Line operation of
346   --    the matched substring. These are particularly useful in debugging
347   --    pattern matches.
348
349   --  Deferred Matching
350   --  =================
351
352   --    The pattern construction functions (such as Len and Any) all permit
353   --    the use of pointers to natural or string values, or functions that
354   --    return natural or string values. These forms cause the actual value
355   --    to be obtained at pattern matching time. This allows interesting
356   --    possibilities for constructing dynamic patterns as illustrated in
357   --    the examples section.
358
359   --    In addition the (+S) operator may be used where S is a pointer to
360   --    string or function returning string, with a similar deferred effect.
361
362   --    A special use of deferred matching is the construction of predicate
363   --    functions. The element (+P) where P is an access to a function that
364   --    returns a Boolean value, causes the function to be called at the
365   --    time the element is matched. If the function returns True, then the
366   --    null string is matched, if the function returns False, then failure
367   --    is signalled and previous alternatives are sought.
368
369   --  Deferred Replacement
370   --  ====================
371
372   --    The simple model given for pattern replacement (where the matched
373   --    substring is replaced by the string given as the third argument to
374   --    Match) works fine in simple cases, but this approach does not work
375   --    in the case where the expression used as the replacement string is
376   --    dependent on values set by the match.
377
378   --    For example, suppose we want to find an instance of a parenthesized
379   --    character, and replace the parentheses with square brackets. At first
380   --    glance it would seem that:
381
382   --      Match (Subject, '(' & Len (1) * Char & ')', '[' & Char & ']');
383
384   --    would do the trick, but that does not work, because the third
385   --    argument to Match gets evaluated too early, before the call to
386   --    Match, and before the pattern match has had a chance to set Char.
387
388   --    To solve this problem we provide the deferred replacement capability.
389   --    With this approach, which of course is only needed if the pattern
390   --    involved has side effects, is to do the match in two stages. The
391   --    call to Match sets a pattern result in a variable of the private
392   --    type Match_Result, and then a subsequent Replace operation uses
393   --    this Match_Result object to perform the required replacement.
394
395   --    Using this approach, we can now write the above operation properly
396   --    in a manner that will work:
397
398   --      M : Match_Result;
399   --      ...
400   --      Match (Subject, '(' & Len (1) * Char & ')', M);
401   --      Replace (M, '[' & Char & ']');
402
403   --    As with other Match cases, there is a function and procedure form
404   --    of this match call. A call to Replace after a failed match has no
405   --    effect. Note that Subject should not be modified between the calls.
406
407   --  Examples of Pattern Matching
408   --  ============================
409
410   --    First a simple example of the use of pattern replacement to remove
411   --    a line number from the start of a string. We assume that the line
412   --    number has the form of a string of decimal digits followed by a
413   --    period, followed by one or more spaces.
414
415   --       Digs : constant Pattern := Span("0123456789");
416
417   --       Lnum : constant Pattern := Pos(0) & Digs & '.' & Span(' ');
418
419   --    Now to use this pattern we simply do a match with a replacement:
420
421   --       Match (Line, Lnum, "");
422
423   --    which replaces the line number by the null string. Note that it is
424   --    also possible to use an Ada.Strings.Maps.Character_Set value as an
425   --    argument to Span and similar functions, and in particular all the
426   --    useful constants 'in Ada.Strings.Maps.Constants are available. This
427   --    means that we could define Digs as:
428
429   --       Digs : constant Pattern := Span(Decimal_Digit_Set);
430
431   --    The style we use here, of defining constant patterns and then using
432   --    them is typical. It is possible to build up patterns dynamically,
433   --    but it is usually more efficient to build them in pieces in advance
434   --    using constant declarations. Note in particular that although it is
435   --    possible to construct a pattern directly as an argument for the
436   --    Match routine, it is much more efficient to preconstruct the pattern
437   --    as we did in this example.
438
439   --    Now let's look at the use of pattern assignment to break a
440   --    string into sections. Suppose that the input string has two
441   --    unsigned decimal integers, separated by spaces or a comma,
442   --    with spaces allowed anywhere. Then we can isolate the two
443   --    numbers with the following pattern:
444
445   --       Num1, Num2 : aliased VString;
446
447   --       B : constant Pattern := NSpan(' ');
448
449   --       N : constant Pattern := Span("0123456789");
450
451   --       T : constant Pattern :=
452   --             NSpan(' ') & N * Num1 & Span(" ,") & N * Num2;
453
454   --    The match operation Match (" 124, 257  ", T) would assign the
455   --    string 124 to Num1 and the string 257 to Num2.
456
457   --    Now let's see how more complex elements can be built from the
458   --    set of primitive elements. The following pattern matches strings
459   --    that have the syntax of Ada 95 based literals:
460
461   --       Digs  : constant Pattern := Span(Decimal_Digit_Set);
462   --       UDigs : constant Pattern := Digs & Arbno('_' & Digs);
463
464   --       Edig  : constant Pattern := Span(Hexadecimal_Digit_Set);
465   --       UEdig : constant Pattern := Edig & Arbno('_' & Edig);
466
467   --       Bnum  : constant Pattern := Udigs & '#' & UEdig & '#';
468
469   --    A match against Bnum will now match the desired strings, e.g.
470   --    it will match 16#123_abc#, but not a#b#. However, this pattern
471   --    is not quite complete, since it does not allow colons to replace
472   --    the pound signs. The following is more complete:
473
474   --       Bchar : constant Pattern := Any("#:");
475   --       Bnum  : constant Pattern := Udigs & Bchar & UEdig & Bchar;
476
477   --    but that is still not quite right, since it allows # and : to be
478   --    mixed, and they are supposed to be used consistently. We solve
479   --    this by using a deferred match.
480
481   --       Temp  : aliased VString;
482
483   --       Bnum  : constant Pattern :=
484   --                 Udigs & Bchar * Temp & UEdig & (+Temp)
485
486   --    Here the first instance of the base character is stored in Temp, and
487   --    then later in the pattern we rematch the value that was assigned.
488
489   --    For an example of a recursive pattern, let's define a pattern
490   --    that is like the built in Bal, but the string matched is balanced
491   --    with respect to square brackets or curly brackets.
492
493   --    The language for such strings might be defined in extended BNF as
494
495   --      ELEMENT ::= <any character other than [] or {}>
496   --                  | '[' BALANCED_STRING ']'
497   --                  | '{' BALANCED_STRING '}'
498
499   --      BALANCED_STRING ::= ELEMENT {ELEMENT}
500
501   --    Here we use {} to indicate zero or more occurrences of a term, as
502   --    is common practice in extended BNF. Now we can translate the above
503   --    BNF into recursive patterns as follows:
504
505   --      Element, Balanced_String : aliased Pattern;
506   --      .
507   --      .
508   --      .
509   --      Element := NotAny ("[]{}")
510   --                   or
511   --                 ('[' & (+Balanced_String) & ']')
512   --                   or
513   --                 ('{' & (+Balanced_String) & '}');
514
515   --      Balanced_String := Element & Arbno (Element);
516
517   --    Note the important use of + here to refer to a pattern not yet
518   --    defined. Note also that we use assignments precisely because we
519   --    cannot refer to as yet undeclared variables in initializations.
520
521   --    Now that this pattern is constructed, we can use it as though it
522   --    were a new primitive pattern element, and for example, the match:
523
524   --      Match ("xy[ab{cd}]", Balanced_String * Current_Output & Fail);
525
526   --    will generate the output:
527
528   --       x
529   --       xy
530   --       xy[ab{cd}]
531   --       y
532   --       y[ab{cd}]
533   --       [ab{cd}]
534   --       a
535   --       ab
536   --       ab{cd}
537   --       b
538   --       b{cd}
539   --       {cd}
540   --       c
541   --       cd
542   --       d
543
544   --    Note that the function of the fail here is simply to force the
545   --    pattern Balanced_String to match all possible alternatives. Studying
546   --    the operation of this pattern in detail is highly instructive.
547
548   --    Finally we give a rather elaborate example of the use of deferred
549   --    matching. The following declarations build up a pattern which will
550   --    find the longest string of decimal digits in the subject string.
551
552   --       Max, Cur : VString;
553   --       Loc      : Natural;
554
555   --       function GtS return Boolean is
556   --       begin
557   --          return Length (Cur) > Length (Max);
558   --       end GtS;
559
560   --       Digit : constant Character_Set := Decimal_Digit_Set;
561
562   --       Digs  : constant Pattern := Span(Digit);
563
564   --       Find : constant Pattern :=
565   --         "" * Max & Fence            & -- initialize Max to null
566   --         BreakX (Digit)              & -- scan looking for digits
567   --         ((Span(Digit) * Cur         & -- assign next string to Cur
568   --          (+GtS'Unrestricted_Access) & -- check size(Cur) > Size(Max)
569   --          Setcur(Loc'Access))          -- if so, save location
570   --                   * Max)            & -- and assign to Max
571   --         Fail;                         -- seek all alternatives
572
573   --    As we see from the comments here, complex patterns like this take
574   --    on aspects of sequential programs. In fact they are sequential
575   --    programs with general backtracking. In this pattern, we first use
576   --    a pattern assignment that matches null and assigns it to Max, so
577   --    that it is initialized for the new match. Now BreakX scans to the
578   --    next digit. Arb would do here, but BreakX will be more efficient.
579   --    Once we have found a digit, we scan out the longest string of
580   --    digits with Span, and assign it to Cur. The deferred call to GtS
581   --    tests if the string we assigned to Cur is the longest so far. If
582   --    not, then failure is signalled, and we seek alternatives (this
583   --    means that BreakX will extend and look for the next digit string).
584   --    If the call to GtS succeeds then the matched string is assigned
585   --    as the largest string so far into Max and its location is saved
586   --    in Loc. Finally Fail forces the match to fail and seek alternatives,
587   --    so that the entire string is searched.
588
589   --    If the pattern Find is matched against a string, the variable Max
590   --    at the end of the pattern will have the longest string of digits,
591   --    and Loc will be the starting character location of the string. For
592   --    example, Match("ab123cd4657ef23", Find) will assign "4657" to Max
593   --    and 11 to Loc (indicating that the string ends with the eleventh
594   --    character of the string).
595
596   --    Note: the use of Unrestricted_Access to reference GtS will not
597   --    be needed if GtS is defined at the outer level, but definitely
598   --    will be necessary if GtS is a nested function (in which case of
599   --    course the scope of the pattern Find will be restricted to this
600   --    nested scope, and this cannot be checked, i.e. use of the pattern
601   --    outside this scope is erroneous). Generally it is a good idea to
602   --    define patterns and the functions they call at the outer level
603   --    where possible, to avoid such problems.
604
605   --  Correspondence with Pattern Matching in SPITBOL
606   --  ===============================================
607
608   --    Generally the Ada syntax and names correspond closely to SPITBOL
609   --    syntax for pattern matching construction.
610
611   --      The basic pattern construction operators are renamed as follows:
612
613   --          Spitbol     Ada
614
615   --          (space)      &
616   --             |         or
617   --             $         *
618   --             .         **
619
620   --      The Ada operators were chosen so that the relative precedences of
621   --      these operators corresponds to that of the Spitbol operators, but
622   --      as always, the use of parentheses is advisable to clarify.
623
624   --    The pattern construction operators all have similar names except for
625
626   --          Spitbol      Ada
627
628   --          Abort        Cancel
629   --          Rem          Rest
630
631   --    where we have clashes with Ada reserved names
632
633   --    Ada requires the use of 'Access to refer to functions used in the
634   --    pattern match, and often the use of 'Unrestricted_Access may be
635   --    necessary to get around the scope restrictions if the functions
636   --    are not declared at the outer level.
637
638   --    The actual pattern matching syntax is modified in Ada as follows:
639
640   --          Spitbol      Ada
641
642   --          X Y          Match (X, Y);
643   --          X Y = Z      Match (X, Y, Z);
644
645   --    and pattern failure is indicated by returning a Boolean result from
646   --    the Match function (True for success, False for failure).
647
648   -----------------------
649   -- Type Declarations --
650   -----------------------
651
652   type Pattern is private;
653   --  Type representing a pattern. This package provides a complete set of
654   --  operations for constructing patterns that can be used in the pattern
655   --  matching operations provided.
656
657   type Boolean_Func is access function return Boolean;
658   --  General Boolean function type. When this type is used as a formal
659   --  parameter type in this package, it indicates a deferred predicate
660   --  pattern. The function will be called when the pattern element is
661   --  matched and failure signalled if False is returned.
662
663   type Natural_Func is access function return Natural;
664   --  General Natural function type. When this type is used as a formal
665   --  parameter type in this package, it indicates a deferred pattern.
666   --  The function will be called when the pattern element is matched
667   --  to obtain the currently referenced Natural value.
668
669   type VString_Func is access function return VString;
670   --  General VString function type. When this type is used as a formal
671   --  parameter type in this package, it indicates a deferred pattern.
672   --  The function will be called when the pattern element is matched
673   --  to obtain the currently referenced string value.
674
675   subtype PString is String;
676   --  This subtype is used in the remainder of the package to indicate a
677   --  formal parameter that is converted to its corresponding pattern,
678   --  i.e. a pattern that matches the characters of the string.
679
680   subtype PChar is Character;
681   --  Similarly, this subtype is used in the remainder of the package to
682   --  indicate a formal parameter that is converted to its corresponding
683   --  pattern, i.e. a pattern that matches this one character.
684
685   subtype VString_Var is VString;
686   subtype Pattern_Var is Pattern;
687   --  These synonyms are used as formal parameter types to a function where,
688   --  if the language allowed, we would use in out parameters, but we are
689   --  not allowed to have in out parameters for functions. Instead we pass
690   --  actuals which must be variables, and with a bit of trickery in the
691   --  body, manage to interpret them properly as though they were indeed
692   --  in out parameters.
693
694   pragma Warnings (Off, VString_Var);
695   pragma Warnings (Off, Pattern_Var);
696   --  We turn off warnings for these two types so that when variables are used
697   --  as arguments in this context, warnings about them not being assigned in
698   --  the source program will be suppressed.
699
700   --------------------------------
701   -- Basic Pattern Construction --
702   --------------------------------
703
704   function "&"  (L : Pattern; R : Pattern) return Pattern;
705   function "&"  (L : PString; R : Pattern) return Pattern;
706   function "&"  (L : Pattern; R : PString) return Pattern;
707   function "&"  (L : PChar;   R : Pattern) return Pattern;
708   function "&"  (L : Pattern; R : PChar)   return Pattern;
709
710   --  Pattern concatenation. Matches L followed by R
711
712   function "or" (L : Pattern; R : Pattern) return Pattern;
713   function "or" (L : PString; R : Pattern) return Pattern;
714   function "or" (L : Pattern; R : PString) return Pattern;
715   function "or" (L : PString; R : PString) return Pattern;
716   function "or" (L : PChar;   R : Pattern) return Pattern;
717   function "or" (L : Pattern; R : PChar)   return Pattern;
718   function "or" (L : PChar;   R : PChar)   return Pattern;
719   function "or" (L : PString; R : PChar)   return Pattern;
720   function "or" (L : PChar;   R : PString) return Pattern;
721   --  Pattern alternation. Creates a pattern that will first try to match
722   --  L and then on a subsequent failure, attempts to match R instead.
723
724   ----------------------------------
725   -- Pattern Assignment Functions --
726   ----------------------------------
727
728   function "*" (P : Pattern; Var : VString_Var)  return Pattern;
729   function "*" (P : PString; Var : VString_Var)  return Pattern;
730   function "*" (P : PChar;   Var : VString_Var)  return Pattern;
731   --  Matches P, and if the match succeeds, assigns the matched substring
732   --  to the given VString variable Var. This assignment happens as soon as
733   --  the substring is matched, and if the pattern P1 is matched more than
734   --  once during the course of the match, then the assignment will occur
735   --  more than once.
736
737   function "**" (P : Pattern; Var : VString_Var) return Pattern;
738   function "**" (P : PString; Var : VString_Var) return Pattern;
739   function "**" (P : PChar;   Var : VString_Var) return Pattern;
740   --  Like "*" above, except that the assignment happens at most once
741   --  after the entire match is completed successfully. If the match
742   --  fails, then no assignment takes place.
743
744   ----------------------------------
745   -- Deferred Matching Operations --
746   ----------------------------------
747
748   function "+" (Str : VString_Var)  return Pattern;
749   --  Here Str must be a VString variable. This function constructs a
750   --  pattern which at pattern matching time will access the current
751   --  value of this variable, and match against these characters.
752
753   function "+" (Str : VString_Func) return Pattern;
754   --  Constructs a pattern which at pattern matching time calls the given
755   --  function, and then matches against the string or character value
756   --  that is returned by the call.
757
758   function "+" (P : Pattern_Var)    return Pattern;
759   --  Here P must be a Pattern variable. This function constructs a
760   --  pattern which at pattern matching time will access the current
761   --  value of this variable, and match against the pattern value.
762
763   function "+" (P : Boolean_Func)   return Pattern;
764   --  Constructs a predicate pattern function that at pattern matching time
765   --  calls the given function. If True is returned, then the pattern matches.
766   --  If False is returned, then failure is signalled.
767
768   --------------------------------
769   -- Pattern Building Functions --
770   --------------------------------
771
772   function Arb                                             return Pattern;
773   --  Constructs a pattern that will match any string. On the first attempt,
774   --  the pattern matches a null string, then on each successive failure, it
775   --  matches one more character, and only fails if matching the entire rest
776   --  of the string.
777
778   function Arbno  (P : Pattern)                            return Pattern;
779   function Arbno  (P : PString)                            return Pattern;
780   function Arbno  (P : PChar)                              return Pattern;
781   --  Pattern repetition. First matches null, then on a subsequent failure
782   --  attempts to match an additional instance of the given pattern.
783   --  Equivalent to (but more efficient than) P & ("" or (P & ("" or ...
784
785   function Any    (Str : String)                           return Pattern;
786   function Any    (Str : VString)                          return Pattern;
787   function Any    (Str : Character)                        return Pattern;
788   function Any    (Str : Character_Set)                    return Pattern;
789   function Any    (Str : not null access VString)          return Pattern;
790   function Any    (Str : VString_Func)                     return Pattern;
791   --  Constructs a pattern that matches a single character that is one of
792   --  the characters in the given argument. The pattern fails if the current
793   --  character is not in Str.
794
795   function Bal                                             return Pattern;
796   --  Constructs a pattern that will match any non-empty string that is
797   --  parentheses balanced with respect to the normal parentheses characters.
798   --  Attempts to extend the string if a subsequent failure occurs.
799
800   function Break  (Str : String)                           return Pattern;
801   function Break  (Str : VString)                          return Pattern;
802   function Break  (Str : Character)                        return Pattern;
803   function Break  (Str : Character_Set)                    return Pattern;
804   function Break  (Str : not null access VString)          return Pattern;
805   function Break  (Str : VString_Func)                     return Pattern;
806   --  Constructs a pattern that matches a (possibly null) string which
807   --  is immediately followed by a character in the given argument. This
808   --  character is not part of the matched string. The pattern fails if
809   --  the remaining characters to be matched do not include any of the
810   --  characters in Str.
811
812   function BreakX (Str : String)                           return Pattern;
813   function BreakX (Str : VString)                          return Pattern;
814   function BreakX (Str : Character)                        return Pattern;
815   function BreakX (Str : Character_Set)                    return Pattern;
816   function BreakX (Str : not null access VString)          return Pattern;
817   function BreakX (Str : VString_Func)                     return Pattern;
818   --  Like Break, but the pattern attempts to extend on a failure to find
819   --  the next occurrence of a character in Str, and only fails when the
820   --  last such instance causes a failure.
821
822   function Cancel                                          return Pattern;
823   --  Constructs a pattern that immediately aborts the entire match
824
825   function Fail                                            return Pattern;
826   --  Constructs a pattern that always fails
827
828   function Fence                                           return Pattern;
829   --  Constructs a pattern that matches null on the first attempt, and then
830   --  causes the entire match to be aborted if a subsequent failure occurs.
831
832   function Fence  (P : Pattern)                            return Pattern;
833   --  Constructs a pattern that first matches P. If P fails, then the
834   --  constructed pattern fails. If P succeeds, then the match proceeds,
835   --  but if subsequent failure occurs, alternatives in P are not sought.
836   --  The idea of Fence is that each time the pattern is matched, just
837   --  one attempt is made to match P, without trying alternatives.
838
839   function Len    (Count : Natural)                        return Pattern;
840   function Len    (Count : not null access Natural)        return Pattern;
841   function Len    (Count : Natural_Func)                   return Pattern;
842   --  Constructs a pattern that matches exactly the given number of
843   --  characters. The pattern fails if fewer than this number of characters
844   --  remain to be matched in the string.
845
846   function NotAny (Str : String)                           return Pattern;
847   function NotAny (Str : VString)                          return Pattern;
848   function NotAny (Str : Character)                        return Pattern;
849   function NotAny (Str : Character_Set)                    return Pattern;
850   function NotAny (Str : not null access VString)          return Pattern;
851   function NotAny (Str : VString_Func)                     return Pattern;
852   --  Constructs a pattern that matches a single character that is not
853   --  one of the characters in the given argument. The pattern Fails if
854   --  the current character is in Str.
855
856   function NSpan  (Str : String)                           return Pattern;
857   function NSpan  (Str : VString)                          return Pattern;
858   function NSpan  (Str : Character)                        return Pattern;
859   function NSpan  (Str : Character_Set)                    return Pattern;
860   function NSpan  (Str : not null access VString)          return Pattern;
861   function NSpan  (Str : VString_Func)                     return Pattern;
862   --  Constructs a pattern that matches the longest possible string
863   --  consisting entirely of characters from the given argument. The
864   --  string may be empty, so this pattern always succeeds.
865
866   function Pos    (Count : Natural)                        return Pattern;
867   function Pos    (Count : not null access Natural)        return Pattern;
868   function Pos    (Count : Natural_Func)                   return Pattern;
869   --  Constructs a pattern that matches the null string if exactly Count
870   --  characters have already been matched, and otherwise fails.
871
872   function Rest                                            return Pattern;
873   --  Constructs a pattern that always succeeds, matching the remaining
874   --  unmatched characters in the pattern.
875
876   function Rpos   (Count : Natural)                        return Pattern;
877   function Rpos   (Count : not null access Natural)        return Pattern;
878   function Rpos   (Count : Natural_Func)                   return Pattern;
879   --  Constructs a pattern that matches the null string if exactly Count
880   --  characters remain to be matched in the string, and otherwise fails.
881
882   function Rtab   (Count : Natural)                        return Pattern;
883   function Rtab   (Count : not null access Natural)        return Pattern;
884   function Rtab   (Count : Natural_Func)                   return Pattern;
885   --  Constructs a pattern that matches from the current location until
886   --  exactly Count characters remain to be matched in the string. The
887   --  pattern fails if fewer than Count characters remain to be matched.
888
889   function Setcur (Var : not null access Natural)          return Pattern;
890   --  Constructs a pattern that matches the null string, and assigns the
891   --  current cursor position in the string. This value is the number of
892   --  characters matched so far. So it is zero at the start of the match.
893
894   function Span   (Str : String)                           return Pattern;
895   function Span   (Str : VString)                          return Pattern;
896   function Span   (Str : Character)                        return Pattern;
897   function Span   (Str : Character_Set)                    return Pattern;
898   function Span   (Str : not null access VString)          return Pattern;
899   function Span   (Str : VString_Func)                     return Pattern;
900   --  Constructs a pattern that matches the longest possible string
901   --  consisting entirely of characters from the given argument. The
902   --  string cannot be empty , so the pattern fails if the current
903   --  character is not one of the characters in Str.
904
905   function Succeed                                         return Pattern;
906   --  Constructs a pattern that succeeds matching null, both on the first
907   --  attempt, and on any rematch attempt, i.e. it is equivalent to an
908   --  infinite alternation of null strings.
909
910   function Tab    (Count : Natural)                        return Pattern;
911   function Tab    (Count : not null access Natural)        return Pattern;
912   function Tab    (Count : Natural_Func)                   return Pattern;
913   --  Constructs a pattern that from the current location until Count
914   --  characters have been matched. The pattern fails if more than Count
915   --  characters have already been matched.
916
917   ---------------------------------
918   -- Pattern Matching Operations --
919   ---------------------------------
920
921   --  The Match function performs an actual pattern matching operation.
922   --  The versions with three parameters perform a match without modifying
923   --  the subject string and return a Boolean result indicating if the
924   --  match is successful or not. The Anchor parameter is set to True to
925   --  obtain an anchored match in which the pattern is required to match
926   --  the first character of the string. In an unanchored match, which is
927
928   --  the default, successive attempts are made to match the given pattern
929   --  at each character of the subject string until a match succeeds, or
930   --  until all possibilities have failed.
931
932   --  Note that pattern assignment functions in the pattern may generate
933   --  side effects, so these functions are not necessarily pure.
934
935   Anchored_Mode : Boolean := False;
936   --  This global variable can be set True to cause all subsequent pattern
937   --  matches to operate in anchored mode. In anchored mode, no attempt is
938   --  made to move the anchor point, so that if the match succeeds it must
939   --  succeed starting at the first character. Note that the effect of
940   --  anchored mode may be achieved in individual pattern matches by using
941   --  Fence or Pos(0) at the start of the pattern.
942
943   Pattern_Stack_Overflow : exception;
944   --  Exception raised if internal pattern matching stack overflows. This
945   --  is typically the result of runaway pattern recursion. If there is a
946   --  genuine case of stack overflow, then either the match must be broken
947   --  down into simpler steps, or the stack limit must be reset.
948
949   Stack_Size : constant Positive := 2000;
950   --  Size used for internal pattern matching stack. Increase this size if
951   --  complex patterns cause Pattern_Stack_Overflow to be raised.
952
953   --  Simple match functions. The subject is matched against the pattern.
954   --  Any immediate or deferred assignments or writes are executed, and
955   --  the returned value indicates whether or not the match succeeded.
956
957   function Match
958     (Subject : VString;
959      Pat     : Pattern) return Boolean;
960
961   function Match
962     (Subject : VString;
963      Pat     : PString) return Boolean;
964
965   function Match
966     (Subject : String;
967      Pat     : Pattern) return Boolean;
968
969   function Match
970     (Subject : String;
971      Pat     : PString) return Boolean;
972
973   --  Replacement functions. The subject is matched against the pattern.
974   --  Any immediate or deferred assignments or writes are executed, and
975   --  the returned value indicates whether or not the match succeeded.
976   --  If the match succeeds, then the matched part of the subject string
977   --  is replaced by the given Replace string.
978
979   function Match
980     (Subject : VString_Var;
981      Pat     : Pattern;
982      Replace : VString) return Boolean;
983
984   function Match
985     (Subject : VString_Var;
986      Pat     : PString;
987      Replace : VString) return Boolean;
988
989   function Match
990     (Subject : VString_Var;
991      Pat     : Pattern;
992      Replace : String) return Boolean;
993
994   function Match
995     (Subject : VString_Var;
996      Pat     : PString;
997      Replace : String) return Boolean;
998
999   --  Simple match procedures. The subject is matched against the pattern.
1000   --  Any immediate or deferred assignments or writes are executed. No
1001   --  indication of success or failure is returned.
1002
1003   procedure Match
1004     (Subject : VString;
1005      Pat     : Pattern);
1006
1007   procedure Match
1008     (Subject : VString;
1009      Pat     : PString);
1010
1011   procedure Match
1012     (Subject : String;
1013      Pat     : Pattern);
1014
1015   procedure Match
1016     (Subject : String;
1017      Pat     : PString);
1018
1019   --  Replacement procedures. The subject is matched against the pattern.
1020   --  Any immediate or deferred assignments or writes are executed. No
1021   --  indication of success or failure is returned. If the match succeeds,
1022   --  then the matched part of the subject string is replaced by the given
1023   --  Replace string.
1024
1025   procedure Match
1026     (Subject : in out VString;
1027      Pat     : Pattern;
1028      Replace : VString);
1029
1030   procedure Match
1031     (Subject : in out VString;
1032      Pat     : PString;
1033      Replace : VString);
1034
1035   procedure Match
1036     (Subject : in out VString;
1037      Pat     : Pattern;
1038      Replace : String);
1039
1040   procedure Match
1041     (Subject : in out VString;
1042      Pat     : PString;
1043      Replace : String);
1044
1045   --  Deferred Replacement
1046
1047   type Match_Result is private;
1048   --  Type used to record result of pattern match
1049
1050   subtype Match_Result_Var is Match_Result;
1051   --  This synonyms is used as a formal parameter type to a function where,
1052   --  if the language allowed, we would use an in out parameter, but we are
1053   --  not allowed to have in out parameters for functions. Instead we pass
1054   --  actuals which must be variables, and with a bit of trickery in the
1055   --  body, manage to interpret them properly as though they were indeed
1056   --  in out parameters.
1057
1058   function Match
1059     (Subject : VString_Var;
1060      Pat     : Pattern;
1061      Result  : Match_Result_Var) return Boolean;
1062
1063   procedure Match
1064     (Subject : in out VString;
1065      Pat     : Pattern;
1066      Result  : out Match_Result);
1067
1068   procedure Replace
1069     (Result  : in out Match_Result;
1070      Replace : VString);
1071   --  Given a previous call to Match which set Result, performs a pattern
1072   --  replacement if the match was successful. Has no effect if the match
1073   --  failed. This call should immediately follow the Match call.
1074
1075   ------------------------
1076   -- Debugging Routines --
1077   ------------------------
1078
1079   --  Debugging pattern matching operations can often be quite complex,
1080   --  since there is no obvious way to trace the progress of the match.
1081   --  The declarations in this section provide some debugging assistance.
1082
1083   Debug_Mode : Boolean := False;
1084   --  This global variable can be set True to generate debugging on all
1085   --  subsequent calls to Match. The debugging output is a full trace of
1086   --  the actions of the pattern matcher, written to Standard_Output. The
1087   --  level of this information is intended to be comprehensible at the
1088   --  abstract level of this package declaration. However, note that the
1089   --  use of this switch often generates large amounts of output.
1090
1091   function "*"  (P : Pattern; Fil : File_Access)           return Pattern;
1092   function "*"  (P : PString; Fil : File_Access)           return Pattern;
1093   function "*"  (P : PChar;   Fil : File_Access)           return Pattern;
1094   function "**" (P : Pattern; Fil : File_Access)           return Pattern;
1095   function "**" (P : PString; Fil : File_Access)           return Pattern;
1096   function "**" (P : PChar;   Fil : File_Access)           return Pattern;
1097   --  These are similar to the corresponding pattern assignment operations
1098   --  except that instead of setting the value of a variable, the matched
1099   --  substring is written to the appropriate file. This can be useful in
1100   --  following the progress of a match without generating the full amount
1101   --  of information obtained by setting Debug_Mode to True.
1102
1103   Terminal : constant File_Access := Standard_Error;
1104   Output   : constant File_Access := Standard_Output;
1105   --  Two handy synonyms for use with the above pattern write operations
1106
1107   --  Finally we have some routines that are useful for determining what
1108   --  patterns are in use, particularly if they are constructed dynamically.
1109
1110   function Image (P : Pattern) return String;
1111   function Image (P : Pattern) return VString;
1112   --  This procedures yield strings that corresponds to the syntax needed
1113   --  to create the given pattern using the functions in this package. The
1114   --  form of this string is such that it could actually be compiled and
1115   --  evaluated to yield the required pattern except for references to
1116   --  variables and functions, which are output using one of the following
1117   --  forms:
1118   --
1119   --     access Natural     NP(16#...#)
1120   --     access Pattern     PP(16#...#)
1121   --     access VString     VP(16#...#)
1122   --
1123   --     Natural_Func       NF(16#...#)
1124   --     VString_Func       VF(16#...#)
1125   --
1126   --  where 16#...# is the hex representation of the integer address that
1127   --  corresponds to the given access value
1128
1129   procedure Dump (P : Pattern);
1130   --  This procedure writes information about the pattern to Standard_Out.
1131   --  The format of this information is keyed to the internal data structures
1132   --  used to implement patterns. The information provided by Dump is thus
1133   --  more precise than that yielded by Image, but is also a bit more obscure
1134   --  (i.e. it cannot be interpreted solely in terms of this spec, you have
1135   --  to know something about the data structures).
1136
1137   ------------------
1138   -- Private Part --
1139   ------------------
1140
1141private
1142   type PE;
1143   --  Pattern element, a pattern is a complex structure of PE's. This type
1144   --  is defined and described in the body of this package.
1145
1146   type PE_Ptr is access all PE;
1147   --  Pattern reference. PE's use PE_Ptr values to reference other PE's
1148
1149   type Pattern is new Controlled with record
1150      Stk : Natural := 0;
1151      --  Maximum number of stack entries required for matching this
1152      --  pattern. See description of pattern history stack in body.
1153
1154      P : PE_Ptr := null;
1155      --  Pointer to initial pattern element for pattern
1156   end record;
1157
1158   pragma Finalize_Storage_Only (Pattern);
1159
1160   procedure Adjust (Object : in out Pattern);
1161   --  Adjust routine used to copy pattern objects
1162
1163   procedure Finalize (Object : in out Pattern);
1164   --  Finalization routine used to release storage allocated for a pattern
1165
1166   type VString_Ptr is access all VString;
1167
1168   type Match_Result is record
1169      Var : VString_Ptr;
1170      --  Pointer to subject string. Set to null if match failed
1171
1172      Start : Natural := 1;
1173      --  Starting index position (1's origin) of matched section of
1174      --  subject string. Only valid if Var is non-null.
1175
1176      Stop : Natural := 0;
1177      --  Ending index position (1's origin) of matched section of
1178      --  subject string. Only valid if Var is non-null.
1179
1180   end record;
1181
1182   pragma Volatile (Match_Result);
1183   --  This ensures that the Result parameter is passed by reference, so
1184   --  that we can play our games with the bogus Match_Result_Var parameter
1185   --  in the function case to treat it as though it were an in out parameter.
1186
1187end GNAT.Spitbol.Patterns;
1188