1------------------------------------------------------------------------------
2--                                                                          --
3--                            Matreshka Project                             --
4--                                                                          --
5--         Localization, Internationalization, Globalization for Ada        --
6--                                                                          --
7--                        Runtime Library Component                         --
8--                                                                          --
9------------------------------------------------------------------------------
10--                                                                          --
11-- Copyright © 2010, Vadim Godunko <vgodunko@gmail.com>                     --
12-- All rights reserved.                                                     --
13--                                                                          --
14-- Redistribution and use in source and binary forms, with or without       --
15-- modification, are permitted provided that the following conditions       --
16-- are met:                                                                 --
17--                                                                          --
18--  * Redistributions of source code must retain the above copyright        --
19--    notice, this list of conditions and the following disclaimer.         --
20--                                                                          --
21--  * Redistributions in binary form must reproduce the above copyright     --
22--    notice, this list of conditions and the following disclaimer in the   --
23--    documentation and/or other materials provided with the distribution.  --
24--                                                                          --
25--  * Neither the name of the Vadim Godunko, IE nor the names of its        --
26--    contributors may be used to endorse or promote products derived from  --
27--    this software without specific prior written permission.              --
28--                                                                          --
29-- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS      --
30-- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT        --
31-- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR    --
32-- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT     --
33-- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,   --
34-- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED --
35-- TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR   --
36-- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF   --
37-- LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING     --
38-- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS       --
39-- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.             --
40--                                                                          --
41------------------------------------------------------------------------------
42--  $Revision: 527 $ $Date: 2010-06-16 14:22:28 +0400 (Wed, 16 Jun 2010) $
43------------------------------------------------------------------------------
44
45package Matreshka.Internals.Regexps.Compiler is
46
47   pragma Preelaborate;
48
49   type YY_Errors is
50    (No_Error,
51     Unexpected_End_Of_Literal,
52     Unexpected_End_Of_Character_Class,
53     Unexpected_Character_in_Multiplicity_Specifier,
54     Unexpected_End_Of_Multiplicity_Specifier,
55     Unexpected_End_Of_Property_Specification,
56     Unrecognized_Character_In_Property_Specification,
57     Unescaped_Pattern_Syntax_Character,
58     Expression_Syntax_Error);
   --  Error codes of the regular expression compiler. This is the type of
   --  the Error parameter of YYError and of YY_Error_Information.Error;
   --  No_Error is the default stored in Compiler_State.YY_Error.
   --
   --  NOTE(review): Unexpected_Character_in_Multiplicity_Specifier spells
   --  "in" in lower case while the other literals use "In"/"Of". Ada is
   --  case insensitive, so this is cosmetic only; check use sites before
   --  changing it, because casing style checks may be enabled.
59
60   type YY_Error_Information is record
61      Error : YY_Errors;
62      Index : Natural;
63   end record;
   --  Pair of an error code and an associated index, stored in
   --  Compiler_State.YY_Error (initially (No_Error, 0)).
   --  NOTE(review): Index is presumably the position in the expression at
   --  which the error was detected; the unit it counts in is not visible
   --  in this specification - confirm against the body.
64
65   type Property_Specification_Keyword is
66    (ASCII_Hex_Digit,                 --  Names of binary properties
67     Alphabetic,
68     Bidi_Control,
69--     Bidi_Mirrored,
70     Cased,
71     Case_Ignorable,
72     Changes_When_Casefolded,
73     Changes_When_Casemapped,
74     Changes_When_NFKC_Casefolded,
75     Changes_When_Lowercased,
76     Changes_When_Titlecased,
77     Changes_When_Uppercased,
78     Composition_Exclusion,
79     Full_Composition_Exclusion,
80     Dash,
81     Deprecated,
82     Default_Ignorable_Code_Point,
83     Diacritic,
84     Extender,
85     Grapheme_Base,
86     Grapheme_Extend,
87     Grapheme_Link,
88     Hex_Digit,
89     Hyphen,
90     ID_Continue,
91     Ideographic,
92     ID_Start,
93     IDS_Binary_Operator,
94     IDS_Trinary_Operator,
95     Join_Control,
96     Logical_Order_Exception,
97     Lowercase,
98     Math,
99     Noncharacter_Code_Point,
100     Other_Alphabetic,
101     Other_Default_Ignorable_Code_Point,
102     Other_Grapheme_Extend,
103     Other_ID_Continue,
104     Other_ID_Start,
105     Other_Lowercase,
106     Other_Math,
107     Other_Uppercase,
108     Pattern_Syntax,
109     Pattern_White_Space,
110     Quotation_Mark,
111     Radical,
112     Soft_Dotted,
113     STerm,
114     Terminal_Punctuation,
115     Unified_Ideograph,
116     Uppercase,
117     Variation_Selector,
118     White_Space,
119     XID_Continue,
120     XID_Start,
121     Expands_On_NFC,
122     Expands_On_NFD,
123     Expands_On_NFKC,
124     Expands_On_NFKD,
125
126     Other,                --  Values of the General Category
127     Control,
128     Format,
129     Unassigned,
130     Private_Use,
131     Surrogate,
132     Letter,
133     Cased_Letter,
134     Lowercase_Letter,
135     Modifier_Letter,
136     Other_Letter,
137     Titlecase_Letter,
138     Uppercase_Letter,
139     Mark,
140     Spacing_Mark,
141     Enclosing_Mark,
142     Nonspacing_Mark,
143     Number,
144     Decimal_Number,
145     Letter_Number,
146     Other_Number,
147     Punctuation,
148     Connector_Punctuation,
149     Dash_Punctuation,
150     Close_Punctuation,
151     Final_Punctuation,
152     Initial_Punctuation,
153     Other_Punctuation,
154     Open_Punctuation,
155     Symbol,
156     Currency_Symbol,
157     Modifier_Symbol,
158     Math_Symbol,
159     Other_Symbol,
160     Separator,
161     Line_Separator,
162     Paragraph_Separator,
163     Space_Separator);
   --  Keywords which can be carried by YYSType.Keyword (variant
   --  Property_Keyword). The literals form two consecutive groups:
   --  ASCII_Hex_Digit .. Expands_On_NFKD are names of binary properties,
   --  and Other .. Space_Separator are values of the General Category.
   --
   --  The order of the literals determines their position numbers; do not
   --  reorder them without checking every user of this type.
   --
   --  NOTE(review): Bidi_Mirrored is commented out; the reason is not
   --  recorded here.
164
165   type Kinds is
166    (None,
167     Match_Code_Point,
168     Number,
169     Property_Keyword,
170     AST_Node);
   --  Discriminant type of YYSType: selects which kind of semantic value
   --  (if any) is stored. None is the default discriminant value.
171
172   type YYSType (Kind : Kinds := None) is record
173      case Kind is
174         when None =>
175            null;
            --  No value is carried.
176
177         when Match_Code_Point =>
178            Code : Wide_Wide_Character;
            --  A single character.
179
180         when Number =>
181            Value : Natural;
            --  A non-negative integer.
182
183         when Property_Keyword =>
184            Keyword : Property_Specification_Keyword;
            --  A binary property name or a General Category value.
185
186         when AST_Node =>
187            Node : Positive;
            --  NOTE(review): presumably the index of an AST node, as
            --  returned by the Create_* functions below - confirm.
188      end case;
189   end record;
   --  Variant record for semantic values, used for the YYLVal and YYVal
   --  components of Compiler_State. Because the discriminant has a
   --  default, objects declared without a constraint (as those components
   --  are) can change their Kind by assignment of a whole record value.
190
191   type Token is
192    (End_Of_Input,
193     Error,
194     Token_Code_Point,
195     Token_Any_Code_Point,
196     Token_Alternation,
197     Token_Optional_Greedy,
198     Token_Optional_Lazy,
199     Token_Zero_Or_More_Greedy,
200     Token_Zero_Or_More_Lazy,
201     Token_One_Or_More_Greedy,
202     Token_One_Or_More_Lazy,
203     Token_Character_Class_Begin,
204     Token_Character_Class_End,
205     Token_Negate_Character_Class,
206     Token_Character_Class_Range,
207     Token_Multiplicity_Begin,
208     Token_Multiplicity_End_Greedy,
209     Token_Multiplicity_End_Lazy,
210     Token_Multiplicity_Comma,
211     Token_Multiplicity_Number,
212     Token_Subexpression_Capture_Begin,
213     Token_Subexpression_Begin,
214     Token_Subexpression_End,
215     Token_Property_Begin_Positive,
216     Token_Property_Begin_Negative,
217     Token_Property_End,
218     Token_Property_Keyword,
219     Token_Start_Of_Line,
220     Token_End_Of_Line);
   --  Lexical tokens of the regular expression syntax, plus End_Of_Input
   --  and Error.
   --  NOTE(review): presumably produced by the scanner and consumed by the
   --  parser; neither is visible in this specification. The order of the
   --  literals determines their position numbers, so check all users
   --  before reordering.
221
222   --  Here is the global state of the compiler. At the first stage of
223   --  refactoring all global state variables must be moved here.
224   --  Later, they will be wrapped in a record type to allow several
225   --  compilers to run in different threads at the same time.
226
227   type Compiler_State is record
228      Data                 : Matreshka.Internals.Strings.Shared_String_Access;
229      YY_Start_State       : Integer := 1;
230      YY_Current_Position  : Matreshka.Internals.Utf16.Utf16_String_Index := 0;
231      YY_Current_Index     : Positive := 1;
232      YY_Error             : YY_Error_Information := (No_Error, 0);
233      YYLVal               : YYSType;
234      YYVal                : YYSType;
235      Character_Class_Mode : Boolean := False;
236      --  Recognition of a Unicode property specification is done in a
237      --  separate scanner mode; this variable is used to switch back to
238      --  the original mode afterwards.
239   end record;
   --  State of one regular expression compilation; passed to YYError as
   --  Self. A default initialized object has start state 1, position 0,
   --  index 1, no recorded error, YYLVal and YYVal of kind None, and
   --  Character_Class_Mode False. Data has no explicit default.
   --
   --  NOTE(review): Data presumably refers to the expression being
   --  compiled, and YY_Current_Position / YY_Current_Index presumably track
   --  the scanner position in UTF-16 code units and in characters
   --  respectively - confirm against the scanner body.
240
241   procedure YYError
242    (Self  : not null access Compiler_State;
243     Error : YY_Errors;
244     Index : Natural);
245   --  Report the error Error, associated with Index, for the compiler
   --  state Self. Self must not be null.
   --  NOTE(review): presumably the error is recorded in Self.YY_Error; the
   --  body is not visible in this specification.
246
247   procedure Attach
248    (Pattern : in out Shared_Pattern; Head : Positive; Node : Positive);
249   --  Attach Node to the list of nodes of Pattern which starts at Head.
   --  NOTE(review): Head and Node are presumably node indices within
   --  Pattern; where in the list Node is placed is not visible here.
250
251   function Compile
252    (Expression : not null Matreshka.Internals.Strings.Shared_String_Access)
253       return not null Shared_Pattern_Access;
   --  Compile the regular expression Expression and return the resulting
   --  pattern. Both the parameter and the result are null excluding.
   --  NOTE(review): how a syntax error in Expression is reported to the
   --  caller is not visible in this specification - see the body.
254
255   function Create_Alternative
256    (Pattern     : not null Shared_Pattern_Access;
257     Prefered    : Positive;
258     Alternative : Positive) return Positive;
259   pragma Inline (Create_Alternative);
   --  NOTE(review): presumably creates in Pattern an alternation node with
   --  Prefered as the first choice and Alternative as the second, and
   --  returns the index of the new node - inferred from the profile only.
   --  "Prefered" is a misspelling of "Preferred" (compare Get_Preferred),
   --  but it is part of the profile and may be used in named association,
   --  so it is left unchanged.
260
261   function Create_Anchor_End_Of_Line
262    (Pattern : not null Shared_Pattern_Access) return Positive;
263   pragma Inline (Create_Anchor_End_Of_Line);
   --  NOTE(review): presumably creates an end of line anchor node in
   --  Pattern and returns its index - inferred from the profile only.
264
265   function Create_Anchor_Start_Of_Line
266    (Pattern : not null Shared_Pattern_Access) return Positive;
267   pragma Inline (Create_Anchor_Start_Of_Line);
   --  NOTE(review): presumably creates a start of line anchor node in
   --  Pattern and returns its index - inferred from the profile only.
268
269   function Create_Character_Class
270    (Pattern : not null Shared_Pattern_Access) return Positive;
271   pragma Inline (Create_Character_Class);
   --  NOTE(review): presumably creates a character class node in Pattern
   --  and returns its index, to be passed as Class to the Create_Member_*
   --  procedures - inferred from the profiles only.
272
273   function Create_Match_Any
274    (Pattern : not null Shared_Pattern_Access) return Positive;
275   pragma Inline (Create_Match_Any);
   --  NOTE(review): presumably creates in Pattern a node which matches any
   --  code point and returns its index - inferred from the profile only.
276
277   function Create_Match_Character
278    (Pattern   : not null Shared_Pattern_Access;
279     Character : Matreshka.Internals.Unicode.Code_Point) return Positive;
280   pragma Inline (Create_Match_Character);
   --  NOTE(review): presumably creates in Pattern a node which matches the
   --  code point Character and returns its index - inferred from the
   --  profile only. Within this profile the parameter name hides
   --  Standard.Character.
281
282   function Create_Match_Property
283    (Pattern  : not null Shared_Pattern_Access;
284     Value    : Matreshka.Internals.Unicode.Ucd.Boolean_Properties;
285     Negative : Boolean) return Positive;
286   pragma Inline (Create_Match_Property);
   --  Overloading which takes a binary property.
   --  NOTE(review): presumably creates in Pattern a node which matches
   --  code points having the property Value (not having it when Negative
   --  is True) and returns its index - inferred from the profile only.
287
288   function Create_Match_Property
289    (Pattern  : not null Shared_Pattern_Access;
290     Value    : General_Category_Flags;
291     Negative : Boolean) return Positive;
292   pragma Inline (Create_Match_Property);
   --  Overloading which takes a set of General Category flags; otherwise
   --  its profile is the same as the one above.
293
294   procedure Create_Member_Character
295    (Pattern   : not null Shared_Pattern_Access;
296     Class     : Positive;
297     Character : Matreshka.Internals.Unicode.Code_Point);
298   pragma Inline (Create_Member_Character);
   --  NOTE(review): presumably adds the code point Character as a member
   --  of the character class node Class of Pattern - inferred from the
   --  profile only.
299
300   procedure Create_Member_Property
301    (Pattern  : not null Shared_Pattern_Access;
302     Class    : Positive;
303     Value    : Matreshka.Internals.Unicode.Ucd.Boolean_Properties;
304     Negative : Boolean);
305   pragma Inline (Create_Member_Property);
   --  Overloading which takes a binary property.
   --  NOTE(review): presumably adds to the character class node Class of
   --  Pattern a member which selects code points having the property Value
   --  (not having it when Negative is True) - inferred from the profile
   --  only.
306
307   procedure  Create_Member_Property
308    (Pattern  : not null Shared_Pattern_Access;
309     Class    : Positive;
310     Value    : General_Category_Flags;
311     Negative : Boolean);
312   pragma Inline (Create_Member_Property);
   --  Overloading which takes a set of General Category flags; otherwise
   --  its profile is the same as the one above.
313
314   procedure Create_Member_Range
315    (Pattern  : not null Shared_Pattern_Access;
316     Class    : Positive;
317     Low      : Matreshka.Internals.Unicode.Code_Point;
318     High     : Matreshka.Internals.Unicode.Code_Point);
319   pragma Inline (Create_Member_Range);
   --  NOTE(review): presumably adds the range of code points Low .. High
   --  as a member of the character class node Class of Pattern. Whether
   --  the bounds are inclusive and what happens when Low > High is not
   --  visible in this specification.
320
321   function Create_Repetition
322     (Pattern    : not null Shared_Pattern_Access;
323      Expression : Positive;
324      Lower      : Natural;
325      Upper      : Natural;
326      Greedy     : Boolean) return Positive;
327   pragma Inline (Create_Repetition);
   --  NOTE(review): presumably creates in Pattern a node which repeats the
   --  node Expression between Lower and Upper times, greedily or lazily
   --  according to Greedy, and returns its index. How an unbounded upper
   --  limit is encoded in Upper is not visible in this specification.
328
329   function Create_Subexpression
330     (Pattern    : not null Shared_Pattern_Access;
331      Expression : Positive;
332      Capture    : Boolean) return Positive;
333   pragma Inline (Create_Subexpression);
   --  NOTE(review): presumably creates in Pattern a subexpression node
   --  around the node Expression, capturing when Capture is True, and
   --  returns its index - inferred from the profile only.
334
335   function Get_Preferred
336     (Pattern : not null Shared_Pattern_Access;
337      Node    : Positive) return Natural;
338   pragma Inline (Get_Preferred);
   --  NOTE(review): presumably returns the preferred branch of the
   --  alternation node Node of Pattern. The result subtype is Natural, so
   --  zero is a possible result - presumably meaning "no node"; confirm
   --  against the body.
339
340   function Get_Fallback
341     (Pattern : not null Shared_Pattern_Access;
342      Node    : Positive) return Natural;
343   pragma Inline (Get_Fallback);
   --  NOTE(review): presumably returns the fallback (second) branch of the
   --  alternation node Node of Pattern. The result subtype is Natural, so
   --  zero is a possible result - presumably meaning "no node"; confirm
   --  against the body.
344
345   function Get_Members
346     (Pattern : not null Shared_Pattern_Access;
347      Node    : Positive) return Natural;
348   pragma Inline (Get_Members);
   --  NOTE(review): presumably returns the head of the member list of the
   --  character class node Node of Pattern. The result subtype is Natural,
   --  so zero is a possible result - presumably meaning "no members";
   --  confirm against the body.
349
350   function Get_Expression
351     (Pattern : not null Shared_Pattern_Access;
352      Node    : Positive) return Natural;
353   pragma Inline (Get_Expression);
   --  NOTE(review): presumably returns the nested expression of the
   --  repetition or subexpression node Node of Pattern. The result subtype
   --  is Natural, so zero is a possible result - presumably meaning "no
   --  node"; confirm against the body.
354
355   function Get_Lower_Bound
356     (Pattern : not null Shared_Pattern_Access;
357      Node    : Positive) return Natural;
358   pragma Inline (Get_Lower_Bound);
   --  NOTE(review): presumably returns the lower repetition count of the
   --  repetition node Node of Pattern (the Lower value given to
   --  Create_Repetition) - inferred from the profile only.
359
360   function Get_Upper_Bound
361     (Pattern : not null Shared_Pattern_Access;
362      Node    : Positive) return Natural;
363   pragma Inline (Get_Upper_Bound);
   --  NOTE(review): presumably returns the upper repetition count of the
   --  repetition node Node of Pattern (the Upper value given to
   --  Create_Repetition) - inferred from the profile only.
364
365   function Get_Previous_Sibling
366     (Pattern : not null Shared_Pattern_Access;
367      Node    : Positive) return Natural;
368   pragma Inline (Get_Previous_Sibling);
   --  NOTE(review): presumably returns the node which precedes Node in its
   --  list of nodes in Pattern. The result subtype is Natural, so zero is
   --  a possible result - presumably meaning "no previous node"; confirm
   --  against the body.
369
370   function Get_Next_Sibling
371     (Pattern : not null Shared_Pattern_Access;
372      Node    : Positive) return Natural;
373   pragma Inline (Get_Next_Sibling);
   --  NOTE(review): presumably returns the node which follows Node in its
   --  list of nodes in Pattern. The result subtype is Natural, so zero is
   --  a possible result - presumably meaning "no next node"; confirm
   --  against the body.
374
375end Matreshka.Internals.Regexps.Compiler;
376