1------------------------------------------------------------------------------
2--                                                                          --
3--                            Matreshka Project                             --
4--                                                                          --
5--                               XML Processor                              --
6--                                                                          --
7--                        Runtime Library Component                         --
8--                                                                          --
9------------------------------------------------------------------------------
10--                                                                          --
11-- Copyright © 2010-2015, Vadim Godunko <vgodunko@gmail.com>                --
12-- All rights reserved.                                                     --
13--                                                                          --
14-- Redistribution and use in source and binary forms, with or without       --
15-- modification, are permitted provided that the following conditions       --
16-- are met:                                                                 --
17--                                                                          --
18--  * Redistributions of source code must retain the above copyright        --
19--    notice, this list of conditions and the following disclaimer.         --
20--                                                                          --
21--  * Redistributions in binary form must reproduce the above copyright     --
22--    notice, this list of conditions and the following disclaimer in the   --
23--    documentation and/or other materials provided with the distribution.  --
24--                                                                          --
25--  * Neither the name of the Vadim Godunko, IE nor the names of its        --
26--    contributors may be used to endorse or promote products derived from  --
27--    this software without specific prior written permission.              --
28--                                                                          --
29-- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS      --
30-- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT        --
31-- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR    --
32-- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT     --
33-- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,   --
34-- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED --
35-- TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR   --
36-- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF   --
37-- LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING     --
38-- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS       --
39-- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.             --
40--                                                                          --
41------------------------------------------------------------------------------
42--  $Revision: 5215 $ $Date: 2015-03-17 20:29:25 +0300 (Tue, 17 Mar 2015) $
43------------------------------------------------------------------------------
44--  This package provides implementation of simple SAX reader. It can be used
45--  for both stream and incremental processing of data. It supports XML
46--  Namespaces and includes partial support for validation against DTD.
47--
48--  Supported features:
49--   - http://xml.org/sax/features/namespaces
50--   - http://xml.org/sax/features/namespace-prefixes
51--   - http://xml.org/sax/features/validation
52--   - http://apache.org/xml/features/nonvalidating/load-external-dtd
53------------------------------------------------------------------------------
54private with Ada.Containers.Vectors;
55private with Ada.Exceptions;
56private with Ada.Finalization;
57private with Interfaces;
58
59with League.IRIs;
60with League.Strings;
61private with Matreshka.Internals.SAX_Locators;
62private with Matreshka.Internals.Strings;
63private with Matreshka.Internals.Unicode;
64private with Matreshka.Internals.Utf16;
65private with Matreshka.Internals.XML.Attribute_Tables;
66private with Matreshka.Internals.XML.Attributes;
67private with Matreshka.Internals.XML.Base_Scopes;
68private with Matreshka.Internals.XML.Element_Tables;
69private with Matreshka.Internals.XML.Entity_Tables;
70private with Matreshka.Internals.XML.Namespace_Scopes;
71private with Matreshka.Internals.XML.Notation_Tables;
72private with Matreshka.Internals.XML.Symbol_Tables;
73private with XML.SAX.Attributes;
74private with XML.SAX.Default_Handlers;
75with XML.SAX.Input_Sources;
76with XML.SAX.Readers;
77
78package XML.SAX.Simple_Readers is
79
80   type Simple_Reader is
81     limited new XML.SAX.Readers.SAX_Reader with private;
82
83   not overriding procedure Parse
84    (Self   : in out Simple_Reader;
85     Source : not null access XML.SAX.Input_Sources.SAX_Input_Source'Class);
86   --  Reads data from the specified input source till end of data is reached
87   --  and parse it. Reader can be used to read data several times, each time
88   --  it process separate XML document.
89
90   not overriding procedure Set_Input_Source
91    (Self   : in out Simple_Reader;
92     Source : not null access XML.SAX.Input_Sources.SAX_Input_Source'Class);
93   --  Sets input source to read data from it in incremental mode. It must be
94   --  called once to set input source, and procedure Parse without input
95   --  source parameter must be used to process chunks of data.
96
97   not overriding procedure Parse (Self : in out Simple_Reader);
98   --  Reads next chunk of data from the input source and parse it. Input
99   --  source must be setted by call to procedure Set_Input_Source.
100
101private
102
103   type Token is
104    (End_Of_Input,
105     Error,
106     End_Of_Chunk,
107     Token_Xml_Decl_Open,
108     Token_Pi_Open,
109     Token_Pi_Close,
110     Token_Pe_Reference,
111     Token_Doctype_Decl_Open,
112     Token_Entity_Decl_Open,
113     Token_Element_Decl_Open,
114     Token_Notation_Decl_Open,
115     Token_Close,
116     Token_Name,
117     Token_System,
118     Token_Public,
119     Token_System_Literal,
120     Token_Public_Literal,
121     Token_Internal_Subset_Open,
122     Token_Internal_Subset_Close,
123     Token_Percent,
124     Token_Value_Open,
125     Token_Value_Close,
126     Token_String_Segment,
127     Token_Ndata,
128     Token_Comment,
129     Token_Element_Open,
130     Token_Equal,
131     Token_End_Open,
132     Token_Empty_Close,
133     Token_Version,
134     Token_Encoding,
135     Token_Standalone,
136     Token_Empty,
137     Token_Any,
138     Token_Open_Parenthesis,
139     Token_Close_Parenthesis,
140     Token_Vertical_Bar,
141     Token_Comma,
142     Token_Question,
143     Token_Asterisk,
144     Token_Plus,
145     Token_Pcdata,
146     Token_Attlist_Decl_Open,
147     Token_Cdata,
148     Token_Id,
149     Token_Idref,
150     Token_Idrefs,
151     Token_Entity,
152     Token_Entities,
153     Token_Nmtoken,
154     Token_Nmtokens,
155     Token_Notation,
156     Token_Required,
157     Token_Implied,
158     Token_Fixed,
159     Token_Entity_Start,
160     Token_Entity_End,
161     Token_Conditional_Open,
162     Token_Conditional_Close,
163     Token_Cdata_Open,
164     Token_Cdata_Close);
165
166   type YYSType is limited record
167      String        : Matreshka.Internals.Strings.Shared_String_Access;
168      Symbol        : Matreshka.Internals.XML.Symbol_Identifier;
169      Is_Whitespace : Boolean;
170   end record;
171
172   procedure Set_String
173    (Item          : in out YYSType;
174     String        : League.Strings.Universal_String;
175     Is_Whitespace : Boolean);
176   pragma Inline (Set_String);
177
178   procedure Set_String_Internal
179    (Item          : in out YYSType;
180     String        : Matreshka.Internals.Strings.Shared_String_Access;
181     Is_Whitespace : Boolean);
182   pragma Inline (Set_String_Internal);
183
184   procedure Set_Symbol
185    (Item    : in out YYSType;
186      Symbol : Matreshka.Internals.XML.Symbol_Identifier);
187   pragma Inline (Set_Symbol);
188
189   procedure Move
190    (To   : in out YYSType;
191     From : in out YYSType);
192   pragma Inline (Move);
193
194   procedure Clear (Item : in out YYSType);
195   pragma Inline (Clear);
196
197   -------------------
198   -- Scanner state --
199   -------------------
200
201   type XML_Version is (XML_1_0, XML_1_1, XML_1_X);
202
203   subtype Supported_XML_Version is XML_Version range XML_1_0 .. XML_1_1;
204
205   package Unsigned_32_Vectors is
206     new Ada.Containers.Vectors
207          (Positive, Interfaces.Unsigned_32, Interfaces."=");
208
209   type Scanner_State_Information is record
210      Source               : XML.SAX.Input_Sources.SAX_Input_Source_Access;
211      Data                 : Matreshka.Internals.Strings.Shared_String_Access
212        := Matreshka.Internals.Strings.Shared_Empty'Access;
213      YY_Base_Position     : Matreshka.Internals.Utf16.Utf16_String_Index := 0;
214      YY_Base_Index        : Positive := 1;
215      YY_Base_Line         : Natural := 1;
216      YY_Base_Column       : Natural := 1;
217      YY_Base_Skip_LF      : Boolean := False;
218      YY_Current_Position  : Matreshka.Internals.Utf16.Utf16_String_Index := 0;
219      YY_Current_Index     : Positive := 1;
220      YY_Current_Line      : Natural := 1;
221      YY_Current_Column    : Natural := 1;
222      YY_Current_Skip_LF   : Boolean := False;
223      YY_Start_State       : Interfaces.Unsigned_32 := 1;
224      Incremental          : Boolean := False;
225      Is_External_Subset   : Boolean := False;
226      Entity               : Matreshka.Internals.XML.Entity_Identifier
227        := Matreshka.Internals.XML.No_Entity;
228      Start_Condition_Stack : Unsigned_32_Vectors.Vector;
229      Delimiter            : Matreshka.Internals.Unicode.Code_Point;
230      --  Delimiter of the entity value.
231      In_Literal           : Boolean  := False;
232      --  Include in literal mode, apostrophe and quotation characters are
233      --  ignored.
234      --  XXX The same behavior can be achived by resetting Delimiter to
235      --  any symbol.
236      Start_Issued         : Boolean := False;
237      --  Sets to True when Token_Entity_Start was issued before the start of
238      --  entity's replacement text scanning, thus Token_Entity_End must be
239      --  issued after completing of scanning of replacement text.
240   end record;
241
242   package Scanner_State_Vectors is
243     new Ada.Containers.Vectors (Positive, Scanner_State_Information);
244
245   package Symbol_Identifier_Vectors is
246     new Ada.Containers.Vectors
247          (Positive,
248           Matreshka.Internals.XML.Symbol_Identifier,
249           Matreshka.Internals.XML."=");
250
251   ------------------
252   -- Parser state --
253   ------------------
254
255   YY_Stack_Size : constant := 300;
256   --  The size of the value and state stacks.
257
258   type Value_Stack_Array is array (0 .. YY_Stack_Size) of YYSType;
259   type State_Stack_Array is array (0 .. YY_Stack_Size) of Natural;
260
261   type Parser_State_Information is record
262      --  Stack data used by the parser.
263
264      TOS          : Natural := 0;
265      Value_Stack  : Value_Stack_Array;
266      State_Stack  : State_Stack_Array;
267
268      Input_Symbol : Token;
269      --  Current input symbol.
270
271      Look_Ahead   : Boolean := True;
272      --  Obtain next token from the scanner.
273
274      Error        : Boolean := False;
275      --  Error recovery flag.
276--       error_flag : natural := 0;
277--          -- indicates  3 - (number of valid shifts after an error occurs)
278   end record;
279
280   Default_Handler : aliased XML.SAX.Default_Handlers.SAX_Default_Handler;
281   --  Default handler for use when user defined handler is not specified.
282
283   type Attribute_Record is record
284      Namespace_URI  : League.Strings.Universal_String;
285      Local_Name     : Matreshka.Internals.XML.Symbol_Identifier;
286      Qualified_Name : Matreshka.Internals.XML.Symbol_Identifier;
287      Value          : League.Strings.Universal_String;
288   end record;
289
290   type Namespaces_Options is record
291      Enabled  : Boolean := True;
292      Prefixes : Boolean := False;
293   end record;
294
295   type Validation_Options is record
296      Enabled  : Boolean := False;
297      Has_DTD  : Boolean := False;
298      Load_DTD : Boolean := True;
299   end record;
300
301   type Configuration_Information is record
302      Reset             : Boolean := True;
303      Source            : XML.SAX.Input_Sources.SAX_Input_Source_Access;
304      Incremental       : Boolean := False;
305      Enable_Namespaces : Boolean := True;
306      Enable_Prefixes   : Boolean := False;
307      Enable_Validation : Boolean := False;
308      Load_External_DTD : Boolean := True;
309   end record;
310
311   type Simple_Shared_Locator is tagged;
312   type Simple_Shared_Locator_Access is access all Simple_Shared_Locator'Class;
313
314   type Simple_Reader is limited new Ada.Finalization.Limited_Controlled
315     and XML.SAX.Readers.SAX_Reader with
316   record
317      --  Handlers
318
319      Content_Handler        : XML.SAX.Readers.SAX_Content_Handler_Access
320        := Default_Handler'Access;
321      Declaration_Handler    : XML.SAX.Readers.SAX_Declaration_Handler_Access
322        := Default_Handler'Access;
323      DTD_Handler            : XML.SAX.Readers.SAX_DTD_Handler_Access
324        := Default_Handler'Access;
325      Error_Handler          : XML.SAX.Readers.SAX_Error_Handler_Access
326        := Default_Handler'Access;
327      Lexical_Handler        : XML.SAX.Readers.SAX_Lexical_Handler_Access
328        := Default_Handler'Access;
329      Entity_Resolver        : XML.SAX.Readers.SAX_Entity_Resolver_Access;
330
331      --  Configuration, it is used to initialize state of the reader.
332
333      Configuration          : Configuration_Information;
334
335      --  Scanner state
336
337      Scanner_State          : Scanner_State_Information;
338      Scanner_Stack          : Scanner_State_Vectors.Vector;
339      Symbols                :
340        Matreshka.Internals.XML.Symbol_Tables.Symbol_Table;
341      Locator                : Simple_Shared_Locator_Access;
342      YYLVal                 : YYSType;
343      Character_Buffer       :
344        Matreshka.Internals.Strings.Shared_String_Access;
345      --  Preallocated buffer for character reference and attribute value
346      --  delimiter handling.
347      Character_Data         : Matreshka.Internals.Strings.Shared_String_Access
348        := Matreshka.Internals.Strings.Shared_Empty'Access;
349      --  Preallocated buffer for character data to avoid unnecessary
350      --  allocations, and to accumulate attribute's value.
351      Normalize_Value        : Boolean;
352      --  When True attribute value normalization is applied to the
353      --  attribute's character data.
354      Space_Before           : Boolean;
355      --  When normalize attribute value of non-CDATA type, it indicates that
356      --  previous processed character was space.
357      Version                : Supported_XML_Version := XML_1_0;
358      --  XML version of document entity.
359      Conditional_Directive  : Boolean;
360      --  True when conditional directive is parsed or not needed to be checked
361      --  (for nested conditional sections).
362      Conditional_Depth      : Natural := 0;
363      --  Depth of conditional sections.
364      Ignore_Depth           : Natural := 0;
365      --  Depth of ignore conditional sections.
366      Notation_Attribute     : Boolean;
367      --  Sets when processing attribute declaration of type of NOTATION.
368
369      --  Parser state
370
371      Parser_State           : Parser_State_Information;
372      Public_Id              : League.Strings.Universal_String;
373      System_Id              : League.Strings.Universal_String;
374      Error_Reported         : Boolean := False;
375
376      --  Analyzer state
377
378      Whitespace_Matched     : Boolean;
379      --  Used to check whether whitespace is used to separate tokens. For
380      --  example, '%' must be separated by whitespace from '<!ENTITY' and
381      --  following name.
382      In_Document_Content    : Boolean := False;
383      Is_Standalone          : Boolean := False;
384
385      Entities               :
386        Matreshka.Internals.XML.Entity_Tables.Entity_Table;
387      Elements               :
388        Matreshka.Internals.XML.Element_Tables.Element_Table;
389      Attributes             :
390        Matreshka.Internals.XML.Attribute_Tables.Attribute_Table;
391      Notations              :
392        Matreshka.Internals.XML.Notation_Tables.Notation_Table;
393      External_Subset_Entity : Matreshka.Internals.XML.Entity_Identifier
394        := Matreshka.Internals.XML.No_Entity;
395      --  Entity of the external subset if any.
396      External_Subset_Done   : Boolean := False;
397      Element_Names          : Symbol_Identifier_Vectors.Vector;
398      --  Stack of names of elements.
399      Continue               : Boolean := True;
400      --  Continue processing.
401      Error_Message          : League.Strings.Universal_String;
402      --  Error message.
403      User_Exception         : Ada.Exceptions.Exception_Occurrence;
404      --  Catched exception from the user defined handler.
405      Current_Element_Name   : Matreshka.Internals.XML.Symbol_Identifier;
406      --  Name of the currently processed element.
407      Current_Element        : Matreshka.Internals.XML.Element_Identifier;
408      --  Currently analyzed element, used to handle attribute list
409      --  declaration and element declaration.
410      Current_Attribute      : Matreshka.Internals.XML.Attribute_Identifier;
411      --  Currently analyzed attribute, used to handle default declaration
412      --  of the attribute definition.
413      Attribute_Redefined    : Boolean;
414      --  Mark redefinition of the currently processing attribute. Once
415      --  attribute is defined it can't be redefined.
416
417      --  When components can't be nested thier information is not hold in
418      --  YYSType and placed directly here to avoid copy overhead.
419
420      Attribute_Set          :
421        Matreshka.Internals.XML.Attributes.Attribute_Set;
422      SAX_Attributes         : XML.SAX.Attributes.SAX_Attributes;
423      --  Set of attributes of the element.
424
425      --  Namespaces handling state
426
427      Namespaces             : Namespaces_Options;
428      Namespace_Scope        :
429        Matreshka.Internals.XML.Namespace_Scopes.Namespace_Scope;
430
431      --  XML Base handling state
432
433      Bases                  : Matreshka.Internals.XML.Base_Scopes.Base_Scope;
434      --  Tracks current base URI.
435
436      --  Validator state
437
438      Validation             : Validation_Options;
439      --  Validation options
440      Root_Symbol            : Matreshka.Internals.XML.Symbol_Identifier
441        := Matreshka.Internals.XML.No_Symbol;
442      --  Expected name of the root element.
443   end record;
444
445   overriding procedure Initialize (Self : in out Simple_Reader);
446   --  Initialize internal state of the reader.
447
448   overriding procedure Finalize (Self : in out Simple_Reader);
449   --  Finalize internal state of the reader.
450
451   overriding function Content_Handler
452    (Self : Simple_Reader)
453       return XML.SAX.Readers.SAX_Content_Handler_Access;
454
455   overriding function Declaration_Handler
456    (Self : Simple_Reader)
457       return XML.SAX.Readers.SAX_Declaration_Handler_Access;
458
459   overriding function DTD_Handler
460    (Self : Simple_Reader)
461       return XML.SAX.Readers.SAX_DTD_Handler_Access;
462
463   overriding function Entity_Resolver
464    (Self : Simple_Reader)
465       return XML.SAX.Readers.SAX_Entity_Resolver_Access;
466
467   overriding function Error_Handler
468    (Self : Simple_Reader)
469       return XML.SAX.Readers.SAX_Error_Handler_Access;
470
471   overriding function Feature
472    (Self : Simple_Reader;
473     Name : League.Strings.Universal_String) return Boolean;
474
475   overriding function Has_Feature
476    (Self : Simple_Reader;
477     Name : League.Strings.Universal_String) return Boolean;
478
479   overriding function Lexical_Handler
480    (Self : Simple_Reader)
481       return XML.SAX.Readers.SAX_Lexical_Handler_Access;
482
483   overriding procedure Set_Content_Handler
484    (Self    : in out Simple_Reader;
485     Handler : XML.SAX.Readers.SAX_Content_Handler_Access);
486
487   overriding procedure Set_Declaration_Handler
488    (Self    : in out Simple_Reader;
489     Handler : XML.SAX.Readers.SAX_Declaration_Handler_Access);
490
491   overriding procedure Set_DTD_Handler
492    (Self    : in out Simple_Reader;
493     Handler : XML.SAX.Readers.SAX_DTD_Handler_Access);
494
495   overriding procedure Set_Entity_Resolver
496    (Self     : in out Simple_Reader;
497     Resolver : XML.SAX.Readers.SAX_Entity_Resolver_Access);
498
499   overriding procedure Set_Error_Handler
500    (Self    : in out Simple_Reader;
501     Handler : XML.SAX.Readers.SAX_Error_Handler_Access);
502
503   overriding procedure Set_Feature
504    (Self  : in out Simple_Reader;
505     Name  : League.Strings.Universal_String;
506     Value : Boolean);
507
508   overriding procedure Set_Lexical_Handler
509    (Self    : in out Simple_Reader;
510     Handler : XML.SAX.Readers.SAX_Lexical_Handler_Access);
511
512   -------------
513   -- Locator --
514   -------------
515
516   type Simple_Shared_Locator is
517     new Matreshka.Internals.SAX_Locators.Shared_Abstract_Locator with record
518      Reader : access Simple_Reader'Class;
519   end record;
520
521   overriding function Line
522    (Self : not null access constant Simple_Shared_Locator) return Natural;
523
524   overriding function Column
525    (Self : not null access constant Simple_Shared_Locator) return Natural;
526
527   overriding function Encoding
528    (Self : not null access constant Simple_Shared_Locator)
529       return League.Strings.Universal_String;
530
531   overriding function Version
532    (Self : not null access constant Simple_Shared_Locator)
533       return League.Strings.Universal_String;
534
535   overriding function Public_Id
536    (Self : not null access constant Simple_Shared_Locator)
537       return League.Strings.Universal_String;
538
539   overriding function System_Id
540    (Self : not null access constant Simple_Shared_Locator)
541       return League.Strings.Universal_String;
542
543   overriding function Base_URI
544    (Self : not null access constant Simple_Shared_Locator)
545       return League.IRIs.IRI;
546
547end XML.SAX.Simple_Readers;
548