1------------------------------------------------------------------------------ 2-- -- 3-- Matreshka Project -- 4-- -- 5-- XML Processor -- 6-- -- 7-- Runtime Library Component -- 8-- -- 9------------------------------------------------------------------------------ 10-- -- 11-- Copyright © 2010-2015, Vadim Godunko <vgodunko@gmail.com> -- 12-- All rights reserved. -- 13-- -- 14-- Redistribution and use in source and binary forms, with or without -- 15-- modification, are permitted provided that the following conditions -- 16-- are met: -- 17-- -- 18-- * Redistributions of source code must retain the above copyright -- 19-- notice, this list of conditions and the following disclaimer. -- 20-- -- 21-- * Redistributions in binary form must reproduce the above copyright -- 22-- notice, this list of conditions and the following disclaimer in the -- 23-- documentation and/or other materials provided with the distribution. -- 24-- -- 25-- * Neither the name of the Vadim Godunko, IE nor the names of its -- 26-- contributors may be used to endorse or promote products derived from -- 27-- this software without specific prior written permission. -- 28-- -- 29-- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -- 30-- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -- 31-- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -- 32-- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -- 33-- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -- 34-- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED -- 35-- TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -- 36-- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -- 37-- LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -- 38-- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -- 39-- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -- 40-- -- 41------------------------------------------------------------------------------ 42-- $Revision: 5215 $ $Date: 2015-03-17 20:29:25 +0300 (Tue, 17 Mar 2015) $ 43------------------------------------------------------------------------------ 44-- This package provides implementation of simple SAX reader. It can be used 45-- for both stream and incremental processing of data. It supports XML 46-- Namespaces and includes partial support for validation against DTD. 47-- 48-- Supported features: 49-- - http://xml.org/sax/features/namespaces 50-- - http://xml.org/sax/features/namespace-prefixes 51-- - http://xml.org/sax/features/validation 52-- - http://apache.org/xml/features/nonvalidating/load-external-dtd 53------------------------------------------------------------------------------ 54private with Ada.Containers.Vectors; 55private with Ada.Exceptions; 56private with Ada.Finalization; 57private with Interfaces; 58 59with League.IRIs; 60with League.Strings; 61private with Matreshka.Internals.SAX_Locators; 62private with Matreshka.Internals.Strings; 63private with Matreshka.Internals.Unicode; 64private with Matreshka.Internals.Utf16; 65private with Matreshka.Internals.XML.Attribute_Tables; 66private with Matreshka.Internals.XML.Attributes; 67private with Matreshka.Internals.XML.Base_Scopes; 68private with Matreshka.Internals.XML.Element_Tables; 69private with Matreshka.Internals.XML.Entity_Tables; 70private with Matreshka.Internals.XML.Namespace_Scopes; 71private with Matreshka.Internals.XML.Notation_Tables; 72private with Matreshka.Internals.XML.Symbol_Tables; 73private with XML.SAX.Attributes; 74private with XML.SAX.Default_Handlers; 75with XML.SAX.Input_Sources; 76with XML.SAX.Readers; 77 78package XML.SAX.Simple_Readers is 79 80 type Simple_Reader is 81 limited new XML.SAX.Readers.SAX_Reader with private; 82 83 not overriding procedure Parse 84 (Self : in out Simple_Reader; 85 Source : not null access XML.SAX.Input_Sources.SAX_Input_Source'Class); 86 -- Reads data from the specified input source till end of data is reached 87 -- and parse it. Reader can be used to read data several times, each time 88 -- it process separate XML document. 89 90 not overriding procedure Set_Input_Source 91 (Self : in out Simple_Reader; 92 Source : not null access XML.SAX.Input_Sources.SAX_Input_Source'Class); 93 -- Sets input source to read data from it in incremental mode. It must be 94 -- called once to set input source, and procedure Parse without input 95 -- source parameter must be used to process chunks of data. 96 97 not overriding procedure Parse (Self : in out Simple_Reader); 98 -- Reads next chunk of data from the input source and parse it. Input 99 -- source must be setted by call to procedure Set_Input_Source. 100 101private 102 103 type Token is 104 (End_Of_Input, 105 Error, 106 End_Of_Chunk, 107 Token_Xml_Decl_Open, 108 Token_Pi_Open, 109 Token_Pi_Close, 110 Token_Pe_Reference, 111 Token_Doctype_Decl_Open, 112 Token_Entity_Decl_Open, 113 Token_Element_Decl_Open, 114 Token_Notation_Decl_Open, 115 Token_Close, 116 Token_Name, 117 Token_System, 118 Token_Public, 119 Token_System_Literal, 120 Token_Public_Literal, 121 Token_Internal_Subset_Open, 122 Token_Internal_Subset_Close, 123 Token_Percent, 124 Token_Value_Open, 125 Token_Value_Close, 126 Token_String_Segment, 127 Token_Ndata, 128 Token_Comment, 129 Token_Element_Open, 130 Token_Equal, 131 Token_End_Open, 132 Token_Empty_Close, 133 Token_Version, 134 Token_Encoding, 135 Token_Standalone, 136 Token_Empty, 137 Token_Any, 138 Token_Open_Parenthesis, 139 Token_Close_Parenthesis, 140 Token_Vertical_Bar, 141 Token_Comma, 142 Token_Question, 143 Token_Asterisk, 144 Token_Plus, 145 Token_Pcdata, 146 Token_Attlist_Decl_Open, 147 Token_Cdata, 148 Token_Id, 149 Token_Idref, 150 Token_Idrefs, 151 Token_Entity, 152 Token_Entities, 153 Token_Nmtoken, 154 Token_Nmtokens, 155 Token_Notation, 156 Token_Required, 157 Token_Implied, 158 Token_Fixed, 159 Token_Entity_Start, 160 Token_Entity_End, 161 Token_Conditional_Open, 162 Token_Conditional_Close, 163 Token_Cdata_Open, 164 Token_Cdata_Close); 165 166 type YYSType is limited record 167 String : Matreshka.Internals.Strings.Shared_String_Access; 168 Symbol : Matreshka.Internals.XML.Symbol_Identifier; 169 Is_Whitespace : Boolean; 170 end record; 171 172 procedure Set_String 173 (Item : in out YYSType; 174 String : League.Strings.Universal_String; 175 Is_Whitespace : Boolean); 176 pragma Inline (Set_String); 177 178 procedure Set_String_Internal 179 (Item : in out YYSType; 180 String : Matreshka.Internals.Strings.Shared_String_Access; 181 Is_Whitespace : Boolean); 182 pragma Inline (Set_String_Internal); 183 184 procedure Set_Symbol 185 (Item : in out YYSType; 186 Symbol : Matreshka.Internals.XML.Symbol_Identifier); 187 pragma Inline (Set_Symbol); 188 189 procedure Move 190 (To : in out YYSType; 191 From : in out YYSType); 192 pragma Inline (Move); 193 194 procedure Clear (Item : in out YYSType); 195 pragma Inline (Clear); 196 197 ------------------- 198 -- Scanner state -- 199 ------------------- 200 201 type XML_Version is (XML_1_0, XML_1_1, XML_1_X); 202 203 subtype Supported_XML_Version is XML_Version range XML_1_0 .. XML_1_1; 204 205 package Unsigned_32_Vectors is 206 new Ada.Containers.Vectors 207 (Positive, Interfaces.Unsigned_32, Interfaces."="); 208 209 type Scanner_State_Information is record 210 Source : XML.SAX.Input_Sources.SAX_Input_Source_Access; 211 Data : Matreshka.Internals.Strings.Shared_String_Access 212 := Matreshka.Internals.Strings.Shared_Empty'Access; 213 YY_Base_Position : Matreshka.Internals.Utf16.Utf16_String_Index := 0; 214 YY_Base_Index : Positive := 1; 215 YY_Base_Line : Natural := 1; 216 YY_Base_Column : Natural := 1; 217 YY_Base_Skip_LF : Boolean := False; 218 YY_Current_Position : Matreshka.Internals.Utf16.Utf16_String_Index := 0; 219 YY_Current_Index : Positive := 1; 220 YY_Current_Line : Natural := 1; 221 YY_Current_Column : Natural := 1; 222 YY_Current_Skip_LF : Boolean := False; 223 YY_Start_State : Interfaces.Unsigned_32 := 1; 224 Incremental : Boolean := False; 225 Is_External_Subset : Boolean := False; 226 Entity : Matreshka.Internals.XML.Entity_Identifier 227 := Matreshka.Internals.XML.No_Entity; 228 Start_Condition_Stack : Unsigned_32_Vectors.Vector; 229 Delimiter : Matreshka.Internals.Unicode.Code_Point; 230 -- Delimiter of the entity value. 231 In_Literal : Boolean := False; 232 -- Include in literal mode, apostrophe and quotation characters are 233 -- ignored. 234 -- XXX The same behavior can be achived by resetting Delimiter to 235 -- any symbol. 236 Start_Issued : Boolean := False; 237 -- Sets to True when Token_Entity_Start was issued before the start of 238 -- entity's replacement text scanning, thus Token_Entity_End must be 239 -- issued after completing of scanning of replacement text. 240 end record; 241 242 package Scanner_State_Vectors is 243 new Ada.Containers.Vectors (Positive, Scanner_State_Information); 244 245 package Symbol_Identifier_Vectors is 246 new Ada.Containers.Vectors 247 (Positive, 248 Matreshka.Internals.XML.Symbol_Identifier, 249 Matreshka.Internals.XML."="); 250 251 ------------------ 252 -- Parser state -- 253 ------------------ 254 255 YY_Stack_Size : constant := 300; 256 -- The size of the value and state stacks. 257 258 type Value_Stack_Array is array (0 .. YY_Stack_Size) of YYSType; 259 type State_Stack_Array is array (0 .. YY_Stack_Size) of Natural; 260 261 type Parser_State_Information is record 262 -- Stack data used by the parser. 263 264 TOS : Natural := 0; 265 Value_Stack : Value_Stack_Array; 266 State_Stack : State_Stack_Array; 267 268 Input_Symbol : Token; 269 -- Current input symbol. 270 271 Look_Ahead : Boolean := True; 272 -- Obtain next token from the scanner. 273 274 Error : Boolean := False; 275 -- Error recovery flag. 276-- error_flag : natural := 0; 277-- -- indicates 3 - (number of valid shifts after an error occurs) 278 end record; 279 280 Default_Handler : aliased XML.SAX.Default_Handlers.SAX_Default_Handler; 281 -- Default handler for use when user defined handler is not specified. 282 283 type Attribute_Record is record 284 Namespace_URI : League.Strings.Universal_String; 285 Local_Name : Matreshka.Internals.XML.Symbol_Identifier; 286 Qualified_Name : Matreshka.Internals.XML.Symbol_Identifier; 287 Value : League.Strings.Universal_String; 288 end record; 289 290 type Namespaces_Options is record 291 Enabled : Boolean := True; 292 Prefixes : Boolean := False; 293 end record; 294 295 type Validation_Options is record 296 Enabled : Boolean := False; 297 Has_DTD : Boolean := False; 298 Load_DTD : Boolean := True; 299 end record; 300 301 type Configuration_Information is record 302 Reset : Boolean := True; 303 Source : XML.SAX.Input_Sources.SAX_Input_Source_Access; 304 Incremental : Boolean := False; 305 Enable_Namespaces : Boolean := True; 306 Enable_Prefixes : Boolean := False; 307 Enable_Validation : Boolean := False; 308 Load_External_DTD : Boolean := True; 309 end record; 310 311 type Simple_Shared_Locator is tagged; 312 type Simple_Shared_Locator_Access is access all Simple_Shared_Locator'Class; 313 314 type Simple_Reader is limited new Ada.Finalization.Limited_Controlled 315 and XML.SAX.Readers.SAX_Reader with 316 record 317 -- Handlers 318 319 Content_Handler : XML.SAX.Readers.SAX_Content_Handler_Access 320 := Default_Handler'Access; 321 Declaration_Handler : XML.SAX.Readers.SAX_Declaration_Handler_Access 322 := Default_Handler'Access; 323 DTD_Handler : XML.SAX.Readers.SAX_DTD_Handler_Access 324 := Default_Handler'Access; 325 Error_Handler : XML.SAX.Readers.SAX_Error_Handler_Access 326 := Default_Handler'Access; 327 Lexical_Handler : XML.SAX.Readers.SAX_Lexical_Handler_Access 328 := Default_Handler'Access; 329 Entity_Resolver : XML.SAX.Readers.SAX_Entity_Resolver_Access; 330 331 -- Configuration, it is used to initialize state of the reader. 332 333 Configuration : Configuration_Information; 334 335 -- Scanner state 336 337 Scanner_State : Scanner_State_Information; 338 Scanner_Stack : Scanner_State_Vectors.Vector; 339 Symbols : 340 Matreshka.Internals.XML.Symbol_Tables.Symbol_Table; 341 Locator : Simple_Shared_Locator_Access; 342 YYLVal : YYSType; 343 Character_Buffer : 344 Matreshka.Internals.Strings.Shared_String_Access; 345 -- Preallocated buffer for character reference and attribute value 346 -- delimiter handling. 347 Character_Data : Matreshka.Internals.Strings.Shared_String_Access 348 := Matreshka.Internals.Strings.Shared_Empty'Access; 349 -- Preallocated buffer for character data to avoid unnecessary 350 -- allocations, and to accumulate attribute's value. 351 Normalize_Value : Boolean; 352 -- When True attribute value normalization is applied to the 353 -- attribute's character data. 354 Space_Before : Boolean; 355 -- When normalize attribute value of non-CDATA type, it indicates that 356 -- previous processed character was space. 357 Version : Supported_XML_Version := XML_1_0; 358 -- XML version of document entity. 359 Conditional_Directive : Boolean; 360 -- True when conditional directive is parsed or not needed to be checked 361 -- (for nested conditional sections). 362 Conditional_Depth : Natural := 0; 363 -- Depth of conditional sections. 364 Ignore_Depth : Natural := 0; 365 -- Depth of ignore conditional sections. 366 Notation_Attribute : Boolean; 367 -- Sets when processing attribute declaration of type of NOTATION. 368 369 -- Parser state 370 371 Parser_State : Parser_State_Information; 372 Public_Id : League.Strings.Universal_String; 373 System_Id : League.Strings.Universal_String; 374 Error_Reported : Boolean := False; 375 376 -- Analyzer state 377 378 Whitespace_Matched : Boolean; 379 -- Used to check whether whitespace is used to separate tokens. For 380 -- example, '%' must be separated by whitespace from '<!ENTITY' and 381 -- following name. 382 In_Document_Content : Boolean := False; 383 Is_Standalone : Boolean := False; 384 385 Entities : 386 Matreshka.Internals.XML.Entity_Tables.Entity_Table; 387 Elements : 388 Matreshka.Internals.XML.Element_Tables.Element_Table; 389 Attributes : 390 Matreshka.Internals.XML.Attribute_Tables.Attribute_Table; 391 Notations : 392 Matreshka.Internals.XML.Notation_Tables.Notation_Table; 393 External_Subset_Entity : Matreshka.Internals.XML.Entity_Identifier 394 := Matreshka.Internals.XML.No_Entity; 395 -- Entity of the external subset if any. 396 External_Subset_Done : Boolean := False; 397 Element_Names : Symbol_Identifier_Vectors.Vector; 398 -- Stack of names of elements. 399 Continue : Boolean := True; 400 -- Continue processing. 401 Error_Message : League.Strings.Universal_String; 402 -- Error message. 403 User_Exception : Ada.Exceptions.Exception_Occurrence; 404 -- Catched exception from the user defined handler. 405 Current_Element_Name : Matreshka.Internals.XML.Symbol_Identifier; 406 -- Name of the currently processed element. 407 Current_Element : Matreshka.Internals.XML.Element_Identifier; 408 -- Currently analyzed element, used to handle attribute list 409 -- declaration and element declaration. 410 Current_Attribute : Matreshka.Internals.XML.Attribute_Identifier; 411 -- Currently analyzed attribute, used to handle default declaration 412 -- of the attribute definition. 413 Attribute_Redefined : Boolean; 414 -- Mark redefinition of the currently processing attribute. Once 415 -- attribute is defined it can't be redefined. 416 417 -- When components can't be nested thier information is not hold in 418 -- YYSType and placed directly here to avoid copy overhead. 419 420 Attribute_Set : 421 Matreshka.Internals.XML.Attributes.Attribute_Set; 422 SAX_Attributes : XML.SAX.Attributes.SAX_Attributes; 423 -- Set of attributes of the element. 424 425 -- Namespaces handling state 426 427 Namespaces : Namespaces_Options; 428 Namespace_Scope : 429 Matreshka.Internals.XML.Namespace_Scopes.Namespace_Scope; 430 431 -- XML Base handling state 432 433 Bases : Matreshka.Internals.XML.Base_Scopes.Base_Scope; 434 -- Tracks current base URI. 435 436 -- Validator state 437 438 Validation : Validation_Options; 439 -- Validation options 440 Root_Symbol : Matreshka.Internals.XML.Symbol_Identifier 441 := Matreshka.Internals.XML.No_Symbol; 442 -- Expected name of the root element. 443 end record; 444 445 overriding procedure Initialize (Self : in out Simple_Reader); 446 -- Initialize internal state of the reader. 447 448 overriding procedure Finalize (Self : in out Simple_Reader); 449 -- Finalize internal state of the reader. 450 451 overriding function Content_Handler 452 (Self : Simple_Reader) 453 return XML.SAX.Readers.SAX_Content_Handler_Access; 454 455 overriding function Declaration_Handler 456 (Self : Simple_Reader) 457 return XML.SAX.Readers.SAX_Declaration_Handler_Access; 458 459 overriding function DTD_Handler 460 (Self : Simple_Reader) 461 return XML.SAX.Readers.SAX_DTD_Handler_Access; 462 463 overriding function Entity_Resolver 464 (Self : Simple_Reader) 465 return XML.SAX.Readers.SAX_Entity_Resolver_Access; 466 467 overriding function Error_Handler 468 (Self : Simple_Reader) 469 return XML.SAX.Readers.SAX_Error_Handler_Access; 470 471 overriding function Feature 472 (Self : Simple_Reader; 473 Name : League.Strings.Universal_String) return Boolean; 474 475 overriding function Has_Feature 476 (Self : Simple_Reader; 477 Name : League.Strings.Universal_String) return Boolean; 478 479 overriding function Lexical_Handler 480 (Self : Simple_Reader) 481 return XML.SAX.Readers.SAX_Lexical_Handler_Access; 482 483 overriding procedure Set_Content_Handler 484 (Self : in out Simple_Reader; 485 Handler : XML.SAX.Readers.SAX_Content_Handler_Access); 486 487 overriding procedure Set_Declaration_Handler 488 (Self : in out Simple_Reader; 489 Handler : XML.SAX.Readers.SAX_Declaration_Handler_Access); 490 491 overriding procedure Set_DTD_Handler 492 (Self : in out Simple_Reader; 493 Handler : XML.SAX.Readers.SAX_DTD_Handler_Access); 494 495 overriding procedure Set_Entity_Resolver 496 (Self : in out Simple_Reader; 497 Resolver : XML.SAX.Readers.SAX_Entity_Resolver_Access); 498 499 overriding procedure Set_Error_Handler 500 (Self : in out Simple_Reader; 501 Handler : XML.SAX.Readers.SAX_Error_Handler_Access); 502 503 overriding procedure Set_Feature 504 (Self : in out Simple_Reader; 505 Name : League.Strings.Universal_String; 506 Value : Boolean); 507 508 overriding procedure Set_Lexical_Handler 509 (Self : in out Simple_Reader; 510 Handler : XML.SAX.Readers.SAX_Lexical_Handler_Access); 511 512 ------------- 513 -- Locator -- 514 ------------- 515 516 type Simple_Shared_Locator is 517 new Matreshka.Internals.SAX_Locators.Shared_Abstract_Locator with record 518 Reader : access Simple_Reader'Class; 519 end record; 520 521 overriding function Line 522 (Self : not null access constant Simple_Shared_Locator) return Natural; 523 524 overriding function Column 525 (Self : not null access constant Simple_Shared_Locator) return Natural; 526 527 overriding function Encoding 528 (Self : not null access constant Simple_Shared_Locator) 529 return League.Strings.Universal_String; 530 531 overriding function Version 532 (Self : not null access constant Simple_Shared_Locator) 533 return League.Strings.Universal_String; 534 535 overriding function Public_Id 536 (Self : not null access constant Simple_Shared_Locator) 537 return League.Strings.Universal_String; 538 539 overriding function System_Id 540 (Self : not null access constant Simple_Shared_Locator) 541 return League.Strings.Universal_String; 542 543 overriding function Base_URI 544 (Self : not null access constant Simple_Shared_Locator) 545 return League.IRIs.IRI; 546 547end XML.SAX.Simple_Readers; 548