package HTML::HTML5::Parser::Tokenizer; # -*- Perl -*-
## skip Test::Tabs
use strict;
use warnings;
our $VERSION='0.992';

## This module implements the tokenization phase of both HTML5 and
## XML5.  Notes like this are usually based on the latest HTML
## specification.  Since XML is different from HTML, and since the
## XML5 specification is no longer maintained, there are a few
## differences from HTML's tokenization.  Such differences are marked
## by the prefix "XML5:".

## Warnings that depend on the HTML/XML input stream, such as ones
## related to surrogate code positions, are not useful.
no warnings 'utf8';

## ------ Token types ------

BEGIN {
  require Exporter;
  push our @ISA, 'Exporter';

  ## Every token type constant can be imported by name, or all at
  ## once through the ":token" tag.  Both export lists are built from
  ## this single list so that they cannot drift apart.
  my @token_names = qw(
    DOCTYPE_TOKEN
    COMMENT_TOKEN
    START_TAG_TOKEN
    END_TAG_TOKEN
    END_OF_FILE_TOKEN
    CHARACTER_TOKEN
    PI_TOKEN
    ABORT_TOKEN
    END_OF_DOCTYPE_TOKEN
    ATTLIST_TOKEN
    ELEMENT_TOKEN
    GENERAL_ENTITY_TOKEN
    PARAMETER_ENTITY_TOKEN
    NOTATION_TOKEN
  );

  our @EXPORT_OK = @token_names;
  our %EXPORT_TAGS = (token => [@token_names]);
}

## Numeric identifiers of the token types.
use constant {
  DOCTYPE_TOKEN          => 1,  ## XML5: No DOCTYPE token.
  COMMENT_TOKEN          => 2,
  START_TAG_TOKEN        => 3,
  END_TAG_TOKEN          => 4,
  END_OF_FILE_TOKEN      => 5,
  CHARACTER_TOKEN        => 6,
  PI_TOKEN               => 7,  ## NOTE: XML only.
  ABORT_TOKEN            => 8,  ## NOTE: For internal processing.
  END_OF_DOCTYPE_TOKEN   => 9,  ## NOTE: XML only.
  ATTLIST_TOKEN          => 10, ## NOTE: XML only.
  ELEMENT_TOKEN          => 11, ## NOTE: XML only.
  GENERAL_ENTITY_TOKEN   => 12, ## NOTE: XML only.
  PARAMETER_ENTITY_TOKEN => 13, ## NOTE: XML only.
  NOTATION_TOKEN         => 14, ## NOTE: XML only.
};
## XML5: XML5 has an "empty tag token".  In this implementation, it is
## represented as a start tag token with the $self->{self_closing} flag
## set to true.

## XML5: XML5 has a "short end tag token".  In this implementation, it
## is represented as an end tag token with $token->{tag_name} set
## to an empty string.

package HTML::HTML5::Parser::TagSoupParser;

BEGIN { HTML::HTML5::Parser::Tokenizer->import (':token') }

use HTML::HTML5::Entities qw[%entity2char];

## ------ Tokenizer states ------

## The numbers are arbitrary identifiers; they are listed in the order
## of the states they stand for, not in numeric order.
use constant {
  DATA_STATE                                    => 0,
  RCDATA_STATE                                  => 107,
  RAWTEXT_STATE                                 => 108,
  SCRIPT_DATA_STATE                             => 109,
  PLAINTEXT_STATE                               => 110,
  TAG_OPEN_STATE                                => 2,
  RCDATA_LT_STATE                               => 111,
  RAWTEXT_LT_STATE                              => 112,
  SCRIPT_DATA_LT_STATE                          => 113,
  CLOSE_TAG_OPEN_STATE                          => 3,
  RCDATA_END_TAG_OPEN_STATE                     => 114,
  RAWTEXT_END_TAG_OPEN_STATE                    => 115,
  SCRIPT_DATA_END_TAG_OPEN_STATE                => 116,
  SCRIPT_DATA_ESCAPE_START_STATE                => 1,
  SCRIPT_DATA_ESCAPE_START_DASH_STATE           => 12,
  SCRIPT_DATA_ESCAPED_STATE                     => 117,
  SCRIPT_DATA_ESCAPED_DASH_STATE                => 118,
  SCRIPT_DATA_ESCAPED_DASH_DASH_STATE           => 119,
  SCRIPT_DATA_ESCAPED_LT_STATE                  => 120,
  SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE        => 121,
  SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE         => 122,
  SCRIPT_DATA_DOUBLE_ESCAPED_STATE              => 123,
  SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE         => 124,
  SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE    => 125,
  SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE           => 126,
  SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE           => 127,
  TAG_NAME_STATE                                => 4,
  BEFORE_ATTRIBUTE_NAME_STATE                   => 5,
  ATTRIBUTE_NAME_STATE                          => 6,
  AFTER_ATTRIBUTE_NAME_STATE                    => 7,
  BEFORE_ATTRIBUTE_VALUE_STATE                  => 8,
  ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE           => 9,
  ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE           => 10,
  ATTRIBUTE_VALUE_UNQUOTED_STATE                => 11,
  MARKUP_DECLARATION_OPEN_STATE                 => 13,
  COMMENT_START_STATE                           => 14,
  COMMENT_START_DASH_STATE                      => 15,
  COMMENT_STATE                                 => 16,
  COMMENT_END_STATE                             => 17,
  COMMENT_END_BANG_STATE                        => 102,
  #COMMENT_END_SPACE_STATE                      => 103, ## REMOVED
  COMMENT_END_DASH_STATE                        => 18,
  BOGUS_COMMENT_STATE                           => 19,
  DOCTYPE_STATE                                 => 20,
  BEFORE_DOCTYPE_NAME_STATE                     => 21,
  DOCTYPE_NAME_STATE                            => 22,
  AFTER_DOCTYPE_NAME_STATE                      => 23,
  AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE            => 104,
  BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE        => 24,
  DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE => 25,
  DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE => 26,
  AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE         => 27,
  BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE        => 28,
  DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE => 29,
  DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE => 30,
  BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDS_STATE   => 105,
  AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE            => 106,
  AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE         => 31,
  BOGUS_DOCTYPE_STATE                           => 32,
  AFTER_ATTRIBUTE_VALUE_QUOTED_STATE            => 33,
  SELF_CLOSING_START_TAG_STATE                  => 34,
  CDATA_SECTION_STATE                           => 35,
  MD_HYPHEN_STATE                               => 36, # "markup declaration open state" in the spec
  MD_DOCTYPE_STATE                              => 37, # "markup declaration open state" in the spec
  MD_CDATA_STATE                                => 38, # "markup declaration open state" in the spec
  #CDATA_RCDATA_CLOSE_TAG_STATE                 => 39, # "close tag open state" in the spec
  CDATA_SECTION_MSE1_STATE                      => 40, # "CDATA section state" in the spec
  CDATA_SECTION_MSE2_STATE                      => 41, # "CDATA section state" in the spec
  PUBLIC_STATE                                  => 42, # "after DOCTYPE name state" in the spec
  SYSTEM_STATE                                  => 43, # "after DOCTYPE name state" in the spec
};

##
## NOTE: "Entity data state", "entity in attribute value state", and
## the "consume a character reference" algorithm, are jointly
## implemented as the following six states:
use constant {
  ENTITY_STATE      => 44,
  ENTITY_HASH_STATE => 45,
  NCR_NUM_STATE     => 46,
  HEXREF_X_STATE    => 47,
  HEXREF_HEX_STATE  => 48,
  ENTITY_NAME_STATE => 49,
};

##
## XML-only states
use constant {
  DATA_MSE1_STATE                                    => 50,
  DATA_MSE2_STATE                                    => 128, # last
  PI_STATE                                           => 51,
  PI_TARGET_STATE                                    => 52,
  PI_TARGET_AFTER_STATE                              => 53,
  PI_DATA_STATE                                      => 54,
  PI_AFTER_STATE                                     => 55,
  PI_DATA_AFTER_STATE                                => 56,
  DOCTYPE_INTERNAL_SUBSET_STATE                      => 57,
  DOCTYPE_INTERNAL_SUBSET_AFTER_STATE                => 58,
  BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE          => 59,
  DOCTYPE_TAG_STATE                                  => 60,
  DOCTYPE_MARKUP_DECLARATION_OPEN_STATE              => 61,
  MD_ATTLIST_STATE                                   => 62,
  MD_E_STATE                                         => 63,
  MD_ELEMENT_STATE                                   => 64,
  MD_ENTITY_STATE                                    => 65,
  MD_NOTATION_STATE                                  => 66,
  DOCTYPE_MD_STATE                                   => 67,
  BEFORE_MD_NAME_STATE                               => 68,
  MD_NAME_STATE                                      => 69,
  DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE              => 70,
  DOCTYPE_ATTLIST_NAME_AFTER_STATE                   => 71,
  DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE               => 72,
  DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE         => 73,
  DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE               => 74,
  DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE         => 75,
  BEFORE_ALLOWED_TOKEN_STATE                         => 76,
  ALLOWED_TOKEN_STATE                                => 77,
  AFTER_ALLOWED_TOKEN_STATE                          => 78,
  AFTER_ALLOWED_TOKENS_STATE                         => 79,
  BEFORE_ATTR_DEFAULT_STATE                          => 80,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE => 81,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE        => 82,
  DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE  => 83,
  AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE              => 84,
  BEFORE_NDATA_STATE                                 => 85,
  NDATA_STATE                                        => 86,
  AFTER_NDATA_STATE                                  => 87,
  BEFORE_NOTATION_NAME_STATE                         => 88,
  NOTATION_NAME_STATE                                => 89,
  DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE           => 90,
  DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE           => 91,
  ENTITY_VALUE_ENTITY_STATE                          => 92,
  AFTER_ELEMENT_NAME_STATE                           => 93,
  BEFORE_ELEMENT_CONTENT_STATE                       => 94,
  CONTENT_KEYWORD_STATE                              => 95,
  AFTER_CM_GROUP_OPEN_STATE                          => 96,
  CM_ELEMENT_NAME_STATE                              => 97,
  AFTER_CM_ELEMENT_NAME_STATE                        => 98,
  AFTER_CM_GROUP_CLOSE_STATE                         => 99,
  AFTER_MD_DEF_STATE                                 => 100,
  BOGUS_MD_STATE                                     => 101,
};

## ------ Tree constructor state constants ------

## Whether the parsed string is in the foreign island or not affects
## how tokenization is done, unfortunately.  These are a copy of some
## of the tokenization state constants.  See Whatpm::HTML for the full
## list and the descriptions of the constants.
sub FOREIGN_EL () { 0b1_00000000000 }

## ------ Character reference mappings ------

## Maps the code point written in a numeric character reference to the
## code point that is used instead.  Code points that map to
## themselves are listed too, so membership in this table is itself
## meaningful to whoever consults it.
my $charref_map = {
  0x00 => 0xFFFD, # REPLACEMENT CHARACTER
  0x0D => 0x000D, # CARRIAGE RETURN
};

## 0x80..0x9F, in code point order, eight per row.
## NOTE(review): the values look like the Windows-1252 layout; confirm
## against the HTML specification's replacement table.
@{$charref_map}{0x80..0x9F} = (
  0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, # 0x80..0x87
  0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, # 0x88..0x8F
  0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, # 0x90..0x97
  0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, # 0x98..0x9F
); # $charref_map

## Surrogate code positions all become U+FFFD REPLACEMENT CHARACTER.
@{$charref_map}{0xD800..0xDFFF} = (0xFFFD) x (0xDFFF - 0xD800 + 1);

## Code points that are listed but kept as they are: some C0 controls,
## DELETE, U+FDD0..U+FDEF, and the last two code points (U+xxFFFE and
## U+xxFFFF) of each of the planes 0 to 16.
{
  my @kept_as_is = (
    0x0001..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
    0xFDD0..0xFDEF,
    (map { my $plane = $_ << 16; ($plane | 0xFFFE, $plane | 0xFFFF) }
         0x00..0x10),
  );
  @{$charref_map}{@kept_as_is} = @kept_as_is;
}

## ------ Special character-like constants ------

## The "EOF" pseudo-character in the HTML parsing algorithm.
sub EOF_CHAR () { -1 }

## A pseudo-character code that can never appear in the input stream.
sub NEVER_CHAR () { -2 }

## ------ The tokenizer ------

## Implementations MUST act as if they used the state machine
## described in the spec.

## _initialize_tokenizer ($self)
##
## Resets the tokenizer fields of |$self| to their initial values and
## loads the first input character by calling |$self->{set_nc}|.
##
##   $self - The parser object (a hash reference).  |$self->{set_nc}|
##           must be a code reference; it is called once, with |$self|
##           as its only argument.
##
## Returns the new, empty |$self->{token}| array reference (the value
## of the last assignment).
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## NOTE: Fields set by |new| constructor:
  #$self->{level}
  #$self->{set_nc}
  #$self->{parse_error}
  #$self->{is_xml} (if XML)

  $self->{state} = DATA_STATE; # MUST
  #$self->{kwd} = ''; # State-dependent keyword; initialized when used
  #$self->{entity__value}; # initialized when used
  #$self->{entity__match}; # initialized when used
  undef $self->{ct}; # current token
  undef $self->{ca}; # current attribute
  undef $self->{last_stag_name}; # last emitted start tag name
  #$self->{prev_state}; # initialized when used
  delete $self->{self_closing};
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character
  #$self->{next_nc}

  ## The character buffer has just been emptied, so there is never a
  ## buffered character to take here: the first input character
  ## always comes from the |set_nc| callback.
  $self->{set_nc}->($self);

  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer

## A token has:
##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
##   ->{name} (DOCTYPE_TOKEN)
##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
##   ->{target} (PI_TOKEN)
##   ->{pubid} (DOCTYPE_TOKEN)
##   ->{sysid} (DOCTYPE_TOKEN)
##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
##   ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
##        ->{name}
##        ->{value}
##        ->{has_reference} == 1 or 0
##        ->{index}: Index of the attribute in a tag.
##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
##   ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)

## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
## |->{self_closing}| (a token field, in the notation used above) is
## used to save the value of |$self->{self_closing}| while the token
## is pushed back to the stack.

## Emitted token MUST immediately be handled by the tree construction state.

## Before each step, UA MAY check to see if either one of the scripts in
## "list of scripts that will execute as soon as possible" or the first
## script in the "list of scripts that will execute asynchronously",
## has completed loading.  If one has, then it MUST be executed
## and removed from the list.

## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
## (This requirement was dropped from HTML5 spec, unfortunately.)

## Code points treated as space characters; look up by code point.
my $is_space = {
  0x0009 => 1, # CHARACTER TABULATION (HT)
  0x000A => 1, # LINE FEED (LF)
  #0x000B => 0, # LINE TABULATION (VT)
  0x000C => 1, # FORM FEED (FF) ## XML5: Not a space character.
  0x000D => 1, # CARRIAGE RETURN (CR)
  0x0020 => 1, # SPACE (SP)
};

## Pseudo code points.  In the action tables below they are used, in
## place of a real code point, as the index of the entry for a whole
## class of input characters.
## NOTE(review): how a character is mapped to one of these classes is
## decided by the table interpreter, which is not part of this section.
sub KEY_ELSE_CHAR () { 255 }
sub KEY_ULATIN_CHAR () { 254 }
sub KEY_LLATIN_CHAR () { 253 }
sub KEY_EOF_CHAR () { 252 }
sub KEY_SPACE_CHAR () { 251 }

## Action tables: |$Action->[STATE]->[CHAR]| describes what to do when
## CHAR (a code point or a KEY_* class) is seen in STATE.  |$XMLAction|
## holds entries that are specific to XML.
## NOTE(review): the exact meaning of the entry fields (|state|,
## |emit|, |error|, |reconsume|, ...) is defined by the interpreter of
## these tables; presumably |$XMLAction| entries take precedence over
## |$Action| entries when parsing XML -- confirm there.
my $Action;
my $XMLAction;
$Action->[DATA_STATE]->[0x0026] = { # &
  name => 'data &',
  state => ENTITY_STATE, # "entity data state" + "consume a character reference"
  state_set => {entity_add => -1, prev_state => DATA_STATE},
};
$Action->[DATA_STATE]->[0x003C] = { # <
  name => 'data <',
  state => TAG_OPEN_STATE,
};
$Action->[DATA_STATE]->[KEY_EOF_CHAR] = {
  name => 'data eof',
  emit => END_OF_FILE_TOKEN,
  reconsume => 1,
};
$Action->[DATA_STATE]->[0x0000] = { # NULL
  name => 'data null',
  emit => CHARACTER_TOKEN,
  error => 'NULL',
};
$Action->[DATA_STATE]->[KEY_ELSE_CHAR] = {
  name => 'data else',
  emit => CHARACTER_TOKEN,
  emit_data_read_until => qq{\x00<&},
};
  $XMLAction->[DATA_STATE]->[0x005D] = { # ]
    name => 'data ]',
    state => DATA_MSE1_STATE,
    emit => CHARACTER_TOKEN,
  };
  $XMLAction->[DATA_STATE]->[KEY_ELSE_CHAR] = {
    name => 'data else xml',
    emit => CHARACTER_TOKEN,
    emit_data_read_until => qq{\x00<&\]},
  };
$Action->[RCDATA_STATE]->[0x0026] = { # &
  name => 'rcdata &',
  state => ENTITY_STATE, # "entity data state" + "consume a character reference"
  state_set => {entity_add => -1, prev_state => RCDATA_STATE},
};
$Action->[RCDATA_STATE]->[0x003C] = { # <
  name => 'rcdata <',
  state => RCDATA_LT_STATE,
};
## The same hash reference as the data state's EOF entry (shared, not copied).
$Action->[RCDATA_STATE]->[KEY_EOF_CHAR] = $Action->[DATA_STATE]->[KEY_EOF_CHAR];
$Action->[RCDATA_STATE]->[0x0000] = { # NULL
  name => 'rcdata null',
  emit => CHARACTER_TOKEN,
  emit_data => "\x{FFFD}",
  error => 'NULL',
};
$Action->[RCDATA_STATE]->[KEY_ELSE_CHAR] = {
  name => 'rcdata else',
  emit => CHARACTER_TOKEN,
  emit_data_read_until => qq{\x00<&},
};
## RAWTEXT, script data and PLAINTEXT states.  Several entries below
## are the very same hash references as entries of other states
## (shared, not copied), so their |name| is the name of the entry
## they were taken from.
$Action->[RAWTEXT_STATE]->[0x003C] = { # <
  name => 'rawtext <',
  state => RAWTEXT_LT_STATE,
};
$Action->[RAWTEXT_STATE]->[KEY_EOF_CHAR] = $Action->[DATA_STATE]->[KEY_EOF_CHAR];
$Action->[RAWTEXT_STATE]->[0x0000] = $Action->[RCDATA_STATE]->[0x0000];
$Action->[RAWTEXT_STATE]->[KEY_ELSE_CHAR] = {
  name => 'rawtext else',
  emit => CHARACTER_TOKEN,
  emit_data_read_until => qq{\x00<},
};
$Action->[SCRIPT_DATA_STATE]->[0x003C] = { # <
  name => 'script data <',
  state => SCRIPT_DATA_LT_STATE,
};
$Action->[SCRIPT_DATA_STATE]->[KEY_EOF_CHAR] = $Action->[DATA_STATE]->[KEY_EOF_CHAR];
$Action->[SCRIPT_DATA_STATE]->[0x0000] = $Action->[RAWTEXT_STATE]->[0x0000];
$Action->[SCRIPT_DATA_STATE]->[KEY_ELSE_CHAR] = $Action->[RAWTEXT_STATE]->[KEY_ELSE_CHAR];
## PLAINTEXT has no "<" entry.
$Action->[PLAINTEXT_STATE]->[KEY_EOF_CHAR] = $Action->[DATA_STATE]->[KEY_EOF_CHAR];
$Action->[PLAINTEXT_STATE]->[0x0000] = $Action->[RAWTEXT_STATE]->[0x0000];
$Action->[PLAINTEXT_STATE]->[KEY_ELSE_CHAR] = {
  name => 'plaintext else',
  emit => CHARACTER_TOKEN,
  emit_data_read_until => qq{\x00},
};
# "Tag open state" is known as "tag state" in XML5.
## Tag open state: entries for the character that follows "<".
$Action->[TAG_OPEN_STATE]->[0x0021] = { # !
  name => 'tag open !',
  state => MARKUP_DECLARATION_OPEN_STATE,
};
$Action->[TAG_OPEN_STATE]->[0x002F] = { # /
  name => 'tag open /',
  state => CLOSE_TAG_OPEN_STATE,
};
$Action->[TAG_OPEN_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'tag open uc',
  ct => {
    type => START_TAG_TOKEN,
    delta => 1,
    append_tag_name => 0x0020, # UC -> lc
  },
  state => TAG_NAME_STATE,
};
  ## XML: same as above, but without the "UC -> lc" offset.
  $XMLAction->[TAG_OPEN_STATE]->[KEY_ULATIN_CHAR] = {
    name => 'tag open uc xml',
    ct => {
      type => START_TAG_TOKEN,
      delta => 1,
      append_tag_name => 0x0000,
    },
    state => TAG_NAME_STATE,
  };
$Action->[TAG_OPEN_STATE]->[KEY_LLATIN_CHAR] = {
  name => 'tag open lc',
  ct => {
    type => START_TAG_TOKEN,
    delta => 1,
    append_tag_name => 0x0000,
  },
  state => TAG_NAME_STATE,
};
$Action->[TAG_OPEN_STATE]->[0x003F] = { # ?
  name => 'tag open ?',
  state => BOGUS_COMMENT_STATE,
  error => 'pio',
  error_delta => 1,
  ct => {
    type => COMMENT_TOKEN,
  },
  reconsume => 1, ## $self->{nc} is intentionally left as is
};
  $XMLAction->[TAG_OPEN_STATE]->[0x003F] = { # ?
    name => 'tag open ? xml',
    state => PI_STATE,
  };
## Space characters, ">" and (below) everything else share one entry.
$Action->[TAG_OPEN_STATE]->[KEY_SPACE_CHAR] =
$Action->[TAG_OPEN_STATE]->[0x003E] = { # >
  name => 'tag open else',
  error => 'bare stago',
  error_delta => 1,
  state => DATA_STATE,
  reconsume => 1,
  emit => CHARACTER_TOKEN,
  emit_data => '<',
  emit_delta => 1,
};
$Action->[TAG_OPEN_STATE]->[KEY_ELSE_CHAR] = $Action->[TAG_OPEN_STATE]->[0x003E];
  $XMLAction->[TAG_OPEN_STATE]->[0x0000] = { # NULL
    name => 'tag open null xml',
    ct => {
      type => START_TAG_TOKEN,
      delta => 1,
      append_tag_name => 0xFFFD,
    },
    error => 'NULL',
    state => TAG_NAME_STATE,
  };
  ## XML5: "<:" has a parse error.
## XML: any other character after "<" begins a start tag token.
  $XMLAction->[TAG_OPEN_STATE]->[KEY_ELSE_CHAR] = {
    name => 'tag open else xml',
    ct => {
      type => START_TAG_TOKEN,
      delta => 1,
      append_tag_name => 0x0000,
    },
    state => TAG_NAME_STATE,
  };
## "<" followed by "/" in the RCDATA, RAWTEXT, script data and script
## data escaped states: clear the buffer and go to the matching
## "end tag open" state.
$Action->[RCDATA_LT_STATE]->[0x002F] = { # /
  name => 'rcdata lt /',
  state => RCDATA_END_TAG_OPEN_STATE,
  buffer => {clear => 1},
};
$Action->[RAWTEXT_LT_STATE]->[0x002F] = { # /
  name => 'rawtext lt /',
  state => RAWTEXT_END_TAG_OPEN_STATE,
  buffer => {clear => 1},
};
$Action->[SCRIPT_DATA_LT_STATE]->[0x002F] = { # /
  name => 'script data lt /',
  state => SCRIPT_DATA_END_TAG_OPEN_STATE,
  buffer => {clear => 1},
};
$Action->[SCRIPT_DATA_ESCAPED_LT_STATE]->[0x002F] = { # /
  name => 'script data escaped lt /',
  state => SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE,
  buffer => {clear => 1},
};
$Action->[SCRIPT_DATA_LT_STATE]->[0x0021] = { # !
  name => 'script data lt !',
  state => SCRIPT_DATA_ESCAPE_START_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '<!',
};
$Action->[SCRIPT_DATA_ESCAPED_LT_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'script data escaped lt uc',
  emit => CHARACTER_TOKEN,
  emit_data => '<',
  emit_data_append => 1,
  buffer => {clear => 1, append => 0x0020}, # UC -> lc
  state => SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE,
};
$Action->[SCRIPT_DATA_ESCAPED_LT_STATE]->[KEY_LLATIN_CHAR] = {
  name => 'script data escaped lt lc',
  emit => CHARACTER_TOKEN,
  emit_data => '<',
  emit_data_append => 1,
  buffer => {clear => 1, append => 0x0000},
  state => SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE,
};
## Anything else after "<": emit the "<" as text and reconsume the
## character in the state the "<" was found in.
$Action->[RCDATA_LT_STATE]->[KEY_ELSE_CHAR] = {
  name => 'rcdata lt else',
  state => RCDATA_STATE,
  reconsume => 1,
  emit => CHARACTER_TOKEN,
  emit_data => '<',
};
$Action->[RAWTEXT_LT_STATE]->[KEY_ELSE_CHAR] = {
  name => 'rawtext lt else',
  state => RAWTEXT_STATE,
  reconsume => 1,
  emit => CHARACTER_TOKEN,
  emit_data => '<',
};
$Action->[SCRIPT_DATA_LT_STATE]->[KEY_ELSE_CHAR] = {
  name => 'script data lt else',
  state => SCRIPT_DATA_STATE,
  reconsume => 1,
  emit => CHARACTER_TOKEN,
  emit_data => '<',
};
$Action->[SCRIPT_DATA_ESCAPED_LT_STATE]->[KEY_ELSE_CHAR] = {
  name => 'script data escaped lt else',
  state => SCRIPT_DATA_ESCAPED_STATE,
  reconsume => 1,
  emit => CHARACTER_TOKEN,
  emit_data => '<',
};
## XXX "End tag token" in latest HTML5 and in XML5.
$Action->[CLOSE_TAG_OPEN_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'end tag open uc',
  ct => {
    type => END_TAG_TOKEN,
    delta => 2,
    append_tag_name => 0x0020, # UC -> lc
  },
  state => TAG_NAME_STATE,
};
  $XMLAction->[CLOSE_TAG_OPEN_STATE]->[KEY_ULATIN_CHAR] = {
    name => 'end tag open uc xml',
    ct => {
      type => END_TAG_TOKEN,
      delta => 2,
      append_tag_name => 0x0000,
    },
    state => TAG_NAME_STATE,
  };
$Action->[CLOSE_TAG_OPEN_STATE]->[KEY_LLATIN_CHAR] = {
  name => 'end tag open lc',
  ct => {
    type => END_TAG_TOKEN,
    delta => 2,
    append_tag_name => 0x0000,
  },
  state => TAG_NAME_STATE,
};
$Action->[CLOSE_TAG_OPEN_STATE]->[0x003E] = { # >
  name => 'end tag open >',
  error => 'empty end tag',
  error_delta => 2, # "<" in "</>"
  state => DATA_STATE,
};
  ## XML5: No parse error.

  ## NOTE: This parser raises a parse error, since it supports XML1,
  ## not XML5.

  ## NOTE: A short end tag token.
## XML: "</>" creates an end tag token to which no tag name is appended.
  $XMLAction->[CLOSE_TAG_OPEN_STATE]->[0x003E] = { # >
    name => 'end tag open > xml',
    error => 'empty end tag',
    error_delta => 2, # "<" in "</>"
    state => DATA_STATE,
    ct => {
      type => END_TAG_TOKEN,
      delta => 2,
    },
    emit => '',
  };
$Action->[CLOSE_TAG_OPEN_STATE]->[KEY_EOF_CHAR] = {
  name => 'end tag open eof',
  error => 'bare etago',
  state => DATA_STATE,
  reconsume => 1,
  emit => CHARACTER_TOKEN,
  emit_data => '</',
  emit_delta => 2,
};
## Space characters and everything else share one entry.
$Action->[CLOSE_TAG_OPEN_STATE]->[KEY_SPACE_CHAR] =
$Action->[CLOSE_TAG_OPEN_STATE]->[KEY_ELSE_CHAR] = {
  name => 'end tag open else',
  error => 'bogus end tag',
  error_delta => 2, # "<" of "</"
  state => BOGUS_COMMENT_STATE,
  ct => {
    type => COMMENT_TOKEN,
    delta => 2, # "<" of "</"
  },
  reconsume => 1,
  ## NOTE: $self->{nc} is intentionally left as is.  Although the
  ## "anything else" case of the spec does not explicitly state that
  ## the next input character is to be reconsumed, it will be included
  ## in the |data| of the comment token generated from the bogus end
  ## tag, as defined in the "bogus comment state" entry.
};
  $XMLAction->[CLOSE_TAG_OPEN_STATE]->[0x0000] = { # NULL
    name => 'end tag open null xml',
    ct => {
      type => END_TAG_TOKEN,
      delta => 2,
      append_tag_name => 0xFFFD,
    },
    error => 'NULL',
    state => TAG_NAME_STATE, ## XML5: "end tag name state".
  };
  ## XML5: "</:" is a parse error.
  $XMLAction->[CLOSE_TAG_OPEN_STATE]->[KEY_ELSE_CHAR] = {
    name => 'end tag open else xml',
    ct => {
      type => END_TAG_TOKEN,
      delta => 2,
      append_tag_name => 0x0000,
    },
    state => TAG_NAME_STATE, ## XML5: "end tag name state".
  };
  ## This switch-case implements "tag name state", "RCDATA end tag
  ## name state", "RAWTEXT end tag name state", and "script data
  ## end tag name state" jointly with the implementation of
  ## "RCDATA end tag open state" and so on.
## Tag name state.
$Action->[TAG_NAME_STATE]->[KEY_SPACE_CHAR] = {
  name => 'tag name sp',
  state => BEFORE_ATTRIBUTE_NAME_STATE,
};
$Action->[TAG_NAME_STATE]->[0x003E] = { # >
  name => 'tag name >',
  state => DATA_STATE,
  emit => '',
};
$Action->[TAG_NAME_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'tag name uc',
  ct => {
    append_tag_name => 0x0020, # UC -> lc
  },
};
$XMLAction->[TAG_NAME_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'tag name uc xml',
  ct => {
    append_tag_name => 0x0000,
  },
};
$Action->[TAG_NAME_STATE]->[KEY_EOF_CHAR] = {
  name => 'tag name eof',
  error => 'unclosed tag',
  state => DATA_STATE,
  reconsume => 1,
};
$Action->[TAG_NAME_STATE]->[0x002F] = { # /
  name => 'tag name /',
  state => SELF_CLOSING_START_TAG_STATE,
};
$Action->[TAG_NAME_STATE]->[0x0000] = { # NULL
  name => 'tag name null',
  ct => {
    append_tag_name => 0xFFFD,
  },
  error => 'NULL',
};
$Action->[TAG_NAME_STATE]->[KEY_ELSE_CHAR] = {
  name => 'tag name else',
  ct => {
    append_tag_name => 0x0000,
  },
};
## Script data escape start states: entered after "<!" in script data.
$Action->[SCRIPT_DATA_ESCAPE_START_STATE]->[0x002D] = { # -
  name => 'script data escape start -',
  state => SCRIPT_DATA_ESCAPE_START_DASH_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '-',
};
## The second "-" of "<!--" must lead to the "escaped dash dash"
## state (as the HTML spec's "script data escape start dash state"
## says), not to the plain "escaped" state: the two dashes already
## count as the beginning of a possible "-->", so that "<!-->"
## returns to the script data state.
$Action->[SCRIPT_DATA_ESCAPE_START_DASH_STATE]->[0x002D] = { # -
  name => 'script data escape start dash -',
  state => SCRIPT_DATA_ESCAPED_DASH_DASH_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '-',
};
$Action->[SCRIPT_DATA_ESCAPE_START_STATE]->[KEY_ELSE_CHAR] = {
  name => 'script data escape start else',
  state => SCRIPT_DATA_STATE,
  reconsume => 1,
};
## Shared with the entry above (same hash reference).
$Action->[SCRIPT_DATA_ESCAPE_START_DASH_STATE]->[KEY_ELSE_CHAR] = $Action->[SCRIPT_DATA_ESCAPE_START_STATE]->[KEY_ELSE_CHAR];
$Action->[SCRIPT_DATA_ESCAPED_STATE]->[0x002D] = { # -
  name => 'script data escaped -',
  state => SCRIPT_DATA_ESCAPED_DASH_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '-',
};
## Script data escaped / double escaped states.
##
## "-" entries: count consecutive dashes (none -> dash -> dash dash).
$Action->[SCRIPT_DATA_ESCAPED_DASH_STATE]->[0x002D] = { # -
  name => 'script data escaped dash -',
  state => SCRIPT_DATA_ESCAPED_DASH_DASH_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '-',
};
$Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[0x002D] = { # -
  name => 'script data escaped dash dash -',
  emit => CHARACTER_TOKEN,
  emit_data => '-',
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_STATE]->[0x002D] = { # -
  name => 'script data double escaped -',
  state => SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '-',
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE]->[0x002D] = { # -
  ## This label names the "dash" state, so that it differs from the
  ## label of the entry for the plain double escaped state above.
  name => 'script data double escaped dash -',
  state => SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '-',
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE]->[0x002D] = { # -
  name => 'script data double escaped dash dash -',
  emit => CHARACTER_TOKEN,
  emit_data => '-',
};
## "<" entries.
$Action->[SCRIPT_DATA_ESCAPED_STATE]->[0x003C] = { # <
  name => 'script data escaped <',
  state => SCRIPT_DATA_ESCAPED_LT_STATE,
};
$Action->[SCRIPT_DATA_ESCAPED_DASH_STATE]->[0x003C] = { # <
  name => 'script data escaped dash <',
  state => SCRIPT_DATA_ESCAPED_LT_STATE,
};
$Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[0x003C] = { # <
  name => 'script data escaped dash dash <',
  state => SCRIPT_DATA_ESCAPED_LT_STATE,
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_STATE]->[0x003C] = { # <
  name => 'script data double escaped <',
  state => SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '<',
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE]->[0x003C] = { # <
  name => 'script data double escaped dash <',
  state => SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '<',
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE]->[0x003C] = { # <
  name => 'script data double escaped dash dash <',
  state => SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '<',
};
## ">" after two or more dashes ("-->"): back to the script data state.
$Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[0x003E] = { # >
  name => 'script data escaped dash dash >',
  state => SCRIPT_DATA_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '>',
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE]->[0x003E] = $Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[0x003E];
## EOF: one entry shared by all six states.
$Action->[SCRIPT_DATA_ESCAPED_STATE]->[KEY_EOF_CHAR] =
$Action->[SCRIPT_DATA_ESCAPED_DASH_STATE]->[KEY_EOF_CHAR] =
$Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[KEY_EOF_CHAR] =
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_STATE]->[KEY_EOF_CHAR] =
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE]->[KEY_EOF_CHAR] =
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE]->[KEY_EOF_CHAR] = {
  name => 'script data escaped eof',
  error => 'eof in escaped script data', # XXXdocumentation
  state => DATA_STATE,
  reconsume => 1,
};
## NULL: emit U+FFFD and drop any pending dashes.  The escaped and the
## double escaped states need separate entries, because each group
## has to fall back to its own base state: a NULL inside
## double-escaped script data must not leave the double-escaped mode
## (HTML spec, "script data double escaped (dash (dash)) state").
$Action->[SCRIPT_DATA_ESCAPED_STATE]->[0x0000] =
$Action->[SCRIPT_DATA_ESCAPED_DASH_STATE]->[0x0000] =
$Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[0x0000] = {
  name => 'script data escaped null',
  emit => CHARACTER_TOKEN,
  emit_data => "\x{FFFD}",
  error => 'NULL',
  state => SCRIPT_DATA_ESCAPED_STATE,
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_STATE]->[0x0000] =
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE]->[0x0000] =
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE]->[0x0000] = {
  name => 'script data double escaped null',
  emit => CHARACTER_TOKEN,
  emit_data => "\x{FFFD}",
  error => 'NULL',
  state => SCRIPT_DATA_DOUBLE_ESCAPED_STATE,
};
## Anything else: emit the character and return to the base state of
## the group.
$Action->[SCRIPT_DATA_ESCAPED_STATE]->[KEY_ELSE_CHAR] = {
  name => 'script data escaped else',
  emit => CHARACTER_TOKEN,
  state => SCRIPT_DATA_ESCAPED_STATE,
};
$Action->[SCRIPT_DATA_ESCAPED_DASH_STATE]->[KEY_ELSE_CHAR] = {
  name => 'script data escaped dash else',
  emit => CHARACTER_TOKEN,
  state => SCRIPT_DATA_ESCAPED_STATE,
};
$Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[KEY_ELSE_CHAR] = {
  name => 'script data escaped dash dash else',
  emit => CHARACTER_TOKEN,
  state => SCRIPT_DATA_ESCAPED_STATE,
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_STATE]->[KEY_ELSE_CHAR] = {
  name => 'script data double escaped else',
  emit => CHARACTER_TOKEN,
  state => SCRIPT_DATA_DOUBLE_ESCAPED_STATE,
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE]->[KEY_ELSE_CHAR] = {
  name => 'script data double escaped dash else',
  emit => CHARACTER_TOKEN,
  state => SCRIPT_DATA_DOUBLE_ESCAPED_STATE,
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE]->[KEY_ELSE_CHAR] = {
  name => 'script data double escaped dash dash else',
  emit => CHARACTER_TOKEN,
  state => SCRIPT_DATA_DOUBLE_ESCAPED_STATE,
};
## Script data double escape start / end states.  The entries for the
## two states are shared; a space character, ">" or "/" is marked
## |skip| here.
## NOTE(review): presumably these characters are handled by code
## outside the table, which has to inspect the buffer -- confirm in
## the interpreter.
$Action->[SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE]->[KEY_SPACE_CHAR] =
$Action->[SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE]->[KEY_SPACE_CHAR] =
$Action->[SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE]->[0x003E] =
$Action->[SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE]->[0x003E] =
$Action->[SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE]->[0x002F] =
$Action->[SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE]->[0x002F] = {
  name => 'script data double escape start sp>/',
  skip => 1,
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE]->[KEY_ULATIN_CHAR] =
$Action->[SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'script data double escape start uc',
  emit => CHARACTER_TOKEN,
  buffer => {append => 0x0020}, # UC -> lc
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE]->[KEY_LLATIN_CHAR] =
$Action->[SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE]->[KEY_LLATIN_CHAR] = {
  name => 'script data double escape start lc',
  emit => CHARACTER_TOKEN,
  buffer => {append => 0x0000},
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE]->[KEY_ELSE_CHAR] = {
  name => 'script data double escape start else',
  state => SCRIPT_DATA_ESCAPED_STATE,
  reconsume => 1,
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE]->[KEY_ELSE_CHAR] = {
  name => 'script data double escape end else',
  state => SCRIPT_DATA_DOUBLE_ESCAPED_STATE,
  reconsume => 1,
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE]->[0x002F] = { # /
  name => 'script data double escaped lt /',
  buffer => {clear => 1},
  state => SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '/',
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE]->[KEY_ELSE_CHAR] = {
  name => 'script data double escaped lt else',
  state => SCRIPT_DATA_DOUBLE_ESCAPED_STATE,
  reconsume => 1,
};
      ## XML5: Part of the "data state".
$Action->[DATA_MSE1_STATE]->[0x005D] = { # ]
  name => 'data mse1 ]',
  state => DATA_MSE2_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => ']',
};
$Action->[DATA_MSE1_STATE]->[KEY_ELSE_CHAR] = {
  name => 'data mse1 else',
  state => DATA_STATE,
  reconsume => 1,
};
$Action->[DATA_MSE2_STATE]->[0x003E] = { # >
  name => 'data mse2 >',
  error => 'unmatched mse', # XML5: Not a parse error. # XXXdocumentation
  error_delta => 2,
  state => DATA_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '>',
};
$Action->[DATA_MSE2_STATE]->[0x005D] = { # ]
  name => 'data mse2 ]',
  emit => CHARACTER_TOKEN,
  emit_data => ']',
};
$Action->[DATA_MSE2_STATE]->[KEY_ELSE_CHAR] = {
  name => 'data mse2 else',
  state => DATA_STATE,
  reconsume => 1,
};
      ## XML5: "Tag attribute name before state".
## ------ Before attribute name state ------
##
## Table-driven actions for BEFORE_ATTRIBUTE_NAME_STATE.
## |_get_next_token| selects an entry by tokenizer state and input key
## and interprets the hash as follows:
##   error      - report a parse error of that type
##   state      - switch the tokenizer to that state
##   ca         - update the current attribute; |set_name => N| starts
##                a new attribute named chr (input code point + N)
##   emit => '' - return the current tag token
##   reconsume  - do not advance; the next state sees the same input
## In XML mode an |$XMLAction| entry for the same key takes precedence
## over the |$Action| entry.

## Whitespace: no |state| and no |emit|, so the character is skipped.
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[KEY_SPACE_CHAR] = {
  name => 'before attr name sp',
};
## ">": emit the current tag token and go back to the data state.
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x003E] = {
  name => 'before attr name >',
  emit => '',
  state => DATA_STATE,
};
## Uppercase Latin letter (HTML): start an attribute whose name begins
## with the lowercase form (code point + 0x0020).
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'before attr name uc',
  ca => {
    set_name => 0x0020, # UC -> lc
  },
  state => ATTRIBUTE_NAME_STATE,
};
## Uppercase Latin letter (XML): the letter is kept as is.
$XMLAction->[BEFORE_ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'before attr name uc xml',
  ca => {
    set_name => 0x0000,
  },
  state => ATTRIBUTE_NAME_STATE,
};
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x002F] = {
  name => 'before attr name /',
  state => SELF_CLOSING_START_TAG_STATE,
};
## EOF: parse error; the tag token is not emitted (there is no |emit|
## key).  The EOF character is reconsumed so that the data state sees
## it, exactly as the EOF entries of the attribute name and after
## attribute name states do.  Without |reconsume| the tokenizer would
## try to read past EOF and depend on |set_nc| being idempotent there.
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[KEY_EOF_CHAR] = {
  name => 'before attr name eof',
  error => 'unclosed tag',
  state => DATA_STATE,
  reconsume => 1,
};
## '"', "'", "<" and "=" are parse errors but still start an attribute
## name with that character.  The four slots share one hash.
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x0022] =
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x0027] =
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x003C] =
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x003D] = {
  name => q[before attr name "'<=],
  error => 'bad attribute name', ## XML5: Not a parse error.
  ca => {set_name => 0x0000},
  state => ATTRIBUTE_NAME_STATE,
};
## U+0000: parse error; the offset 0xFFFD added to code point 0 makes
## the attribute name start with U+FFFD.
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x0000] = {
  name => 'before attr name null',
  ca => {set_name => 0xFFFD},
  error => 'NULL',
  state => ATTRIBUTE_NAME_STATE,
};
  ## XML5: ":" raises a parse error and is ignored.
## Anything else starts an attribute name with the character as is.
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[KEY_ELSE_CHAR] = {
  name => 'before attr name else',
  ca => {set_name => 0x0000},
  state => ATTRIBUTE_NAME_STATE,
};

  ## XML5: "Tag attribute name state".
## ------ Attribute name state ------
##
## Table-driven actions interpreted by |_get_next_token|.  Keys used
## in this part of the table:
##   error      - report a parse error of that type
##   state      - switch the tokenizer to that state
##   ca         - update the current attribute (|$self->{ca}|):
##                  name => N      append chr (input code point + N)
##                                 to the attribute name
##                  set_name => N  start a new attribute named
##                                 chr (input code point + N)
##                  value => 1     append the input character to the
##                                 value; another true value is
##                                 appended literally
##                  leave => 1     store the attribute on the current
##                                 tag token; a name that is already
##                                 present raises 'duplicate
##                                 attribute' and is discarded
##   emit => '' - return the current tag token
##   reconsume  - do not advance; the next state sees the same input
##   skip       - bypass the table; the state-specific code in
##                |_get_next_token| runs instead
## In XML mode an |$XMLAction| entry for the same key takes precedence
## over the |$Action| entry; XML entries carry an " xml" suffix in
## their |name| so that both variants can be told apart.
$Action->[ATTRIBUTE_NAME_STATE]->[KEY_SPACE_CHAR] = {
  name => 'attr name sp',
  ca => {leave => 1},
  state => AFTER_ATTRIBUTE_NAME_STATE,
};
$Action->[ATTRIBUTE_NAME_STATE]->[0x003D] = {
  name => 'attr name =',
  ca => {leave => 1},
  state => BEFORE_ATTRIBUTE_VALUE_STATE,
};
$Action->[ATTRIBUTE_NAME_STATE]->[0x003E] = {
  name => 'attr name >',
  ca => {leave => 1},
  emit => '',
  state => DATA_STATE,
};
$XMLAction->[ATTRIBUTE_NAME_STATE]->[0x003E] = {
  name => 'attr name > xml',
  error => 'no attr value', ## XML5: Not a parse error. # XXXdocumentation
  ca => {leave => 1},
  emit => '',
  state => DATA_STATE,
};
$Action->[ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'attr name uc',
  ca => {name => 0x0020}, # UC -> lc
};
## XML keeps the case of the letter (offset 0).
$XMLAction->[ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'attr name uc xml',
  ca => {name => 0x0000},
};
$Action->[ATTRIBUTE_NAME_STATE]->[0x002F] = {
  name => 'attr name /',
  ca => {leave => 1},
  state => SELF_CLOSING_START_TAG_STATE,
};
$XMLAction->[ATTRIBUTE_NAME_STATE]->[0x002F] = {
  name => 'attr name / xml',
  error => 'no attr value', ## XML5: Not a parse error. # XXXdocumentation
  ca => {leave => 1},
  state => SELF_CLOSING_START_TAG_STATE,
};
## EOF: parse error; the tag token is not emitted and the EOF
## character is reconsumed in the data state.
$Action->[ATTRIBUTE_NAME_STATE]->[KEY_EOF_CHAR] = {
  name => 'attr name eof',
  error => 'unclosed tag',
  ca => {leave => 1},
  state => DATA_STATE,
  reconsume => 1,
};
## '"', "'" and "<" are parse errors but are still appended to the
## name.  The three slots share one hash.
$Action->[ATTRIBUTE_NAME_STATE]->[0x0022] =
$Action->[ATTRIBUTE_NAME_STATE]->[0x0027] =
$Action->[ATTRIBUTE_NAME_STATE]->[0x003C] = {
  name => q[attr name "'<],
  error => 'bad attribute name', ## XML5: Not a parse error.
  ca => {name => 0x0000},
};
## U+0000: the offset 0xFFFD added to code point 0 appends U+FFFD.
$Action->[ATTRIBUTE_NAME_STATE]->[0x0000] = {
  name => 'attr name null',
  ca => {name => 0xFFFD},
  error => 'NULL',
};
$Action->[ATTRIBUTE_NAME_STATE]->[KEY_ELSE_CHAR] = {
  name => 'attr name else',
  ca => {name => 0x0000},
};

## ------ After attribute name state ------
  ## XML5: "Tag attribute name after state".
$Action->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_SPACE_CHAR] = {
  name => 'after attr name sp',
};
$Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x003D] = {
  name => 'after attr name =',
  state => BEFORE_ATTRIBUTE_VALUE_STATE,
};
$Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x003E] = {
  name => 'after attr name >',
  emit => '',
  state => DATA_STATE,
};
$XMLAction->[AFTER_ATTRIBUTE_NAME_STATE]->[0x003E] = {
  name => 'after attr name > xml',
  error => 'no attr value', ## XML5: Not a parse error. # XXXdocumentation
  emit => '',
  state => DATA_STATE,
};
$Action->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'after attr name uc',
  ca => {set_name => 0x0020}, # UC -> lc
  state => ATTRIBUTE_NAME_STATE,
};
$XMLAction->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'after attr name uc xml',
  ca => {set_name => 0x0000},
  state => ATTRIBUTE_NAME_STATE,
};
$Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x002F] = {
  name => 'after attr name /',
  state => SELF_CLOSING_START_TAG_STATE,
};
$XMLAction->[AFTER_ATTRIBUTE_NAME_STATE]->[0x002F] = {
  name => 'after attr name / xml',
  error => 'no attr value', ## XML5: Not a parse error. # XXXdocumentation
  state => SELF_CLOSING_START_TAG_STATE,
};
$Action->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_EOF_CHAR] = {
  name => 'after attr name eof',
  error => 'unclosed tag',
  state => DATA_STATE,
  reconsume => 1,
};
$Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x0022] =
$Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x0027] =
$Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x003C] = {
  name => q[after attr name "'<],
  error => 'bad attribute name', ## XML5: Not a parse error.
  #error2(xml) => 'no attr value', ## XML5: Not a parse error.
  ca => {set_name => 0x0000},
  state => ATTRIBUTE_NAME_STATE,
};
## U+0000 starts a new attribute whose name begins with U+FFFD.
## (This entry used to carry the label of the "else" entry.)
$Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x0000] = {
  name => q[after attr name null],
  ca => {set_name => 0xFFFD},
  error => 'NULL',
  #error2(xml) => 'no attr value', ## XML5: Not a parse error.
  state => ATTRIBUTE_NAME_STATE,
};
$Action->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_ELSE_CHAR] = {
  name => q[after attr name else],
  ca => {set_name => 0x0000},
  state => ATTRIBUTE_NAME_STATE,
};
$XMLAction->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_ELSE_CHAR] = {
  name => q[after attr name else xml],
  error => 'no attr value', ## XML5: Not a parse error.
  ca => {set_name => 0x0000},
  state => ATTRIBUTE_NAME_STATE,
};

## ------ Before attribute value state ------
  ## XML5: "Tag attribute value before state".
$Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[KEY_SPACE_CHAR] = {
  name => 'before attr value sp',
};
$Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0022] = {
  name => 'before attr value "',
  state => ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE,
};
## "&" is reconsumed in the unquoted state, which handles the
## character reference; XML additionally reports an error.
$XMLAction->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0026] = {
  name => 'before attr value & xml',
  error => 'unquoted attr value', ## XML5: Not a parse error.
  state => ATTRIBUTE_VALUE_UNQUOTED_STATE,
  reconsume => 1,
};
$Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0026] = {
  name => 'before attr value &',
  state => ATTRIBUTE_VALUE_UNQUOTED_STATE,
  reconsume => 1,
};
$Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0027] = {
  name => "before attr value '",
  state => ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE,
};
$Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x003E] = {
  name => 'before attr value >',
  error => 'empty unquoted attribute value',
  emit => '',
  state => DATA_STATE,
};
## EOF: parse error; the tag token is not emitted.  The EOF character
## is reconsumed in the data state like in the other tag states, so
## that the tokenizer never tries to read past EOF here.
$Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[KEY_EOF_CHAR] = {
  name => 'before attr value eof',
  error => 'unclosed tag',
  state => DATA_STATE,
  reconsume => 1,
};
## "<", "=" and "`" are parse errors but still become the first
## character of an unquoted value.  The three slots share one hash.
$Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x003C] =
$Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x003D] =
$Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0060] = {
  name => 'before attr value <=`',
  error => 'bad attribute value', ## XML5: Not a parse error.
  #error2(xml) => 'unquoted attr value', ## XML5: Not a parse error.
  ca => {value => 1},
  state => ATTRIBUTE_VALUE_UNQUOTED_STATE,
};
## U+0000: U+FFFD is appended to the value instead.
$Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0000] = {
  name => 'before attr value null',
  ca => {value => "\x{FFFD}"},
  error => 'NULL',
  #error2(xml) => 'unquoted attr value', ## XML5: Not a parse error.
  state => ATTRIBUTE_VALUE_UNQUOTED_STATE,
};
$XMLAction->[BEFORE_ATTRIBUTE_VALUE_STATE]->[KEY_ELSE_CHAR] = {
  name => 'before attr value else xml',
  error => 'unquoted attr value', ## XML5: Not a parse error. # XXXdocumentation
  ca => {value => 1},
  state => ATTRIBUTE_VALUE_UNQUOTED_STATE,
};
$Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[KEY_ELSE_CHAR] = {
  name => 'before attr value else',
  ca => {value => 1},
  state => ATTRIBUTE_VALUE_UNQUOTED_STATE,
};

## ------ After attribute value (quoted) state ------
$Action->[AFTER_ATTRIBUTE_VALUE_QUOTED_STATE]->[KEY_SPACE_CHAR] = {
  name => 'after attr value quoted sp',
  state => BEFORE_ATTRIBUTE_NAME_STATE,
};
$Action->[AFTER_ATTRIBUTE_VALUE_QUOTED_STATE]->[0x003E] = {
  name => 'after attr value quoted >',
  emit => '',
  state => DATA_STATE,
};
$Action->[AFTER_ATTRIBUTE_VALUE_QUOTED_STATE]->[0x002F] = {
  name => 'after attr value quoted /',
  state => SELF_CLOSING_START_TAG_STATE,
};
$Action->[AFTER_ATTRIBUTE_VALUE_QUOTED_STATE]->[KEY_EOF_CHAR] = {
  name => 'after attr value quoted eof',
  error => 'unclosed tag',
  state => DATA_STATE,
  reconsume => 1,
};
## Anything else directly after the closing quote is a parse error
## and is reconsumed as the start of the next attribute.
$Action->[AFTER_ATTRIBUTE_VALUE_QUOTED_STATE]->[KEY_ELSE_CHAR] = {
  name => 'after attr value quoted else',
  error => 'no space between attributes',
  state => BEFORE_ATTRIBUTE_NAME_STATE,
  reconsume => 1,
};

## ------ Self-closing start tag state ------
## ">" bypasses the table (|skip|).
## NOTE(review): presumably a SELF_CLOSING_START_TAG_STATE branch in
## |_get_next_token| handles it; that branch is not shown here.
$Action->[SELF_CLOSING_START_TAG_STATE]->[0x003E] = {
  name => 'self closing start tag >',
  skip => 1,
};
$Action->[SELF_CLOSING_START_TAG_STATE]->[KEY_EOF_CHAR] = {
  name => 'self closing start tag eof',
  error => 'unclosed tag',
  state => DATA_STATE, ## XML5: "Tag attribute name before state".
  reconsume => 1,
};
$Action->[SELF_CLOSING_START_TAG_STATE]->[KEY_ELSE_CHAR] = {
  name => 'self closing start tag else',
  error => 'nestc', # XXX This error type is wrong.
  state => BEFORE_ATTRIBUTE_NAME_STATE,
  reconsume => 1,
};

## ------ Markup declaration hyphen state ------
## A second "-": |ct| creates a new, empty comment token as the
## current token.  With |delta => 3| its position is |line_prev|,
## column |column_prev - 3 + 1|.
$Action->[MD_HYPHEN_STATE]->[0x002D] = {
  name => 'md hyphen -',
  ct => {type => COMMENT_TOKEN, data => '', delta => 3},
  state => COMMENT_START_STATE, ## XML5: "comment state".
1277}; 1278$Action->[MD_HYPHEN_STATE]->[KEY_ELSE_CHAR] = { 1279 name => 'md hyphen else', 1280 error => 'bogus comment', 1281 error_delta => 3, 1282 state => BOGUS_COMMENT_STATE, 1283 reconsume => 1, 1284 ct => {type => COMMENT_TOKEN, data => '-', delta => 3}, 1285}; 1286 1287my $c_to_key = []; 1288$c_to_key->[255] = KEY_EOF_CHAR; # EOF_CHAR 1289$c_to_key->[$_] = $_ for 0x0000..0x007F; 1290$c_to_key->[$_] = KEY_SPACE_CHAR for keys %$is_space; 1291$c_to_key->[$_] = KEY_ULATIN_CHAR for 0x0041..0x005A; 1292$c_to_key->[$_] = KEY_LLATIN_CHAR for 0x0061..0x007A; 1293 1294sub _get_next_token ($) { 1295 my $self = shift; 1296 1297 if ($self->{self_closing}) { 1298 ## NOTE: The |$self->{self_closing}| flag can never be set to 1299 ## tokens except for start tag tokens. A start tag token is 1300 ## always set to |$self->{ct}| before it is emitted. 1301 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct}); 1302 delete $self->{self_closing}; 1303 } 1304 1305 if (@{$self->{token}}) { 1306 $self->{self_closing} = $self->{token}->[0]->{self_closing}; 1307 return shift @{$self->{token}}; 1308 } 1309 1310 A: { 1311 my $nc = $self->{nc}; 1312 my $state = $self->{state}; 1313 1314 1315 1316 my $c = $nc > 0x007F ? 
KEY_ELSE_CHAR : $c_to_key->[$nc]; 1317 my $action = $Action->[$state]->[$c] || $Action->[$state]->[KEY_ELSE_CHAR]; 1318 if ($self->{is_xml}) { 1319 $action = $XMLAction->[$state]->[$c] 1320 || $Action->[$state]->[$c] 1321 || $XMLAction->[$state]->[KEY_ELSE_CHAR] 1322 || $Action->[$state]->[KEY_ELSE_CHAR]; 1323 } 1324 1325 if ($action and not $action->{skip}) { 1326 1327 1328 if (defined $action->{error}) { 1329 if ($action->{error_delta}) { 1330 $self->{parse_error}->(level => $self->{level}->{must}, type => $action->{error}, 1331 line => $self->{line_prev}, 1332 column => $self->{column_prev} - $action->{error_delta} + 1); 1333 } else { 1334 $self->{parse_error}->(level => $self->{level}->{must}, type => $action->{error}); 1335 } 1336 } 1337 1338 if (defined $action->{state}) { 1339 $self->{state} = $action->{state}; 1340 1341 if ($action->{state_set}) { 1342 for (keys %{$action->{state_set}}) { 1343 $self->{$_} = $action->{state_set}->{$_}; 1344 } 1345 } 1346 } 1347 1348 if (my $act = $action->{ct}) { 1349 if (defined $act->{type}) { 1350 $self->{ct} = {type => $act->{type}, 1351 tag_name => '', data => $act->{data}}; 1352 if ($act->{delta}) { 1353 $self->{ct}->{line} = $self->{line_prev}; 1354 $self->{ct}->{column} = $self->{column_prev} - $act->{delta} + 1; 1355 } else { 1356 $self->{ct}->{line} = $self->{line}; 1357 $self->{ct}->{column} = $self->{column}; 1358 } 1359 } 1360 1361 if (defined $act->{append_tag_name}) { 1362 $self->{ct}->{tag_name} .= chr ($nc + $act->{append_tag_name}); 1363 } 1364 } 1365 1366 if (my $aca = $action->{ca}) { 1367 if ($aca->{value}) { 1368 $self->{ca}->{value} .= $aca->{value} ne '1' ? 
$aca->{value} : chr $nc; 1369 } elsif (defined $aca->{name}) { 1370 $self->{ca}->{name} .= chr ($nc + $aca->{name}); 1371 } elsif (defined $aca->{set_name}) { 1372 $self->{ca} = { 1373 name => chr ($nc + $aca->{set_name}), 1374 value => '', 1375 line => $self->{line}, column => $self->{column}, 1376 }; 1377 } elsif ($aca->{leave}) { 1378 if (exists $self->{ct}->{attributes}->{$self->{ca}->{name}}) { 1379 1380 $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column}); 1381 ## Discard $self->{ca}. 1382 } else { 1383 1384 $self->{ct}->{attributes}->{$self->{ca}->{name}} = $self->{ca}; 1385 $self->{ca}->{index} = ++$self->{ct}->{last_index}; 1386 } 1387 } 1388 } 1389 1390 if (defined $action->{buffer}) { 1391 $self->{kwd} = '' if $action->{buffer}->{clear}; 1392 $self->{kwd} .= chr ($nc + $action->{buffer}->{append}) 1393 if defined $action->{buffer}->{append}; 1394 1395 1396 } 1397 1398 if (defined $action->{emit}) { 1399 if ($action->{emit} eq '') { 1400 if ($self->{ct}->{type} == START_TAG_TOKEN) { 1401 1402 $self->{last_stag_name} = $self->{ct}->{tag_name}; 1403 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) { 1404 if ($self->{ct}->{attributes}) { 1405 1406 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute'); 1407 } else { 1408 1409 } 1410 } else { 1411 die "$0: $self->{ct}->{type}: Unknown token type"; 1412 } 1413 1414 if ($action->{reconsume}) { 1415 # 1416 } else { 1417 1418 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 1419 $self->{line_prev} = $self->{line}; 1420 $self->{column_prev} = $self->{column}; 1421 $self->{column}++; 1422 $self->{nc} 1423 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 1424 } else { 1425 $self->{set_nc}->($self); 1426 } 1427 1428 } 1429 return ($self->{ct}); 1430 } else { 1431 my $token = {type => $action->{emit}}; 1432 if (defined 
$action->{emit_data}) { 1433 $token->{data} = $action->{emit_data}; 1434 if ($action->{emit_data_append}) { 1435 $token->{data} .= chr $nc; 1436 } 1437 } elsif ($action->{emit} == CHARACTER_TOKEN) { 1438 $token->{data} .= chr $nc; 1439 } 1440 if ($action->{emit_delta}) { 1441 $token->{line} = $self->{line_prev}; 1442 $token->{column} = $self->{column_prev} - $action->{emit_delta} + 1; 1443 } else { 1444 $token->{line} = $self->{line}; 1445 $token->{column} = $self->{column}; 1446 } 1447 if (defined $action->{emit_data_read_until}) { 1448 $self->{read_until}->($token->{data}, 1449 $action->{emit_data_read_until}, 1450 length $token->{data}); 1451 } 1452 1453 if ($action->{reconsume}) { 1454 # 1455 } else { 1456 1457 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 1458 $self->{line_prev} = $self->{line}; 1459 $self->{column_prev} = $self->{column}; 1460 $self->{column}++; 1461 $self->{nc} 1462 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 1463 } else { 1464 $self->{set_nc}->($self); 1465 } 1466 1467 } 1468 return ($token); 1469 } 1470 } else { 1471 if ($action->{reconsume}) { 1472 # 1473 } else { 1474 1475 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 1476 $self->{line_prev} = $self->{line}; 1477 $self->{column_prev} = $self->{column}; 1478 $self->{column}++; 1479 $self->{nc} 1480 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 1481 } else { 1482 $self->{set_nc}->($self); 1483 } 1484 1485 } 1486 } 1487 1488 redo A; 1489 } 1490 1491 if ({ 1492 (RCDATA_END_TAG_OPEN_STATE) => 1, 1493 (RAWTEXT_END_TAG_OPEN_STATE) => 1, 1494 (SCRIPT_DATA_END_TAG_OPEN_STATE) => 1, 1495 (SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE) => 1, 1496 }->{$state}) { 1497 ## This switch-case implements "RCDATA end tag open state", 1498 ## "RAWTEXT end tag open state", "script data end tag open 1499 ## state", "RCDATA end tag name state", "RAWTEXT end tag name 1500 ## state", and "script end tag name state" jointly with the 1501 ## 
implementation of the "tag name" state. 1502 1503 my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</" 1504 1505 if (defined $self->{last_stag_name}) { 1506 # 1507 } else { 1508 ## No start tag token has ever been emitted 1509 ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>. 1510 1511 $self->{state} = { 1512 (RCDATA_END_TAG_OPEN_STATE) => RCDATA_STATE, 1513 (RAWTEXT_END_TAG_OPEN_STATE) => RAWTEXT_STATE, 1514 (SCRIPT_DATA_END_TAG_OPEN_STATE) => SCRIPT_DATA_STATE, 1515 (SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE) 1516 => SCRIPT_DATA_ESCAPED_STATE, 1517 }->{$state} or die "${state}'s next state not found"; 1518 ## Reconsume. 1519 return ({type => CHARACTER_TOKEN, data => '</', 1520 line => $l, column => $c}); 1521 redo A; 1522 } 1523 1524 my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1; 1525 if (length $ch) { 1526 my $CH = $ch; 1527 $ch =~ tr/a-z/A-Z/; 1528 my $nch = chr $nc; 1529 if ($nch eq $ch or $nch eq $CH) { 1530 1531 ## Stay in the state. 1532 $self->{kwd} .= $nch; 1533 1534 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 1535 $self->{line_prev} = $self->{line}; 1536 $self->{column_prev} = $self->{column}; 1537 $self->{column}++; 1538 $self->{nc} 1539 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 1540 } else { 1541 $self->{set_nc}->($self); 1542 } 1543 1544 redo A; 1545 } else { 1546 1547 $self->{state} = { 1548 (RCDATA_END_TAG_OPEN_STATE) => RCDATA_STATE, 1549 (RAWTEXT_END_TAG_OPEN_STATE) => RAWTEXT_STATE, 1550 (SCRIPT_DATA_END_TAG_OPEN_STATE) => SCRIPT_DATA_STATE, 1551 (SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE) 1552 => SCRIPT_DATA_ESCAPED_STATE, 1553 }->{$state} or die "${state}'s next state not found"; 1554 ## Reconsume. 1555 return ({type => CHARACTER_TOKEN, 1556 data => '</' . 
$self->{kwd}, 1557 line => $self->{line_prev}, 1558 column => $self->{column_prev} - 1 - length $self->{kwd}, 1559 }); 1560 redo A; 1561 } 1562 } else { # after "</{tag-name}" 1563 unless ($is_space->{$nc} or 1564 { 1565 0x003E => 1, # > 1566 0x002F => 1, # / 1567 }->{$nc}) { 1568 1569 ## Reconsume. 1570 $self->{state} = { 1571 (RCDATA_END_TAG_OPEN_STATE) => RCDATA_STATE, 1572 (RAWTEXT_END_TAG_OPEN_STATE) => RAWTEXT_STATE, 1573 (SCRIPT_DATA_END_TAG_OPEN_STATE) => SCRIPT_DATA_STATE, 1574 (SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE) 1575 => SCRIPT_DATA_ESCAPED_STATE, 1576 }->{$self->{state}} or die "${state}'s next state not found"; 1577 return ({type => CHARACTER_TOKEN, 1578 data => '</' . $self->{kwd}, 1579 line => $self->{line_prev}, 1580 column => $self->{column_prev} - 1 - length $self->{kwd}, 1581 }); 1582 redo A; 1583 } else { 1584 1585 $self->{ct} 1586 = {type => END_TAG_TOKEN, 1587 tag_name => $self->{last_stag_name}, 1588 line => $self->{line_prev}, 1589 column => $self->{column_prev} - 1 - length $self->{kwd}}; 1590 $self->{state} = TAG_NAME_STATE; 1591 ## Reconsume. 1592 redo A; 1593 } 1594 } 1595 } elsif ($state == SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE or 1596 $state == SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE) { 1597 if ($is_space->{$nc} or 1598 $nc == 0x002F or # / 1599 $nc == 0x003E) { # > 1600 my $token = {type => CHARACTER_TOKEN, 1601 data => chr $nc, 1602 line => $self->{line}, column => $self->{column}}; 1603 if ($state == SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE) { 1604 $self->{state} = $self->{kwd} eq 'script' # "temporary buffer" 1605 ? SCRIPT_DATA_DOUBLE_ESCAPED_STATE 1606 : SCRIPT_DATA_ESCAPED_STATE; 1607 } else { 1608 $self->{state} = $self->{kwd} eq 'script' # "temporary buffer" 1609 ? 
SCRIPT_DATA_ESCAPED_STATE 1610 : SCRIPT_DATA_DOUBLE_ESCAPED_STATE; 1611 } 1612 1613 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 1614 $self->{line_prev} = $self->{line}; 1615 $self->{column_prev} = $self->{column}; 1616 $self->{column}++; 1617 $self->{nc} 1618 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 1619 } else { 1620 $self->{set_nc}->($self); 1621 } 1622 1623 return ($token); 1624 redo A; 1625 } else { 1626 die "$state/$nc is implemented"; 1627 } 1628 } elsif ($state == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) { 1629 ## XML5: "Tag attribute value double quoted state" and "DOCTYPE 1630 ## ATTLIST attribute value double quoted state". 1631 1632 if ($nc == 0x0022) { # " 1633 if ($self->{ct}->{type} == ATTLIST_TOKEN) { 1634 1635 ## XML5: "DOCTYPE ATTLIST name after state". 1636 push @{$self->{ct}->{attrdefs}}, $self->{ca}; 1637 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE; 1638 } else { 1639 1640 ## XML5: "Tag attribute name before state". 1641 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; 1642 } 1643 1644 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 1645 $self->{line_prev} = $self->{line}; 1646 $self->{column_prev} = $self->{column}; 1647 $self->{column}++; 1648 $self->{nc} 1649 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 1650 } else { 1651 $self->{set_nc}->($self); 1652 } 1653 1654 redo A; 1655 } elsif ($nc == 0x0026) { # & 1656 1657 ## XML5: Not defined yet. 1658 1659 ## NOTE: In the spec, the tokenizer is switched to the 1660 ## "entity in attribute value state". In this implementation, the 1661 ## tokenizer is switched to the |ENTITY_STATE|, which is an 1662 ## implementation of the "consume a character reference" algorithm. 
1663 $self->{prev_state} = $state; 1664 $self->{entity_add} = 0x0022; # " 1665 $self->{state} = ENTITY_STATE; 1666 1667 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 1668 $self->{line_prev} = $self->{line}; 1669 $self->{column_prev} = $self->{column}; 1670 $self->{column}++; 1671 $self->{nc} 1672 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 1673 } else { 1674 $self->{set_nc}->($self); 1675 } 1676 1677 redo A; 1678 } elsif ($self->{is_xml} and 1679 $is_space->{$nc}) { 1680 1681 $self->{ca}->{value} .= ' '; 1682 ## Stay in the state. 1683 1684 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 1685 $self->{line_prev} = $self->{line}; 1686 $self->{column_prev} = $self->{column}; 1687 $self->{column}++; 1688 $self->{nc} 1689 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 1690 } else { 1691 $self->{set_nc}->($self); 1692 } 1693 1694 redo A; 1695 } elsif ($nc == -1) { 1696 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value'); 1697 if ($self->{ct}->{type} == START_TAG_TOKEN) { 1698 1699 $self->{last_stag_name} = $self->{ct}->{tag_name}; 1700 1701 $self->{state} = DATA_STATE; 1702 ## reconsume 1703 return ($self->{ct}); # start tag 1704 redo A; 1705 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) { 1706 if ($self->{ct}->{attributes}) { 1707 1708 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute'); 1709 } else { 1710 ## NOTE: This state should never be reached. 1711 1712 } 1713 1714 $self->{state} = DATA_STATE; 1715 ## reconsume 1716 1717 ## Discard the token. 1718 #return ($self->{ct}); # end tag 1719 1720 redo A; 1721 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) { 1722 ## XML5: No parse error above; not defined yet. 1723 push @{$self->{ct}->{attrdefs}}, $self->{ca}; 1724 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 1725 ## Reconsume. 1726 1727 ## Discard the token. 
1728 #return ($self->{ct}); # ATTLIST 1729 1730 redo A; 1731 } else { 1732 die "$0: $self->{ct}->{type}: Unknown token type"; 1733 } 1734 } elsif ($nc == 0x0000) { 1735 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); 1736 $self->{ca}->{value} .= "\x{FFFD}"; 1737 ## Stay in the state 1738 1739 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 1740 $self->{line_prev} = $self->{line}; 1741 $self->{column_prev} = $self->{column}; 1742 $self->{column}++; 1743 $self->{nc} 1744 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 1745 } else { 1746 $self->{set_nc}->($self); 1747 } 1748 1749 redo A; 1750 } else { 1751 ## XML5 [ATTLIST]: Not defined yet. 1752 if ($self->{is_xml} and $nc == 0x003C) { # < 1753 1754 ## XML5: Not a parse error. 1755 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type 1756 } else { 1757 1758 } 1759 $self->{ca}->{value} .= chr ($nc); 1760 $self->{read_until}->($self->{ca}->{value}, 1761 qq[\x00"&<\x09\x0C\x20], 1762 length $self->{ca}->{value}); 1763 1764 ## Stay in the state 1765 1766 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 1767 $self->{line_prev} = $self->{line}; 1768 $self->{column_prev} = $self->{column}; 1769 $self->{column}++; 1770 $self->{nc} 1771 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 1772 } else { 1773 $self->{set_nc}->($self); 1774 } 1775 1776 redo A; 1777 } 1778 } elsif ($state == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) { 1779 ## XML5: "Tag attribute value single quoted state" and "DOCTYPE 1780 ## ATTLIST attribute value single quoted state". 1781 1782 if ($nc == 0x0027) { # ' 1783 if ($self->{ct}->{type} == ATTLIST_TOKEN) { 1784 1785 ## XML5: "DOCTYPE ATTLIST name after state". 1786 push @{$self->{ct}->{attrdefs}}, $self->{ca}; 1787 $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE; 1788 } else { 1789 1790 ## XML5: "Before attribute name state" (sic). 
1791 $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; 1792 } 1793 1794 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 1795 $self->{line_prev} = $self->{line}; 1796 $self->{column_prev} = $self->{column}; 1797 $self->{column}++; 1798 $self->{nc} 1799 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 1800 } else { 1801 $self->{set_nc}->($self); 1802 } 1803 1804 redo A; 1805 } elsif ($nc == 0x0026) { # & 1806 1807 ## XML5: Not defined yet. 1808 1809 ## NOTE: In the spec, the tokenizer is switched to the 1810 ## "entity in attribute value state". In this implementation, the 1811 ## tokenizer is switched to the |ENTITY_STATE|, which is an 1812 ## implementation of the "consume a character reference" algorithm. 1813 $self->{entity_add} = 0x0027; # ' 1814 $self->{prev_state} = $state; 1815 $self->{state} = ENTITY_STATE; 1816 1817 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 1818 $self->{line_prev} = $self->{line}; 1819 $self->{column_prev} = $self->{column}; 1820 $self->{column}++; 1821 $self->{nc} 1822 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 1823 } else { 1824 $self->{set_nc}->($self); 1825 } 1826 1827 redo A; 1828 } elsif ($self->{is_xml} and 1829 $is_space->{$nc}) { 1830 1831 $self->{ca}->{value} .= ' '; 1832 ## Stay in the state. 1833 1834 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 1835 $self->{line_prev} = $self->{line}; 1836 $self->{column_prev} = $self->{column}; 1837 $self->{column}++; 1838 $self->{nc} 1839 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 1840 } else { 1841 $self->{set_nc}->($self); 1842 } 1843 1844 redo A; 1845 } elsif ($nc == -1) { 1846 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value'); 1847 if ($self->{ct}->{type} == START_TAG_TOKEN) { 1848 1849 $self->{last_stag_name} = $self->{ct}->{tag_name}; 1850 1851 $self->{state} = DATA_STATE; 1852 ## reconsume 1853 1854 ## Discard the token. 
1855 #return ($self->{ct}); # start tag 1856 1857 redo A; 1858 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) { 1859 if ($self->{ct}->{attributes}) { 1860 1861 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute'); 1862 } else { 1863 ## NOTE: This state should never be reached. 1864 1865 } 1866 1867 $self->{state} = DATA_STATE; 1868 ## reconsume 1869 1870 ## Discard the token. 1871 #return ($self->{ct}); # end tag 1872 1873 redo A; 1874 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) { 1875 ## XML5: No parse error above; not defined yet. 1876 push @{$self->{ct}->{attrdefs}}, $self->{ca}; 1877 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 1878 ## Reconsume. 1879 1880 ## Discard the token. 1881 #return ($self->{ct}); # ATTLIST 1882 1883 redo A; 1884 } else { 1885 die "$0: $self->{ct}->{type}: Unknown token type"; 1886 } 1887 } elsif ($nc == 0x0000) { 1888 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); 1889 $self->{ca}->{value} .= "\x{FFFD}"; 1890 ## Stay in the state 1891 1892 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 1893 $self->{line_prev} = $self->{line}; 1894 $self->{column_prev} = $self->{column}; 1895 $self->{column}++; 1896 $self->{nc} 1897 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 1898 } else { 1899 $self->{set_nc}->($self); 1900 } 1901 1902 redo A; 1903 } else { 1904 ## XML5 [ATTLIST]: Not defined yet. 1905 if ($self->{is_xml} and $nc == 0x003C) { # < 1906 1907 ## XML5: Not a parse error. 
1908 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type 1909 } else { 1910 1911 } 1912 $self->{ca}->{value} .= chr ($nc); 1913 $self->{read_until}->($self->{ca}->{value}, 1914 qq[\x00'&<\x09\x0C\x20], 1915 length $self->{ca}->{value}); 1916 1917 ## Stay in the state 1918 1919 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 1920 $self->{line_prev} = $self->{line}; 1921 $self->{column_prev} = $self->{column}; 1922 $self->{column}++; 1923 $self->{nc} 1924 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 1925 } else { 1926 $self->{set_nc}->($self); 1927 } 1928 1929 redo A; 1930 } 1931 } elsif ($state == ATTRIBUTE_VALUE_UNQUOTED_STATE) { 1932 ## XML5: "Tag attribute value unquoted state". 1933 1934 if ($is_space->{$nc}) { 1935 if ($self->{ct}->{type} == ATTLIST_TOKEN) { 1936 1937 push @{$self->{ct}->{attrdefs}}, $self->{ca}; 1938 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE; 1939 } else { 1940 1941 ## XML5: "Tag attribute name before state". 1942 $self->{state} = BEFORE_ATTRIBUTE_NAME_STATE; 1943 } 1944 1945 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 1946 $self->{line_prev} = $self->{line}; 1947 $self->{column_prev} = $self->{column}; 1948 $self->{column}++; 1949 $self->{nc} 1950 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 1951 } else { 1952 $self->{set_nc}->($self); 1953 } 1954 1955 redo A; 1956 } elsif ($nc == 0x0026) { # & 1957 1958 1959 ## XML5: Not defined yet. 1960 1961 ## NOTE: In the spec, the tokenizer is switched to the 1962 ## "character reference in attribute value state". In this 1963 ## implementation, the tokenizer is switched to the 1964 ## |ENTITY_STATE|, which is an implementation of the "consume 1965 ## a character reference" algorithm. 
1966 $self->{entity_add} = 0x003E; # > 1967 $self->{prev_state} = $state; 1968 $self->{state} = ENTITY_STATE; 1969 1970 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 1971 $self->{line_prev} = $self->{line}; 1972 $self->{column_prev} = $self->{column}; 1973 $self->{column}++; 1974 $self->{nc} 1975 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 1976 } else { 1977 $self->{set_nc}->($self); 1978 } 1979 1980 redo A; 1981 } elsif ($nc == 0x003E) { # > 1982 if ($self->{ct}->{type} == START_TAG_TOKEN) { 1983 1984 $self->{last_stag_name} = $self->{ct}->{tag_name}; 1985 1986 $self->{state} = DATA_STATE; 1987 1988 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 1989 $self->{line_prev} = $self->{line}; 1990 $self->{column_prev} = $self->{column}; 1991 $self->{column}++; 1992 $self->{nc} 1993 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 1994 } else { 1995 $self->{set_nc}->($self); 1996 } 1997 1998 return ($self->{ct}); # start tag 1999 redo A; 2000 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) { 2001 if ($self->{ct}->{attributes}) { 2002 2003 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute'); 2004 } else { 2005 ## NOTE: This state should never be reached. 
2006 2007 } 2008 2009 $self->{state} = DATA_STATE; 2010 2011 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2012 $self->{line_prev} = $self->{line}; 2013 $self->{column_prev} = $self->{column}; 2014 $self->{column}++; 2015 $self->{nc} 2016 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2017 } else { 2018 $self->{set_nc}->($self); 2019 } 2020 2021 return ($self->{ct}); # end tag 2022 redo A; 2023 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) { 2024 push @{$self->{ct}->{attrdefs}}, $self->{ca}; 2025 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 2026 2027 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2028 $self->{line_prev} = $self->{line}; 2029 $self->{column_prev} = $self->{column}; 2030 $self->{column}++; 2031 $self->{nc} 2032 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2033 } else { 2034 $self->{set_nc}->($self); 2035 } 2036 2037 return ($self->{ct}); # ATTLIST 2038 redo A; 2039 } else { 2040 die "$0: $self->{ct}->{type}: Unknown token type"; 2041 } 2042 } elsif ($nc == -1) { 2043 if ($self->{ct}->{type} == START_TAG_TOKEN) { 2044 2045 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag'); 2046 $self->{last_stag_name} = $self->{ct}->{tag_name}; 2047 2048 $self->{state} = DATA_STATE; 2049 ## reconsume 2050 2051 ## Discard the token. 2052 #return ($self->{ct}); # start tag 2053 2054 redo A; 2055 } elsif ($self->{ct}->{type} == END_TAG_TOKEN) { 2056 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag'); 2057 if ($self->{ct}->{attributes}) { 2058 2059 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute'); 2060 } else { 2061 ## NOTE: This state should never be reached. 2062 2063 } 2064 2065 $self->{state} = DATA_STATE; 2066 ## reconsume 2067 2068 ## Discard the token. 
2069 #return ($self->{ct}); # end tag 2070 2071 redo A; 2072 } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) { 2073 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 2074 push @{$self->{ct}->{attrdefs}}, $self->{ca}; 2075 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 2076 ## Reconsume. 2077 2078 ## Discard the token. 2079 #return ($self->{ct}); # ATTLIST 2080 2081 redo A; 2082 } else { 2083 die "$0: $self->{ct}->{type}: Unknown token type"; 2084 } 2085 } elsif ($nc == 0x0000) { 2086 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); 2087 $self->{ca}->{value} .= "\x{FFFD}"; 2088 ## Stay in the state 2089 2090 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2091 $self->{line_prev} = $self->{line}; 2092 $self->{column_prev} = $self->{column}; 2093 $self->{column}++; 2094 $self->{nc} 2095 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2096 } else { 2097 $self->{set_nc}->($self); 2098 } 2099 2100 redo A; 2101 } else { 2102 if ({ 2103 0x0022 => 1, # " 2104 0x0027 => 1, # ' 2105 0x003D => 1, # = 2106 0x003C => 1, # < 2107 0x0060 => 1, # ` 2108 }->{$nc}) { 2109 2110 ## XML5: Not a parse error. 2111 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value'); 2112 } else { 2113 2114 } 2115 $self->{ca}->{value} .= chr ($nc); 2116 $self->{read_until}->($self->{ca}->{value}, 2117 qq[\x00"'=&` \x09\x0C<>], 2118 length $self->{ca}->{value}); 2119 2120 ## Stay in the state 2121 2122 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2123 $self->{line_prev} = $self->{line}; 2124 $self->{column_prev} = $self->{column}; 2125 $self->{column}++; 2126 $self->{nc} 2127 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2128 } else { 2129 $self->{set_nc}->($self); 2130 } 2131 2132 redo A; 2133 } 2134 } elsif ($state == SELF_CLOSING_START_TAG_STATE) { 2135 ## XML5: "Empty tag state". 
2136 2137 if ($nc == 0x003E) { # > 2138 if ($self->{ct}->{type} == END_TAG_TOKEN) { 2139 2140 $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct}); 2141 ## XXX: Different type than slash in start tag 2142 if ($self->{ct}->{attributes}) { 2143 2144 $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute'); 2145 } else { 2146 2147 } 2148 ## XXX: Test |<title></title/>| 2149 } else { 2150 2151 $self->{self_closing} = 1; 2152 } 2153 2154 $self->{state} = DATA_STATE; 2155 2156 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2157 $self->{line_prev} = $self->{line}; 2158 $self->{column_prev} = $self->{column}; 2159 $self->{column}++; 2160 $self->{nc} 2161 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2162 } else { 2163 $self->{set_nc}->($self); 2164 } 2165 2166 2167 return ($self->{ct}); # start tag or end tag 2168 2169 redo A; 2170 } else { 2171 die "$state/$nc is implemented"; 2172 } 2173 } elsif ($state == BOGUS_COMMENT_STATE) { 2174 ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state". 2175 2176 ## NOTE: Unlike spec's "bogus comment state", this implementation 2177 ## consumes characters one-by-one basis. 
2178 2179 if ($nc == 0x003E) { # > 2180 if ($self->{in_subset}) { 2181 2182 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 2183 } else { 2184 2185 $self->{state} = DATA_STATE; 2186 } 2187 2188 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2189 $self->{line_prev} = $self->{line}; 2190 $self->{column_prev} = $self->{column}; 2191 $self->{column}++; 2192 $self->{nc} 2193 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2194 } else { 2195 $self->{set_nc}->($self); 2196 } 2197 2198 2199 return ($self->{ct}); # comment 2200 redo A; 2201 } elsif ($nc == -1) { 2202 if ($self->{in_subset}) { 2203 2204 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 2205 } else { 2206 2207 $self->{state} = DATA_STATE; 2208 } 2209 ## reconsume 2210 2211 return ($self->{ct}); # comment 2212 redo A; 2213 } elsif ($nc == 0x0000) { 2214 $self->{ct}->{data} .= "\x{FFFD}"; # comment 2215 ## Stay in the state. 2216 2217 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2218 $self->{line_prev} = $self->{line}; 2219 $self->{column_prev} = $self->{column}; 2220 $self->{column}++; 2221 $self->{nc} 2222 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2223 } else { 2224 $self->{set_nc}->($self); 2225 } 2226 2227 redo A; 2228 } else { 2229 2230 $self->{ct}->{data} .= chr ($nc); # comment 2231 $self->{read_until}->($self->{ct}->{data}, 2232 qq[\x00>], 2233 length $self->{ct}->{data}); 2234 2235 ## Stay in the state. 2236 2237 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2238 $self->{line_prev} = $self->{line}; 2239 $self->{column_prev} = $self->{column}; 2240 $self->{column}++; 2241 $self->{nc} 2242 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2243 } else { 2244 $self->{set_nc}->($self); 2245 } 2246 2247 redo A; 2248 } 2249 } elsif ($state == MARKUP_DECLARATION_OPEN_STATE) { 2250 ## XML5: "Markup declaration state". 
2251 2252 if ($nc == 0x002D) { # - 2253 2254 $self->{state} = MD_HYPHEN_STATE; 2255 2256 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2257 $self->{line_prev} = $self->{line}; 2258 $self->{column_prev} = $self->{column}; 2259 $self->{column}++; 2260 $self->{nc} 2261 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2262 } else { 2263 $self->{set_nc}->($self); 2264 } 2265 2266 redo A; 2267 } elsif ($nc == 0x0044 or # D 2268 $nc == 0x0064) { # d 2269 ## ASCII case-insensitive. 2270 2271 $self->{state} = MD_DOCTYPE_STATE; 2272 $self->{kwd} = chr $nc; 2273 2274 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2275 $self->{line_prev} = $self->{line}; 2276 $self->{column_prev} = $self->{column}; 2277 $self->{column}++; 2278 $self->{nc} 2279 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2280 } else { 2281 $self->{set_nc}->($self); 2282 } 2283 2284 redo A; 2285# $nc == 0x005B) { # [ 2286 2287 $self->{state} = MD_CDATA_STATE; 2288 $self->{kwd} = '['; 2289 2290 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2291 $self->{line_prev} = $self->{line}; 2292 $self->{column_prev} = $self->{column}; 2293 $self->{column}++; 2294 $self->{nc} 2295 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2296 } else { 2297 $self->{set_nc}->($self); 2298 } 2299 2300 redo A; 2301 } else { 2302 2303 } 2304 2305 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment', 2306 line => $self->{line_prev}, 2307 column => $self->{column_prev} - 1); 2308 ## Reconsume. 2309 $self->{state} = BOGUS_COMMENT_STATE; 2310 $self->{ct} = {type => COMMENT_TOKEN, data => '', 2311 line => $self->{line_prev}, 2312 column => $self->{column_prev} - 1, 2313 }; 2314 redo A; 2315 } elsif ($state == MD_DOCTYPE_STATE) { 2316 ## ASCII case-insensitive. 
2317 if ($nc == [ 2318 undef, 2319 0x004F, # O 2320 0x0043, # C 2321 0x0054, # T 2322 0x0059, # Y 2323 0x0050, # P 2324 NEVER_CHAR, # (E) 2325 ]->[length $self->{kwd}] or 2326 $nc == [ 2327 undef, 2328 0x006F, # o 2329 0x0063, # c 2330 0x0074, # t 2331 0x0079, # y 2332 0x0070, # p 2333 NEVER_CHAR, # (e) 2334 ]->[length $self->{kwd}]) { 2335 2336 ## Stay in the state. 2337 $self->{kwd} .= chr $nc; 2338 2339 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2340 $self->{line_prev} = $self->{line}; 2341 $self->{column_prev} = $self->{column}; 2342 $self->{column}++; 2343 $self->{nc} 2344 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2345 } else { 2346 $self->{set_nc}->($self); 2347 } 2348 2349 redo A; 2350 } elsif ((length $self->{kwd}) == 6 and 2351 ($nc == 0x0045 or # E 2352 $nc == 0x0065)) { # e 2353 if ($self->{is_xml} and 2354 ($self->{kwd} ne 'DOCTYP' or $nc == 0x0065)) { 2355 2356 ## XML5: case-sensitive. 2357 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO 2358 text => 'DOCTYPE', 2359 line => $self->{line_prev}, 2360 column => $self->{column_prev} - 5); 2361 } else { 2362 2363 } 2364 $self->{state} = DOCTYPE_STATE; 2365 $self->{ct} = {type => DOCTYPE_TOKEN, 2366 quirks => 1, 2367 line => $self->{line_prev}, 2368 column => $self->{column_prev} - 7, 2369 }; 2370 2371 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2372 $self->{line_prev} = $self->{line}; 2373 $self->{column_prev} = $self->{column}; 2374 $self->{column}++; 2375 $self->{nc} 2376 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2377 } else { 2378 $self->{set_nc}->($self); 2379 } 2380 2381 redo A; 2382 } else { 2383 2384 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment', 2385 line => $self->{line_prev}, 2386 column => $self->{column_prev} - 1 - length $self->{kwd}); 2387 $self->{state} = BOGUS_COMMENT_STATE; 2388 ## Reconsume. 
2389 $self->{ct} = {type => COMMENT_TOKEN, 2390 data => $self->{kwd}, 2391 line => $self->{line_prev}, 2392 column => $self->{column_prev} - 1 - length $self->{kwd}, 2393 }; 2394 redo A; 2395 } 2396 } elsif ($state == MD_CDATA_STATE) { 2397 if ($nc == { 2398 '[' => 0x0043, # C 2399 '[C' => 0x0044, # D 2400 '[CD' => 0x0041, # A 2401 '[CDA' => 0x0054, # T 2402 '[CDAT' => 0x0041, # A 2403 '[CDATA' => NEVER_CHAR, # ([) 2404 }->{$self->{kwd}}) { 2405 2406 ## Stay in the state. 2407 $self->{kwd} .= chr $nc; 2408 2409 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2410 $self->{line_prev} = $self->{line}; 2411 $self->{column_prev} = $self->{column}; 2412 $self->{column}++; 2413 $self->{nc} 2414 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2415 } else { 2416 $self->{set_nc}->($self); 2417 } 2418 2419 redo A; 2420 } elsif ($self->{kwd} eq '[CDATA' and 2421 $nc == 0x005B) { # [ 2422 if ($self->{is_xml} and 2423 not $self->{tainted} and 2424 @{$self->{open_elements} or []} == 0) { 2425 2426 $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element', 2427 line => $self->{line_prev}, 2428 column => $self->{column_prev} - 7); 2429 $self->{tainted} = 1; 2430 } else { 2431 2432 } 2433 2434 $self->{ct} = {type => CHARACTER_TOKEN, 2435 data => '', 2436 line => $self->{line_prev}, 2437 column => $self->{column_prev} - 7}; 2438 $self->{state} = CDATA_SECTION_STATE; 2439 2440 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2441 $self->{line_prev} = $self->{line}; 2442 $self->{column_prev} = $self->{column}; 2443 $self->{column}++; 2444 $self->{nc} 2445 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2446 } else { 2447 $self->{set_nc}->($self); 2448 } 2449 2450 redo A; 2451 } else { 2452 2453 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment', 2454 line => $self->{line_prev}, 2455 column => $self->{column_prev} - 1 - length $self->{kwd}); 2456 
$self->{state} = BOGUS_COMMENT_STATE; 2457 ## Reconsume. 2458 $self->{ct} = {type => COMMENT_TOKEN, 2459 data => $self->{kwd}, 2460 line => $self->{line_prev}, 2461 column => $self->{column_prev} - 1 - length $self->{kwd}, 2462 }; 2463 redo A; 2464 } 2465 } elsif ($state == COMMENT_START_STATE) { 2466 if ($nc == 0x002D) { # - 2467 2468 $self->{state} = COMMENT_START_DASH_STATE; 2469 2470 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2471 $self->{line_prev} = $self->{line}; 2472 $self->{column_prev} = $self->{column}; 2473 $self->{column}++; 2474 $self->{nc} 2475 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2476 } else { 2477 $self->{set_nc}->($self); 2478 } 2479 2480 redo A; 2481 } elsif ($nc == 0x003E) { # > 2482 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment'); 2483 if ($self->{in_subset}) { 2484 2485 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 2486 } else { 2487 2488 $self->{state} = DATA_STATE; 2489 } 2490 2491 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2492 $self->{line_prev} = $self->{line}; 2493 $self->{column_prev} = $self->{column}; 2494 $self->{column}++; 2495 $self->{nc} 2496 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2497 } else { 2498 $self->{set_nc}->($self); 2499 } 2500 2501 2502 return ($self->{ct}); # comment 2503 2504 redo A; 2505 } elsif ($nc == -1) { 2506 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); 2507 if ($self->{in_subset}) { 2508 2509 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 2510 } else { 2511 2512 $self->{state} = DATA_STATE; 2513 } 2514 ## reconsume 2515 2516 return ($self->{ct}); # comment 2517 2518 redo A; 2519 } elsif ($nc == 0x0000) { 2520 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); 2521 $self->{ct}->{data} .= "\x{FFFD}"; # comment 2522 $self->{state} = COMMENT_STATE; 2523 2524 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2525 
$self->{line_prev} = $self->{line}; 2526 $self->{column_prev} = $self->{column}; 2527 $self->{column}++; 2528 $self->{nc} 2529 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2530 } else { 2531 $self->{set_nc}->($self); 2532 } 2533 2534 redo A; 2535 } else { 2536 2537 $self->{ct}->{data} # comment 2538 .= chr ($nc); 2539 $self->{state} = COMMENT_STATE; 2540 2541 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2542 $self->{line_prev} = $self->{line}; 2543 $self->{column_prev} = $self->{column}; 2544 $self->{column}++; 2545 $self->{nc} 2546 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2547 } else { 2548 $self->{set_nc}->($self); 2549 } 2550 2551 redo A; 2552 } 2553 } elsif ($state == COMMENT_START_DASH_STATE) { 2554 if ($nc == 0x002D) { # - 2555 2556 $self->{state} = COMMENT_END_STATE; 2557 2558 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2559 $self->{line_prev} = $self->{line}; 2560 $self->{column_prev} = $self->{column}; 2561 $self->{column}++; 2562 $self->{nc} 2563 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2564 } else { 2565 $self->{set_nc}->($self); 2566 } 2567 2568 redo A; 2569 } elsif ($nc == 0x003E) { # > 2570 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment'); 2571 if ($self->{in_subset}) { 2572 2573 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 2574 } else { 2575 2576 $self->{state} = DATA_STATE; 2577 } 2578 2579 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2580 $self->{line_prev} = $self->{line}; 2581 $self->{column_prev} = $self->{column}; 2582 $self->{column}++; 2583 $self->{nc} 2584 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2585 } else { 2586 $self->{set_nc}->($self); 2587 } 2588 2589 2590 return ($self->{ct}); # comment 2591 2592 redo A; 2593 } elsif ($nc == -1) { 2594 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); 2595 if ($self->{in_subset}) 
{ 2596 2597 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 2598 } else { 2599 2600 $self->{state} = DATA_STATE; 2601 } 2602 ## reconsume 2603 2604 return ($self->{ct}); # comment 2605 2606 redo A; 2607 } elsif ($nc == 0x0000) { 2608 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); 2609 $self->{ct}->{data} .= "-\x{FFFD}"; # comment 2610 $self->{state} = COMMENT_STATE; 2611 2612 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2613 $self->{line_prev} = $self->{line}; 2614 $self->{column_prev} = $self->{column}; 2615 $self->{column}++; 2616 $self->{nc} 2617 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2618 } else { 2619 $self->{set_nc}->($self); 2620 } 2621 2622 redo A; 2623 } else { 2624 2625 $self->{ct}->{data} # comment 2626 .= '-' . chr ($nc); 2627 $self->{state} = COMMENT_STATE; 2628 2629 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2630 $self->{line_prev} = $self->{line}; 2631 $self->{column_prev} = $self->{column}; 2632 $self->{column}++; 2633 $self->{nc} 2634 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2635 } else { 2636 $self->{set_nc}->($self); 2637 } 2638 2639 redo A; 2640 } 2641 } elsif ($state == COMMENT_STATE) { 2642 ## XML5: "Comment state" and "DOCTYPE comment state". 
2643 2644 if ($nc == 0x002D) { # - 2645 2646 $self->{state} = COMMENT_END_DASH_STATE; 2647 2648 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2649 $self->{line_prev} = $self->{line}; 2650 $self->{column_prev} = $self->{column}; 2651 $self->{column}++; 2652 $self->{nc} 2653 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2654 } else { 2655 $self->{set_nc}->($self); 2656 } 2657 2658 redo A; 2659 } elsif ($nc == -1) { 2660 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); 2661 if ($self->{in_subset}) { 2662 2663 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 2664 } else { 2665 2666 $self->{state} = DATA_STATE; 2667 } 2668 ## reconsume 2669 2670 return ($self->{ct}); # comment 2671 2672 redo A; 2673 } elsif ($nc == 0x0000) { 2674 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); 2675 $self->{ct}->{data} .= "\x{FFFD}"; # comment 2676 2677 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2678 $self->{line_prev} = $self->{line}; 2679 $self->{column_prev} = $self->{column}; 2680 $self->{column}++; 2681 $self->{nc} 2682 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2683 } else { 2684 $self->{set_nc}->($self); 2685 } 2686 2687 redo A; 2688 } else { 2689 2690 $self->{ct}->{data} .= chr ($nc); # comment 2691 $self->{read_until}->($self->{ct}->{data}, 2692 qq[-\x00], 2693 length $self->{ct}->{data}); 2694 2695 ## Stay in the state 2696 2697 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2698 $self->{line_prev} = $self->{line}; 2699 $self->{column_prev} = $self->{column}; 2700 $self->{column}++; 2701 $self->{nc} 2702 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2703 } else { 2704 $self->{set_nc}->($self); 2705 } 2706 2707 redo A; 2708 } 2709 } elsif ($state == COMMENT_END_DASH_STATE) { 2710 ## XML5: "Comment dash state" and "DOCTYPE comment dash state". 
2711 2712 if ($nc == 0x002D) { # - 2713 2714 $self->{state} = COMMENT_END_STATE; 2715 2716 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2717 $self->{line_prev} = $self->{line}; 2718 $self->{column_prev} = $self->{column}; 2719 $self->{column}++; 2720 $self->{nc} 2721 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2722 } else { 2723 $self->{set_nc}->($self); 2724 } 2725 2726 redo A; 2727 } elsif ($nc == -1) { 2728 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); 2729 if ($self->{in_subset}) { 2730 2731 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 2732 } else { 2733 2734 $self->{state} = DATA_STATE; 2735 } 2736 ## reconsume 2737 2738 return ($self->{ct}); # comment 2739 2740 redo A; 2741 } elsif ($nc == 0x0000) { 2742 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); 2743 $self->{ct}->{data} .= "-\x{FFFD}"; # comment 2744 $self->{state} = COMMENT_STATE; 2745 2746 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2747 $self->{line_prev} = $self->{line}; 2748 $self->{column_prev} = $self->{column}; 2749 $self->{column}++; 2750 $self->{nc} 2751 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2752 } else { 2753 $self->{set_nc}->($self); 2754 } 2755 2756 redo A; 2757 } else { 2758 2759 $self->{ct}->{data} .= '-' . chr ($nc); # comment 2760 $self->{state} = COMMENT_STATE; 2761 2762 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2763 $self->{line_prev} = $self->{line}; 2764 $self->{column_prev} = $self->{column}; 2765 $self->{column}++; 2766 $self->{nc} 2767 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2768 } else { 2769 $self->{set_nc}->($self); 2770 } 2771 2772 redo A; 2773 } 2774 } elsif ($state == COMMENT_END_STATE or 2775 $state == COMMENT_END_BANG_STATE) { 2776 ## XML5: "Comment end state" and "DOCTYPE comment end state". 2777 ## (No comment end bang state.) 
2778 2779 if ($nc == 0x003E) { # > 2780 if ($self->{in_subset}) { 2781 2782 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 2783 } else { 2784 2785 $self->{state} = DATA_STATE; 2786 } 2787 2788 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2789 $self->{line_prev} = $self->{line}; 2790 $self->{column_prev} = $self->{column}; 2791 $self->{column}++; 2792 $self->{nc} 2793 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2794 } else { 2795 $self->{set_nc}->($self); 2796 } 2797 2798 2799 return ($self->{ct}); # comment 2800 2801 redo A; 2802 } elsif ($nc == 0x002D) { # - 2803 if ($state == COMMENT_END_BANG_STATE) { 2804 2805 $self->{ct}->{data} .= '--!'; # comment 2806 $self->{state} = COMMENT_END_DASH_STATE; 2807 } else { 2808 2809 ## XML5: Not a parse error. 2810 $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment', 2811 line => $self->{line_prev}, 2812 column => $self->{column_prev}); 2813 $self->{ct}->{data} .= '-'; # comment 2814 ## Stay in the state 2815 } 2816 2817 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2818 $self->{line_prev} = $self->{line}; 2819 $self->{column_prev} = $self->{column}; 2820 $self->{column}++; 2821 $self->{nc} 2822 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2823 } else { 2824 $self->{set_nc}->($self); 2825 } 2826 2827 redo A; 2828 } elsif ($state != COMMENT_END_BANG_STATE and 2829 $nc == 0x0021) { # ! 
2830 2831 $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end bang'); # XXX error type 2832 $self->{state} = COMMENT_END_BANG_STATE; 2833 2834 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2835 $self->{line_prev} = $self->{line}; 2836 $self->{column_prev} = $self->{column}; 2837 $self->{column}++; 2838 $self->{nc} 2839 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2840 } else { 2841 $self->{set_nc}->($self); 2842 } 2843 2844 redo A; 2845 } elsif ($nc == -1) { 2846 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); 2847 if ($self->{in_subset}) { 2848 2849 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 2850 } else { 2851 2852 $self->{state} = DATA_STATE; 2853 } 2854 ## Reconsume. 2855 2856 return ($self->{ct}); # comment 2857 2858 redo A; 2859 } elsif ($nc == 0x0000) { 2860 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); 2861 if ($state == COMMENT_END_BANG_STATE) { 2862 $self->{ct}->{data} .= "--!\x{FFFD}"; # comment 2863 } else { 2864 $self->{ct}->{data} .= "--\x{FFFD}"; # comment 2865 } 2866 $self->{state} = COMMENT_STATE; 2867 2868 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2869 $self->{line_prev} = $self->{line}; 2870 $self->{column_prev} = $self->{column}; 2871 $self->{column}++; 2872 $self->{nc} 2873 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2874 } else { 2875 $self->{set_nc}->($self); 2876 } 2877 2878 redo A; 2879 } else { 2880 2881 if ($state == COMMENT_END_BANG_STATE) { 2882 $self->{ct}->{data} .= '--!' . chr ($nc); # comment 2883 } else { 2884 $self->{ct}->{data} .= '--' . 
chr ($nc); # comment 2885 } 2886 $self->{state} = COMMENT_STATE; 2887 2888 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2889 $self->{line_prev} = $self->{line}; 2890 $self->{column_prev} = $self->{column}; 2891 $self->{column}++; 2892 $self->{nc} 2893 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2894 } else { 2895 $self->{set_nc}->($self); 2896 } 2897 2898 redo A; 2899 } 2900 } elsif ($state == DOCTYPE_STATE) { 2901 if ($is_space->{$nc}) { 2902 2903 $self->{state} = BEFORE_DOCTYPE_NAME_STATE; 2904 2905 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2906 $self->{line_prev} = $self->{line}; 2907 $self->{column_prev} = $self->{column}; 2908 $self->{column}++; 2909 $self->{nc} 2910 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2911 } else { 2912 $self->{set_nc}->($self); 2913 } 2914 2915 redo A; 2916 } elsif ($nc == -1) { 2917 2918 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); 2919 $self->{ct}->{quirks} = 1; 2920 2921 $self->{state} = DATA_STATE; 2922 ## Reconsume. 2923 return ($self->{ct}); # DOCTYPE (quirks) 2924 2925 redo A; 2926 } else { 2927 2928 ## XML5: Swith to the bogus comment state. 2929 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name'); 2930 $self->{state} = BEFORE_DOCTYPE_NAME_STATE; 2931 ## reconsume 2932 redo A; 2933 } 2934 } elsif ($state == BEFORE_DOCTYPE_NAME_STATE) { 2935 ## XML5: "DOCTYPE root name before state". 2936 2937 if ($is_space->{$nc}) { 2938 2939 ## Stay in the state 2940 2941 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2942 $self->{line_prev} = $self->{line}; 2943 $self->{column_prev} = $self->{column}; 2944 $self->{column}++; 2945 $self->{nc} 2946 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2947 } else { 2948 $self->{set_nc}->($self); 2949 } 2950 2951 redo A; 2952 } elsif ($nc == 0x003E) { # > 2953 2954 ## XML5: No parse error. 
2955 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name'); 2956 $self->{state} = DATA_STATE; 2957 2958 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2959 $self->{line_prev} = $self->{line}; 2960 $self->{column_prev} = $self->{column}; 2961 $self->{column}++; 2962 $self->{nc} 2963 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2964 } else { 2965 $self->{set_nc}->($self); 2966 } 2967 2968 2969 return ($self->{ct}); # DOCTYPE (quirks) 2970 2971 redo A; 2972 } elsif (0x0041 <= $nc and $nc <= 0x005A) { # A..Z 2973 2974 $self->{ct}->{name} # DOCTYPE 2975 = chr ($nc + ($self->{is_xml} ? 0 : 0x0020)); 2976 delete $self->{ct}->{quirks}; 2977 $self->{state} = DOCTYPE_NAME_STATE; 2978 2979 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 2980 $self->{line_prev} = $self->{line}; 2981 $self->{column_prev} = $self->{column}; 2982 $self->{column}++; 2983 $self->{nc} 2984 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 2985 } else { 2986 $self->{set_nc}->($self); 2987 } 2988 2989 redo A; 2990 } elsif ($nc == -1) { 2991 2992 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name'); 2993 $self->{state} = DATA_STATE; 2994 ## reconsume 2995 2996 return ($self->{ct}); # DOCTYPE (quirks) 2997 2998 redo A; 2999 } elsif ($self->{is_xml} and $nc == 0x005B) { # [ 3000 3001 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name'); 3002 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 3003 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE 3004 $self->{in_subset} = 1; 3005 3006 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3007 $self->{line_prev} = $self->{line}; 3008 $self->{column_prev} = $self->{column}; 3009 $self->{column}++; 3010 $self->{nc} 3011 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3012 } else { 3013 $self->{set_nc}->($self); 3014 } 3015 3016 return ($self->{ct}); # DOCTYPE 3017 redo A; 3018 
} elsif ($nc == 0x0000) { 3019 $self->{ct}->{name} = "\x{FFFD}"; 3020 delete $self->{ct}->{quirks}; 3021 $self->{state} = DOCTYPE_NAME_STATE; 3022 3023 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3024 $self->{line_prev} = $self->{line}; 3025 $self->{column_prev} = $self->{column}; 3026 $self->{column}++; 3027 $self->{nc} 3028 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3029 } else { 3030 $self->{set_nc}->($self); 3031 } 3032 3033 redo A; 3034 } else { 3035 3036 $self->{ct}->{name} = chr $nc; 3037 delete $self->{ct}->{quirks}; 3038 $self->{state} = DOCTYPE_NAME_STATE; 3039 3040 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3041 $self->{line_prev} = $self->{line}; 3042 $self->{column_prev} = $self->{column}; 3043 $self->{column}++; 3044 $self->{nc} 3045 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3046 } else { 3047 $self->{set_nc}->($self); 3048 } 3049 3050 redo A; 3051 } 3052 } elsif ($state == DOCTYPE_NAME_STATE) { 3053 ## XML5: "DOCTYPE root name state". 
3054 3055 if ($is_space->{$nc}) { 3056 3057 $self->{state} = AFTER_DOCTYPE_NAME_STATE; 3058 3059 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3060 $self->{line_prev} = $self->{line}; 3061 $self->{column_prev} = $self->{column}; 3062 $self->{column}++; 3063 $self->{nc} 3064 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3065 } else { 3066 $self->{set_nc}->($self); 3067 } 3068 3069 redo A; 3070 } elsif ($nc == 0x003E) { # > 3071 3072 $self->{state} = DATA_STATE; 3073 3074 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3075 $self->{line_prev} = $self->{line}; 3076 $self->{column_prev} = $self->{column}; 3077 $self->{column}++; 3078 $self->{nc} 3079 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3080 } else { 3081 $self->{set_nc}->($self); 3082 } 3083 3084 3085 return ($self->{ct}); # DOCTYPE 3086 3087 redo A; 3088 } elsif (0x0041 <= $nc and $nc <= 0x005A) { # A..Z 3089 3090 $self->{ct}->{name} # DOCTYPE 3091 .= chr ($nc + ($self->{is_xml} ? 0 : 0x0020)); 3092 delete $self->{ct}->{quirks}; 3093 ## Stay in the state. 
3094 3095 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3096 $self->{line_prev} = $self->{line}; 3097 $self->{column_prev} = $self->{column}; 3098 $self->{column}++; 3099 $self->{nc} 3100 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3101 } else { 3102 $self->{set_nc}->($self); 3103 } 3104 3105 redo A; 3106 } elsif ($nc == -1) { 3107 3108 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); 3109 $self->{state} = DATA_STATE; 3110 ## reconsume 3111 3112 $self->{ct}->{quirks} = 1; 3113 return ($self->{ct}); # DOCTYPE 3114 3115 redo A; 3116 } elsif ($self->{is_xml} and $nc == 0x005B) { # [ 3117 3118 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 3119 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE 3120 $self->{in_subset} = 1; 3121 3122 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3123 $self->{line_prev} = $self->{line}; 3124 $self->{column_prev} = $self->{column}; 3125 $self->{column}++; 3126 $self->{nc} 3127 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3128 } else { 3129 $self->{set_nc}->($self); 3130 } 3131 3132 return ($self->{ct}); # DOCTYPE 3133 redo A; 3134 } elsif ($nc == 0x0000) { 3135 $self->{ct}->{name} .= "\x{FFFD}"; # DOCTYPE 3136 ## Stay in the state. 3137 3138 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3139 $self->{line_prev} = $self->{line}; 3140 $self->{column_prev} = $self->{column}; 3141 $self->{column}++; 3142 $self->{nc} 3143 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3144 } else { 3145 $self->{set_nc}->($self); 3146 } 3147 3148 redo A; 3149 } else { 3150 3151 $self->{ct}->{name} .= chr ($nc); # DOCTYPE 3152 ## Stay in the state. 
3153 3154 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3155 $self->{line_prev} = $self->{line}; 3156 $self->{column_prev} = $self->{column}; 3157 $self->{column}++; 3158 $self->{nc} 3159 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3160 } else { 3161 $self->{set_nc}->($self); 3162 } 3163 3164 redo A; 3165 } 3166 } elsif ($state == AFTER_DOCTYPE_NAME_STATE) { 3167 ## XML5: Corresponding to XML5's "DOCTYPE root name after 3168 ## state", but implemented differently. 3169 3170 if ($is_space->{$nc}) { 3171 3172 ## Stay in the state 3173 3174 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3175 $self->{line_prev} = $self->{line}; 3176 $self->{column_prev} = $self->{column}; 3177 $self->{column}++; 3178 $self->{nc} 3179 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3180 } else { 3181 $self->{set_nc}->($self); 3182 } 3183 3184 redo A; 3185 } elsif ($nc == 0x003E) { # > 3186 if ($self->{ct}->{type} == DOCTYPE_TOKEN) { 3187 3188 $self->{state} = DATA_STATE; 3189 } else { 3190 3191 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type 3192 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 3193 } 3194 3195 3196 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3197 $self->{line_prev} = $self->{line}; 3198 $self->{column_prev} = $self->{column}; 3199 $self->{column}++; 3200 $self->{nc} 3201 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3202 } else { 3203 $self->{set_nc}->($self); 3204 } 3205 3206 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION 3207 redo A; 3208 } elsif ($nc == -1) { 3209 if ($self->{ct}->{type} == DOCTYPE_TOKEN) { 3210 3211 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); 3212 $self->{state} = DATA_STATE; 3213 $self->{ct}->{quirks} = 1; 3214 } else { 3215 3216 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 3217 $self->{state} = 
DOCTYPE_INTERNAL_SUBSET_STATE; 3218 } 3219 3220 ## Reconsume. 3221 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION 3222 redo A; 3223 } elsif ($nc == 0x0050 or # P 3224 $nc == 0x0070) { # p 3225 3226 $self->{state} = PUBLIC_STATE; 3227 $self->{kwd} = chr $nc; 3228 3229 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3230 $self->{line_prev} = $self->{line}; 3231 $self->{column_prev} = $self->{column}; 3232 $self->{column}++; 3233 $self->{nc} 3234 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3235 } else { 3236 $self->{set_nc}->($self); 3237 } 3238 3239 redo A; 3240 } elsif ($nc == 0x0053 or # S 3241 $nc == 0x0073) { # s 3242 3243 $self->{state} = SYSTEM_STATE; 3244 $self->{kwd} = chr $nc; 3245 3246 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3247 $self->{line_prev} = $self->{line}; 3248 $self->{column_prev} = $self->{column}; 3249 $self->{column}++; 3250 $self->{nc} 3251 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3252 } else { 3253 $self->{set_nc}->($self); 3254 } 3255 3256 redo A; 3257 } elsif ($nc == 0x0022 and # " 3258 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or 3259 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) { 3260 3261 $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE; 3262 $self->{ct}->{value} = ''; # ENTITY 3263 3264 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3265 $self->{line_prev} = $self->{line}; 3266 $self->{column_prev} = $self->{column}; 3267 $self->{column}++; 3268 $self->{nc} 3269 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3270 } else { 3271 $self->{set_nc}->($self); 3272 } 3273 3274 redo A; 3275 } elsif ($nc == 0x0027 and # ' 3276 ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or 3277 $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) { 3278 3279 $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE; 3280 $self->{ct}->{value} = ''; # ENTITY 3281 3282 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 
3283 $self->{line_prev} = $self->{line}; 3284 $self->{column_prev} = $self->{column}; 3285 $self->{column}++; 3286 $self->{nc} 3287 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3288 } else { 3289 $self->{set_nc}->($self); 3290 } 3291 3292 redo A; 3293 } elsif ($self->{is_xml} and 3294 $self->{ct}->{type} == DOCTYPE_TOKEN and 3295 $nc == 0x005B) { # [ 3296 3297 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 3298 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE 3299 $self->{in_subset} = 1; 3300 3301 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3302 $self->{line_prev} = $self->{line}; 3303 $self->{column_prev} = $self->{column}; 3304 $self->{column}++; 3305 $self->{nc} 3306 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3307 } else { 3308 $self->{set_nc}->($self); 3309 } 3310 3311 return ($self->{ct}); # DOCTYPE 3312 redo A; 3313 } else { 3314 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type 3315 3316 if ($self->{ct}->{type} == DOCTYPE_TOKEN) { 3317 3318 $self->{ct}->{quirks} = 1; 3319 $self->{state} = BOGUS_DOCTYPE_STATE; 3320 } else { 3321 3322 $self->{state} = BOGUS_MD_STATE; 3323 } 3324 3325 3326 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3327 $self->{line_prev} = $self->{line}; 3328 $self->{column_prev} = $self->{column}; 3329 $self->{column}++; 3330 $self->{nc} 3331 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3332 } else { 3333 $self->{set_nc}->($self); 3334 } 3335 3336 redo A; 3337 } 3338 } elsif ($state == PUBLIC_STATE) { 3339 ## ASCII case-insensitive 3340 if ($nc == [ 3341 undef, 3342 0x0055, # U 3343 0x0042, # B 3344 0x004C, # L 3345 0x0049, # I 3346 NEVER_CHAR, # (C) 3347 ]->[length $self->{kwd}] or 3348 $nc == [ 3349 undef, 3350 0x0075, # u 3351 0x0062, # b 3352 0x006C, # l 3353 0x0069, # i 3354 NEVER_CHAR, # (c) 3355 ]->[length $self->{kwd}]) { 3356 3357 ## Stay in the state. 
3358 $self->{kwd} .= chr $nc; 3359 3360 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3361 $self->{line_prev} = $self->{line}; 3362 $self->{column_prev} = $self->{column}; 3363 $self->{column}++; 3364 $self->{nc} 3365 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3366 } else { 3367 $self->{set_nc}->($self); 3368 } 3369 3370 redo A; 3371 } elsif ((length $self->{kwd}) == 5 and 3372 ($nc == 0x0043 or # C 3373 $nc == 0x0063)) { # c 3374 if ($self->{is_xml} and 3375 ($self->{kwd} ne 'PUBLI' or $nc == 0x0063)) { # c 3376 3377 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type 3378 text => 'PUBLIC', 3379 line => $self->{line_prev}, 3380 column => $self->{column_prev} - 4); 3381 } else { 3382 3383 } 3384 $self->{state} = AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE; 3385 3386 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3387 $self->{line_prev} = $self->{line}; 3388 $self->{column_prev} = $self->{column}; 3389 $self->{column}++; 3390 $self->{nc} 3391 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3392 } else { 3393 $self->{set_nc}->($self); 3394 } 3395 3396 redo A; 3397 } else { 3398 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type 3399 line => $self->{line_prev}, 3400 column => $self->{column_prev} + 1 - length $self->{kwd}); 3401 if ($self->{ct}->{type} == DOCTYPE_TOKEN) { 3402 3403 $self->{ct}->{quirks} = 1; 3404 $self->{state} = BOGUS_DOCTYPE_STATE; 3405 } else { 3406 3407 $self->{state} = BOGUS_MD_STATE; 3408 } 3409 ## Reconsume. 
3410 redo A; 3411 } 3412 } elsif ($state == SYSTEM_STATE) { 3413 ## ASCII case-insensitive 3414 if ($nc == [ 3415 undef, 3416 0x0059, # Y 3417 0x0053, # S 3418 0x0054, # T 3419 0x0045, # E 3420 NEVER_CHAR, # (M) 3421 ]->[length $self->{kwd}] or 3422 $nc == [ 3423 undef, 3424 0x0079, # y 3425 0x0073, # s 3426 0x0074, # t 3427 0x0065, # e 3428 NEVER_CHAR, # (m) 3429 ]->[length $self->{kwd}]) { 3430 3431 ## Stay in the state. 3432 $self->{kwd} .= chr $nc; 3433 3434 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3435 $self->{line_prev} = $self->{line}; 3436 $self->{column_prev} = $self->{column}; 3437 $self->{column}++; 3438 $self->{nc} 3439 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3440 } else { 3441 $self->{set_nc}->($self); 3442 } 3443 3444 redo A; 3445 } elsif ((length $self->{kwd}) == 5 and 3446 ($nc == 0x004D or # M 3447 $nc == 0x006D)) { # m 3448 if ($self->{is_xml} and 3449 ($self->{kwd} ne 'SYSTE' or $nc == 0x006D)) { # m 3450 3451 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type 3452 text => 'SYSTEM', 3453 line => $self->{line_prev}, 3454 column => $self->{column_prev} - 4); 3455 } else { 3456 3457 } 3458 $self->{state} = AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE; 3459 3460 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3461 $self->{line_prev} = $self->{line}; 3462 $self->{column_prev} = $self->{column}; 3463 $self->{column}++; 3464 $self->{nc} 3465 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3466 } else { 3467 $self->{set_nc}->($self); 3468 } 3469 3470 redo A; 3471 } else { 3472 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type 3473 line => $self->{line_prev}, 3474 column => $self->{column_prev} + 1 - length $self->{kwd}); 3475 if ($self->{ct}->{type} == DOCTYPE_TOKEN) { 3476 3477 $self->{ct}->{quirks} = 1; 3478 $self->{state} = BOGUS_DOCTYPE_STATE; 3479 } else { 3480 3481 
$self->{state} = BOGUS_MD_STATE; 3482 } 3483 ## Reconsume. 3484 redo A; 3485 } 3486 } elsif ($state == AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE or 3487 $state == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) { 3488 if ($is_space->{$nc}) { 3489 3490 ## Stay in or switch to the state. 3491 $self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE; 3492 3493 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3494 $self->{line_prev} = $self->{line}; 3495 $self->{column_prev} = $self->{column}; 3496 $self->{column}++; 3497 $self->{nc} 3498 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3499 } else { 3500 $self->{set_nc}->($self); 3501 } 3502 3503 redo A; 3504 } elsif ($nc == 0x0022) { # " 3505 if ($state == AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE) { 3506 3507 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before pubid literal'); # XXX documentation 3508 } else { 3509 3510 } 3511 $self->{ct}->{pubid} = ''; # DOCTYPE 3512 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE; 3513 3514 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3515 $self->{line_prev} = $self->{line}; 3516 $self->{column_prev} = $self->{column}; 3517 $self->{column}++; 3518 $self->{nc} 3519 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3520 } else { 3521 $self->{set_nc}->($self); 3522 } 3523 3524 redo A; 3525 } elsif ($nc == 0x0027) { # ' 3526 if ($state == AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE) { 3527 3528 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before pubid literal'); # XXX documentation 3529 } else { 3530 3531 } 3532 $self->{ct}->{pubid} = ''; # DOCTYPE 3533 $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE; 3534 3535 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3536 $self->{line_prev} = $self->{line}; 3537 $self->{column_prev} = $self->{column}; 3538 $self->{column}++; 3539 $self->{nc} 3540 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 
1); 3541 } else { 3542 $self->{set_nc}->($self); 3543 } 3544 3545 redo A; 3546 } elsif ($nc == 0x003E) { # > 3547 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal'); 3548 3549 if ($self->{ct}->{type} == DOCTYPE_TOKEN) { 3550 3551 $self->{state} = DATA_STATE; 3552 $self->{ct}->{quirks} = 1; 3553 } else { 3554 3555 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 3556 } 3557 3558 3559 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3560 $self->{line_prev} = $self->{line}; 3561 $self->{column_prev} = $self->{column}; 3562 $self->{column}++; 3563 $self->{nc} 3564 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3565 } else { 3566 $self->{set_nc}->($self); 3567 } 3568 3569 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION 3570 redo A; 3571 } elsif ($nc == EOF_CHAR) { 3572 if ($self->{ct}->{type} == DOCTYPE_TOKEN) { 3573 3574 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); 3575 $self->{state} = DATA_STATE; 3576 $self->{ct}->{quirks} = 1; 3577 } else { 3578 3579 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 3580 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 3581 } 3582 3583 ## Reconsume. 
3584 return ($self->{ct}); # DOCTYPE 3585 redo A; 3586 } elsif ($self->{is_xml} and 3587 $self->{ct}->{type} == DOCTYPE_TOKEN and 3588 $nc == 0x005B) { # [ 3589 3590 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal'); 3591 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 3592 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE 3593 $self->{in_subset} = 1; 3594 3595 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3596 $self->{line_prev} = $self->{line}; 3597 $self->{column_prev} = $self->{column}; 3598 $self->{column}++; 3599 $self->{nc} 3600 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3601 } else { 3602 $self->{set_nc}->($self); 3603 } 3604 3605 return ($self->{ct}); # DOCTYPE 3606 redo A; 3607 } else { 3608 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC'); 3609 3610 if ($self->{ct}->{type} == DOCTYPE_TOKEN) { 3611 3612 $self->{ct}->{quirks} = 1; 3613 $self->{state} = BOGUS_DOCTYPE_STATE; 3614 } else { 3615 3616 $self->{state} = BOGUS_MD_STATE; 3617 } 3618 3619 3620 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3621 $self->{line_prev} = $self->{line}; 3622 $self->{column_prev} = $self->{column}; 3623 $self->{column}++; 3624 $self->{nc} 3625 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3626 } else { 3627 $self->{set_nc}->($self); 3628 } 3629 3630 redo A; 3631 } 3632 } elsif ($state == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) { 3633 if ($nc == 0x0022) { # " 3634 3635 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE; 3636 3637 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3638 $self->{line_prev} = $self->{line}; 3639 $self->{column_prev} = $self->{column}; 3640 $self->{column}++; 3641 $self->{nc} 3642 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3643 } else { 3644 $self->{set_nc}->($self); 3645 } 3646 3647 redo A; 3648 } elsif ($nc == 0x003E) { # > 3649 
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal'); 3650 3651 if ($self->{ct}->{type} == DOCTYPE_TOKEN) { 3652 3653 $self->{state} = DATA_STATE; 3654 $self->{ct}->{quirks} = 1; 3655 } else { 3656 3657 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 3658 } 3659 3660 3661 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3662 $self->{line_prev} = $self->{line}; 3663 $self->{column_prev} = $self->{column}; 3664 $self->{column}++; 3665 $self->{nc} 3666 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3667 } else { 3668 $self->{set_nc}->($self); 3669 } 3670 3671 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION 3672 redo A; 3673 } elsif ($nc == -1) { 3674 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal'); 3675 3676 if ($self->{ct}->{type} == DOCTYPE_TOKEN) { 3677 3678 $self->{state} = DATA_STATE; 3679 $self->{ct}->{quirks} = 1; 3680 } else { 3681 3682 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 3683 } 3684 3685 ## Reconsume. 3686 return ($self->{ct}); # DOCTYPE 3687 redo A; 3688 } elsif ($nc == 0x0000) { 3689 $self->{ct}->{pubid} .= "\x{FFFD}"; # DOCTYPE/ENTITY/NOTATION 3690 ## Stay in the state. 3691 3692 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3693 $self->{line_prev} = $self->{line}; 3694 $self->{column_prev} = $self->{column}; 3695 $self->{column}++; 3696 $self->{nc} 3697 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3698 } else { 3699 $self->{set_nc}->($self); 3700 } 3701 3702 redo A; 3703 } else { 3704 3705 $self->{ct}->{pubid} .= chr $nc; # DOCTYPE/ENTITY/NOTATION 3706 $self->{read_until}->($self->{ct}->{pubid}, qq[\x00">], 3707 length $self->{ct}->{pubid}); 3708 3709 ## Stay in the state. 
3710 3711 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3712 $self->{line_prev} = $self->{line}; 3713 $self->{column_prev} = $self->{column}; 3714 $self->{column}++; 3715 $self->{nc} 3716 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3717 } else { 3718 $self->{set_nc}->($self); 3719 } 3720 3721 redo A; 3722 } 3723 } elsif ($state == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) { 3724 if ($nc == 0x0027) { # ' 3725 3726 $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE; 3727 3728 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3729 $self->{line_prev} = $self->{line}; 3730 $self->{column_prev} = $self->{column}; 3731 $self->{column}++; 3732 $self->{nc} 3733 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3734 } else { 3735 $self->{set_nc}->($self); 3736 } 3737 3738 redo A; 3739 } elsif ($nc == 0x003E) { # > 3740 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal'); 3741 3742 if ($self->{ct}->{type} == DOCTYPE_TOKEN) { 3743 3744 $self->{state} = DATA_STATE; 3745 $self->{ct}->{quirks} = 1; 3746 } else { 3747 3748 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 3749 } 3750 3751 3752 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3753 $self->{line_prev} = $self->{line}; 3754 $self->{column_prev} = $self->{column}; 3755 $self->{column}++; 3756 $self->{nc} 3757 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3758 } else { 3759 $self->{set_nc}->($self); 3760 } 3761 3762 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION 3763 redo A; 3764 } elsif ($nc == -1) { 3765 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal'); 3766 3767 if ($self->{ct}->{type} == DOCTYPE_TOKEN) { 3768 3769 $self->{state} = DATA_STATE; 3770 $self->{ct}->{quirks} = 1; 3771 } else { 3772 3773 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 3774 } 3775 3776 ## reconsume 3777 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION 
3778 redo A; 3779 } elsif ($nc == 0x0000) { 3780 $self->{ct}->{pubid} .= "\x{FFFD}"; # DOCTYPE/ENTITY/NOTATION 3781 ## Stay in the state. 3782 3783 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3784 $self->{line_prev} = $self->{line}; 3785 $self->{column_prev} = $self->{column}; 3786 $self->{column}++; 3787 $self->{nc} 3788 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3789 } else { 3790 $self->{set_nc}->($self); 3791 } 3792 3793 redo A; 3794 } else { 3795 3796 $self->{ct}->{pubid} .= chr $nc; # DOCTYPE/ENTITY/NOTATION 3797 $self->{read_until}->($self->{ct}->{pubid}, qq[\x00'>], 3798 length $self->{ct}->{pubid}); 3799 3800 ## Stay in the state 3801 3802 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3803 $self->{line_prev} = $self->{line}; 3804 $self->{column_prev} = $self->{column}; 3805 $self->{column}++; 3806 $self->{nc} 3807 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3808 } else { 3809 $self->{set_nc}->($self); 3810 } 3811 3812 redo A; 3813 } 3814 } elsif ($state == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE or 3815 $state == BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDS_STATE) { 3816 if ($is_space->{$nc}) { 3817 3818 ## Stay in or switch to the state. 
3819 $self->{state} = BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDS_STATE; 3820 3821 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3822 $self->{line_prev} = $self->{line}; 3823 $self->{column_prev} = $self->{column}; 3824 $self->{column}++; 3825 $self->{nc} 3826 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3827 } else { 3828 $self->{set_nc}->($self); 3829 } 3830 3831 redo A; 3832 } elsif ($nc == 0x0022) { # " 3833 if ($state == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) { 3834 3835 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before system literal'); # XXX documentation 3836 } else { 3837 3838 } 3839 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION 3840 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE; 3841 3842 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3843 $self->{line_prev} = $self->{line}; 3844 $self->{column_prev} = $self->{column}; 3845 $self->{column}++; 3846 $self->{nc} 3847 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3848 } else { 3849 $self->{set_nc}->($self); 3850 } 3851 3852 redo A; 3853 } elsif ($nc == 0x0027) { # ' 3854 if ($state == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) { 3855 3856 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before system literal'); # XXX documentation 3857 } else { 3858 3859 } 3860 $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION 3861 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE; 3862 3863 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3864 $self->{line_prev} = $self->{line}; 3865 $self->{column_prev} = $self->{column}; 3866 $self->{column}++; 3867 $self->{nc} 3868 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3869 } else { 3870 $self->{set_nc}->($self); 3871 } 3872 3873 redo A; 3874 } elsif ($nc == 0x003E) { # > 3875 if ($self->{ct}->{type} == DOCTYPE_TOKEN) { 3876 if ($self->{is_xml}) { 3877 3878 $self->{parse_error}->(level 
=> $self->{level}->{must}, type => 'no SYSTEM literal'); 3879 } else { 3880 3881 } 3882 $self->{state} = DATA_STATE; 3883 } else { 3884 if ($self->{ct}->{type} == NOTATION_TOKEN) { 3885 3886 } else { 3887 3888 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal'); 3889 } 3890 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 3891 } 3892 3893 3894 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3895 $self->{line_prev} = $self->{line}; 3896 $self->{column_prev} = $self->{column}; 3897 $self->{column}++; 3898 $self->{nc} 3899 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3900 } else { 3901 $self->{set_nc}->($self); 3902 } 3903 3904 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION 3905 redo A; 3906 } elsif ($nc == EOF_CHAR) { 3907 if ($self->{ct}->{type} == DOCTYPE_TOKEN) { 3908 3909 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); 3910 3911 $self->{state} = DATA_STATE; 3912 $self->{ct}->{quirks} = 1; 3913 } else { 3914 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 3915 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 3916 } 3917 3918 ## Reconsume. 
3919 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION 3920 redo A; 3921 } elsif ($self->{is_xml} and 3922 $self->{ct}->{type} == DOCTYPE_TOKEN and 3923 $nc == 0x005B) { # [ 3924 3925 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal'); 3926 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 3927 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE 3928 $self->{in_subset} = 1; 3929 3930 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3931 $self->{line_prev} = $self->{line}; 3932 $self->{column_prev} = $self->{column}; 3933 $self->{column}++; 3934 $self->{nc} 3935 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3936 } else { 3937 $self->{set_nc}->($self); 3938 } 3939 3940 return ($self->{ct}); # DOCTYPE 3941 redo A; 3942 } else { 3943 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal'); 3944 3945 if ($self->{ct}->{type} == DOCTYPE_TOKEN) { 3946 3947 $self->{ct}->{quirks} = 1; 3948 $self->{state} = BOGUS_DOCTYPE_STATE; 3949 } else { 3950 3951 $self->{state} = BOGUS_MD_STATE; 3952 } 3953 3954 3955 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3956 $self->{line_prev} = $self->{line}; 3957 $self->{column_prev} = $self->{column}; 3958 $self->{column}++; 3959 $self->{nc} 3960 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3961 } else { 3962 $self->{set_nc}->($self); 3963 } 3964 3965 redo A; 3966 } 3967 } elsif ($state == AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE or 3968 $state == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) { 3969 if ($is_space->{$nc}) { 3970 3971 ## Stay in or switch to the state. 
3972 $self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE; 3973 3974 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3975 $self->{line_prev} = $self->{line}; 3976 $self->{column_prev} = $self->{column}; 3977 $self->{column}++; 3978 $self->{nc} 3979 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 3980 } else { 3981 $self->{set_nc}->($self); 3982 } 3983 3984 redo A; 3985 } elsif ($nc == 0x0022) { # " 3986 if ($state == AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE) { 3987 3988 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before system literal'); # XXX documentation 3989 } else { 3990 3991 } 3992 $self->{ct}->{sysid} = ''; # DOCTYPE 3993 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE; 3994 3995 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 3996 $self->{line_prev} = $self->{line}; 3997 $self->{column_prev} = $self->{column}; 3998 $self->{column}++; 3999 $self->{nc} 4000 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4001 } else { 4002 $self->{set_nc}->($self); 4003 } 4004 4005 redo A; 4006 } elsif ($nc == 0x0027) { # ' 4007 if ($state == AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE) { 4008 4009 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before system literal'); # XXX documentation 4010 } else { 4011 4012 } 4013 $self->{ct}->{sysid} = ''; # DOCTYPE 4014 $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE; 4015 4016 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4017 $self->{line_prev} = $self->{line}; 4018 $self->{column_prev} = $self->{column}; 4019 $self->{column}++; 4020 $self->{nc} 4021 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4022 } else { 4023 $self->{set_nc}->($self); 4024 } 4025 4026 redo A; 4027 } elsif ($nc == 0x003E) { # > 4028 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal'); 4029 4030 if ($self->{char_buffer_pos} < length $self->{char_buffer}) 
{ 4031 $self->{line_prev} = $self->{line}; 4032 $self->{column_prev} = $self->{column}; 4033 $self->{column}++; 4034 $self->{nc} 4035 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4036 } else { 4037 $self->{set_nc}->($self); 4038 } 4039 4040 4041 if ($self->{ct}->{type} == DOCTYPE_TOKEN) { 4042 4043 $self->{state} = DATA_STATE; 4044 $self->{ct}->{quirks} = 1; 4045 } else { 4046 4047 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 4048 } 4049 4050 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION 4051 redo A; 4052 } elsif ($nc == EOF_CHAR) { 4053 if ($self->{ct}->{type} == DOCTYPE_TOKEN) { 4054 4055 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); 4056 $self->{state} = DATA_STATE; 4057 $self->{ct}->{quirks} = 1; 4058 } else { 4059 4060 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 4061 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 4062 } 4063 4064 ## Reconsume. 4065 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION 4066 redo A; 4067 } elsif ($self->{is_xml} and 4068 $self->{ct}->{type} == DOCTYPE_TOKEN and 4069 $nc == 0x005B) { # [ 4070 4071 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal'); 4072 4073 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 4074 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE 4075 $self->{in_subset} = 1; 4076 4077 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4078 $self->{line_prev} = $self->{line}; 4079 $self->{column_prev} = $self->{column}; 4080 $self->{column}++; 4081 $self->{nc} 4082 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4083 } else { 4084 $self->{set_nc}->($self); 4085 } 4086 4087 return ($self->{ct}); # DOCTYPE 4088 redo A; 4089 } else { 4090 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM'); 4091 4092 if ($self->{ct}->{type} == DOCTYPE_TOKEN) { 4093 4094 $self->{ct}->{quirks} = 1; 4095 $self->{state} = 
BOGUS_DOCTYPE_STATE; 4096 } else { 4097 4098 $self->{state} = BOGUS_MD_STATE; 4099 } 4100 4101 4102 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4103 $self->{line_prev} = $self->{line}; 4104 $self->{column_prev} = $self->{column}; 4105 $self->{column}++; 4106 $self->{nc} 4107 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4108 } else { 4109 $self->{set_nc}->($self); 4110 } 4111 4112 redo A; 4113 } 4114 } elsif ($state == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) { 4115 if ($nc == 0x0022) { # " 4116 4117 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE; 4118 4119 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4120 $self->{line_prev} = $self->{line}; 4121 $self->{column_prev} = $self->{column}; 4122 $self->{column}++; 4123 $self->{nc} 4124 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4125 } else { 4126 $self->{set_nc}->($self); 4127 } 4128 4129 redo A; 4130 } elsif (not $self->{is_xml} and $nc == 0x003E) { # > 4131 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal'); 4132 4133 if ($self->{ct}->{type} == DOCTYPE_TOKEN) { 4134 4135 $self->{state} = DATA_STATE; 4136 $self->{ct}->{quirks} = 1; 4137 } else { 4138 4139 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 4140 } 4141 4142 4143 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4144 $self->{line_prev} = $self->{line}; 4145 $self->{column_prev} = $self->{column}; 4146 $self->{column}++; 4147 $self->{nc} 4148 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4149 } else { 4150 $self->{set_nc}->($self); 4151 } 4152 4153 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION 4154 redo A; 4155 } elsif ($nc == -1) { 4156 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal'); 4157 4158 if ($self->{ct}->{type} == DOCTYPE_TOKEN) { 4159 4160 $self->{state} = DATA_STATE; 4161 $self->{ct}->{quirks} = 1; 4162 } else { 4163 4164 $self->{state} = 
DOCTYPE_INTERNAL_SUBSET_STATE; 4165 } 4166 4167 ## reconsume 4168 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION 4169 redo A; 4170 } elsif ($nc == 0x0000) { 4171 $self->{ct}->{sysid} .= "\x{FFFD}"; # DOCTYPE/ENTITY/NOTATION 4172 ## Stay in the state. 4173 4174 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4175 $self->{line_prev} = $self->{line}; 4176 $self->{column_prev} = $self->{column}; 4177 $self->{column}++; 4178 $self->{nc} 4179 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4180 } else { 4181 $self->{set_nc}->($self); 4182 } 4183 4184 redo A; 4185 } else { 4186 4187 $self->{ct}->{sysid} .= chr $nc; # DOCTYPE/ENTITY/NOTATION 4188 $self->{read_until}->($self->{ct}->{sysid}, qq[\x00">], 4189 length $self->{ct}->{sysid}); 4190 4191 ## Stay in the state 4192 4193 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4194 $self->{line_prev} = $self->{line}; 4195 $self->{column_prev} = $self->{column}; 4196 $self->{column}++; 4197 $self->{nc} 4198 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4199 } else { 4200 $self->{set_nc}->($self); 4201 } 4202 4203 redo A; 4204 } 4205 } elsif ($state == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) { 4206 if ($nc == 0x0027) { # ' 4207 4208 $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE; 4209 4210 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4211 $self->{line_prev} = $self->{line}; 4212 $self->{column_prev} = $self->{column}; 4213 $self->{column}++; 4214 $self->{nc} 4215 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4216 } else { 4217 $self->{set_nc}->($self); 4218 } 4219 4220 redo A; 4221 } elsif (not $self->{is_xml} and $nc == 0x003E) { # > 4222 4223 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal'); 4224 4225 $self->{state} = DATA_STATE; 4226 4227 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4228 $self->{line_prev} = $self->{line}; 4229 
$self->{column_prev} = $self->{column}; 4230 $self->{column}++; 4231 $self->{nc} 4232 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4233 } else { 4234 $self->{set_nc}->($self); 4235 } 4236 4237 4238 $self->{ct}->{quirks} = 1; 4239 return ($self->{ct}); # DOCTYPE 4240 4241 redo A; 4242 } elsif ($nc == -1) { 4243 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal'); 4244 4245 if ($self->{ct}->{type} == DOCTYPE_TOKEN) { 4246 4247 $self->{state} = DATA_STATE; 4248 $self->{ct}->{quirks} = 1; 4249 } else { 4250 4251 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 4252 } 4253 4254 ## reconsume 4255 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION 4256 redo A; 4257 } elsif ($nc == 0x0000) { 4258 $self->{ct}->{sysid} .= "\x{FFFD}"; # DOCTYPE/ENTITY/NOTATION 4259 ## Stay in the state. 4260 4261 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4262 $self->{line_prev} = $self->{line}; 4263 $self->{column_prev} = $self->{column}; 4264 $self->{column}++; 4265 $self->{nc} 4266 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4267 } else { 4268 $self->{set_nc}->($self); 4269 } 4270 4271 redo A; 4272 } else { 4273 4274 $self->{ct}->{sysid} .= chr $nc; # DOCTYPE/ENTITY/NOTATION 4275 $self->{read_until}->($self->{ct}->{sysid}, qq[\x00'>], 4276 length $self->{ct}->{sysid}); 4277 4278 ## Stay in the state 4279 4280 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4281 $self->{line_prev} = $self->{line}; 4282 $self->{column_prev} = $self->{column}; 4283 $self->{column}++; 4284 $self->{nc} 4285 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4286 } else { 4287 $self->{set_nc}->($self); 4288 } 4289 4290 redo A; 4291 } 4292 } elsif ($state == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) { 4293 if ($is_space->{$nc}) { 4294 if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) { 4295 4296 $self->{state} = BEFORE_NDATA_STATE; 4297 } else { 4298 4299 ## Stay in the state 4300 } 4301 
4302 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4303 $self->{line_prev} = $self->{line}; 4304 $self->{column_prev} = $self->{column}; 4305 $self->{column}++; 4306 $self->{nc} 4307 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4308 } else { 4309 $self->{set_nc}->($self); 4310 } 4311 4312 redo A; 4313 } elsif ($nc == 0x003E) { # > 4314 if ($self->{ct}->{type} == DOCTYPE_TOKEN) { 4315 4316 $self->{state} = DATA_STATE; 4317 } else { 4318 4319 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 4320 } 4321 4322 4323 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4324 $self->{line_prev} = $self->{line}; 4325 $self->{column_prev} = $self->{column}; 4326 $self->{column}++; 4327 $self->{nc} 4328 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4329 } else { 4330 $self->{set_nc}->($self); 4331 } 4332 4333 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION 4334 redo A; 4335 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and 4336 ($nc == 0x004E or # N 4337 $nc == 0x006E)) { # n 4338 4339 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type 4340 $self->{state} = NDATA_STATE; 4341 $self->{kwd} = chr $nc; 4342 4343 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4344 $self->{line_prev} = $self->{line}; 4345 $self->{column_prev} = $self->{column}; 4346 $self->{column}++; 4347 $self->{nc} 4348 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4349 } else { 4350 $self->{set_nc}->($self); 4351 } 4352 4353 redo A; 4354 } elsif ($nc == -1) { 4355 if ($self->{ct}->{type} == DOCTYPE_TOKEN) { 4356 4357 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); 4358 $self->{state} = DATA_STATE; 4359 $self->{ct}->{quirks} = 1; 4360 } else { 4361 4362 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 4363 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 4364 } 4365 4366 
## reconsume 4367 return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION 4368 redo A; 4369 } elsif ($self->{is_xml} and 4370 $self->{ct}->{type} == DOCTYPE_TOKEN and 4371 $nc == 0x005B) { # [ 4372 4373 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 4374 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE 4375 $self->{in_subset} = 1; 4376 4377 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4378 $self->{line_prev} = $self->{line}; 4379 $self->{column_prev} = $self->{column}; 4380 $self->{column}++; 4381 $self->{nc} 4382 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4383 } else { 4384 $self->{set_nc}->($self); 4385 } 4386 4387 return ($self->{ct}); # DOCTYPE 4388 redo A; 4389 } else { 4390 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal'); 4391 4392 if ($self->{ct}->{type} == DOCTYPE_TOKEN) { 4393 4394 #$self->{ct}->{quirks} = 1; 4395 $self->{state} = BOGUS_DOCTYPE_STATE; 4396 } else { 4397 4398 $self->{state} = BOGUS_MD_STATE; 4399 } 4400 4401 4402 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4403 $self->{line_prev} = $self->{line}; 4404 $self->{column_prev} = $self->{column}; 4405 $self->{column}++; 4406 $self->{nc} 4407 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4408 } else { 4409 $self->{set_nc}->($self); 4410 } 4411 4412 redo A; 4413 } 4414 } elsif ($state == BEFORE_NDATA_STATE) { 4415 if ($is_space->{$nc}) { 4416 4417 ## Stay in the state. 
4418 4419 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4420 $self->{line_prev} = $self->{line}; 4421 $self->{column_prev} = $self->{column}; 4422 $self->{column}++; 4423 $self->{nc} 4424 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4425 } else { 4426 $self->{set_nc}->($self); 4427 } 4428 4429 redo A; 4430 } elsif ($nc == 0x003E) { # > 4431 4432 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 4433 4434 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4435 $self->{line_prev} = $self->{line}; 4436 $self->{column_prev} = $self->{column}; 4437 $self->{column}++; 4438 $self->{nc} 4439 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4440 } else { 4441 $self->{set_nc}->($self); 4442 } 4443 4444 return ($self->{ct}); # ENTITY 4445 redo A; 4446 } elsif ($nc == 0x004E or # N 4447 $nc == 0x006E) { # n 4448 4449 $self->{state} = NDATA_STATE; 4450 $self->{kwd} = chr $nc; 4451 4452 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4453 $self->{line_prev} = $self->{line}; 4454 $self->{column_prev} = $self->{column}; 4455 $self->{column}++; 4456 $self->{nc} 4457 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4458 } else { 4459 $self->{set_nc}->($self); 4460 } 4461 4462 redo A; 4463 } elsif ($nc == -1) { 4464 4465 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 4466 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 4467 ## reconsume 4468 return ($self->{ct}); # ENTITY 4469 redo A; 4470 } else { 4471 4472 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal'); 4473 $self->{state} = BOGUS_MD_STATE; 4474 4475 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4476 $self->{line_prev} = $self->{line}; 4477 $self->{column_prev} = $self->{column}; 4478 $self->{column}++; 4479 $self->{nc} 4480 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4481 } else { 4482 
$self->{set_nc}->($self); 4483 } 4484 4485 redo A; 4486 } 4487 } elsif ($state == BOGUS_DOCTYPE_STATE) { 4488 if ($nc == 0x003E) { # > 4489 4490 $self->{state} = DATA_STATE; 4491 4492 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4493 $self->{line_prev} = $self->{line}; 4494 $self->{column_prev} = $self->{column}; 4495 $self->{column}++; 4496 $self->{nc} 4497 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4498 } else { 4499 $self->{set_nc}->($self); 4500 } 4501 4502 4503 return ($self->{ct}); # DOCTYPE 4504 4505 redo A; 4506 } elsif ($self->{is_xml} and $nc == 0x005B) { # [ 4507 4508 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 4509 $self->{ct}->{has_internal_subset} = 1; # DOCTYPE 4510 $self->{in_subset} = 1; 4511 4512 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4513 $self->{line_prev} = $self->{line}; 4514 $self->{column_prev} = $self->{column}; 4515 $self->{column}++; 4516 $self->{nc} 4517 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4518 } else { 4519 $self->{set_nc}->($self); 4520 } 4521 4522 return ($self->{ct}); # DOCTYPE 4523 redo A; 4524 } elsif ($nc == -1) { 4525 4526 $self->{state} = DATA_STATE; 4527 ## reconsume 4528 4529 return ($self->{ct}); # DOCTYPE 4530 4531 redo A; 4532 } else { 4533 4534 my $s = ''; 4535 $self->{read_until}->($s, q{>[}, 0); 4536 4537 ## Stay in the state 4538 4539 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4540 $self->{line_prev} = $self->{line}; 4541 $self->{column_prev} = $self->{column}; 4542 $self->{column}++; 4543 $self->{nc} 4544 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4545 } else { 4546 $self->{set_nc}->($self); 4547 } 4548 4549 redo A; 4550 } 4551 } elsif ($state == CDATA_SECTION_STATE) { 4552 ## NOTE: "CDATA section state" in the state is jointly implemented 4553 ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|, 4554 ## and |CDATA_SECTION_MSE2_STATE|. 
4555 4556 ## XML5: "CDATA state". 4557 4558 if ($nc == 0x005D) { # ] 4559 4560 $self->{state} = CDATA_SECTION_MSE1_STATE; 4561 4562 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4563 $self->{line_prev} = $self->{line}; 4564 $self->{column_prev} = $self->{column}; 4565 $self->{column}++; 4566 $self->{nc} 4567 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4568 } else { 4569 $self->{set_nc}->($self); 4570 } 4571 4572 redo A; 4573 } elsif ($nc == -1) { 4574 if ($self->{is_xml}) { 4575 4576 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type 4577 } else { 4578 4579 } 4580 4581 $self->{state} = DATA_STATE; 4582 ## Reconsume. 4583 if (length $self->{ct}->{data}) { # character 4584 4585 return ($self->{ct}); # character 4586 } else { 4587 4588 ## No token to emit. $self->{ct} is discarded. 4589 } 4590 redo A; 4591 } else { 4592 4593 $self->{ct}->{data} .= chr $nc; 4594 $self->{read_until}->($self->{ct}->{data}, 4595 qq<\x00]>, 4596 length $self->{ct}->{data}); 4597 ## NOTE: NULLs are left as is (see spec's comment). However, 4598 ## a token cannot contain more than one U+0000 NULL character 4599 ## for the ease of processing in the tree constructor. 4600 4601 ## Stay in the state. 4602 4603 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4604 $self->{line_prev} = $self->{line}; 4605 $self->{column_prev} = $self->{column}; 4606 $self->{column}++; 4607 $self->{nc} 4608 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4609 } else { 4610 $self->{set_nc}->($self); 4611 } 4612 4613 redo A; 4614 } 4615 4616 ## ISSUE: "text tokens" in spec. 4617 } elsif ($state == CDATA_SECTION_MSE1_STATE) { 4618 ## XML5: "CDATA bracket state". 
4619 4620 if ($nc == 0x005D) { # ] 4621 4622 $self->{state} = CDATA_SECTION_MSE2_STATE; 4623 4624 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4625 $self->{line_prev} = $self->{line}; 4626 $self->{column_prev} = $self->{column}; 4627 $self->{column}++; 4628 $self->{nc} 4629 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4630 } else { 4631 $self->{set_nc}->($self); 4632 } 4633 4634 redo A; 4635 } else { 4636 4637 ## XML5: If EOF, "]" is not appended and changed to the data state. 4638 $self->{ct}->{data} .= ']'; 4639 $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state. 4640 ## Reconsume. 4641 redo A; 4642 } 4643 } elsif ($state == CDATA_SECTION_MSE2_STATE) { 4644 ## XML5: "CDATA end state". 4645 4646 if ($nc == 0x003E) { # > 4647 $self->{state} = DATA_STATE; 4648 4649 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4650 $self->{line_prev} = $self->{line}; 4651 $self->{column_prev} = $self->{column}; 4652 $self->{column}++; 4653 $self->{nc} 4654 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4655 } else { 4656 $self->{set_nc}->($self); 4657 } 4658 4659 if (length $self->{ct}->{data}) { # character 4660 4661 return ($self->{ct}); # character 4662 } else { 4663 4664 ## No token to emit. $self->{ct} is discarded. 4665 } 4666 redo A; 4667 } elsif ($nc == 0x005D) { # ] 4668 # character 4669 $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]". 4670 ## Stay in the state. 4671 4672 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4673 $self->{line_prev} = $self->{line}; 4674 $self->{column_prev} = $self->{column}; 4675 $self->{column}++; 4676 $self->{nc} 4677 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4678 } else { 4679 $self->{set_nc}->($self); 4680 } 4681 4682 redo A; 4683 } else { 4684 4685 $self->{ct}->{data} .= ']]'; # character 4686 $self->{state} = CDATA_SECTION_STATE; 4687 ## Reconsume. ## XML5: Emit. 
4688 redo A; 4689 } 4690 } elsif ($state == ENTITY_STATE) { 4691 if ($is_space->{$nc} or 4692 { 4693 0x003C => 1, 0x0026 => 1, -1 => 1, # <, & 4694 4695 ## Following characters are added here to detect parse 4696 ## error for "=" of "&=" in an unquoted attribute value. 4697 ## Though this disagree with the Web Applications 1.0 4698 ## spec, the result token sequences of both algorithms 4699 ## should be same, as these characters cannot form a part 4700 ## of character references. 4701 0x0022 => 1, 0x0027 => 1, 0x0060 => 1, # ", ', ` 4702 0x003D => 1, # = 4703 4704 ## As a result of the addition above, the following clause 4705 ## has no effect in fact. 4706 $self->{entity_add} => 1, 4707 }->{$nc}) { 4708 if ($self->{is_xml}) { 4709 4710 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero', 4711 line => $self->{line_prev}, 4712 column => $self->{column_prev} 4713 + ($nc == -1 ? 1 : 0)); 4714 } else { 4715 4716 ## No error 4717 } 4718 ## Don't consume 4719 ## Return nothing. 
4720 # 4721 } elsif ($nc == 0x0023) { # # 4722 4723 $self->{state} = ENTITY_HASH_STATE; 4724 $self->{kwd} = '#'; 4725 4726 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4727 $self->{line_prev} = $self->{line}; 4728 $self->{column_prev} = $self->{column}; 4729 $self->{column}++; 4730 $self->{nc} 4731 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4732 } else { 4733 $self->{set_nc}->($self); 4734 } 4735 4736 redo A; 4737 } elsif ($self->{is_xml} or 4738 (0x0041 <= $nc and 4739 $nc <= 0x005A) or # A..Z 4740 (0x0061 <= $nc and 4741 $nc <= 0x007A)) { # a..z 4742 4743 #require HTML::HTML5::Parser::NamedEntityList; 4744 $self->{state} = ENTITY_NAME_STATE; 4745 $self->{kwd} = chr $nc; 4746 $self->{entity__value} = $self->{kwd}; 4747 $self->{entity__match} = 0; 4748 4749 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4750 $self->{line_prev} = $self->{line}; 4751 $self->{column_prev} = $self->{column}; 4752 $self->{column}++; 4753 $self->{nc} 4754 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4755 } else { 4756 $self->{set_nc}->($self); 4757 } 4758 4759 redo A; 4760 } else { 4761 4762 ## Return nothing. 4763 # 4764 } 4765 4766 ## We implement the "consume a character reference" in a 4767 ## slightly different way from the spec's algorithm, though the 4768 ## end result should be exactly same. 4769 4770 ## NOTE: No character is consumed by the "consume a character 4771 ## reference" algorithm. In other word, there is an "&" character 4772 ## that does not introduce a character reference, which would be 4773 ## appended to the parent element or the attribute value in later 4774 ## process of the tokenizer. 4775 4776 if ($self->{prev_state} == DATA_STATE or 4777 $self->{prev_state} == RCDATA_STATE) { 4778 4779 $self->{state} = $self->{prev_state}; 4780 ## Reconsume. 
4781 return ({type => CHARACTER_TOKEN, data => '&', 4782 line => $self->{line_prev}, 4783 column => $self->{column_prev}, 4784 }); 4785 redo A; 4786 } else { 4787 4788 $self->{ca}->{value} .= '&'; 4789 $self->{state} = $self->{prev_state}; 4790 ## Reconsume. 4791 redo A; 4792 } 4793 } elsif ($state == ENTITY_HASH_STATE) { 4794 if ($nc == 0x0078) { # x 4795 4796 $self->{state} = HEXREF_X_STATE; 4797 $self->{kwd} .= chr $nc; 4798 4799 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4800 $self->{line_prev} = $self->{line}; 4801 $self->{column_prev} = $self->{column}; 4802 $self->{column}++; 4803 $self->{nc} 4804 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4805 } else { 4806 $self->{set_nc}->($self); 4807 } 4808 4809 redo A; 4810 } elsif ($nc == 0x0058) { # X 4811 4812 if ($self->{is_xml}) { 4813 $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type 4814 } 4815 $self->{state} = HEXREF_X_STATE; 4816 $self->{kwd} .= chr $nc; 4817 4818 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4819 $self->{line_prev} = $self->{line}; 4820 $self->{column_prev} = $self->{column}; 4821 $self->{column}++; 4822 $self->{nc} 4823 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4824 } else { 4825 $self->{set_nc}->($self); 4826 } 4827 4828 redo A; 4829 } elsif (0x0030 <= $nc and 4830 $nc <= 0x0039) { # 0..9 4831 4832 $self->{state} = NCR_NUM_STATE; 4833 $self->{kwd} = $nc - 0x0030; 4834 4835 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4836 $self->{line_prev} = $self->{line}; 4837 $self->{column_prev} = $self->{column}; 4838 $self->{column}++; 4839 $self->{nc} 4840 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4841 } else { 4842 $self->{set_nc}->($self); 4843 } 4844 4845 redo A; 4846 } else { 4847 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero', 4848 line => $self->{line_prev}, 4849 column => $self->{column_prev} 
- 1); 4850 4851 ## NOTE: According to the spec algorithm, nothing is returned, 4852 ## and then "&#" is appended to the parent element or the attribute 4853 ## value in the later processing. 4854 4855 if ($self->{prev_state} == DATA_STATE or 4856 $self->{prev_state} == RCDATA_STATE) { 4857 4858 $self->{state} = $self->{prev_state}; 4859 ## Reconsume. 4860 return ({type => CHARACTER_TOKEN, 4861 data => '&#', 4862 line => $self->{line_prev}, 4863 column => $self->{column_prev} - 1, 4864 }); 4865 redo A; 4866 } else { 4867 4868 $self->{ca}->{value} .= '&#'; 4869 $self->{state} = $self->{prev_state}; 4870 ## Reconsume. 4871 redo A; 4872 } 4873 } 4874 } elsif ($state == NCR_NUM_STATE) { 4875 if (0x0030 <= $nc and 4876 $nc <= 0x0039) { # 0..9 4877 4878 $self->{kwd} *= 10; 4879 $self->{kwd} += $nc - 0x0030; 4880 4881 ## Stay in the state. 4882 4883 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4884 $self->{line_prev} = $self->{line}; 4885 $self->{column_prev} = $self->{column}; 4886 $self->{column}++; 4887 $self->{nc} 4888 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4889 } else { 4890 $self->{set_nc}->($self); 4891 } 4892 4893 redo A; 4894 } elsif ($nc == 0x003B) { # ; 4895 4896 4897 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4898 $self->{line_prev} = $self->{line}; 4899 $self->{column_prev} = $self->{column}; 4900 $self->{column}++; 4901 $self->{nc} 4902 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 4903 } else { 4904 $self->{set_nc}->($self); 4905 } 4906 4907 # 4908 } else { 4909 4910 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc'); 4911 ## Reconsume. 
4912 # 4913 } 4914 4915 my $code = $self->{kwd}; 4916 my $l = $self->{line_prev}; 4917 my $c = $self->{column_prev}; 4918 if ((not $self->{is_xml} and $charref_map->{$code}) or 4919 ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or 4920 ($self->{is_xml} and $code == 0x0000)) { 4921 4922 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference', 4923 text => (sprintf 'U+%04X', $code), 4924 line => $l, column => $c); 4925 $code = $charref_map->{$code}; 4926 } elsif ($code > 0x10FFFF) { 4927 4928 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference', 4929 text => (sprintf 'U-%08X', $code), 4930 line => $l, column => $c); 4931 $code = 0xFFFD; 4932 } 4933 4934 if ($self->{prev_state} == DATA_STATE or 4935 $self->{prev_state} == RCDATA_STATE) { 4936 4937 $self->{state} = $self->{prev_state}; 4938 ## Reconsume. 4939 return ({type => CHARACTER_TOKEN, data => chr $code, 4940 has_reference => 1, 4941 line => $l, column => $c, 4942 }); 4943 redo A; 4944 } else { 4945 4946 $self->{ca}->{value} .= chr $code; 4947 $self->{ca}->{has_reference} = 1; 4948 $self->{state} = $self->{prev_state}; 4949 ## Reconsume. 4950 redo A; 4951 } 4952 } elsif ($state == HEXREF_X_STATE) { 4953 if ((0x0030 <= $nc and $nc <= 0x0039) or 4954 (0x0041 <= $nc and $nc <= 0x0046) or 4955 (0x0061 <= $nc and $nc <= 0x0066)) { 4956 # 0..9, A..F, a..f 4957 4958 $self->{state} = HEXREF_HEX_STATE; 4959 $self->{kwd} = 0; 4960 ## Reconsume. 4961 redo A; 4962 } else { 4963 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro', 4964 line => $self->{line_prev}, 4965 column => $self->{column_prev} - 2); 4966 4967 ## NOTE: According to the spec algorithm, nothing is returned, 4968 ## and then "&#" followed by "X" or "x" is appended to the parent 4969 ## element or the attribute value in the later processing. 
4970 4971 if ($self->{prev_state} == DATA_STATE or 4972 $self->{prev_state} == RCDATA_STATE) { 4973 4974 $self->{state} = $self->{prev_state}; 4975 ## Reconsume. 4976 return ({type => CHARACTER_TOKEN, 4977 data => '&' . $self->{kwd}, 4978 line => $self->{line_prev}, 4979 column => $self->{column_prev} - length $self->{kwd}, 4980 }); 4981 redo A; 4982 } else { 4983 4984 $self->{ca}->{value} .= '&' . $self->{kwd}; 4985 $self->{state} = $self->{prev_state}; 4986 ## Reconsume. 4987 redo A; 4988 } 4989 } 4990 } elsif ($state == HEXREF_HEX_STATE) { 4991 if (0x0030 <= $nc and $nc <= 0x0039) { 4992 # 0..9 4993 4994 $self->{kwd} *= 0x10; 4995 $self->{kwd} += $nc - 0x0030; 4996 ## Stay in the state. 4997 4998 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 4999 $self->{line_prev} = $self->{line}; 5000 $self->{column_prev} = $self->{column}; 5001 $self->{column}++; 5002 $self->{nc} 5003 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5004 } else { 5005 $self->{set_nc}->($self); 5006 } 5007 5008 redo A; 5009 } elsif (0x0061 <= $nc and 5010 $nc <= 0x0066) { # a..f 5011 5012 $self->{kwd} *= 0x10; 5013 $self->{kwd} += $nc - 0x0060 + 9; 5014 ## Stay in the state. 5015 5016 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5017 $self->{line_prev} = $self->{line}; 5018 $self->{column_prev} = $self->{column}; 5019 $self->{column}++; 5020 $self->{nc} 5021 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5022 } else { 5023 $self->{set_nc}->($self); 5024 } 5025 5026 redo A; 5027 } elsif (0x0041 <= $nc and 5028 $nc <= 0x0046) { # A..F 5029 5030 $self->{kwd} *= 0x10; 5031 $self->{kwd} += $nc - 0x0040 + 9; 5032 ## Stay in the state. 
5033 5034 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5035 $self->{line_prev} = $self->{line}; 5036 $self->{column_prev} = $self->{column}; 5037 $self->{column}++; 5038 $self->{nc} 5039 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5040 } else { 5041 $self->{set_nc}->($self); 5042 } 5043 5044 redo A; 5045 } elsif ($nc == 0x003B) { # ; 5046 5047 5048 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5049 $self->{line_prev} = $self->{line}; 5050 $self->{column_prev} = $self->{column}; 5051 $self->{column}++; 5052 $self->{nc} 5053 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5054 } else { 5055 $self->{set_nc}->($self); 5056 } 5057 5058 # 5059 } else { 5060 5061 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc', 5062 line => $self->{line}, 5063 column => $self->{column}); 5064 ## Reconsume. 5065 # 5066 } 5067 5068 my $code = $self->{kwd}; 5069 my $l = $self->{line_prev}; 5070 my $c = $self->{column_prev}; 5071 if ((not $self->{is_xml} and $charref_map->{$code}) or 5072 ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or 5073 ($self->{is_xml} and $code == 0x0000)) { 5074 5075 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference', 5076 text => (sprintf 'U+%04X', $code), 5077 line => $l, column => $c); 5078 $code = $charref_map->{$code}; 5079 } elsif ($code > 0x10FFFF) { 5080 5081 $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference', 5082 text => (sprintf 'U-%08X', $code), 5083 line => $l, column => $c); 5084 $code = 0xFFFD; 5085 } 5086 5087 if ($self->{prev_state} == DATA_STATE or 5088 $self->{prev_state} == RCDATA_STATE) { 5089 5090 $self->{state} = $self->{prev_state}; 5091 ## Reconsume. 
5092 return ({type => CHARACTER_TOKEN, data => chr $code, 5093 has_reference => 1, 5094 line => $l, column => $c, 5095 }); 5096 redo A; 5097 } else { 5098 5099 $self->{ca}->{value} .= chr $code; 5100 $self->{ca}->{has_reference} = 1; 5101 $self->{state} = $self->{prev_state}; 5102 ## Reconsume. 5103 redo A; 5104 } 5105 } elsif ($state == ENTITY_NAME_STATE) { 5106 if ((0x0041 <= $nc and # a 5107 $nc <= 0x005A) or # x 5108 (0x0061 <= $nc and # a 5109 $nc <= 0x007A) or # z 5110 (0x0030 <= $nc and # 0 5111 $nc <= 0x0039) or # 9 5112 $nc == 0x003B or # ; 5113 ($self->{is_xml} and 5114 not ($is_space->{$nc} or 5115 { 5116 0x003C => 1, 0x0026 => 1, -1 => 1, # <, & 5117 5118 ## See comment in the |ENTITY_STATE|'s |if| 5119 ## statement for the rationale of addition of these 5120 ## characters. 5121 0x0022 => 1, 0x0027 => 1, 0x0060 => 1, # ", ', ` 5122 0x003D => 1, # = 5123 5124 ## This is redundant for the same reason. 5125 $self->{entity_add} => 1, 5126 }->{$nc}))) { 5127 #local %entity2char; 5128 $self->{kwd} .= chr $nc; ## Bare entity name. 5129 if (defined $entity2char{$self->{kwd}} or ## HTML charrefs. 5130 $self->{ge}->{$self->{kwd}}) { ## XML general entities. 5131 if ($nc == 0x003B) { # ; 5132 if (defined $self->{ge}->{$self->{kwd}}) { 5133 ## A declared XML entity. 5134 if ($self->{ge}->{$self->{kwd}}->{only_text}) { 5135 5136 $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value}; 5137 } else { 5138 if (defined $self->{ge}->{$self->{kwd}}->{notation}) { 5139 5140 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type 5141 value => $self->{kwd}); 5142 } else { 5143 5144 } 5145 $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand 5146 } 5147 } else { 5148 ## An HTML character reference. 5149 if ($self->{is_xml}) { 5150 ## Not a declared XML entity. 
5151 5152 $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type 5153 value => $self->{kwd}, 5154 level => { 5155 'amp;' => $self->{level}->{warn}, 5156 'quot;' => $self->{level}->{warn}, 5157 'lt;' => $self->{level}->{warn}, 5158 'gt;' => $self->{level}->{warn}, 5159 'apos;' => $self->{level}->{warn}, 5160 }->{$self->{kwd}} || 5161 $self->{level}->{must}, 5162 line => $self->{line_prev}, 5163 column => $self->{column} - length $self->{kwd}); 5164 } else { 5165 5166 } 5167 $self->{entity__value} = $entity2char{$self->{kwd}}; 5168 } 5169 $self->{entity__match} = 1; ## Matched exactly with ";" entity. 5170 5171 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5172 $self->{line_prev} = $self->{line}; 5173 $self->{column_prev} = $self->{column}; 5174 $self->{column}++; 5175 $self->{nc} 5176 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5177 } else { 5178 $self->{set_nc}->($self); 5179 } 5180 5181 # 5182 } else { 5183 5184 $self->{entity__value} = $entity2char{$self->{kwd}}; 5185 $self->{entity__match} = -1; ## Exactly matched to non-";" entity. 5186 ## Stay in the state. 5187 5188 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5189 $self->{line_prev} = $self->{line}; 5190 $self->{column_prev} = $self->{column}; 5191 $self->{column}++; 5192 $self->{nc} 5193 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5194 } else { 5195 $self->{set_nc}->($self); 5196 } 5197 5198 redo A; 5199 } 5200 } else { 5201 if ($nc == 0x003B) { # ; 5202 ## A reserved HTML character reference or an undeclared 5203 ## XML entity reference. 
5204 5205 $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## XXXtype 5206 value => $self->{kwd}, 5207 level => $self->{level}->{must}, 5208 line => $self->{line_prev}, 5209 column => $self->{column} - length $self->{kwd}); 5210 $self->{entity__value} .= chr $nc; 5211 $self->{entity__match} *= 2; ## Matched (positive) or not (zero) 5212 5213 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5214 $self->{line_prev} = $self->{line}; 5215 $self->{column_prev} = $self->{column}; 5216 $self->{column}++; 5217 $self->{nc} 5218 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5219 } else { 5220 $self->{set_nc}->($self); 5221 } 5222 5223 # 5224 } else { 5225 5226 $self->{entity__value} .= chr $nc; 5227 $self->{entity__match} *= 2; ## Matched (positive) or not (zero) 5228 ## Stay in the state. 5229 5230 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5231 $self->{line_prev} = $self->{line}; 5232 $self->{column_prev} = $self->{column}; 5233 $self->{column}++; 5234 $self->{nc} 5235 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5236 } else { 5237 $self->{set_nc}->($self); 5238 } 5239 5240 redo A; 5241 } 5242 } 5243 } elsif ($nc == 0x003D) { # = 5244 if ($self->{entity__match} < 0 and 5245 $self->{prev_state} != DATA_STATE and # in attribute 5246 $self->{prev_state} != RCDATA_STATE) { 5247 $self->{entity__match} = 0; 5248 } 5249 } 5250 5251 my $data; 5252 my $has_ref; 5253 if ($self->{entity__match} > 0) { ## A ";" entity. 5254 5255 $data = $self->{entity__value}; 5256 ## Strictly speaking the $has_ref flag should not be set if 5257 ## there is no matched entity. However, this flag is used 5258 ## only in contexts where use of an 5259 ## unexpanded-entity-reference-like string is in no way 5260 ## allowed, so it should not make any difference in theory. 5261 $has_ref = 1; 5262 # 5263 } elsif ($self->{entity__match} < 0) { ## Matched to non-";" entity. 
5264 if ($self->{prev_state} != DATA_STATE and # in attribute 5265 $self->{prev_state} != RCDATA_STATE and 5266 $self->{entity__match} < -1) { 5267 ## In attribute-value contexts, matched non-";" string is 5268 ## left as is if there is trailing alphabetical letters. 5269 5270 $data = '&' . $self->{kwd}; 5271 # 5272 } else { 5273 ## In attribute-value contexts, exactly matched non-";" 5274 ## string is replaced as a character reference. In any 5275 ## context, matched non-";" string with or without trailing 5276 ## alphabetical letters is replaced as a character reference 5277 ## (with trailing letters). Note that use of a no-";" 5278 ## character reference is always non-conforming. 5279 5280 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc'); 5281 $data = $self->{entity__value}; 5282 $has_ref = 1; 5283 # 5284 } 5285 } else { ## Unmatched string. 5286 if ($self->{is_xml} and not $self->{kwd} =~ /;$/) { 5287 5288 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero', 5289 line => $self->{line_prev}, 5290 column => $self->{column_prev} - length $self->{kwd}); 5291 } else { 5292 5293 } 5294 $data = '&' . $self->{kwd}; 5295 # 5296 } 5297 5298 ## NOTE: In these cases, when a character reference is found, 5299 ## it is consumed and a character token is returned, or, otherwise, 5300 ## nothing is consumed and returned, according to the spec algorithm. 5301 ## In this implementation, anything that has been examined by the 5302 ## tokenizer is appended to the parent element or the attribute value 5303 ## as string, either literal string when no character reference or 5304 ## entity-replaced string otherwise, in this stage, since any characters 5305 ## that would not be consumed are appended in the data state or in an 5306 ## appropriate attribute value state anyway. 5307 5308 if ($self->{prev_state} == DATA_STATE or 5309 $self->{prev_state} == RCDATA_STATE) { 5310 5311 $self->{state} = $self->{prev_state}; 5312 ## Reconsume. 
5313 return ({type => CHARACTER_TOKEN, 5314 data => $data, 5315 has_reference => $has_ref, 5316 line => $self->{line_prev}, 5317 column => $self->{column_prev} + 1 - length $self->{kwd}, 5318 }); 5319 redo A; 5320 } else { 5321 5322 $self->{ca}->{value} .= $data; 5323 $self->{ca}->{has_reference} = 1 if $has_ref; 5324 $self->{state} = $self->{prev_state}; 5325 ## Reconsume. 5326 redo A; 5327 } 5328 5329 ## ========== XML-only states ========== 5330 5331 } elsif ($state == PI_STATE) { 5332 ## XML5: "Pi state" and "DOCTYPE pi state". 5333 5334 if ($is_space->{$nc} or 5335 $nc == 0x003F or # ? 5336 $nc == -1) { 5337 ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE 5338 ## pi state": Switch to the "DOCTYPE pi after state". EOF: 5339 ## "DOCTYPE pi state": Parse error, switch to the "data 5340 ## state". 5341 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type 5342 line => $self->{line_prev}, 5343 column => $self->{column_prev} 5344 - 1 * ($nc != -1)); 5345 $self->{state} = BOGUS_COMMENT_STATE; 5346 ## Reconsume. 5347 $self->{ct} = {type => COMMENT_TOKEN, 5348 data => '?', 5349 line => $self->{line_prev}, 5350 column => $self->{column_prev} 5351 - 1 * ($nc != -1), 5352 }; 5353 redo A; 5354 } else { 5355 ## XML5: "DOCTYPE pi state": Stay in the state. 5356 if ($nc == 0x0000) { 5357 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); 5358 } 5359 $self->{ct} = {type => PI_TOKEN, 5360 target => $nc == 0x0000 ? 
"\x{FFFD}" : chr $nc, 5361 data => '', 5362 line => $self->{line_prev}, 5363 column => $self->{column_prev} - 1, 5364 }; 5365 $self->{state} = PI_TARGET_STATE; 5366 5367 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5368 $self->{line_prev} = $self->{line}; 5369 $self->{column_prev} = $self->{column}; 5370 $self->{column}++; 5371 $self->{nc} 5372 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5373 } else { 5374 $self->{set_nc}->($self); 5375 } 5376 5377 redo A; 5378 } 5379 } elsif ($state == PI_TARGET_STATE) { 5380 if ($is_space->{$nc}) { 5381 $self->{state} = PI_TARGET_AFTER_STATE; 5382 $self->{kwd} = chr $nc; # "temporary buffer" 5383 5384 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5385 $self->{line_prev} = $self->{line}; 5386 $self->{column_prev} = $self->{column}; 5387 $self->{column}++; 5388 $self->{nc} 5389 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5390 } else { 5391 $self->{set_nc}->($self); 5392 } 5393 5394 redo A; 5395 } elsif ($nc == EOF_CHAR) { 5396 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type 5397 if ($self->{in_subset}) { 5398 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 5399 } else { 5400 $self->{state} = DATA_STATE; 5401 } 5402 ## Reconsume. 5403 return ({type => COMMENT_TOKEN, 5404 data => '?' . $self->{ct}->{target}, 5405 line => $self->{ct}->{line}, 5406 column => $self->{ct}->{column}}); 5407 redo A; 5408 } elsif ($nc == 0x003F) { # ? 
5409 $self->{state} = PI_AFTER_STATE; 5410 $self->{kwd} = ''; # "temporary buffer" 5411 5412 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5413 $self->{line_prev} = $self->{line}; 5414 $self->{column_prev} = $self->{column}; 5415 $self->{column}++; 5416 $self->{nc} 5417 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5418 } else { 5419 $self->{set_nc}->($self); 5420 } 5421 5422 redo A; 5423 } else { 5424 ## XML5: typo ("tag name" -> "target") 5425 if ($nc == 0x0000) { 5426 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); 5427 } 5428 $self->{ct}->{target} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # pi 5429 5430 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5431 $self->{line_prev} = $self->{line}; 5432 $self->{column_prev} = $self->{column}; 5433 $self->{column}++; 5434 $self->{nc} 5435 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5436 } else { 5437 $self->{set_nc}->($self); 5438 } 5439 5440 redo A; 5441 } 5442 } elsif ($state == PI_TARGET_AFTER_STATE) { 5443 if ($is_space->{$nc}) { 5444 $self->{kwd} .= chr $nc; # "temporary buffer" 5445 ## Stay in the state. 5446 5447 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5448 $self->{line_prev} = $self->{line}; 5449 $self->{column_prev} = $self->{column}; 5450 $self->{column}++; 5451 $self->{nc} 5452 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5453 } else { 5454 $self->{set_nc}->($self); 5455 } 5456 5457 redo A; 5458 } else { 5459 $self->{state} = PI_DATA_STATE; 5460 ## Reprocess. 5461 redo A; 5462 } 5463 } elsif ($state == PI_DATA_STATE) { 5464 if ($nc == 0x003F) { # ? 
5465 $self->{state} = PI_DATA_AFTER_STATE; 5466 5467 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5468 $self->{line_prev} = $self->{line}; 5469 $self->{column_prev} = $self->{column}; 5470 $self->{column}++; 5471 $self->{nc} 5472 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5473 } else { 5474 $self->{set_nc}->($self); 5475 } 5476 5477 redo A; 5478 } elsif ($nc == EOF_CHAR) { 5479 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type 5480 if ($self->{in_subset}) { 5481 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state" 5482 } else { 5483 $self->{state} = DATA_STATE; 5484 } 5485 ## Reprocess. 5486 return ({type => COMMENT_TOKEN, 5487 data => '?' . $self->{ct}->{target} . 5488 $self->{kwd} . # "temporary buffer" 5489 $self->{ct}->{data}, 5490 line => $self->{ct}->{line}, 5491 column => $self->{ct}->{column}}); 5492 redo A; 5493 } else { 5494 if ($nc == 0x0000) { 5495 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); 5496 } 5497 $self->{ct}->{data} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # pi 5498 $self->{read_until}->($self->{ct}->{data}, qq[\x00?], 5499 length $self->{ct}->{data}); 5500 ## Stay in the state. 5501 5502 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5503 $self->{line_prev} = $self->{line}; 5504 $self->{column_prev} = $self->{column}; 5505 $self->{column}++; 5506 $self->{nc} 5507 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5508 } else { 5509 $self->{set_nc}->($self); 5510 } 5511 5512 ## Reprocess. 5513 redo A; 5514 } 5515 } elsif ($state == PI_AFTER_STATE) { 5516 ## XML5: Part of "Pi after state". 
5517 5518 if ($nc == 0x003E) { # > 5519 if ($self->{in_subset}) { 5520 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 5521 } else { 5522 $self->{state} = DATA_STATE; 5523 } 5524 5525 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5526 $self->{line_prev} = $self->{line}; 5527 $self->{column_prev} = $self->{column}; 5528 $self->{column}++; 5529 $self->{nc} 5530 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5531 } else { 5532 $self->{set_nc}->($self); 5533 } 5534 5535 return ($self->{ct}); # pi 5536 redo A; 5537 } elsif ($nc == 0x003F) { # ? 5538 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type 5539 line => $self->{line_prev}, 5540 column => $self->{column_prev}); ## XML5: no error 5541 $self->{ct}->{data} .= '?'; 5542 $self->{state} = PI_DATA_AFTER_STATE; 5543 5544 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5545 $self->{line_prev} = $self->{line}; 5546 $self->{column_prev} = $self->{column}; 5547 $self->{column}++; 5548 $self->{nc} 5549 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5550 } else { 5551 $self->{set_nc}->($self); 5552 } 5553 5554 redo A; 5555 } else { 5556 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type 5557 line => $self->{line_prev}, 5558 column => $self->{column_prev} 5559 + 1 * ($nc == -1)); ## XML5: no error 5560 $self->{ct}->{data} .= '?'; ## XML5: not appended 5561 $self->{state} = PI_DATA_STATE; 5562 ## Reprocess. 5563 redo A; 5564 } 5565 } elsif ($state == PI_DATA_AFTER_STATE) { 5566 ## XML5: Same as "pi after state" and "DOCTYPE pi after state". 
5567 5568 if ($nc == 0x003E) { # > 5569 if ($self->{in_subset}) { 5570 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 5571 } else { 5572 $self->{state} = DATA_STATE; 5573 } 5574 5575 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5576 $self->{line_prev} = $self->{line}; 5577 $self->{column_prev} = $self->{column}; 5578 $self->{column}++; 5579 $self->{nc} 5580 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5581 } else { 5582 $self->{set_nc}->($self); 5583 } 5584 5585 return ($self->{ct}); # pi 5586 redo A; 5587 } elsif ($nc == 0x003F) { # ? 5588 $self->{ct}->{data} .= '?'; 5589 ## Stay in the state. 5590 5591 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5592 $self->{line_prev} = $self->{line}; 5593 $self->{column_prev} = $self->{column}; 5594 $self->{column}++; 5595 $self->{nc} 5596 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5597 } else { 5598 $self->{set_nc}->($self); 5599 } 5600 5601 redo A; 5602 } else { 5603 $self->{ct}->{data} .= '?'; ## XML5: not appended 5604 $self->{state} = PI_DATA_STATE; 5605 ## Reprocess. 5606 redo A; 5607 } 5608 5609 } elsif ($state == DOCTYPE_INTERNAL_SUBSET_STATE) { 5610 if ($nc == 0x003C) { # < 5611 $self->{state} = DOCTYPE_TAG_STATE; 5612 5613 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5614 $self->{line_prev} = $self->{line}; 5615 $self->{column_prev} = $self->{column}; 5616 $self->{column}++; 5617 $self->{nc} 5618 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5619 } else { 5620 $self->{set_nc}->($self); 5621 } 5622 5623 redo A; 5624 } elsif ($nc == 0x0025) { # % 5625 ## XML5: Not defined yet. 
5626 5627 ## TODO: parameter entity expansion 5628 5629 if (not $self->{stop_processing} and 5630 not $self->{document}->xml_standalone) { 5631 $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type 5632 level => $self->{level}->{info}); 5633 $self->{stop_processing} = 1; 5634 } 5635 5636 5637 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5638 $self->{line_prev} = $self->{line}; 5639 $self->{column_prev} = $self->{column}; 5640 $self->{column}++; 5641 $self->{nc} 5642 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5643 } else { 5644 $self->{set_nc}->($self); 5645 } 5646 5647 redo A; 5648 } elsif ($nc == 0x005D) { # ] 5649 delete $self->{in_subset}; 5650 $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE; 5651 5652 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5653 $self->{line_prev} = $self->{line}; 5654 $self->{column_prev} = $self->{column}; 5655 $self->{column}++; 5656 $self->{nc} 5657 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5658 } else { 5659 $self->{set_nc}->($self); 5660 } 5661 5662 redo A; 5663 } elsif ($is_space->{$nc}) { 5664 ## Stay in the state. 5665 5666 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5667 $self->{line_prev} = $self->{line}; 5668 $self->{column_prev} = $self->{column}; 5669 $self->{column}++; 5670 $self->{nc} 5671 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5672 } else { 5673 $self->{set_nc}->($self); 5674 } 5675 5676 redo A; 5677 } elsif ($nc == EOF_CHAR) { 5678 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type 5679 delete $self->{in_subset}; 5680 $self->{state} = DATA_STATE; 5681 ## Reconsume. 5682 return ({type => END_OF_DOCTYPE_TOKEN}); 5683 redo A; 5684 } else { 5685 unless ($self->{internal_subset_tainted}) { 5686 ## XML5: No parse error. 
5687 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset'); 5688 $self->{internal_subset_tainted} = 1; 5689 } 5690 ## Stay in the state. 5691 5692 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5693 $self->{line_prev} = $self->{line}; 5694 $self->{column_prev} = $self->{column}; 5695 $self->{column}++; 5696 $self->{nc} 5697 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5698 } else { 5699 $self->{set_nc}->($self); 5700 } 5701 5702 redo A; 5703 } 5704 } elsif ($state == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) { 5705 if ($nc == 0x003E) { # > 5706 $self->{state} = DATA_STATE; 5707 5708 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5709 $self->{line_prev} = $self->{line}; 5710 $self->{column_prev} = $self->{column}; 5711 $self->{column}++; 5712 $self->{nc} 5713 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5714 } else { 5715 $self->{set_nc}->($self); 5716 } 5717 5718 return ({type => END_OF_DOCTYPE_TOKEN}); 5719 redo A; 5720 } elsif ($nc == EOF_CHAR) { 5721 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); 5722 $self->{state} = DATA_STATE; 5723 ## Reconsume. 5724 return ({type => END_OF_DOCTYPE_TOKEN}); 5725 redo A; 5726 } else { 5727 ## XML5: No parse error and stay in the state. 
5728 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type 5729 5730 $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE; 5731 5732 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5733 $self->{line_prev} = $self->{line}; 5734 $self->{column_prev} = $self->{column}; 5735 $self->{column}++; 5736 $self->{nc} 5737 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5738 } else { 5739 $self->{set_nc}->($self); 5740 } 5741 5742 redo A; 5743 } 5744 } elsif ($state == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) { 5745 if ($nc == 0x003E) { # > 5746 $self->{state} = DATA_STATE; 5747 5748 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5749 $self->{line_prev} = $self->{line}; 5750 $self->{column_prev} = $self->{column}; 5751 $self->{column}++; 5752 $self->{nc} 5753 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5754 } else { 5755 $self->{set_nc}->($self); 5756 } 5757 5758 return ({type => END_OF_DOCTYPE_TOKEN}); 5759 redo A; 5760 } elsif ($nc == EOF_CHAR) { 5761 $self->{state} = DATA_STATE; 5762 ## Reconsume. 5763 return ({type => END_OF_DOCTYPE_TOKEN}); 5764 redo A; 5765 } else { 5766 ## Stay in the state. 5767 5768 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5769 $self->{line_prev} = $self->{line}; 5770 $self->{column_prev} = $self->{column}; 5771 $self->{column}++; 5772 $self->{nc} 5773 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5774 } else { 5775 $self->{set_nc}->($self); 5776 } 5777 5778 redo A; 5779 } 5780 } elsif ($state == DOCTYPE_TAG_STATE) { 5781 if ($nc == 0x0021) { # ! 
5782 $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE; 5783 5784 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5785 $self->{line_prev} = $self->{line}; 5786 $self->{column_prev} = $self->{column}; 5787 $self->{column}++; 5788 $self->{nc} 5789 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5790 } else { 5791 $self->{set_nc}->($self); 5792 } 5793 5794 redo A; 5795 } elsif ($nc == 0x003F) { # ? 5796 $self->{state} = PI_STATE; 5797 5798 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5799 $self->{line_prev} = $self->{line}; 5800 $self->{column_prev} = $self->{column}; 5801 $self->{column}++; 5802 $self->{nc} 5803 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5804 } else { 5805 $self->{set_nc}->($self); 5806 } 5807 5808 redo A; 5809 } elsif ($nc == EOF_CHAR) { 5810 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago'); 5811 $self->{state} = DATA_STATE; 5812 ## Reconsume. 5813 redo A; 5814 } else { 5815 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error. 5816 line => $self->{line_prev}, 5817 column => $self->{column_prev}); 5818 $self->{state} = BOGUS_COMMENT_STATE; 5819 $self->{ct} = {type => COMMENT_TOKEN, 5820 data => '', 5821 }; ## NOTE: Will be discarded. 5822 5823 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5824 $self->{line_prev} = $self->{line}; 5825 $self->{column_prev} = $self->{column}; 5826 $self->{column}++; 5827 $self->{nc} 5828 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5829 } else { 5830 $self->{set_nc}->($self); 5831 } 5832 5833 redo A; 5834 } 5835 } elsif ($state == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) { 5836 ## XML5: "DOCTYPE markup declaration state". 
5837 5838 if ($nc == 0x002D) { # - 5839 $self->{state} = MD_HYPHEN_STATE; 5840 5841 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5842 $self->{line_prev} = $self->{line}; 5843 $self->{column_prev} = $self->{column}; 5844 $self->{column}++; 5845 $self->{nc} 5846 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5847 } else { 5848 $self->{set_nc}->($self); 5849 } 5850 5851 redo A; 5852 } elsif ($nc == 0x0045 or # E 5853 $nc == 0x0065) { # e 5854 $self->{state} = MD_E_STATE; 5855 $self->{kwd} = chr $nc; 5856 5857 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5858 $self->{line_prev} = $self->{line}; 5859 $self->{column_prev} = $self->{column}; 5860 $self->{column}++; 5861 $self->{nc} 5862 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5863 } else { 5864 $self->{set_nc}->($self); 5865 } 5866 5867 redo A; 5868 } elsif ($nc == 0x0041 or # A 5869 $nc == 0x0061) { # a 5870 $self->{state} = MD_ATTLIST_STATE; 5871 $self->{kwd} = chr $nc; 5872 5873 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5874 $self->{line_prev} = $self->{line}; 5875 $self->{column_prev} = $self->{column}; 5876 $self->{column}++; 5877 $self->{nc} 5878 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5879 } else { 5880 $self->{set_nc}->($self); 5881 } 5882 5883 redo A; 5884 } elsif ($nc == 0x004E or # N 5885 $nc == 0x006E) { # n 5886 $self->{state} = MD_NOTATION_STATE; 5887 $self->{kwd} = chr $nc; 5888 5889 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5890 $self->{line_prev} = $self->{line}; 5891 $self->{column_prev} = $self->{column}; 5892 $self->{column}++; 5893 $self->{nc} 5894 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5895 } else { 5896 $self->{set_nc}->($self); 5897 } 5898 5899 redo A; 5900 } else { 5901 # 5902 } 5903 5904 ## XML5: No parse error. 
5905 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment', 5906 line => $self->{line_prev}, 5907 column => $self->{column_prev} - 1); 5908 ## Reconsume. 5909 $self->{state} = BOGUS_COMMENT_STATE; 5910 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded. 5911 redo A; 5912 } elsif ($state == MD_E_STATE) { 5913 if ($nc == 0x004E or # N 5914 $nc == 0x006E) { # n 5915 $self->{state} = MD_ENTITY_STATE; 5916 $self->{kwd} .= chr $nc; 5917 5918 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5919 $self->{line_prev} = $self->{line}; 5920 $self->{column_prev} = $self->{column}; 5921 $self->{column}++; 5922 $self->{nc} 5923 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5924 } else { 5925 $self->{set_nc}->($self); 5926 } 5927 5928 redo A; 5929 } elsif ($nc == 0x004C or # L 5930 $nc == 0x006C) { # l 5931 ## XML5: <!ELEMENT> not supported. 5932 $self->{state} = MD_ELEMENT_STATE; 5933 $self->{kwd} .= chr $nc; 5934 5935 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5936 $self->{line_prev} = $self->{line}; 5937 $self->{column_prev} = $self->{column}; 5938 $self->{column}++; 5939 $self->{nc} 5940 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5941 } else { 5942 $self->{set_nc}->($self); 5943 } 5944 5945 redo A; 5946 } else { 5947 ## XML5: No parse error. 5948 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment', 5949 line => $self->{line_prev}, 5950 column => $self->{column_prev} - 2 5951 + 1 * ($nc == EOF_CHAR)); 5952 ## Reconsume. 
5953 $self->{state} = BOGUS_COMMENT_STATE; 5954 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded 5955 redo A; 5956 } 5957 } elsif ($state == MD_ENTITY_STATE) { 5958 if ($nc == [ 5959 undef, 5960 undef, 5961 0x0054, # T 5962 0x0049, # I 5963 0x0054, # T 5964 NEVER_CHAR, # (Y) 5965 ]->[length $self->{kwd}] or 5966 $nc == [ 5967 undef, 5968 undef, 5969 0x0074, # t 5970 0x0069, # i 5971 0x0074, # t 5972 NEVER_CHAR, # (y) 5973 ]->[length $self->{kwd}]) { 5974 ## Stay in the state. 5975 $self->{kwd} .= chr $nc; 5976 5977 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 5978 $self->{line_prev} = $self->{line}; 5979 $self->{column_prev} = $self->{column}; 5980 $self->{column}++; 5981 $self->{nc} 5982 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 5983 } else { 5984 $self->{set_nc}->($self); 5985 } 5986 5987 redo A; 5988 } elsif ((length $self->{kwd}) == 5 and 5989 ($nc == 0x0059 or # Y 5990 $nc == 0x0079)) { # y 5991 if ($self->{kwd} ne 'ENTIT' or $nc == 0x0079) { 5992 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type 5993 text => 'ENTITY', 5994 line => $self->{line_prev}, 5995 column => $self->{column_prev} - 4); 5996 } 5997 $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '', 5998 line => $self->{line_prev}, 5999 column => $self->{column_prev} - 6}; 6000 $self->{state} = DOCTYPE_MD_STATE; 6001 6002 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6003 $self->{line_prev} = $self->{line}; 6004 $self->{column_prev} = $self->{column}; 6005 $self->{column}++; 6006 $self->{nc} 6007 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6008 } else { 6009 $self->{set_nc}->($self); 6010 } 6011 6012 redo A; 6013 } else { 6014 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment', 6015 line => $self->{line_prev}, 6016 column => $self->{column_prev} - 1 6017 - (length $self->{kwd}) 6018 + 1 * ($nc == EOF_CHAR)); 6019 
$self->{state} = BOGUS_COMMENT_STATE; 6020 ## Reconsume. 6021 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded 6022 redo A; 6023 } 6024 } elsif ($state == MD_ELEMENT_STATE) { 6025 if ($nc == [ 6026 undef, 6027 undef, 6028 0x0045, # E 6029 0x004D, # M 6030 0x0045, # E 6031 0x004E, # N 6032 NEVER_CHAR, # (T) 6033 ]->[length $self->{kwd}] or 6034 $nc == [ 6035 undef, 6036 undef, 6037 0x0065, # e 6038 0x006D, # m 6039 0x0065, # e 6040 0x006E, # n 6041 NEVER_CHAR, # (t) 6042 ]->[length $self->{kwd}]) { 6043 ## Stay in the state. 6044 $self->{kwd} .= chr $nc; 6045 6046 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6047 $self->{line_prev} = $self->{line}; 6048 $self->{column_prev} = $self->{column}; 6049 $self->{column}++; 6050 $self->{nc} 6051 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6052 } else { 6053 $self->{set_nc}->($self); 6054 } 6055 6056 redo A; 6057 } elsif ((length $self->{kwd}) == 6 and 6058 ($nc == 0x0054 or # T 6059 $nc == 0x0074)) { # t 6060 if ($self->{kwd} ne 'ELEMEN' or $nc == 0x0074) { 6061 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type 6062 text => 'ELEMENT', 6063 line => $self->{line_prev}, 6064 column => $self->{column_prev} - 5); 6065 } 6066 $self->{ct} = {type => ELEMENT_TOKEN, name => '', 6067 line => $self->{line_prev}, 6068 column => $self->{column_prev} - 7}; 6069 $self->{state} = DOCTYPE_MD_STATE; 6070 6071 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6072 $self->{line_prev} = $self->{line}; 6073 $self->{column_prev} = $self->{column}; 6074 $self->{column}++; 6075 $self->{nc} 6076 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6077 } else { 6078 $self->{set_nc}->($self); 6079 } 6080 6081 redo A; 6082 } else { 6083 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment', 6084 line => $self->{line_prev}, 6085 column => $self->{column_prev} - 1 6086 - (length 
$self->{kwd}) 6087 + 1 * ($nc == EOF_CHAR)); 6088 $self->{state} = BOGUS_COMMENT_STATE; 6089 ## Reconsume. 6090 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded 6091 redo A; 6092 } 6093 } elsif ($state == MD_ATTLIST_STATE) { 6094 if ($nc == [ 6095 undef, 6096 0x0054, # T 6097 0x0054, # T 6098 0x004C, # L 6099 0x0049, # I 6100 0x0053, # S 6101 NEVER_CHAR, # (T) 6102 ]->[length $self->{kwd}] or 6103 $nc == [ 6104 undef, 6105 0x0074, # t 6106 0x0074, # t 6107 0x006C, # l 6108 0x0069, # i 6109 0x0073, # s 6110 NEVER_CHAR, # (t) 6111 ]->[length $self->{kwd}]) { 6112 ## Stay in the state. 6113 $self->{kwd} .= chr $nc; 6114 6115 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6116 $self->{line_prev} = $self->{line}; 6117 $self->{column_prev} = $self->{column}; 6118 $self->{column}++; 6119 $self->{nc} 6120 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6121 } else { 6122 $self->{set_nc}->($self); 6123 } 6124 6125 redo A; 6126 } elsif ((length $self->{kwd}) == 6 and 6127 ($nc == 0x0054 or # T 6128 $nc == 0x0074)) { # t 6129 if ($self->{kwd} ne 'ATTLIS' or $nc == 0x0074) { 6130 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type 6131 text => 'ATTLIST', 6132 line => $self->{line_prev}, 6133 column => $self->{column_prev} - 5); 6134 } 6135 $self->{ct} = {type => ATTLIST_TOKEN, name => '', 6136 attrdefs => [], 6137 line => $self->{line_prev}, 6138 column => $self->{column_prev} - 7}; 6139 $self->{state} = DOCTYPE_MD_STATE; 6140 6141 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6142 $self->{line_prev} = $self->{line}; 6143 $self->{column_prev} = $self->{column}; 6144 $self->{column}++; 6145 $self->{nc} 6146 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6147 } else { 6148 $self->{set_nc}->($self); 6149 } 6150 6151 redo A; 6152 } else { 6153 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment', 6154 line => 
$self->{line_prev}, 6155 column => $self->{column_prev} - 1 6156 - (length $self->{kwd}) 6157 + 1 * ($nc == EOF_CHAR)); 6158 $self->{state} = BOGUS_COMMENT_STATE; 6159 ## Reconsume. 6160 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded 6161 redo A; 6162 } 6163 } elsif ($state == MD_NOTATION_STATE) { 6164 if ($nc == [ 6165 undef, 6166 0x004F, # O 6167 0x0054, # T 6168 0x0041, # A 6169 0x0054, # T 6170 0x0049, # I 6171 0x004F, # O 6172 NEVER_CHAR, # (N) 6173 ]->[length $self->{kwd}] or 6174 $nc == [ 6175 undef, 6176 0x006F, # o 6177 0x0074, # t 6178 0x0061, # a 6179 0x0074, # t 6180 0x0069, # i 6181 0x006F, # o 6182 NEVER_CHAR, # (n) 6183 ]->[length $self->{kwd}]) { 6184 ## Stay in the state. 6185 $self->{kwd} .= chr $nc; 6186 6187 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6188 $self->{line_prev} = $self->{line}; 6189 $self->{column_prev} = $self->{column}; 6190 $self->{column}++; 6191 $self->{nc} 6192 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6193 } else { 6194 $self->{set_nc}->($self); 6195 } 6196 6197 redo A; 6198 } elsif ((length $self->{kwd}) == 7 and 6199 ($nc == 0x004E or # N 6200 $nc == 0x006E)) { # n 6201 if ($self->{kwd} ne 'NOTATIO' or $nc == 0x006E) { 6202 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type 6203 text => 'NOTATION', 6204 line => $self->{line_prev}, 6205 column => $self->{column_prev} - 6); 6206 } 6207 $self->{ct} = {type => NOTATION_TOKEN, name => '', 6208 line => $self->{line_prev}, 6209 column => $self->{column_prev} - 8}; 6210 $self->{state} = DOCTYPE_MD_STATE; 6211 6212 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6213 $self->{line_prev} = $self->{line}; 6214 $self->{column_prev} = $self->{column}; 6215 $self->{column}++; 6216 $self->{nc} 6217 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6218 } else { 6219 $self->{set_nc}->($self); 6220 } 6221 6222 redo A; 6223 } else { 6224 
$self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment', 6225 line => $self->{line_prev}, 6226 column => $self->{column_prev} - 1 6227 - (length $self->{kwd}) 6228 + 1 * ($nc == EOF_CHAR)); 6229 $self->{state} = BOGUS_COMMENT_STATE; 6230 ## Reconsume. 6231 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded 6232 redo A; 6233 } 6234 } elsif ($state == DOCTYPE_MD_STATE) { 6235 ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and 6236 ## "DOCTYPE NOTATION state". 6237 6238 if ($is_space->{$nc}) { 6239 ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state". 6240 $self->{state} = BEFORE_MD_NAME_STATE; 6241 6242 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6243 $self->{line_prev} = $self->{line}; 6244 $self->{column_prev} = $self->{column}; 6245 $self->{column}++; 6246 $self->{nc} 6247 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6248 } else { 6249 $self->{set_nc}->($self); 6250 } 6251 6252 redo A; 6253 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and 6254 $nc == 0x0025) { # % 6255 ## XML5: Switch to the "DOCTYPE bogus comment state". 6256 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type 6257 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE; 6258 6259 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6260 $self->{line_prev} = $self->{line}; 6261 $self->{column_prev} = $self->{column}; 6262 $self->{column}++; 6263 $self->{nc} 6264 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6265 } else { 6266 $self->{set_nc}->($self); 6267 } 6268 6269 redo A; 6270 } elsif ($nc == EOF_CHAR) { 6271 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 6272 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". 6273 ## Reconsume. 
6274 redo A; 6275 } elsif ($nc == 0x003E) { # > 6276 ## XML5: Switch to the "DOCTYPE bogus comment state". 6277 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type 6278 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 6279 6280 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6281 $self->{line_prev} = $self->{line}; 6282 $self->{column_prev} = $self->{column}; 6283 $self->{column}++; 6284 $self->{nc} 6285 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6286 } else { 6287 $self->{set_nc}->($self); 6288 } 6289 6290 redo A; 6291 } else { 6292 ## XML5: Switch to the "DOCTYPE bogus comment state". 6293 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type 6294 $self->{state} = BEFORE_MD_NAME_STATE; 6295 redo A; 6296 } 6297 } elsif ($state == BEFORE_MD_NAME_STATE) { 6298 ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type 6299 ## before state", "DOCTYPE ATTLIST name before state". 6300 6301 if ($is_space->{$nc}) { 6302 ## Stay in the state. 6303 6304 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6305 $self->{line_prev} = $self->{line}; 6306 $self->{column_prev} = $self->{column}; 6307 $self->{column}++; 6308 $self->{nc} 6309 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6310 } else { 6311 $self->{set_nc}->($self); 6312 } 6313 6314 redo A; 6315 } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and 6316 $nc == 0x0025) { # % 6317 $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE; 6318 6319 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6320 $self->{line_prev} = $self->{line}; 6321 $self->{column_prev} = $self->{column}; 6322 $self->{column}++; 6323 $self->{nc} 6324 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6325 } else { 6326 $self->{set_nc}->($self); 6327 } 6328 6329 redo A; 6330 } elsif ($nc == 0x003E) { # > 6331 ## XML5: Same as "Anything else". 
6332 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type 6333 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 6334 6335 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6336 $self->{line_prev} = $self->{line}; 6337 $self->{column_prev} = $self->{column}; 6338 $self->{column}++; 6339 $self->{nc} 6340 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6341 } else { 6342 $self->{set_nc}->($self); 6343 } 6344 6345 redo A; 6346 } elsif ($nc == EOF_CHAR) { 6347 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 6348 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". 6349 ## Reconsume. 6350 redo A; 6351 } else { 6352 ## XML5: [ATTLIST] Not defined yet. 6353 if ($nc == 0x0000) { 6354 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); 6355 } 6356 $self->{ct}->{name} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; 6357 $self->{state} = MD_NAME_STATE; 6358 6359 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6360 $self->{line_prev} = $self->{line}; 6361 $self->{column_prev} = $self->{column}; 6362 $self->{column}++; 6363 $self->{nc} 6364 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6365 } else { 6366 $self->{set_nc}->($self); 6367 } 6368 6369 redo A; 6370 } 6371 } elsif ($state == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) { 6372 if ($is_space->{$nc}) { 6373 ## XML5: Switch to the "DOCTYPE ENTITY parameter state". 
6374 $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN; 6375 $self->{state} = BEFORE_MD_NAME_STATE; 6376 6377 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6378 $self->{line_prev} = $self->{line}; 6379 $self->{column_prev} = $self->{column}; 6380 $self->{column}++; 6381 $self->{nc} 6382 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6383 } else { 6384 $self->{set_nc}->($self); 6385 } 6386 6387 redo A; 6388 } elsif ($nc == 0x003E) { # > 6389 ## XML5: Same as "Anything else". 6390 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type 6391 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 6392 6393 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6394 $self->{line_prev} = $self->{line}; 6395 $self->{column_prev} = $self->{column}; 6396 $self->{column}++; 6397 $self->{nc} 6398 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6399 } else { 6400 $self->{set_nc}->($self); 6401 } 6402 6403 redo A; 6404 } elsif ($nc == EOF_CHAR) { 6405 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); 6406 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". 6407 ## Reconsume. 6408 redo A; 6409 } else { 6410 ## XML5: No parse error. 6411 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type 6412 $self->{state} = BOGUS_COMMENT_STATE; 6413 $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded 6414 ## Reconsume. 6415 redo A; 6416 } 6417 } elsif ($state == MD_NAME_STATE) { 6418 ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state". 
6419 6420 if ($is_space->{$nc}) { 6421 if ($self->{ct}->{type} == ATTLIST_TOKEN) { 6422 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE; 6423 } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) { 6424 $self->{state} = AFTER_ELEMENT_NAME_STATE; 6425 } else { # ENTITY/NOTATION 6426 $self->{state} = AFTER_DOCTYPE_NAME_STATE; 6427 } 6428 6429 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6430 $self->{line_prev} = $self->{line}; 6431 $self->{column_prev} = $self->{column}; 6432 $self->{column}++; 6433 $self->{nc} 6434 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6435 } else { 6436 $self->{set_nc}->($self); 6437 } 6438 6439 redo A; 6440 } elsif ($nc == 0x003E) { # > 6441 if ($self->{ct}->{type} == ATTLIST_TOKEN) { 6442 # 6443 } else { 6444 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type 6445 } 6446 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 6447 6448 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6449 $self->{line_prev} = $self->{line}; 6450 $self->{column_prev} = $self->{column}; 6451 $self->{column}++; 6452 $self->{nc} 6453 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6454 } else { 6455 $self->{set_nc}->($self); 6456 } 6457 6458 return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION 6459 redo A; 6460 } elsif ($nc == EOF_CHAR) { 6461 ## XML5: [ATTLIST] No parse error. 6462 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); 6463 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". 6464 ## Reconsume. 6465 redo A; 6466 } else { 6467 ## XML5: [ATTLIST] Not defined yet. 6468 if ($nc == 0x0000) { 6469 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); 6470 } 6471 $self->{ct}->{name} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; 6472 ## Stay in the state. 
6473 6474 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6475 $self->{line_prev} = $self->{line}; 6476 $self->{column_prev} = $self->{column}; 6477 $self->{column}++; 6478 $self->{nc} 6479 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6480 } else { 6481 $self->{set_nc}->($self); 6482 } 6483 6484 redo A; 6485 } 6486 } elsif ($state == DOCTYPE_ATTLIST_NAME_AFTER_STATE) { 6487 if ($is_space->{$nc}) { 6488 ## Stay in the state. 6489 6490 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6491 $self->{line_prev} = $self->{line}; 6492 $self->{column_prev} = $self->{column}; 6493 $self->{column}++; 6494 $self->{nc} 6495 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6496 } else { 6497 $self->{set_nc}->($self); 6498 } 6499 6500 redo A; 6501 } elsif ($nc == 0x003E) { # > 6502 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 6503 6504 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6505 $self->{line_prev} = $self->{line}; 6506 $self->{column_prev} = $self->{column}; 6507 $self->{column}++; 6508 $self->{nc} 6509 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6510 } else { 6511 $self->{set_nc}->($self); 6512 } 6513 6514 return ($self->{ct}); # ATTLIST 6515 redo A; 6516 } elsif ($nc == EOF_CHAR) { 6517 ## XML5: No parse error. 6518 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 6519 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". 6520 ## Discard the current token. 6521 redo A; 6522 } else { 6523 ## XML5: Not defined yet. 6524 if ($nc == 0x0000) { 6525 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); 6526 } 6527 $self->{ca} = {name => $nc == 0x0000 ? 
"\x{FFFD}" : chr $nc, # attrdef 6528 tokens => [], 6529 line => $self->{line}, column => $self->{column}}; 6530 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE; 6531 6532 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6533 $self->{line_prev} = $self->{line}; 6534 $self->{column_prev} = $self->{column}; 6535 $self->{column}++; 6536 $self->{nc} 6537 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6538 } else { 6539 $self->{set_nc}->($self); 6540 } 6541 6542 redo A; 6543 } 6544 } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) { 6545 if ($is_space->{$nc}) { 6546 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE; 6547 6548 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6549 $self->{line_prev} = $self->{line}; 6550 $self->{column_prev} = $self->{column}; 6551 $self->{column}++; 6552 $self->{nc} 6553 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6554 } else { 6555 $self->{set_nc}->($self); 6556 } 6557 6558 redo A; 6559 } elsif ($nc == 0x003E) { # > 6560 ## XML5: Same as "anything else". 6561 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type 6562 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 6563 6564 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6565 $self->{line_prev} = $self->{line}; 6566 $self->{column_prev} = $self->{column}; 6567 $self->{column}++; 6568 $self->{nc} 6569 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6570 } else { 6571 $self->{set_nc}->($self); 6572 } 6573 6574 return ($self->{ct}); # ATTLIST 6575 redo A; 6576 } elsif ($nc == 0x0028) { # ( 6577 ## XML5: Same as "anything else". 
6578 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type 6579 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE; 6580 6581 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6582 $self->{line_prev} = $self->{line}; 6583 $self->{column_prev} = $self->{column}; 6584 $self->{column}++; 6585 $self->{nc} 6586 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6587 } else { 6588 $self->{set_nc}->($self); 6589 } 6590 6591 redo A; 6592 } elsif ($nc == EOF_CHAR) { 6593 ## XML5: No parse error. 6594 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 6595 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". 6596 6597 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6598 $self->{line_prev} = $self->{line}; 6599 $self->{column_prev} = $self->{column}; 6600 $self->{column}++; 6601 $self->{nc} 6602 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6603 } else { 6604 $self->{set_nc}->($self); 6605 } 6606 6607 ## Discard the current token. 6608 redo A; 6609 } else { 6610 ## XML5: Not defined yet. 6611 if ($nc == 0x0000) { 6612 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); 6613 } 6614 $self->{ca}->{name} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; 6615 ## Stay in the state. 6616 6617 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6618 $self->{line_prev} = $self->{line}; 6619 $self->{column_prev} = $self->{column}; 6620 $self->{column}++; 6621 $self->{nc} 6622 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6623 } else { 6624 $self->{set_nc}->($self); 6625 } 6626 6627 redo A; 6628 } 6629 } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) { 6630 if ($is_space->{$nc}) { 6631 ## Stay in the state. 
6632 6633 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6634 $self->{line_prev} = $self->{line}; 6635 $self->{column_prev} = $self->{column}; 6636 $self->{column}++; 6637 $self->{nc} 6638 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6639 } else { 6640 $self->{set_nc}->($self); 6641 } 6642 6643 redo A; 6644 } elsif ($nc == 0x003E) { # > 6645 ## XML5: Same as "anything else". 6646 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type 6647 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 6648 6649 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6650 $self->{line_prev} = $self->{line}; 6651 $self->{column_prev} = $self->{column}; 6652 $self->{column}++; 6653 $self->{nc} 6654 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6655 } else { 6656 $self->{set_nc}->($self); 6657 } 6658 6659 return ($self->{ct}); # ATTLIST 6660 redo A; 6661 } elsif ($nc == 0x0028) { # ( 6662 ## XML5: Same as "anything else". 6663 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE; 6664 6665 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6666 $self->{line_prev} = $self->{line}; 6667 $self->{column_prev} = $self->{column}; 6668 $self->{column}++; 6669 $self->{nc} 6670 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6671 } else { 6672 $self->{set_nc}->($self); 6673 } 6674 6675 redo A; 6676 } elsif ($nc == EOF_CHAR) { 6677 ## XML5: No parse error. 6678 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 6679 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". 
6680 6681 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6682 $self->{line_prev} = $self->{line}; 6683 $self->{column_prev} = $self->{column}; 6684 $self->{column}++; 6685 $self->{nc} 6686 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6687 } else { 6688 $self->{set_nc}->($self); 6689 } 6690 6691 ## Discard the token. 6692 redo A; 6693 } else { 6694 ## XML5: Not defined yet. 6695 $self->{ca}->{type} = chr $nc; 6696 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE; 6697 6698 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6699 $self->{line_prev} = $self->{line}; 6700 $self->{column_prev} = $self->{column}; 6701 $self->{column}++; 6702 $self->{nc} 6703 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6704 } else { 6705 $self->{set_nc}->($self); 6706 } 6707 6708 redo A; 6709 } 6710 } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) { 6711 if ($is_space->{$nc}) { 6712 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE; 6713 6714 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6715 $self->{line_prev} = $self->{line}; 6716 $self->{column_prev} = $self->{column}; 6717 $self->{column}++; 6718 $self->{nc} 6719 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6720 } else { 6721 $self->{set_nc}->($self); 6722 } 6723 6724 redo A; 6725 } elsif ($nc == 0x0023) { # # 6726 ## XML5: Same as "anything else". 
6727 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type 6728 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE; 6729 6730 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6731 $self->{line_prev} = $self->{line}; 6732 $self->{column_prev} = $self->{column}; 6733 $self->{column}++; 6734 $self->{nc} 6735 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6736 } else { 6737 $self->{set_nc}->($self); 6738 } 6739 6740 redo A; 6741 } elsif ($nc == 0x0022) { # " 6742 ## XML5: Same as "anything else". 6743 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type 6744 $self->{ca}->{value} = ''; 6745 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE; 6746 6747 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6748 $self->{line_prev} = $self->{line}; 6749 $self->{column_prev} = $self->{column}; 6750 $self->{column}++; 6751 $self->{nc} 6752 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6753 } else { 6754 $self->{set_nc}->($self); 6755 } 6756 6757 redo A; 6758 } elsif ($nc == 0x0027) { # ' 6759 ## XML5: Same as "anything else". 6760 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type 6761 $self->{ca}->{value} = ''; 6762 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE; 6763 6764 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6765 $self->{line_prev} = $self->{line}; 6766 $self->{column_prev} = $self->{column}; 6767 $self->{column}++; 6768 $self->{nc} 6769 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6770 } else { 6771 $self->{set_nc}->($self); 6772 } 6773 6774 redo A; 6775 } elsif ($nc == 0x003E) { # > 6776 ## XML5: Same as "anything else". 
6777 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type 6778 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 6779 6780 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6781 $self->{line_prev} = $self->{line}; 6782 $self->{column_prev} = $self->{column}; 6783 $self->{column}++; 6784 $self->{nc} 6785 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6786 } else { 6787 $self->{set_nc}->($self); 6788 } 6789 6790 return ($self->{ct}); # ATTLIST 6791 redo A; 6792 } elsif ($nc == 0x0028) { # ( 6793 ## XML5: Same as "anything else". 6794 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type 6795 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE; 6796 6797 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6798 $self->{line_prev} = $self->{line}; 6799 $self->{column_prev} = $self->{column}; 6800 $self->{column}++; 6801 $self->{nc} 6802 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6803 } else { 6804 $self->{set_nc}->($self); 6805 } 6806 6807 redo A; 6808 } elsif ($nc == EOF_CHAR) { 6809 ## XML5: No parse error. 6810 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 6811 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". 6812 6813 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6814 $self->{line_prev} = $self->{line}; 6815 $self->{column_prev} = $self->{column}; 6816 $self->{column}++; 6817 $self->{nc} 6818 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6819 } else { 6820 $self->{set_nc}->($self); 6821 } 6822 6823 ## Discard the token. 6824 redo A; 6825 } else { 6826 ## XML5: Not defined yet. 6827 $self->{ca}->{type} .= chr $nc; 6828 ## Stay in the state. 
6829 6830 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6831 $self->{line_prev} = $self->{line}; 6832 $self->{column_prev} = $self->{column}; 6833 $self->{column}++; 6834 $self->{nc} 6835 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6836 } else { 6837 $self->{set_nc}->($self); 6838 } 6839 6840 redo A; 6841 } 6842 } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) { 6843 if ($is_space->{$nc}) { 6844 ## Stay in the state. 6845 6846 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6847 $self->{line_prev} = $self->{line}; 6848 $self->{column_prev} = $self->{column}; 6849 $self->{column}++; 6850 $self->{nc} 6851 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6852 } else { 6853 $self->{set_nc}->($self); 6854 } 6855 6856 redo A; 6857 } elsif ($nc == 0x0028) { # ( 6858 ## XML5: Same as "anything else". 6859 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE; 6860 6861 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6862 $self->{line_prev} = $self->{line}; 6863 $self->{column_prev} = $self->{column}; 6864 $self->{column}++; 6865 $self->{nc} 6866 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6867 } else { 6868 $self->{set_nc}->($self); 6869 } 6870 6871 redo A; 6872 } elsif ($nc == 0x0023) { # # 6873 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE; 6874 6875 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6876 $self->{line_prev} = $self->{line}; 6877 $self->{column_prev} = $self->{column}; 6878 $self->{column}++; 6879 $self->{nc} 6880 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6881 } else { 6882 $self->{set_nc}->($self); 6883 } 6884 6885 redo A; 6886 } elsif ($nc == 0x0022) { # " 6887 ## XML5: Same as "anything else". 
6888 $self->{ca}->{value} = ''; 6889 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE; 6890 6891 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6892 $self->{line_prev} = $self->{line}; 6893 $self->{column_prev} = $self->{column}; 6894 $self->{column}++; 6895 $self->{nc} 6896 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6897 } else { 6898 $self->{set_nc}->($self); 6899 } 6900 6901 redo A; 6902 } elsif ($nc == 0x0027) { # ' 6903 ## XML5: Same as "anything else". 6904 $self->{ca}->{value} = ''; 6905 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE; 6906 6907 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6908 $self->{line_prev} = $self->{line}; 6909 $self->{column_prev} = $self->{column}; 6910 $self->{column}++; 6911 $self->{nc} 6912 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6913 } else { 6914 $self->{set_nc}->($self); 6915 } 6916 6917 redo A; 6918 } elsif ($nc == 0x003E) { # > 6919 ## XML5: Same as "anything else". 6920 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type 6921 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 6922 6923 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6924 $self->{line_prev} = $self->{line}; 6925 $self->{column_prev} = $self->{column}; 6926 $self->{column}++; 6927 $self->{nc} 6928 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6929 } else { 6930 $self->{set_nc}->($self); 6931 } 6932 6933 return ($self->{ct}); # ATTLIST 6934 redo A; 6935 } elsif ($nc == EOF_CHAR) { 6936 ## XML5: No parse error. 6937 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 6938 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". 
6939 6940 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6941 $self->{line_prev} = $self->{line}; 6942 $self->{column_prev} = $self->{column}; 6943 $self->{column}++; 6944 $self->{nc} 6945 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6946 } else { 6947 $self->{set_nc}->($self); 6948 } 6949 6950 ## Discard the current token. 6951 redo A; 6952 } else { 6953 ## XML5: Switch to the "DOCTYPE bogus comment state". 6954 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type 6955 $self->{ca}->{value} = ''; 6956 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE; 6957 ## Reconsume. 6958 redo A; 6959 } 6960 } elsif ($state == BEFORE_ALLOWED_TOKEN_STATE) { 6961 if ($is_space->{$nc}) { 6962 ## Stay in the state. 6963 6964 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6965 $self->{line_prev} = $self->{line}; 6966 $self->{column_prev} = $self->{column}; 6967 $self->{column}++; 6968 $self->{nc} 6969 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6970 } else { 6971 $self->{set_nc}->($self); 6972 } 6973 6974 redo A; 6975 } elsif ($nc == 0x007C) { # | 6976 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type 6977 ## Stay in the state. 
6978 6979 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6980 $self->{line_prev} = $self->{line}; 6981 $self->{column_prev} = $self->{column}; 6982 $self->{column}++; 6983 $self->{nc} 6984 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 6985 } else { 6986 $self->{set_nc}->($self); 6987 } 6988 6989 redo A; 6990 } elsif ($nc == 0x0029) { # ) 6991 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type 6992 $self->{state} = AFTER_ALLOWED_TOKENS_STATE; 6993 6994 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 6995 $self->{line_prev} = $self->{line}; 6996 $self->{column_prev} = $self->{column}; 6997 $self->{column}++; 6998 $self->{nc} 6999 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7000 } else { 7001 $self->{set_nc}->($self); 7002 } 7003 7004 redo A; 7005 } elsif ($nc == 0x003E) { # > 7006 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type 7007 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 7008 7009 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7010 $self->{line_prev} = $self->{line}; 7011 $self->{column_prev} = $self->{column}; 7012 $self->{column}++; 7013 $self->{nc} 7014 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7015 } else { 7016 $self->{set_nc}->($self); 7017 } 7018 7019 return ($self->{ct}); # ATTLIST 7020 redo A; 7021 } elsif ($nc == EOF_CHAR) { 7022 ## XML5: No parse error. 7023 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 7024 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". 
7025 7026 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7027 $self->{line_prev} = $self->{line}; 7028 $self->{column_prev} = $self->{column}; 7029 $self->{column}++; 7030 $self->{nc} 7031 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7032 } else { 7033 $self->{set_nc}->($self); 7034 } 7035 7036 ## Discard the current token. 7037 redo A; 7038 } else { 7039 if ($nc == 0x000) { 7040 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); 7041 } 7042 push @{$self->{ca}->{tokens}}, $nc == 0x0000 ? "\x{FFFD}" : chr $nc; 7043 $self->{state} = ALLOWED_TOKEN_STATE; 7044 7045 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7046 $self->{line_prev} = $self->{line}; 7047 $self->{column_prev} = $self->{column}; 7048 $self->{column}++; 7049 $self->{nc} 7050 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7051 } else { 7052 $self->{set_nc}->($self); 7053 } 7054 7055 redo A; 7056 } 7057 } elsif ($state == ALLOWED_TOKEN_STATE) { 7058 if ($is_space->{$nc}) { 7059 $self->{state} = AFTER_ALLOWED_TOKEN_STATE; 7060 7061 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7062 $self->{line_prev} = $self->{line}; 7063 $self->{column_prev} = $self->{column}; 7064 $self->{column}++; 7065 $self->{nc} 7066 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7067 } else { 7068 $self->{set_nc}->($self); 7069 } 7070 7071 redo A; 7072 } elsif ($nc == 0x007C) { # | 7073 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE; 7074 7075 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7076 $self->{line_prev} = $self->{line}; 7077 $self->{column_prev} = $self->{column}; 7078 $self->{column}++; 7079 $self->{nc} 7080 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7081 } else { 7082 $self->{set_nc}->($self); 7083 } 7084 7085 redo A; 7086 } elsif ($nc == 0x0029) { # ) 7087 $self->{state} = AFTER_ALLOWED_TOKENS_STATE; 7088 7089 if ($self->{char_buffer_pos} < length 
$self->{char_buffer}) { 7090 $self->{line_prev} = $self->{line}; 7091 $self->{column_prev} = $self->{column}; 7092 $self->{column}++; 7093 $self->{nc} 7094 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7095 } else { 7096 $self->{set_nc}->($self); 7097 } 7098 7099 redo A; 7100 } elsif ($nc == 0x003E) { # > 7101 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type 7102 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 7103 7104 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7105 $self->{line_prev} = $self->{line}; 7106 $self->{column_prev} = $self->{column}; 7107 $self->{column}++; 7108 $self->{nc} 7109 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7110 } else { 7111 $self->{set_nc}->($self); 7112 } 7113 7114 return ($self->{ct}); # ATTLIST 7115 redo A; 7116 } elsif ($nc == EOF_CHAR) { 7117 ## XML5: No parse error. 7118 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 7119 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". 7120 7121 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7122 $self->{line_prev} = $self->{line}; 7123 $self->{column_prev} = $self->{column}; 7124 $self->{column}++; 7125 $self->{nc} 7126 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7127 } else { 7128 $self->{set_nc}->($self); 7129 } 7130 7131 ## Discard the current token. 7132 redo A; 7133 } else { 7134 if ($nc == 0x0000) { 7135 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); 7136 } 7137 $self->{ca}->{tokens}->[-1] .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; 7138 ## Stay in the state. 
7139 7140 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7141 $self->{line_prev} = $self->{line}; 7142 $self->{column_prev} = $self->{column}; 7143 $self->{column}++; 7144 $self->{nc} 7145 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7146 } else { 7147 $self->{set_nc}->($self); 7148 } 7149 7150 redo A; 7151 } 7152 } elsif ($state == AFTER_ALLOWED_TOKEN_STATE) { 7153 if ($is_space->{$nc}) { 7154 ## Stay in the state. 7155 7156 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7157 $self->{line_prev} = $self->{line}; 7158 $self->{column_prev} = $self->{column}; 7159 $self->{column}++; 7160 $self->{nc} 7161 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7162 } else { 7163 $self->{set_nc}->($self); 7164 } 7165 7166 redo A; 7167 } elsif ($nc == 0x007C) { # | 7168 $self->{state} = BEFORE_ALLOWED_TOKEN_STATE; 7169 7170 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7171 $self->{line_prev} = $self->{line}; 7172 $self->{column_prev} = $self->{column}; 7173 $self->{column}++; 7174 $self->{nc} 7175 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7176 } else { 7177 $self->{set_nc}->($self); 7178 } 7179 7180 redo A; 7181 } elsif ($nc == 0x0029) { # ) 7182 $self->{state} = AFTER_ALLOWED_TOKENS_STATE; 7183 7184 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7185 $self->{line_prev} = $self->{line}; 7186 $self->{column_prev} = $self->{column}; 7187 $self->{column}++; 7188 $self->{nc} 7189 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7190 } else { 7191 $self->{set_nc}->($self); 7192 } 7193 7194 redo A; 7195 } elsif ($nc == 0x003E) { # > 7196 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type 7197 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 7198 7199 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7200 $self->{line_prev} = $self->{line}; 7201 $self->{column_prev} = 
$self->{column}; 7202 $self->{column}++; 7203 $self->{nc} 7204 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7205 } else { 7206 $self->{set_nc}->($self); 7207 } 7208 7209 return ($self->{ct}); # ATTLIST 7210 redo A; 7211 } elsif ($nc == EOF_CHAR) { 7212 ## XML5: No parse error. 7213 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 7214 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". 7215 7216 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7217 $self->{line_prev} = $self->{line}; 7218 $self->{column_prev} = $self->{column}; 7219 $self->{column}++; 7220 $self->{nc} 7221 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7222 } else { 7223 $self->{set_nc}->($self); 7224 } 7225 7226 ## Discard the current token. 7227 redo A; 7228 } else { 7229 $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type 7230 line => $self->{line_prev}, 7231 column => $self->{column_prev}); 7232 if ($nc == 0x0000) { 7233 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); 7234 } 7235 $self->{ca}->{tokens}->[-1] .= ' ' . ($nc == 0x0000 ? 
"\x{FFFD}" : chr $nc); 7236 $self->{state} = ALLOWED_TOKEN_STATE; 7237 7238 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7239 $self->{line_prev} = $self->{line}; 7240 $self->{column_prev} = $self->{column}; 7241 $self->{column}++; 7242 $self->{nc} 7243 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7244 } else { 7245 $self->{set_nc}->($self); 7246 } 7247 7248 redo A; 7249 } 7250 } elsif ($state == AFTER_ALLOWED_TOKENS_STATE) { 7251 if ($is_space->{$nc}) { 7252 $self->{state} = BEFORE_ATTR_DEFAULT_STATE; 7253 7254 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7255 $self->{line_prev} = $self->{line}; 7256 $self->{column_prev} = $self->{column}; 7257 $self->{column}++; 7258 $self->{nc} 7259 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7260 } else { 7261 $self->{set_nc}->($self); 7262 } 7263 7264 redo A; 7265 } elsif ($nc == 0x0023) { # # 7266 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type 7267 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE; 7268 7269 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7270 $self->{line_prev} = $self->{line}; 7271 $self->{column_prev} = $self->{column}; 7272 $self->{column}++; 7273 $self->{nc} 7274 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7275 } else { 7276 $self->{set_nc}->($self); 7277 } 7278 7279 redo A; 7280 } elsif ($nc == 0x0022) { # " 7281 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type 7282 $self->{ca}->{value} = ''; 7283 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE; 7284 7285 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7286 $self->{line_prev} = $self->{line}; 7287 $self->{column_prev} = $self->{column}; 7288 $self->{column}++; 7289 $self->{nc} 7290 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7291 } else { 7292 
$self->{set_nc}->($self); 7293 } 7294 7295 redo A; 7296 } elsif ($nc == 0x0027) { # ' 7297 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type 7298 $self->{ca}->{value} = ''; 7299 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE; 7300 7301 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7302 $self->{line_prev} = $self->{line}; 7303 $self->{column_prev} = $self->{column}; 7304 $self->{column}++; 7305 $self->{nc} 7306 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7307 } else { 7308 $self->{set_nc}->($self); 7309 } 7310 7311 redo A; 7312 } elsif ($nc == 0x003E) { # > 7313 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type 7314 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 7315 7316 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7317 $self->{line_prev} = $self->{line}; 7318 $self->{column_prev} = $self->{column}; 7319 $self->{column}++; 7320 $self->{nc} 7321 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7322 } else { 7323 $self->{set_nc}->($self); 7324 } 7325 7326 return ($self->{ct}); # ATTLIST 7327 redo A; 7328 } elsif ($nc == EOF_CHAR) { 7329 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 7330 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 7331 7332 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7333 $self->{line_prev} = $self->{line}; 7334 $self->{column_prev} = $self->{column}; 7335 $self->{column}++; 7336 $self->{nc} 7337 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7338 } else { 7339 $self->{set_nc}->($self); 7340 } 7341 7342 ## Discard the current token. 7343 redo A; 7344 } else { 7345 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type 7346 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE; 7347 ## Reconsume. 
7348 redo A; 7349 } 7350 } elsif ($state == BEFORE_ATTR_DEFAULT_STATE) { 7351 if ($is_space->{$nc}) { 7352 ## Stay in the state. 7353 7354 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7355 $self->{line_prev} = $self->{line}; 7356 $self->{column_prev} = $self->{column}; 7357 $self->{column}++; 7358 $self->{nc} 7359 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7360 } else { 7361 $self->{set_nc}->($self); 7362 } 7363 7364 redo A; 7365 } elsif ($nc == 0x0023) { # # 7366 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE; 7367 7368 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7369 $self->{line_prev} = $self->{line}; 7370 $self->{column_prev} = $self->{column}; 7371 $self->{column}++; 7372 $self->{nc} 7373 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7374 } else { 7375 $self->{set_nc}->($self); 7376 } 7377 7378 redo A; 7379 } elsif ($nc == 0x0022) { # " 7380 $self->{ca}->{value} = ''; 7381 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE; 7382 7383 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7384 $self->{line_prev} = $self->{line}; 7385 $self->{column_prev} = $self->{column}; 7386 $self->{column}++; 7387 $self->{nc} 7388 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7389 } else { 7390 $self->{set_nc}->($self); 7391 } 7392 7393 redo A; 7394 } elsif ($nc == 0x0027) { # ' 7395 $self->{ca}->{value} = ''; 7396 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE; 7397 7398 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7399 $self->{line_prev} = $self->{line}; 7400 $self->{column_prev} = $self->{column}; 7401 $self->{column}++; 7402 $self->{nc} 7403 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7404 } else { 7405 $self->{set_nc}->($self); 7406 } 7407 7408 redo A; 7409 } elsif ($nc == 0x003E) { # > 7410 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type 
7411 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 7412 7413 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7414 $self->{line_prev} = $self->{line}; 7415 $self->{column_prev} = $self->{column}; 7416 $self->{column}++; 7417 $self->{nc} 7418 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7419 } else { 7420 $self->{set_nc}->($self); 7421 } 7422 7423 return ($self->{ct}); # ATTLIST 7424 redo A; 7425 } elsif ($nc == EOF_CHAR) { 7426 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 7427 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 7428 7429 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7430 $self->{line_prev} = $self->{line}; 7431 $self->{column_prev} = $self->{column}; 7432 $self->{column}++; 7433 $self->{nc} 7434 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7435 } else { 7436 $self->{set_nc}->($self); 7437 } 7438 7439 ## Discard the current token. 7440 redo A; 7441 } else { 7442 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type 7443 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE; 7444 ## Reconsume. 7445 redo A; 7446 } 7447 } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) { 7448 if ($is_space->{$nc}) { 7449 ## XML5: No parse error. 7450 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type 7451 $self->{state} = BOGUS_MD_STATE; 7452 ## Reconsume. 7453 redo A; 7454 } elsif ($nc == 0x0022) { # " 7455 # XXX parse error? 7456 ## XML5: Same as "anything else". 
7457 $self->{ca}->{value} = ''; 7458 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE; 7459 7460 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7461 $self->{line_prev} = $self->{line}; 7462 $self->{column_prev} = $self->{column}; 7463 $self->{column}++; 7464 $self->{nc} 7465 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7466 } else { 7467 $self->{set_nc}->($self); 7468 } 7469 7470 redo A; 7471 } elsif ($nc == 0x0027) { # ' 7472 # XXX parse error? 7473 ## XML5: Same as "anything else". 7474 $self->{ca}->{value} = ''; 7475 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE; 7476 7477 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7478 $self->{line_prev} = $self->{line}; 7479 $self->{column_prev} = $self->{column}; 7480 $self->{column}++; 7481 $self->{nc} 7482 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7483 } else { 7484 $self->{set_nc}->($self); 7485 } 7486 7487 redo A; 7488 } elsif ($nc == 0x003E) { # > 7489 ## XML5: Same as "anything else". 7490 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type 7491 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 7492 7493 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7494 $self->{line_prev} = $self->{line}; 7495 $self->{column_prev} = $self->{column}; 7496 $self->{column}++; 7497 $self->{nc} 7498 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7499 } else { 7500 $self->{set_nc}->($self); 7501 } 7502 7503 return ($self->{ct}); # ATTLIST 7504 redo A; 7505 } elsif ($nc == EOF_CHAR) { 7506 ## XML5: No parse error. 7507 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 7508 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". 
7509 7510 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7511 $self->{line_prev} = $self->{line}; 7512 $self->{column_prev} = $self->{column}; 7513 $self->{column}++; 7514 $self->{nc} 7515 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7516 } else { 7517 $self->{set_nc}->($self); 7518 } 7519 7520 ## Discard the current token. 7521 redo A; 7522 } else { 7523 $self->{ca}->{default} = chr $nc; 7524 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE; 7525 7526 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7527 $self->{line_prev} = $self->{line}; 7528 $self->{column_prev} = $self->{column}; 7529 $self->{column}++; 7530 $self->{nc} 7531 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7532 } else { 7533 $self->{set_nc}->($self); 7534 } 7535 7536 redo A; 7537 } 7538 } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) { 7539 if ($is_space->{$nc}) { 7540 $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE; 7541 7542 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7543 $self->{line_prev} = $self->{line}; 7544 $self->{column_prev} = $self->{column}; 7545 $self->{column}++; 7546 $self->{nc} 7547 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7548 } else { 7549 $self->{set_nc}->($self); 7550 } 7551 7552 redo A; 7553 } elsif ($nc == 0x0022) { # " 7554 ## XML5: Same as "anything else". 
7555 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type 7556 $self->{ca}->{value} = ''; 7557 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE; 7558 7559 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7560 $self->{line_prev} = $self->{line}; 7561 $self->{column_prev} = $self->{column}; 7562 $self->{column}++; 7563 $self->{nc} 7564 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7565 } else { 7566 $self->{set_nc}->($self); 7567 } 7568 7569 redo A; 7570 } elsif ($nc == 0x0027) { # ' 7571 ## XML5: Same as "anything else". 7572 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type 7573 $self->{ca}->{value} = ''; 7574 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE; 7575 7576 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7577 $self->{line_prev} = $self->{line}; 7578 $self->{column_prev} = $self->{column}; 7579 $self->{column}++; 7580 $self->{nc} 7581 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7582 } else { 7583 $self->{set_nc}->($self); 7584 } 7585 7586 redo A; 7587 } elsif ($nc == 0x003E) { # > 7588 ## XML5: Same as "anything else". 7589 push @{$self->{ct}->{attrdefs}}, $self->{ca}; 7590 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 7591 7592 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7593 $self->{line_prev} = $self->{line}; 7594 $self->{column_prev} = $self->{column}; 7595 $self->{column}++; 7596 $self->{nc} 7597 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7598 } else { 7599 $self->{set_nc}->($self); 7600 } 7601 7602 return ($self->{ct}); # ATTLIST 7603 redo A; 7604 } elsif ($nc == EOF_CHAR) { 7605 ## XML5: No parse error. 
7606 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 7607 push @{$self->{ct}->{attrdefs}}, $self->{ca}; 7608 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". 7609 7610 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7611 $self->{line_prev} = $self->{line}; 7612 $self->{column_prev} = $self->{column}; 7613 $self->{column}++; 7614 $self->{nc} 7615 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7616 } else { 7617 $self->{set_nc}->($self); 7618 } 7619 7620 ## Discard the current token. 7621 redo A; 7622 } else { 7623 $self->{ca}->{default} .= chr $nc; 7624 ## Stay in the state. 7625 7626 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7627 $self->{line_prev} = $self->{line}; 7628 $self->{column_prev} = $self->{column}; 7629 $self->{column}++; 7630 $self->{nc} 7631 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7632 } else { 7633 $self->{set_nc}->($self); 7634 } 7635 7636 redo A; 7637 } 7638 } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) { 7639 if ($is_space->{$nc}) { 7640 ## Stay in the state. 
7641 7642 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7643 $self->{line_prev} = $self->{line}; 7644 $self->{column_prev} = $self->{column}; 7645 $self->{column}++; 7646 $self->{nc} 7647 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7648 } else { 7649 $self->{set_nc}->($self); 7650 } 7651 7652 redo A; 7653 } elsif ($nc == 0x0022) { # " 7654 $self->{ca}->{value} = ''; 7655 $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE; 7656 7657 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7658 $self->{line_prev} = $self->{line}; 7659 $self->{column_prev} = $self->{column}; 7660 $self->{column}++; 7661 $self->{nc} 7662 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7663 } else { 7664 $self->{set_nc}->($self); 7665 } 7666 7667 redo A; 7668 } elsif ($nc == 0x0027) { # ' 7669 $self->{ca}->{value} = ''; 7670 $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE; 7671 7672 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7673 $self->{line_prev} = $self->{line}; 7674 $self->{column_prev} = $self->{column}; 7675 $self->{column}++; 7676 $self->{nc} 7677 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7678 } else { 7679 $self->{set_nc}->($self); 7680 } 7681 7682 redo A; 7683 } elsif ($nc == 0x003E) { # > 7684 push @{$self->{ct}->{attrdefs}}, $self->{ca}; 7685 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 7686 7687 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7688 $self->{line_prev} = $self->{line}; 7689 $self->{column_prev} = $self->{column}; 7690 $self->{column}++; 7691 $self->{nc} 7692 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7693 } else { 7694 $self->{set_nc}->($self); 7695 } 7696 7697 return ($self->{ct}); # ATTLIST 7698 redo A; 7699 } elsif ($nc == EOF_CHAR) { 7700 ## XML5: No parse error. 
7701 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 7702 push @{$self->{ct}->{attrdefs}}, $self->{ca}; 7703 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". 7704 7705 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7706 $self->{line_prev} = $self->{line}; 7707 $self->{column_prev} = $self->{column}; 7708 $self->{column}++; 7709 $self->{nc} 7710 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7711 } else { 7712 $self->{set_nc}->($self); 7713 } 7714 7715 ## Discard the current token. 7716 redo A; 7717 } else { 7718 ## XML5: Not defined yet. 7719 if ($self->{ca}->{default} eq 'FIXED') { 7720 $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE; 7721 } else { 7722 push @{$self->{ct}->{attrdefs}}, $self->{ca}; 7723 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE; 7724 } 7725 ## Reconsume. 7726 redo A; 7727 } 7728 } elsif ($state == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) { 7729 if ($is_space->{$nc} or 7730 $nc == EOF_CHAR or 7731 $nc == 0x003E) { # > 7732 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE; 7733 ## Reconsume. 7734 redo A; 7735 } else { 7736 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type 7737 $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE; 7738 ## Reconsume. 7739 redo A; 7740 } 7741 } elsif ($state == NDATA_STATE) { 7742 ## ASCII case-insensitive 7743 if ($nc == [ 7744 undef, 7745 0x0044, # D 7746 0x0041, # A 7747 0x0054, # T 7748 NEVER_CHAR, # (A) 7749 ]->[length $self->{kwd}] or 7750 $nc == [ 7751 undef, 7752 0x0064, # d 7753 0x0061, # a 7754 0x0074, # t 7755 NEVER_CHAR, # (a) 7756 ]->[length $self->{kwd}]) { 7757 7758 ## Stay in the state. 
7759 $self->{kwd} .= chr $nc; 7760 7761 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7762 $self->{line_prev} = $self->{line}; 7763 $self->{column_prev} = $self->{column}; 7764 $self->{column}++; 7765 $self->{nc} 7766 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7767 } else { 7768 $self->{set_nc}->($self); 7769 } 7770 7771 redo A; 7772 } elsif ((length $self->{kwd}) == 4 and 7773 ($nc == 0x0041 or # A 7774 $nc == 0x0061)) { # a 7775 if ($self->{kwd} ne 'NDAT' or $nc == 0x0061) { # a 7776 7777 $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type 7778 text => 'NDATA', 7779 line => $self->{line_prev}, 7780 column => $self->{column_prev} - 4); 7781 } else { 7782 7783 } 7784 $self->{state} = AFTER_NDATA_STATE; 7785 7786 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7787 $self->{line_prev} = $self->{line}; 7788 $self->{column_prev} = $self->{column}; 7789 $self->{column}++; 7790 $self->{nc} 7791 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7792 } else { 7793 $self->{set_nc}->($self); 7794 } 7795 7796 redo A; 7797 } else { 7798 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type 7799 line => $self->{line_prev}, 7800 column => $self->{column_prev} + 1 7801 - length $self->{kwd}); 7802 7803 $self->{state} = BOGUS_MD_STATE; 7804 ## Reconsume. 
7805 redo A; 7806 } 7807 } elsif ($state == AFTER_NDATA_STATE) { 7808 if ($is_space->{$nc}) { 7809 $self->{state} = BEFORE_NOTATION_NAME_STATE; 7810 7811 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7812 $self->{line_prev} = $self->{line}; 7813 $self->{column_prev} = $self->{column}; 7814 $self->{column}++; 7815 $self->{nc} 7816 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7817 } else { 7818 $self->{set_nc}->($self); 7819 } 7820 7821 redo A; 7822 } elsif ($nc == 0x003E) { # > 7823 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type 7824 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 7825 7826 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7827 $self->{line_prev} = $self->{line}; 7828 $self->{column_prev} = $self->{column}; 7829 $self->{column}++; 7830 $self->{nc} 7831 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7832 } else { 7833 $self->{set_nc}->($self); 7834 } 7835 7836 return ($self->{ct}); # ENTITY 7837 redo A; 7838 } elsif ($nc == EOF_CHAR) { 7839 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 7840 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 7841 7842 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7843 $self->{line_prev} = $self->{line}; 7844 $self->{column_prev} = $self->{column}; 7845 $self->{column}++; 7846 $self->{nc} 7847 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7848 } else { 7849 $self->{set_nc}->($self); 7850 } 7851 7852 ## Discard the current token. 7853 redo A; 7854 } else { 7855 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type 7856 line => $self->{line_prev}, 7857 column => $self->{column_prev} + 1 7858 - length $self->{kwd}); 7859 $self->{state} = BOGUS_MD_STATE; 7860 ## Reconsume. 
7861 redo A; 7862 } 7863 } elsif ($state == BEFORE_NOTATION_NAME_STATE) { 7864 if ($is_space->{$nc}) { 7865 ## Stay in the state. 7866 7867 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7868 $self->{line_prev} = $self->{line}; 7869 $self->{column_prev} = $self->{column}; 7870 $self->{column}++; 7871 $self->{nc} 7872 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7873 } else { 7874 $self->{set_nc}->($self); 7875 } 7876 7877 redo A; 7878 } elsif ($nc == 0x003E) { # > 7879 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type 7880 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 7881 7882 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7883 $self->{line_prev} = $self->{line}; 7884 $self->{column_prev} = $self->{column}; 7885 $self->{column}++; 7886 $self->{nc} 7887 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7888 } else { 7889 $self->{set_nc}->($self); 7890 } 7891 7892 return ($self->{ct}); # ENTITY 7893 redo A; 7894 } elsif ($nc == EOF_CHAR) { 7895 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 7896 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 7897 7898 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7899 $self->{line_prev} = $self->{line}; 7900 $self->{column_prev} = $self->{column}; 7901 $self->{column}++; 7902 $self->{nc} 7903 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7904 } else { 7905 $self->{set_nc}->($self); 7906 } 7907 7908 ## Discard the current token. 7909 redo A; 7910 } else { 7911 if ($nc == 0x0000) { 7912 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); 7913 } 7914 $self->{ct}->{notation} = $nc == 0x0000 ? 
"\x{FFFD}" : chr $nc; # ENTITY 7915 $self->{state} = NOTATION_NAME_STATE; 7916 7917 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7918 $self->{line_prev} = $self->{line}; 7919 $self->{column_prev} = $self->{column}; 7920 $self->{column}++; 7921 $self->{nc} 7922 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7923 } else { 7924 $self->{set_nc}->($self); 7925 } 7926 7927 redo A; 7928 } 7929 } elsif ($state == NOTATION_NAME_STATE) { 7930 if ($is_space->{$nc}) { 7931 $self->{state} = AFTER_MD_DEF_STATE; 7932 7933 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7934 $self->{line_prev} = $self->{line}; 7935 $self->{column_prev} = $self->{column}; 7936 $self->{column}++; 7937 $self->{nc} 7938 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7939 } else { 7940 $self->{set_nc}->($self); 7941 } 7942 7943 redo A; 7944 } elsif ($nc == 0x003E) { # > 7945 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 7946 7947 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7948 $self->{line_prev} = $self->{line}; 7949 $self->{column_prev} = $self->{column}; 7950 $self->{column}++; 7951 $self->{nc} 7952 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7953 } else { 7954 $self->{set_nc}->($self); 7955 } 7956 7957 return ($self->{ct}); # ENTITY 7958 redo A; 7959 } elsif ($nc == EOF_CHAR) { 7960 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 7961 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 7962 7963 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7964 $self->{line_prev} = $self->{line}; 7965 $self->{column_prev} = $self->{column}; 7966 $self->{column}++; 7967 $self->{nc} 7968 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7969 } else { 7970 $self->{set_nc}->($self); 7971 } 7972 7973 ## The current token. 
7974 redo A; 7975 } else { 7976 if ($nc == 0x0000) { 7977 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); 7978 } 7979 $self->{ct}->{notation} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # ENTITY 7980 ## Stay in the state. 7981 7982 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7983 $self->{line_prev} = $self->{line}; 7984 $self->{column_prev} = $self->{column}; 7985 $self->{column}++; 7986 $self->{nc} 7987 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 7988 } else { 7989 $self->{set_nc}->($self); 7990 } 7991 7992 redo A; 7993 } 7994 } elsif ($state == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) { 7995 if ($nc == 0x0022) { # " 7996 $self->{state} = AFTER_MD_DEF_STATE; 7997 7998 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 7999 $self->{line_prev} = $self->{line}; 8000 $self->{column_prev} = $self->{column}; 8001 $self->{column}++; 8002 $self->{nc} 8003 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8004 } else { 8005 $self->{set_nc}->($self); 8006 } 8007 8008 redo A; 8009 } elsif ($nc == 0x0026) { # & 8010 $self->{prev_state} = $state; 8011 $self->{state} = ENTITY_VALUE_ENTITY_STATE; 8012 $self->{entity_add} = 0x0022; # " 8013 8014 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8015 $self->{line_prev} = $self->{line}; 8016 $self->{column_prev} = $self->{column}; 8017 $self->{column}++; 8018 $self->{nc} 8019 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8020 } else { 8021 $self->{set_nc}->($self); 8022 } 8023 8024 redo A; 8025## TODO: % 8026 } elsif ($nc == EOF_CHAR) { 8027 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type 8028 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 8029 ## Reconsume. 8030 ## Discard the current token. 
8031 redo A; 8032 } else { 8033 if ($nc == 0x0000) { 8034 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); 8035 } 8036 $self->{ct}->{value} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # ENTITY 8037 8038 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8039 $self->{line_prev} = $self->{line}; 8040 $self->{column_prev} = $self->{column}; 8041 $self->{column}++; 8042 $self->{nc} 8043 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8044 } else { 8045 $self->{set_nc}->($self); 8046 } 8047 8048 redo A; 8049 } 8050 } elsif ($state == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) { 8051 if ($nc == 0x0027) { # ' 8052 $self->{state} = AFTER_MD_DEF_STATE; 8053 8054 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8055 $self->{line_prev} = $self->{line}; 8056 $self->{column_prev} = $self->{column}; 8057 $self->{column}++; 8058 $self->{nc} 8059 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8060 } else { 8061 $self->{set_nc}->($self); 8062 } 8063 8064 redo A; 8065 } elsif ($nc == 0x0026) { # & 8066 $self->{prev_state} = $state; 8067 $self->{state} = ENTITY_VALUE_ENTITY_STATE; 8068 $self->{entity_add} = 0x0027; # ' 8069 8070 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8071 $self->{line_prev} = $self->{line}; 8072 $self->{column_prev} = $self->{column}; 8073 $self->{column}++; 8074 $self->{nc} 8075 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8076 } else { 8077 $self->{set_nc}->($self); 8078 } 8079 8080 redo A; 8081## TODO: % 8082 } elsif ($nc == EOF_CHAR) { 8083 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type 8084 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 8085 ## Reconsume. 8086 ## Discard the current token. 8087 redo A; 8088 } else { 8089 if ($nc == 0x0000) { 8090 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); 8091 } 8092 $self->{ct}->{value} .= $nc == 0x0000 ? 
"\x{FFFD}" : chr $nc; # ENTITY 8093 8094 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8095 $self->{line_prev} = $self->{line}; 8096 $self->{column_prev} = $self->{column}; 8097 $self->{column}++; 8098 $self->{nc} 8099 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8100 } else { 8101 $self->{set_nc}->($self); 8102 } 8103 8104 redo A; 8105 } 8106 } elsif ($state == ENTITY_VALUE_ENTITY_STATE) { 8107 if ($is_space->{$nc} or 8108 { 8109 0x003C => 1, 0x0026 => 1, (EOF_CHAR) => 1, # <, & 8110 $self->{entity_add} => 1, 8111 }->{$nc}) { 8112 $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero', 8113 line => $self->{line_prev}, 8114 column => $self->{column_prev} 8115 + ($nc == EOF_CHAR ? 1 : 0)); 8116 ## Don't consume 8117 ## Return nothing. 8118 # 8119 } elsif ($nc == 0x0023) { # # 8120 $self->{ca} = $self->{ct}; 8121 $self->{state} = ENTITY_HASH_STATE; 8122 $self->{kwd} = '#'; 8123 8124 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8125 $self->{line_prev} = $self->{line}; 8126 $self->{column_prev} = $self->{column}; 8127 $self->{column}++; 8128 $self->{nc} 8129 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8130 } else { 8131 $self->{set_nc}->($self); 8132 } 8133 8134 redo A; 8135 } else { 8136 # 8137 } 8138 8139 $self->{ct}->{value} .= '&'; 8140 $self->{state} = $self->{prev_state}; 8141 ## Reconsume. 
8142 redo A; 8143 } elsif ($state == AFTER_ELEMENT_NAME_STATE) { 8144 if ($is_space->{$nc}) { 8145 $self->{state} = BEFORE_ELEMENT_CONTENT_STATE; 8146 8147 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8148 $self->{line_prev} = $self->{line}; 8149 $self->{column_prev} = $self->{column}; 8150 $self->{column}++; 8151 $self->{nc} 8152 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8153 } else { 8154 $self->{set_nc}->($self); 8155 } 8156 8157 redo A; 8158 } elsif ($nc == 0x0028) { # ( 8159 $self->{state} = AFTER_CM_GROUP_OPEN_STATE; 8160 $self->{ct}->{content} = ['(']; 8161 $self->{group_depth} = 1; 8162 8163 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8164 $self->{line_prev} = $self->{line}; 8165 $self->{column_prev} = $self->{column}; 8166 $self->{column}++; 8167 $self->{nc} 8168 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8169 } else { 8170 $self->{set_nc}->($self); 8171 } 8172 8173 redo A; 8174 } elsif ($nc == 0x003E) { # > 8175 $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type 8176 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 8177 8178 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8179 $self->{line_prev} = $self->{line}; 8180 $self->{column_prev} = $self->{column}; 8181 $self->{column}++; 8182 $self->{nc} 8183 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8184 } else { 8185 $self->{set_nc}->($self); 8186 } 8187 8188 return ($self->{ct}); # ELEMENT 8189 redo A; 8190 } elsif ($nc == EOF_CHAR) { 8191 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 8192 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 8193 8194 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8195 $self->{line_prev} = $self->{line}; 8196 $self->{column_prev} = $self->{column}; 8197 $self->{column}++; 8198 $self->{nc} 8199 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 
1); 8200 } else { 8201 $self->{set_nc}->($self); 8202 } 8203 8204 ## Discard the current token. 8205 redo A; 8206 } else { 8207 if ($nc == 0x0000) { 8208 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); 8209 } 8210 $self->{ct}->{content} = [$nc == 0x0000 ? "\x{FFFD}" : chr $nc]; 8211 $self->{state} = CONTENT_KEYWORD_STATE; 8212 8213 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8214 $self->{line_prev} = $self->{line}; 8215 $self->{column_prev} = $self->{column}; 8216 $self->{column}++; 8217 $self->{nc} 8218 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8219 } else { 8220 $self->{set_nc}->($self); 8221 } 8222 8223 redo A; 8224 } 8225 } elsif ($state == CONTENT_KEYWORD_STATE) { 8226 if ($is_space->{$nc}) { 8227 $self->{state} = AFTER_MD_DEF_STATE; 8228 8229 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8230 $self->{line_prev} = $self->{line}; 8231 $self->{column_prev} = $self->{column}; 8232 $self->{column}++; 8233 $self->{nc} 8234 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8235 } else { 8236 $self->{set_nc}->($self); 8237 } 8238 8239 redo A; 8240 } elsif ($nc == 0x003E) { # > 8241 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 8242 8243 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8244 $self->{line_prev} = $self->{line}; 8245 $self->{column_prev} = $self->{column}; 8246 $self->{column}++; 8247 $self->{nc} 8248 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8249 } else { 8250 $self->{set_nc}->($self); 8251 } 8252 8253 return ($self->{ct}); # ELEMENT 8254 redo A; 8255 } elsif ($nc == EOF_CHAR) { 8256 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 8257 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 8258 8259 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8260 $self->{line_prev} = $self->{line}; 8261 $self->{column_prev} = $self->{column}; 8262 $self->{column}++; 8263 
$self->{nc} 8264 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8265 } else { 8266 $self->{set_nc}->($self); 8267 } 8268 8269 ## Discard the current token. 8270 redo A; 8271 } else { 8272 if ($nc == 0x0000) { 8273 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); 8274 } 8275 $self->{ct}->{content}->[-1] .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # ELEMENT 8276 ## Stay in the state. 8277 8278 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8279 $self->{line_prev} = $self->{line}; 8280 $self->{column_prev} = $self->{column}; 8281 $self->{column}++; 8282 $self->{nc} 8283 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8284 } else { 8285 $self->{set_nc}->($self); 8286 } 8287 8288 redo A; 8289 } 8290 } elsif ($state == AFTER_CM_GROUP_OPEN_STATE) { 8291 if ($is_space->{$nc}) { 8292 ## Stay in the state. 8293 8294 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8295 $self->{line_prev} = $self->{line}; 8296 $self->{column_prev} = $self->{column}; 8297 $self->{column}++; 8298 $self->{nc} 8299 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8300 } else { 8301 $self->{set_nc}->($self); 8302 } 8303 8304 redo A; 8305 } elsif ($nc == 0x0028) { # ( 8306 $self->{group_depth}++; 8307 push @{$self->{ct}->{content}}, chr $nc; 8308 ## Stay in the state. 8309 8310 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8311 $self->{line_prev} = $self->{line}; 8312 $self->{column_prev} = $self->{column}; 8313 $self->{column}++; 8314 $self->{nc} 8315 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8316 } else { 8317 $self->{set_nc}->($self); 8318 } 8319 8320 redo A; 8321 } elsif ($nc == 0x007C or # | 8322 $nc == 0x002C) { # , 8323 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type 8324 ## Stay in the state. 
8325 8326 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8327 $self->{line_prev} = $self->{line}; 8328 $self->{column_prev} = $self->{column}; 8329 $self->{column}++; 8330 $self->{nc} 8331 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8332 } else { 8333 $self->{set_nc}->($self); 8334 } 8335 8336 redo A; 8337 } elsif ($nc == 0x0029) { # ) 8338 $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type 8339 push @{$self->{ct}->{content}}, chr $nc; 8340 $self->{group_depth}--; 8341 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE; 8342 8343 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8344 $self->{line_prev} = $self->{line}; 8345 $self->{column_prev} = $self->{column}; 8346 $self->{column}++; 8347 $self->{nc} 8348 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8349 } else { 8350 $self->{set_nc}->($self); 8351 } 8352 8353 redo A; 8354 } elsif ($nc == 0x003E) { # > 8355 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type 8356 push @{$self->{ct}->{content}}, (')') x $self->{group_depth}; 8357 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 8358 8359 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8360 $self->{line_prev} = $self->{line}; 8361 $self->{column_prev} = $self->{column}; 8362 $self->{column}++; 8363 $self->{nc} 8364 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8365 } else { 8366 $self->{set_nc}->($self); 8367 } 8368 8369 return ($self->{ct}); # ELEMENT 8370 redo A; 8371 } elsif ($nc == EOF_CHAR) { 8372 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 8373 #push @{$self->{ct}->{content}}, (')') x $self->{group_depth}; 8374 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 8375 8376 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8377 $self->{line_prev} = $self->{line}; 8378 $self->{column_prev} = $self->{column}; 
8379 $self->{column}++; 8380 $self->{nc} 8381 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8382 } else { 8383 $self->{set_nc}->($self); 8384 } 8385 8386 ## Discard the current token. 8387 redo A; 8388 } else { 8389 if ($nc == 0x0000) { 8390 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); 8391 } 8392 push @{$self->{ct}->{content}}, $nc == 0x0000 ? "\x{FFFD}" : chr $nc; 8393 $self->{state} = CM_ELEMENT_NAME_STATE; 8394 8395 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8396 $self->{line_prev} = $self->{line}; 8397 $self->{column_prev} = $self->{column}; 8398 $self->{column}++; 8399 $self->{nc} 8400 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8401 } else { 8402 $self->{set_nc}->($self); 8403 } 8404 8405 redo A; 8406 } 8407 } elsif ($state == CM_ELEMENT_NAME_STATE) { 8408 if ($is_space->{$nc}) { 8409 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE; 8410 8411 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8412 $self->{line_prev} = $self->{line}; 8413 $self->{column_prev} = $self->{column}; 8414 $self->{column}++; 8415 $self->{nc} 8416 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8417 } else { 8418 $self->{set_nc}->($self); 8419 } 8420 8421 redo A; 8422 } elsif ($nc == 0x002A or # * 8423 $nc == 0x002B or # + 8424 $nc == 0x003F) { # ? 8425 push @{$self->{ct}->{content}}, chr $nc; 8426 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE; 8427 8428 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8429 $self->{line_prev} = $self->{line}; 8430 $self->{column_prev} = $self->{column}; 8431 $self->{column}++; 8432 $self->{nc} 8433 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8434 } else { 8435 $self->{set_nc}->($self); 8436 } 8437 8438 redo A; 8439 } elsif ($nc == 0x007C or # | 8440 $nc == 0x002C) { # , 8441 push @{$self->{ct}->{content}}, $nc == 0x007C ? 
' | ' : ', '; 8442 $self->{state} = AFTER_CM_GROUP_OPEN_STATE; 8443 8444 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8445 $self->{line_prev} = $self->{line}; 8446 $self->{column_prev} = $self->{column}; 8447 $self->{column}++; 8448 $self->{nc} 8449 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8450 } else { 8451 $self->{set_nc}->($self); 8452 } 8453 8454 redo A; 8455 } elsif ($nc == 0x0029) { # ) 8456 $self->{group_depth}--; 8457 push @{$self->{ct}->{content}}, chr $nc; 8458 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE; 8459 8460 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8461 $self->{line_prev} = $self->{line}; 8462 $self->{column_prev} = $self->{column}; 8463 $self->{column}++; 8464 $self->{nc} 8465 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8466 } else { 8467 $self->{set_nc}->($self); 8468 } 8469 8470 redo A; 8471 } elsif ($nc == 0x003E) { # > 8472 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type 8473 push @{$self->{ct}->{content}}, (')') x $self->{group_depth}; 8474 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 8475 8476 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8477 $self->{line_prev} = $self->{line}; 8478 $self->{column_prev} = $self->{column}; 8479 $self->{column}++; 8480 $self->{nc} 8481 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8482 } else { 8483 $self->{set_nc}->($self); 8484 } 8485 8486 return ($self->{ct}); # ELEMENT 8487 redo A; 8488 } elsif ($nc == EOF_CHAR) { 8489 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 8490 #push @{$self->{ct}->{content}}, (')') x $self->{group_depth}; 8491 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 8492 8493 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8494 $self->{line_prev} = $self->{line}; 8495 $self->{column_prev} = $self->{column}; 8496 $self->{column}++; 8497 $self->{nc} 
8498 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8499 } else { 8500 $self->{set_nc}->($self); 8501 } 8502 8503 ## Discard the token. 8504 redo A; 8505 } else { 8506 if ($nc == 0x0000) { 8507 $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); 8508 } 8509 $self->{ct}->{content}->[-1] .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; 8510 ## Stay in the state. 8511 8512 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8513 $self->{line_prev} = $self->{line}; 8514 $self->{column_prev} = $self->{column}; 8515 $self->{column}++; 8516 $self->{nc} 8517 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8518 } else { 8519 $self->{set_nc}->($self); 8520 } 8521 8522 redo A; 8523 } 8524 } elsif ($state == AFTER_CM_ELEMENT_NAME_STATE) { 8525 if ($is_space->{$nc}) { 8526 ## Stay in the state. 8527 8528 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8529 $self->{line_prev} = $self->{line}; 8530 $self->{column_prev} = $self->{column}; 8531 $self->{column}++; 8532 $self->{nc} 8533 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8534 } else { 8535 $self->{set_nc}->($self); 8536 } 8537 8538 redo A; 8539 } elsif ($nc == 0x007C or # | 8540 $nc == 0x002C) { # , 8541 push @{$self->{ct}->{content}}, $nc == 0x007C ? 
' | ' : ', '; 8542 $self->{state} = AFTER_CM_GROUP_OPEN_STATE; 8543 8544 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8545 $self->{line_prev} = $self->{line}; 8546 $self->{column_prev} = $self->{column}; 8547 $self->{column}++; 8548 $self->{nc} 8549 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8550 } else { 8551 $self->{set_nc}->($self); 8552 } 8553 8554 redo A; 8555 } elsif ($nc == 0x0029) { # ) 8556 $self->{group_depth}--; 8557 push @{$self->{ct}->{content}}, chr $nc; 8558 $self->{state} = AFTER_CM_GROUP_CLOSE_STATE; 8559 8560 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8561 $self->{line_prev} = $self->{line}; 8562 $self->{column_prev} = $self->{column}; 8563 $self->{column}++; 8564 $self->{nc} 8565 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8566 } else { 8567 $self->{set_nc}->($self); 8568 } 8569 8570 redo A; 8571 } elsif ($nc == 0x003E) { # > 8572 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type 8573 push @{$self->{ct}->{content}}, (')') x $self->{group_depth}; 8574 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 8575 8576 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8577 $self->{line_prev} = $self->{line}; 8578 $self->{column_prev} = $self->{column}; 8579 $self->{column}++; 8580 $self->{nc} 8581 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8582 } else { 8583 $self->{set_nc}->($self); 8584 } 8585 8586 return ($self->{ct}); # ELEMENT 8587 redo A; 8588 } elsif ($nc == EOF_CHAR) { 8589 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 8590 #push @{$self->{ct}->{content}}, (')') x $self->{group_depth}; 8591 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 8592 8593 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8594 $self->{line_prev} = $self->{line}; 8595 $self->{column_prev} = $self->{column}; 8596 $self->{column}++; 8597 $self->{nc} 
8598 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8599 } else { 8600 $self->{set_nc}->($self); 8601 } 8602 8603 ## Discard the current token. 8604 redo A; 8605 } else { 8606 $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type 8607 push @{$self->{ct}->{content}}, (')') x $self->{group_depth}; 8608 $self->{state} = BOGUS_MD_STATE; 8609 8610 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8611 $self->{line_prev} = $self->{line}; 8612 $self->{column_prev} = $self->{column}; 8613 $self->{column}++; 8614 $self->{nc} 8615 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8616 } else { 8617 $self->{set_nc}->($self); 8618 } 8619 8620 redo A; 8621 } 8622 } elsif ($state == AFTER_CM_GROUP_CLOSE_STATE) { 8623 if ($is_space->{$nc}) { 8624 if ($self->{group_depth}) { 8625 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE; 8626 } else { 8627 $self->{state} = AFTER_MD_DEF_STATE; 8628 } 8629 8630 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8631 $self->{line_prev} = $self->{line}; 8632 $self->{column_prev} = $self->{column}; 8633 $self->{column}++; 8634 $self->{nc} 8635 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8636 } else { 8637 $self->{set_nc}->($self); 8638 } 8639 8640 redo A; 8641 } elsif ($nc == 0x002A or # * 8642 $nc == 0x002B or # + 8643 $nc == 0x003F) { # ? 
8644 push @{$self->{ct}->{content}}, chr $nc; 8645 if ($self->{group_depth}) { 8646 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE; 8647 } else { 8648 $self->{state} = AFTER_MD_DEF_STATE; 8649 } 8650 8651 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8652 $self->{line_prev} = $self->{line}; 8653 $self->{column_prev} = $self->{column}; 8654 $self->{column}++; 8655 $self->{nc} 8656 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8657 } else { 8658 $self->{set_nc}->($self); 8659 } 8660 8661 redo A; 8662 } elsif ($nc == 0x0029) { # ) 8663 if ($self->{group_depth}) { 8664 $self->{group_depth}--; 8665 push @{$self->{ct}->{content}}, chr $nc; 8666 ## Stay in the state. 8667 8668 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8669 $self->{line_prev} = $self->{line}; 8670 $self->{column_prev} = $self->{column}; 8671 $self->{column}++; 8672 $self->{nc} 8673 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8674 } else { 8675 $self->{set_nc}->($self); 8676 } 8677 8678 redo A; 8679 } else { 8680 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type 8681 $self->{state} = BOGUS_MD_STATE; 8682 ## Reconsume. 
8683 redo A; 8684 } 8685 } elsif ($nc == 0x003E) { # > 8686 if ($self->{group_depth}) { 8687 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type 8688 push @{$self->{ct}->{content}}, (')') x $self->{group_depth}; 8689 } 8690 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 8691 8692 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8693 $self->{line_prev} = $self->{line}; 8694 $self->{column_prev} = $self->{column}; 8695 $self->{column}++; 8696 $self->{nc} 8697 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8698 } else { 8699 $self->{set_nc}->($self); 8700 } 8701 8702 return ($self->{ct}); # ELEMENT 8703 redo A; 8704 } elsif ($nc == EOF_CHAR) { 8705 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 8706 #push @{$self->{ct}->{content}}, (')') x $self->{group_depth}; 8707 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 8708 8709 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8710 $self->{line_prev} = $self->{line}; 8711 $self->{column_prev} = $self->{column}; 8712 $self->{column}++; 8713 $self->{nc} 8714 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8715 } else { 8716 $self->{set_nc}->($self); 8717 } 8718 8719 ## Discard the current token. 8720 redo A; 8721 } else { 8722 if ($self->{group_depth}) { 8723 $self->{state} = AFTER_CM_ELEMENT_NAME_STATE; 8724 } else { 8725 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type 8726 $self->{state} = BOGUS_MD_STATE; 8727 } 8728 ## Reconsume. 8729 redo A; 8730 } 8731 } elsif ($state == AFTER_MD_DEF_STATE) { 8732 if ($is_space->{$nc}) { 8733 ## Stay in the state. 
8734 8735 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8736 $self->{line_prev} = $self->{line}; 8737 $self->{column_prev} = $self->{column}; 8738 $self->{column}++; 8739 $self->{nc} 8740 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8741 } else { 8742 $self->{set_nc}->($self); 8743 } 8744 8745 redo A; 8746 } elsif ($nc == 0x003E) { # > 8747 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 8748 8749 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8750 $self->{line_prev} = $self->{line}; 8751 $self->{column_prev} = $self->{column}; 8752 $self->{column}++; 8753 $self->{nc} 8754 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8755 } else { 8756 $self->{set_nc}->($self); 8757 } 8758 8759 return ($self->{ct}); # ENTITY/ELEMENT 8760 redo A; 8761 } elsif ($nc == EOF_CHAR) { 8762 $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type 8763 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 8764 8765 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8766 $self->{line_prev} = $self->{line}; 8767 $self->{column_prev} = $self->{column}; 8768 $self->{column}++; 8769 $self->{nc} 8770 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8771 } else { 8772 $self->{set_nc}->($self); 8773 } 8774 8775 ## Discard the current token. 8776 redo A; 8777 } else { 8778 $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type 8779 $self->{state} = BOGUS_MD_STATE; 8780 ## Reconsume. 
8781 redo A; 8782 } 8783 } elsif ($state == BOGUS_MD_STATE) { 8784 if ($nc == 0x003E) { # > 8785 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 8786 8787 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8788 $self->{line_prev} = $self->{line}; 8789 $self->{column_prev} = $self->{column}; 8790 $self->{column}++; 8791 $self->{nc} 8792 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8793 } else { 8794 $self->{set_nc}->($self); 8795 } 8796 8797 return ($self->{ct}); # ATTLIST/ENTITY/NOTATION 8798 redo A; 8799 } elsif ($nc == EOF_CHAR) { 8800 $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 8801 ## Reconsume. 8802 ## Discard the current token. 8803 redo A; 8804 } else { 8805 ## Stay in the state. 8806 8807 if ($self->{char_buffer_pos} < length $self->{char_buffer}) { 8808 $self->{line_prev} = $self->{line}; 8809 $self->{column_prev} = $self->{column}; 8810 $self->{column}++; 8811 $self->{nc} 8812 = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); 8813 } else { 8814 $self->{set_nc}->($self); 8815 } 8816 8817 redo A; 8818 } 8819 } else { 8820 die "$0: $state: Unknown state"; 8821 } 8822 } # A 8823 8824 die "$0: _get_next_token: unexpected case"; 8825} # _get_next_token 8826 88271; 8828 8829# Copyright 2007-2011 Wakaba <w@suika.fam.cx>. 8830# 8831# This library is free software; you can redistribute it and/or modify 8832# it under the same terms as Perl itself. 8833 8834