1<?php 2 3/* 4 5Copyright 2007 Jeroen van der Meer <http://jero.net/> 6Copyright 2008 Edward Z. Yang <http://htmlpurifier.org/> 7Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/> 8 9Permission is hereby granted, free of charge, to any person obtaining a 10copy of this software and associated documentation files (the 11"Software"), to deal in the Software without restriction, including 12without limitation the rights to use, copy, modify, merge, publish, 13distribute, sublicense, and/or sell copies of the Software, and to 14permit persons to whom the Software is furnished to do so, subject to 15the following conditions: 16 17The above copyright notice and this permission notice shall be included 18in all copies or substantial portions of the Software. 19 20THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 21OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 22MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 23IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 24CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 25TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 26SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 27 28*/ 29 30// Some conventions: 31// /* */ indicates verbatim text from the HTML 5 specification 32// // indicates regular comments 33 34// all flags are in hyphenated form 35 36class HTML5_Tokenizer { 37 /** 38 * @var HTML5_InputStream 39 * 40 * Points to an InputStream object. 41 */ 42 protected $stream; 43 44 /** 45 * @var HTML5_TreeBuilder 46 * 47 * Tree builder that the tokenizer emits token to. 48 */ 49 private $tree; 50 51 /** 52 * @var int 53 * 54 * Current content model we are parsing as. 55 */ 56 protected $content_model; 57 58 /** 59 * Current token that is being built, but not yet emitted. Also 60 * is the last token emitted, if applicable. 
*/
    protected $token;

    // Content model values. parse() compares $this->content_model
    // against these to decide how '&', '-', '<' and '>' are treated.
    const PCDATA    = 0;
    const RCDATA    = 1;
    const CDATA     = 2;
    const PLAINTEXT = 3;

    // Token type identifiers, stored under the 'type' key of every
    // token array handed to emitToken().
    // XXX should probably be moved somewhere else, probably the
    // HTML5 class.
    const DOCTYPE        = 0;
    const STARTTAG       = 1;
    const ENDTAG         = 2;
    const COMMENT        = 3;
    const CHARACTER      = 4;
    const SPACECHARACTER = 5;
    const EOF            = 6;
    const PARSEERROR     = 7;

    // Character classes, each spelled out as a plain string of its
    // member characters (suitable as a mask for the input stream's
    // charsWhile()/charsUntil() calls made in parse()).
    const ALPHA       = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
    const UPPER_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
    const LOWER_ALPHA = 'abcdefghijklmnopqrstuvwxyz';
    const DIGIT       = '0123456789';
    const HEX         = '0123456789ABCDEFabcdef';
    // Tab, line feed, form feed and space.
    const WHITESPACE  = "\t\n\x0c ";

    /**
     * Wraps the input in an HTML5_InputStream, picks the tree builder
     * that will receive tokens, and starts in the PCDATA content model.
     *
     * @param mixed                  $data    Data to parse; passed as-is to
     *                                        the HTML5_InputStream constructor.
     * @param HTML5_TreeBuilder|null $builder Tree builder to use. Any falsy
     *                                        value (the default null included)
     *                                        makes the tokenizer create its own
     *                                        HTML5_TreeBuilder.
     */
    public function __construct($data, $builder = null) {
        $this->stream = new HTML5_InputStream($data);
        // Truthiness test, not a strict null check: every falsy argument
        // falls back to a freshly constructed builder.
        $this->tree = $builder ? $builder : new HTML5_TreeBuilder();
        $this->content_model = self::PCDATA;
    }

    /**
     * Parses the input in fragment mode.
     *
     * Hands $context to the tree builder's setupContext(), takes over the
     * content model the builder exposes afterwards (if any), then runs the
     * regular parse() loop.
     *
     * @param mixed $context Forwarded unchanged to setupContext(); defaults
     *                       to null.
     */
    public function parseFragment($context = null) {
        $this->tree->setupContext($context);

        $inherited = $this->tree->content_model;
        // Only a truthy content model is adopted and cleared on the builder.
        // self::PCDATA (0) is falsy, so in that case both sides are left
        // exactly as they are.
        if ($inherited) {
            $this->content_model = $inherited;
            $this->tree->content_model = null;
        }

        $this->parse();
    }

    // XXX maybe convert this into an iterator? regardless, this function
    // and the save function should go into a Parser facade of some sort
    /**
     * Performs the actual parsing of the document.
     */
    public function parse() {
        // Current state
        $state = 'data';
        // This is used to avoid having to have look-behind in the data state.
125 $lastFourChars = ''; 126 /** 127 * Escape flag as specified by the HTML5 specification: "used to 128 * control the behavior of the tokeniser. It is either true or 129 * false, and initially must be set to the false state." 130 */ 131 $escape = false; 132 //echo "\n\n"; 133 while($state !== null) { 134 135 /*echo $state . ' '; 136 switch ($this->content_model) { 137 case self::PCDATA: echo 'PCDATA'; break; 138 case self::RCDATA: echo 'RCDATA'; break; 139 case self::CDATA: echo 'CDATA'; break; 140 case self::PLAINTEXT: echo 'PLAINTEXT'; break; 141 } 142 if ($escape) echo " escape"; 143 echo "\n";*/ 144 145 switch($state) { 146 case 'data': 147 148 /* Consume the next input character */ 149 $char = $this->stream->char(); 150 $lastFourChars .= $char; 151 if (strlen($lastFourChars) > 4) { 152 $lastFourChars = substr($lastFourChars, -4); 153 } 154 155 // see below for meaning 156 $hyp_cond = 157 !$escape && 158 ( 159 $this->content_model === self::RCDATA || 160 $this->content_model === self::CDATA 161 ); 162 $amp_cond = 163 !$escape && 164 ( 165 $this->content_model === self::PCDATA || 166 $this->content_model === self::RCDATA 167 ); 168 $lt_cond = 169 $this->content_model === self::PCDATA || 170 ( 171 ( 172 $this->content_model === self::RCDATA || 173 $this->content_model === self::CDATA 174 ) && 175 !$escape 176 ); 177 $gt_cond = 178 $escape && 179 ( 180 $this->content_model === self::RCDATA || 181 $this->content_model === self::CDATA 182 ); 183 184 if ($char === '&' && $amp_cond === true) { 185 /* U+0026 AMPERSAND (&) 186 When the content model flag is set to one of the PCDATA or RCDATA 187 states and the escape flag is false: switch to the 188 character reference data state. Otherwise: treat it as per 189 the "anything else" entry below. 
*/ 190 $state = 'character reference data'; 191 192 } elseif ( 193 $char === '-' && 194 $hyp_cond === true && 195 $lastFourChars === '<!--' 196 ) { 197 /* 198 U+002D HYPHEN-MINUS (-) 199 If the content model flag is set to either the RCDATA state or 200 the CDATA state, and the escape flag is false, and there are at 201 least three characters before this one in the input stream, and the 202 last four characters in the input stream, including this one, are 203 U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS, 204 and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */ 205 $escape = true; 206 207 /* In any case, emit the input character as a character token. Stay 208 in the data state. */ 209 $this->emitToken(array( 210 'type' => self::CHARACTER, 211 'data' => '-' 212 )); 213 // We do the "any case" part as part of "anything else". 214 215 /* U+003C LESS-THAN SIGN (<) */ 216 } elseif ($char === '<' && $lt_cond === true) { 217 /* When the content model flag is set to the PCDATA state: switch 218 to the tag open state. 219 220 When the content model flag is set to either the RCDATA state or 221 the CDATA state and the escape flag is false: switch to the tag 222 open state. 223 224 Otherwise: treat it as per the "anything else" entry below. */ 225 $state = 'tag open'; 226 227 /* U+003E GREATER-THAN SIGN (>) */ 228 } elseif ( 229 $char === '>' && 230 $gt_cond === true && 231 substr($lastFourChars, 1) === '-->' 232 ) { 233 /* If the content model flag is set to either the RCDATA state or 234 the CDATA state, and the escape flag is true, and the last three 235 characters in the input stream including this one are U+002D 236 HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"), 237 set the escape flag to false. */ 238 $escape = false; 239 240 /* In any case, emit the input character as a character token. 241 Stay in the data state. 
*/ 242 $this->emitToken(array( 243 'type' => self::CHARACTER, 244 'data' => '>' 245 )); 246 // We do the "any case" part as part of "anything else". 247 248 } elseif ($char === false) { 249 /* EOF 250 Emit an end-of-file token. */ 251 $state = null; 252 $this->tree->emitToken(array( 253 'type' => self::EOF 254 )); 255 256 } elseif ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { 257 // Directly after emitting a token you switch back to the "data 258 // state". At that point spaceCharacters are important so they are 259 // emitted separately. 260 $chars = $this->stream->charsWhile(self::WHITESPACE); 261 $this->emitToken(array( 262 'type' => self::SPACECHARACTER, 263 'data' => $char . $chars 264 )); 265 $lastFourChars .= $chars; 266 if (strlen($lastFourChars) > 4) { 267 $lastFourChars = substr($lastFourChars, -4); 268 } 269 } else { 270 /* Anything else 271 THIS IS AN OPTIMIZATION: Get as many character that 272 otherwise would also be treated as a character token and emit it 273 as a single character token. Stay in the data state. */ 274 275 $mask = ''; 276 if ($hyp_cond === true) { 277 $mask .= '-'; 278 } 279 if ($amp_cond === true) { 280 $mask .= '&'; 281 } 282 if ($lt_cond === true) { 283 $mask .= '<'; 284 } 285 if ($gt_cond === true) { 286 $mask .= '>'; 287 } 288 289 if ($mask === '') { 290 $chars = $this->stream->remainingChars(); 291 } else { 292 $chars = $this->stream->charsUntil($mask); 293 } 294 295 $this->emitToken(array( 296 'type' => self::CHARACTER, 297 'data' => $char . $chars 298 )); 299 300 $lastFourChars .= $chars; 301 if (strlen($lastFourChars) > 4) { 302 $lastFourChars = substr($lastFourChars, -4); 303 } 304 305 $state = 'data'; 306 } 307 break; 308 309 case 'character reference data': 310 /* (This cannot happen if the content model flag 311 is set to the CDATA state.) */ 312 313 /* Attempt to consume a character reference, with no 314 additional allowed character. 
*/ 315 $entity = $this->consumeCharacterReference(); 316 317 /* If nothing is returned, emit a U+0026 AMPERSAND 318 character token. Otherwise, emit the character token that 319 was returned. */ 320 // This is all done when consuming the character reference. 321 $this->emitToken(array( 322 'type' => self::CHARACTER, 323 'data' => $entity 324 )); 325 326 /* Finally, switch to the data state. */ 327 $state = 'data'; 328 break; 329 330 case 'tag open': 331 $char = $this->stream->char(); 332 333 switch ($this->content_model) { 334 case self::RCDATA: 335 case self::CDATA: 336 /* Consume the next input character. If it is a 337 U+002F SOLIDUS (/) character, switch to the close 338 tag open state. Otherwise, emit a U+003C LESS-THAN 339 SIGN character token and reconsume the current input 340 character in the data state. */ 341 // We consumed above. 342 343 if ($char === '/') { 344 $state = 'close tag open'; 345 } else { 346 $this->emitToken(array( 347 'type' => self::CHARACTER, 348 'data' => '<' 349 )); 350 351 $this->stream->unget(); 352 353 $state = 'data'; 354 } 355 break; 356 357 case self::PCDATA: 358 /* If the content model flag is set to the PCDATA state 359 Consume the next input character: */ 360 // We consumed above. 361 362 if ($char === '!') { 363 /* U+0021 EXCLAMATION MARK (!) 364 Switch to the markup declaration open state. */ 365 $state = 'markup declaration open'; 366 367 } elseif ($char === '/') { 368 /* U+002F SOLIDUS (/) 369 Switch to the close tag open state. */ 370 $state = 'close tag open'; 371 372 } elseif ('A' <= $char && $char <= 'Z') { 373 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z 374 Create a new start tag token, set its tag name to the lowercase 375 version of the input character (add 0x0020 to the character's code 376 point), then switch to the tag name state. (Don't emit the token 377 yet; further details will be filled in before it is emitted.) 
*/ 378 $this->token = array( 379 'name' => strtolower($char), 380 'type' => self::STARTTAG, 381 'attr' => array() 382 ); 383 384 $state = 'tag name'; 385 386 } elseif ('a' <= $char && $char <= 'z') { 387 /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z 388 Create a new start tag token, set its tag name to the input 389 character, then switch to the tag name state. (Don't emit 390 the token yet; further details will be filled in before it 391 is emitted.) */ 392 $this->token = array( 393 'name' => $char, 394 'type' => self::STARTTAG, 395 'attr' => array() 396 ); 397 398 $state = 'tag name'; 399 400 } elseif ($char === '>') { 401 /* U+003E GREATER-THAN SIGN (>) 402 Parse error. Emit a U+003C LESS-THAN SIGN character token and a 403 U+003E GREATER-THAN SIGN character token. Switch to the data state. */ 404 $this->emitToken(array( 405 'type' => self::PARSEERROR, 406 'data' => 'expected-tag-name-but-got-right-bracket' 407 )); 408 $this->emitToken(array( 409 'type' => self::CHARACTER, 410 'data' => '<>' 411 )); 412 413 $state = 'data'; 414 415 } elseif ($char === '?') { 416 /* U+003F QUESTION MARK (?) 417 Parse error. Switch to the bogus comment state. */ 418 $this->emitToken(array( 419 'type' => self::PARSEERROR, 420 'data' => 'expected-tag-name-but-got-question-mark' 421 )); 422 $this->token = array( 423 'data' => '?', 424 'type' => self::COMMENT 425 ); 426 $state = 'bogus comment'; 427 428 } else { 429 /* Anything else 430 Parse error. Emit a U+003C LESS-THAN SIGN character token and 431 reconsume the current input character in the data state. 
*/ 432 $this->emitToken(array( 433 'type' => self::PARSEERROR, 434 'data' => 'expected-tag-name' 435 )); 436 $this->emitToken(array( 437 'type' => self::CHARACTER, 438 'data' => '<' 439 )); 440 441 $state = 'data'; 442 $this->stream->unget(); 443 } 444 break; 445 } 446 break; 447 448 case 'close tag open': 449 if ( 450 $this->content_model === self::RCDATA || 451 $this->content_model === self::CDATA 452 ) { 453 /* If the content model flag is set to the RCDATA or CDATA 454 states... */ 455 $name = strtolower($this->stream->charsWhile(self::ALPHA)); 456 $following = $this->stream->char(); 457 $this->stream->unget(); 458 if ( 459 !$this->token || 460 $this->token['name'] !== $name || 461 $this->token['name'] === $name && !in_array($following, array("\x09", "\x0A", "\x0C", "\x20", "\x3E", "\x2F", false)) 462 ) { 463 /* if no start tag token has ever been emitted by this instance 464 of the tokenizer (fragment case), or, if the next few 465 characters do not match the tag name of the last start tag 466 token emitted (compared in an ASCII case-insensitive manner), 467 or if they do but they are not immediately followed by one of 468 the following characters: 469 470 * U+0009 CHARACTER TABULATION 471 * U+000A LINE FEED (LF) 472 * U+000C FORM FEED (FF) 473 * U+0020 SPACE 474 * U+003E GREATER-THAN SIGN (>) 475 * U+002F SOLIDUS (/) 476 * EOF 477 478 ...then emit a U+003C LESS-THAN SIGN character token, a 479 U+002F SOLIDUS character token, and switch to the data 480 state to process the next input character. */ 481 // XXX: Probably ought to replace in_array with $following === x ||... 482 483 // We also need to emit $name now we've consumed that, as we 484 // know it'll just be emitted as a character token. 485 $this->emitToken(array( 486 'type' => self::CHARACTER, 487 'data' => '</' . $name 488 )); 489 490 $state = 'data'; 491 } else { 492 // This matches what would happen if we actually did the 493 // otherwise below (but we can't because we've consumed too 494 // much). 
495 496 // Start the end tag token with the name we already have. 497 $this->token = array( 498 'name' => $name, 499 'type' => self::ENDTAG 500 ); 501 502 // Change to tag name state. 503 $state = 'tag name'; 504 } 505 } elseif ($this->content_model === self::PCDATA) { 506 /* Otherwise, if the content model flag is set to the PCDATA 507 state [...]: */ 508 $char = $this->stream->char(); 509 510 if ('A' <= $char && $char <= 'Z') { 511 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z 512 Create a new end tag token, set its tag name to the lowercase version 513 of the input character (add 0x0020 to the character's code point), then 514 switch to the tag name state. (Don't emit the token yet; further details 515 will be filled in before it is emitted.) */ 516 $this->token = array( 517 'name' => strtolower($char), 518 'type' => self::ENDTAG 519 ); 520 521 $state = 'tag name'; 522 523 } elseif ('a' <= $char && $char <= 'z') { 524 /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z 525 Create a new end tag token, set its tag name to the 526 input character, then switch to the tag name state. 527 (Don't emit the token yet; further details will be 528 filled in before it is emitted.) */ 529 $this->token = array( 530 'name' => $char, 531 'type' => self::ENDTAG 532 ); 533 534 $state = 'tag name'; 535 536 } elseif ($char === '>') { 537 /* U+003E GREATER-THAN SIGN (>) 538 Parse error. Switch to the data state. */ 539 $this->emitToken(array( 540 'type' => self::PARSEERROR, 541 'data' => 'expected-closing-tag-but-got-right-bracket' 542 )); 543 $state = 'data'; 544 545 } elseif ($char === false) { 546 /* EOF 547 Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F 548 SOLIDUS character token. Reconsume the EOF character in the data state. 
*/ 549 $this->emitToken(array( 550 'type' => self::PARSEERROR, 551 'data' => 'expected-closing-tag-but-got-eof' 552 )); 553 $this->emitToken(array( 554 'type' => self::CHARACTER, 555 'data' => '</' 556 )); 557 558 $this->stream->unget(); 559 $state = 'data'; 560 561 } else { 562 /* Parse error. Switch to the bogus comment state. */ 563 $this->emitToken(array( 564 'type' => self::PARSEERROR, 565 'data' => 'expected-closing-tag-but-got-char' 566 )); 567 $this->token = array( 568 'data' => $char, 569 'type' => self::COMMENT 570 ); 571 $state = 'bogus comment'; 572 } 573 } 574 break; 575 576 case 'tag name': 577 /* Consume the next input character: */ 578 $char = $this->stream->char(); 579 580 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { 581 /* U+0009 CHARACTER TABULATION 582 U+000A LINE FEED (LF) 583 U+000C FORM FEED (FF) 584 U+0020 SPACE 585 Switch to the before attribute name state. */ 586 $state = 'before attribute name'; 587 588 } elseif ($char === '/') { 589 /* U+002F SOLIDUS (/) 590 Switch to the self-closing start tag state. */ 591 $state = 'self-closing start tag'; 592 593 } elseif ($char === '>') { 594 /* U+003E GREATER-THAN SIGN (>) 595 Emit the current tag token. Switch to the data state. */ 596 $this->emitToken($this->token); 597 $state = 'data'; 598 599 } elseif ('A' <= $char && $char <= 'Z') { 600 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z 601 Append the lowercase version of the current input 602 character (add 0x0020 to the character's code point) to 603 the current tag token's tag name. Stay in the tag name state. */ 604 $chars = $this->stream->charsWhile(self::UPPER_ALPHA); 605 606 $this->token['name'] .= strtolower($char . $chars); 607 $state = 'tag name'; 608 609 } elseif ($char === false) { 610 /* EOF 611 Parse error. Reconsume the EOF character in the data state. 
*/ 612 $this->emitToken(array( 613 'type' => self::PARSEERROR, 614 'data' => 'eof-in-tag-name' 615 )); 616 617 $this->stream->unget(); 618 $state = 'data'; 619 620 } else { 621 /* Anything else 622 Append the current input character to the current tag token's tag name. 623 Stay in the tag name state. */ 624 $chars = $this->stream->charsUntil("\t\n\x0C />" . self::UPPER_ALPHA); 625 626 $this->token['name'] .= $char . $chars; 627 $state = 'tag name'; 628 } 629 break; 630 631 case 'before attribute name': 632 /* Consume the next input character: */ 633 $char = $this->stream->char(); 634 635 // this conditional is optimized, check bottom 636 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { 637 /* U+0009 CHARACTER TABULATION 638 U+000A LINE FEED (LF) 639 U+000C FORM FEED (FF) 640 U+0020 SPACE 641 Stay in the before attribute name state. */ 642 $state = 'before attribute name'; 643 644 } elseif ($char === '/') { 645 /* U+002F SOLIDUS (/) 646 Switch to the self-closing start tag state. */ 647 $state = 'self-closing start tag'; 648 649 } elseif ($char === '>') { 650 /* U+003E GREATER-THAN SIGN (>) 651 Emit the current tag token. Switch to the data state. */ 652 $this->emitToken($this->token); 653 $state = 'data'; 654 655 } elseif ('A' <= $char && $char <= 'Z') { 656 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z 657 Start a new attribute in the current tag token. Set that 658 attribute's name to the lowercase version of the current 659 input character (add 0x0020 to the character's code 660 point), and its value to the empty string. Switch to the 661 attribute name state.*/ 662 $this->token['attr'][] = array( 663 'name' => strtolower($char), 664 'value' => '' 665 ); 666 667 $state = 'attribute name'; 668 669 } elseif ($char === false) { 670 /* EOF 671 Parse error. Reconsume the EOF character in the data state. 
*/ 672 $this->emitToken(array( 673 'type' => self::PARSEERROR, 674 'data' => 'expected-attribute-name-but-got-eof' 675 )); 676 677 $this->stream->unget(); 678 $state = 'data'; 679 680 } else { 681 /* U+0022 QUOTATION MARK (") 682 U+0027 APOSTROPHE (') 683 U+003C LESS-THAN SIGN (<) 684 U+003D EQUALS SIGN (=) 685 Parse error. Treat it as per the "anything else" entry 686 below. */ 687 if ($char === '"' || $char === "'" || $char === '<' || $char === '=') { 688 $this->emitToken(array( 689 'type' => self::PARSEERROR, 690 'data' => 'invalid-character-in-attribute-name' 691 )); 692 } 693 694 /* Anything else 695 Start a new attribute in the current tag token. Set that attribute's 696 name to the current input character, and its value to the empty string. 697 Switch to the attribute name state. */ 698 $this->token['attr'][] = array( 699 'name' => $char, 700 'value' => '' 701 ); 702 703 $state = 'attribute name'; 704 } 705 break; 706 707 case 'attribute name': 708 // Consume the next input character: 709 $char = $this->stream->char(); 710 711 // this conditional is optimized, check bottom 712 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { 713 /* U+0009 CHARACTER TABULATION 714 U+000A LINE FEED (LF) 715 U+000C FORM FEED (FF) 716 U+0020 SPACE 717 Switch to the after attribute name state. */ 718 $state = 'after attribute name'; 719 720 } elseif ($char === '/') { 721 /* U+002F SOLIDUS (/) 722 Switch to the self-closing start tag state. */ 723 $state = 'self-closing start tag'; 724 725 } elseif ($char === '=') { 726 /* U+003D EQUALS SIGN (=) 727 Switch to the before attribute value state. */ 728 $state = 'before attribute value'; 729 730 } elseif ($char === '>') { 731 /* U+003E GREATER-THAN SIGN (>) 732 Emit the current tag token. Switch to the data state. 
*/ 733 $this->emitToken($this->token); 734 $state = 'data'; 735 736 } elseif ('A' <= $char && $char <= 'Z') { 737 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z 738 Append the lowercase version of the current input 739 character (add 0x0020 to the character's code point) to 740 the current attribute's name. Stay in the attribute name 741 state. */ 742 $chars = $this->stream->charsWhile(self::UPPER_ALPHA); 743 744 $last = count($this->token['attr']) - 1; 745 $this->token['attr'][$last]['name'] .= strtolower($char . $chars); 746 747 $state = 'attribute name'; 748 749 } elseif ($char === false) { 750 /* EOF 751 Parse error. Reconsume the EOF character in the data state. */ 752 $this->emitToken(array( 753 'type' => self::PARSEERROR, 754 'data' => 'eof-in-attribute-name' 755 )); 756 757 $this->stream->unget(); 758 $state = 'data'; 759 760 } else { 761 /* U+0022 QUOTATION MARK (") 762 U+0027 APOSTROPHE (') 763 U+003C LESS-THAN SIGN (<) 764 Parse error. Treat it as per the "anything else" 765 entry below. */ 766 if ($char === '"' || $char === "'" || $char === '<') { 767 $this->emitToken(array( 768 'type' => self::PARSEERROR, 769 'data' => 'invalid-character-in-attribute-name' 770 )); 771 } 772 773 /* Anything else 774 Append the current input character to the current attribute's name. 775 Stay in the attribute name state. */ 776 $chars = $this->stream->charsUntil("\t\n\x0C /=>\"'" . self::UPPER_ALPHA); 777 778 $last = count($this->token['attr']) - 1; 779 $this->token['attr'][$last]['name'] .= $char . 
$chars; 780 781 $state = 'attribute name'; 782 } 783 784 /* When the user agent leaves the attribute name state 785 (and before emitting the tag token, if appropriate), the 786 complete attribute's name must be compared to the other 787 attributes on the same token; if there is already an 788 attribute on the token with the exact same name, then this 789 is a parse error and the new attribute must be dropped, along 790 with the value that gets associated with it (if any). */ 791 // this might be implemented in the emitToken method 792 break; 793 794 case 'after attribute name': 795 // Consume the next input character: 796 $char = $this->stream->char(); 797 798 // this is an optimized conditional, check the bottom 799 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { 800 /* U+0009 CHARACTER TABULATION 801 U+000A LINE FEED (LF) 802 U+000C FORM FEED (FF) 803 U+0020 SPACE 804 Stay in the after attribute name state. */ 805 $state = 'after attribute name'; 806 807 } elseif ($char === '/') { 808 /* U+002F SOLIDUS (/) 809 Switch to the self-closing start tag state. */ 810 $state = 'self-closing start tag'; 811 812 } elseif ($char === '=') { 813 /* U+003D EQUALS SIGN (=) 814 Switch to the before attribute value state. */ 815 $state = 'before attribute value'; 816 817 } elseif ($char === '>') { 818 /* U+003E GREATER-THAN SIGN (>) 819 Emit the current tag token. Switch to the data state. */ 820 $this->emitToken($this->token); 821 $state = 'data'; 822 823 } elseif ('A' <= $char && $char <= 'Z') { 824 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z 825 Start a new attribute in the current tag token. Set that 826 attribute's name to the lowercase version of the current 827 input character (add 0x0020 to the character's code 828 point), and its value to the empty string. Switch to the 829 attribute name state. 
*/ 830 $this->token['attr'][] = array( 831 'name' => strtolower($char), 832 'value' => '' 833 ); 834 835 $state = 'attribute name'; 836 837 } elseif ($char === false) { 838 /* EOF 839 Parse error. Reconsume the EOF character in the data state. */ 840 $this->emitToken(array( 841 'type' => self::PARSEERROR, 842 'data' => 'expected-end-of-tag-but-got-eof' 843 )); 844 845 $this->stream->unget(); 846 $state = 'data'; 847 848 } else { 849 /* U+0022 QUOTATION MARK (") 850 U+0027 APOSTROPHE (') 851 U+003C LESS-THAN SIGN(<) 852 Parse error. Treat it as per the "anything else" 853 entry below. */ 854 if ($char === '"' || $char === "'" || $char === "<") { 855 $this->emitToken(array( 856 'type' => self::PARSEERROR, 857 'data' => 'invalid-character-after-attribute-name' 858 )); 859 } 860 861 /* Anything else 862 Start a new attribute in the current tag token. Set that attribute's 863 name to the current input character, and its value to the empty string. 864 Switch to the attribute name state. */ 865 $this->token['attr'][] = array( 866 'name' => $char, 867 'value' => '' 868 ); 869 870 $state = 'attribute name'; 871 } 872 break; 873 874 case 'before attribute value': 875 // Consume the next input character: 876 $char = $this->stream->char(); 877 878 // this is an optimized conditional 879 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { 880 /* U+0009 CHARACTER TABULATION 881 U+000A LINE FEED (LF) 882 U+000C FORM FEED (FF) 883 U+0020 SPACE 884 Stay in the before attribute value state. */ 885 $state = 'before attribute value'; 886 887 } elseif ($char === '"') { 888 /* U+0022 QUOTATION MARK (") 889 Switch to the attribute value (double-quoted) state. */ 890 $state = 'attribute value (double-quoted)'; 891 892 } elseif ($char === '&') { 893 /* U+0026 AMPERSAND (&) 894 Switch to the attribute value (unquoted) state and reconsume 895 this input character. 
*/ 896 $this->stream->unget(); 897 $state = 'attribute value (unquoted)'; 898 899 } elseif ($char === '\'') { 900 /* U+0027 APOSTROPHE (') 901 Switch to the attribute value (single-quoted) state. */ 902 $state = 'attribute value (single-quoted)'; 903 904 } elseif ($char === '>') { 905 /* U+003E GREATER-THAN SIGN (>) 906 Parse error. Emit the current tag token. Switch to the data state. */ 907 $this->emitToken(array( 908 'type' => self::PARSEERROR, 909 'data' => 'expected-attribute-value-but-got-right-bracket' 910 )); 911 $this->emitToken($this->token); 912 $state = 'data'; 913 914 } elseif ($char === false) { 915 /* EOF 916 Parse error. Reconsume the EOF character in the data state. */ 917 $this->emitToken(array( 918 'type' => self::PARSEERROR, 919 'data' => 'expected-attribute-value-but-got-eof' 920 )); 921 $this->stream->unget(); 922 $state = 'data'; 923 924 } else { 925 /* U+003D EQUALS SIGN (=) 926 * U+003C LESS-THAN SIGN (<) 927 Parse error. Treat it as per the "anything else" entry below. */ 928 if ($char === '=' || $char === '<') { 929 $this->emitToken(array( 930 'type' => self::PARSEERROR, 931 'data' => 'equals-in-unquoted-attribute-value' 932 )); 933 } 934 935 /* Anything else 936 Append the current input character to the current attribute's value. 937 Switch to the attribute value (unquoted) state. */ 938 $last = count($this->token['attr']) - 1; 939 $this->token['attr'][$last]['value'] .= $char; 940 941 $state = 'attribute value (unquoted)'; 942 } 943 break; 944 945 case 'attribute value (double-quoted)': 946 // Consume the next input character: 947 $char = $this->stream->char(); 948 949 if ($char === '"') { 950 /* U+0022 QUOTATION MARK (") 951 Switch to the after attribute value (quoted) state. */ 952 $state = 'after attribute value (quoted)'; 953 954 } elseif ($char === '&') { 955 /* U+0026 AMPERSAND (&) 956 Switch to the character reference in attribute value 957 state, with the additional allowed character 958 being U+0022 QUOTATION MARK ("). 
*/ 959 $this->characterReferenceInAttributeValue('"'); 960 961 } elseif ($char === false) { 962 /* EOF 963 Parse error. Reconsume the EOF character in the data state. */ 964 $this->emitToken(array( 965 'type' => self::PARSEERROR, 966 'data' => 'eof-in-attribute-value-double-quote' 967 )); 968 969 $this->stream->unget(); 970 $state = 'data'; 971 972 } else { 973 /* Anything else 974 Append the current input character to the current attribute's value. 975 Stay in the attribute value (double-quoted) state. */ 976 $chars = $this->stream->charsUntil('"&'); 977 978 $last = count($this->token['attr']) - 1; 979 $this->token['attr'][$last]['value'] .= $char . $chars; 980 981 $state = 'attribute value (double-quoted)'; 982 } 983 break; 984 985 case 'attribute value (single-quoted)': 986 // Consume the next input character: 987 $char = $this->stream->char(); 988 989 if ($char === "'") { 990 /* U+0022 QUOTATION MARK (') 991 Switch to the after attribute value state. */ 992 $state = 'after attribute value (quoted)'; 993 994 } elseif ($char === '&') { 995 /* U+0026 AMPERSAND (&) 996 Switch to the entity in attribute value state. */ 997 $this->characterReferenceInAttributeValue("'"); 998 999 } elseif ($char === false) { 1000 /* EOF 1001 Parse error. Reconsume the EOF character in the data state. */ 1002 $this->emitToken(array( 1003 'type' => self::PARSEERROR, 1004 'data' => 'eof-in-attribute-value-single-quote' 1005 )); 1006 1007 $this->stream->unget(); 1008 $state = 'data'; 1009 1010 } else { 1011 /* Anything else 1012 Append the current input character to the current attribute's value. 1013 Stay in the attribute value (single-quoted) state. */ 1014 $chars = $this->stream->charsUntil("'&"); 1015 1016 $last = count($this->token['attr']) - 1; 1017 $this->token['attr'][$last]['value'] .= $char . 
$chars; 1018 1019 $state = 'attribute value (single-quoted)'; 1020 } 1021 break; 1022 1023 case 'attribute value (unquoted)': 1024 // Consume the next input character: 1025 $char = $this->stream->char(); 1026 1027 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { 1028 /* U+0009 CHARACTER TABULATION 1029 U+000A LINE FEED (LF) 1030 U+000C FORM FEED (FF) 1031 U+0020 SPACE 1032 Switch to the before attribute name state. */ 1033 $state = 'before attribute name'; 1034 1035 } elseif ($char === '&') { 1036 /* U+0026 AMPERSAND (&) 1037 Switch to the entity in attribute value state, with the 1038 additional allowed character being U+003E 1039 GREATER-THAN SIGN (>). */ 1040 $this->characterReferenceInAttributeValue('>'); 1041 1042 } elseif ($char === '>') { 1043 /* U+003E GREATER-THAN SIGN (>) 1044 Emit the current tag token. Switch to the data state. */ 1045 $this->emitToken($this->token); 1046 $state = 'data'; 1047 1048 } elseif ($char === false) { 1049 /* EOF 1050 Parse error. Reconsume the EOF character in the data state. */ 1051 $this->emitToken(array( 1052 'type' => self::PARSEERROR, 1053 'data' => 'eof-in-attribute-value-no-quotes' 1054 )); 1055 $this->stream->unget(); 1056 $state = 'data'; 1057 1058 } else { 1059 /* U+0022 QUOTATION MARK (") 1060 U+0027 APOSTROPHE (') 1061 U+003C LESS-THAN SIGN (<) 1062 U+003D EQUALS SIGN (=) 1063 Parse error. Treat it as per the "anything else" 1064 entry below. */ 1065 if ($char === '"' || $char === "'" || $char === '=' || $char == '<') { 1066 $this->emitToken(array( 1067 'type' => self::PARSEERROR, 1068 'data' => 'unexpected-character-in-unquoted-attribute-value' 1069 )); 1070 } 1071 1072 /* Anything else 1073 Append the current input character to the current attribute's value. 1074 Stay in the attribute value (unquoted) state. */ 1075 $chars = $this->stream->charsUntil("\t\n\x0c &>\"'="); 1076 1077 $last = count($this->token['attr']) - 1; 1078 $this->token['attr'][$last]['value'] .= $char . 
$chars; 1079 1080 $state = 'attribute value (unquoted)'; 1081 } 1082 break; 1083 1084 case 'after attribute value (quoted)': 1085 /* Consume the next input character: */ 1086 $char = $this->stream->char(); 1087 1088 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { 1089 /* U+0009 CHARACTER TABULATION 1090 U+000A LINE FEED (LF) 1091 U+000C FORM FEED (FF) 1092 U+0020 SPACE 1093 Switch to the before attribute name state. */ 1094 $state = 'before attribute name'; 1095 1096 } elseif ($char === '/') { 1097 /* U+002F SOLIDUS (/) 1098 Switch to the self-closing start tag state. */ 1099 $state = 'self-closing start tag'; 1100 1101 } elseif ($char === '>') { 1102 /* U+003E GREATER-THAN SIGN (>) 1103 Emit the current tag token. Switch to the data state. */ 1104 $this->emitToken($this->token); 1105 $state = 'data'; 1106 1107 } elseif ($char === false) { 1108 /* EOF 1109 Parse error. Reconsume the EOF character in the data state. */ 1110 $this->emitToken(array( 1111 'type' => self::PARSEERROR, 1112 'data' => 'unexpected-EOF-after-attribute-value' 1113 )); 1114 $this->stream->unget(); 1115 $state = 'data'; 1116 1117 } else { 1118 /* Anything else 1119 Parse error. Reconsume the character in the before attribute 1120 name state. */ 1121 $this->emitToken(array( 1122 'type' => self::PARSEERROR, 1123 'data' => 'unexpected-character-after-attribute-value' 1124 )); 1125 $this->stream->unget(); 1126 $state = 'before attribute name'; 1127 } 1128 break; 1129 1130 case 'self-closing start tag': 1131 /* Consume the next input character: */ 1132 $char = $this->stream->char(); 1133 1134 if ($char === '>') { 1135 /* U+003E GREATER-THAN SIGN (>) 1136 Set the self-closing flag of the current tag token. 1137 Emit the current tag token. Switch to the data state. 
*/ 1138 // not sure if this is the name we want 1139 $this->token['self-closing'] = true; 1140 $this->emitToken($this->token); 1141 $state = 'data'; 1142 1143 } elseif ($char === false) { 1144 /* EOF 1145 Parse error. Reconsume the EOF character in the data state. */ 1146 $this->emitToken(array( 1147 'type' => self::PARSEERROR, 1148 'data' => 'unexpected-eof-after-self-closing' 1149 )); 1150 $this->stream->unget(); 1151 $state = 'data'; 1152 1153 } else { 1154 /* Anything else 1155 Parse error. Reconsume the character in the before attribute name state. */ 1156 $this->emitToken(array( 1157 'type' => self::PARSEERROR, 1158 'data' => 'unexpected-character-after-self-closing' 1159 )); 1160 $this->stream->unget(); 1161 $state = 'before attribute name'; 1162 } 1163 break; 1164 1165 case 'bogus comment': 1166 /* (This can only happen if the content model flag is set to the PCDATA state.) */ 1167 /* Consume every character up to the first U+003E GREATER-THAN SIGN 1168 character (>) or the end of the file (EOF), whichever comes first. Emit 1169 a comment token whose data is the concatenation of all the characters 1170 starting from and including the character that caused the state machine 1171 to switch into the bogus comment state, up to and including the last 1172 consumed character before the U+003E character, if any, or up to the 1173 end of the file otherwise. (If the comment was started by the end of 1174 the file (EOF), the token is empty.) */ 1175 $this->token['data'] .= (string) $this->stream->charsUntil('>'); 1176 $this->stream->char(); 1177 1178 $this->emitToken($this->token); 1179 1180 /* Switch to the data state. 
*/ 1181 $state = 'data'; 1182 break; 1183 1184 case 'markup declaration open': 1185 // Consume for below 1186 $hyphens = $this->stream->charsWhile('-', 2); 1187 if ($hyphens === '-') { 1188 $this->stream->unget(); 1189 } 1190 if ($hyphens !== '--') { 1191 $alpha = $this->stream->charsWhile(self::ALPHA, 7); 1192 } 1193 1194 /* If the next two characters are both U+002D HYPHEN-MINUS (-) 1195 characters, consume those two characters, create a comment token whose 1196 data is the empty string, and switch to the comment state. */ 1197 if ($hyphens === '--') { 1198 $state = 'comment start'; 1199 $this->token = array( 1200 'data' => '', 1201 'type' => self::COMMENT 1202 ); 1203 1204 /* Otherwise if the next seven characters are a case-insensitive match 1205 for the word "DOCTYPE", then consume those characters and switch to the 1206 DOCTYPE state. */ 1207 } elseif (strtoupper($alpha) === 'DOCTYPE') { 1208 $state = 'DOCTYPE'; 1209 1210 // XXX not implemented 1211 /* Otherwise, if the insertion mode is "in foreign content" 1212 and the current node is not an element in the HTML namespace 1213 and the next seven characters are an ASCII case-sensitive 1214 match for the string "[CDATA[" (the five uppercase letters 1215 "CDATA" with a U+005B LEFT SQUARE BRACKET character before 1216 and after), then consume those characters and switch to the 1217 CDATA section state (which is unrelated to the content model 1218 flag's CDATA state). */ 1219 1220 /* Otherwise, is is a parse error. Switch to the bogus comment state. 1221 The next character that is consumed, if any, is the first character 1222 that will be in the comment. 
*/ 1223 } else { 1224 $this->emitToken(array( 1225 'type' => self::PARSEERROR, 1226 'data' => 'expected-dashes-or-doctype' 1227 )); 1228 $this->token = array( 1229 'data' => (string) $alpha, 1230 'type' => self::COMMENT 1231 ); 1232 $state = 'bogus comment'; 1233 } 1234 break; 1235 1236 case 'comment start': 1237 /* Consume the next input character: */ 1238 $char = $this->stream->char(); 1239 1240 if ($char === '-') { 1241 /* U+002D HYPHEN-MINUS (-) 1242 Switch to the comment start dash state. */ 1243 $state = 'comment start dash'; 1244 } elseif ($char === '>') { 1245 /* U+003E GREATER-THAN SIGN (>) 1246 Parse error. Emit the comment token. Switch to the 1247 data state. */ 1248 $this->emitToken(array( 1249 'type' => self::PARSEERROR, 1250 'data' => 'incorrect-comment' 1251 )); 1252 $this->emitToken($this->token); 1253 $state = 'data'; 1254 } elseif ($char === false) { 1255 /* EOF 1256 Parse error. Emit the comment token. Reconsume the 1257 EOF character in the data state. */ 1258 $this->emitToken(array( 1259 'type' => self::PARSEERROR, 1260 'data' => 'eof-in-comment' 1261 )); 1262 $this->emitToken($this->token); 1263 $this->stream->unget(); 1264 $state = 'data'; 1265 } else { 1266 /* Anything else 1267 Append the input character to the comment token's 1268 data. Switch to the comment state. */ 1269 $this->token['data'] .= $char; 1270 $state = 'comment'; 1271 } 1272 break; 1273 1274 case 'comment start dash': 1275 /* Consume the next input character: */ 1276 $char = $this->stream->char(); 1277 if ($char === '-') { 1278 /* U+002D HYPHEN-MINUS (-) 1279 Switch to the comment end state */ 1280 $state = 'comment end'; 1281 } elseif ($char === '>') { 1282 /* U+003E GREATER-THAN SIGN (>) 1283 Parse error. Emit the comment token. Switch to the 1284 data state. 
*/ 1285 $this->emitToken(array( 1286 'type' => self::PARSEERROR, 1287 'data' => 'incorrect-comment' 1288 )); 1289 $this->emitToken($this->token); 1290 $state = 'data'; 1291 } elseif ($char === false) { 1292 /* Parse error. Emit the comment token. Reconsume the 1293 EOF character in the data state. */ 1294 $this->emitToken(array( 1295 'type' => self::PARSEERROR, 1296 'data' => 'eof-in-comment' 1297 )); 1298 $this->emitToken($this->token); 1299 $this->stream->unget(); 1300 $state = 'data'; 1301 } else { 1302 $this->token['data'] .= '-' . $char; 1303 $state = 'comment'; 1304 } 1305 break; 1306 1307 case 'comment': 1308 /* Consume the next input character: */ 1309 $char = $this->stream->char(); 1310 1311 if ($char === '-') { 1312 /* U+002D HYPHEN-MINUS (-) 1313 Switch to the comment end dash state */ 1314 $state = 'comment end dash'; 1315 1316 } elseif ($char === false) { 1317 /* EOF 1318 Parse error. Emit the comment token. Reconsume the EOF character 1319 in the data state. */ 1320 $this->emitToken(array( 1321 'type' => self::PARSEERROR, 1322 'data' => 'eof-in-comment' 1323 )); 1324 $this->emitToken($this->token); 1325 $this->stream->unget(); 1326 $state = 'data'; 1327 1328 } else { 1329 /* Anything else 1330 Append the input character to the comment token's data. Stay in 1331 the comment state. */ 1332 $chars = $this->stream->charsUntil('-'); 1333 1334 $this->token['data'] .= $char . $chars; 1335 } 1336 break; 1337 1338 case 'comment end dash': 1339 /* Consume the next input character: */ 1340 $char = $this->stream->char(); 1341 1342 if ($char === '-') { 1343 /* U+002D HYPHEN-MINUS (-) 1344 Switch to the comment end state */ 1345 $state = 'comment end'; 1346 1347 } elseif ($char === false) { 1348 /* EOF 1349 Parse error. Emit the comment token. Reconsume the EOF character 1350 in the data state. 
*/ 1351 $this->emitToken(array( 1352 'type' => self::PARSEERROR, 1353 'data' => 'eof-in-comment-end-dash' 1354 )); 1355 $this->emitToken($this->token); 1356 $this->stream->unget(); 1357 $state = 'data'; 1358 1359 } else { 1360 /* Anything else 1361 Append a U+002D HYPHEN-MINUS (-) character and the input 1362 character to the comment token's data. Switch to the comment state. */ 1363 $this->token['data'] .= '-'.$char; 1364 $state = 'comment'; 1365 } 1366 break; 1367 1368 case 'comment end': 1369 /* Consume the next input character: */ 1370 $char = $this->stream->char(); 1371 1372 if ($char === '>') { 1373 /* U+003E GREATER-THAN SIGN (>) 1374 Emit the comment token. Switch to the data state. */ 1375 $this->emitToken($this->token); 1376 $state = 'data'; 1377 1378 } elseif ($char === '-') { 1379 /* U+002D HYPHEN-MINUS (-) 1380 Parse error. Append a U+002D HYPHEN-MINUS (-) character 1381 to the comment token's data. Stay in the comment end 1382 state. */ 1383 $this->emitToken(array( 1384 'type' => self::PARSEERROR, 1385 'data' => 'unexpected-dash-after-double-dash-in-comment' 1386 )); 1387 $this->token['data'] .= '-'; 1388 1389 } elseif ($char === "\t" || $char === "\n" || $char === "\x0a" || $char === ' ') { 1390 $this->emitToken(array( 1391 'type' => self::PARSEERROR, 1392 'data' => 'unexpected-space-after-double-dash-in-comment' 1393 )); 1394 $this->token['data'] .= '--' . $char; 1395 $state = 'comment end space'; 1396 1397 } elseif ($char === '!') { 1398 $this->emitToken(array( 1399 'type' => self::PARSEERROR, 1400 'data' => 'unexpected-bang-after-double-dash-in-comment' 1401 )); 1402 $state = 'comment end bang'; 1403 1404 } elseif ($char === false) { 1405 /* EOF 1406 Parse error. Emit the comment token. Reconsume the 1407 EOF character in the data state. 
*/ 1408 $this->emitToken(array( 1409 'type' => self::PARSEERROR, 1410 'data' => 'eof-in-comment-double-dash' 1411 )); 1412 $this->emitToken($this->token); 1413 $this->stream->unget(); 1414 $state = 'data'; 1415 1416 } else { 1417 /* Anything else 1418 Parse error. Append two U+002D HYPHEN-MINUS (-) 1419 characters and the input character to the comment token's 1420 data. Switch to the comment state. */ 1421 $this->emitToken(array( 1422 'type' => self::PARSEERROR, 1423 'data' => 'unexpected-char-in-comment' 1424 )); 1425 $this->token['data'] .= '--'.$char; 1426 $state = 'comment'; 1427 } 1428 break; 1429 1430 case 'comment end bang': 1431 $char = $this->stream->char(); 1432 if ($char === '>') { 1433 $this->emitToken($this->token); 1434 $state = 'data'; 1435 } elseif ($char === "-") { 1436 $this->token['data'] .= '--!'; 1437 $state = 'comment end dash'; 1438 } elseif ($char === false) { 1439 $this->emitToken(array( 1440 'type' => self::PARSEERROR, 1441 'data' => 'eof-in-comment-end-bang' 1442 )); 1443 $this->emitToken($this->token); 1444 $this->stream->unget(); 1445 $state = 'data'; 1446 } else { 1447 $this->token['data'] .= '--!' . 
$char; 1448 $state = 'comment'; 1449 } 1450 break; 1451 1452 case 'comment end space': 1453 $char = $this->stream->char(); 1454 if ($char === '>') { 1455 $this->emitToken($this->token); 1456 $state = 'data'; 1457 } elseif ($char === '-') { 1458 $state = 'comment end dash'; 1459 } elseif ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { 1460 $this->token['data'] .= $char; 1461 } elseif ($char === false) { 1462 $this->emitToken(array( 1463 'type' => self::PARSEERROR, 1464 'data' => 'unexpected-eof-in-comment-end-space', 1465 )); 1466 $this->emitToken($this->token); 1467 $this->stream->unget(); 1468 $state = 'data'; 1469 } else { 1470 $this->token['data'] .= $char; 1471 $state = 'comment'; 1472 } 1473 break; 1474 1475 case 'DOCTYPE': 1476 /* Consume the next input character: */ 1477 $char = $this->stream->char(); 1478 1479 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { 1480 /* U+0009 CHARACTER TABULATION 1481 U+000A LINE FEED (LF) 1482 U+000C FORM FEED (FF) 1483 U+0020 SPACE 1484 Switch to the before DOCTYPE name state. */ 1485 $state = 'before DOCTYPE name'; 1486 1487 } elseif ($char === false) { 1488 /* EOF 1489 Parse error. Create a new DOCTYPE token. Set its 1490 force-quirks flag to on. Emit the token. Reconsume the 1491 EOF character in the data state. */ 1492 $this->emitToken(array( 1493 'type' => self::PARSEERROR, 1494 'data' => 'need-space-after-doctype-but-got-eof' 1495 )); 1496 $this->emitToken(array( 1497 'name' => '', 1498 'type' => self::DOCTYPE, 1499 'force-quirks' => true, 1500 'error' => true 1501 )); 1502 $this->stream->unget(); 1503 $state = 'data'; 1504 1505 } else { 1506 /* Anything else 1507 Parse error. Reconsume the current character in the 1508 before DOCTYPE name state. 
*/ 1509 $this->emitToken(array( 1510 'type' => self::PARSEERROR, 1511 'data' => 'need-space-after-doctype' 1512 )); 1513 $this->stream->unget(); 1514 $state = 'before DOCTYPE name'; 1515 } 1516 break; 1517 1518 case 'before DOCTYPE name': 1519 /* Consume the next input character: */ 1520 $char = $this->stream->char(); 1521 1522 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { 1523 /* U+0009 CHARACTER TABULATION 1524 U+000A LINE FEED (LF) 1525 U+000C FORM FEED (FF) 1526 U+0020 SPACE 1527 Stay in the before DOCTYPE name state. */ 1528 1529 } elseif ($char === '>') { 1530 /* U+003E GREATER-THAN SIGN (>) 1531 Parse error. Create a new DOCTYPE token. Set its 1532 force-quirks flag to on. Emit the token. Switch to the 1533 data state. */ 1534 $this->emitToken(array( 1535 'type' => self::PARSEERROR, 1536 'data' => 'expected-doctype-name-but-got-right-bracket' 1537 )); 1538 $this->emitToken(array( 1539 'name' => '', 1540 'type' => self::DOCTYPE, 1541 'force-quirks' => true, 1542 'error' => true 1543 )); 1544 1545 $state = 'data'; 1546 1547 } elseif ('A' <= $char && $char <= 'Z') { 1548 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z 1549 Create a new DOCTYPE token. Set the token's name to the 1550 lowercase version of the input character (add 0x0020 to 1551 the character's code point). Switch to the DOCTYPE name 1552 state. */ 1553 $this->token = array( 1554 'name' => strtolower($char), 1555 'type' => self::DOCTYPE, 1556 'error' => true 1557 ); 1558 1559 $state = 'DOCTYPE name'; 1560 1561 } elseif ($char === false) { 1562 /* EOF 1563 Parse error. Create a new DOCTYPE token. Set its 1564 force-quirks flag to on. Emit the token. Reconsume the 1565 EOF character in the data state. 
*/ 1566 $this->emitToken(array( 1567 'type' => self::PARSEERROR, 1568 'data' => 'expected-doctype-name-but-got-eof' 1569 )); 1570 $this->emitToken(array( 1571 'name' => '', 1572 'type' => self::DOCTYPE, 1573 'force-quirks' => true, 1574 'error' => true 1575 )); 1576 1577 $this->stream->unget(); 1578 $state = 'data'; 1579 1580 } else { 1581 /* Anything else 1582 Create a new DOCTYPE token. Set the token's name to the 1583 current input character. Switch to the DOCTYPE name state. */ 1584 $this->token = array( 1585 'name' => $char, 1586 'type' => self::DOCTYPE, 1587 'error' => true 1588 ); 1589 1590 $state = 'DOCTYPE name'; 1591 } 1592 break; 1593 1594 case 'DOCTYPE name': 1595 /* Consume the next input character: */ 1596 $char = $this->stream->char(); 1597 1598 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { 1599 /* U+0009 CHARACTER TABULATION 1600 U+000A LINE FEED (LF) 1601 U+000C FORM FEED (FF) 1602 U+0020 SPACE 1603 Switch to the after DOCTYPE name state. */ 1604 $state = 'after DOCTYPE name'; 1605 1606 } elseif ($char === '>') { 1607 /* U+003E GREATER-THAN SIGN (>) 1608 Emit the current DOCTYPE token. Switch to the data state. */ 1609 $this->emitToken($this->token); 1610 $state = 'data'; 1611 1612 } elseif ('A' <= $char && $char <= 'Z') { 1613 /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z 1614 Append the lowercase version of the input character 1615 (add 0x0020 to the character's code point) to the current 1616 DOCTYPE token's name. Stay in the DOCTYPE name state. */ 1617 $this->token['name'] .= strtolower($char); 1618 1619 } elseif ($char === false) { 1620 /* EOF 1621 Parse error. Set the DOCTYPE token's force-quirks flag 1622 to on. Emit that DOCTYPE token. Reconsume the EOF 1623 character in the data state. 
*/ 1624 $this->emitToken(array( 1625 'type' => self::PARSEERROR, 1626 'data' => 'eof-in-doctype-name' 1627 )); 1628 $this->token['force-quirks'] = true; 1629 $this->emitToken($this->token); 1630 $this->stream->unget(); 1631 $state = 'data'; 1632 1633 } else { 1634 /* Anything else 1635 Append the current input character to the current 1636 DOCTYPE token's name. Stay in the DOCTYPE name state. */ 1637 $this->token['name'] .= $char; 1638 } 1639 1640 // XXX this is probably some sort of quirks mode designation, 1641 // check tree-builder to be sure. In general 'error' needs 1642 // to be specc'ified, this probably means removing it at the end 1643 $this->token['error'] = ($this->token['name'] === 'HTML') 1644 ? false 1645 : true; 1646 break; 1647 1648 case 'after DOCTYPE name': 1649 /* Consume the next input character: */ 1650 $char = $this->stream->char(); 1651 1652 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { 1653 /* U+0009 CHARACTER TABULATION 1654 U+000A LINE FEED (LF) 1655 U+000C FORM FEED (FF) 1656 U+0020 SPACE 1657 Stay in the after DOCTYPE name state. */ 1658 1659 } elseif ($char === '>') { 1660 /* U+003E GREATER-THAN SIGN (>) 1661 Emit the current DOCTYPE token. Switch to the data state. */ 1662 $this->emitToken($this->token); 1663 $state = 'data'; 1664 1665 } elseif ($char === false) { 1666 /* EOF 1667 Parse error. Set the DOCTYPE token's force-quirks flag 1668 to on. Emit that DOCTYPE token. Reconsume the EOF 1669 character in the data state. */ 1670 $this->emitToken(array( 1671 'type' => self::PARSEERROR, 1672 'data' => 'eof-in-doctype' 1673 )); 1674 $this->token['force-quirks'] = true; 1675 $this->emitToken($this->token); 1676 $this->stream->unget(); 1677 $state = 'data'; 1678 1679 } else { 1680 /* Anything else */ 1681 1682 $nextSix = strtoupper($char . 
$this->stream->charsWhile(self::ALPHA, 5)); 1683 if ($nextSix === 'PUBLIC') { 1684 /* If the next six characters are an ASCII 1685 case-insensitive match for the word "PUBLIC", then 1686 consume those characters and switch to the before 1687 DOCTYPE public identifier state. */ 1688 $state = 'before DOCTYPE public identifier'; 1689 1690 } elseif ($nextSix === 'SYSTEM') { 1691 /* Otherwise, if the next six characters are an ASCII 1692 case-insensitive match for the word "SYSTEM", then 1693 consume those characters and switch to the before 1694 DOCTYPE system identifier state. */ 1695 $state = 'before DOCTYPE system identifier'; 1696 1697 } else { 1698 /* Otherwise, this is the parse error. Set the DOCTYPE 1699 token's force-quirks flag to on. Switch to the bogus 1700 DOCTYPE state. */ 1701 $this->emitToken(array( 1702 'type' => self::PARSEERROR, 1703 'data' => 'expected-space-or-right-bracket-in-doctype' 1704 )); 1705 $this->token['force-quirks'] = true; 1706 $this->token['error'] = true; 1707 $state = 'bogus DOCTYPE'; 1708 } 1709 } 1710 break; 1711 1712 case 'before DOCTYPE public identifier': 1713 /* Consume the next input character: */ 1714 $char = $this->stream->char(); 1715 1716 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { 1717 /* U+0009 CHARACTER TABULATION 1718 U+000A LINE FEED (LF) 1719 U+000C FORM FEED (FF) 1720 U+0020 SPACE 1721 Stay in the before DOCTYPE public identifier state. */ 1722 } elseif ($char === '"') { 1723 /* U+0022 QUOTATION MARK (") 1724 Set the DOCTYPE token's public identifier to the empty 1725 string (not missing), then switch to the DOCTYPE public 1726 identifier (double-quoted) state. */ 1727 $this->token['public'] = ''; 1728 $state = 'DOCTYPE public identifier (double-quoted)'; 1729 } elseif ($char === "'") { 1730 /* U+0027 APOSTROPHE (') 1731 Set the DOCTYPE token's public identifier to the empty 1732 string (not missing), then switch to the DOCTYPE public 1733 identifier (single-quoted) state. 
*/ 1734 $this->token['public'] = ''; 1735 $state = 'DOCTYPE public identifier (single-quoted)'; 1736 } elseif ($char === '>') { 1737 /* Parse error. Set the DOCTYPE token's force-quirks flag 1738 to on. Emit that DOCTYPE token. Switch to the data state. */ 1739 $this->emitToken(array( 1740 'type' => self::PARSEERROR, 1741 'data' => 'unexpected-end-of-doctype' 1742 )); 1743 $this->token['force-quirks'] = true; 1744 $this->emitToken($this->token); 1745 $state = 'data'; 1746 } elseif ($char === false) { 1747 /* Parse error. Set the DOCTYPE token's force-quirks 1748 flag to on. Emit that DOCTYPE token. Reconsume the EOF 1749 character in the data state. */ 1750 $this->emitToken(array( 1751 'type' => self::PARSEERROR, 1752 'data' => 'eof-in-doctype' 1753 )); 1754 $this->token['force-quirks'] = true; 1755 $this->emitToken($this->token); 1756 $this->stream->unget(); 1757 $state = 'data'; 1758 } else { 1759 /* Parse error. Set the DOCTYPE token's force-quirks flag 1760 to on. Switch to the bogus DOCTYPE state. */ 1761 $this->emitToken(array( 1762 'type' => self::PARSEERROR, 1763 'data' => 'unexpected-char-in-doctype' 1764 )); 1765 $this->token['force-quirks'] = true; 1766 $state = 'bogus DOCTYPE'; 1767 } 1768 break; 1769 1770 case 'DOCTYPE public identifier (double-quoted)': 1771 /* Consume the next input character: */ 1772 $char = $this->stream->char(); 1773 1774 if ($char === '"') { 1775 /* U+0022 QUOTATION MARK (") 1776 Switch to the after DOCTYPE public identifier state. */ 1777 $state = 'after DOCTYPE public identifier'; 1778 } elseif ($char === '>') { 1779 /* U+003E GREATER-THAN SIGN (>) 1780 Parse error. Set the DOCTYPE token's force-quirks flag 1781 to on. Emit that DOCTYPE token. Switch to the data state. 
*/ 1782 $this->emitToken(array( 1783 'type' => self::PARSEERROR, 1784 'data' => 'unexpected-end-of-doctype' 1785 )); 1786 $this->token['force-quirks'] = true; 1787 $this->emitToken($this->token); 1788 $state = 'data'; 1789 } elseif ($char === false) { 1790 /* EOF 1791 Parse error. Set the DOCTYPE token's force-quirks flag 1792 to on. Emit that DOCTYPE token. Reconsume the EOF 1793 character in the data state. */ 1794 $this->emitToken(array( 1795 'type' => self::PARSEERROR, 1796 'data' => 'eof-in-doctype' 1797 )); 1798 $this->token['force-quirks'] = true; 1799 $this->emitToken($this->token); 1800 $this->stream->unget(); 1801 $state = 'data'; 1802 } else { 1803 /* Anything else 1804 Append the current input character to the current 1805 DOCTYPE token's public identifier. Stay in the DOCTYPE 1806 public identifier (double-quoted) state. */ 1807 $this->token['public'] .= $char; 1808 } 1809 break; 1810 1811 case 'DOCTYPE public identifier (single-quoted)': 1812 /* Consume the next input character: */ 1813 $char = $this->stream->char(); 1814 1815 if ($char === "'") { 1816 /* U+0027 APOSTROPHE (') 1817 Switch to the after DOCTYPE public identifier state. */ 1818 $state = 'after DOCTYPE public identifier'; 1819 } elseif ($char === '>') { 1820 /* U+003E GREATER-THAN SIGN (>) 1821 Parse error. Set the DOCTYPE token's force-quirks flag 1822 to on. Emit that DOCTYPE token. Switch to the data state. */ 1823 $this->emitToken(array( 1824 'type' => self::PARSEERROR, 1825 'data' => 'unexpected-end-of-doctype' 1826 )); 1827 $this->token['force-quirks'] = true; 1828 $this->emitToken($this->token); 1829 $state = 'data'; 1830 } elseif ($char === false) { 1831 /* EOF 1832 Parse error. Set the DOCTYPE token's force-quirks flag 1833 to on. Emit that DOCTYPE token. Reconsume the EOF 1834 character in the data state. 
*/ 1835 $this->emitToken(array( 1836 'type' => self::PARSEERROR, 1837 'data' => 'eof-in-doctype' 1838 )); 1839 $this->token['force-quirks'] = true; 1840 $this->emitToken($this->token); 1841 $this->stream->unget(); 1842 $state = 'data'; 1843 } else { 1844 /* Anything else 1845 Append the current input character to the current 1846 DOCTYPE token's public identifier. Stay in the DOCTYPE 1847 public identifier (double-quoted) state. */ 1848 $this->token['public'] .= $char; 1849 } 1850 break; 1851 1852 case 'after DOCTYPE public identifier': 1853 /* Consume the next input character: */ 1854 $char = $this->stream->char(); 1855 1856 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { 1857 /* U+0009 CHARACTER TABULATION 1858 U+000A LINE FEED (LF) 1859 U+000C FORM FEED (FF) 1860 U+0020 SPACE 1861 Stay in the after DOCTYPE public identifier state. */ 1862 } elseif ($char === '"') { 1863 /* U+0022 QUOTATION MARK (") 1864 Set the DOCTYPE token's system identifier to the 1865 empty string (not missing), then switch to the DOCTYPE 1866 system identifier (double-quoted) state. */ 1867 $this->token['system'] = ''; 1868 $state = 'DOCTYPE system identifier (double-quoted)'; 1869 } elseif ($char === "'") { 1870 /* U+0027 APOSTROPHE (') 1871 Set the DOCTYPE token's system identifier to the 1872 empty string (not missing), then switch to the DOCTYPE 1873 system identifier (single-quoted) state. */ 1874 $this->token['system'] = ''; 1875 $state = 'DOCTYPE system identifier (single-quoted)'; 1876 } elseif ($char === '>') { 1877 /* U+003E GREATER-THAN SIGN (>) 1878 Emit the current DOCTYPE token. Switch to the data state. */ 1879 $this->emitToken($this->token); 1880 $state = 'data'; 1881 } elseif ($char === false) { 1882 /* Parse error. Set the DOCTYPE token's force-quirks 1883 flag to on. Emit that DOCTYPE token. Reconsume the EOF 1884 character in the data state. 
*/ 1885 $this->emitToken(array( 1886 'type' => self::PARSEERROR, 1887 'data' => 'eof-in-doctype' 1888 )); 1889 $this->token['force-quirks'] = true; 1890 $this->emitToken($this->token); 1891 $this->stream->unget(); 1892 $state = 'data'; 1893 } else { 1894 /* Anything else 1895 Parse error. Set the DOCTYPE token's force-quirks flag 1896 to on. Switch to the bogus DOCTYPE state. */ 1897 $this->emitToken(array( 1898 'type' => self::PARSEERROR, 1899 'data' => 'unexpected-char-in-doctype' 1900 )); 1901 $this->token['force-quirks'] = true; 1902 $state = 'bogus DOCTYPE'; 1903 } 1904 break; 1905 1906 case 'before DOCTYPE system identifier': 1907 /* Consume the next input character: */ 1908 $char = $this->stream->char(); 1909 1910 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { 1911 /* U+0009 CHARACTER TABULATION 1912 U+000A LINE FEED (LF) 1913 U+000C FORM FEED (FF) 1914 U+0020 SPACE 1915 Stay in the before DOCTYPE system identifier state. */ 1916 } elseif ($char === '"') { 1917 /* U+0022 QUOTATION MARK (") 1918 Set the DOCTYPE token's system identifier to the empty 1919 string (not missing), then switch to the DOCTYPE system 1920 identifier (double-quoted) state. */ 1921 $this->token['system'] = ''; 1922 $state = 'DOCTYPE system identifier (double-quoted)'; 1923 } elseif ($char === "'") { 1924 /* U+0027 APOSTROPHE (') 1925 Set the DOCTYPE token's system identifier to the empty 1926 string (not missing), then switch to the DOCTYPE system 1927 identifier (single-quoted) state. */ 1928 $this->token['system'] = ''; 1929 $state = 'DOCTYPE system identifier (single-quoted)'; 1930 } elseif ($char === '>') { 1931 /* Parse error. Set the DOCTYPE token's force-quirks flag 1932 to on. Emit that DOCTYPE token. Switch to the data state. 
*/ 1933 $this->emitToken(array( 1934 'type' => self::PARSEERROR, 1935 'data' => 'unexpected-char-in-doctype' 1936 )); 1937 $this->token['force-quirks'] = true; 1938 $this->emitToken($this->token); 1939 $state = 'data'; 1940 } elseif ($char === false) { 1941 /* Parse error. Set the DOCTYPE token's force-quirks 1942 flag to on. Emit that DOCTYPE token. Reconsume the EOF 1943 character in the data state. */ 1944 $this->emitToken(array( 1945 'type' => self::PARSEERROR, 1946 'data' => 'eof-in-doctype' 1947 )); 1948 $this->token['force-quirks'] = true; 1949 $this->emitToken($this->token); 1950 $this->stream->unget(); 1951 $state = 'data'; 1952 } else { 1953 /* Parse error. Set the DOCTYPE token's force-quirks flag 1954 to on. Switch to the bogus DOCTYPE state. */ 1955 $this->emitToken(array( 1956 'type' => self::PARSEERROR, 1957 'data' => 'unexpected-char-in-doctype' 1958 )); 1959 $this->token['force-quirks'] = true; 1960 $state = 'bogus DOCTYPE'; 1961 } 1962 break; 1963 1964 case 'DOCTYPE system identifier (double-quoted)': 1965 /* Consume the next input character: */ 1966 $char = $this->stream->char(); 1967 1968 if ($char === '"') { 1969 /* U+0022 QUOTATION MARK (") 1970 Switch to the after DOCTYPE system identifier state. */ 1971 $state = 'after DOCTYPE system identifier'; 1972 } elseif ($char === '>') { 1973 /* U+003E GREATER-THAN SIGN (>) 1974 Parse error. Set the DOCTYPE token's force-quirks flag 1975 to on. Emit that DOCTYPE token. Switch to the data state. */ 1976 $this->emitToken(array( 1977 'type' => self::PARSEERROR, 1978 'data' => 'unexpected-end-of-doctype' 1979 )); 1980 $this->token['force-quirks'] = true; 1981 $this->emitToken($this->token); 1982 $state = 'data'; 1983 } elseif ($char === false) { 1984 /* EOF 1985 Parse error. Set the DOCTYPE token's force-quirks flag 1986 to on. Emit that DOCTYPE token. Reconsume the EOF 1987 character in the data state. 
*/ 1988 $this->emitToken(array( 1989 'type' => self::PARSEERROR, 1990 'data' => 'eof-in-doctype' 1991 )); 1992 $this->token['force-quirks'] = true; 1993 $this->emitToken($this->token); 1994 $this->stream->unget(); 1995 $state = 'data'; 1996 } else { 1997 /* Anything else 1998 Append the current input character to the current 1999 DOCTYPE token's system identifier. Stay in the DOCTYPE 2000 system identifier (double-quoted) state. */ 2001 $this->token['system'] .= $char; 2002 } 2003 break; 2004 2005 case 'DOCTYPE system identifier (single-quoted)': 2006 /* Consume the next input character: */ 2007 $char = $this->stream->char(); 2008 2009 if ($char === "'") { 2010 /* U+0027 APOSTROPHE (') 2011 Switch to the after DOCTYPE system identifier state. */ 2012 $state = 'after DOCTYPE system identifier'; 2013 } elseif ($char === '>') { 2014 /* U+003E GREATER-THAN SIGN (>) 2015 Parse error. Set the DOCTYPE token's force-quirks flag 2016 to on. Emit that DOCTYPE token. Switch to the data state. */ 2017 $this->emitToken(array( 2018 'type' => self::PARSEERROR, 2019 'data' => 'unexpected-end-of-doctype' 2020 )); 2021 $this->token['force-quirks'] = true; 2022 $this->emitToken($this->token); 2023 $state = 'data'; 2024 } elseif ($char === false) { 2025 /* EOF 2026 Parse error. Set the DOCTYPE token's force-quirks flag 2027 to on. Emit that DOCTYPE token. Reconsume the EOF 2028 character in the data state. */ 2029 $this->emitToken(array( 2030 'type' => self::PARSEERROR, 2031 'data' => 'eof-in-doctype' 2032 )); 2033 $this->token['force-quirks'] = true; 2034 $this->emitToken($this->token); 2035 $this->stream->unget(); 2036 $state = 'data'; 2037 } else { 2038 /* Anything else 2039 Append the current input character to the current 2040 DOCTYPE token's system identifier. Stay in the DOCTYPE 2041 system identifier (double-quoted) state. 
*/ 2042 $this->token['system'] .= $char; 2043 } 2044 break; 2045 2046 case 'after DOCTYPE system identifier': 2047 /* Consume the next input character: */ 2048 $char = $this->stream->char(); 2049 2050 if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { 2051 /* U+0009 CHARACTER TABULATION 2052 U+000A LINE FEED (LF) 2053 U+000C FORM FEED (FF) 2054 U+0020 SPACE 2055 Stay in the after DOCTYPE system identifier state. */ 2056 } elseif ($char === '>') { 2057 /* U+003E GREATER-THAN SIGN (>) 2058 Emit the current DOCTYPE token. Switch to the data state. */ 2059 $this->emitToken($this->token); 2060 $state = 'data'; 2061 } elseif ($char === false) { 2062 /* Parse error. Set the DOCTYPE token's force-quirks 2063 flag to on. Emit that DOCTYPE token. Reconsume the EOF 2064 character in the data state. */ 2065 $this->emitToken(array( 2066 'type' => self::PARSEERROR, 2067 'data' => 'eof-in-doctype' 2068 )); 2069 $this->token['force-quirks'] = true; 2070 $this->emitToken($this->token); 2071 $this->stream->unget(); 2072 $state = 'data'; 2073 } else { 2074 /* Anything else 2075 Parse error. Switch to the bogus DOCTYPE state. 2076 (This does not set the DOCTYPE token's force-quirks 2077 flag to on.) */ 2078 $this->emitToken(array( 2079 'type' => self::PARSEERROR, 2080 'data' => 'unexpected-char-in-doctype' 2081 )); 2082 $state = 'bogus DOCTYPE'; 2083 } 2084 break; 2085 2086 case 'bogus DOCTYPE': 2087 /* Consume the next input character: */ 2088 $char = $this->stream->char(); 2089 2090 if ($char === '>') { 2091 /* U+003E GREATER-THAN SIGN (>) 2092 Emit the DOCTYPE token. Switch to the data state. */ 2093 $this->emitToken($this->token); 2094 $state = 'data'; 2095 2096 } elseif ($char === false) { 2097 /* EOF 2098 Emit the DOCTYPE token. Reconsume the EOF character in 2099 the data state. */ 2100 $this->emitToken($this->token); 2101 $this->stream->unget(); 2102 $state = 'data'; 2103 2104 } else { 2105 /* Anything else 2106 Stay in the bogus DOCTYPE state. 
*/
                    }
                    break;

                // case 'cdataSection':
            }
        }
    }

    /**
     * Returns a serialized representation of the tree.
     *
     * Simply forwards to the tree builder's save() method.
     *
     * @return DOMDocument|DOMNodeList
     */
    public function save() {
        return $this->tree->save();
    }

    /**
     * Returns the tree builder this tokenizer emits its tokens to.
     *
     * @return HTML5_TreeBuilder The tree
     */
    public function getTree()
    {
        return $this->tree;
    }

    /**
     * Returns the input stream.
     *
     * @return HTML5_InputStream
     */
    public function stream() {
        return $this->stream;
    }

    /**
     * Attempts to consume a character reference from the input stream.
     *
     * The text returned is meant to stand in for the reference: every
     * "not a reference" path returns '&' followed by whatever was consumed,
     * so the ampersand itself is presumably consumed by the caller before
     * this method runs.
     *
     * @param string|bool $allowed The additional allowed character, or false
     *                             when there is none.
     * @param bool        $inattr  Whether the reference is being consumed as
     *                             part of an attribute value.
     * @return string Either the decoded character (possibly followed by
     *                characters that were read past the matched name), or
     *                '&' followed by the consumed characters when no
     *                reference could be matched.
     */
    private function consumeCharacterReference($allowed = false, $inattr = false) {
        // This goes quite far against spec, and is far closer to the Python
        // impl., mainly because we don't do the large unconsuming the spec
        // requires.

        // All consumed characters.
        $chars = $this->stream->char();

        /* This section defines how to consume a character
        reference. This definition is used when parsing character
        references in text and in attributes.

        The behavior depends on the identity of the next character
        (the one immediately after the U+0026 AMPERSAND character): */

        // The EOF test has to come first: char() yields false at the end of
        // the input, and reading offset [0] of a boolean raises a
        // notice/warning on PHP >= 7.4.
        if (
            $chars === false ||
            $chars[0] === "\x09" ||
            $chars[0] === "\x0A" ||
            $chars[0] === "\x0C" ||
            $chars[0] === "\x20" ||
            $chars[0] === '<' ||
            $chars[0] === '&' ||
            $chars[0] === $allowed
        ) {
            /* U+0009 CHARACTER TABULATION
               U+000A LINE FEED (LF)
               U+000C FORM FEED (FF)
               U+0020 SPACE
               U+003C LESS-THAN SIGN
               U+0026 AMPERSAND
               EOF
               The additional allowed character, if there is one
            Not a character reference. No characters are consumed,
            and nothing is returned. (This is not an error, either.) */
            // We already consumed, so unconsume.
            $this->stream->unget();
            return '&';
        } elseif ($chars[0] === '#') {
            /* Consume the U+0023 NUMBER SIGN. */
            // Um, yeah, we already did that.
            /* The behavior further depends on the character after
            the U+0023 NUMBER SIGN: */
            $chars .= $this->stream->char();
            if (isset($chars[1]) && ($chars[1] === 'x' || $chars[1] === 'X')) {
                /* U+0078 LATIN SMALL LETTER X
                   U+0058 LATIN CAPITAL LETTER X */
                /* Consume the X. */
                // Um, yeah, we already did that.
                /* Follow the steps below, but using the range of
                characters U+0030 DIGIT ZERO through to U+0039 DIGIT
                NINE, U+0061 LATIN SMALL LETTER A through to U+0066
                LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
                A, through to U+0046 LATIN CAPITAL LETTER F (in other
                words, 0123456789, ABCDEF, abcdef). */
                $char_class = self::HEX;
                /* When it comes to interpreting the
                number, interpret it as a hexadecimal number. */
                $hex = true;
            } else {
                /* Anything else */
                // Unconsume because we shouldn't have consumed this.
                $chars = $chars[0];
                $this->stream->unget();
                /* Follow the steps below, but using the range of
                characters U+0030 DIGIT ZERO through to U+0039 DIGIT
                NINE (i.e. just 0123456789). */
                $char_class = self::DIGIT;
                /* When it comes to interpreting the number,
                interpret it as a decimal number. */
                $hex = false;
            }

            /* Consume as many characters as match the range of characters given above. */
            $consumed = $this->stream->charsWhile($char_class);
            if ($consumed === '' || $consumed === false) {
                /* If no characters match the range, then don't consume
                any characters (and unconsume the U+0023 NUMBER SIGN
                character and, if appropriate, the X character). This
                is a parse error; nothing is returned. */
                $this->emitToken(array(
                    'type' => self::PARSEERROR,
                    'data' => 'expected-numeric-entity'
                ));
                return '&' . $chars;
            } else {
                /* Otherwise, if the next character is a U+003B SEMICOLON,
                consume that too. If it isn't, there is a parse error. */
                if ($this->stream->char() !== ';') {
                    $this->stream->unget();
                    $this->emitToken(array(
                        'type' => self::PARSEERROR,
                        'data' => 'numeric-entity-without-semicolon'
                    ));
                }

                /* If one or more characters match the range, then take
                them all and interpret the string of characters as a number
                (either hexadecimal or decimal as appropriate). */
                $codepoint = $hex ? hexdec($consumed) : (int) $consumed;

                /* If that number is one of the numbers in the first column
                of the following table, then this is a parse error. Find the
                row with that number in the first column, and return a
                character token for the Unicode character given in the
                second column of that row. */
                $new_codepoint = HTML5_Data::getRealCodepoint($codepoint);
                if ($new_codepoint) {
                    $this->emitToken(array(
                        'type' => self::PARSEERROR,
                        'data' => 'illegal-windows-1252-entity'
                    ));
                    return HTML5_Data::utf8chr($new_codepoint);
                } else {
                    /* Otherwise, if the number is greater than 0x10FFFF, then
                     * this is a parse error. Return a U+FFFD REPLACEMENT
                     * CHARACTER. */
                    if ($codepoint > 0x10FFFF) {
                        $this->emitToken(array(
                            'type' => self::PARSEERROR,
                            'data' => 'overlong-character-entity' // XXX probably not correct
                        ));
                        return "\xEF\xBF\xBD";
                    }
                    /* Otherwise, return a character token for the Unicode
                     * character whose code point is that number.  If the
                     * number is in the range 0x0001 to 0x0008, 0x000E to
                     * 0x001F, 0x007F to 0x009F, 0xD800 to 0xDFFF, 0xFDD0 to
                     * 0xFDEF, or is one of 0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
                     * 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE,
                     * 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
                     * 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE,
                     * 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                     * 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE,
                     * or 0x10FFFF, then this is a parse error. */
                    // && has higher precedence than ||
                    if (
                        $codepoint >= 0x0000 && $codepoint <= 0x0008 ||
                        $codepoint === 0x000B ||
                        $codepoint >= 0x000E && $codepoint <= 0x001F ||
                        $codepoint >= 0x007F && $codepoint <= 0x009F ||
                        $codepoint >= 0xD800 && $codepoint <= 0xDFFF ||
                        $codepoint >= 0xFDD0 && $codepoint <= 0xFDEF ||
                        ($codepoint & 0xFFFE) === 0xFFFE ||
                        $codepoint == 0x10FFFF || $codepoint == 0x10FFFE
                    ) {
                        $this->emitToken(array(
                            'type' => self::PARSEERROR,
                            'data' => 'illegal-codepoint-for-numeric-entity'
                        ));
                    }
                    return HTML5_Data::utf8chr($codepoint);
                }
            }
        } else {
            /* Anything else */

            /* Consume the maximum number of characters possible,
            with the consumed characters matching one of the
            identifiers in the first column of the named character
            references table (in a case-sensitive manner). */
            // What we actually do here is consume as much as we can while it
            // matches the start of one of the identifiers in the first column.

            $refs = HTML5_Data::getNamedCharacterReferences();

            // Get the longest string which is the start of an identifier
            // ($chars) as well as the longest identifier which matches ($id)
            // and its codepoint ($codepoint).
            $codepoint = false;
            $char = $chars;
            while ($char !== false && isset($refs[$char])) {
                // Descend one level per consumed character.
                $refs = $refs[$char];
                if (isset($refs['codepoint'])) {
                    $id = $chars;
                    $codepoint = $refs['codepoint'];
                }
                $chars .= $char = $this->stream->char();
            }

            // Unconsume the one character we just took which caused the while
            // statement to fail. This could be anything and could cause state
            // changes (as if it matches the while loop it must be
            // alphanumeric so we can just concat it to whatever we get later).
            $this->stream->unget();
            if ($char !== false) {
                $chars = substr($chars, 0, -1);
            }

            /* If no match can be made, then this is a parse error.
            No characters are consumed, and nothing is returned. */
            if (!$codepoint) {
                $this->emitToken(array(
                    'type' => self::PARSEERROR,
                    'data' => 'expected-named-entity'
                ));
                return '&' . $chars;
            }

            /* If the last character matched is not a U+003B SEMICOLON
            (;), there is a parse error. */
            $semicolon = true;
            if (substr($id, -1) !== ';') {
                $this->emitToken(array(
                    'type' => self::PARSEERROR,
                    'data' => 'named-entity-without-semicolon'
                ));
                $semicolon = false;
            }

            /* If the character reference is being consumed as part of
            an attribute, and the last character matched is not a
            U+003B SEMICOLON (;), and the next character is in the
            range U+0030 DIGIT ZERO to U+0039 DIGIT NINE, U+0041
            LATIN CAPITAL LETTER A to U+005A LATIN CAPITAL LETTER Z,
            or U+0061 LATIN SMALL LETTER A to U+007A LATIN SMALL LETTER Z,
            then, for historical reasons, all the characters that were
            matched after the U+0026 AMPERSAND (&) must be unconsumed,
            and nothing is returned. */
            if ($inattr && !$semicolon) {
                // The next character is either the next character in $chars or in the stream.
                if (strlen($chars) > strlen($id)) {
                    $next = substr($chars, strlen($id), 1);
                } else {
                    $next = $this->stream->char();
                    $this->stream->unget();
                }
                // EOF (false) is not an alphanumeric character. Without this
                // guard the string/boolean comparisons below evaluate to
                // true for false and the entity would be left undecoded at
                // the end of the input.
                if (
                    $next !== false && (
                        '0' <= $next && $next <= '9' ||
                        'A' <= $next && $next <= 'Z' ||
                        'a' <= $next && $next <= 'z'
                    )
                ) {
                    return '&' . $chars;
                }
            }

            /* Otherwise, return a character token for the character
            corresponding to the character reference name (as given
            by the second column of the named character references table). */
            return HTML5_Data::utf8chr($codepoint) . substr($chars, strlen($id));
        }
    }

    /**
     * Consumes a character reference inside an attribute value and appends
     * the result to the value of the last attribute of the current token.
     *
     * @param string|bool $allowed The additional allowed character passed
     *                             through to consumeCharacterReference(), or
     *                             false when there is none.
     */
    private function characterReferenceInAttributeValue($allowed = false) {
        /* Attempt to consume a character reference. */
        $entity = $this->consumeCharacterReference($allowed, true);

        /* If nothing is returned, append a U+0026 AMPERSAND
        character to the current attribute's value.

        Otherwise, append the returned character token to the
        current attribute's value. */
        // Test for "nothing" explicitly: a loose (!$entity) check would also
        // discard the legitimate result '0' (e.g. from &#48; or &#x30;).
        $char = ($entity === false || $entity === null || $entity === '')
            ? '&'
            : $entity;

        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        /* Finally, switch back to the attribute value state that you
        were in when were switched into this state. */
    }

    /**
     * Emits a token, passing it on to the tree builder.
     *
     * Before the token itself is forwarded, parse errors are emitted for
     * pending input stream errors, for attributes or a self-closing flag on
     * an end tag, and for duplicate attributes on a start tag. Afterwards
     * the tokenizer's content model is synchronised with the tree builder.
     *
     * @param array $token       The token to emit; must carry a 'type' key.
     * @param bool  $checkStream Whether errors queued on the input stream
     *                           are emitted first.
     * @param bool  $dry         When true, the token is validated but not
     *                           handed to the tree builder.
     */
    protected function emitToken($token, $checkStream = true, $dry = false) {
        if ($checkStream === true) {
            // Emit errors from input stream.
            while ($this->stream->errors) {
                // Pass false so the nested call does not re-enter this loop.
                $this->emitToken(array_shift($this->stream->errors), false);
            }
        }
        if ($token['type'] === self::ENDTAG && !empty($token['attr'])) {
            // One parse error per attribute found on the end tag.
            $attrCount = count($token['attr']);
            for ($i = 0; $i < $attrCount; $i++) {
                $this->emitToken(array(
                    'type' => self::PARSEERROR,
                    'data' => 'attributes-in-end-tag'
                ));
            }
        }
        if ($token['type'] === self::ENDTAG && !empty($token['self-closing'])) {
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'self-closing-flag-on-end-tag',
            ));
        }
        if ($token['type'] === self::STARTTAG) {
            // This could be changed to actually pass the tree-builder a hash
            $hash = array();
            foreach ($token['attr'] as $keypair) {
                if (isset($hash[$keypair['name']])) {
                    $this->emitToken(array(
                        'type' => self::PARSEERROR,
                        'data' => 'duplicate-attribute',
                    ));
                } else {
                    $hash[$keypair['name']] = $keypair['value'];
                }
            }
        }

        if ($dry === false) {
            // the current structure of attributes is not a terribly good one
            $this->tree->emitToken($token);
        }

        if ($dry === false && is_int($this->tree->content_model)) {
            // The tree builder requested a content model switch; adopt it
            // and clear the request.
            $this->content_model = $this->tree->content_model;
            $this->tree->content_model = null;

        } elseif ($token['type'] === self::ENDTAG) {
            $this->content_model = self::PCDATA;
        }
    }
}