<?php
/**
 * HTML5 tokenizer.
 *
 * Walks the input string one state at a time (each state is a private method
 * named "<state>State") and hands every token it produces to an
 * HTML5TreeConstructer instance. The whole input is tokenized from the
 * constructor; call save() afterwards to obtain the tree constructor's result.
 *
 * Tokens are associative arrays carrying a 'type' key (one of the token type
 * constants below) plus 'data', 'name', 'attr' or 'error' depending on type.
 */
class HTML5
{
    /** Input text after newline normalisation. */
    private $data;

    /** Index of the current input character (-1 before anything is consumed). */
    private $char;

    /** Length of $data; a position equal to this value means end of file. */
    private $EOF;

    /** Name of the current state (without the "State" suffix); null stops the loop. */
    private $state;

    /** The HTML5TreeConstructer that receives emitted tokens. */
    private $tree;

    /** Tag, comment or DOCTYPE token that is currently being built. */
    private $token;

    /** Current content model flag: PCDATA, RCDATA, CDATA or PLAINTEXT. */
    private $content_model;

    /** Escape flag: true while inside "<!--" ... "-->" in RCDATA/CDATA content. */
    private $escape = false;

    /** Recognised entity names; names ending in ';' are the terminated forms. */
    private $entities = array('AElig;','AElig','AMP;','AMP','Aacute;','Aacute',
    'Acirc;','Acirc','Agrave;','Agrave','Alpha;','Aring;','Aring','Atilde;',
    'Atilde','Auml;','Auml','Beta;','COPY;','COPY','Ccedil;','Ccedil','Chi;',
    'Dagger;','Delta;','ETH;','ETH','Eacute;','Eacute','Ecirc;','Ecirc','Egrave;',
    'Egrave','Epsilon;','Eta;','Euml;','Euml','GT;','GT','Gamma;','Iacute;',
    'Iacute','Icirc;','Icirc','Igrave;','Igrave','Iota;','Iuml;','Iuml','Kappa;',
    'LT;','LT','Lambda;','Mu;','Ntilde;','Ntilde','Nu;','OElig;','Oacute;',
    'Oacute','Ocirc;','Ocirc','Ograve;','Ograve','Omega;','Omicron;','Oslash;',
    'Oslash','Otilde;','Otilde','Ouml;','Ouml','Phi;','Pi;','Prime;','Psi;',
    'QUOT;','QUOT','REG;','REG','Rho;','Scaron;','Sigma;','THORN;','THORN',
    'TRADE;','Tau;','Theta;','Uacute;','Uacute','Ucirc;','Ucirc','Ugrave;',
    'Ugrave','Upsilon;','Uuml;','Uuml','Xi;','Yacute;','Yacute','Yuml;','Zeta;',
    'aacute;','aacute','acirc;','acirc','acute;','acute','aelig;','aelig',
    'agrave;','agrave','alefsym;','alpha;','amp;','amp','and;','ang;','apos;',
    'aring;','aring','asymp;','atilde;','atilde','auml;','auml','bdquo;','beta;',
    'brvbar;','brvbar','bull;','cap;','ccedil;','ccedil','cedil;','cedil',
    'cent;','cent','chi;','circ;','clubs;','cong;','copy;','copy','crarr;',
    'cup;','curren;','curren','dArr;','dagger;','darr;','deg;','deg','delta;',
    'diams;','divide;','divide','eacute;','eacute','ecirc;','ecirc','egrave;',
    'egrave','empty;','emsp;','ensp;','epsilon;','equiv;','eta;','eth;','eth',
    'euml;','euml','euro;','exist;','fnof;','forall;','frac12;','frac12',
    'frac14;','frac14','frac34;','frac34','frasl;','gamma;','ge;','gt;','gt',
    'hArr;','harr;','hearts;','hellip;','iacute;','iacute','icirc;','icirc',
    'iexcl;','iexcl','igrave;','igrave','image;','infin;','int;','iota;',
    'iquest;','iquest','isin;','iuml;','iuml','kappa;','lArr;','lambda;','lang;',
    'laquo;','laquo','larr;','lceil;','ldquo;','le;','lfloor;','lowast;','loz;',
    'lrm;','lsaquo;','lsquo;','lt;','lt','macr;','macr','mdash;','micro;','micro',
    'middot;','middot','minus;','mu;','nabla;','nbsp;','nbsp','ndash;','ne;',
    'ni;','not;','not','notin;','nsub;','ntilde;','ntilde','nu;','oacute;',
    'oacute','ocirc;','ocirc','oelig;','ograve;','ograve','oline;','omega;',
    'omicron;','oplus;','or;','ordf;','ordf','ordm;','ordm','oslash;','oslash',
    'otilde;','otilde','otimes;','ouml;','ouml','para;','para','part;','permil;',
    'perp;','phi;','pi;','piv;','plusmn;','plusmn','pound;','pound','prime;',
    'prod;','prop;','psi;','quot;','quot','rArr;','radic;','rang;','raquo;',
    'raquo','rarr;','rceil;','rdquo;','real;','reg;','reg','rfloor;','rho;',
    'rlm;','rsaquo;','rsquo;','sbquo;','scaron;','sdot;','sect;','sect','shy;',
    'shy','sigma;','sigmaf;','sim;','spades;','sub;','sube;','sum;','sup1;',
    'sup1','sup2;','sup2','sup3;','sup3','sup;','supe;','szlig;','szlig','tau;',
    'there4;','theta;','thetasym;','thinsp;','thorn;','thorn','tilde;','times;',
    'times','trade;','uArr;','uacute;','uacute','uarr;','ucirc;','ucirc',
    'ugrave;','ugrave','uml;','uml','upsih;','upsilon;','uuml;','uuml','weierp;',
    'xi;','yacute;','yacute','yen;','yen','yuml;','yuml','zeta;','zwj;','zwnj;');

    // Content model flags.
    const PCDATA    = 0;
    const RCDATA    = 1;
    const CDATA     = 2;
    const PLAINTEXT = 3;

    // Token types.
    const DOCTYPE  = 0;
    const STARTTAG = 1;
    const ENDTAG   = 2;
    const COMMENT  = 3;
    const CHARACTR = 4;
    const EOF      = 5;

    /**
     * Tokenizes $data completely, feeding every token to a new tree constructor.
     *
     * @param string $data The HTML source to parse.
     */
    public function __construct($data)
    {
        // Newline normalisation: CRLF pairs and lone CRs both become a single
        // LF. (The original assigned the second replacement to a misspelt
        // variable, so lone CRs were never touched.)
        $data = str_replace(array("\r\n", "\r"), "\n", $data);

        $this->data = $data;
        $this->char = -1;
        $this->EOF = strlen($data);
        $this->tree = new HTML5TreeConstructer;
        $this->content_model = self::PCDATA;

        $this->state = 'data';

        // Each state method consumes input and selects the next state; EOF()
        // sets the state to null, which ends the loop.
        while($this->state !== null) {
            $this->{$this->state.'State'}();
        }
    }

    /**
     * Returns whatever the tree constructor's save() returns.
     */
    public function save()
    {
        return $this->tree->save();
    }

    /**
     * Returns the current input character, or false when the position is
     * outside the input (before the start or at end of file).
     */
    private function char()
    {
        return ($this->char >= 0 && $this->char < $this->EOF)
            ? $this->data[$this->char]
            : false;
    }

    /**
     * Returns input starting at $s without moving the current position.
     *
     * @param int $s Start index.
     * @param int $l Length; 0 means "the single character at $s".
     * @return string|null The requested text, or null when it does not lie
     *                     completely inside the input.
     */
    private function character($s, $l = 0)
    {
        if($s < 0) {
            return null;
        }

        if($l === 0) {
            return ($s < $this->EOF) ? $this->data[$s] : null;
        }

        // "<=" so that a run ending exactly at the end of the input is found.
        return ($s + $l <= $this->EOF) ? substr($this->data, $s, $l) : null;
    }

    /**
     * Returns the longest run of characters starting at $start that all belong
     * to the given PCRE character class body, e.g. 'A-Za-z' or '^>'.
     *
     * @param string $char_class Contents of a PCRE bracket expression.
     * @param int    $start      Index at which the run must begin.
     * @return string The run, or '' when the character at $start does not match.
     */
    private function characters($char_class, $start)
    {
        if($start < 0 || $start >= $this->EOF) {
            return '';
        }

        // \G anchors the match at $start, so a non-matching first character
        // yields '' instead of the whole remaining input.
        if(preg_match('#\G['.$char_class.']+#', $this->data, $match, 0, $start) === 1) {
            return $match[0];
        }

        return '';
    }

    /**
     * True when $char is one of tab, LF, vertical tab, form feed or space.
     */
    private function isWhitespace($char)
    {
        return in_array($char, array("\t", "\n", "\x0b", "\x0c", ' '), true);
    }

    /**
     * Emits $data as a character token.
     */
    private function emitCharacters($data)
    {
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => $data
        ));
    }

    /**
     * Emits the token under construction and switches to the data state.
     */
    private function emitCurrentToken()
    {
        $this->emitToken($this->token);
        $this->state = 'data';
    }

    /**
     * EOF inside a token: emits the token under construction, then steps back
     * one position so the data state consumes the EOF again.
     */
    private function emitCurrentTokenAtEof()
    {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
    }

    /**
     * Appends $text to the value of the attribute most recently added to the
     * current tag token.
     */
    private function appendToAttributeValue($text)
    {
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $text;
    }

    /**
     * Starts a new attribute on the current tag token, named after the
     * lowercased $char and with a null value.
     */
    private function startAttribute($char)
    {
        $this->token['attr'][] = array(
            'name'  => strtolower($char),
            'value' => null
        );
    }

    /**
     * Data state: plain text outside of tags.
     */
    private function dataState()
    {
        // Consume the next input character.
        $this->char++;
        $char = $this->char();

        $raw_text = ($this->content_model === self::RCDATA ||
            $this->content_model === self::CDATA);

        if($char === '&' && ($this->content_model === self::PCDATA ||
            $this->content_model === self::RCDATA)) {
            /* U+0026 AMPERSAND (&) in PCDATA or RCDATA: switch to the entity
            data state. In other content models it is ordinary text. */
            $this->state = 'entityData';

        } elseif($char === '-') {
            /* In RCDATA/CDATA with the escape flag off, if the last four
            characters including this one are "<!--", set the escape flag.
            Those four characters start three positions back. */
            if($raw_text && $this->escape === false && $this->char >= 3 &&
                $this->character($this->char - 3, 4) === '<!--') {
                $this->escape = true;
            }

            /* In any case, emit the character. Stay in the data state. */
            $this->emitCharacters($char);

        } elseif($char === '<' && ($this->content_model === self::PCDATA ||
            ($raw_text && $this->escape === false))) {
            /* U+003C LESS-THAN SIGN (<) in PCDATA, or in RCDATA/CDATA while
            not escaped: switch to the tag open state. */
            $this->state = 'tagOpen';

        } elseif($char === '>') {
            /* In RCDATA/CDATA with the escape flag on, if the last three
            characters including this one are "-->", clear the escape flag.
            Those three characters start two positions back. */
            if($raw_text && $this->escape === true && $this->char >= 2 &&
                $this->character($this->char - 2, 3) === '-->') {
                $this->escape = false;
            }

            /* In any case, emit the character. Stay in the data state. */
            $this->emitCharacters($char);

        } elseif($this->char >= $this->EOF) {
            /* EOF: emit an end-of-file token. */
            $this->EOF();

        } elseif($this->content_model === self::PLAINTEXT) {
            /* THIS DIFFERS GREATLY FROM THE SPEC: in PLAINTEXT the rest of
            the input is emitted as one character token. */
            $this->emitCharacters(substr($this->data, $this->char));
            $this->EOF();

        } else {
            /* Anything else.
            THIS DIFFERS GREATLY FROM THE SPEC: emit the whole run of plain
            characters as a single character token.

            The current character is always part of the run; scanning starts
            after it. Starting the scan AT the current character made the run
            empty for an '&' in CDATA or a '<' while escaped, which moved the
            position backwards and never terminated.

            In RCDATA/CDATA the run also stops at '-' and '>' so that the
            escape flag checks above get to see those characters. */
            $stops = $raw_text ? '<&->' : '<&';
            $len = 1 + strcspn($this->data, $stops, $this->char + 1);
            $text = substr($this->data, $this->char, $len);
            $this->char += $len - 1;

            $this->emitCharacters($text);
            $this->state = 'data';
        }
    }

    /**
     * Entity data state: entered on '&' in PCDATA/RCDATA text.
     */
    private function entityDataState()
    {
        // Attempt to consume an entity.
        $entity = $this->entity();

        // Emit the decoded text, or a literal '&' when nothing was consumed.
        // Compared against false because a decoded "0" is a valid result.
        $this->emitCharacters(($entity === false) ? '&' : $entity);

        // Finally, switch to the data state.
        $this->state = 'data';
    }

    /**
     * Tag open state: entered after '<'.
     */
    private function tagOpenState()
    {
        switch($this->content_model) {
            case self::RCDATA:
            case self::CDATA:
                /* Only "</" is markup here. Anything else means the '<' was
                text: emit it and let the data state process the next
                character. */
                if($this->character($this->char + 1) === '/') {
                    $this->char++;
                    $this->state = 'closeTagOpen';

                } else {
                    $this->emitCharacters('<');
                    $this->state = 'data';
                }
            break;

            case self::PCDATA:
                // Consume the next input character:
                $this->char++;
                $char = $this->char();

                if($char === '!') {
                    /* Switch to the markup declaration open state. */
                    $this->state = 'markupDeclarationOpen';

                } elseif($char === '/') {
                    /* Switch to the close tag open state. */
                    $this->state = 'closeTagOpen';

                } elseif(preg_match('/^[A-Za-z]$/', $char)) {
                    /* A letter: create a start tag token whose name begins
                    with the lowercased letter, then switch to the tag name
                    state. The token is emitted later. */
                    $this->token = array(
                        'name'  => strtolower($char),
                        'type'  => self::STARTTAG,
                        'attr'  => array()
                    );

                    $this->state = 'tagName';

                } elseif($char === '>') {
                    /* Parse error. Emit "<>" as text. Switch to the data
                    state. */
                    $this->emitCharacters('<>');
                    $this->state = 'data';

                } elseif($char === '?') {
                    /* Parse error. Switch to the bogus comment state. */
                    $this->state = 'bogusComment';

                } else {
                    /* Anything else: parse error. Emit '<' as text and
                    reconsume the current character in the data state. */
                    $this->emitCharacters('<');

                    $this->char--;
                    $this->state = 'data';
                }
            break;
        }
    }

    /**
     * Close tag open state: entered after "</".
     */
    private function closeTagOpenState()
    {
        $next_node = strtolower($this->characters('A-Za-z', $this->char + 1));
        $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName;

        if($this->content_model === self::RCDATA || $this->content_model === self::CDATA) {
            /* In RCDATA/CDATA "</" only closes the element when the following
            letters match the node on top of the tree constructor's stack and
            are followed by tab, LF, vertical tab, form feed, space, '>', '/'
            or EOF. */
            $after = $this->char + 1 + strlen($next_node);
            $terminated = ($after >= $this->EOF) ||
                preg_match('/[\t\n\x0b\x0c >\/]/', $this->data[$after]) === 1;

            if(!$the_same || !$terminated) {
                /* Parse error. Emit "</" as text and switch to the data state
                to process the next input character. */
                $this->emitCharacters('</');
                $this->state = 'data';
                return;
            }
        }

        /* PCDATA, or a matching tag name: consume the next input character. */
        $this->char++;
        $char = $this->char();

        if(preg_match('/^[A-Za-z]$/', $char)) {
            /* A letter: create an end tag token whose name begins with the
            lowercased letter, then switch to the tag name state. */
            $this->token = array(
                'name'  => strtolower($char),
                'type'  => self::ENDTAG
            );

            $this->state = 'tagName';

        } elseif($char === '>') {
            /* Parse error. Switch to the data state. */
            $this->state = 'data';

        } elseif($this->char === $this->EOF) {
            /* Parse error. Emit "</" as text. Reconsume the EOF in the data
            state. */
            $this->emitCharacters('</');

            $this->char--;
            $this->state = 'data';

        } else {
            /* Parse error. Switch to the bogus comment state. */
            $this->state = 'bogusComment';
        }
    }

    /**
     * Tag name state: accumulates the name of the current tag token.
     */
    private function tagNameState()
    {
        // Consume the next input character:
        $this->char++;
        $char = $this->char();

        if($this->isWhitespace($char)) {
            /* Switch to the before attribute name state. */
            $this->state = 'beforeAttributeName';

        } elseif($char === '>') {
            /* Emit the current tag token. Switch to the data state. */
            $this->emitCurrentToken();

        } elseif($this->char === $this->EOF) {
            /* Parse error. Emit the current tag token. Reconsume the EOF
            in the data state. */
            $this->emitCurrentTokenAtEof();

        } elseif($char === '/') {
            /* Parse error unless this is a permitted slash. Switch to the
            before attribute name state. */
            $this->state = 'beforeAttributeName';

        } else {
            /* Anything else: append the lowercased character to the tag
            name. Stay in the tag name state. */
            $this->token['name'] .= strtolower($char);
            $this->state = 'tagName';
        }
    }

    /**
     * Before attribute name state.
     */
    private function beforeAttributeNameState()
    {
        // Consume the next input character:
        $this->char++;
        $char = $this->char();

        if($this->isWhitespace($char)) {
            /* Stay in the before attribute name state. */
            $this->state = 'beforeAttributeName';

        } elseif($char === '>') {
            /* Emit the current tag token. Switch to the data state. */
            $this->emitCurrentToken();

        } elseif($char === '/') {
            /* Parse error unless this is a permitted slash. Stay in the
            before attribute name state. */
            $this->state = 'beforeAttributeName';

        } elseif($this->char === $this->EOF) {
            /* Parse error. Emit the current tag token. Reconsume the EOF
            in the data state. */
            $this->emitCurrentTokenAtEof();

        } else {
            /* Anything else: start a new attribute named after the current
            character. Switch to the attribute name state. */
            $this->startAttribute($char);
            $this->state = 'attributeName';
        }
    }

    /**
     * Attribute name state: accumulates the name of the newest attribute.
     */
    private function attributeNameState()
    {
        // Consume the next input character:
        $this->char++;
        $char = $this->char();

        if($this->isWhitespace($char)) {
            /* Switch to the after attribute name state. */
            $this->state = 'afterAttributeName';

        } elseif($char === '=') {
            /* Switch to the before attribute value state. */
            $this->state = 'beforeAttributeValue';

        } elseif($char === '>') {
            /* Emit the current tag token. Switch to the data state. */
            $this->emitCurrentToken();

        } elseif($char === '/' && $this->character($this->char + 1) !== '>') {
            /* Parse error unless this is a permitted slash. Switch to the
            before attribute name state. */
            $this->state = 'beforeAttributeName';

        } elseif($this->char === $this->EOF) {
            /* Parse error. Emit the current tag token. Reconsume the EOF
            in the data state. */
            $this->emitCurrentTokenAtEof();

        } else {
            /* Anything else: append the lowercased character to the current
            attribute's name. Stay in the attribute name state. */
            $last = count($this->token['attr']) - 1;
            $this->token['attr'][$last]['name'] .= strtolower($char);

            $this->state = 'attributeName';
        }
    }

    /**
     * After attribute name state.
     */
    private function afterAttributeNameState()
    {
        // Consume the next input character:
        $this->char++;
        $char = $this->char();

        if($this->isWhitespace($char)) {
            /* Stay in the after attribute name state. */
            $this->state = 'afterAttributeName';

        } elseif($char === '=') {
            /* Switch to the before attribute value state. */
            $this->state = 'beforeAttributeValue';

        } elseif($char === '>') {
            /* Emit the current tag token. Switch to the data state. */
            $this->emitCurrentToken();

        } elseif($char === '/' && $this->character($this->char + 1) !== '>') {
            /* Parse error unless this is a permitted slash. Switch to the
            before attribute name state. */
            $this->state = 'beforeAttributeName';

        } elseif($this->char === $this->EOF) {
            /* Parse error. Emit the current tag token. Reconsume the EOF
            in the data state. */
            $this->emitCurrentTokenAtEof();

        } else {
            /* Anything else: start a new attribute named after the current
            character. Switch to the attribute name state. */
            $this->startAttribute($char);
            $this->state = 'attributeName';
        }
    }

    /**
     * Before attribute value state: entered after '='.
     */
    private function beforeAttributeValueState()
    {
        // Consume the next input character:
        $this->char++;
        $char = $this->char();

        if($this->isWhitespace($char)) {
            /* Stay in the before attribute value state. */
            $this->state = 'beforeAttributeValue';

        } elseif($char === '"') {
            /* Switch to the attribute value (double-quoted) state. */
            $this->state = 'attributeValueDoubleQuoted';

        } elseif($char === '&') {
            /* Switch to the attribute value (unquoted) state and reconsume
            this input character. */
            $this->char--;
            $this->state = 'attributeValueUnquoted';

        } elseif($char === '\'') {
            /* Switch to the attribute value (single-quoted) state. */
            $this->state = 'attributeValueSingleQuoted';

        } elseif($char === '>') {
            /* Emit the current tag token. Switch to the data state. */
            $this->emitCurrentToken();

        } elseif($this->char === $this->EOF) {
            /* EOF: emit the current tag token and reconsume the EOF in the
            data state. Without this branch input ending in "name=" never
            left the attribute value states. */
            $this->emitCurrentTokenAtEof();

        } else {
            /* Anything else: append the character to the current attribute's
            value. Switch to the attribute value (unquoted) state. */
            $this->appendToAttributeValue($char);
            $this->state = 'attributeValueUnquoted';
        }
    }

    /**
     * Attribute value (double-quoted) state.
     */
    private function attributeValueDoubleQuotedState()
    {
        // Consume the next input character:
        $this->char++;
        $char = $this->char();

        if($char === '"') {
            /* Switch to the before attribute name state. */
            $this->state = 'beforeAttributeName';

        } elseif($char === '&') {
            /* Handle an entity inside the value; the state is unchanged. */
            $this->entityInAttributeValueState('double');

        } elseif($this->char === $this->EOF) {
            /* Parse error. Emit the current tag token. Reconsume the EOF
            in the data state. */
            $this->emitCurrentTokenAtEof();

        } else {
            /* Anything else: append the character to the current attribute's
            value. Stay in the attribute value (double-quoted) state. */
            $this->appendToAttributeValue($char);
            $this->state = 'attributeValueDoubleQuoted';
        }
    }

    /**
     * Attribute value (single-quoted) state.
     */
    private function attributeValueSingleQuotedState()
    {
        // Consume the next input character:
        $this->char++;
        $char = $this->char();

        if($char === '\'') {
            /* Switch to the before attribute name state. */
            $this->state = 'beforeAttributeName';

        } elseif($char === '&') {
            /* Handle an entity inside the value; the state is unchanged. */
            $this->entityInAttributeValueState('single');

        } elseif($this->char === $this->EOF) {
            /* Parse error. Emit the current tag token. Reconsume the EOF
            in the data state. */
            $this->emitCurrentTokenAtEof();

        } else {
            /* Anything else: append the character to the current attribute's
            value. Stay in the attribute value (single-quoted) state. */
            $this->appendToAttributeValue($char);
            $this->state = 'attributeValueSingleQuoted';
        }
    }

    /**
     * Attribute value (unquoted) state.
     */
    private function attributeValueUnquotedState()
    {
        // Consume the next input character:
        $this->char++;
        $char = $this->char();

        if($this->isWhitespace($char)) {
            /* Switch to the before attribute name state. */
            $this->state = 'beforeAttributeName';

        } elseif($char === '&') {
            /* Handle an entity inside the value; the state is unchanged. */
            $this->entityInAttributeValueState('non');

        } elseif($char === '>') {
            /* Emit the current tag token. Switch to the data state. */
            $this->emitCurrentToken();

        } elseif($this->char === $this->EOF) {
            /* EOF: emit the current tag token and reconsume the EOF in the
            data state. Without this branch an unterminated unquoted value
            kept this state running forever. */
            $this->emitCurrentTokenAtEof();

        } else {
            /* Anything else: append the character to the current attribute's
            value. Stay in the attribute value (unquoted) state. */
            $this->appendToAttributeValue($char);
            $this->state = 'attributeValueUnquoted';
        }
    }

    /**
     * Handles '&' inside an attribute value.
     *
     * Called directly by the three attribute value states rather than through
     * the state loop, so the tokenizer stays in the calling state. Callers
     * pass a quoting hint ('double', 'single' or 'non'); it is not used.
     */
    private function entityInAttributeValueState()
    {
        // Attempt to consume an entity.
        $entity = $this->entity();

        // Append the decoded text, or a literal '&' when nothing was
        // consumed, to the current attribute's value. The text belongs to the
        // attribute; it must not be emitted as a token of its own.
        $this->appendToAttributeValue(($entity === false) ? '&' : $entity);
    }

    /**
     * Bogus comment state: everything up to the next '>' (or EOF) becomes a
     * comment token, starting with the character that caused the switch into
     * this state.
     */
    private function bogusCommentState()
    {
        $data = $this->characters('^>', $this->char);
        $this->emitToken(array(
            'data' => $data,
            'type' => self::COMMENT
        ));

        // Leaves the position on the '>' (if any); the data state skips it
        // when it consumes its next character.
        $this->char += strlen($data);

        /* Switch to the data state. */
        $this->state = 'data';

        /* If the end of the file was reached, reconsume the EOF character. */
        if($this->char === $this->EOF) {
            $this->char = $this->EOF - 1;
        }
    }

    /**
     * Markup declaration open state: entered after "<!".
     */
    private function markupDeclarationOpenState()
    {
        /* If the next two characters are "--", consume them, create a comment
        token whose data is empty, and switch to the comment state. */
        if($this->character($this->char + 1, 2) === '--') {
            $this->char += 2;
            $this->state = 'comment';
            $this->token = array(
                'data' => null,
                'type' => self::COMMENT
            );

        /* Otherwise, if the next seven characters are a case-insensitive
        match for "DOCTYPE", consume them and switch to the DOCTYPE state. */
        } elseif(strtolower($this->character($this->char + 1, 7)) === 'doctype') {
            $this->char += 7;
            $this->state = 'doctype';

        /* Otherwise it is a parse error. Switch to the bogus comment state.
        The next character, if any, is the first character of the comment. */
        } else {
            $this->char++;
            $this->state = 'bogusComment';
        }
    }

    /**
     * Comment state: accumulates comment data.
     */
    private function commentState()
    {
        /* Consume the next input character: */
        $this->char++;
        $char = $this->char();

        if($char === '-') {
            /* Switch to the comment dash state. */
            $this->state = 'commentDash';

        } elseif($this->char === $this->EOF) {
            /* Parse error. Emit the comment token. Reconsume the EOF in the
            data state. */
            $this->emitCurrentTokenAtEof();

        } else {
            /* Append the character to the comment token's data. Stay in the
            comment state. */
            $this->token['data'] .= $char;
        }
    }

    /**
     * Comment dash state: one '-' has been seen inside a comment.
     */
    private function commentDashState()
    {
        /* Consume the next input character: */
        $this->char++;
        $char = $this->char();

        if($char === '-') {
            /* Switch to the comment end state. */
            $this->state = 'commentEnd';

        } elseif($this->char === $this->EOF) {
            /* Parse error. Emit the comment token. Reconsume the EOF in the
            data state. */
            $this->emitCurrentTokenAtEof();

        } else {
            /* Append '-' and the character to the comment token's data.
            Switch to the comment state. */
            $this->token['data'] .= '-'.$char;
            $this->state = 'comment';
        }
    }

    /**
     * Comment end state: "--" has been seen inside a comment.
     */
    private function commentEndState()
    {
        /* Consume the next input character: */
        $this->char++;
        $char = $this->char();

        if($char === '>') {
            /* Emit the comment token. Switch to the data state. */
            $this->emitCurrentToken();

        } elseif($char === '-') {
            /* Another dash: keep one in the data, stay in this state. */
            $this->token['data'] .= '-';

        } elseif($this->char === $this->EOF) {
            /* Emit the comment token. Reconsume the EOF in the data state. */
            $this->emitCurrentTokenAtEof();

        } else {
            /* The "--" was part of the comment text after all. */
            $this->token['data'] .= '--'.$char;
            $this->state = 'comment';
        }
    }

    /**
     * DOCTYPE state: entered after "<!DOCTYPE".
     */
    private function doctypeState()
    {
        /* Consume the next input character: */
        $this->char++;
        $char = $this->char();

        // Whitespace is consumed; anything else is reconsumed by the before
        // DOCTYPE name state.
        if(!$this->isWhitespace($char)) {
            $this->char--;
        }

        $this->state = 'beforeDoctypeName';
    }

    /**
     * Before DOCTYPE name state.
     */
    private function beforeDoctypeNameState()
    {
        /* Consume the next input character: */
        $this->char++;
        $char = $this->char();

        if($this->isWhitespace($char)) {
            // Stay in the before DOCTYPE name state.

        } elseif(preg_match('/^[a-z]$/', $char)) {
            // Lowercase letters are stored uppercased.
            $this->token = array(
                'name' => strtoupper($char),
                'type' => self::DOCTYPE,
                'error' => true
            );

            $this->state = 'doctypeName';

        } elseif($char === '>') {
            // A DOCTYPE without a name.
            $this->emitToken(array(
                'name' => null,
                'type' => self::DOCTYPE,
                'error' => true
            ));

            $this->state = 'data';

        } elseif($this->char === $this->EOF) {
            // A DOCTYPE without a name; reconsume the EOF in the data state.
            $this->emitToken(array(
                'name' => null,
                'type' => self::DOCTYPE,
                'error' => true
            ));

            $this->char--;
            $this->state = 'data';

        } else {
            $this->token = array(
                'name' => $char,
                'type' => self::DOCTYPE,
                'error' => true
            );

            $this->state = 'doctypeName';
        }
    }

    /**
     * DOCTYPE name state: accumulates the DOCTYPE name.
     */
    private function doctypeNameState()
    {
        /* Consume the next input character: */
        $this->char++;
        $char = $this->char();

        if($this->isWhitespace($char)) {
            $this->state = 'afterDoctypeName';

        } elseif($char === '>') {
            $this->emitCurrentToken();

        } elseif(preg_match('/^[a-z]$/', $char)) {
            $this->token['name'] .= strtoupper($char);

        } elseif($this->char === $this->EOF) {
            $this->emitCurrentTokenAtEof();

        } else {
            $this->token['name'] .= $char;
        }

        // Recomputed after every character: only the exact name "HTML" is
        // free of the error mark.
        $this->token['error'] = ($this->token['name'] === 'HTML')
            ? false
            : true;
    }

    /**
     * After DOCTYPE name state.
     */
    private function afterDoctypeNameState()
    {
        /* Consume the next input character: */
        $this->char++;
        $char = $this->char();

        if($this->isWhitespace($char)) {
            // Stay in the after DOCTYPE name state.

        } elseif($char === '>') {
            $this->emitCurrentToken();

        } elseif($this->char === $this->EOF) {
            $this->emitCurrentTokenAtEof();

        } else {
            $this->token['error'] = true;
            $this->state = 'bogusDoctype';
        }
    }

    /**
     * Bogus DOCTYPE state: skips input up to '>' or EOF.
     */
    private function bogusDoctypeState()
    {
        /* Consume the next input character: */
        $this->char++;
        $char = $this->char();

        if($char === '>') {
            $this->emitCurrentToken();

        } elseif($this->char === $this->EOF) {
            $this->emitCurrentTokenAtEof();

        } else {
            // Stay in the bogus DOCTYPE state.
        }
    }

    /**
     * Attempts to consume an entity. Must be called with the current
     * position on the '&'.
     *
     * On success the position is left on the last consumed character of the
     * entity and the decoded text is returned. On failure nothing is consumed
     * and false is returned.
     *
     * @return string|false
     */
    private function entity()
    {
        $start = $this->char;

        if($this->character($start + 1) === '#') {
            // Numeric reference: "&#" digits, or "&#x" / "&#X" hex digits.
            // The radix marker is the character AFTER the '#'.
            $marker = $this->character($start + 2);
            $hex = ($marker === 'x' || $marker === 'X');

            $digits_at = $start + 2 + ($hex ? 1 : 0);
            $digits = $this->characters($hex ? '0-9A-Fa-f' : '0-9', $digits_at);

            if($digits === '') {
                // Parse error: no characters are consumed.
                return false;
            }

            // Consume through the last digit and a terminating ';' if present.
            $this->char = $digits_at + strlen($digits) - 1;

            if($this->character($this->char + 1) === ';') {
                $this->char++;
            }

            $reference = '&#'.($hex ? 'x' : '').$digits.';';

        } else {
            // Named reference: the LONGEST prefix that is a known name wins,
            // so "amp;" is preferred over "amp" and "notin;" over "not".
            $e_name = $this->characters('0-9A-Za-z;', $start + 1);

            $longest = 0;

            foreach($this->entities as $known) {
                $longest = max($longest, strlen($known));
            }

            $entity = null;

            for($c = min(strlen($e_name), $longest); $c >= 1; $c--) {
                $id = substr($e_name, 0, $c);

                if(in_array($id, $this->entities, true)) {
                    $entity = $id;
                    break;
                }
            }

            if($entity === null) {
                // Parse error: no characters are consumed.
                return false;
            }

            $this->char = $start + strlen($entity);

            // html_entity_decode() needs exactly one trailing ';'.
            $reference = '&'.rtrim($entity, ';').';';
        }

        // The HTML5 table is needed for names such as "apos;" and "AMP;";
        // the constant does not exist before PHP 5.4. A reference the decoder
        // does not recognise is returned as its literal text.
        $flags = ENT_QUOTES | (defined('ENT_HTML5') ? ENT_HTML5 : 0);

        return html_entity_decode($reference, $flags, 'UTF-8');
    }

    /**
     * Passes a token to the tree constructor and updates the content model.
     *
     * An integer returned by the tree constructor becomes the new content
     * model; otherwise an end tag token resets it to PCDATA.
     *
     * @param array $token Token array with at least a 'type' key.
     */
    private function emitToken($token)
    {
        $emit = $this->tree->emitToken($token);

        if(is_int($emit)) {
            $this->content_model = $emit;

        } elseif($token['type'] === self::ENDTAG) {
            $this->content_model = self::PCDATA;
        }
    }

    /**
     * Stops the state loop and sends an end-of-file token to the tree
     * constructor.
     */
    private function EOF()
    {
        $this->state = null;
        $this->tree->emitToken(array(
            'type' => self::EOF
        ));
    }
}

class HTML5TreeConstructer
{
    public $stack = array();

    private $phase;
    private $mode;
    private $dom;
    private $foster_parent = null;
    private $a_formatting = array();

    private $head_pointer = null;
    private $form_pointer = null;

    private $scoping = array('button','caption','html','marquee','object','table','td','th');
    private $formatting = array('a','b','big','em','font','i','nobr','s','small','strike','strong','tt','u');
    private $special = array('address','area','base','basefont','bgsound',
    'blockquote','body','br','center','col','colgroup','dd','dir','div','dl',
    'dt','embed','fieldset','form','frame','frameset','h1','h2','h3','h4','h5',
    'h6','head','hr','iframe','image','img','input','isindex','li','link',
    'listing','menu','meta','noembed','noframes','noscript','ol','optgroup',
    'option','p','param','plaintext','pre','script','select','spacer','style',
    'tbody','textarea','tfoot','thead','title','tr','ul','wbr');

    // The different phases.
    const INIT_PHASE = 0;
    const ROOT_PHASE = 1;
    const MAIN_PHASE = 2;
    const END_PHASE = 3;

    // The different insertion modes for the main phase.
const BEFOR_HEAD = 0;
    const IN_HEAD    = 1;
    const AFTER_HEAD = 2;
    const IN_BODY    = 3;
    const IN_TABLE   = 4;
    const IN_CAPTION = 5;
    const IN_CGROUP  = 6;
    const IN_TBODY   = 7;
    const IN_ROW     = 8;
    const IN_CELL    = 9;
    const IN_SELECT  = 10;
    const AFTER_BODY = 11;
    const IN_FRAME   = 12;
    const AFTR_FRAME = 13;

    // The different types of elements.
    const SPECIAL    = 0;
    const SCOPING    = 1;
    const FORMATTING = 2;
    const PHRASING   = 3;

    const MARKER     = 0;

    /**
     * Starts in the initial phase / "before head" insertion mode with an
     * empty UTF-8 DOMDocument to build the tree into.
     */
    public function __construct()
    {
        $this->phase = self::INIT_PHASE;
        $this->mode = self::BEFOR_HEAD;
        $this->dom = new DOMDocument;

        $this->dom->encoding = 'UTF-8';
        $this->dom->preserveWhiteSpace = true;
        $this->dom->substituteEntities = true;
        $this->dom->strictErrorChecking = false;
    }

    /**
     * Entry point used by the tokeniser: routes a token to the handler of
     * the current tree construction phase.
     *
     * @param array $token Token with at least a 'type' key.
     * @return mixed Whatever the phase handler returns. Handlers return one
     *               of the HTML5 content model constants (an int) when the
     *               tokeniser has to switch its content model flag.
     */
    public function emitToken($token)
    {
        switch($this->phase) {
            case self::INIT_PHASE: return $this->initPhase($token);
            case self::ROOT_PHASE: return $this->rootElementPhase($token);
            case self::MAIN_PHASE: return $this->mainPhase($token);
            case self::END_PHASE : return $this->trailingEndPhase($token);
        }
    }

    /**
     * Initial phase: waits for the DOCTYPE.
     *
     * - Anything that is not a correct DOCTYPE or pure whitespace moves on
     *   to the root element phase and is reprocessed there.
     * - A correct DOCTYPE appends a DocumentType node to the document and
     *   moves on to the root element phase.
     * - Pure whitespace is appended to the Document node.
     *
     * @param array $token
     * @return mixed Result of the reprocessing handler, if any.
     */
    private function initPhase($token)
    {
        $is_whitespace = isset($token['data']) &&
            preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']);

        /* A DOCTYPE token that is marked as being in error
        A comment token
        A start tag token
        An end tag token
        A character token that is not one of U+0009 CHARACTER TABULATION,
            U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
            or U+0020 SPACE
        An end-of-file token */
        if((isset($token['error']) && $token['error']) ||
        $token['type'] === HTML5::COMMENT ||
        $token['type'] === HTML5::STARTTAG ||
        $token['type'] === HTML5::ENDTAG ||
        $token['type'] === HTML5::EOF ||
        ($token['type'] === HTML5::CHARACTR && isset($token['data']) &&
        !$is_whitespace)) {
            /* This specification does not define how to handle this case. In
            particular, user agents may ignore the entirety of this specification
            altogether for such documents, and instead invoke special parse modes
            with a greater emphasis on backwards compatibility. */

            $this->phase = self::ROOT_PHASE;
            return $this->rootElementPhase($token);

        /* A DOCTYPE token marked as being correct */
        } elseif(isset($token['error']) && !$token['error']) {
            /* Append a DocumentType node to the Document node, with the name
            attribute set to the name given in the DOCTYPE token (which will be
            "HTML"), and the other attributes specific to DocumentType objects
            set to null, empty lists, or the empty string as appropriate.

            A DocumentType node has to come from DOMImplementation; it is then
            actually attached to the document instead of being thrown away. */
            $impl = new DOMImplementation;
            $doctype = $impl->createDocumentType('HTML');

            if($doctype) {
                $this->dom->appendChild($doctype);
            }

            /* Then, switch to the root element phase of the tree construction
            stage. */
            $this->phase = self::ROOT_PHASE;

        /* A character token that is one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE */
        } elseif($is_whitespace) {
            /* Append that character to the Document node. */
            $text = $this->dom->createTextNode($token['data']);
            $this->dom->appendChild($text);
        }
    }

    /**
     * Root element phase: runs until the <html> element exists.
     *
     * DOCTYPE tokens are ignored, comments and whitespace are appended to
     * the Document. Any other token creates the html element, pushes it on
     * the stack of open elements, switches to the main phase and is
     * reprocessed there.
     *
     * @param array $token
     * @return mixed Result of mainPhase() when the token is reprocessed.
     */
    private function rootElementPhase($token)
    {
        /* A DOCTYPE token */
        if($token['type'] === HTML5::DOCTYPE) {
            // Parse error. Ignore the token.

        /* A comment token */
        } elseif($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the Document object with the data
            attribute set to the data given in the comment token. */
            $comment = $this->dom->createComment($token['data']);
            $this->dom->appendChild($comment);

        /* A character token that is one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE */
        } elseif($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            /* Append that character to the Document node. */
            $text = $this->dom->createTextNode($token['data']);
            $this->dom->appendChild($text);

        /* A character token that is not whitespace
        A start tag token
        An end tag token
        An end-of-file token */
        } elseif(($token['type'] === HTML5::CHARACTR &&
        !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
        $token['type'] === HTML5::STARTTAG ||
        $token['type'] === HTML5::ENDTAG ||
        $token['type'] === HTML5::EOF) {
            /* Create an HTMLElement node with the tag name html, in the HTML
            namespace. Append it to the Document object. Switch to the main
            phase and reprocess the current token. */
            $html = $this->dom->createElement('html');
            $this->dom->appendChild($html);
            $this->stack[] = $html;

            $this->phase = self::MAIN_PHASE;
            return $this->mainPhase($token);
        }
    }

    /**
     * Main phase: handles the tokens that are treated the same in every
     * insertion mode and dispatches everything else on $this->mode.
     *
     * @param array $token
     * @return mixed Result of the insertion mode handler, if one ran.
     */
    private function mainPhase($token)
    {
        /* A DOCTYPE token */
        if($token['type'] === HTML5::DOCTYPE) {
            // Parse error. Ignore the token.

        /* A start tag token with the tag name "html" */
        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'html') {
            /* If this start tag token was not the first start tag token, then
            it is a parse error. */

            /* For each attribute on the token, check to see if the attribute
            is already present on the top element of the stack of open elements.
            If it is not, add the attribute and its corresponding value to that
            element. */
            foreach($token['attr'] as $attr) {
                if(!$this->stack[0]->hasAttribute($attr['name'])) {
                    $this->stack[0]->setAttribute($attr['name'], $attr['value']);
                }
            }

        /* An end-of-file token */
        } elseif($token['type'] === HTML5::EOF) {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

        /* Anything else. */
        } else {
            /* Depends on the insertion mode. (A former "case self::END_PHASE"
            entry was dropped: END_PHASE is a phase, not a mode, and its value
            equals IN_BODY, so that entry could never be selected.) */
            switch($this->mode) {
                case self::BEFOR_HEAD: return $this->beforeHead($token);
                case self::IN_HEAD:    return $this->inHead($token);
                case self::AFTER_HEAD: return $this->afterHead($token);
                case self::IN_BODY:    return $this->inBody($token);
                case self::IN_TABLE:   return $this->inTable($token);
                case self::IN_CAPTION: return $this->inCaption($token);
                case self::IN_CGROUP:  return $this->inColumnGroup($token);
                case self::IN_TBODY:   return $this->inTableBody($token);
                case self::IN_ROW:     return $this->inRow($token);
                case self::IN_CELL:    return $this->inCell($token);
                case self::IN_SELECT:  return $this->inSelect($token);
                case self::AFTER_BODY: return $this->afterBody($token);
                case self::IN_FRAME:   return $this->inFrameset($token);
                case self::AFTR_FRAME: return $this->afterFrameset($token);
            }
        }
    }

    private function
beforeHead($token)
    {
        /* "Before head" insertion mode. Whitespace and comments are inserted
        at the current node, a <head> start tag creates the head element,
        and any other start tag, an </html> end tag or non-whitespace text
        implies a <head> and is then reprocessed by inHead(). Other end tags
        are ignored. Returns whatever inHead() returns on reprocessing. */

        /* A character token that is one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE */
        if($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            /* Append the character to the current node. */
            $this->insertText($token['data']);

        /* A comment token */
        } elseif($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the current node with the data attribute
            set to the data given in the comment token. */
            $this->insertComment($token['data']);

        /* A start tag token with the tag name "head" */
        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') {
            /* Create an element for the token, append the new element to the
            current node and push it onto the stack of open elements. */
            $element = $this->insertElement($token);

            /* Set the head element pointer to this new element node. */
            $this->head_pointer = $element;

            /* Change the insertion mode to "in head". */
            $this->mode = self::IN_HEAD;

        /* Any other start tag token, an end tag with the tag name "html",
        or a character token that is not whitespace. (Whitespace-only tokens
        were already handled by the first branch; the pattern matches the
        one used there.) */
        } elseif($token['type'] === HTML5::STARTTAG ||
        ($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') ||
        ($token['type'] === HTML5::CHARACTR && !preg_match('/^[\t\n\x0b\x0c ]+$/',
        $token['data']))) {
            /* Act as if a start tag token with the tag name "head" and no
            attributes had been seen, then reprocess the current token. */
            $this->beforeHead(array(
                'name' => 'head',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            ));

            return $this->inHead($token);

        /* Any other end tag */
        } elseif($token['type'] === HTML5::ENDTAG) {
            /* Parse error. Ignore the token. */
        }
    }

    /**
     * "In head" insertion mode.
     *
     * Returns one of the HTML5 content model constants when the tokeniser
     * has to change its content model flag (RCDATA for title, CDATA for
     * style/script, PCDATA again on their end tags), or the result of
     * afterHead() when the token is reprocessed there.
     *
     * The head element pointer may be null (innerHTML case); every use of
     * it is guarded so such input cannot trigger a call on null.
     *
     * @param array $token
     * @return mixed
     */
    private function inHead($token)
    {
        /* Is the current node the head element? */
        $current_is_head = $this->head_pointer !== null &&
            $this->head_pointer->isSameNode(end($this->stack));

        /* A character token that is one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE.

        THIS DIFFERS FROM THE SPEC: If the current node is either a title, style
        or script element, append the character to the current node regardless
        of its content. */
        if($token['type'] === HTML5::CHARACTR && (
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']) ||
        in_array(end($this->stack)->nodeName, array('title', 'style', 'script')))) {
            /* Append the character to the current node. */
            $this->insertText($token['data']);

        /* A comment token */
        } elseif($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the current node with the data attribute
            set to the data given in the comment token. */
            $this->insertComment($token['data']);

        /* An end tag for title, style or script: close the element and put
        the tokeniser back into PCDATA. */
        } elseif($token['type'] === HTML5::ENDTAG &&
        in_array($token['name'], array('title', 'style', 'script'))) {
            array_pop($this->stack);
            return HTML5::PCDATA;

        /* A start tag with the tag name "title" */
        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'title') {
            /* Create an element for the token and append the new element to the
            node pointed to by the head element pointer, or, if that is null
            (innerHTML case), to the current node. */
            if($this->head_pointer !== null) {
                $element = $this->insertElement($token, false);
                $this->head_pointer->appendChild($element);

            } else {
                $element = $this->insertElement($token);
            }

            /* Switch the tokeniser's content model flag to the RCDATA state. */
            return HTML5::RCDATA;

        /* A start tag with the tag name "style" */
        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'style') {
            /* Create an element for the token and append the new element to the
            node pointed to by the head element pointer, or, if that is null
            (innerHTML case), to the current node. */
            if($this->head_pointer !== null) {
                $element = $this->insertElement($token, false);
                $this->head_pointer->appendChild($element);

            } else {
                $this->insertElement($token);
            }

            /* Switch the tokeniser's content model flag to the CDATA state. */
            return HTML5::CDATA;

        /* A start tag with the tag name "script" */
        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'script') {
            /* Create an element for the token. As for title and style, fall
            back to the current node when there is no head element pointer. */
            if($this->head_pointer !== null) {
                $element = $this->insertElement($token, false);
                $this->head_pointer->appendChild($element);

            } else {
                $this->insertElement($token);
            }

            /* Switch the tokeniser's content model flag to the CDATA state. */
            return HTML5::CDATA;

        /* A start tag with the tag name "base", "link", or "meta" */
        } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
        array('base', 'link', 'meta'))) {
            /* Create an element for the token and append the new element to the
            node pointed to by the head element pointer, or, if that is null
            (innerHTML case), to the current node. */
            if($this->head_pointer !== null) {
                $element = $this->insertElement($token, false);
                $this->head_pointer->appendChild($element);
                array_pop($this->stack);

            } else {
                $this->insertElement($token);
            }

        /* An end tag with the tag name "head" */
        } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'head') {
            /* If the current node is a head element, pop the current node off
            the stack of open elements. Otherwise, this is a parse error. */
            if($current_is_head) {
                array_pop($this->stack);
            }

            /* Change the insertion mode to "after head". */
            $this->mode = self::AFTER_HEAD;

        /* A start tag with the tag name "head" or an end tag except "html". */
        } elseif(($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') ||
        ($token['type'] === HTML5::ENDTAG && $token['name'] !== 'html')) {
            // Parse error. Ignore the token.

        /* Anything else */
        } else {
            /* If the current node is a head element, act as if an end tag
            token with the tag name "head" had been seen. */
            if($current_is_head) {
                $this->inHead(array(
                    'name' => 'head',
                    'type' => HTML5::ENDTAG
                ));

            /* Otherwise, change the insertion mode to "after head". */
            } else {
                $this->mode = self::AFTER_HEAD;
            }

            /* Then, reprocess the current token. */
            return $this->afterHead($token);
        }
    }

    /**
     * "After head" insertion mode.
     *
     * Whitespace and comments are inserted at the current node; <body> and
     * <frameset> open their element and switch mode; head-only elements
     * are sent back to inHead(); anything else implies a <body> and is
     * reprocessed by inBody().
     *
     * @param array $token
     * @return mixed Result of the handler the token was reprocessed by.
     */
    private function afterHead($token)
    {
        /* A character token that is one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE */
        if($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            /* Append the character to the current node. */
            $this->insertText($token['data']);

        /* A comment token */
        } elseif($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the current node with the data attribute
            set to the data given in the comment token. */
            $this->insertComment($token['data']);

        /* A start tag token with the tag name "body" */
        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'body') {
            /* Insert a body element for the token. */
            $this->insertElement($token);

            /* Change the insertion mode to "in body". */
            $this->mode = self::IN_BODY;

        /* A start tag token with the tag name "frameset" */
        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'frameset') {
            /* Insert a frameset element for the token. */
            $this->insertElement($token);

            /* Change the insertion mode to "in frameset". */
            $this->mode = self::IN_FRAME;

        /* A start tag token whose tag name is one of: "base", "link", "meta",
        "script", "style", "title" */
        } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
        array('base', 'link', 'meta', 'script', 'style', 'title'))) {
            /* Parse error. Switch the insertion mode back to "in head" and
            reprocess the token. */
            $this->mode = self::IN_HEAD;
            return $this->inHead($token);

        /* Anything else */
        } else {
            /* Act as if a start tag token with the tag name "body" and no
            attributes had been seen, and then reprocess the current token. */
            $this->afterHead(array(
                'name' => 'body',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            ));

            return $this->inBody($token);
        }
    }

    private function inBody($token)
    {
        /* Handle the token as follows: */

        switch($token['type']) {
            /* A character token */
            case HTML5::CHARACTR:
                /* Reconstruct the active formatting elements, if any.
*/ 1613 $this->reconstructActiveFormattingElements(); 1614 1615 /* Append the token's character to the current node. */ 1616 $this->insertText($token['data']); 1617 break; 1618 1619 /* A comment token */ 1620 case HTML5::COMMENT: 1621 /* Append a Comment node to the current node with the data 1622 attribute set to the data given in the comment token. */ 1623 $this->insertComment($token['data']); 1624 break; 1625 1626 case HTML5::STARTTAG: 1627 switch($token['name']) { 1628 /* A start tag token whose tag name is one of: "script", 1629 "style" */ 1630 case 'script': case 'style': 1631 /* Process the token as if the insertion mode had been "in 1632 head". */ 1633 return $this->inHead($token); 1634 break; 1635 1636 /* A start tag token whose tag name is one of: "base", "link", 1637 "meta", "title" */ 1638 case 'base': case 'link': case 'meta': case 'title': 1639 /* Parse error. Process the token as if the insertion mode 1640 had been "in head". */ 1641 return $this->inHead($token); 1642 break; 1643 1644 /* A start tag token with the tag name "body" */ 1645 case 'body': 1646 /* Parse error. If the second element on the stack of open 1647 elements is not a body element, or, if the stack of open 1648 elements has only one node on it, then ignore the token. 1649 (innerHTML case) */ 1650 if(count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') { 1651 // Ignore 1652 1653 /* Otherwise, for each attribute on the token, check to see 1654 if the attribute is already present on the body element (the 1655 second element) on the stack of open elements. If it is not, 1656 add the attribute and its corresponding value to that 1657 element. 
*/ 1658 } else { 1659 foreach($token['attr'] as $attr) { 1660 if(!$this->stack[1]->hasAttribute($attr['name'])) { 1661 $this->stack[1]->setAttribute($attr['name'], $attr['value']); 1662 } 1663 } 1664 } 1665 break; 1666 1667 /* A start tag whose tag name is one of: "address", 1668 "blockquote", "center", "dir", "div", "dl", "fieldset", 1669 "listing", "menu", "ol", "p", "ul" */ 1670 case 'address': case 'blockquote': case 'center': case 'dir': 1671 case 'div': case 'dl': case 'fieldset': case 'listing': 1672 case 'menu': case 'ol': case 'p': case 'ul': 1673 /* If the stack of open elements has a p element in scope, 1674 then act as if an end tag with the tag name p had been 1675 seen. */ 1676 if($this->elementInScope('p')) { 1677 $this->emitToken(array( 1678 'name' => 'p', 1679 'type' => HTML5::ENDTAG 1680 )); 1681 } 1682 1683 /* Insert an HTML element for the token. */ 1684 $this->insertElement($token); 1685 break; 1686 1687 /* A start tag whose tag name is "form" */ 1688 case 'form': 1689 /* If the form element pointer is not null, ignore the 1690 token with a parse error. */ 1691 if($this->form_pointer !== null) { 1692 // Ignore. 1693 1694 /* Otherwise: */ 1695 } else { 1696 /* If the stack of open elements has a p element in 1697 scope, then act as if an end tag with the tag name p 1698 had been seen. */ 1699 if($this->elementInScope('p')) { 1700 $this->emitToken(array( 1701 'name' => 'p', 1702 'type' => HTML5::ENDTAG 1703 )); 1704 } 1705 1706 /* Insert an HTML element for the token, and set the 1707 form element pointer to point to the element created. */ 1708 $element = $this->insertElement($token); 1709 $this->form_pointer = $element; 1710 } 1711 break; 1712 1713 /* A start tag whose tag name is "li", "dd" or "dt" */ 1714 case 'li': case 'dd': case 'dt': 1715 /* If the stack of open elements has a p element in scope, 1716 then act as if an end tag with the tag name p had been 1717 seen. 
*/ 1718 if($this->elementInScope('p')) { 1719 $this->emitToken(array( 1720 'name' => 'p', 1721 'type' => HTML5::ENDTAG 1722 )); 1723 } 1724 1725 $stack_length = count($this->stack) - 1; 1726 1727 for($n = $stack_length; 0 <= $n; $n--) { 1728 /* 1. Initialise node to be the current node (the 1729 bottommost node of the stack). */ 1730 $stop = false; 1731 $node = $this->stack[$n]; 1732 $cat = $this->getElementCategory($node->tagName); 1733 1734 /* 2. If node is an li, dd or dt element, then pop all 1735 the nodes from the current node up to node, including 1736 node, then stop this algorithm. */ 1737 if($token['name'] === $node->tagName || ($token['name'] !== 'li' 1738 && ($node->tagName === 'dd' || $node->tagName === 'dt'))) { 1739 for($x = $stack_length; $x >= $n ; $x--) { 1740 array_pop($this->stack); 1741 } 1742 1743 break; 1744 } 1745 1746 /* 3. If node is not in the formatting category, and is 1747 not in the phrasing category, and is not an address or 1748 div element, then stop this algorithm. */ 1749 if($cat !== self::FORMATTING && $cat !== self::PHRASING && 1750 $node->tagName !== 'address' && $node->tagName !== 'div') { 1751 break; 1752 } 1753 } 1754 1755 /* Finally, insert an HTML element with the same tag 1756 name as the token's. */ 1757 $this->insertElement($token); 1758 break; 1759 1760 /* A start tag token whose tag name is "plaintext" */ 1761 case 'plaintext': 1762 /* If the stack of open elements has a p element in scope, 1763 then act as if an end tag with the tag name p had been 1764 seen. */ 1765 if($this->elementInScope('p')) { 1766 $this->emitToken(array( 1767 'name' => 'p', 1768 'type' => HTML5::ENDTAG 1769 )); 1770 } 1771 1772 /* Insert an HTML element for the token. 
*/ 1773 $this->insertElement($token); 1774 1775 return HTML5::PLAINTEXT; 1776 break; 1777 1778 /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4", 1779 "h5", "h6" */ 1780 case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6': 1781 /* If the stack of open elements has a p element in scope, 1782 then act as if an end tag with the tag name p had been seen. */ 1783 if($this->elementInScope('p')) { 1784 $this->emitToken(array( 1785 'name' => 'p', 1786 'type' => HTML5::ENDTAG 1787 )); 1788 } 1789 1790 /* If the stack of open elements has in scope an element whose 1791 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then 1792 this is a parse error; pop elements from the stack until an 1793 element with one of those tag names has been popped from the 1794 stack. */ 1795 while($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) { 1796 array_pop($this->stack); 1797 } 1798 1799 /* Insert an HTML element for the token. */ 1800 $this->insertElement($token); 1801 break; 1802 1803 /* A start tag whose tag name is "a" */ 1804 case 'a': 1805 /* If the list of active formatting elements contains 1806 an element whose tag name is "a" between the end of the 1807 list and the last marker on the list (or the start of 1808 the list if there is no marker on the list), then this 1809 is a parse error; act as if an end tag with the tag name 1810 "a" had been seen, then remove that element from the list 1811 of active formatting elements and the stack of open 1812 elements if the end tag didn't already remove it (it 1813 might not have if the element is not in table scope). 
*/ 1814 $leng = count($this->a_formatting); 1815 1816 for($n = $leng - 1; $n >= 0; $n--) { 1817 if($this->a_formatting[$n] === self::MARKER) { 1818 break; 1819 1820 } elseif($this->a_formatting[$n]->nodeName === 'a') { 1821 $this->emitToken(array( 1822 'name' => 'a', 1823 'type' => HTML5::ENDTAG 1824 )); 1825 break; 1826 } 1827 } 1828 1829 /* Reconstruct the active formatting elements, if any. */ 1830 $this->reconstructActiveFormattingElements(); 1831 1832 /* Insert an HTML element for the token. */ 1833 $el = $this->insertElement($token); 1834 1835 /* Add that element to the list of active formatting 1836 elements. */ 1837 $this->a_formatting[] = $el; 1838 break; 1839 1840 /* A start tag whose tag name is one of: "b", "big", "em", "font", 1841 "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */ 1842 case 'b': case 'big': case 'em': case 'font': case 'i': 1843 case 'nobr': case 's': case 'small': case 'strike': 1844 case 'strong': case 'tt': case 'u': 1845 /* Reconstruct the active formatting elements, if any. */ 1846 $this->reconstructActiveFormattingElements(); 1847 1848 /* Insert an HTML element for the token. */ 1849 $el = $this->insertElement($token); 1850 1851 /* Add that element to the list of active formatting 1852 elements. */ 1853 $this->a_formatting[] = $el; 1854 break; 1855 1856 /* A start tag token whose tag name is "button" */ 1857 case 'button': 1858 /* If the stack of open elements has a button element in scope, 1859 then this is a parse error; act as if an end tag with the tag 1860 name "button" had been seen, then reprocess the token. (We don't 1861 do that. Unnecessary.) */ 1862 if($this->elementInScope('button')) { 1863 $this->inBody(array( 1864 'name' => 'button', 1865 'type' => HTML5::ENDTAG 1866 )); 1867 } 1868 1869 /* Reconstruct the active formatting elements, if any. */ 1870 $this->reconstructActiveFormattingElements(); 1871 1872 /* Insert an HTML element for the token. 
*/ 1873 $this->insertElement($token); 1874 1875 /* Insert a marker at the end of the list of active 1876 formatting elements. */ 1877 $this->a_formatting[] = self::MARKER; 1878 break; 1879 1880 /* A start tag token whose tag name is one of: "marquee", "object" */ 1881 case 'marquee': case 'object': 1882 /* Reconstruct the active formatting elements, if any. */ 1883 $this->reconstructActiveFormattingElements(); 1884 1885 /* Insert an HTML element for the token. */ 1886 $this->insertElement($token); 1887 1888 /* Insert a marker at the end of the list of active 1889 formatting elements. */ 1890 $this->a_formatting[] = self::MARKER; 1891 break; 1892 1893 /* A start tag token whose tag name is "xmp" */ 1894 case 'xmp': 1895 /* Reconstruct the active formatting elements, if any. */ 1896 $this->reconstructActiveFormattingElements(); 1897 1898 /* Insert an HTML element for the token. */ 1899 $this->insertElement($token); 1900 1901 /* Switch the content model flag to the CDATA state. */ 1902 return HTML5::CDATA; 1903 break; 1904 1905 /* A start tag whose tag name is "table" */ 1906 case 'table': 1907 /* If the stack of open elements has a p element in scope, 1908 then act as if an end tag with the tag name p had been seen. */ 1909 if($this->elementInScope('p')) { 1910 $this->emitToken(array( 1911 'name' => 'p', 1912 'type' => HTML5::ENDTAG 1913 )); 1914 } 1915 1916 /* Insert an HTML element for the token. */ 1917 $this->insertElement($token); 1918 1919 /* Change the insertion mode to "in table". */ 1920 $this->mode = self::IN_TABLE; 1921 break; 1922 1923 /* A start tag whose tag name is one of: "area", "basefont", 1924 "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */ 1925 case 'area': case 'basefont': case 'bgsound': case 'br': 1926 case 'embed': case 'img': case 'param': case 'spacer': 1927 case 'wbr': 1928 /* Reconstruct the active formatting elements, if any. 
*/ 1929 $this->reconstructActiveFormattingElements(); 1930 1931 /* Insert an HTML element for the token. */ 1932 $this->insertElement($token); 1933 1934 /* Immediately pop the current node off the stack of open elements. */ 1935 array_pop($this->stack); 1936 break; 1937 1938 /* A start tag whose tag name is "hr" */ 1939 case 'hr': 1940 /* If the stack of open elements has a p element in scope, 1941 then act as if an end tag with the tag name p had been seen. */ 1942 if($this->elementInScope('p')) { 1943 $this->emitToken(array( 1944 'name' => 'p', 1945 'type' => HTML5::ENDTAG 1946 )); 1947 } 1948 1949 /* Insert an HTML element for the token. */ 1950 $this->insertElement($token); 1951 1952 /* Immediately pop the current node off the stack of open elements. */ 1953 array_pop($this->stack); 1954 break; 1955 1956 /* A start tag whose tag name is "image" */ 1957 case 'image': 1958 /* Parse error. Change the token's tag name to "img" and 1959 reprocess it. (Don't ask.) */ 1960 $token['name'] = 'img'; 1961 return $this->inBody($token); 1962 break; 1963 1964 /* A start tag whose tag name is "input" */ 1965 case 'input': 1966 /* Reconstruct the active formatting elements, if any. */ 1967 $this->reconstructActiveFormattingElements(); 1968 1969 /* Insert an input element for the token. */ 1970 $element = $this->insertElement($token, false); 1971 1972 /* If the form element pointer is not null, then associate the 1973 input element with the form element pointed to by the form 1974 element pointer. */ 1975 $this->form_pointer !== null 1976 ? $this->form_pointer->appendChild($element) 1977 : end($this->stack)->appendChild($element); 1978 1979 /* Pop that input element off the stack of open elements. */ 1980 array_pop($this->stack); 1981 break; 1982 1983 /* A start tag whose tag name is "isindex" */ 1984 case 'isindex': 1985 /* Parse error. */ 1986 // w/e 1987 1988 /* If the form element pointer is not null, 1989 then ignore the token. 
*/ 1990 if($this->form_pointer === null) { 1991 /* Act as if a start tag token with the tag name "form" had 1992 been seen. */ 1993 $this->inBody(array( 1994 'name' => 'body', 1995 'type' => HTML5::STARTTAG, 1996 'attr' => array() 1997 )); 1998 1999 /* Act as if a start tag token with the tag name "hr" had 2000 been seen. */ 2001 $this->inBody(array( 2002 'name' => 'hr', 2003 'type' => HTML5::STARTTAG, 2004 'attr' => array() 2005 )); 2006 2007 /* Act as if a start tag token with the tag name "p" had 2008 been seen. */ 2009 $this->inBody(array( 2010 'name' => 'p', 2011 'type' => HTML5::STARTTAG, 2012 'attr' => array() 2013 )); 2014 2015 /* Act as if a start tag token with the tag name "label" 2016 had been seen. */ 2017 $this->inBody(array( 2018 'name' => 'label', 2019 'type' => HTML5::STARTTAG, 2020 'attr' => array() 2021 )); 2022 2023 /* Act as if a stream of character tokens had been seen. */ 2024 $this->insertText('This is a searchable index. '. 2025 'Insert your search keywords here: '); 2026 2027 /* Act as if a start tag token with the tag name "input" 2028 had been seen, with all the attributes from the "isindex" 2029 token, except with the "name" attribute set to the value 2030 "isindex" (ignoring any explicit "name" attribute). */ 2031 $attr = $token['attr']; 2032 $attr[] = array('name' => 'name', 'value' => 'isindex'); 2033 2034 $this->inBody(array( 2035 'name' => 'input', 2036 'type' => HTML5::STARTTAG, 2037 'attr' => $attr 2038 )); 2039 2040 /* Act as if a stream of character tokens had been seen 2041 (see below for what they should say). */ 2042 $this->insertText('This is a searchable index. '. 2043 'Insert your search keywords here: '); 2044 2045 /* Act as if an end tag token with the tag name "label" 2046 had been seen. */ 2047 $this->inBody(array( 2048 'name' => 'label', 2049 'type' => HTML5::ENDTAG 2050 )); 2051 2052 /* Act as if an end tag token with the tag name "p" had 2053 been seen. 
*/ 2054 $this->inBody(array( 2055 'name' => 'p', 2056 'type' => HTML5::ENDTAG 2057 )); 2058 2059 /* Act as if a start tag token with the tag name "hr" had 2060 been seen. */ 2061 $this->inBody(array( 2062 'name' => 'hr', 2063 'type' => HTML5::ENDTAG 2064 )); 2065 2066 /* Act as if an end tag token with the tag name "form" had 2067 been seen. */ 2068 $this->inBody(array( 2069 'name' => 'form', 2070 'type' => HTML5::ENDTAG 2071 )); 2072 } 2073 break; 2074 2075 /* A start tag whose tag name is "textarea" */ 2076 case 'textarea': 2077 $this->insertElement($token); 2078 2079 /* Switch the tokeniser's content model flag to the 2080 RCDATA state. */ 2081 return HTML5::RCDATA; 2082 break; 2083 2084 /* A start tag whose tag name is one of: "iframe", "noembed", 2085 "noframes" */ 2086 case 'iframe': case 'noembed': case 'noframes': 2087 $this->insertElement($token); 2088 2089 /* Switch the tokeniser's content model flag to the CDATA state. */ 2090 return HTML5::CDATA; 2091 break; 2092 2093 /* A start tag whose tag name is "select" */ 2094 case 'select': 2095 /* Reconstruct the active formatting elements, if any. */ 2096 $this->reconstructActiveFormattingElements(); 2097 2098 /* Insert an HTML element for the token. */ 2099 $this->insertElement($token); 2100 2101 /* Change the insertion mode to "in select". */ 2102 $this->mode = self::IN_SELECT; 2103 break; 2104 2105 /* A start or end tag whose tag name is one of: "caption", "col", 2106 "colgroup", "frame", "frameset", "head", "option", "optgroup", 2107 "tbody", "td", "tfoot", "th", "thead", "tr". */ 2108 case 'caption': case 'col': case 'colgroup': case 'frame': 2109 case 'frameset': case 'head': case 'option': case 'optgroup': 2110 case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead': 2111 case 'tr': 2112 // Parse error. Ignore the token. 
2113 break; 2114 2115 /* A start or end tag whose tag name is one of: "event-source", 2116 "section", "nav", "article", "aside", "header", "footer", 2117 "datagrid", "command" */ 2118 case 'event-source': case 'section': case 'nav': case 'article': 2119 case 'aside': case 'header': case 'footer': case 'datagrid': 2120 case 'command': 2121 // Work in progress! 2122 break; 2123 2124 /* A start tag token not covered by the previous entries */ 2125 default: 2126 /* Reconstruct the active formatting elements, if any. */ 2127 $this->reconstructActiveFormattingElements(); 2128 2129 $this->insertElement($token); 2130 break; 2131 } 2132 break; 2133 2134 case HTML5::ENDTAG: 2135 switch($token['name']) { 2136 /* An end tag with the tag name "body" */ 2137 case 'body': 2138 /* If the second element in the stack of open elements is 2139 not a body element, this is a parse error. Ignore the token. 2140 (innerHTML case) */ 2141 if(count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') { 2142 // Ignore. 2143 2144 /* If the current node is not the body element, then this 2145 is a parse error. */ 2146 } elseif(end($this->stack)->nodeName !== 'body') { 2147 // Parse error. 2148 } 2149 2150 /* Change the insertion mode to "after body". */ 2151 $this->mode = self::AFTER_BODY; 2152 break; 2153 2154 /* An end tag with the tag name "html" */ 2155 case 'html': 2156 /* Act as if an end tag with tag name "body" had been seen, 2157 then, if that token wasn't ignored, reprocess the current 2158 token. 
*/ 2159 $this->inBody(array( 2160 'name' => 'body', 2161 'type' => HTML5::ENDTAG 2162 )); 2163 2164 return $this->afterBody($token); 2165 break; 2166 2167 /* An end tag whose tag name is one of: "address", "blockquote", 2168 "center", "dir", "div", "dl", "fieldset", "listing", "menu", 2169 "ol", "pre", "ul" */ 2170 case 'address': case 'blockquote': case 'center': case 'dir': 2171 case 'div': case 'dl': case 'fieldset': case 'listing': 2172 case 'menu': case 'ol': case 'pre': case 'ul': 2173 /* If the stack of open elements has an element in scope 2174 with the same tag name as that of the token, then generate 2175 implied end tags. */ 2176 if($this->elementInScope($token['name'])) { 2177 $this->generateImpliedEndTags(); 2178 2179 /* Now, if the current node is not an element with 2180 the same tag name as that of the token, then this 2181 is a parse error. */ 2182 // w/e 2183 2184 /* If the stack of open elements has an element in 2185 scope with the same tag name as that of the token, 2186 then pop elements from this stack until an element 2187 with that tag name has been popped from the stack. */ 2188 for($n = count($this->stack) - 1; $n >= 0; $n--) { 2189 if($this->stack[$n]->nodeName === $token['name']) { 2190 $n = -1; 2191 } 2192 2193 array_pop($this->stack); 2194 } 2195 } 2196 break; 2197 2198 /* An end tag whose tag name is "form" */ 2199 case 'form': 2200 /* If the stack of open elements has an element in scope 2201 with the same tag name as that of the token, then generate 2202 implied end tags. */ 2203 if($this->elementInScope($token['name'])) { 2204 $this->generateImpliedEndTags(); 2205 2206 } 2207 2208 if(end($this->stack)->nodeName !== $token['name']) { 2209 /* Now, if the current node is not an element with the 2210 same tag name as that of the token, then this is a parse 2211 error. 
*/ 2212 // w/e 2213 2214 } else { 2215 /* Otherwise, if the current node is an element with 2216 the same tag name as that of the token pop that element 2217 from the stack. */ 2218 array_pop($this->stack); 2219 } 2220 2221 /* In any case, set the form element pointer to null. */ 2222 $this->form_pointer = null; 2223 break; 2224 2225 /* An end tag whose tag name is "p" */ 2226 case 'p': 2227 /* If the stack of open elements has a p element in scope, 2228 then generate implied end tags, except for p elements. */ 2229 if($this->elementInScope('p')) { 2230 $this->generateImpliedEndTags(array('p')); 2231 2232 /* If the current node is not a p element, then this is 2233 a parse error. */ 2234 // k 2235 2236 /* If the stack of open elements has a p element in 2237 scope, then pop elements from this stack until the stack 2238 no longer has a p element in scope. */ 2239 for($n = count($this->stack) - 1; $n >= 0; $n--) { 2240 if($this->elementInScope('p')) { 2241 array_pop($this->stack); 2242 2243 } else { 2244 break; 2245 } 2246 } 2247 } 2248 break; 2249 2250 /* An end tag whose tag name is "dd", "dt", or "li" */ 2251 case 'dd': case 'dt': case 'li': 2252 /* If the stack of open elements has an element in scope 2253 whose tag name matches the tag name of the token, then 2254 generate implied end tags, except for elements with the 2255 same tag name as the token. */ 2256 if($this->elementInScope($token['name'])) { 2257 $this->generateImpliedEndTags(array($token['name'])); 2258 2259 /* If the current node is not an element with the same 2260 tag name as the token, then this is a parse error. */ 2261 // w/e 2262 2263 /* If the stack of open elements has an element in scope 2264 whose tag name matches the tag name of the token, then 2265 pop elements from this stack until an element with that 2266 tag name has been popped from the stack. 
*/ 2267 for($n = count($this->stack) - 1; $n >= 0; $n--) { 2268 if($this->stack[$n]->nodeName === $token['name']) { 2269 $n = -1; 2270 } 2271 2272 array_pop($this->stack); 2273 } 2274 } 2275 break; 2276 2277 /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4", 2278 "h5", "h6" */ 2279 case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6': 2280 $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'); 2281 2282 /* If the stack of open elements has in scope an element whose 2283 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then 2284 generate implied end tags. */ 2285 if($this->elementInScope($elements)) { 2286 $this->generateImpliedEndTags(); 2287 2288 /* Now, if the current node is not an element with the same 2289 tag name as that of the token, then this is a parse error. */ 2290 // w/e 2291 2292 /* If the stack of open elements has in scope an element 2293 whose tag name is one of "h1", "h2", "h3", "h4", "h5", or 2294 "h6", then pop elements from the stack until an element 2295 with one of those tag names has been popped from the stack. */ 2296 while($this->elementInScope($elements)) { 2297 array_pop($this->stack); 2298 } 2299 } 2300 break; 2301 2302 /* An end tag whose tag name is one of: "a", "b", "big", "em", 2303 "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */ 2304 case 'a': case 'b': case 'big': case 'em': case 'font': 2305 case 'i': case 'nobr': case 's': case 'small': case 'strike': 2306 case 'strong': case 'tt': case 'u': 2307 /* 1. Let the formatting element be the last element in 2308 the list of active formatting elements that: 2309 * is between the end of the list and the last scope 2310 marker in the list, if any, or the start of the list 2311 otherwise, and 2312 * has the same tag name as the token. 
2313 */ 2314 while(true) { 2315 for($a = count($this->a_formatting) - 1; $a >= 0; $a--) { 2316 if($this->a_formatting[$a] === self::MARKER) { 2317 break; 2318 2319 } elseif($this->a_formatting[$a]->tagName === $token['name']) { 2320 $formatting_element = $this->a_formatting[$a]; 2321 $in_stack = in_array($formatting_element, $this->stack, true); 2322 $fe_af_pos = $a; 2323 break; 2324 } 2325 } 2326 2327 /* If there is no such node, or, if that node is 2328 also in the stack of open elements but the element 2329 is not in scope, then this is a parse error. Abort 2330 these steps. The token is ignored. */ 2331 if(!isset($formatting_element) || ($in_stack && 2332 !$this->elementInScope($token['name']))) { 2333 break; 2334 2335 /* Otherwise, if there is such a node, but that node 2336 is not in the stack of open elements, then this is a 2337 parse error; remove the element from the list, and 2338 abort these steps. */ 2339 } elseif(isset($formatting_element) && !$in_stack) { 2340 unset($this->a_formatting[$fe_af_pos]); 2341 $this->a_formatting = array_merge($this->a_formatting); 2342 break; 2343 } 2344 2345 /* 2. Let the furthest block be the topmost node in the 2346 stack of open elements that is lower in the stack 2347 than the formatting element, and is not an element in 2348 the phrasing or formatting categories. There might 2349 not be one. */ 2350 $fe_s_pos = array_search($formatting_element, $this->stack, true); 2351 $length = count($this->stack); 2352 2353 for($s = $fe_s_pos + 1; $s < $length; $s++) { 2354 $category = $this->getElementCategory($this->stack[$s]->nodeName); 2355 2356 if($category !== self::PHRASING && $category !== self::FORMATTING) { 2357 $furthest_block = $this->stack[$s]; 2358 } 2359 } 2360 2361 /* 3. 
If there is no furthest block, then the UA must 2362 skip the subsequent steps and instead just pop all 2363 the nodes from the bottom of the stack of open 2364 elements, from the current node up to the formatting 2365 element, and remove the formatting element from the 2366 list of active formatting elements. */ 2367 if(!isset($furthest_block)) { 2368 for($n = $length - 1; $n >= $fe_s_pos; $n--) { 2369 array_pop($this->stack); 2370 } 2371 2372 unset($this->a_formatting[$fe_af_pos]); 2373 $this->a_formatting = array_merge($this->a_formatting); 2374 break; 2375 } 2376 2377 /* 4. Let the common ancestor be the element 2378 immediately above the formatting element in the stack 2379 of open elements. */ 2380 $common_ancestor = $this->stack[$fe_s_pos - 1]; 2381 2382 /* 5. If the furthest block has a parent node, then 2383 remove the furthest block from its parent node. */ 2384 if($furthest_block->parentNode !== null) { 2385 $furthest_block->parentNode->removeChild($furthest_block); 2386 } 2387 2388 /* 6. Let a bookmark note the position of the 2389 formatting element in the list of active formatting 2390 elements relative to the elements on either side 2391 of it in the list. */ 2392 $bookmark = $fe_af_pos; 2393 2394 /* 7. Let node and last node be the furthest block. 2395 Follow these steps: */ 2396 $node = $furthest_block; 2397 $last_node = $furthest_block; 2398 2399 while(true) { 2400 for($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) { 2401 /* 7.1 Let node be the element immediately 2402 prior to node in the stack of open elements. */ 2403 $node = $this->stack[$n]; 2404 2405 /* 7.2 If node is not in the list of active 2406 formatting elements, then remove node from 2407 the stack of open elements and then go back 2408 to step 1. 
*/ 2409 if(!in_array($node, $this->a_formatting, true)) { 2410 unset($this->stack[$n]); 2411 $this->stack = array_merge($this->stack); 2412 2413 } else { 2414 break; 2415 } 2416 } 2417 2418 /* 7.3 Otherwise, if node is the formatting 2419 element, then go to the next step in the overall 2420 algorithm. */ 2421 if($node === $formatting_element) { 2422 break; 2423 2424 /* 7.4 Otherwise, if last node is the furthest 2425 block, then move the aforementioned bookmark to 2426 be immediately after the node in the list of 2427 active formatting elements. */ 2428 } elseif($last_node === $furthest_block) { 2429 $bookmark = array_search($node, $this->a_formatting, true) + 1; 2430 } 2431 2432 /* 7.5 If node has any children, perform a 2433 shallow clone of node, replace the entry for 2434 node in the list of active formatting elements 2435 with an entry for the clone, replace the entry 2436 for node in the stack of open elements with an 2437 entry for the clone, and let node be the clone. */ 2438 if($node->hasChildNodes()) { 2439 $clone = $node->cloneNode(); 2440 $s_pos = array_search($node, $this->stack, true); 2441 $a_pos = array_search($node, $this->a_formatting, true); 2442 2443 $this->stack[$s_pos] = $clone; 2444 $this->a_formatting[$a_pos] = $clone; 2445 $node = $clone; 2446 } 2447 2448 /* 7.6 Insert last node into node, first removing 2449 it from its previous parent node if any. */ 2450 if($last_node->parentNode !== null) { 2451 $last_node->parentNode->removeChild($last_node); 2452 } 2453 2454 $node->appendChild($last_node); 2455 2456 /* 7.7 Let last node be node. */ 2457 $last_node = $node; 2458 } 2459 2460 /* 8. Insert whatever last node ended up being in 2461 the previous step into the common ancestor node, 2462 first removing it from its previous parent node if 2463 any. */ 2464 if($last_node->parentNode !== null) { 2465 $last_node->parentNode->removeChild($last_node); 2466 } 2467 2468 $common_ancestor->appendChild($last_node); 2469 2470 /* 9. 
Perform a shallow clone of the formatting 2471 element. */ 2472 $clone = $formatting_element->cloneNode(); 2473 2474 /* 10. Take all of the child nodes of the furthest 2475 block and append them to the clone created in the 2476 last step. */ 2477 while($furthest_block->hasChildNodes()) { 2478 $child = $furthest_block->firstChild; 2479 $furthest_block->removeChild($child); 2480 $clone->appendChild($child); 2481 } 2482 2483 /* 11. Append that clone to the furthest block. */ 2484 $furthest_block->appendChild($clone); 2485 2486 /* 12. Remove the formatting element from the list 2487 of active formatting elements, and insert the clone 2488 into the list of active formatting elements at the 2489 position of the aforementioned bookmark. */ 2490 $fe_af_pos = array_search($formatting_element, $this->a_formatting, true); 2491 unset($this->a_formatting[$fe_af_pos]); 2492 $this->a_formatting = array_merge($this->a_formatting); 2493 2494 $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1); 2495 $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting)); 2496 $this->a_formatting = array_merge($af_part1, array($clone), $af_part2); 2497 2498 /* 13. Remove the formatting element from the stack 2499 of open elements, and insert the clone into the stack 2500 of open elements immediately after (i.e. in a more 2501 deeply nested position than) the position of the 2502 furthest block in that stack. */ 2503 $fe_s_pos = array_search($formatting_element, $this->stack, true); 2504 $fb_s_pos = array_search($furthest_block, $this->stack, true); 2505 unset($this->stack[$fe_s_pos]); 2506 2507 $s_part1 = array_slice($this->stack, 0, $fb_s_pos); 2508 $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack)); 2509 $this->stack = array_merge($s_part1, array($clone), $s_part2); 2510 2511 /* 14. Jump back to step 1 in this series of steps. 
*/ 2512 unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block); 2513 } 2514 break; 2515 2516 /* An end tag token whose tag name is one of: "button", 2517 "marquee", "object" */ 2518 case 'button': case 'marquee': case 'object': 2519 /* If the stack of open elements has an element in scope whose 2520 tag name matches the tag name of the token, then generate implied 2521 tags. */ 2522 if($this->elementInScope($token['name'])) { 2523 $this->generateImpliedEndTags(); 2524 2525 /* Now, if the current node is not an element with the same 2526 tag name as the token, then this is a parse error. */ 2527 // k 2528 2529 /* Now, if the stack of open elements has an element in scope 2530 whose tag name matches the tag name of the token, then pop 2531 elements from the stack until that element has been popped from 2532 the stack, and clear the list of active formatting elements up 2533 to the last marker. */ 2534 for($n = count($this->stack) - 1; $n >= 0; $n--) { 2535 if($this->stack[$n]->nodeName === $token['name']) { 2536 $n = -1; 2537 } 2538 2539 array_pop($this->stack); 2540 } 2541 2542 $marker = end(array_keys($this->a_formatting, self::MARKER, true)); 2543 2544 for($n = count($this->a_formatting) - 1; $n > $marker; $n--) { 2545 array_pop($this->a_formatting); 2546 } 2547 } 2548 break; 2549 2550 /* Or an end tag whose tag name is one of: "area", "basefont", 2551 "bgsound", "br", "embed", "hr", "iframe", "image", "img", 2552 "input", "isindex", "noembed", "noframes", "param", "select", 2553 "spacer", "table", "textarea", "wbr" */ 2554 case 'area': case 'basefont': case 'bgsound': case 'br': 2555 case 'embed': case 'hr': case 'iframe': case 'image': 2556 case 'img': case 'input': case 'isindex': case 'noembed': 2557 case 'noframes': case 'param': case 'select': case 'spacer': 2558 case 'table': case 'textarea': case 'wbr': 2559 // Parse error. Ignore the token. 
2560 break; 2561 2562 /* An end tag token not covered by the previous entries */ 2563 default: 2564 for($n = count($this->stack) - 1; $n >= 0; $n--) { 2565 /* Initialise node to be the current node (the bottommost 2566 node of the stack). */ 2567 $node = end($this->stack); 2568 2569 /* If node has the same tag name as the end tag token, 2570 then: */ 2571 if($token['name'] === $node->nodeName) { 2572 /* Generate implied end tags. */ 2573 $this->generateImpliedEndTags(); 2574 2575 /* If the tag name of the end tag token does not 2576 match the tag name of the current node, this is a 2577 parse error. */ 2578 // k 2579 2580 /* Pop all the nodes from the current node up to 2581 node, including node, then stop this algorithm. */ 2582 for($x = count($this->stack) - $n; $x >= $n; $x--) { 2583 array_pop($this->stack); 2584 } 2585 2586 } else { 2587 $category = $this->getElementCategory($node); 2588 2589 if($category !== self::SPECIAL && $category !== self::SCOPING) { 2590 /* Otherwise, if node is in neither the formatting 2591 category nor the phrasing category, then this is a 2592 parse error. Stop this algorithm. The end tag token 2593 is ignored. */ 2594 return false; 2595 } 2596 } 2597 } 2598 break; 2599 } 2600 break; 2601 } 2602 } 2603 2604 private function inTable($token) 2605 { 2606 $clear = array('html', 'table'); 2607 2608 /* A character token that is one of one of U+0009 CHARACTER TABULATION, 2609 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), 2610 or U+0020 SPACE */ 2611 if($token['type'] === HTML5::CHARACTR && 2612 preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) { 2613 /* Append the character to the current node. */ 2614 $text = $this->dom->createTextNode($token['data']); 2615 end($this->stack)->appendChild($text); 2616 2617 /* A comment token */ 2618 } elseif($token['type'] === HTML5::COMMENT) { 2619 /* Append a Comment node to the current node with the data 2620 attribute set to the data given in the comment token. 
*/ 2621 $comment = $this->dom->createComment($token['data']); 2622 end($this->stack)->appendChild($comment); 2623 2624 /* A start tag whose tag name is "caption" */ 2625 } elseif($token['type'] === HTML5::STARTTAG && 2626 $token['name'] === 'caption') { 2627 /* Clear the stack back to a table context. */ 2628 $this->clearStackToTableContext($clear); 2629 2630 /* Insert a marker at the end of the list of active 2631 formatting elements. */ 2632 $this->a_formatting[] = self::MARKER; 2633 2634 /* Insert an HTML element for the token, then switch the 2635 insertion mode to "in caption". */ 2636 $this->insertElement($token); 2637 $this->mode = self::IN_CAPTION; 2638 2639 /* A start tag whose tag name is "colgroup" */ 2640 } elseif($token['type'] === HTML5::STARTTAG && 2641 $token['name'] === 'colgroup') { 2642 /* Clear the stack back to a table context. */ 2643 $this->clearStackToTableContext($clear); 2644 2645 /* Insert an HTML element for the token, then switch the 2646 insertion mode to "in column group". */ 2647 $this->insertElement($token); 2648 $this->mode = self::IN_CGROUP; 2649 2650 /* A start tag whose tag name is "col" */ 2651 } elseif($token['type'] === HTML5::STARTTAG && 2652 $token['name'] === 'col') { 2653 $this->inTable(array( 2654 'name' => 'colgroup', 2655 'type' => HTML5::STARTTAG, 2656 'attr' => array() 2657 )); 2658 2659 $this->inColumnGroup($token); 2660 2661 /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */ 2662 } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'], 2663 array('tbody', 'tfoot', 'thead'))) { 2664 /* Clear the stack back to a table context. */ 2665 $this->clearStackToTableContext($clear); 2666 2667 /* Insert an HTML element for the token, then switch the insertion 2668 mode to "in table body". 
*/ 2669 $this->insertElement($token); 2670 $this->mode = self::IN_TBODY; 2671 2672 /* A start tag whose tag name is one of: "td", "th", "tr" */ 2673 } elseif($token['type'] === HTML5::STARTTAG && 2674 in_array($token['name'], array('td', 'th', 'tr'))) { 2675 /* Act as if a start tag token with the tag name "tbody" had been 2676 seen, then reprocess the current token. */ 2677 $this->inTable(array( 2678 'name' => 'tbody', 2679 'type' => HTML5::STARTTAG, 2680 'attr' => array() 2681 )); 2682 2683 return $this->inTableBody($token); 2684 2685 /* A start tag whose tag name is "table" */ 2686 } elseif($token['type'] === HTML5::STARTTAG && 2687 $token['name'] === 'table') { 2688 /* Parse error. Act as if an end tag token with the tag name "table" 2689 had been seen, then, if that token wasn't ignored, reprocess the 2690 current token. */ 2691 $this->inTable(array( 2692 'name' => 'table', 2693 'type' => HTML5::ENDTAG 2694 )); 2695 2696 return $this->mainPhase($token); 2697 2698 /* An end tag whose tag name is "table" */ 2699 } elseif($token['type'] === HTML5::ENDTAG && 2700 $token['name'] === 'table') { 2701 /* If the stack of open elements does not have an element in table 2702 scope with the same tag name as the token, this is a parse error. 2703 Ignore the token. (innerHTML case) */ 2704 if(!$this->elementInScope($token['name'], true)) { 2705 return false; 2706 2707 /* Otherwise: */ 2708 } else { 2709 /* Generate implied end tags. */ 2710 $this->generateImpliedEndTags(); 2711 2712 /* Now, if the current node is not a table element, then this 2713 is a parse error. */ 2714 // w/e 2715 2716 /* Pop elements from this stack until a table element has been 2717 popped from the stack. */ 2718 while(true) { 2719 $current = end($this->stack)->nodeName; 2720 array_pop($this->stack); 2721 2722 if($current === 'table') { 2723 break; 2724 } 2725 } 2726 2727 /* Reset the insertion mode appropriately. 
*/ 2728 $this->resetInsertionMode(); 2729 } 2730 2731 /* An end tag whose tag name is one of: "body", "caption", "col", 2732 "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */ 2733 } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'], 2734 array('body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 2735 'tfoot', 'th', 'thead', 'tr'))) { 2736 // Parse error. Ignore the token. 2737 2738 /* Anything else */ 2739 } else { 2740 /* Parse error. Process the token as if the insertion mode was "in 2741 body", with the following exception: */ 2742 2743 /* If the current node is a table, tbody, tfoot, thead, or tr 2744 element, then, whenever a node would be inserted into the current 2745 node, it must instead be inserted into the foster parent element. */ 2746 if(in_array(end($this->stack)->nodeName, 2747 array('table', 'tbody', 'tfoot', 'thead', 'tr'))) { 2748 /* The foster parent element is the parent element of the last 2749 table element in the stack of open elements, if there is a 2750 table element and it has such a parent element. If there is no 2751 table element in the stack of open elements (innerHTML case), 2752 then the foster parent element is the first element in the 2753 stack of open elements (the html element). Otherwise, if there 2754 is a table element in the stack of open elements, but the last 2755 table element in the stack of open elements has no parent, or 2756 its parent node is not an element, then the foster parent 2757 element is the element before the last table element in the 2758 stack of open elements. 
*/ 2759 for($n = count($this->stack) - 1; $n >= 0; $n--) { 2760 if($this->stack[$n]->nodeName === 'table') { 2761 $table = $this->stack[$n]; 2762 break; 2763 } 2764 } 2765 2766 if(isset($table) && $table->parentNode !== null) { 2767 $this->foster_parent = $table->parentNode; 2768 2769 } elseif(!isset($table)) { 2770 $this->foster_parent = $this->stack[0]; 2771 2772 } elseif(isset($table) && ($table->parentNode === null || 2773 $table->parentNode->nodeType !== XML_ELEMENT_NODE)) { 2774 $this->foster_parent = $this->stack[$n - 1]; 2775 } 2776 } 2777 2778 $this->inBody($token); 2779 } 2780 } 2781 2782 private function inCaption($token) 2783 { 2784 /* An end tag whose tag name is "caption" */ 2785 if($token['type'] === HTML5::ENDTAG && $token['name'] === 'caption') { 2786 /* If the stack of open elements does not have an element in table 2787 scope with the same tag name as the token, this is a parse error. 2788 Ignore the token. (innerHTML case) */ 2789 if(!$this->elementInScope($token['name'], true)) { 2790 // Ignore 2791 2792 /* Otherwise: */ 2793 } else { 2794 /* Generate implied end tags. */ 2795 $this->generateImpliedEndTags(); 2796 2797 /* Now, if the current node is not a caption element, then this 2798 is a parse error. */ 2799 // w/e 2800 2801 /* Pop elements from this stack until a caption element has 2802 been popped from the stack. */ 2803 while(true) { 2804 $node = end($this->stack)->nodeName; 2805 array_pop($this->stack); 2806 2807 if($node === 'caption') { 2808 break; 2809 } 2810 } 2811 2812 /* Clear the list of active formatting elements up to the last 2813 marker. */ 2814 $this->clearTheActiveFormattingElementsUpToTheLastMarker(); 2815 2816 /* Switch the insertion mode to "in table". 
*/
                $this->mode = self::IN_TABLE;
            }

        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
        "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
        name is "table" */
        } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
        array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
        'thead', 'tr'))) || ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'table')) {
            /* Parse error. Act as if an end tag with the tag name "caption"
            had been seen, then, if that token wasn't ignored, reprocess the
            current token. */
            $this->inCaption(array(
                'name' => 'caption',
                'type' => HTML5::ENDTAG
            ));

            return $this->inTable($token);

        /* An end tag whose tag name is one of: "body", "col", "colgroup",
        "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
        // NOTE(review): "td" is listed above but is absent from the array
        // below, so </td> falls through to "in body" -- confirm and fix.
        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
        array('body', 'col', 'colgroup', 'html', 'tbody', 'tfoot', 'th',
        'thead', 'tr'))) {
            // Parse error. Ignore the token.

        /* Anything else */
        } else {
            /* Process the token as if the insertion mode was "in body". */
            $this->inBody($token);
        }
    }

    /**
     * Handles a token while the insertion mode is "in column group".
     *
     * @param array $token Token array. 'type' is always read; 'data' is read
     *                     for character/comment tokens, 'name' for tags.
     */
    private function inColumnGroup($token)
    {
        /* A character token that is one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE */
        if($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            /* Append the character to the current node. */
            $text = $this->dom->createTextNode($token['data']);
            end($this->stack)->appendChild($text);

        /* A comment token */
        } elseif($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the current node with the data
            attribute set to the data given in the comment token. */
            $comment = $this->dom->createComment($token['data']);
            end($this->stack)->appendChild($comment);

        /* A start tag whose tag name is "col" */
        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'col') {
            /* Insert a col element for the token. Immediately pop the current
            node off the stack of open elements. */
            $this->insertElement($token);
            array_pop($this->stack);

        /* An end tag whose tag name is "colgroup" */
        } elseif($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'colgroup') {
            /* If the current node is the root html element, then this is a
            parse error, ignore the token. (innerHTML case) */
            if(end($this->stack)->nodeName === 'html') {
                // Ignore

            /* Otherwise, pop the current node (which will be a colgroup
            element) from the stack of open elements. Switch the insertion
            mode to "in table". */
            } else {
                array_pop($this->stack);
                $this->mode = self::IN_TABLE;
            }

        /* An end tag whose tag name is "col" */
        } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'col') {
            /* Parse error. Ignore the token. */

        /* Anything else */
        } else {
            /* Act as if an end tag with the tag name "colgroup" had been seen,
            and then, if that token wasn't ignored, reprocess the current token. */

            // The implied end tag is ignored exactly when the current node is
            // the root html element (see the "colgroup" end tag branch above).
            $ignored = end($this->stack)->nodeName === 'html';

            $this->inColumnGroup(array(
                'name' => 'colgroup',
                'type' => HTML5::ENDTAG
            ));

            if(!$ignored) {
                return $this->inTable($token);
            }
        }
    }

    /**
     * Handles a token while the insertion mode is "in table body".
     *
     * @param array $token Token array; 'type' and 'name' are read.
     */
    private function inTableBody($token)
    {
        // Elements at which "clear the stack back to a table body context" stops.
        $clear = array('tbody', 'tfoot', 'thead', 'html');

        /* A start tag whose tag name is "tr" */
        if($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Insert a tr element for the token, then switch the insertion
            mode to "in row". */
            $this->insertElement($token);
            $this->mode = self::IN_ROW;

        /* A start tag whose tag name is one of: "th", "td" */
        } elseif($token['type'] === HTML5::STARTTAG &&
        ($token['name'] === 'th' || $token['name'] === 'td')) {
            /* Parse error. Act as if a start tag with the tag name "tr" had
            been seen, then reprocess the current token. */
            $this->inTableBody(array(
                'name' => 'tr',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            ));

            return $this->inRow($token);

        /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
        } elseif($token['type'] === HTML5::ENDTAG &&
        in_array($token['name'], array('tbody', 'tfoot', 'thead'))) {
            /* If the stack of open elements does not have an element in table
            scope with the same tag name as the token, this is a parse error.
            Ignore the token. */
            if(!$this->elementInScope($token['name'], true)) {
                // Ignore

            /* Otherwise: */
            } else {
                /* Clear the stack back to a table body context. */
                $this->clearStackToTableContext($clear);

                /* Pop the current node from the stack of open elements. Switch
                the insertion mode to "in table". */
                array_pop($this->stack);
                $this->mode = self::IN_TABLE;
            }

        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
        "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */
        } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
        array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead'))) ||
        ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')) {
            /* If the stack of open elements does not have a tbody, thead, or
            tfoot element in table scope, this is a parse error. Ignore the
            token. (innerHTML case) */
            if(!$this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
                // Ignore.

            /* Otherwise: */
            } else {
                /* Clear the stack back to a table body context. */
                $this->clearStackToTableContext($clear);

                /* Act as if an end tag with the same tag name as the current
                node ("tbody", "tfoot", or "thead") had been seen, then
                reprocess the current token. */
                $this->inTableBody(array(
                    'name' => end($this->stack)->nodeName,
                    'type' => HTML5::ENDTAG
                ));

                return $this->mainPhase($token);
            }

        /* An end tag whose tag name is one of: "body", "caption", "col",
        "colgroup", "html", "td", "th", "tr" */
        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'))) {
            /* Parse error. Ignore the token. */

        /* Anything else */
        } else {
            /* Process the token as if the insertion mode was "in table". */
            $this->inTable($token);
        }
    }

    /**
     * Handles a token while the insertion mode is "in row".
     *
     * @param array $token Token array; 'type' and 'name' are read.
     */
    private function inRow($token)
    {
        // Elements at which "clear the stack back to a table row context" stops.
        $clear = array('tr', 'html');

        /* A start tag whose tag name is one of: "th", "td" */
        if($token['type'] === HTML5::STARTTAG &&
        ($token['name'] === 'th' || $token['name'] === 'td')) {
            /* Clear the stack back to a table row context. */
            $this->clearStackToTableContext($clear);

            /* Insert an HTML element for the token, then switch the insertion
            mode to "in cell". */
            $this->insertElement($token);
            $this->mode = self::IN_CELL;

            /* Insert a marker at the end of the list of active formatting
            elements. */
            $this->a_formatting[] = self::MARKER;

        /* An end tag whose tag name is "tr" */
        } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') {
            /* If the stack of open elements does not have an element in table
            scope with the same tag name as the token, this is a parse error.
            Ignore the token. (innerHTML case) */
            if(!$this->elementInScope($token['name'], true)) {
                // Ignore.

            /* Otherwise: */
            } else {
                /* Clear the stack back to a table row context. */
                $this->clearStackToTableContext($clear);

                /* Pop the current node (which will be a tr element) from the
                stack of open elements. Switch the insertion mode to "in table
                body". */
                array_pop($this->stack);
                $this->mode = self::IN_TBODY;
            }

        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
        "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
        } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
        array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'))) ||
        ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')) {
            /* Act as if an end tag with the tag name "tr" had been seen, then,
            if that token wasn't ignored, reprocess the current token. */

            // The implied </tr> is ignored exactly when no tr is in table scope.
            $row_open = $this->elementInScope('tr', true);

            $this->inRow(array(
                'name' => 'tr',
                'type' => HTML5::ENDTAG
            ));

            // The row is closed, so the token belongs to "in table body".
            // Reprocessing it through inCell() would discard it.
            if($row_open) {
                return $this->inTableBody($token);
            }

        /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
        } elseif($token['type'] === HTML5::ENDTAG &&
        in_array($token['name'], array('tbody', 'tfoot', 'thead'))) {
            /* If the stack of open elements does not have an element in table
            scope with the same tag name as the token, this is a parse error.
            Ignore the token. */
            if(!$this->elementInScope($token['name'], true)) {
                // Ignore.

            /* Otherwise: */
            } else {
                /* Otherwise, act as if an end tag with the tag name "tr" had
                been seen, then reprocess the current token. */
                $this->inRow(array(
                    'name' => 'tr',
                    'type' => HTML5::ENDTAG
                ));

                // Reprocess in "in table body". Going through inCell() here
                // bounced back into inRow() and recursed without end.
                return $this->inTableBody($token);
            }

        /* An end tag whose tag name is one of: "body", "caption", "col",
        "colgroup", "html", "td", "th" */
        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th'))) {
            /* Parse error. Ignore the token. */

        /* Anything else */
        } else {
            /* Process the token as if the insertion mode was "in table". */
            $this->inTable($token);
        }
    }

    /**
     * Handles a token while the insertion mode is "in cell".
     *
     * @param array $token Token array; 'type' and 'name' are read.
     */
    private function inCell($token)
    {
        /* An end tag whose tag name is one of: "td", "th" */
        if($token['type'] === HTML5::ENDTAG &&
        ($token['name'] === 'td' || $token['name'] === 'th')) {
            /* If the stack of open elements does not have an element in table
            scope with the same tag name as that of the token, then this is a
            parse error and the token must be ignored. */
            if(!$this->elementInScope($token['name'], true)) {
                // Ignore.

            /* Otherwise: */
            } else {
                /* Generate implied end tags, except for elements with the same
                tag name as the token. */
                $this->generateImpliedEndTags(array($token['name']));

                /* Now, if the current node is not an element with the same tag
                name as the token, then this is a parse error. */
                // Parse errors are not reported; nothing to do.

                /* Pop elements from this stack until an element with the same
                tag name as the token has been popped from the stack. */
                while(true) {
                    $node = end($this->stack)->nodeName;
                    array_pop($this->stack);

                    if($node === $token['name']) {
                        break;
                    }
                }

                /* Clear the list of active formatting elements up to the last
                marker. */
                $this->clearTheActiveFormattingElementsUpToTheLastMarker();

                /* Switch the insertion mode to "in row". (The current node
                will be a tr element at this point.) */
                $this->mode = self::IN_ROW;
            }

        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
        "tbody", "td", "tfoot", "th", "thead", "tr" */
        } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
        array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
        'thead', 'tr'))) {
            /* If the stack of open elements does not have a td or th element
            in table scope, then this is a parse error; ignore the token.
            (innerHTML case) */
            if(!$this->elementInScope(array('td', 'th'), true)) {
                // Ignore.

            /* Otherwise, close the cell (see below) and reprocess the current
            token. */
            } else {
                $this->closeCell();
                return $this->inRow($token);
            }

        /* An end tag whose tag name is one of: "body", "caption", "col",
        "colgroup", "html" */
        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html'))) {
            /* Parse error. Ignore the token. */

        /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
        "thead", "tr" */
        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
        array('table', 'tbody', 'tfoot', 'thead', 'tr'))) {
            /* If the stack of open elements does not have an element in table
            scope with the same tag name as that of the token (which can only
            happen for "tbody", "tfoot" and "thead", or, in the innerHTML case),
            then this is a parse error and the token must be ignored. */
            if(!$this->elementInScope($token['name'], true)) {
                // Ignore.

            /* Otherwise, close the cell (see below) and reprocess the current
            token. */
            } else {
                $this->closeCell();
                return $this->inRow($token);
            }

        /* Anything else */
        } else {
            /* Process the token as if the insertion mode was "in body". */
            $this->inBody($token);
        }
    }

    /**
     * Handles a token while the insertion mode is "in select".
     *
     * The token type is always tested before 'name' is read, so tokens that
     * carry no 'name' key never trigger an undefined-index access.
     *
     * @param array $token Token array; 'type' is always read, 'data' for
     *                     character/comment tokens, 'name' for tags.
     */
    private function inSelect($token)
    {
        /* Handle the token as follows: */

        /* A character token */
        if($token['type'] === HTML5::CHARACTR) {
            /* Append the token's character to the current node. */
            $this->insertText($token['data']);

        /* A comment token */
        } elseif($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the current node with the data
            attribute set to the data given in the comment token. */
            $this->insertComment($token['data']);

        /* A start tag token whose tag name is "option" */
        } elseif($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'option') {
            /* If the current node is an option element, act as if an end tag
            with the tag name "option" had been seen. */
            if(end($this->stack)->nodeName === 'option') {
                $this->inSelect(array(
                    'name' => 'option',
                    'type' => HTML5::ENDTAG
                ));
            }

            /* Insert an HTML element for the token. */
            $this->insertElement($token);

        /* A start tag token whose tag name is "optgroup" */
        } elseif($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'optgroup') {
            /* If the current node is an option element, act as if an end tag
            with the tag name "option" had been seen. */
            if(end($this->stack)->nodeName === 'option') {
                $this->inSelect(array(
                    'name' => 'option',
                    'type' => HTML5::ENDTAG
                ));
            }

            /* If the current node is an optgroup element, act as if an end tag
            with the tag name "optgroup" had been seen. */
            if(end($this->stack)->nodeName === 'optgroup') {
                $this->inSelect(array(
                    'name' => 'optgroup',
                    'type' => HTML5::ENDTAG
                ));
            }

            /* Insert an HTML element for the token. */
            $this->insertElement($token);

        /* An end tag token whose tag name is "optgroup" */
        } elseif($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'optgroup') {
            /* First, if the current node is an option element, and the node
            immediately before it in the stack of open elements is an optgroup
            element, then act as if an end tag with the tag name "option" had
            been seen. */
            $elements_in_stack = count($this->stack);

            if($elements_in_stack >= 2 &&
            $this->stack[$elements_in_stack - 1]->nodeName === 'option' &&
            $this->stack[$elements_in_stack - 2]->nodeName === 'optgroup') {
                $this->inSelect(array(
                    'name' => 'option',
                    'type' => HTML5::ENDTAG
                ));
            }

            /* If the current node is an optgroup element, then pop that node
            from the stack of open elements. Otherwise, this is a parse error,
            ignore the token. */
            // Re-read the current node: the stack may just have been popped,
            // and it is the node's name (not the node) that must be compared.
            if(end($this->stack)->nodeName === 'optgroup') {
                array_pop($this->stack);
            }

        /* An end tag token whose tag name is "option" */
        } elseif($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'option') {
            /* If the current node is an option element, then pop that node
            from the stack of open elements. Otherwise, this is a parse error,
            ignore the token. */
            if(end($this->stack)->nodeName === 'option') {
                array_pop($this->stack);
            }

        /* An end tag whose tag name is "select" */
        } elseif($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'select') {
            /* If the stack of open elements does not have an element in table
            scope with the same tag name as the token, this is a parse error.
            Ignore the token. (innerHTML case) */
            if(!$this->elementInScope($token['name'], true)) {
                // Ignore.

            /* Otherwise: */
            } else {
                /* Pop elements from the stack of open elements until a select
                element has been popped from the stack. */
                while(true) {
                    $current = end($this->stack)->nodeName;
                    array_pop($this->stack);

                    if($current === 'select') {
                        break;
                    }
                }

                /* Reset the insertion mode appropriately. */
                $this->resetInsertionMode();
            }

        /* A start tag whose tag name is "select" */
        } elseif($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'select') {
            /* Parse error. Act as if the token had been an end tag with the
            tag name "select" instead. */
            $this->inSelect(array(
                'name' => 'select',
                'type' => HTML5::ENDTAG
            ));

        /* An end tag whose tag name is one of: "caption", "table", "tbody",
        "tfoot", "thead", "tr", "td", "th" */
        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
        array('caption', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th'))) {
            /* Parse error. */

            /* If the stack of open elements has an element in table scope with
            the same tag name as that of the token, then act as if an end tag
            with the tag name "select" had been seen, and reprocess the token.
            Otherwise, ignore the token. */
            if($this->elementInScope($token['name'], true)) {
                $this->inSelect(array(
                    'name' => 'select',
                    'type' => HTML5::ENDTAG
                ));

                $this->mainPhase($token);
            }

        /* Anything else */
        } else {
            /* Parse error. Ignore the token. */
        }
    }

    /**
     * Handles a token while the insertion mode is "after body".
     *
     * @param array $token Token array; 'type' is always read, 'data' for
     *                     character/comment tokens, 'name' for end tags.
     */
    private function afterBody($token)
    {
        /* Handle the token as follows: */

        /* A character token that is one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE */
        if($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            /* Process the token as it would be processed if the insertion mode
            was "in body". */
            $this->inBody($token);

        /* A comment token */
        } elseif($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the first element in the stack of open
            elements (the html element), with the data attribute set to the
            data given in the comment token. */
            $comment = $this->dom->createComment($token['data']);
            $this->stack[0]->appendChild($comment);

        /* An end tag with the tag name "html" */
        } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') {
            /* If the parser was originally created in order to handle the
            setting of an element's innerHTML attribute, this is a parse error;
            ignore the token. (The element will be an html element in this
            case.) (innerHTML case) */

            /* Otherwise, switch to the trailing end phase. */
            $this->phase = self::END_PHASE;

        /* Anything else */
        } else {
            /* Parse error. Set the insertion mode to "in body" and reprocess
            the token. */
            $this->mode = self::IN_BODY;
            return $this->inBody($token);
        }
    }

    /**
     * Handles a token while the insertion mode is "in frameset".
     *
     * @param array $token Token array; 'type' is always read, 'data' for
     *                     character/comment tokens, 'name' for tags.
     */
    private function inFrameset($token)
    {
        /* Handle the token as follows: */

        /* A character token that is one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
        if($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            /* Append the character to the current node. */
            $this->insertText($token['data']);

        /* A comment token */
        } elseif($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the current node with the data
            attribute set to the data given in the comment token. */
            $this->insertComment($token['data']);

        /* A start tag with the tag name "frameset" */
        } elseif($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'frameset') {
            $this->insertElement($token);

        /* An end tag with the tag name "frameset" */
        } elseif($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'frameset') {
            /* If the current node is the root html element, then this is a
            parse error; ignore the token. (innerHTML case) */
            if(end($this->stack)->nodeName === 'html') {
                // Ignore

            } else {
                /* Otherwise, pop the current node from the stack of open
                elements. */
                array_pop($this->stack);

                /* If the parser was not originally created in order to handle
                the setting of an element's innerHTML attribute (innerHTML case),
                and the current node is no longer a frameset element, then change
                the insertion mode to "after frameset". */
                // Closing a nested frameset must stay in "in frameset".
                if(end($this->stack)->nodeName !== 'frameset') {
                    $this->mode = self::AFTR_FRAME;
                }
            }

        /* A start tag with the tag name "frame" */
        } elseif($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'frame') {
            /* Insert an HTML element for the token. */
            $this->insertElement($token);

            /* Immediately pop the current node off the stack of open elements. */
            array_pop($this->stack);

        /* A start tag with the tag name "noframes" */
        } elseif($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'noframes') {
            /* Process the token as if the insertion mode had been "in body". */
            $this->inBody($token);

        /* Anything else */
        } else {
            /* Parse error. Ignore the token. */
        }
    }

    /**
     * Handles a token while the insertion mode is "after frameset".
     *
     * @param array $token Token array; 'type' is always read, 'data' for
     *                     character/comment tokens, 'name' for tags.
     */
    private function afterFrameset($token)
    {
        /* Handle the token as follows: */

        /* A character token that is one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
        if($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            /* Append the character to the current node. */
            $this->insertText($token['data']);

        /* A comment token */
        } elseif($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the current node with the data
            attribute set to the data given in the comment token. */
            $this->insertComment($token['data']);

        /* An end tag with the tag name "html" */
        } elseif($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'html') {
            /* Switch to the trailing end phase. */
            $this->phase = self::END_PHASE;

        /* A start tag with the tag name "noframes" */
        } elseif($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'noframes') {
            /* Process the token as if the insertion mode had been "in body". */
            $this->inBody($token);

        /* Anything else */
        } else {
            /* Parse error. Ignore the token. */
        }
    }

    /**
     * Handles a token during the trailing end phase.
     *
     * @param array $token Token array; 'type' is always read and 'data' is
     *                     read for character/comment tokens.
     */
    private function trailingEndPhase($token)
    {
        /* After the main phase, as each token is emitted from the tokenisation
        stage, it must be processed as described in this section. */

        /* A DOCTYPE token */
        if($token['type'] === HTML5::DOCTYPE) {
            // Parse error. Ignore the token.

        /* A comment token */
        } elseif($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the Document object with the data
            attribute set to the data given in the comment token. */
            $comment = $this->dom->createComment($token['data']);
            $this->dom->appendChild($comment);

        /* A character token that is one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE */
        } elseif($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            /* Process the token as it would be processed in the main phase. */
            $this->mainPhase($token);

        /* A character token that is not one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE. Or a start tag token. Or an end tag token. */
        } elseif(($token['type'] === HTML5::CHARACTR &&
        !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
        $token['type'] === HTML5::STARTTAG || $token['type'] === HTML5::ENDTAG) {
            /* Parse error. Switch back to the main phase and reprocess the
            token. */
            $this->phase = self::MAIN_PHASE;
            return $this->mainPhase($token);

        /* An end-of-file token */
        } elseif($token['type'] === HTML5::EOF) {
            /* Nothing left to do. */
        }
    }

    /**
     * Creates an element for a start tag token, attaches it to the tree and
     * pushes it onto the stack of open elements.
     *
     * @param array $token  Start tag token; 'name' is required. 'attr', when
     *                      present, is a list of arrays with 'name' and
     *                      'value' keys. The first occurrence of a name wins.
     * @param bool  $append Unused; kept so existing callers keep working.
     * @return DOMElement The newly created element.
     */
    private function insertElement($token, $append = true)
    {
        $el = $this->dom->createElement($token['name']);

        // Tokens synthesised inside the parser may carry no attribute list.
        if(!empty($token['attr'])) {
            foreach($token['attr'] as $attr) {
                if(!$el->hasAttribute($attr['name'])) {
                    $el->setAttribute($attr['name'], $attr['value']);
                }
            }
        }

        $this->appendToRealParent($el);
        $this->stack[] = $el;

        return $el;
    }

    /**
     * Creates a text node and attaches it via appendToRealParent().
     *
     * @param string $data Text content of the new node.
     */
    private function insertText($data)
    {
        $text = $this->dom->createTextNode($data);
        $this->appendToRealParent($text);
    }

    /**
     * Creates a comment node and attaches it via appendToRealParent().
     *
     * @param string $data Comment content of the new node.
     */
    private function insertComment($data)
    {
        $comment = $this->dom->createComment($data);
        $this->appendToRealParent($comment);
    }

    /**
     * Attaches a node to the current node, or to the pending foster parent
     * when one is set. The foster parent is reset to null after being used.
     *
     * @param DOMNode $node Node to attach.
     */
    private function appendToRealParent($node)
    {
        if($this->foster_parent === null) {
            end($this->stack)->appendChild($node);

        } else {
            /* If the foster parent element is the parent element of the
            last table element in the stack of open elements, then the new
            node must be inserted immediately before the last table element
            in the stack of open elements in the foster parent element;
            otherwise, the new node must be appended to the foster parent
            element. */
            for($n = count($this->stack) - 1; $n >= 0; $n--) {
                if($this->stack[$n]->nodeName === 'table' &&
                $this->stack[$n]->parentNode !== null) {
                    $table = $this->stack[$n];
                    break;
                }
            }

            if(isset($table) && $this->foster_parent->isSameNode($table->parentNode)) {
                $this->foster_parent->insertBefore($node, $table);
            } else {
                $this->foster_parent->appendChild($node);
            }

            $this->foster_parent = null;
        }
    }

    /**
     * Tests whether the stack of open elements has an element in scope.
     *
     * @param string|array $el    Tag name to look for, or a list of tag names
     *                            (true if any one of them is in scope).
     * @param bool         $table True for the "in table scope" variant, false
     *                            for the plain "in scope" variant.
     * @return bool True when a matching element is in scope.
     */
    private function elementInScope($el, $table = false)
    {
        if(is_array($el)) {
            foreach($el as $element) {
                if($this->elementInScope($element, $table)) {
                    return true;
                }
            }

            return false;
        }

        /* 1. Initialise node to be the current node (the bottommost node of
        the stack). */
        for($n = count($this->stack) - 1; $n >= 0; $n--) {
            $node = $this->stack[$n];

            if($node->tagName === $el) {
                /* 2. If node is the target node, terminate in a match state. */
                return true;

            } elseif($node->tagName === 'table') {
                /* 3. Otherwise, if node is a table element, terminate in a failure
                state. */
                return false;

            } elseif($table === false && in_array($node->tagName, array('caption', 'td',
            'th', 'button', 'marquee', 'object'))) {
                /* 4. Otherwise, if the algorithm is the "has an element in scope"
                variant (rather than the "has an element in table scope" variant),
                and node is one of the following, terminate in a failure state. */
                return false;

            } elseif($node === $node->ownerDocument->documentElement) {
                /* 5. Otherwise, if node is an html element (root element), terminate
                in a failure state. (This can only happen if the node is the topmost
                node of the stack of open elements, and prevents the next step from
                being invoked if there are no more elements in the stack.) */
                return false;
            }

            /* Otherwise, set node to the previous entry in the stack of open
            elements and return to step 2. */
        }

        // Empty or exhausted stack: nothing is in scope.
        return false;
    }

    /**
     * Reconstructs the active formatting elements: every entry after the last
     * marker / last still-open entry is cloned, appended to the current node,
     * pushed onto the stack of open elements and swapped into the list.
     *
     * @return false|null False when there is nothing to reconstruct.
     */
    private function reconstructActiveFormattingElements()
    {
        /* 1. If there are no entries in the list of active formatting elements,
        then there is nothing to reconstruct; stop this algorithm. */
        $formatting_elements = count($this->a_formatting);

        if($formatting_elements === 0) {
            return false;
        }

        /* 3. Let entry be the last (most recently added) element in the list
        of active formatting elements. */
        $entry = end($this->a_formatting);

        /* 2. If the last (most recently added) entry in the list of active
        formatting elements is a marker, or if it is an element that is in the
        stack of open elements, then there is nothing to reconstruct; stop this
        algorithm. */
        if($entry === self::MARKER || in_array($entry, $this->stack, true)) {
            return false;
        }

        // Whether step 7 (advance to the next entry) must run before cloning.
        // It is skipped only when the rewind below reaches the start of the
        // list, where the first entry itself has to be cloned.
        $step_seven = true;
        $a = $formatting_elements - 1;

        while(true) {
            /* 4. If there are no entries before entry in the list of active
            formatting elements, then jump to step 8. */
            if($a === 0) {
                $step_seven = false;
                break;
            }

            /* 5. Let entry be the entry one earlier than entry in the list of
            active formatting elements. */
            $a--;
            $entry = $this->a_formatting[$a];

            /* 6. If entry is neither a marker nor an element that is also in
            the stack of open elements, go to step 4. */
            if($entry === self::MARKER || in_array($entry, $this->stack, true)) {
                break;
            }
        }

        while(true) {
            /* 7. Let entry be the element one later than entry in the list of
            active formatting elements. */
            if($step_seven) {
                $a++;
                $entry = $this->a_formatting[$a];
            }

            /* 8. Perform a shallow clone of the element entry to obtain clone. */
            $clone = $entry->cloneNode();

            /* 9. Append clone to the current node and push it onto the stack
            of open elements so that it is the new current node. */
            end($this->stack)->appendChild($clone);
            $this->stack[] = $clone;

            /* 10. Replace the entry for entry in the list with an entry for
            clone. */
            $this->a_formatting[$a] = $clone;

            /* 11. If the entry for clone in the list of active formatting
            elements is not the last entry in the list, return to step 7. */
            if(end($this->a_formatting) !== $clone) {
                $step_seven = true;
            } else {
                break;
            }
        }
    }

    /**
     * Removes entries from the end of the list of active formatting elements
     * up to and including the last marker. If the list holds no marker it is
     * simply emptied.
     */
    private function clearTheActiveFormattingElementsUpToTheLastMarker()
    {
        /* When the steps below require the UA to clear the list of active
        formatting elements up to the last marker, the UA must perform the
        following steps: */

        while(!empty($this->a_formatting)) {
            /* 1. Let entry be the last (most recently added) entry in the list
            of active formatting elements. */
            /* 2. Remove entry from the list of active formatting elements. */
            $entry = array_pop($this->a_formatting);

            /* 3. If entry was a marker, then stop the algorithm at this point.
            The list has been cleared up to the last marker. */
            if($entry === self::MARKER) {
                break;
            }
        }
    }

    /**
     * Pops elements whose end tag is implied (dd, dt, li, p, td, th, tr) off
     * the stack of open elements while one of them is the current node.
     *
     * @param array $exclude Tag names that must not be closed implicitly.
     */
    private function generateImpliedEndTags(array $exclude = array())
    {
        /* When the steps below require the UA to generate implied end tags,
        then, if the current node is a dd element, a dt element, an li element,
        a p element, a td element, a th element, or a tr element, the UA must
        act as if an end tag with the respective tag name had been seen and
        then generate implied end tags again. */
        $elements = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude);

        while(!empty($this->stack) &&
        in_array(end($this->stack)->nodeName, $elements)) {
            array_pop($this->stack);
        }
    }

    /**
     * Classifies a tag name by looking it up in the special, scoping and
     * formatting lists, in that order.
     *
     * @param string $name Tag name.
     * @return mixed One of self::SPECIAL, self::SCOPING, self::FORMATTING or
     *               self::PHRASING.
     */
    private function getElementCategory($name)
    {
        if(in_array($name, $this->special)) {
            return self::SPECIAL;
        }

        if(in_array($name, $this->scoping)) {
            return self::SCOPING;
        }

        if(in_array($name, $this->formatting)) {
            return self::FORMATTING;
        }

        return self::PHRASING;
    }

    /**
     * Pops the stack of open elements until the current node is one of the
     * given elements. Stops if the stack runs empty.
     *
     * @param array $elements Tag names that end the clearing.
     */
    private function clearStackToTableContext($elements)
    {
        /* When the steps above require the UA to clear the stack back to a
        table context, it means that the UA must, while the current node is not
        a table element or an html element, pop elements from the stack of open
        elements. If this causes any elements to be popped from the stack, then
        this is a parse error. */
        while(!empty($this->stack)) {
            if(in_array(end($this->stack)->nodeName, $elements)) {
                break;
            }

            array_pop($this->stack);
        }
    }

    /**
     * Resets the insertion mode by walking the stack of open elements from
     * the current node towards the root and picking the mode that matches the
     * first recognised element.
     */
    private function resetInsertionMode()
    {
        /* 1. Let last be false. */
        $last = false;
        $leng = count($this->stack);

        for($n = $leng - 1; $n >= 0; $n--) {
            /* 2. Let node be the last node in the stack of open elements. */
            $node = $this->stack[$n];

            /* 3. If node is the first node in the stack of open elements, then
            set last to true. If the element whose innerHTML attribute is being
            set is neither a td element nor a th element, then set node to the
            element whose innerHTML attribute is being set. (innerHTML case) */
            if($this->stack[0]->isSameNode($node)) {
                $last = true;
            }

            /* 4. If node is a select element, then switch the insertion mode to
            "in select" and abort these steps. (innerHTML case) */
            if($node->nodeName === 'select') {
                $this->mode = self::IN_SELECT;
                break;

            /* 5. If node is a td or th element, then switch the insertion mode
            to "in cell" and abort these steps. */
            } elseif($node->nodeName === 'td' || $node->nodeName === 'th') {
                $this->mode = self::IN_CELL;
                break;

            /* 6. If node is a tr element, then switch the insertion mode to
            "in row" and abort these steps. */
            } elseif($node->nodeName === 'tr') {
                $this->mode = self::IN_ROW;
                break;

            /* 7. If node is a tbody, thead, or tfoot element, then switch the
            insertion mode to "in table body" and abort these steps. */
            } elseif(in_array($node->nodeName, array('tbody', 'thead', 'tfoot'))) {
                $this->mode = self::IN_TBODY;
                break;

            /* 8. If node is a caption element, then switch the insertion mode
            to "in caption" and abort these steps. */
            } elseif($node->nodeName === 'caption') {
                $this->mode = self::IN_CAPTION;
                break;

            /* 9. If node is a colgroup element, then switch the insertion mode
            to "in column group" and abort these steps. (innerHTML case) */
            } elseif($node->nodeName === 'colgroup') {
                $this->mode = self::IN_CGROUP;
                break;

            /* 10. If node is a table element, then switch the insertion mode
            to "in table" and abort these steps. */
            } elseif($node->nodeName === 'table') {
                $this->mode = self::IN_TABLE;
                break;

            /* 11. If node is a head element, then switch the insertion mode
            to "in body" ("in body"! not "in head"!) and abort these steps.
            (innerHTML case) */
            } elseif($node->nodeName === 'head') {
                $this->mode = self::IN_BODY;
                break;

            /* 12. If node is a body element, then switch the insertion mode to
            "in body" and abort these steps. */
            } elseif($node->nodeName === 'body') {
                $this->mode = self::IN_BODY;
                break;

            /* 13. If node is a frameset element, then switch the insertion
            mode to "in frameset" and abort these steps. (innerHTML case) */
            } elseif($node->nodeName === 'frameset') {
                $this->mode = self::IN_FRAME;
                break;

            /* 14. If node is an html element, then: if the head element
            pointer is null, switch the insertion mode to "before head",
            otherwise, switch the insertion mode to "after head". In either
            case, abort these steps. (innerHTML case) */
            } elseif($node->nodeName === 'html') {
                $this->mode = ($this->head_pointer === null)
                    ? self::BEFOR_HEAD
                    : self::AFTER_HEAD;

                break;

            /* 15. If last is true, then set the insertion mode to "in body"
            and abort these steps. (innerHTML case) */
            } elseif($last) {
                $this->mode = self::IN_BODY;
                break;
            }
        }
    }

    /**
     * Closes the open table cell by feeding inCell() an end tag for whichever
     * of td / th is found in table scope first.
     */
    private function closeCell()
    {
        /* If the stack of open elements has a td or th element in table scope,
        then act as if an end tag token with that tag name had been seen. */
        foreach(array('td', 'th') as $cell) {
            if($this->elementInScope($cell, true)) {
                $this->inCell(array(
                    'name' => $cell,
                    'type' => HTML5::ENDTAG
                ));

                break;
            }
        }
    }

    /**
     * Returns the document object the tree has been built into.
     *
     * @return mixed The value of $this->dom.
     */
    public function save()
    {
        return $this->dom;
    }
}