1<?php
2class HTML5
3{
    // Input HTML as stored by the constructor (after newline handling).
    private $data;
    // Zero-based index into $data of the character being processed; the
    // constructor starts it at -1 so the first increment lands on index 0.
    private $char;
    // Length of $data; a position equal to this value means end of input.
    private $EOF;
    // Name of the current tokenizer state. The constructor calls the method
    // named "<state>State" in a loop until this becomes null.
    private $state;
    // HTML5TreeConstructer instance created in the constructor.
    private $tree;
    // Token (tag or comment) currently being assembled before it is emitted.
    private $token;
    // Content model flag: one of PCDATA, RCDATA, CDATA or PLAINTEXT.
    private $content_model;
    // Escape flag consulted and toggled by the data state in RCDATA/CDATA.
    private $escape = false;
    // Table of named character reference names. Names that are listed both
    // with and without the trailing ';' appear in both forms.
    // NOTE(review): presumably consulted by entity(), which is not visible
    // in this part of the file - confirm.
    private $entities = array('AElig;','AElig','AMP;','AMP','Aacute;','Aacute',
    'Acirc;','Acirc','Agrave;','Agrave','Alpha;','Aring;','Aring','Atilde;',
    'Atilde','Auml;','Auml','Beta;','COPY;','COPY','Ccedil;','Ccedil','Chi;',
    'Dagger;','Delta;','ETH;','ETH','Eacute;','Eacute','Ecirc;','Ecirc','Egrave;',
    'Egrave','Epsilon;','Eta;','Euml;','Euml','GT;','GT','Gamma;','Iacute;',
    'Iacute','Icirc;','Icirc','Igrave;','Igrave','Iota;','Iuml;','Iuml','Kappa;',
    'LT;','LT','Lambda;','Mu;','Ntilde;','Ntilde','Nu;','OElig;','Oacute;',
    'Oacute','Ocirc;','Ocirc','Ograve;','Ograve','Omega;','Omicron;','Oslash;',
    'Oslash','Otilde;','Otilde','Ouml;','Ouml','Phi;','Pi;','Prime;','Psi;',
    'QUOT;','QUOT','REG;','REG','Rho;','Scaron;','Sigma;','THORN;','THORN',
    'TRADE;','Tau;','Theta;','Uacute;','Uacute','Ucirc;','Ucirc','Ugrave;',
    'Ugrave','Upsilon;','Uuml;','Uuml','Xi;','Yacute;','Yacute','Yuml;','Zeta;',
    'aacute;','aacute','acirc;','acirc','acute;','acute','aelig;','aelig',
    'agrave;','agrave','alefsym;','alpha;','amp;','amp','and;','ang;','apos;',
    'aring;','aring','asymp;','atilde;','atilde','auml;','auml','bdquo;','beta;',
    'brvbar;','brvbar','bull;','cap;','ccedil;','ccedil','cedil;','cedil',
    'cent;','cent','chi;','circ;','clubs;','cong;','copy;','copy','crarr;',
    'cup;','curren;','curren','dArr;','dagger;','darr;','deg;','deg','delta;',
    'diams;','divide;','divide','eacute;','eacute','ecirc;','ecirc','egrave;',
    'egrave','empty;','emsp;','ensp;','epsilon;','equiv;','eta;','eth;','eth',
    'euml;','euml','euro;','exist;','fnof;','forall;','frac12;','frac12',
    'frac14;','frac14','frac34;','frac34','frasl;','gamma;','ge;','gt;','gt',
    'hArr;','harr;','hearts;','hellip;','iacute;','iacute','icirc;','icirc',
    'iexcl;','iexcl','igrave;','igrave','image;','infin;','int;','iota;',
    'iquest;','iquest','isin;','iuml;','iuml','kappa;','lArr;','lambda;','lang;',
    'laquo;','laquo','larr;','lceil;','ldquo;','le;','lfloor;','lowast;','loz;',
    'lrm;','lsaquo;','lsquo;','lt;','lt','macr;','macr','mdash;','micro;','micro',
    'middot;','middot','minus;','mu;','nabla;','nbsp;','nbsp','ndash;','ne;',
    'ni;','not;','not','notin;','nsub;','ntilde;','ntilde','nu;','oacute;',
    'oacute','ocirc;','ocirc','oelig;','ograve;','ograve','oline;','omega;',
    'omicron;','oplus;','or;','ordf;','ordf','ordm;','ordm','oslash;','oslash',
    'otilde;','otilde','otimes;','ouml;','ouml','para;','para','part;','permil;',
    'perp;','phi;','pi;','piv;','plusmn;','plusmn','pound;','pound','prime;',
    'prod;','prop;','psi;','quot;','quot','rArr;','radic;','rang;','raquo;',
    'raquo','rarr;','rceil;','rdquo;','real;','reg;','reg','rfloor;','rho;',
    'rlm;','rsaquo;','rsquo;','sbquo;','scaron;','sdot;','sect;','sect','shy;',
    'shy','sigma;','sigmaf;','sim;','spades;','sub;','sube;','sum;','sup1;',
    'sup1','sup2;','sup2','sup3;','sup3','sup;','supe;','szlig;','szlig','tau;',
    'there4;','theta;','thetasym;','thinsp;','thorn;','thorn','tilde;','times;',
    'times','trade;','uArr;','uacute;','uacute','uarr;','ucirc;','ucirc',
    'ugrave;','ugrave','uml;','uml','upsih;','upsilon;','uuml;','uuml','weierp;',
    'xi;','yacute;','yacute','yen;','yen','yuml;','yuml','zeta;','zwj;','zwnj;');
54
    // Content model flag values stored in $content_model.
    const PCDATA    = 0;
    const RCDATA    = 1;
    const CDATA     = 2;
    const PLAINTEXT = 3;

    // Token type identifiers placed in the 'type' key of emitted tokens.
    const DOCTYPE  = 0;
    const STARTTAG = 1;
    const ENDTAG   = 2;
    const COMMENT  = 3;
    const CHARACTR = 4;
    const EOF      = 5;
66
67    public function __construct($data)
68    {
69        $data = str_replace("\r\n", "\n", $data);
70        $date = str_replace("\r", null, $data);
71
72        $this->data = $data;
73        $this->char = -1;
74        $this->EOF  = strlen($data);
75        $this->tree = new HTML5TreeConstructer;
76        $this->content_model = self::PCDATA;
77
78        $this->state = 'data';
79
80        while($this->state !== null) {
81            $this->{$this->state.'State'}();
82        }
83    }
84
85    public function save()
86    {
87        return $this->tree->save();
88    }
89
90    private function char()
91    {
92        return ($this->char < $this->EOF)
93            ? $this->data[$this->char]
94            : false;
95    }
96
97    private function character($s, $l = 0)
98    {
99        if($s + $l < $this->EOF) {
100            if($l === 0) {
101                return $this->data[$s];
102            } else {
103                return substr($this->data, $s, $l);
104            }
105        }
106    }
107
108    private function characters($char_class, $start)
109    {
110        return preg_replace('#^(['.$char_class.']+).*#s', '\\1', substr($this->data, $start));
111    }
112
113    private function dataState()
114    {
115        // Consume the next input character
116        $this->char++;
117        $char = $this->char();
118
119        if($char === '&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) {
120            /* U+0026 AMPERSAND (&)
121            When the content model flag is set to one of the PCDATA or RCDATA
122            states: switch to the entity data state. Otherwise: treat it as per
123            the "anything else"    entry below. */
124            $this->state = 'entityData';
125
126        } elseif($char === '-') {
127            /* If the content model flag is set to either the RCDATA state or
128            the CDATA state, and the escape flag is false, and there are at
129            least three characters before this one in the input stream, and the
130            last four characters in the input stream, including this one, are
131            U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
132            and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
133            if(($this->content_model === self::RCDATA || $this->content_model ===
134            self::CDATA) && $this->escape === false &&
135            $this->char >= 3 && $this->character($this->char - 4, 4) === '<!--') {
136                $this->escape = true;
137            }
138
139            /* In any case, emit the input character as a character token. Stay
140            in the data state. */
141            $this->emitToken(array(
142                'type' => self::CHARACTR,
143                'data' => $char
144            ));
145
146        /* U+003C LESS-THAN SIGN (<) */
147        } elseif($char === '<' && ($this->content_model === self::PCDATA ||
148        (($this->content_model === self::RCDATA ||
149        $this->content_model === self::CDATA) && $this->escape === false))) {
150            /* When the content model flag is set to the PCDATA state: switch
151            to the tag open state.
152
153            When the content model flag is set to either the RCDATA state or
154            the CDATA state and the escape flag is false: switch to the tag
155            open state.
156
157            Otherwise: treat it as per the "anything else" entry below. */
158            $this->state = 'tagOpen';
159
160        /* U+003E GREATER-THAN SIGN (>) */
161        } elseif($char === '>') {
162            /* If the content model flag is set to either the RCDATA state or
163            the CDATA state, and the escape flag is true, and the last three
164            characters in the input stream including this one are U+002D
165            HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
166            set the escape flag to false. */
167            if(($this->content_model === self::RCDATA ||
168            $this->content_model === self::CDATA) && $this->escape === true &&
169            $this->character($this->char, 3) === '-->') {
170                $this->escape = false;
171            }
172
173            /* In any case, emit the input character as a character token.
174            Stay in the data state. */
175            $this->emitToken(array(
176                'type' => self::CHARACTR,
177                'data' => $char
178            ));
179
180        } elseif($this->char === $this->EOF) {
181            /* EOF
182            Emit an end-of-file token. */
183            $this->EOF();
184
185        } elseif($this->content_model === self::PLAINTEXT) {
186            /* When the content model flag is set to the PLAINTEXT state
187            THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of
188            the text and emit it as a character token. */
189            $this->emitToken(array(
190                'type' => self::CHARACTR,
191                'data' => substr($this->data, $this->char)
192            ));
193
194            $this->EOF();
195
196        } else {
197            /* Anything else
198            THIS DIFFERS GREATLY FROM THE SPEC: Get as many character that
199            otherwise would also be treated as a character token and emit it
200            as a single character token. Stay in the data state. */
201            $len  = strcspn($this->data, '<&', $this->char);
202            $char = substr($this->data, $this->char, $len);
203            $this->char += $len - 1;
204
205            $this->emitToken(array(
206                'type' => self::CHARACTR,
207                'data' => $char
208            ));
209
210            $this->state = 'data';
211        }
212    }
213
214    private function entityDataState()
215    {
216        // Attempt to consume an entity.
217        $entity = $this->entity();
218
219        // If nothing is returned, emit a U+0026 AMPERSAND character token.
220        // Otherwise, emit the character token that was returned.
221        $char = (!$entity) ? '&' : $entity;
222        $this->emitToken($char);
223
224        // Finally, switch to the data state.
225        $this->state = 'data';
226    }
227
228    private function tagOpenState()
229    {
230        switch($this->content_model) {
231            case self::RCDATA:
232            case self::CDATA:
233                /* If the next input character is a U+002F SOLIDUS (/) character,
234                consume it and switch to the close tag open state. If the next
235                input character is not a U+002F SOLIDUS (/) character, emit a
236                U+003C LESS-THAN SIGN character token and switch to the data
237                state to process the next input character. */
238                if($this->character($this->char + 1) === '/') {
239                    $this->char++;
240                    $this->state = 'closeTagOpen';
241
242                } else {
243                    $this->emitToken(array(
244                        'type' => self::CHARACTR,
245                        'data' => '<'
246                    ));
247
248                    $this->state = 'data';
249                }
250            break;
251
252            case self::PCDATA:
253                // If the content model flag is set to the PCDATA state
254                // Consume the next input character:
255                $this->char++;
256                $char = $this->char();
257
258                if($char === '!') {
259                    /* U+0021 EXCLAMATION MARK (!)
260                    Switch to the markup declaration open state. */
261                    $this->state = 'markupDeclarationOpen';
262
263                } elseif($char === '/') {
264                    /* U+002F SOLIDUS (/)
265                    Switch to the close tag open state. */
266                    $this->state = 'closeTagOpen';
267
268                } elseif(preg_match('/^[A-Za-z]$/', $char)) {
269                    /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
270                    Create a new start tag token, set its tag name to the lowercase
271                    version of the input character (add 0x0020 to the character's code
272                    point), then switch to the tag name state. (Don't emit the token
273                    yet; further details will be filled in before it is emitted.) */
274                    $this->token = array(
275                        'name'  => strtolower($char),
276                        'type'  => self::STARTTAG,
277                        'attr'  => array()
278                    );
279
280                    $this->state = 'tagName';
281
282                } elseif($char === '>') {
283                    /* U+003E GREATER-THAN SIGN (>)
284                    Parse error. Emit a U+003C LESS-THAN SIGN character token and a
285                    U+003E GREATER-THAN SIGN character token. Switch to the data state. */
286                    $this->emitToken(array(
287                        'type' => self::CHARACTR,
288                        'data' => '<>'
289                    ));
290
291                    $this->state = 'data';
292
293                } elseif($char === '?') {
294                    /* U+003F QUESTION MARK (?)
295                    Parse error. Switch to the bogus comment state. */
296                    $this->state = 'bogusComment';
297
298                } else {
299                    /* Anything else
300                    Parse error. Emit a U+003C LESS-THAN SIGN character token and
301                    reconsume the current input character in the data state. */
302                    $this->emitToken(array(
303                        'type' => self::CHARACTR,
304                        'data' => '<'
305                    ));
306
307                    $this->char--;
308                    $this->state = 'data';
309                }
310            break;
311        }
312    }
313
314    private function closeTagOpenState()
315    {
316        $next_node = strtolower($this->characters('A-Za-z', $this->char + 1));
317        $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName;
318
319        if(($this->content_model === self::RCDATA || $this->content_model === self::CDATA) &&
320        (!$the_same || ($the_same && (!preg_match('/[\t\n\x0b\x0c >\/]/',
321        $this->character($this->char + 1 + strlen($next_node))) || $this->EOF === $this->char)))) {
322            /* If the content model flag is set to the RCDATA or CDATA states then
323            examine the next few characters. If they do not match the tag name of
324            the last start tag token emitted (case insensitively), or if they do but
325            they are not immediately followed by one of the following characters:
326                * U+0009 CHARACTER TABULATION
327                * U+000A LINE FEED (LF)
328                * U+000B LINE TABULATION
329                * U+000C FORM FEED (FF)
330                * U+0020 SPACE
331                * U+003E GREATER-THAN SIGN (>)
332                * U+002F SOLIDUS (/)
333                * EOF
334            ...then there is a parse error. Emit a U+003C LESS-THAN SIGN character
335            token, a U+002F SOLIDUS character token, and switch to the data state
336            to process the next input character. */
337            $this->emitToken(array(
338                'type' => self::CHARACTR,
339                'data' => '</'
340            ));
341
342            $this->state = 'data';
343
344        } else {
345            /* Otherwise, if the content model flag is set to the PCDATA state,
346            or if the next few characters do match that tag name, consume the
347            next input character: */
348            $this->char++;
349            $char = $this->char();
350
351            if(preg_match('/^[A-Za-z]$/', $char)) {
352                /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
353                Create a new end tag token, set its tag name to the lowercase version
354                of the input character (add 0x0020 to the character's code point), then
355                switch to the tag name state. (Don't emit the token yet; further details
356                will be filled in before it is emitted.) */
357                $this->token = array(
358                    'name'  => strtolower($char),
359                    'type'  => self::ENDTAG
360                );
361
362                $this->state = 'tagName';
363
364            } elseif($char === '>') {
365                /* U+003E GREATER-THAN SIGN (>)
366                Parse error. Switch to the data state. */
367                $this->state = 'data';
368
369            } elseif($this->char === $this->EOF) {
370                /* EOF
371                Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
372                SOLIDUS character token. Reconsume the EOF character in the data state. */
373                $this->emitToken(array(
374                    'type' => self::CHARACTR,
375                    'data' => '</'
376                ));
377
378                $this->char--;
379                $this->state = 'data';
380
381            } else {
382                /* Parse error. Switch to the bogus comment state. */
383                $this->state = 'bogusComment';
384            }
385        }
386    }
387
388    private function tagNameState()
389    {
390        // Consume the next input character:
391        $this->char++;
392        $char = $this->character($this->char);
393
394        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
395            /* U+0009 CHARACTER TABULATION
396            U+000A LINE FEED (LF)
397            U+000B LINE TABULATION
398            U+000C FORM FEED (FF)
399            U+0020 SPACE
400            Switch to the before attribute name state. */
401            $this->state = 'beforeAttributeName';
402
403        } elseif($char === '>') {
404            /* U+003E GREATER-THAN SIGN (>)
405            Emit the current tag token. Switch to the data state. */
406            $this->emitToken($this->token);
407            $this->state = 'data';
408
409        } elseif($this->char === $this->EOF) {
410            /* EOF
411            Parse error. Emit the current tag token. Reconsume the EOF
412            character in the data state. */
413            $this->emitToken($this->token);
414
415            $this->char--;
416            $this->state = 'data';
417
418        } elseif($char === '/') {
419            /* U+002F SOLIDUS (/)
420            Parse error unless this is a permitted slash. Switch to the before
421            attribute name state. */
422            $this->state = 'beforeAttributeName';
423
424        } else {
425            /* Anything else
426            Append the current input character to the current tag token's tag name.
427            Stay in the tag name state. */
428            $this->token['name'] .= strtolower($char);
429            $this->state = 'tagName';
430        }
431    }
432
433    private function beforeAttributeNameState()
434    {
435        // Consume the next input character:
436        $this->char++;
437        $char = $this->character($this->char);
438
439        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
440            /* U+0009 CHARACTER TABULATION
441            U+000A LINE FEED (LF)
442            U+000B LINE TABULATION
443            U+000C FORM FEED (FF)
444            U+0020 SPACE
445            Stay in the before attribute name state. */
446            $this->state = 'beforeAttributeName';
447
448        } elseif($char === '>') {
449            /* U+003E GREATER-THAN SIGN (>)
450            Emit the current tag token. Switch to the data state. */
451            $this->emitToken($this->token);
452            $this->state = 'data';
453
454        } elseif($char === '/') {
455            /* U+002F SOLIDUS (/)
456            Parse error unless this is a permitted slash. Stay in the before
457            attribute name state. */
458            $this->state = 'beforeAttributeName';
459
460        } elseif($this->char === $this->EOF) {
461            /* EOF
462            Parse error. Emit the current tag token. Reconsume the EOF
463            character in the data state. */
464            $this->emitToken($this->token);
465
466            $this->char--;
467            $this->state = 'data';
468
469        } else {
470            /* Anything else
471            Start a new attribute in the current tag token. Set that attribute's
472            name to the current input character, and its value to the empty string.
473            Switch to the attribute name state. */
474            $this->token['attr'][] = array(
475                'name'  => strtolower($char),
476                'value' => null
477            );
478
479            $this->state = 'attributeName';
480        }
481    }
482
483    private function attributeNameState()
484    {
485        // Consume the next input character:
486        $this->char++;
487        $char = $this->character($this->char);
488
489        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
490            /* U+0009 CHARACTER TABULATION
491            U+000A LINE FEED (LF)
492            U+000B LINE TABULATION
493            U+000C FORM FEED (FF)
494            U+0020 SPACE
495            Stay in the before attribute name state. */
496            $this->state = 'afterAttributeName';
497
498        } elseif($char === '=') {
499            /* U+003D EQUALS SIGN (=)
500            Switch to the before attribute value state. */
501            $this->state = 'beforeAttributeValue';
502
503        } elseif($char === '>') {
504            /* U+003E GREATER-THAN SIGN (>)
505            Emit the current tag token. Switch to the data state. */
506            $this->emitToken($this->token);
507            $this->state = 'data';
508
509        } elseif($char === '/' && $this->character($this->char + 1) !== '>') {
510            /* U+002F SOLIDUS (/)
511            Parse error unless this is a permitted slash. Switch to the before
512            attribute name state. */
513            $this->state = 'beforeAttributeName';
514
515        } elseif($this->char === $this->EOF) {
516            /* EOF
517            Parse error. Emit the current tag token. Reconsume the EOF
518            character in the data state. */
519            $this->emitToken($this->token);
520
521            $this->char--;
522            $this->state = 'data';
523
524        } else {
525            /* Anything else
526            Append the current input character to the current attribute's name.
527            Stay in the attribute name state. */
528            $last = count($this->token['attr']) - 1;
529            $this->token['attr'][$last]['name'] .= strtolower($char);
530
531            $this->state = 'attributeName';
532        }
533    }
534
535    private function afterAttributeNameState()
536    {
537        // Consume the next input character:
538        $this->char++;
539        $char = $this->character($this->char);
540
541        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
542            /* U+0009 CHARACTER TABULATION
543            U+000A LINE FEED (LF)
544            U+000B LINE TABULATION
545            U+000C FORM FEED (FF)
546            U+0020 SPACE
547            Stay in the after attribute name state. */
548            $this->state = 'afterAttributeName';
549
550        } elseif($char === '=') {
551            /* U+003D EQUALS SIGN (=)
552            Switch to the before attribute value state. */
553            $this->state = 'beforeAttributeValue';
554
555        } elseif($char === '>') {
556            /* U+003E GREATER-THAN SIGN (>)
557            Emit the current tag token. Switch to the data state. */
558            $this->emitToken($this->token);
559            $this->state = 'data';
560
561        } elseif($char === '/' && $this->character($this->char + 1) !== '>') {
562            /* U+002F SOLIDUS (/)
563            Parse error unless this is a permitted slash. Switch to the
564            before attribute name state. */
565            $this->state = 'beforeAttributeName';
566
567        } elseif($this->char === $this->EOF) {
568            /* EOF
569            Parse error. Emit the current tag token. Reconsume the EOF
570            character in the data state. */
571            $this->emitToken($this->token);
572
573            $this->char--;
574            $this->state = 'data';
575
576        } else {
577            /* Anything else
578            Start a new attribute in the current tag token. Set that attribute's
579            name to the current input character, and its value to the empty string.
580            Switch to the attribute name state. */
581            $this->token['attr'][] = array(
582                'name'  => strtolower($char),
583                'value' => null
584            );
585
586            $this->state = 'attributeName';
587        }
588    }
589
590    private function beforeAttributeValueState()
591    {
592        // Consume the next input character:
593        $this->char++;
594        $char = $this->character($this->char);
595
596        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
597            /* U+0009 CHARACTER TABULATION
598            U+000A LINE FEED (LF)
599            U+000B LINE TABULATION
600            U+000C FORM FEED (FF)
601            U+0020 SPACE
602            Stay in the before attribute value state. */
603            $this->state = 'beforeAttributeValue';
604
605        } elseif($char === '"') {
606            /* U+0022 QUOTATION MARK (")
607            Switch to the attribute value (double-quoted) state. */
608            $this->state = 'attributeValueDoubleQuoted';
609
610        } elseif($char === '&') {
611            /* U+0026 AMPERSAND (&)
612            Switch to the attribute value (unquoted) state and reconsume
613            this input character. */
614            $this->char--;
615            $this->state = 'attributeValueUnquoted';
616
617        } elseif($char === '\'') {
618            /* U+0027 APOSTROPHE (')
619            Switch to the attribute value (single-quoted) state. */
620            $this->state = 'attributeValueSingleQuoted';
621
622        } elseif($char === '>') {
623            /* U+003E GREATER-THAN SIGN (>)
624            Emit the current tag token. Switch to the data state. */
625            $this->emitToken($this->token);
626            $this->state = 'data';
627
628        } else {
629            /* Anything else
630            Append the current input character to the current attribute's value.
631            Switch to the attribute value (unquoted) state. */
632            $last = count($this->token['attr']) - 1;
633            $this->token['attr'][$last]['value'] .= $char;
634
635            $this->state = 'attributeValueUnquoted';
636        }
637    }
638
639    private function attributeValueDoubleQuotedState()
640    {
641        // Consume the next input character:
642        $this->char++;
643        $char = $this->character($this->char);
644
645        if($char === '"') {
646            /* U+0022 QUOTATION MARK (")
647            Switch to the before attribute name state. */
648            $this->state = 'beforeAttributeName';
649
650        } elseif($char === '&') {
651            /* U+0026 AMPERSAND (&)
652            Switch to the entity in attribute value state. */
653            $this->entityInAttributeValueState('double');
654
655        } elseif($this->char === $this->EOF) {
656            /* EOF
657            Parse error. Emit the current tag token. Reconsume the character
658            in the data state. */
659            $this->emitToken($this->token);
660
661            $this->char--;
662            $this->state = 'data';
663
664        } else {
665            /* Anything else
666            Append the current input character to the current attribute's value.
667            Stay in the attribute value (double-quoted) state. */
668            $last = count($this->token['attr']) - 1;
669            $this->token['attr'][$last]['value'] .= $char;
670
671            $this->state = 'attributeValueDoubleQuoted';
672        }
673    }
674
675    private function attributeValueSingleQuotedState()
676    {
677        // Consume the next input character:
678        $this->char++;
679        $char = $this->character($this->char);
680
681        if($char === '\'') {
682            /* U+0022 QUOTATION MARK (')
683            Switch to the before attribute name state. */
684            $this->state = 'beforeAttributeName';
685
686        } elseif($char === '&') {
687            /* U+0026 AMPERSAND (&)
688            Switch to the entity in attribute value state. */
689            $this->entityInAttributeValueState('single');
690
691        } elseif($this->char === $this->EOF) {
692            /* EOF
693            Parse error. Emit the current tag token. Reconsume the character
694            in the data state. */
695            $this->emitToken($this->token);
696
697            $this->char--;
698            $this->state = 'data';
699
700        } else {
701            /* Anything else
702            Append the current input character to the current attribute's value.
703            Stay in the attribute value (single-quoted) state. */
704            $last = count($this->token['attr']) - 1;
705            $this->token['attr'][$last]['value'] .= $char;
706
707            $this->state = 'attributeValueSingleQuoted';
708        }
709    }
710
711    private function attributeValueUnquotedState()
712    {
713        // Consume the next input character:
714        $this->char++;
715        $char = $this->character($this->char);
716
717        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
718            /* U+0009 CHARACTER TABULATION
719            U+000A LINE FEED (LF)
720            U+000B LINE TABULATION
721            U+000C FORM FEED (FF)
722            U+0020 SPACE
723            Switch to the before attribute name state. */
724            $this->state = 'beforeAttributeName';
725
726        } elseif($char === '&') {
727            /* U+0026 AMPERSAND (&)
728            Switch to the entity in attribute value state. */
729            $this->entityInAttributeValueState('non');
730
731        } elseif($char === '>') {
732            /* U+003E GREATER-THAN SIGN (>)
733            Emit the current tag token. Switch to the data state. */
734            $this->emitToken($this->token);
735            $this->state = 'data';
736
737        } else {
738            /* Anything else
739            Append the current input character to the current attribute's value.
740            Stay in the attribute value (unquoted) state. */
741            $last = count($this->token['attr']) - 1;
742            $this->token['attr'][$last]['value'] .= $char;
743
744            $this->state = 'attributeValueUnquoted';
745        }
746    }
747
748    private function entityInAttributeValueState()
749    {
750        // Attempt to consume an entity.
751        $entity = $this->entity();
752
753        // If nothing is returned, append a U+0026 AMPERSAND character to the
754        // current attribute's value. Otherwise, emit the character token that
755        // was returned.
756        $char = (!$entity)
757            ? '&'
758            : $entity;
759
760        $this->emitToken($char);
761    }
762
763    private function bogusCommentState()
764    {
765        /* Consume every character up to the first U+003E GREATER-THAN SIGN
766        character (>) or the end of the file (EOF), whichever comes first. Emit
767        a comment token whose data is the concatenation of all the characters
768        starting from and including the character that caused the state machine
769        to switch into the bogus comment state, up to and including the last
770        consumed character before the U+003E character, if any, or up to the
771        end of the file otherwise. (If the comment was started by the end of
772        the file (EOF), the token is empty.) */
773        $data = $this->characters('^>', $this->char);
774        $this->emitToken(array(
775            'data' => $data,
776            'type' => self::COMMENT
777        ));
778
779        $this->char += strlen($data);
780
781        /* Switch to the data state. */
782        $this->state = 'data';
783
784        /* If the end of the file was reached, reconsume the EOF character. */
785        if($this->char === $this->EOF) {
786            $this->char = $this->EOF - 1;
787        }
788    }
789
790    private function markupDeclarationOpenState()
791    {
792        /* If the next two characters are both U+002D HYPHEN-MINUS (-)
793        characters, consume those two characters, create a comment token whose
794        data is the empty string, and switch to the comment state. */
795        if($this->character($this->char + 1, 2) === '--') {
796            $this->char += 2;
797            $this->state = 'comment';
798            $this->token = array(
799                'data' => null,
800                'type' => self::COMMENT
801            );
802
803        /* Otherwise if the next seven chacacters are a case-insensitive match
804        for the word "DOCTYPE", then consume those characters and switch to the
805        DOCTYPE state. */
806        } elseif(strtolower($this->character($this->char + 1, 7)) === 'doctype') {
807            $this->char += 7;
808            $this->state = 'doctype';
809
810        /* Otherwise, is is a parse error. Switch to the bogus comment state.
811        The next character that is consumed, if any, is the first character
812        that will be in the comment. */
813        } else {
814            $this->char++;
815            $this->state = 'bogusComment';
816        }
817    }
818
819    private function commentState()
820    {
821        /* Consume the next input character: */
822        $this->char++;
823        $char = $this->char();
824
825        /* U+002D HYPHEN-MINUS (-) */
826        if($char === '-') {
827            /* Switch to the comment dash state  */
828            $this->state = 'commentDash';
829
830        /* EOF */
831        } elseif($this->char === $this->EOF) {
832            /* Parse error. Emit the comment token. Reconsume the EOF character
833            in the data state. */
834            $this->emitToken($this->token);
835            $this->char--;
836            $this->state = 'data';
837
838        /* Anything else */
839        } else {
840            /* Append the input character to the comment token's data. Stay in
841            the comment state. */
842            $this->token['data'] .= $char;
843        }
844    }
845
846    private function commentDashState()
847    {
848        /* Consume the next input character: */
849        $this->char++;
850        $char = $this->char();
851
852        /* U+002D HYPHEN-MINUS (-) */
853        if($char === '-') {
854            /* Switch to the comment end state  */
855            $this->state = 'commentEnd';
856
857        /* EOF */
858        } elseif($this->char === $this->EOF) {
859            /* Parse error. Emit the comment token. Reconsume the EOF character
860            in the data state. */
861            $this->emitToken($this->token);
862            $this->char--;
863            $this->state = 'data';
864
865        /* Anything else */
866        } else {
867            /* Append a U+002D HYPHEN-MINUS (-) character and the input
868            character to the comment token's data. Switch to the comment state. */
869            $this->token['data'] .= '-'.$char;
870            $this->state = 'comment';
871        }
872    }
873
874    private function commentEndState()
875    {
876        /* Consume the next input character: */
877        $this->char++;
878        $char = $this->char();
879
880        if($char === '>') {
881            $this->emitToken($this->token);
882            $this->state = 'data';
883
884        } elseif($char === '-') {
885            $this->token['data'] .= '-';
886
887        } elseif($this->char === $this->EOF) {
888            $this->emitToken($this->token);
889            $this->char--;
890            $this->state = 'data';
891
892        } else {
893            $this->token['data'] .= '--'.$char;
894            $this->state = 'comment';
895        }
896    }
897
898    private function doctypeState()
899    {
900        /* Consume the next input character: */
901        $this->char++;
902        $char = $this->char();
903
904        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
905            $this->state = 'beforeDoctypeName';
906
907        } else {
908            $this->char--;
909            $this->state = 'beforeDoctypeName';
910        }
911    }
912
913    private function beforeDoctypeNameState()
914    {
915        /* Consume the next input character: */
916        $this->char++;
917        $char = $this->char();
918
919        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
920            // Stay in the before DOCTYPE name state.
921
922        } elseif(preg_match('/^[a-z]$/', $char)) {
923            $this->token = array(
924                'name' => strtoupper($char),
925                'type' => self::DOCTYPE,
926                'error' => true
927            );
928
929            $this->state = 'doctypeName';
930
931        } elseif($char === '>') {
932            $this->emitToken(array(
933                'name' => null,
934                'type' => self::DOCTYPE,
935                'error' => true
936            ));
937
938            $this->state = 'data';
939
940        } elseif($this->char === $this->EOF) {
941            $this->emitToken(array(
942                'name' => null,
943                'type' => self::DOCTYPE,
944                'error' => true
945            ));
946
947            $this->char--;
948            $this->state = 'data';
949
950        } else {
951            $this->token = array(
952                'name' => $char,
953                'type' => self::DOCTYPE,
954                'error' => true
955            );
956
957            $this->state = 'doctypeName';
958        }
959    }
960
961    private function doctypeNameState()
962    {
963        /* Consume the next input character: */
964        $this->char++;
965        $char = $this->char();
966
967        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
968            $this->state = 'AfterDoctypeName';
969
970        } elseif($char === '>') {
971            $this->emitToken($this->token);
972            $this->state = 'data';
973
974        } elseif(preg_match('/^[a-z]$/', $char)) {
975            $this->token['name'] .= strtoupper($char);
976
977        } elseif($this->char === $this->EOF) {
978            $this->emitToken($this->token);
979            $this->char--;
980            $this->state = 'data';
981
982        } else {
983            $this->token['name'] .= $char;
984        }
985
986        $this->token['error'] = ($this->token['name'] === 'HTML')
987            ? false
988            : true;
989    }
990
991    private function afterDoctypeNameState()
992    {
993        /* Consume the next input character: */
994        $this->char++;
995        $char = $this->char();
996
997        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
998            // Stay in the DOCTYPE name state.
999
1000        } elseif($char === '>') {
1001            $this->emitToken($this->token);
1002            $this->state = 'data';
1003
1004        } elseif($this->char === $this->EOF) {
1005            $this->emitToken($this->token);
1006            $this->char--;
1007            $this->state = 'data';
1008
1009        } else {
1010            $this->token['error'] = true;
1011            $this->state = 'bogusDoctype';
1012        }
1013    }
1014
1015    private function bogusDoctypeState()
1016    {
1017        /* Consume the next input character: */
1018        $this->char++;
1019        $char = $this->char();
1020
1021        if($char === '>') {
1022            $this->emitToken($this->token);
1023            $this->state = 'data';
1024
1025        } elseif($this->char === $this->EOF) {
1026            $this->emitToken($this->token);
1027            $this->char--;
1028            $this->state = 'data';
1029
1030        } else {
1031            // Stay in the bogus DOCTYPE state.
1032        }
1033    }
1034
1035    private function entity()
1036    {
1037        $start = $this->char;
1038
1039        // This section defines how to consume an entity. This definition is
1040        // used when parsing entities in text and in attributes.
1041
1042        // The behaviour depends on the identity of the next character (the
1043        // one immediately after the U+0026 AMPERSAND character):
1044
1045        switch($this->character($this->char + 1)) {
1046            // U+0023 NUMBER SIGN (#)
1047            case '#':
1048
1049                // The behaviour further depends on the character after the
1050                // U+0023 NUMBER SIGN:
1051                switch($this->character($this->char + 1)) {
1052                    // U+0078 LATIN SMALL LETTER X
1053                    // U+0058 LATIN CAPITAL LETTER X
1054                    case 'x':
1055                    case 'X':
1056                        // Follow the steps below, but using the range of
1057                        // characters U+0030 DIGIT ZERO through to U+0039 DIGIT
1058                        // NINE, U+0061 LATIN SMALL LETTER A through to U+0066
1059                        // LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
1060                        // A, through to U+0046 LATIN CAPITAL LETTER F (in other
1061                        // words, 0-9, A-F, a-f).
1062                        $char = 1;
1063                        $char_class = '0-9A-Fa-f';
1064                    break;
1065
1066                    // Anything else
1067                    default:
1068                        // Follow the steps below, but using the range of
1069                        // characters U+0030 DIGIT ZERO through to U+0039 DIGIT
1070                        // NINE (i.e. just 0-9).
1071                        $char = 0;
1072                        $char_class = '0-9';
1073                    break;
1074                }
1075
1076                // Consume as many characters as match the range of characters
1077                // given above.
1078                $this->char++;
1079                $e_name = $this->characters($char_class, $this->char + $char + 1);
1080                $entity = $this->character($start, $this->char);
1081                $cond = strlen($e_name) > 0;
1082
1083                // The rest of the parsing happens below.
1084            break;
1085
1086            // Anything else
1087            default:
1088                // Consume the maximum number of characters possible, with the
1089                // consumed characters case-sensitively matching one of the
1090                // identifiers in the first column of the entities table.
1091                $e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
1092                $len = strlen($e_name);
1093
1094                for($c = 1; $c <= $len; $c++) {
1095                    $id = substr($e_name, 0, $c);
1096                    $this->char++;
1097
1098                    if(in_array($id, $this->entities)) {
1099                        $entity = $id;
1100                        break;
1101                    }
1102                }
1103
1104                $cond = isset($entity);
1105                // The rest of the parsing happens below.
1106            break;
1107        }
1108
1109        if(!$cond) {
1110            // If no match can be made, then this is a parse error. No
1111            // characters are consumed, and nothing is returned.
1112            $this->char = $start;
1113            return false;
1114        }
1115
1116        // Return a character token for the character corresponding to the
1117        // entity name (as given by the second column of the entities table).
1118        return html_entity_decode('&'.$entity.';', ENT_QUOTES, 'UTF-8');
1119    }
1120
1121    private function emitToken($token)
1122    {
1123        $emit = $this->tree->emitToken($token);
1124
1125        if(is_int($emit)) {
1126            $this->content_model = $emit;
1127
1128        } elseif($token['type'] === self::ENDTAG) {
1129            $this->content_model = self::PCDATA;
1130        }
1131    }
1132
1133    private function EOF()
1134    {
1135        $this->state = null;
1136        $this->tree->emitToken(array(
1137            'type' => self::EOF
1138        ));
1139    }
1140}
1141
1142class HTML5TreeConstructer
1143{
1144    public $stack = array();
1145
1146    private $phase;
1147    private $mode;
1148    private $dom;
1149    private $foster_parent = null;
1150    private $a_formatting  = array();
1151
1152    private $head_pointer = null;
1153    private $form_pointer = null;
1154
1155    private $scoping = array('button','caption','html','marquee','object','table','td','th');
1156    private $formatting = array('a','b','big','em','font','i','nobr','s','small','strike','strong','tt','u');
1157    private $special = array('address','area','base','basefont','bgsound',
1158    'blockquote','body','br','center','col','colgroup','dd','dir','div','dl',
1159    'dt','embed','fieldset','form','frame','frameset','h1','h2','h3','h4','h5',
1160    'h6','head','hr','iframe','image','img','input','isindex','li','link',
1161    'listing','menu','meta','noembed','noframes','noscript','ol','optgroup',
1162    'option','p','param','plaintext','pre','script','select','spacer','style',
1163    'tbody','textarea','tfoot','thead','title','tr','ul','wbr');
1164
1165    // The different phases.
1166    const INIT_PHASE = 0;
1167    const ROOT_PHASE = 1;
1168    const MAIN_PHASE = 2;
1169    const END_PHASE  = 3;
1170
1171    // The different insertion modes for the main phase.
1172    const BEFOR_HEAD = 0;
1173    const IN_HEAD    = 1;
1174    const AFTER_HEAD = 2;
1175    const IN_BODY    = 3;
1176    const IN_TABLE   = 4;
1177    const IN_CAPTION = 5;
1178    const IN_CGROUP  = 6;
1179    const IN_TBODY   = 7;
1180    const IN_ROW     = 8;
1181    const IN_CELL    = 9;
1182    const IN_SELECT  = 10;
1183    const AFTER_BODY = 11;
1184    const IN_FRAME   = 12;
1185    const AFTR_FRAME = 13;
1186
1187    // The different types of elements.
1188    const SPECIAL    = 0;
1189    const SCOPING    = 1;
1190    const FORMATTING = 2;
1191    const PHRASING   = 3;
1192
1193    const MARKER     = 0;
1194
1195    public function __construct()
1196    {
1197        $this->phase = self::INIT_PHASE;
1198        $this->mode = self::BEFOR_HEAD;
1199        $this->dom = new DOMDocument;
1200
1201        $this->dom->encoding = 'UTF-8';
1202        $this->dom->preserveWhiteSpace = true;
1203        $this->dom->substituteEntities = true;
1204        $this->dom->strictErrorChecking = false;
1205    }
1206
1207    // Process tag tokens
1208    public function emitToken($token)
1209    {
1210        switch($this->phase) {
1211            case self::INIT_PHASE: return $this->initPhase($token); break;
1212            case self::ROOT_PHASE: return $this->rootElementPhase($token); break;
1213            case self::MAIN_PHASE: return $this->mainPhase($token); break;
1214            case self::END_PHASE : return $this->trailingEndPhase($token); break;
1215        }
1216    }
1217
1218    private function initPhase($token)
1219    {
1220        /* Initially, the tree construction stage must handle each token
1221        emitted from the tokenisation stage as follows: */
1222
1223        /* A DOCTYPE token that is marked as being in error
1224        A comment token
1225        A start tag token
1226        An end tag token
1227        A character token that is not one of one of U+0009 CHARACTER TABULATION,
1228            U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1229            or U+0020 SPACE
1230        An end-of-file token */
1231        if((isset($token['error']) && $token['error']) ||
1232        $token['type'] === HTML5::COMMENT ||
1233        $token['type'] === HTML5::STARTTAG ||
1234        $token['type'] === HTML5::ENDTAG ||
1235        $token['type'] === HTML5::EOF ||
1236        ($token['type'] === HTML5::CHARACTR && isset($token['data']) &&
1237        !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']))) {
1238            /* This specification does not define how to handle this case. In
1239            particular, user agents may ignore the entirety of this specification
1240            altogether for such documents, and instead invoke special parse modes
1241            with a greater emphasis on backwards compatibility. */
1242
1243            $this->phase = self::ROOT_PHASE;
1244            return $this->rootElementPhase($token);
1245
1246        /* A DOCTYPE token marked as being correct */
1247        } elseif(isset($token['error']) && !$token['error']) {
1248            /* Append a DocumentType node to the Document  node, with the name
1249            attribute set to the name given in the DOCTYPE token (which will be
1250            "HTML"), and the other attributes specific to DocumentType objects
1251            set to null, empty lists, or the empty string as appropriate. */
1252            $doctype = new DOMDocumentType(null, null, 'HTML');
1253
1254            /* Then, switch to the root element phase of the tree construction
1255            stage. */
1256            $this->phase = self::ROOT_PHASE;
1257
1258        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1259        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1260        or U+0020 SPACE */
1261        } elseif(isset($token['data']) && preg_match('/^[\t\n\x0b\x0c ]+$/',
1262        $token['data'])) {
1263            /* Append that character  to the Document node. */
1264            $text = $this->dom->createTextNode($token['data']);
1265            $this->dom->appendChild($text);
1266        }
1267    }
1268
1269    private function rootElementPhase($token)
1270    {
1271        /* After the initial phase, as each token is emitted from the tokenisation
1272        stage, it must be processed as described in this section. */
1273
1274        /* A DOCTYPE token */
1275        if($token['type'] === HTML5::DOCTYPE) {
1276            // Parse error. Ignore the token.
1277
1278        /* A comment token */
1279        } elseif($token['type'] === HTML5::COMMENT) {
1280            /* Append a Comment node to the Document object with the data
1281            attribute set to the data given in the comment token. */
1282            $comment = $this->dom->createComment($token['data']);
1283            $this->dom->appendChild($comment);
1284
1285        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1286        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1287        or U+0020 SPACE */
1288        } elseif($token['type'] === HTML5::CHARACTR &&
1289        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
1290            /* Append that character  to the Document node. */
1291            $text = $this->dom->createTextNode($token['data']);
1292            $this->dom->appendChild($text);
1293
1294        /* A character token that is not one of U+0009 CHARACTER TABULATION,
1295            U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED
1296            (FF), or U+0020 SPACE
1297        A start tag token
1298        An end tag token
1299        An end-of-file token */
1300        } elseif(($token['type'] === HTML5::CHARACTR &&
1301        !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
1302        $token['type'] === HTML5::STARTTAG ||
1303        $token['type'] === HTML5::ENDTAG ||
1304        $token['type'] === HTML5::EOF) {
1305            /* Create an HTMLElement node with the tag name html, in the HTML
1306            namespace. Append it to the Document object. Switch to the main
1307            phase and reprocess the current token. */
1308            $html = $this->dom->createElement('html');
1309            $this->dom->appendChild($html);
1310            $this->stack[] = $html;
1311
1312            $this->phase = self::MAIN_PHASE;
1313            return $this->mainPhase($token);
1314        }
1315    }
1316
1317    private function mainPhase($token)
1318    {
1319        /* Tokens in the main phase must be handled as follows: */
1320
1321        /* A DOCTYPE token */
1322        if($token['type'] === HTML5::DOCTYPE) {
1323            // Parse error. Ignore the token.
1324
1325        /* A start tag token with the tag name "html" */
1326        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'html') {
1327            /* If this start tag token was not the first start tag token, then
1328            it is a parse error. */
1329
1330            /* For each attribute on the token, check to see if the attribute
1331            is already present on the top element of the stack of open elements.
1332            If it is not, add the attribute and its corresponding value to that
1333            element. */
1334            foreach($token['attr'] as $attr) {
1335                if(!$this->stack[0]->hasAttribute($attr['name'])) {
1336                    $this->stack[0]->setAttribute($attr['name'], $attr['value']);
1337                }
1338            }
1339
1340        /* An end-of-file token */
1341        } elseif($token['type'] === HTML5::EOF) {
1342            /* Generate implied end tags. */
1343            $this->generateImpliedEndTags();
1344
1345        /* Anything else. */
1346        } else {
1347            /* Depends on the insertion mode: */
1348            switch($this->mode) {
1349                case self::BEFOR_HEAD: return $this->beforeHead($token); break;
1350                case self::IN_HEAD:    return $this->inHead($token); break;
1351                case self::AFTER_HEAD: return $this->afterHead($token); break;
1352                case self::IN_BODY:    return $this->inBody($token); break;
1353                case self::IN_TABLE:   return $this->inTable($token); break;
1354                case self::IN_CAPTION: return $this->inCaption($token); break;
1355                case self::IN_CGROUP:  return $this->inColumnGroup($token); break;
1356                case self::IN_TBODY:   return $this->inTableBody($token); break;
1357                case self::IN_ROW:     return $this->inRow($token); break;
1358                case self::IN_CELL:    return $this->inCell($token); break;
1359                case self::IN_SELECT:  return $this->inSelect($token); break;
1360                case self::AFTER_BODY: return $this->afterBody($token); break;
1361                case self::IN_FRAME:   return $this->inFrameset($token); break;
1362                case self::AFTR_FRAME: return $this->afterFrameset($token); break;
1363                case self::END_PHASE:  return $this->trailingEndPhase($token); break;
1364            }
1365        }
1366    }
1367
1368    private function beforeHead($token)
1369    {
1370        /* Handle the token as follows: */
1371
1372        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1373        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1374        or U+0020 SPACE */
1375        if($token['type'] === HTML5::CHARACTR &&
1376        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
1377            /* Append the character to the current node. */
1378            $this->insertText($token['data']);
1379
1380        /* A comment token */
1381        } elseif($token['type'] === HTML5::COMMENT) {
1382            /* Append a Comment node to the current node with the data attribute
1383            set to the data given in the comment token. */
1384            $this->insertComment($token['data']);
1385
1386        /* A start tag token with the tag name "head" */
1387        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') {
1388            /* Create an element for the token, append the new element to the
1389            current node and push it onto the stack of open elements. */
1390            $element = $this->insertElement($token);
1391
1392            /* Set the head element pointer to this new element node. */
1393            $this->head_pointer = $element;
1394
1395            /* Change the insertion mode to "in head". */
1396            $this->mode = self::IN_HEAD;
1397
1398        /* A start tag token whose tag name is one of: "base", "link", "meta",
1399        "script", "style", "title". Or an end tag with the tag name "html".
1400        Or a character token that is not one of U+0009 CHARACTER TABULATION,
1401        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1402        or U+0020 SPACE. Or any other start tag token */
1403        } elseif($token['type'] === HTML5::STARTTAG ||
1404        ($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') ||
1405        ($token['type'] === HTML5::CHARACTR && !preg_match('/^[\t\n\x0b\x0c ]$/',
1406        $token['data']))) {
1407            /* Act as if a start tag token with the tag name "head" and no
1408            attributes had been seen, then reprocess the current token. */
1409            $this->beforeHead(array(
1410                'name' => 'head',
1411                'type' => HTML5::STARTTAG,
1412                'attr' => array()
1413            ));
1414
1415            return $this->inHead($token);
1416
1417        /* Any other end tag */
1418        } elseif($token['type'] === HTML5::ENDTAG) {
1419            /* Parse error. Ignore the token. */
1420        }
1421    }
1422
1423    private function inHead($token)
1424    {
1425        /* Handle the token as follows: */
1426
1427        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1428        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1429        or U+0020 SPACE.
1430
1431        THIS DIFFERS FROM THE SPEC: If the current node is either a title, style
1432        or script element, append the character to the current node regardless
1433        of its content. */
1434        if(($token['type'] === HTML5::CHARACTR &&
1435        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || (
1436        $token['type'] === HTML5::CHARACTR && in_array(end($this->stack)->nodeName,
1437        array('title', 'style', 'script')))) {
1438            /* Append the character to the current node. */
1439            $this->insertText($token['data']);
1440
1441        /* A comment token */
1442        } elseif($token['type'] === HTML5::COMMENT) {
1443            /* Append a Comment node to the current node with the data attribute
1444            set to the data given in the comment token. */
1445            $this->insertComment($token['data']);
1446
1447        } elseif($token['type'] === HTML5::ENDTAG &&
1448        in_array($token['name'], array('title', 'style', 'script'))) {
1449            array_pop($this->stack);
1450            return HTML5::PCDATA;
1451
1452        /* A start tag with the tag name "title" */
1453        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'title') {
1454            /* Create an element for the token and append the new element to the
1455            node pointed to by the head element pointer, or, if that is null
1456            (innerHTML case), to the current node. */
1457            if($this->head_pointer !== null) {
1458                $element = $this->insertElement($token, false);
1459                $this->head_pointer->appendChild($element);
1460
1461            } else {
1462                $element = $this->insertElement($token);
1463            }
1464
1465            /* Switch the tokeniser's content model flag  to the RCDATA state. */
1466            return HTML5::RCDATA;
1467
1468        /* A start tag with the tag name "style" */
1469        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'style') {
1470            /* Create an element for the token and append the new element to the
1471            node pointed to by the head element pointer, or, if that is null
1472            (innerHTML case), to the current node. */
1473            if($this->head_pointer !== null) {
1474                $element = $this->insertElement($token, false);
1475                $this->head_pointer->appendChild($element);
1476
1477            } else {
1478                $this->insertElement($token);
1479            }
1480
1481            /* Switch the tokeniser's content model flag  to the CDATA state. */
1482            return HTML5::CDATA;
1483
1484        /* A start tag with the tag name "script" */
1485        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'script') {
1486            /* Create an element for the token. */
1487            $element = $this->insertElement($token, false);
1488            $this->head_pointer->appendChild($element);
1489
1490            /* Switch the tokeniser's content model flag  to the CDATA state. */
1491            return HTML5::CDATA;
1492
1493        /* A start tag with the tag name "base", "link", or "meta" */
1494        } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
1495        array('base', 'link', 'meta'))) {
1496            /* Create an element for the token and append the new element to the
1497            node pointed to by the head element pointer, or, if that is null
1498            (innerHTML case), to the current node. */
1499            if($this->head_pointer !== null) {
1500                $element = $this->insertElement($token, false);
1501                $this->head_pointer->appendChild($element);
1502                array_pop($this->stack);
1503
1504            } else {
1505                $this->insertElement($token);
1506            }
1507
1508        /* An end tag with the tag name "head" */
1509        } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'head') {
1510            /* If the current node is a head element, pop the current node off
1511            the stack of open elements. */
1512            if($this->head_pointer->isSameNode(end($this->stack))) {
1513                array_pop($this->stack);
1514
1515            /* Otherwise, this is a parse error. */
1516            } else {
1517                // k
1518            }
1519
1520            /* Change the insertion mode to "after head". */
1521            $this->mode = self::AFTER_HEAD;
1522
1523        /* A start tag with the tag name "head" or an end tag except "html". */
1524        } elseif(($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') ||
1525        ($token['type'] === HTML5::ENDTAG && $token['name'] !== 'html')) {
1526            // Parse error. Ignore the token.
1527
1528        /* Anything else */
1529        } else {
1530            /* If the current node is a head element, act as if an end tag
1531            token with the tag name "head" had been seen. */
1532            if($this->head_pointer->isSameNode(end($this->stack))) {
1533                $this->inHead(array(
1534                    'name' => 'head',
1535                    'type' => HTML5::ENDTAG
1536                ));
1537
1538            /* Otherwise, change the insertion mode to "after head". */
1539            } else {
1540                $this->mode = self::AFTER_HEAD;
1541            }
1542
1543            /* Then, reprocess the current token. */
1544            return $this->afterHead($token);
1545        }
1546    }
1547
1548    private function afterHead($token)
1549    {
1550        /* Handle the token as follows: */
1551
1552        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1553        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1554        or U+0020 SPACE */
1555        if($token['type'] === HTML5::CHARACTR &&
1556        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
1557            /* Append the character to the current node. */
1558            $this->insertText($token['data']);
1559
1560        /* A comment token */
1561        } elseif($token['type'] === HTML5::COMMENT) {
1562            /* Append a Comment node to the current node with the data attribute
1563            set to the data given in the comment token. */
1564            $this->insertComment($token['data']);
1565
1566        /* A start tag token with the tag name "body" */
1567        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'body') {
1568            /* Insert a body element for the token. */
1569            $this->insertElement($token);
1570
1571            /* Change the insertion mode to "in body". */
1572            $this->mode = self::IN_BODY;
1573
1574        /* A start tag token with the tag name "frameset" */
1575        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'frameset') {
1576            /* Insert a frameset element for the token. */
1577            $this->insertElement($token);
1578
1579            /* Change the insertion mode to "in frameset". */
1580            $this->mode = self::IN_FRAME;
1581
1582        /* A start tag token whose tag name is one of: "base", "link", "meta",
1583        "script", "style", "title" */
1584        } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
1585        array('base', 'link', 'meta', 'script', 'style', 'title'))) {
1586            /* Parse error. Switch the insertion mode back to "in head" and
1587            reprocess the token. */
1588            $this->mode = self::IN_HEAD;
1589            return $this->inHead($token);
1590
1591        /* Anything else */
1592        } else {
1593            /* Act as if a start tag token with the tag name "body" and no
1594            attributes had been seen, and then reprocess the current token. */
1595            $this->afterHead(array(
1596                'name' => 'body',
1597                'type' => HTML5::STARTTAG,
1598                'attr' => array()
1599            ));
1600
1601            return $this->inBody($token);
1602        }
1603    }
1604
1605    private function inBody($token)
1606    {
1607        /* Handle the token as follows: */
1608
1609        switch($token['type']) {
1610            /* A character token */
1611            case HTML5::CHARACTR:
1612                /* Reconstruct the active formatting elements, if any. */
1613                $this->reconstructActiveFormattingElements();
1614
1615                /* Append the token's character to the current node. */
1616                $this->insertText($token['data']);
1617            break;
1618
1619            /* A comment token */
1620            case HTML5::COMMENT:
1621                /* Append a Comment node to the current node with the data
1622                attribute set to the data given in the comment token. */
1623                $this->insertComment($token['data']);
1624            break;
1625
1626            case HTML5::STARTTAG:
1627            switch($token['name']) {
1628                /* A start tag token whose tag name is one of: "script",
1629                "style" */
1630                case 'script': case 'style':
1631                    /* Process the token as if the insertion mode had been "in
1632                    head". */
1633                    return $this->inHead($token);
1634                break;
1635
1636                /* A start tag token whose tag name is one of: "base", "link",
1637                "meta", "title" */
1638                case 'base': case 'link': case 'meta': case 'title':
1639                    /* Parse error. Process the token as if the insertion mode
1640                    had    been "in head". */
1641                    return $this->inHead($token);
1642                break;
1643
1644                /* A start tag token with the tag name "body" */
1645                case 'body':
1646                    /* Parse error. If the second element on the stack of open
1647                    elements is not a body element, or, if the stack of open
1648                    elements has only one node on it, then ignore the token.
1649                    (innerHTML case) */
1650                    if(count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') {
1651                        // Ignore
1652
1653                    /* Otherwise, for each attribute on the token, check to see
1654                    if the attribute is already present on the body element (the
1655                    second element)    on the stack of open elements. If it is not,
1656                    add the attribute and its corresponding value to that
1657                    element. */
1658                    } else {
1659                        foreach($token['attr'] as $attr) {
1660                            if(!$this->stack[1]->hasAttribute($attr['name'])) {
1661                                $this->stack[1]->setAttribute($attr['name'], $attr['value']);
1662                            }
1663                        }
1664                    }
1665                break;
1666
1667                /* A start tag whose tag name is one of: "address",
1668                "blockquote", "center", "dir", "div", "dl", "fieldset",
1669                "listing", "menu", "ol", "p", "ul" */
1670                case 'address': case 'blockquote': case 'center': case 'dir':
1671                case 'div': case 'dl': case 'fieldset': case 'listing':
1672                case 'menu': case 'ol': case 'p': case 'ul':
1673                    /* If the stack of open elements has a p element in scope,
1674                    then act as if an end tag with the tag name p had been
1675                    seen. */
1676                    if($this->elementInScope('p')) {
1677                        $this->emitToken(array(
1678                            'name' => 'p',
1679                            'type' => HTML5::ENDTAG
1680                        ));
1681                    }
1682
1683                    /* Insert an HTML element for the token. */
1684                    $this->insertElement($token);
1685                break;
1686
1687                /* A start tag whose tag name is "form" */
1688                case 'form':
1689                    /* If the form element pointer is not null, ignore the
1690                    token with a parse error. */
1691                    if($this->form_pointer !== null) {
1692                        // Ignore.
1693
1694                    /* Otherwise: */
1695                    } else {
1696                        /* If the stack of open elements has a p element in
1697                        scope, then act as if an end tag with the tag name p
1698                        had been seen. */
1699                        if($this->elementInScope('p')) {
1700                            $this->emitToken(array(
1701                                'name' => 'p',
1702                                'type' => HTML5::ENDTAG
1703                            ));
1704                        }
1705
1706                        /* Insert an HTML element for the token, and set the
1707                        form element pointer to point to the element created. */
1708                        $element = $this->insertElement($token);
1709                        $this->form_pointer = $element;
1710                    }
1711                break;
1712
1713                /* A start tag whose tag name is "li", "dd" or "dt" */
1714                case 'li': case 'dd': case 'dt':
1715                    /* If the stack of open elements has a p  element in scope,
1716                    then act as if an end tag with the tag name p had been
1717                    seen. */
1718                    if($this->elementInScope('p')) {
1719                        $this->emitToken(array(
1720                            'name' => 'p',
1721                            'type' => HTML5::ENDTAG
1722                        ));
1723                    }
1724
1725                    $stack_length = count($this->stack) - 1;
1726
1727                    for($n = $stack_length; 0 <= $n; $n--) {
1728                        /* 1. Initialise node to be the current node (the
1729                        bottommost node of the stack). */
1730                        $stop = false;
1731                        $node = $this->stack[$n];
1732                        $cat  = $this->getElementCategory($node->tagName);
1733
1734                        /* 2. If node is an li, dd or dt element, then pop all
1735                        the    nodes from the current node up to node, including
1736                        node, then stop this algorithm. */
1737                        if($token['name'] === $node->tagName ||    ($token['name'] !== 'li'
1738                        && ($node->tagName === 'dd' || $node->tagName === 'dt'))) {
1739                            for($x = $stack_length; $x >= $n ; $x--) {
1740                                array_pop($this->stack);
1741                            }
1742
1743                            break;
1744                        }
1745
1746                        /* 3. If node is not in the formatting category, and is
1747                        not    in the phrasing category, and is not an address or
1748                        div element, then stop this algorithm. */
1749                        if($cat !== self::FORMATTING && $cat !== self::PHRASING &&
1750                        $node->tagName !== 'address' && $node->tagName !== 'div') {
1751                            break;
1752                        }
1753                    }
1754
1755                    /* Finally, insert an HTML element with the same tag
1756                    name as the    token's. */
1757                    $this->insertElement($token);
1758                break;
1759
1760                /* A start tag token whose tag name is "plaintext" */
1761                case 'plaintext':
1762                    /* If the stack of open elements has a p  element in scope,
1763                    then act as if an end tag with the tag name p had been
1764                    seen. */
1765                    if($this->elementInScope('p')) {
1766                        $this->emitToken(array(
1767                            'name' => 'p',
1768                            'type' => HTML5::ENDTAG
1769                        ));
1770                    }
1771
1772                    /* Insert an HTML element for the token. */
1773                    $this->insertElement($token);
1774
1775                    return HTML5::PLAINTEXT;
1776                break;
1777
1778                /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4",
1779                "h5", "h6" */
1780                case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
1781                    /* If the stack of open elements has a p  element in scope,
1782                    then act as if an end tag with the tag name p had been seen. */
1783                    if($this->elementInScope('p')) {
1784                        $this->emitToken(array(
1785                            'name' => 'p',
1786                            'type' => HTML5::ENDTAG
1787                        ));
1788                    }
1789
1790                    /* If the stack of open elements has in scope an element whose
1791                    tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
1792                    this is a parse error; pop elements from the stack until an
1793                    element with one of those tag names has been popped from the
1794                    stack. */
1795                    while($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) {
1796                        array_pop($this->stack);
1797                    }
1798
1799                    /* Insert an HTML element for the token. */
1800                    $this->insertElement($token);
1801                break;
1802
1803                /* A start tag whose tag name is "a" */
1804                case 'a':
1805                    /* If the list of active formatting elements contains
1806                    an element whose tag name is "a" between the end of the
1807                    list and the last marker on the list (or the start of
1808                    the list if there is no marker on the list), then this
1809                    is a parse error; act as if an end tag with the tag name
1810                    "a" had been seen, then remove that element from the list
1811                    of active formatting elements and the stack of open
1812                    elements if the end tag didn't already remove it (it
1813                    might not have if the element is not in table scope). */
1814                    $leng = count($this->a_formatting);
1815
1816                    for($n = $leng - 1; $n >= 0; $n--) {
1817                        if($this->a_formatting[$n] === self::MARKER) {
1818                            break;
1819
1820                        } elseif($this->a_formatting[$n]->nodeName === 'a') {
1821                            $this->emitToken(array(
1822                                'name' => 'a',
1823                                'type' => HTML5::ENDTAG
1824                            ));
1825                            break;
1826                        }
1827                    }
1828
1829                    /* Reconstruct the active formatting elements, if any. */
1830                    $this->reconstructActiveFormattingElements();
1831
1832                    /* Insert an HTML element for the token. */
1833                    $el = $this->insertElement($token);
1834
1835                    /* Add that element to the list of active formatting
1836                    elements. */
1837                    $this->a_formatting[] = $el;
1838                break;
1839
1840                /* A start tag whose tag name is one of: "b", "big", "em", "font",
1841                "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
1842                case 'b': case 'big': case 'em': case 'font': case 'i':
1843                case 'nobr': case 's': case 'small': case 'strike':
1844                case 'strong': case 'tt': case 'u':
1845                    /* Reconstruct the active formatting elements, if any. */
1846                    $this->reconstructActiveFormattingElements();
1847
1848                    /* Insert an HTML element for the token. */
1849                    $el = $this->insertElement($token);
1850
1851                    /* Add that element to the list of active formatting
1852                    elements. */
1853                    $this->a_formatting[] = $el;
1854                break;
1855
1856                /* A start tag token whose tag name is "button" */
1857                case 'button':
1858                    /* If the stack of open elements has a button element in scope,
1859                    then this is a parse error; act as if an end tag with the tag
1860                    name "button" had been seen, then reprocess the token. (We don't
1861                    do that. Unnecessary.) */
1862                    if($this->elementInScope('button')) {
1863                        $this->inBody(array(
1864                            'name' => 'button',
1865                            'type' => HTML5::ENDTAG
1866                        ));
1867                    }
1868
1869                    /* Reconstruct the active formatting elements, if any. */
1870                    $this->reconstructActiveFormattingElements();
1871
1872                    /* Insert an HTML element for the token. */
1873                    $this->insertElement($token);
1874
1875                    /* Insert a marker at the end of the list of active
1876                    formatting elements. */
1877                    $this->a_formatting[] = self::MARKER;
1878                break;
1879
1880                /* A start tag token whose tag name is one of: "marquee", "object" */
1881                case 'marquee': case 'object':
1882                    /* Reconstruct the active formatting elements, if any. */
1883                    $this->reconstructActiveFormattingElements();
1884
1885                    /* Insert an HTML element for the token. */
1886                    $this->insertElement($token);
1887
1888                    /* Insert a marker at the end of the list of active
1889                    formatting elements. */
1890                    $this->a_formatting[] = self::MARKER;
1891                break;
1892
1893                /* A start tag token whose tag name is "xmp" */
1894                case 'xmp':
1895                    /* Reconstruct the active formatting elements, if any. */
1896                    $this->reconstructActiveFormattingElements();
1897
1898                    /* Insert an HTML element for the token. */
1899                    $this->insertElement($token);
1900
1901                    /* Switch the content model flag to the CDATA state. */
1902                    return HTML5::CDATA;
1903                break;
1904
1905                /* A start tag whose tag name is "table" */
1906                case 'table':
1907                    /* If the stack of open elements has a p element in scope,
1908                    then act as if an end tag with the tag name p had been seen. */
1909                    if($this->elementInScope('p')) {
1910                        $this->emitToken(array(
1911                            'name' => 'p',
1912                            'type' => HTML5::ENDTAG
1913                        ));
1914                    }
1915
1916                    /* Insert an HTML element for the token. */
1917                    $this->insertElement($token);
1918
1919                    /* Change the insertion mode to "in table". */
1920                    $this->mode = self::IN_TABLE;
1921                break;
1922
1923                /* A start tag whose tag name is one of: "area", "basefont",
1924                "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */
1925                case 'area': case 'basefont': case 'bgsound': case 'br':
1926                case 'embed': case 'img': case 'param': case 'spacer':
1927                case 'wbr':
1928                    /* Reconstruct the active formatting elements, if any. */
1929                    $this->reconstructActiveFormattingElements();
1930
1931                    /* Insert an HTML element for the token. */
1932                    $this->insertElement($token);
1933
1934                    /* Immediately pop the current node off the stack of open elements. */
1935                    array_pop($this->stack);
1936                break;
1937
1938                /* A start tag whose tag name is "hr" */
1939                case 'hr':
1940                    /* If the stack of open elements has a p element in scope,
1941                    then act as if an end tag with the tag name p had been seen. */
1942                    if($this->elementInScope('p')) {
1943                        $this->emitToken(array(
1944                            'name' => 'p',
1945                            'type' => HTML5::ENDTAG
1946                        ));
1947                    }
1948
1949                    /* Insert an HTML element for the token. */
1950                    $this->insertElement($token);
1951
1952                    /* Immediately pop the current node off the stack of open elements. */
1953                    array_pop($this->stack);
1954                break;
1955
1956                /* A start tag whose tag name is "image" */
1957                case 'image':
1958                    /* Parse error. Change the token's tag name to "img" and
1959                    reprocess it. (Don't ask.) */
1960                    $token['name'] = 'img';
1961                    return $this->inBody($token);
1962                break;
1963
1964                /* A start tag whose tag name is "input" */
1965                case 'input':
1966                    /* Reconstruct the active formatting elements, if any. */
1967                    $this->reconstructActiveFormattingElements();
1968
1969                    /* Insert an input element for the token. */
1970                    $element = $this->insertElement($token, false);
1971
1972                    /* If the form element pointer is not null, then associate the
1973                    input element with the form element pointed to by the form
1974                    element pointer. */
1975                    $this->form_pointer !== null
1976                        ? $this->form_pointer->appendChild($element)
1977                        : end($this->stack)->appendChild($element);
1978
1979                    /* Pop that input element off the stack of open elements. */
1980                    array_pop($this->stack);
1981                break;
1982
1983                /* A start tag whose tag name is "isindex" */
1984                case 'isindex':
1985                    /* Parse error. */
1986                    // w/e
1987
1988                    /* If the form element pointer is not null,
1989                    then ignore the token. */
1990                    if($this->form_pointer === null) {
1991                        /* Act as if a start tag token with the tag name "form" had
1992                        been seen. */
1993                        $this->inBody(array(
1994                            'name' => 'body',
1995                            'type' => HTML5::STARTTAG,
1996                            'attr' => array()
1997                        ));
1998
1999                        /* Act as if a start tag token with the tag name "hr" had
2000                        been seen. */
2001                        $this->inBody(array(
2002                            'name' => 'hr',
2003                            'type' => HTML5::STARTTAG,
2004                            'attr' => array()
2005                        ));
2006
2007                        /* Act as if a start tag token with the tag name "p" had
2008                        been seen. */
2009                        $this->inBody(array(
2010                            'name' => 'p',
2011                            'type' => HTML5::STARTTAG,
2012                            'attr' => array()
2013                        ));
2014
2015                        /* Act as if a start tag token with the tag name "label"
2016                        had been seen. */
2017                        $this->inBody(array(
2018                            'name' => 'label',
2019                            'type' => HTML5::STARTTAG,
2020                            'attr' => array()
2021                        ));
2022
2023                        /* Act as if a stream of character tokens had been seen. */
2024                        $this->insertText('This is a searchable index. '.
2025                        'Insert your search keywords here: ');
2026
2027                        /* Act as if a start tag token with the tag name "input"
2028                        had been seen, with all the attributes from the "isindex"
2029                        token, except with the "name" attribute set to the value
2030                        "isindex" (ignoring any explicit "name" attribute). */
2031                        $attr = $token['attr'];
2032                        $attr[] = array('name' => 'name', 'value' => 'isindex');
2033
2034                        $this->inBody(array(
2035                            'name' => 'input',
2036                            'type' => HTML5::STARTTAG,
2037                            'attr' => $attr
2038                        ));
2039
2040                        /* Act as if a stream of character tokens had been seen
2041                        (see below for what they should say). */
2042                        $this->insertText('This is a searchable index. '.
2043                        'Insert your search keywords here: ');
2044
2045                        /* Act as if an end tag token with the tag name "label"
2046                        had been seen. */
2047                        $this->inBody(array(
2048                            'name' => 'label',
2049                            'type' => HTML5::ENDTAG
2050                        ));
2051
2052                        /* Act as if an end tag token with the tag name "p" had
2053                        been seen. */
2054                        $this->inBody(array(
2055                            'name' => 'p',
2056                            'type' => HTML5::ENDTAG
2057                        ));
2058
2059                        /* Act as if a start tag token with the tag name "hr" had
2060                        been seen. */
2061                        $this->inBody(array(
2062                            'name' => 'hr',
2063                            'type' => HTML5::ENDTAG
2064                        ));
2065
2066                        /* Act as if an end tag token with the tag name "form" had
2067                        been seen. */
2068                        $this->inBody(array(
2069                            'name' => 'form',
2070                            'type' => HTML5::ENDTAG
2071                        ));
2072                    }
2073                break;
2074
2075                /* A start tag whose tag name is "textarea" */
2076                case 'textarea':
2077                    $this->insertElement($token);
2078
2079                    /* Switch the tokeniser's content model flag to the
2080                    RCDATA state. */
2081                    return HTML5::RCDATA;
2082                break;
2083
2084                /* A start tag whose tag name is one of: "iframe", "noembed",
2085                "noframes" */
2086                case 'iframe': case 'noembed': case 'noframes':
2087                    $this->insertElement($token);
2088
2089                    /* Switch the tokeniser's content model flag to the CDATA state. */
2090                    return HTML5::CDATA;
2091                break;
2092
2093                /* A start tag whose tag name is "select" */
2094                case 'select':
2095                    /* Reconstruct the active formatting elements, if any. */
2096                    $this->reconstructActiveFormattingElements();
2097
2098                    /* Insert an HTML element for the token. */
2099                    $this->insertElement($token);
2100
2101                    /* Change the insertion mode to "in select". */
2102                    $this->mode = self::IN_SELECT;
2103                break;
2104
2105                /* A start or end tag whose tag name is one of: "caption", "col",
2106                "colgroup", "frame", "frameset", "head", "option", "optgroup",
2107                "tbody", "td", "tfoot", "th", "thead", "tr". */
2108                case 'caption': case 'col': case 'colgroup': case 'frame':
2109                case 'frameset': case 'head': case 'option': case 'optgroup':
2110                case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead':
2111                case 'tr':
2112                    // Parse error. Ignore the token.
2113                break;
2114
2115                /* A start or end tag whose tag name is one of: "event-source",
2116                "section", "nav", "article", "aside", "header", "footer",
2117                "datagrid", "command" */
2118                case 'event-source': case 'section': case 'nav': case 'article':
2119                case 'aside': case 'header': case 'footer': case 'datagrid':
2120                case 'command':
2121                    // Work in progress!
2122                break;
2123
2124                /* A start tag token not covered by the previous entries */
2125                default:
2126                    /* Reconstruct the active formatting elements, if any. */
2127                    $this->reconstructActiveFormattingElements();
2128
2129                    $this->insertElement($token);
2130                break;
2131            }
2132            break;
2133
2134            case HTML5::ENDTAG:
2135            switch($token['name']) {
2136                /* An end tag with the tag name "body" */
2137                case 'body':
2138                    /* If the second element in the stack of open elements is
2139                    not a body element, this is a parse error. Ignore the token.
2140                    (innerHTML case) */
2141                    if(count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') {
2142                        // Ignore.
2143
2144                    /* If the current node is not the body element, then this
2145                    is a parse error. */
2146                    } elseif(end($this->stack)->nodeName !== 'body') {
2147                        // Parse error.
2148                    }
2149
2150                    /* Change the insertion mode to "after body". */
2151                    $this->mode = self::AFTER_BODY;
2152                break;
2153
2154                /* An end tag with the tag name "html" */
2155                case 'html':
2156                    /* Act as if an end tag with tag name "body" had been seen,
2157                    then, if that token wasn't ignored, reprocess the current
2158                    token. */
2159                    $this->inBody(array(
2160                        'name' => 'body',
2161                        'type' => HTML5::ENDTAG
2162                    ));
2163
2164                    return $this->afterBody($token);
2165                break;
2166
2167                /* An end tag whose tag name is one of: "address", "blockquote",
2168                "center", "dir", "div", "dl", "fieldset", "listing", "menu",
2169                "ol", "pre", "ul" */
2170                case 'address': case 'blockquote': case 'center': case 'dir':
2171                case 'div': case 'dl': case 'fieldset': case 'listing':
2172                case 'menu': case 'ol': case 'pre': case 'ul':
2173                    /* If the stack of open elements has an element in scope
2174                    with the same tag name as that of the token, then generate
2175                    implied end tags. */
2176                    if($this->elementInScope($token['name'])) {
2177                        $this->generateImpliedEndTags();
2178
2179                        /* Now, if the current node is not an element with
2180                        the same tag name as that of the token, then this
2181                        is a parse error. */
2182                        // w/e
2183
2184                        /* If the stack of open elements has an element in
2185                        scope with the same tag name as that of the token,
2186                        then pop elements from this stack until an element
2187                        with that tag name has been popped from the stack. */
2188                        for($n = count($this->stack) - 1; $n >= 0; $n--) {
2189                            if($this->stack[$n]->nodeName === $token['name']) {
2190                                $n = -1;
2191                            }
2192
2193                            array_pop($this->stack);
2194                        }
2195                    }
2196                break;
2197
2198                /* An end tag whose tag name is "form" */
2199                case 'form':
2200                    /* If the stack of open elements has an element in scope
2201                    with the same tag name as that of the token, then generate
2202                    implied    end tags. */
2203                    if($this->elementInScope($token['name'])) {
2204                        $this->generateImpliedEndTags();
2205
2206                    }
2207
2208                    if(end($this->stack)->nodeName !== $token['name']) {
2209                        /* Now, if the current node is not an element with the
2210                        same tag name as that of the token, then this is a parse
2211                        error. */
2212                        // w/e
2213
2214                    } else {
2215                        /* Otherwise, if the current node is an element with
2216                        the same tag name as that of the token pop that element
2217                        from the stack. */
2218                        array_pop($this->stack);
2219                    }
2220
2221                    /* In any case, set the form element pointer to null. */
2222                    $this->form_pointer = null;
2223                break;
2224
2225                /* An end tag whose tag name is "p" */
2226                case 'p':
2227                    /* If the stack of open elements has a p element in scope,
2228                    then generate implied end tags, except for p elements. */
2229                    if($this->elementInScope('p')) {
2230                        $this->generateImpliedEndTags(array('p'));
2231
2232                        /* If the current node is not a p element, then this is
2233                        a parse error. */
2234                        // k
2235
2236                        /* If the stack of open elements has a p element in
2237                        scope, then pop elements from this stack until the stack
2238                        no longer has a p element in scope. */
2239                        for($n = count($this->stack) - 1; $n >= 0; $n--) {
2240                            if($this->elementInScope('p')) {
2241                                array_pop($this->stack);
2242
2243                            } else {
2244                                break;
2245                            }
2246                        }
2247                    }
2248                break;
2249
2250                /* An end tag whose tag name is "dd", "dt", or "li" */
2251                case 'dd': case 'dt': case 'li':
2252                    /* If the stack of open elements has an element in scope
2253                    whose tag name matches the tag name of the token, then
2254                    generate implied end tags, except for elements with the
2255                    same tag name as the token. */
2256                    if($this->elementInScope($token['name'])) {
2257                        $this->generateImpliedEndTags(array($token['name']));
2258
2259                        /* If the current node is not an element with the same
2260                        tag name as the token, then this is a parse error. */
2261                        // w/e
2262
2263                        /* If the stack of open elements has an element in scope
2264                        whose tag name matches the tag name of the token, then
2265                        pop elements from this stack until an element with that
2266                        tag name has been popped from the stack. */
2267                        for($n = count($this->stack) - 1; $n >= 0; $n--) {
2268                            if($this->stack[$n]->nodeName === $token['name']) {
2269                                $n = -1;
2270                            }
2271
2272                            array_pop($this->stack);
2273                        }
2274                    }
2275                break;
2276
2277                /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4",
2278                "h5", "h6" */
2279                case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
2280                    $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
2281
2282                    /* If the stack of open elements has in scope an element whose
2283                    tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2284                    generate implied end tags. */
2285                    if($this->elementInScope($elements)) {
2286                        $this->generateImpliedEndTags();
2287
2288                        /* Now, if the current node is not an element with the same
2289                        tag name as that of the token, then this is a parse error. */
2290                        // w/e
2291
2292                        /* If the stack of open elements has in scope an element
2293                        whose tag name is one of "h1", "h2", "h3", "h4", "h5", or
2294                        "h6", then pop elements from the stack until an element
2295                        with one of those tag names has been popped from the stack. */
2296                        while($this->elementInScope($elements)) {
2297                            array_pop($this->stack);
2298                        }
2299                    }
2300                break;
2301
2302                /* An end tag whose tag name is one of: "a", "b", "big", "em",
2303                "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2304                case 'a': case 'b': case 'big': case 'em': case 'font':
2305                case 'i': case 'nobr': case 's': case 'small': case 'strike':
2306                case 'strong': case 'tt': case 'u':
2307                    /* 1. Let the formatting element be the last element in
2308                    the list of active formatting elements that:
2309                        * is between the end of the list and the last scope
2310                        marker in the list, if any, or the start of the list
2311                        otherwise, and
2312                        * has the same tag name as the token.
2313                    */
2314                    while(true) {
2315                        for($a = count($this->a_formatting) - 1; $a >= 0; $a--) {
2316                            if($this->a_formatting[$a] === self::MARKER) {
2317                                break;
2318
2319                            } elseif($this->a_formatting[$a]->tagName === $token['name']) {
2320                                $formatting_element = $this->a_formatting[$a];
2321                                $in_stack = in_array($formatting_element, $this->stack, true);
2322                                $fe_af_pos = $a;
2323                                break;
2324                            }
2325                        }
2326
2327                        /* If there is no such node, or, if that node is
2328                        also in the stack of open elements but the element
2329                        is not in scope, then this is a parse error. Abort
2330                        these steps. The token is ignored. */
2331                        if(!isset($formatting_element) || ($in_stack &&
2332                        !$this->elementInScope($token['name']))) {
2333                            break;
2334
2335                        /* Otherwise, if there is such a node, but that node
2336                        is not in the stack of open elements, then this is a
2337                        parse error; remove the element from the list, and
2338                        abort these steps. */
2339                        } elseif(isset($formatting_element) && !$in_stack) {
2340                            unset($this->a_formatting[$fe_af_pos]);
2341                            $this->a_formatting = array_merge($this->a_formatting);
2342                            break;
2343                        }
2344
2345                        /* 2. Let the furthest block be the topmost node in the
2346                        stack of open elements that is lower in the stack
2347                        than the formatting element, and is not an element in
2348                        the phrasing or formatting categories. There might
2349                        not be one. */
2350                        $fe_s_pos = array_search($formatting_element, $this->stack, true);
2351                        $length = count($this->stack);
2352
2353                        for($s = $fe_s_pos + 1; $s < $length; $s++) {
2354                            $category = $this->getElementCategory($this->stack[$s]->nodeName);
2355
2356                            if($category !== self::PHRASING && $category !== self::FORMATTING) {
2357                                $furthest_block = $this->stack[$s];
2358                            }
2359                        }
2360
2361                        /* 3. If there is no furthest block, then the UA must
2362                        skip the subsequent steps and instead just pop all
2363                        the nodes from the bottom of the stack of open
2364                        elements, from the current node up to the formatting
2365                        element, and remove the formatting element from the
2366                        list of active formatting elements. */
2367                        if(!isset($furthest_block)) {
2368                            for($n = $length - 1; $n >= $fe_s_pos; $n--) {
2369                                array_pop($this->stack);
2370                            }
2371
2372                            unset($this->a_formatting[$fe_af_pos]);
2373                            $this->a_formatting = array_merge($this->a_formatting);
2374                            break;
2375                        }
2376
2377                        /* 4. Let the common ancestor be the element
2378                        immediately above the formatting element in the stack
2379                        of open elements. */
2380                        $common_ancestor = $this->stack[$fe_s_pos - 1];
2381
2382                        /* 5. If the furthest block has a parent node, then
2383                        remove the furthest block from its parent node. */
2384                        if($furthest_block->parentNode !== null) {
2385                            $furthest_block->parentNode->removeChild($furthest_block);
2386                        }
2387
2388                        /* 6. Let a bookmark note the position of the
2389                        formatting element in the list of active formatting
2390                        elements relative to the elements on either side
2391                        of it in the list. */
2392                        $bookmark = $fe_af_pos;
2393
2394                        /* 7. Let node and last node  be the furthest block.
2395                        Follow these steps: */
2396                        $node = $furthest_block;
2397                        $last_node = $furthest_block;
2398
2399                        while(true) {
2400                            for($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) {
2401                                /* 7.1 Let node be the element immediately
2402                                prior to node in the stack of open elements. */
2403                                $node = $this->stack[$n];
2404
2405                                /* 7.2 If node is not in the list of active
2406                                formatting elements, then remove node from
2407                                the stack of open elements and then go back
2408                                to step 1. */
2409                                if(!in_array($node, $this->a_formatting, true)) {
2410                                    unset($this->stack[$n]);
2411                                    $this->stack = array_merge($this->stack);
2412
2413                                } else {
2414                                    break;
2415                                }
2416                            }
2417
2418                            /* 7.3 Otherwise, if node is the formatting
2419                            element, then go to the next step in the overall
2420                            algorithm. */
2421                            if($node === $formatting_element) {
2422                                break;
2423
2424                            /* 7.4 Otherwise, if last node is the furthest
2425                            block, then move the aforementioned bookmark to
2426                            be immediately after the node in the list of
2427                            active formatting elements. */
2428                            } elseif($last_node === $furthest_block) {
2429                                $bookmark = array_search($node, $this->a_formatting, true) + 1;
2430                            }
2431
2432                            /* 7.5 If node has any children, perform a
2433                            shallow clone of node, replace the entry for
2434                            node in the list of active formatting elements
2435                            with an entry for the clone, replace the entry
2436                            for node in the stack of open elements with an
2437                            entry for the clone, and let node be the clone. */
2438                            if($node->hasChildNodes()) {
2439                                $clone = $node->cloneNode();
2440                                $s_pos = array_search($node, $this->stack, true);
2441                                $a_pos = array_search($node, $this->a_formatting, true);
2442
2443                                $this->stack[$s_pos] = $clone;
2444                                $this->a_formatting[$a_pos] = $clone;
2445                                $node = $clone;
2446                            }
2447
2448                            /* 7.6 Insert last node into node, first removing
2449                            it from its previous parent node if any. */
2450                            if($last_node->parentNode !== null) {
2451                                $last_node->parentNode->removeChild($last_node);
2452                            }
2453
2454                            $node->appendChild($last_node);
2455
2456                            /* 7.7 Let last node be node. */
2457                            $last_node = $node;
2458                        }
2459
2460                        /* 8. Insert whatever last node ended up being in
2461                        the previous step into the common ancestor node,
2462                        first removing it from its previous parent node if
2463                        any. */
2464                        if($last_node->parentNode !== null) {
2465                            $last_node->parentNode->removeChild($last_node);
2466                        }
2467
2468                        $common_ancestor->appendChild($last_node);
2469
2470                        /* 9. Perform a shallow clone of the formatting
2471                        element. */
2472                        $clone = $formatting_element->cloneNode();
2473
2474                        /* 10. Take all of the child nodes of the furthest
2475                        block and append them to the clone created in the
2476                        last step. */
2477                        while($furthest_block->hasChildNodes()) {
2478                            $child = $furthest_block->firstChild;
2479                            $furthest_block->removeChild($child);
2480                            $clone->appendChild($child);
2481                        }
2482
2483                        /* 11. Append that clone to the furthest block. */
2484                        $furthest_block->appendChild($clone);
2485
2486                        /* 12. Remove the formatting element from the list
2487                        of active formatting elements, and insert the clone
2488                        into the list of active formatting elements at the
2489                        position of the aforementioned bookmark. */
2490                        $fe_af_pos = array_search($formatting_element, $this->a_formatting, true);
2491                        unset($this->a_formatting[$fe_af_pos]);
2492                        $this->a_formatting = array_merge($this->a_formatting);
2493
2494                        $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1);
2495                        $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting));
2496                        $this->a_formatting = array_merge($af_part1, array($clone), $af_part2);
2497
2498                        /* 13. Remove the formatting element from the stack
2499                        of open elements, and insert the clone into the stack
2500                        of open elements immediately after (i.e. in a more
2501                        deeply nested position than) the position of the
2502                        furthest block in that stack. */
2503                        $fe_s_pos = array_search($formatting_element, $this->stack, true);
2504                        $fb_s_pos = array_search($furthest_block, $this->stack, true);
2505                        unset($this->stack[$fe_s_pos]);
2506
2507                        $s_part1 = array_slice($this->stack, 0, $fb_s_pos);
2508                        $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack));
2509                        $this->stack = array_merge($s_part1, array($clone), $s_part2);
2510
2511                        /* 14. Jump back to step 1 in this series of steps. */
2512                        unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);
2513                    }
2514                break;
2515
2516                /* An end tag token whose tag name is one of: "button",
2517                "marquee", "object" */
2518                case 'button': case 'marquee': case 'object':
2519                    /* If the stack of open elements has an element in scope whose
2520                    tag name matches the tag name of the token, then generate implied
2521                    tags. */
2522                    if($this->elementInScope($token['name'])) {
2523                        $this->generateImpliedEndTags();
2524
2525                        /* Now, if the current node is not an element with the same
2526                        tag name as the token, then this is a parse error. */
2527                        // k
2528
2529                        /* Now, if the stack of open elements has an element in scope
2530                        whose tag name matches the tag name of the token, then pop
2531                        elements from the stack until that element has been popped from
2532                        the stack, and clear the list of active formatting elements up
2533                        to the last marker. */
2534                        for($n = count($this->stack) - 1; $n >= 0; $n--) {
2535                            if($this->stack[$n]->nodeName === $token['name']) {
2536                                $n = -1;
2537                            }
2538
2539                            array_pop($this->stack);
2540                        }
2541
2542                        $marker = end(array_keys($this->a_formatting, self::MARKER, true));
2543
2544                        for($n = count($this->a_formatting) - 1; $n > $marker; $n--) {
2545                            array_pop($this->a_formatting);
2546                        }
2547                    }
2548                break;
2549
2550                /* Or an end tag whose tag name is one of: "area", "basefont",
2551                "bgsound", "br", "embed", "hr", "iframe", "image", "img",
2552                "input", "isindex", "noembed", "noframes", "param", "select",
2553                "spacer", "table", "textarea", "wbr" */
2554                case 'area': case 'basefont': case 'bgsound': case 'br':
2555                case 'embed': case 'hr': case 'iframe': case 'image':
2556                case 'img': case 'input': case 'isindex': case 'noembed':
2557                case 'noframes': case 'param': case 'select': case 'spacer':
2558                case 'table': case 'textarea': case 'wbr':
2559                    // Parse error. Ignore the token.
2560                break;
2561
2562                /* An end tag token not covered by the previous entries */
2563                default:
2564                    for($n = count($this->stack) - 1; $n >= 0; $n--) {
2565                        /* Initialise node to be the current node (the bottommost
2566                        node of the stack). */
2567                        $node = end($this->stack);
2568
2569                        /* If node has the same tag name as the end tag token,
2570                        then: */
2571                        if($token['name'] === $node->nodeName) {
2572                            /* Generate implied end tags. */
2573                            $this->generateImpliedEndTags();
2574
2575                            /* If the tag name of the end tag token does not
2576                            match the tag name of the current node, this is a
2577                            parse error. */
2578                            // k
2579
2580                            /* Pop all the nodes from the current node up to
2581                            node, including node, then stop this algorithm. */
2582                            for($x = count($this->stack) - $n; $x >= $n; $x--) {
2583                                array_pop($this->stack);
2584                            }
2585
2586                        } else {
2587                            $category = $this->getElementCategory($node);
2588
2589                            if($category !== self::SPECIAL && $category !== self::SCOPING) {
2590                                /* Otherwise, if node is in neither the formatting
2591                                category nor the phrasing category, then this is a
2592                                parse error. Stop this algorithm. The end tag token
2593                                is ignored. */
2594                                return false;
2595                            }
2596                        }
2597                    }
2598                break;
2599            }
2600            break;
2601        }
2602    }
2603
/**
 * Processes one token while the insertion mode is "in table".
 *
 * @param array $token Token array. 'type' holds one of the HTML5::* token
 *                     constants; tag tokens carry 'name' (and 'attr' for
 *                     start tags), character and comment tokens carry 'data'.
 *
 * @return mixed false when the token is ignored (a "table" end tag without a
 *               table element in table scope, or a "table" start tag whose
 *               implied "table" end tag was ignored); the result of
 *               inTableBody() / mainPhase() when the token is handed over to
 *               them; null in every other case.
 */
private function inTable($token)
{
    $clear = array('html', 'table');

    $type = $token['type'];
    $name = isset($token['name']) ? $token['name'] : null;
    $is_start = ($type === HTML5::STARTTAG);
    $is_end = ($type === HTML5::ENDTAG);

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if($type === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $text = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($text);

    /* A comment token */
    } elseif($type === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($comment);

    /* A start tag whose tag name is "caption" */
    } elseif($is_start && $name === 'caption') {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert a marker at the end of the list of active
        formatting elements. */
        $this->a_formatting[] = self::MARKER;

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in caption". */
        $this->insertElement($token);
        $this->mode = self::IN_CAPTION;

    /* A start tag whose tag name is "colgroup" */
    } elseif($is_start && $name === 'colgroup') {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in column group". */
        $this->insertElement($token);
        $this->mode = self::IN_CGROUP;

    /* A start tag whose tag name is "col" */
    } elseif($is_start && $name === 'col') {
        /* Act as if a start tag token with the tag name "colgroup" had
        been seen, then hand the current token to "in column group". */
        $this->inTable(array(
            'name' => 'colgroup',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        $this->inColumnGroup($token);

    /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif($is_start &&
    in_array($name, array('tbody', 'tfoot', 'thead'), true)) {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in table body". */
        $this->insertElement($token);
        $this->mode = self::IN_TBODY;

    /* A start tag whose tag name is one of: "td", "th", "tr" */
    } elseif($is_start && in_array($name, array('td', 'th', 'tr'), true)) {
        /* Act as if a start tag token with the tag name "tbody" had been
        seen, then reprocess the current token. */
        $this->inTable(array(
            'name' => 'tbody',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        return $this->inTableBody($token);

    /* A start tag whose tag name is "table" */
    } elseif($is_start && $name === 'table') {
        /* Parse error. Act as if an end tag token with the tag name "table"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. */
        $end_result = $this->inTable(array(
            'name' => 'table',
            'type' => HTML5::ENDTAG
        ));

        /* The end-tag branch below returns false when it ignores the
        token. In that case the stack and insertion mode are untouched, so
        reprocessing would dispatch the very same token again in the same
        state; ignore the start tag instead. */
        if($end_result === false) {
            return false;
        }

        return $this->mainPhase($token);

    /* An end tag whose tag name is "table" */
    } elseif($is_end && $name === 'table') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if(!$this->elementInScope($name, true)) {
            return false;
        }

        /* Otherwise: generate implied end tags. */
        $this->generateImpliedEndTags();

        /* Now, if the current node is not a table element, then this
        is a parse error. */
        // w/e

        /* Pop elements from this stack until a table element has been
        popped from the stack. */
        do {
            $popped = end($this->stack)->nodeName;
            array_pop($this->stack);
        } while($popped !== 'table');

        /* Reset the insertion mode appropriately. */
        $this->resetInsertionMode();

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif($is_end && in_array($name, array('body', 'caption', 'col',
    'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'), true)) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* Parse error. Process the token as if the insertion mode was "in
        body", with the following exception: */

        /* If the current node is a table, tbody, tfoot, thead, or tr
        element, then, whenever a node would be inserted into the current
        node, it must instead be inserted into the foster parent element. */
        if(in_array(end($this->stack)->nodeName,
        array('table', 'tbody', 'tfoot', 'thead', 'tr'), true)) {
            /* The foster parent element is the parent element of the last
            table element in the stack of open elements, if there is a
            table element and it has such a parent element. If there is no
            table element in the stack of open elements (innerHTML case),
            then the foster parent element is the first element in the
            stack of open elements (the html element). Otherwise, if there
            is a table element in the stack of open elements, but the last
            table element in the stack of open elements has no parent, or
            its parent node is not an element, then the foster parent
            element is the element before the last table element in the
            stack of open elements. */
            $table = null;
            $table_pos = null;

            /* Search from the current node upwards so the *last* table
            element in the stack is the one that is found. */
            for($n = count($this->stack) - 1; $n >= 0; $n--) {
                if($this->stack[$n]->nodeName === 'table') {
                    $table = $this->stack[$n];
                    $table_pos = $n;
                    break;
                }
            }

            if($table === null) {
                $this->foster_parent = $this->stack[0];

            } elseif($table->parentNode !== null &&
            $table->parentNode->nodeType === XML_ELEMENT_NODE) {
                /* Only an *element* parent qualifies; a non-element parent
                must fall through to the branch below. */
                $this->foster_parent = $table->parentNode;

            } else {
                $this->foster_parent = $this->stack[$table_pos - 1];
            }
        }

        $this->inBody($token);
    }
}
2781
/**
 * Processes one token while the insertion mode is "in caption".
 *
 * @param array $token Token array. 'type' holds one of the HTML5::* token
 *                     constants and tag tokens carry 'name'.
 *
 * @return mixed false when the token is ignored because no caption element
 *               is in table scope (either a "caption" end tag, or a token
 *               that implies one); the result of inTable() when the token
 *               is reprocessed there; null in every other case.
 */
private function inCaption($token)
{
    $type = $token['type'];
    $name = isset($token['name']) ? $token['name'] : null;

    /* An end tag whose tag name is "caption" */
    if($type === HTML5::ENDTAG && $name === 'caption') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if(!$this->elementInScope($name, true)) {
            /* Report the ignore explicitly so that the branch below, which
            synthesizes this end tag, can tell it apart from success. */
            return false;
        }

        /* Otherwise: generate implied end tags. */
        $this->generateImpliedEndTags();

        /* Now, if the current node is not a caption element, then this
        is a parse error. */
        // w/e

        /* Pop elements from this stack until a caption element has
        been popped from the stack. */
        do {
            $popped = end($this->stack)->nodeName;
            array_pop($this->stack);
        } while($popped !== 'caption');

        /* Clear the list of active formatting elements up to the last
        marker. */
        $this->clearTheActiveFormattingElementsUpToTheLastMarker();

        /* Switch the insertion mode to "in table". */
        $this->mode = self::IN_TABLE;

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
    name is "table" */
    } elseif(($type === HTML5::STARTTAG && in_array($name,
    array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'), true)) || ($type === HTML5::ENDTAG &&
    $name === 'table')) {
        /* Parse error. Act as if an end tag with the tag name "caption"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. */
        $end_result = $this->inCaption(array(
            'name' => 'caption',
            'type' => HTML5::ENDTAG
        ));

        if($end_result === false) {
            /* The implied end tag was ignored, so the current token is
            ignored as well and is not reprocessed. */
            return false;
        }

        return $this->inTable($token);

    /* An end tag whose tag name is one of: "body", "col", "colgroup",
    "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif($type === HTML5::ENDTAG && in_array($name,
    array('body', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'), true)) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
2850
2851    private function inColumnGroup($token)
2852    {
2853        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
2854        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
2855        or U+0020 SPACE */
2856        if($token['type'] === HTML5::CHARACTR &&
2857        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
2858            /* Append the character to the current node. */
2859            $text = $this->dom->createTextNode($token['data']);
2860            end($this->stack)->appendChild($text);
2861
2862        /* A comment token */
2863        } elseif($token['type'] === HTML5::COMMENT) {
2864            /* Append a Comment node to the current node with the data
2865            attribute set to the data given in the comment token. */
2866            $comment = $this->dom->createComment($token['data']);
2867            end($this->stack)->appendChild($comment);
2868
2869        /* A start tag whose tag name is "col" */
2870        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'col') {
2871            /* Insert a col element for the token. Immediately pop the current
2872            node off the stack of open elements. */
2873            $this->insertElement($token);
2874            array_pop($this->stack);
2875
2876        /* An end tag whose tag name is "colgroup" */
2877        } elseif($token['type'] === HTML5::ENDTAG &&
2878        $token['name'] === 'colgroup') {
2879            /* If the current node is the root html element, then this is a
2880            parse error, ignore the token. (innerHTML case) */
2881            if(end($this->stack)->nodeName === 'html') {
2882                // Ignore
2883
2884            /* Otherwise, pop the current node (which will be a colgroup
2885            element) from the stack of open elements. Switch the insertion
2886            mode to "in table". */
2887            } else {
2888                array_pop($this->stack);
2889                $this->mode = self::IN_TABLE;
2890            }
2891
2892        /* An end tag whose tag name is "col" */
2893        } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'col') {
2894            /* Parse error. Ignore the token. */
2895
2896        /* Anything else */
2897        } else {
2898            /* Act as if an end tag with the tag name "colgroup" had been seen,
2899            and then, if that token wasn't ignored, reprocess the current token. */
2900            $this->inColumnGroup(array(
2901                'name' => 'colgroup',
2902                'type' => HTML5::ENDTAG
2903            ));
2904
2905            return $this->inTable($token);
2906        }
2907    }
2908
2909    private function inTableBody($token)
2910    {
2911        $clear = array('tbody', 'tfoot', 'thead', 'html');
2912
2913        /* A start tag whose tag name is "tr" */
2914        if($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') {
2915            /* Clear the stack back to a table body context. */
2916            $this->clearStackToTableContext($clear);
2917
2918            /* Insert a tr element for the token, then switch the insertion
2919            mode to "in row". */
2920            $this->insertElement($token);
2921            $this->mode = self::IN_ROW;
2922
2923        /* A start tag whose tag name is one of: "th", "td" */
2924        } elseif($token['type'] === HTML5::STARTTAG &&
2925        ($token['name'] === 'th' ||    $token['name'] === 'td')) {
2926            /* Parse error. Act as if a start tag with the tag name "tr" had
2927            been seen, then reprocess the current token. */
2928            $this->inTableBody(array(
2929                'name' => 'tr',
2930                'type' => HTML5::STARTTAG,
2931                'attr' => array()
2932            ));
2933
2934            return $this->inRow($token);
2935
2936        /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
2937        } elseif($token['type'] === HTML5::ENDTAG &&
2938        in_array($token['name'], array('tbody', 'tfoot', 'thead'))) {
2939            /* If the stack of open elements does not have an element in table
2940            scope with the same tag name as the token, this is a parse error.
2941            Ignore the token. */
2942            if(!$this->elementInScope($token['name'], true)) {
2943                // Ignore
2944
2945            /* Otherwise: */
2946            } else {
2947                /* Clear the stack back to a table body context. */
2948                $this->clearStackToTableContext($clear);
2949
2950                /* Pop the current node from the stack of open elements. Switch
2951                the insertion mode to "in table". */
2952                array_pop($this->stack);
2953                $this->mode = self::IN_TABLE;
2954            }
2955
2956        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
2957        "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */
2958        } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
2959        array('caption', 'col', 'colgroup', 'tbody', 'tfoor', 'thead'))) ||
2960        ($token['type'] === HTML5::STARTTAG && $token['name'] === 'table')) {
2961            /* If the stack of open elements does not have a tbody, thead, or
2962            tfoot element in table scope, this is a parse error. Ignore the
2963            token. (innerHTML case) */
2964            if(!$this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
2965                // Ignore.
2966
2967            /* Otherwise: */
2968            } else {
2969                /* Clear the stack back to a table body context. */
2970                $this->clearStackToTableContext($clear);
2971
2972                /* Act as if an end tag with the same tag name as the current
2973                node ("tbody", "tfoot", or "thead") had been seen, then
2974                reprocess the current token. */
2975                $this->inTableBody(array(
2976                    'name' => end($this->stack)->nodeName,
2977                    'type' => HTML5::ENDTAG
2978                ));
2979
2980                return $this->mainPhase($token);
2981            }
2982
2983        /* An end tag whose tag name is one of: "body", "caption", "col",
2984        "colgroup", "html", "td", "th", "tr" */
2985        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
2986        array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'))) {
2987            /* Parse error. Ignore the token. */
2988
2989        /* Anything else */
2990        } else {
2991            /* Process the token as if the insertion mode was "in table". */
2992            $this->inTable($token);
2993        }
2994    }
2995
2996    private function inRow($token)
2997    {
2998        $clear = array('tr', 'html');
2999
3000        /* A start tag whose tag name is one of: "th", "td" */
3001        if($token['type'] === HTML5::STARTTAG &&
3002        ($token['name'] === 'th' || $token['name'] === 'td')) {
3003            /* Clear the stack back to a table row context. */
3004            $this->clearStackToTableContext($clear);
3005
3006            /* Insert an HTML element for the token, then switch the insertion
3007            mode to "in cell". */
3008            $this->insertElement($token);
3009            $this->mode = self::IN_CELL;
3010
3011            /* Insert a marker at the end of the list of active formatting
3012            elements. */
3013            $this->a_formatting[] = self::MARKER;
3014
3015        /* An end tag whose tag name is "tr" */
3016        } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') {
3017            /* If the stack of open elements does not have an element in table
3018            scope with the same tag name as the token, this is a parse error.
3019            Ignore the token. (innerHTML case) */
3020            if(!$this->elementInScope($token['name'], true)) {
3021                // Ignore.
3022
3023            /* Otherwise: */
3024            } else {
3025                /* Clear the stack back to a table row context. */
3026                $this->clearStackToTableContext($clear);
3027
3028                /* Pop the current node (which will be a tr element) from the
3029                stack of open elements. Switch the insertion mode to "in table
3030                body". */
3031                array_pop($this->stack);
3032                $this->mode = self::IN_TBODY;
3033            }
3034
3035        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
3036        "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
3037        } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
3038        array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'))) {
3039            /* Act as if an end tag with the tag name "tr" had been seen, then,
3040            if that token wasn't ignored, reprocess the current token. */
3041            $this->inRow(array(
3042                'name' => 'tr',
3043                'type' => HTML5::ENDTAG
3044            ));
3045
3046            return $this->inCell($token);
3047
3048        /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
3049        } elseif($token['type'] === HTML5::ENDTAG &&
3050        in_array($token['name'], array('tbody', 'tfoot', 'thead'))) {
3051            /* If the stack of open elements does not have an element in table
3052            scope with the same tag name as the token, this is a parse error.
3053            Ignore the token. */
3054            if(!$this->elementInScope($token['name'], true)) {
3055                // Ignore.
3056
3057            /* Otherwise: */
3058            } else {
3059                /* Otherwise, act as if an end tag with the tag name "tr" had
3060                been seen, then reprocess the current token. */
3061                $this->inRow(array(
3062                    'name' => 'tr',
3063                    'type' => HTML5::ENDTAG
3064                ));
3065
3066                return $this->inCell($token);
3067            }
3068
3069        /* An end tag whose tag name is one of: "body", "caption", "col",
3070        "colgroup", "html", "td", "th" */
3071        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
3072        array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'))) {
3073            /* Parse error. Ignore the token. */
3074
3075        /* Anything else */
3076        } else {
3077            /* Process the token as if the insertion mode was "in table". */
3078            $this->inTable($token);
3079        }
3080    }
3081
3082    private function inCell($token)
3083    {
3084        /* An end tag whose tag name is one of: "td", "th" */
3085        if($token['type'] === HTML5::ENDTAG &&
3086        ($token['name'] === 'td' || $token['name'] === 'th')) {
3087            /* If the stack of open elements does not have an element in table
3088            scope with the same tag name as that of the token, then this is a
3089            parse error and the token must be ignored. */
3090            if(!$this->elementInScope($token['name'], true)) {
3091                // Ignore.
3092
3093            /* Otherwise: */
3094            } else {
3095                /* Generate implied end tags, except for elements with the same
3096                tag name as the token. */
3097                $this->generateImpliedEndTags(array($token['name']));
3098
3099                /* Now, if the current node is not an element with the same tag
3100                name as the token, then this is a parse error. */
3101                // k
3102
3103                /* Pop elements from this stack until an element with the same
3104                tag name as the token has been popped from the stack. */
3105                while(true) {
3106                    $node = end($this->stack)->nodeName;
3107                    array_pop($this->stack);
3108
3109                    if($node === $token['name']) {
3110                        break;
3111                    }
3112                }
3113
3114                /* Clear the list of active formatting elements up to the last
3115                marker. */
3116                $this->clearTheActiveFormattingElementsUpToTheLastMarker();
3117
3118                /* Switch the insertion mode to "in row". (The current node
3119                will be a tr element at this point.) */
3120                $this->mode = self::IN_ROW;
3121            }
3122
3123        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
3124        "tbody", "td", "tfoot", "th", "thead", "tr" */
3125        } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
3126        array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
3127        'thead', 'tr'))) {
3128            /* If the stack of open elements does not have a td or th element
3129            in table scope, then this is a parse error; ignore the token.
3130            (innerHTML case) */
3131            if(!$this->elementInScope(array('td', 'th'), true)) {
3132                // Ignore.
3133
3134            /* Otherwise, close the cell (see below) and reprocess the current
3135            token. */
3136            } else {
3137                $this->closeCell();
3138                return $this->inRow($token);
3139            }
3140
3141        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
3142        "tbody", "td", "tfoot", "th", "thead", "tr" */
3143        } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
3144        array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
3145        'thead', 'tr'))) {
3146            /* If the stack of open elements does not have a td or th element
3147            in table scope, then this is a parse error; ignore the token.
3148            (innerHTML case) */
3149            if(!$this->elementInScope(array('td', 'th'), true)) {
3150                // Ignore.
3151
3152            /* Otherwise, close the cell (see below) and reprocess the current
3153            token. */
3154            } else {
3155                $this->closeCell();
3156                return $this->inRow($token);
3157            }
3158
3159        /* An end tag whose tag name is one of: "body", "caption", "col",
3160        "colgroup", "html" */
3161        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
3162        array('body', 'caption', 'col', 'colgroup', 'html'))) {
3163            /* Parse error. Ignore the token. */
3164
3165        /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
3166        "thead", "tr" */
3167        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
3168        array('table', 'tbody', 'tfoot', 'thead', 'tr'))) {
3169            /* If the stack of open elements does not have an element in table
3170            scope with the same tag name as that of the token (which can only
3171            happen for "tbody", "tfoot" and "thead", or, in the innerHTML case),
3172            then this is a parse error and the token must be ignored. */
3173            if(!$this->elementInScope($token['name'], true)) {
3174                // Ignore.
3175
3176            /* Otherwise, close the cell (see below) and reprocess the current
3177            token. */
3178            } else {
3179                $this->closeCell();
3180                return $this->inRow($token);
3181            }
3182
3183        /* Anything else */
3184        } else {
3185            /* Process the token as if the insertion mode was "in body". */
3186            $this->inBody($token);
3187        }
3188    }
3189
3190    private function inSelect($token)
3191    {
3192        /* Handle the token as follows: */
3193
3194        /* A character token */
3195        if($token['type'] === HTML5::CHARACTR) {
3196            /* Append the token's character to the current node. */
3197            $this->insertText($token['data']);
3198
3199        /* A comment token */
3200        } elseif($token['type'] === HTML5::COMMENT) {
3201            /* Append a Comment node to the current node with the data
3202            attribute set to the data given in the comment token. */
3203            $this->insertComment($token['data']);
3204
3205        /* A start tag token whose tag name is "option" */
3206        } elseif($token['type'] === HTML5::STARTTAG &&
3207        $token['name'] === 'option') {
3208            /* If the current node is an option element, act as if an end tag
3209            with the tag name "option" had been seen. */
3210            if(end($this->stack)->nodeName === 'option') {
3211                $this->inSelect(array(
3212                    'name' => 'option',
3213                    'type' => HTML5::ENDTAG
3214                ));
3215            }
3216
3217            /* Insert an HTML element for the token. */
3218            $this->insertElement($token);
3219
3220        /* A start tag token whose tag name is "optgroup" */
3221        } elseif($token['type'] === HTML5::STARTTAG &&
3222        $token['name'] === 'optgroup') {
3223            /* If the current node is an option element, act as if an end tag
3224            with the tag name "option" had been seen. */
3225            if(end($this->stack)->nodeName === 'option') {
3226                $this->inSelect(array(
3227                    'name' => 'option',
3228                    'type' => HTML5::ENDTAG
3229                ));
3230            }
3231
3232            /* If the current node is an optgroup element, act as if an end tag
3233            with the tag name "optgroup" had been seen. */
3234            if(end($this->stack)->nodeName === 'optgroup') {
3235                $this->inSelect(array(
3236                    'name' => 'optgroup',
3237                    'type' => HTML5::ENDTAG
3238                ));
3239            }
3240
3241            /* Insert an HTML element for the token. */
3242            $this->insertElement($token);
3243
3244        /* An end tag token whose tag name is "optgroup" */
3245        } elseif($token['type'] === HTML5::ENDTAG &&
3246        $token['name'] === 'optgroup') {
3247            /* First, if the current node is an option element, and the node
3248            immediately before it in the stack of open elements is an optgroup
3249            element, then act as if an end tag with the tag name "option" had
3250            been seen. */
3251            $elements_in_stack = count($this->stack);
3252
3253            if($this->stack[$elements_in_stack - 1]->nodeName === 'option' &&
3254            $this->stack[$elements_in_stack - 2]->nodeName === 'optgroup') {
3255                $this->inSelect(array(
3256                    'name' => 'option',
3257                    'type' => HTML5::ENDTAG
3258                ));
3259            }
3260
3261            /* If the current node is an optgroup element, then pop that node
3262            from the stack of open elements. Otherwise, this is a parse error,
3263            ignore the token. */
3264            if($this->stack[$elements_in_stack - 1] === 'optgroup') {
3265                array_pop($this->stack);
3266            }
3267
3268        /* An end tag token whose tag name is "option" */
3269        } elseif($token['type'] === HTML5::ENDTAG &&
3270        $token['name'] === 'option') {
3271            /* If the current node is an option element, then pop that node
3272            from the stack of open elements. Otherwise, this is a parse error,
3273            ignore the token. */
3274            if(end($this->stack)->nodeName === 'option') {
3275                array_pop($this->stack);
3276            }
3277
3278        /* An end tag whose tag name is "select" */
3279        } elseif($token['type'] === HTML5::ENDTAG &&
3280        $token['name'] === 'select') {
3281            /* If the stack of open elements does not have an element in table
3282            scope with the same tag name as the token, this is a parse error.
3283            Ignore the token. (innerHTML case) */
3284            if(!$this->elementInScope($token['name'], true)) {
3285                // w/e
3286
3287            /* Otherwise: */
3288            } else {
3289                /* Pop elements from the stack of open elements until a select
3290                element has been popped from the stack. */
3291                while(true) {
3292                    $current = end($this->stack)->nodeName;
3293                    array_pop($this->stack);
3294
3295                    if($current === 'select') {
3296                        break;
3297                    }
3298                }
3299
3300                /* Reset the insertion mode appropriately. */
3301                $this->resetInsertionMode();
3302            }
3303
3304        /* A start tag whose tag name is "select" */
3305        } elseif($token['name'] === 'select' &&
3306        $token['type'] === HTML5::STARTTAG) {
3307            /* Parse error. Act as if the token had been an end tag with the
3308            tag name "select" instead. */
3309            $this->inSelect(array(
3310                'name' => 'select',
3311                'type' => HTML5::ENDTAG
3312            ));
3313
3314        /* An end tag whose tag name is one of: "caption", "table", "tbody",
3315        "tfoot", "thead", "tr", "td", "th" */
3316        } elseif(in_array($token['name'], array('caption', 'table', 'tbody',
3317        'tfoot', 'thead', 'tr', 'td', 'th')) && $token['type'] === HTML5::ENDTAG) {
3318            /* Parse error. */
3319            // w/e
3320
3321            /* If the stack of open elements has an element in table scope with
3322            the same tag name as that of the token, then act as if an end tag
3323            with the tag name "select" had been seen, and reprocess the token.
3324            Otherwise, ignore the token. */
3325            if($this->elementInScope($token['name'], true)) {
3326                $this->inSelect(array(
3327                    'name' => 'select',
3328                    'type' => HTML5::ENDTAG
3329                ));
3330
3331                $this->mainPhase($token);
3332            }
3333
3334        /* Anything else */
3335        } else {
3336            /* Parse error. Ignore the token. */
3337        }
3338    }
3339
3340    private function afterBody($token)
3341    {
3342        /* Handle the token as follows: */
3343
3344        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
3345        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
3346        or U+0020 SPACE */
3347        if($token['type'] === HTML5::CHARACTR &&
3348        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
3349            /* Process the token as it would be processed if the insertion mode
3350            was "in body". */
3351            $this->inBody($token);
3352
3353        /* A comment token */
3354        } elseif($token['type'] === HTML5::COMMENT) {
3355            /* Append a Comment node to the first element in the stack of open
3356            elements (the html element), with the data attribute set to the
3357            data given in the comment token. */
3358            $comment = $this->dom->createComment($token['data']);
3359            $this->stack[0]->appendChild($comment);
3360
3361        /* An end tag with the tag name "html" */
3362        } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') {
3363            /* If the parser was originally created in order to handle the
3364            setting of an element's innerHTML attribute, this is a parse error;
3365            ignore the token. (The element will be an html element in this
3366            case.) (innerHTML case) */
3367
3368            /* Otherwise, switch to the trailing end phase. */
3369            $this->phase = self::END_PHASE;
3370
3371        /* Anything else */
3372        } else {
3373            /* Parse error. Set the insertion mode to "in body" and reprocess
3374            the token. */
3375            $this->mode = self::IN_BODY;
3376            return $this->inBody($token);
3377        }
3378    }
3379
3380    private function inFrameset($token)
3381    {
3382        /* Handle the token as follows: */
3383
3384        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
3385        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
3386        U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
3387        if($token['type'] === HTML5::CHARACTR &&
3388        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
3389            /* Append the character to the current node. */
3390            $this->insertText($token['data']);
3391
3392        /* A comment token */
3393        } elseif($token['type'] === HTML5::COMMENT) {
3394            /* Append a Comment node to the current node with the data
3395            attribute set to the data given in the comment token. */
3396            $this->insertComment($token['data']);
3397
3398        /* A start tag with the tag name "frameset" */
3399        } elseif($token['name'] === 'frameset' &&
3400        $token['type'] === HTML5::STARTTAG) {
3401            $this->insertElement($token);
3402
3403        /* An end tag with the tag name "frameset" */
3404        } elseif($token['name'] === 'frameset' &&
3405        $token['type'] === HTML5::ENDTAG) {
3406            /* If the current node is the root html element, then this is a
3407            parse error; ignore the token. (innerHTML case) */
3408            if(end($this->stack)->nodeName === 'html') {
3409                // Ignore
3410
3411            } else {
3412                /* Otherwise, pop the current node from the stack of open
3413                elements. */
3414                array_pop($this->stack);
3415
3416                /* If the parser was not originally created in order to handle
3417                the setting of an element's innerHTML attribute (innerHTML case),
3418                and the current node is no longer a frameset element, then change
3419                the insertion mode to "after frameset". */
3420                $this->mode = self::AFTR_FRAME;
3421            }
3422
3423        /* A start tag with the tag name "frame" */
3424        } elseif($token['name'] === 'frame' &&
3425        $token['type'] === HTML5::STARTTAG) {
3426            /* Insert an HTML element for the token. */
3427            $this->insertElement($token);
3428
3429            /* Immediately pop the current node off the stack of open elements. */
3430            array_pop($this->stack);
3431
3432        /* A start tag with the tag name "noframes" */
3433        } elseif($token['name'] === 'noframes' &&
3434        $token['type'] === HTML5::STARTTAG) {
3435            /* Process the token as if the insertion mode had been "in body". */
3436            $this->inBody($token);
3437
3438        /* Anything else */
3439        } else {
3440            /* Parse error. Ignore the token. */
3441        }
3442    }
3443
3444    private function afterFrameset($token)
3445    {
3446        /* Handle the token as follows: */
3447
3448        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
3449        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
3450        U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
3451        if($token['type'] === HTML5::CHARACTR &&
3452        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
3453            /* Append the character to the current node. */
3454            $this->insertText($token['data']);
3455
3456        /* A comment token */
3457        } elseif($token['type'] === HTML5::COMMENT) {
3458            /* Append a Comment node to the current node with the data
3459            attribute set to the data given in the comment token. */
3460            $this->insertComment($token['data']);
3461
3462        /* An end tag with the tag name "html" */
3463        } elseif($token['name'] === 'html' &&
3464        $token['type'] === HTML5::ENDTAG) {
3465            /* Switch to the trailing end phase. */
3466            $this->phase = self::END_PHASE;
3467
3468        /* A start tag with the tag name "noframes" */
3469        } elseif($token['name'] === 'noframes' &&
3470        $token['type'] === HTML5::STARTTAG) {
3471            /* Process the token as if the insertion mode had been "in body". */
3472            $this->inBody($token);
3473
3474        /* Anything else */
3475        } else {
3476            /* Parse error. Ignore the token. */
3477        }
3478    }
3479
3480    private function trailingEndPhase($token)
3481    {
3482        /* After the main phase, as each token is emitted from the tokenisation
3483        stage, it must be processed as described in this section. */
3484
3485        /* A DOCTYPE token */
3486        if($token['type'] === HTML5::DOCTYPE) {
3487            // Parse error. Ignore the token.
3488
3489        /* A comment token */
3490        } elseif($token['type'] === HTML5::COMMENT) {
3491            /* Append a Comment node to the Document object with the data
3492            attribute set to the data given in the comment token. */
3493            $comment = $this->dom->createComment($token['data']);
3494            $this->dom->appendChild($comment);
3495
3496        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
3497        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
3498        or U+0020 SPACE */
3499        } elseif($token['type'] === HTML5::CHARACTR &&
3500        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
3501            /* Process the token as it would be processed in the main phase. */
3502            $this->mainPhase($token);
3503
3504        /* A character token that is not one of U+0009 CHARACTER TABULATION,
3505        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
3506        or U+0020 SPACE. Or a start tag token. Or an end tag token. */
3507        } elseif(($token['type'] === HTML5::CHARACTR &&
3508        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
3509        $token['type'] === HTML5::STARTTAG || $token['type'] === HTML5::ENDTAG) {
3510            /* Parse error. Switch back to the main phase and reprocess the
3511            token. */
3512            $this->phase = self::MAIN_PHASE;
3513            return $this->mainPhase($token);
3514
3515        /* An end-of-file token */
3516        } elseif($token['type'] === HTML5::EOF) {
3517            /* OMG DONE!! */
3518        }
3519    }
3520
3521    private function insertElement($token, $append = true)
3522    {
3523        $el = $this->dom->createElement($token['name']);
3524
3525        foreach($token['attr'] as $attr) {
3526            if(!$el->hasAttribute($attr['name'])) {
3527                $el->setAttribute($attr['name'], $attr['value']);
3528            }
3529        }
3530
3531        $this->appendToRealParent($el);
3532        $this->stack[] = $el;
3533
3534        return $el;
3535    }
3536
3537    private function insertText($data)
3538    {
3539        $text = $this->dom->createTextNode($data);
3540        $this->appendToRealParent($text);
3541    }
3542
3543    private function insertComment($data)
3544    {
3545        $comment = $this->dom->createComment($data);
3546        $this->appendToRealParent($comment);
3547    }
3548
3549    private function appendToRealParent($node)
3550    {
3551        if($this->foster_parent === null) {
3552            end($this->stack)->appendChild($node);
3553
3554        } elseif($this->foster_parent !== null) {
3555            /* If the foster parent element is the parent element of the
3556            last table element in the stack of open elements, then the new
3557            node must be inserted immediately before the last table element
3558            in the stack of open elements in the foster parent element;
3559            otherwise, the new node must be appended to the foster parent
3560            element. */
3561            for($n = count($this->stack) - 1; $n >= 0; $n--) {
3562                if($this->stack[$n]->nodeName === 'table' &&
3563                $this->stack[$n]->parentNode !== null) {
3564                    $table = $this->stack[$n];
3565                    break;
3566                }
3567            }
3568
3569            if(isset($table) && $this->foster_parent->isSameNode($table->parentNode))
3570                $this->foster_parent->insertBefore($node, $table);
3571            else
3572                $this->foster_parent->appendChild($node);
3573
3574            $this->foster_parent = null;
3575        }
3576    }
3577
3578    private function elementInScope($el, $table = false)
3579    {
3580        if(is_array($el)) {
3581            foreach($el as $element) {
3582                if($this->elementInScope($element, $table)) {
3583                    return true;
3584                }
3585            }
3586
3587            return false;
3588        }
3589
3590        $leng = count($this->stack);
3591
3592        for($n = 0; $n < $leng; $n++) {
3593            /* 1. Initialise node to be the current node (the bottommost node of
3594            the stack). */
3595            $node = $this->stack[$leng - 1 - $n];
3596
3597            if($node->tagName === $el) {
3598                /* 2. If node is the target node, terminate in a match state. */
3599                return true;
3600
3601            } elseif($node->tagName === 'table') {
3602                /* 3. Otherwise, if node is a table element, terminate in a failure
3603                state. */
3604                return false;
3605
3606            } elseif($table === true && in_array($node->tagName, array('caption', 'td',
3607            'th', 'button', 'marquee', 'object'))) {
3608                /* 4. Otherwise, if the algorithm is the "has an element in scope"
3609                variant (rather than the "has an element in table scope" variant),
3610                and node is one of the following, terminate in a failure state. */
3611                return false;
3612
3613            } elseif($node === $node->ownerDocument->documentElement) {
3614                /* 5. Otherwise, if node is an html element (root element), terminate
3615                in a failure state. (This can only happen if the node is the topmost
3616                node of the    stack of open elements, and prevents the next step from
3617                being invoked if there are no more elements in the stack.) */
3618                return false;
3619            }
3620
3621            /* Otherwise, set node to the previous entry in the stack of open
3622            elements and return to step 2. (This will never fail, since the loop
3623            will always terminate in the previous step if the top of the stack
3624            is reached.) */
3625        }
3626    }
3627
3628    private function reconstructActiveFormattingElements()
3629    {
3630        /* 1. If there are no entries in the list of active formatting elements,
3631        then there is nothing to reconstruct; stop this algorithm. */
3632        $formatting_elements = count($this->a_formatting);
3633
3634        if($formatting_elements === 0) {
3635            return false;
3636        }
3637
3638        /* 3. Let entry be the last (most recently added) element in the list
3639        of active formatting elements. */
3640        $entry = end($this->a_formatting);
3641
3642        /* 2. If the last (most recently added) entry in the list of active
3643        formatting elements is a marker, or if it is an element that is in the
3644        stack of open elements, then there is nothing to reconstruct; stop this
3645        algorithm. */
3646        if($entry === self::MARKER || in_array($entry, $this->stack, true)) {
3647            return false;
3648        }
3649
3650        for($a = $formatting_elements - 1; $a >= 0; true) {
3651            /* 4. If there are no entries before entry in the list of active
3652            formatting elements, then jump to step 8. */
3653            if($a === 0) {
3654                $step_seven = false;
3655                break;
3656            }
3657
3658            /* 5. Let entry be the entry one earlier than entry in the list of
3659            active formatting elements. */
3660            $a--;
3661            $entry = $this->a_formatting[$a];
3662
3663            /* 6. If entry is neither a marker nor an element that is also in
3664            thetack of open elements, go to step 4. */
3665            if($entry === self::MARKER || in_array($entry, $this->stack, true)) {
3666                break;
3667            }
3668        }
3669
3670        while(true) {
3671            /* 7. Let entry be the element one later than entry in the list of
3672            active formatting elements. */
3673            if(isset($step_seven) && $step_seven === true) {
3674                $a++;
3675                $entry = $this->a_formatting[$a];
3676            }
3677
3678            /* 8. Perform a shallow clone of the element entry to obtain clone. */
3679            $clone = $entry->cloneNode();
3680
3681            /* 9. Append clone to the current node and push it onto the stack
3682            of open elements  so that it is the new current node. */
3683            end($this->stack)->appendChild($clone);
3684            $this->stack[] = $clone;
3685
3686            /* 10. Replace the entry for entry in the list with an entry for
3687            clone. */
3688            $this->a_formatting[$a] = $clone;
3689
3690            /* 11. If the entry for clone in the list of active formatting
3691            elements is not the last entry in the list, return to step 7. */
3692            if(end($this->a_formatting) !== $clone) {
3693                $step_seven = true;
3694            } else {
3695                break;
3696            }
3697        }
3698    }
3699
3700    private function clearTheActiveFormattingElementsUpToTheLastMarker()
3701    {
3702        /* When the steps below require the UA to clear the list of active
3703        formatting elements up to the last marker, the UA must perform the
3704        following steps: */
3705
3706        while(true) {
3707            /* 1. Let entry be the last (most recently added) entry in the list
3708            of active formatting elements. */
3709            $entry = end($this->a_formatting);
3710
3711            /* 2. Remove entry from the list of active formatting elements. */
3712            array_pop($this->a_formatting);
3713
3714            /* 3. If entry was a marker, then stop the algorithm at this point.
3715            The list has been cleared up to the last marker. */
3716            if($entry === self::MARKER) {
3717                break;
3718            }
3719        }
3720    }
3721
3722    private function generateImpliedEndTags(array $exclude = array())
3723    {
3724        /* When the steps below require the UA to generate implied end tags,
3725        then, if the current node is a dd element, a dt element, an li element,
3726        a p element, a td element, a th  element, or a tr element, the UA must
3727        act as if an end tag with the respective tag name had been seen and
3728        then generate implied end tags again. */
3729        $node = end($this->stack);
3730        $elements = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude);
3731
3732        while(in_array(end($this->stack)->nodeName, $elements)) {
3733            array_pop($this->stack);
3734        }
3735    }
3736
3737    private function getElementCategory($name)
3738    {
3739        if(in_array($name, $this->special))
3740            return self::SPECIAL;
3741
3742        elseif(in_array($name, $this->scoping))
3743            return self::SCOPING;
3744
3745        elseif(in_array($name, $this->formatting))
3746            return self::FORMATTING;
3747
3748        else
3749            return self::PHRASING;
3750    }
3751
3752    private function clearStackToTableContext($elements)
3753    {
3754        /* When the steps above require the UA to clear the stack back to a
3755        table context, it means that the UA must, while the current node is not
3756        a table element or an html element, pop elements from the stack of open
3757        elements. If this causes any elements to be popped from the stack, then
3758        this is a parse error. */
3759        while(true) {
3760            $node = end($this->stack)->nodeName;
3761
3762            if(in_array($node, $elements)) {
3763                break;
3764            } else {
3765                array_pop($this->stack);
3766            }
3767        }
3768    }
3769
3770    private function resetInsertionMode()
3771    {
3772        /* 1. Let last be false. */
3773        $last = false;
3774        $leng = count($this->stack);
3775
3776        for($n = $leng - 1; $n >= 0; $n--) {
3777            /* 2. Let node be the last node in the stack of open elements. */
3778            $node = $this->stack[$n];
3779
3780            /* 3. If node is the first node in the stack of open elements, then
3781            set last to true. If the element whose innerHTML  attribute is being
3782            set is neither a td  element nor a th element, then set node to the
3783            element whose innerHTML  attribute is being set. (innerHTML  case) */
3784            if($this->stack[0]->isSameNode($node)) {
3785                $last = true;
3786            }
3787
3788            /* 4. If node is a select element, then switch the insertion mode to
3789            "in select" and abort these steps. (innerHTML case) */
3790            if($node->nodeName === 'select') {
3791                $this->mode = self::IN_SELECT;
3792                break;
3793
3794            /* 5. If node is a td or th element, then switch the insertion mode
3795            to "in cell" and abort these steps. */
3796            } elseif($node->nodeName === 'td' || $node->nodeName === 'th') {
3797                $this->mode = self::IN_CELL;
3798                break;
3799
3800            /* 6. If node is a tr element, then switch the insertion mode to
3801            "in    row" and abort these steps. */
3802            } elseif($node->nodeName === 'tr') {
3803                $this->mode = self::IN_ROW;
3804                break;
3805
3806            /* 7. If node is a tbody, thead, or tfoot element, then switch the
3807            insertion mode to "in table body" and abort these steps. */
3808            } elseif(in_array($node->nodeName, array('tbody', 'thead', 'tfoot'))) {
3809                $this->mode = self::IN_TBODY;
3810                break;
3811
3812            /* 8. If node is a caption element, then switch the insertion mode
3813            to "in caption" and abort these steps. */
3814            } elseif($node->nodeName === 'caption') {
3815                $this->mode = self::IN_CAPTION;
3816                break;
3817
3818            /* 9. If node is a colgroup element, then switch the insertion mode
3819            to "in column group" and abort these steps. (innerHTML case) */
3820            } elseif($node->nodeName === 'colgroup') {
3821                $this->mode = self::IN_CGROUP;
3822                break;
3823
3824            /* 10. If node is a table element, then switch the insertion mode
3825            to "in table" and abort these steps. */
3826            } elseif($node->nodeName === 'table') {
3827                $this->mode = self::IN_TABLE;
3828                break;
3829
3830            /* 11. If node is a head element, then switch the insertion mode
3831            to "in body" ("in body"! not "in head"!) and abort these steps.
3832            (innerHTML case) */
3833            } elseif($node->nodeName === 'head') {
3834                $this->mode = self::IN_BODY;
3835                break;
3836
3837            /* 12. If node is a body element, then switch the insertion mode to
3838            "in body" and abort these steps. */
3839            } elseif($node->nodeName === 'body') {
3840                $this->mode = self::IN_BODY;
3841                break;
3842
3843            /* 13. If node is a frameset element, then switch the insertion
3844            mode to "in frameset" and abort these steps. (innerHTML case) */
3845            } elseif($node->nodeName === 'frameset') {
3846                $this->mode = self::IN_FRAME;
3847                break;
3848
3849            /* 14. If node is an html element, then: if the head element
3850            pointer is null, switch the insertion mode to "before head",
3851            otherwise, switch the insertion mode to "after head". In either
3852            case, abort these steps. (innerHTML case) */
3853            } elseif($node->nodeName === 'html') {
3854                $this->mode = ($this->head_pointer === null)
3855                    ? self::BEFOR_HEAD
3856                    : self::AFTER_HEAD;
3857
3858                break;
3859
3860            /* 15. If last is true, then set the insertion mode to "in body"
3861            and    abort these steps. (innerHTML case) */
3862            } elseif($last) {
3863                $this->mode = self::IN_BODY;
3864                break;
3865            }
3866        }
3867    }
3868
3869    private function closeCell()
3870    {
3871        /* If the stack of open elements has a td or th element in table scope,
3872        then act as if an end tag token with that tag name had been seen. */
3873        foreach(array('td', 'th') as $cell) {
3874            if($this->elementInScope($cell, true)) {
3875                $this->inCell(array(
3876                    'name' => $cell,
3877                    'type' => HTML5::ENDTAG
3878                ));
3879
3880                break;
3881            }
3882        }
3883    }
3884
3885    public function save()
3886    {
3887        return $this->dom;
3888    }
3889}
3890